From 8dcaef9ae92528674c0509f902c597ce16ef87eb Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 17 Mar 2022 16:45:09 +0800 Subject: [PATCH 1/3] Add paddleaudio doc. --- .../paddleaudio/backends/soundfile_backend.py | 140 +++++-- paddleaudio/paddleaudio/compliance/kaldi.py | 12 +- paddleaudio/paddleaudio/compliance/librosa.py | 358 ++++++++++-------- paddleaudio/paddleaudio/features/layers.py | 33 +- 4 files changed, 325 insertions(+), 218 deletions(-) diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py index 2b920284..c1155654 100644 --- a/paddleaudio/paddleaudio/backends/soundfile_backend.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import warnings from typing import Optional from typing import Tuple @@ -19,7 +20,6 @@ from typing import Union import numpy as np import resampy import soundfile as sf -from numpy import ndarray as array from scipy.io import wavfile from ..utils import ParameterError @@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] EPS = 1e-8 -def resample(y: array, src_sr: int, target_sr: int, - mode: str='kaiser_fast') -> array: - """ Audio resampling - This function is the same as using resampy.resample(). - Notes: - The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' - """ +def resample(y: np.ndarray, + src_sr: int, + target_sr: int, + mode: str='kaiser_fast') -> np.ndarray: + """Audio resampling. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + src_sr (int): Source sample rate. + target_sr (int): Target sample rate. + mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. 
+ + Returns: + np.ndarray: `y` resampled to `target_sr` + """ if mode == 'kaiser_best': warnings.warn( @@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int, if not isinstance(y, np.ndarray): raise ParameterError( - 'Only support numpy array, but received y in {type(y)}') + 'Only support numpy np.ndarray, but received y in {type(y)}') if mode not in RESAMPLE_MODES: raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') @@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int, return resampy.resample(y, src_sr, target_sr, filter=mode) -def to_mono(y: array, merge_type: str='average') -> array: - """ convert sterior audio to mono +def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray: + """Convert sterior audio to mono. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'. + + Returns: + np.ndarray: `y` with mono channel. """ + if merge_type not in MERGE_TYPES: raise ParameterError( f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' @@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array: return y_out -def _safe_cast(y: array, dtype: Union[type, str]) -> array: - """ data type casting in a safe way, i.e., prevent overflow or underflow - This function is used internally. +def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Data type casting in a safe way, i.e., prevent overflow or underflow. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. 
""" - return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) + if 'float' in str(y.dtype): + return np.clip(y, np.finfo(dtype).min, + np.finfo(dtype).max).astype(dtype) + else: + return np.clip(y, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) -def depth_convert(y: array, dtype: Union[type, str], - dithering: bool=True) -> array: - """Convert audio array to target dtype safely - This function convert audio waveform to a target dtype, with addition steps of +def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of preventing overflow/underflow and preserving audio range. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. """ SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] @@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str], return y -def sound_file_load(file: str, +def sound_file_load(file: os.PathLike, offset: Optional[float]=None, dtype: str='int16', - duration: Optional[int]=None) -> Tuple[array, int]: - """Load audio using soundfile library - This function load audio file using libsndfile. - Reference: - http://www.mega-nerd.com/libsndfile/#Features + duration: Optional[int]=None) -> Tuple[np.ndarray, int]: + """Load audio using soundfile library. This function load audio file using libsndfile. + + Args: + file (os.PathLike): File of waveform. + offset (Optional[float], optional): Offset to the start of waveform. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'int16'. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. 
""" with sf.SoundFile(file) as sf_desc: sr_native = sf_desc.samplerate @@ -179,9 +217,17 @@ def sound_file_load(file: str, return y, sf_desc.samplerate -def normalize(y: array, norm_type: str='linear', - mul_factor: float=1.0) -> array: - """ normalize an input audio with additional multiplier. +def normalize(y: np.ndarray, norm_type: str='linear', + mul_factor: float=1.0) -> np.ndarray: + """Normalize an input audio with additional multiplier. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + mul_factor (float, optional): Scaling factor. Defaults to 1.0. + + Returns: + np.ndarray: `y` after normalization. """ if norm_type == 'linear': @@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear', return y -def save(y: array, sr: int, file: str) -> None: - """Save audio file to disk. - This function saves audio to disk using scipy.io.wavfile, with additional step - to convert input waveform to int16 unless it already is int16 - Notes: - It only support raw wav format. +def save(y: np.ndarray, sr: int, file: os.PathLike) -> None: + """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + sr (int): Sample rate. + file (os.PathLike): Path of auido file to save. """ if not file.endswith('.wav'): raise ParameterError( @@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None: def load( - file: str, + file: os.PathLike, sr: Optional[int]=None, mono: bool=True, merge_type: str='average', # ch0,ch1,random,average @@ -236,11 +283,24 @@ def load( offset: float=0.0, duration: Optional[int]=None, dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[array, int]: - """Load audio file from disk. - This function loads audio from disk using using audio beackend. 
- Parameters: - Notes: + resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: + """Load audio file from disk. This function loads audio from disk using the audio backend. + + Args: + file (os.PathLike): Path of audio file to load. + sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. + mono (bool, optional): Return waveform with mono channel. Defaults to True. + merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. + normal (bool, optional): Waveform normalization. Defaults to True. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. + offset (float, optional): Offset to the start of waveform. Defaults to 0.0. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'float32'. + resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. """ y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index 8cb9b666..538be019 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor, """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. 
@@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames depends on frame_length and frame_shift. """ dtype = waveform.dtype @@ -422,7 +422,7 @@ def fbank(waveform: Tensor, """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -451,7 +451,7 @@ def fbank(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A filter banks tensor with shape (m, n_mels). + Tensor: A filter banks tensor with shape `(m, n_mels)`. """ dtype = waveform.dtype @@ -542,7 +542,7 @@ def mfcc(waveform: Tensor, identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. channel (int, optional): Select the channel of waveform. Defaults to -1. @@ -571,7 +571,7 @@ def mfcc(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). + Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. 
""" assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py index 167795c3..d7ceb2b4 100644 --- a/paddleaudio/paddleaudio/compliance/librosa.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -19,7 +19,6 @@ from typing import Union import numpy as np import scipy -from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided from scipy import signal @@ -32,7 +31,6 @@ __all__ = [ 'mfcc', 'hz_to_mel', 'mel_to_hz', - 'split_frames', 'mel_frequencies', 'power_to_db', 'compute_fbank_matrix', @@ -49,7 +47,8 @@ __all__ = [ ] -def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: +def _pad_center(data: np.ndarray, size: int, axis: int=-1, + **kwargs) -> np.ndarray: """Pad an array to a target length along a target axis. This differs from `np.pad` by centering the data prior to padding, @@ -69,8 +68,10 @@ def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: return np.pad(data, lengths, **kwargs) -def split_frames(x: array, frame_length: int, hop_length: int, - axis: int=-1) -> array: +def _split_frames(x: np.ndarray, + frame_length: int, + hop_length: int, + axis: int=-1) -> np.ndarray: """Slice a data array into (overlapping) frames. This function is aligned with librosa.frame @@ -142,11 +143,16 @@ def _check_audio(y, mono=True) -> bool: return True -def hz_to_mel(frequencies: Union[float, List[float], array], - htk: bool=False) -> array: - """Convert Hz to Mels +def hz_to_mel(frequencies: Union[float, List[float], np.ndarray], + htk: bool=False) -> np.ndarray: + """Convert Hz to Mels. - This function is aligned with librosa. + Args: + frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequency in mels. 
""" freq = np.asanyarray(frequencies) @@ -177,10 +183,16 @@ def hz_to_mel(frequencies: Union[float, List[float], array], return mels -def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: +def mel_to_hz(mels: Union[float, List[float], np.ndarray], + htk: int=False) -> np.ndarray: """Convert mel bin numbers to frequencies. - This function is aligned with librosa. + Args: + mels (Union[float, List[float], np.ndarray]): Frequency in mels. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequencies in Hz. """ mel_array = np.asanyarray(mels) @@ -212,10 +224,17 @@ def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: def mel_frequencies(n_mels: int=128, fmin: float=0.0, fmax: float=11025.0, - htk: bool=False) -> array: - """Compute mel frequencies + htk: bool=False) -> np.ndarray: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. - This function is aligned with librosa. + Returns: + np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`. """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(fmin, htk=htk) @@ -226,10 +245,15 @@ def mel_frequencies(n_mels: int=128, return mel_to_hz(mels, htk=htk) -def fft_frequencies(sr: int, n_fft: int) -> array: +def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: """Compute fourier frequencies. - This function is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + + Returns: + np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. 
""" return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) @@ -241,10 +265,22 @@ def compute_fbank_matrix(sr: int, fmax: Optional[float]=None, htk: bool=False, norm: str="slaney", - dtype: type=np.float32): + dtype: type=np.float32) -> np.ndarray: """Compute fbank matrix. - This funciton is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (str, optional): Type of normalization. Defaults to "slaney". + dtype (type, optional): Data type. Defaults to np.float32. + + + Returns: + np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. """ if norm != "slaney": raise ParameterError('norm must set to slaney') @@ -289,17 +325,28 @@ def compute_fbank_matrix(sr: int, return weights -def stft(x: array, +def stft(x: np.ndarray, n_fft: int=2048, hop_length: Optional[int]=None, win_length: Optional[int]=None, window: str="hann", center: bool=True, dtype: type=np.complex64, - pad_mode: str="reflect") -> array: + pad_mode: str="reflect") -> np.ndarray: """Short-time Fourier transform (STFT). - This function is aligned with librosa. + Args: + x (np.ndarray): Input waveform in one dimension. + n_fft (int, optional): FFT size. Defaults to 2048. + hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. + win_length (Optional[int], optional): The size of window. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
+ pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + + Returns: + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)` """ _check_audio(x) @@ -314,7 +361,7 @@ def stft(x: array, fft_window = signal.get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size - fft_window = pad_center(fft_window, n_fft) + fft_window = _pad_center(fft_window, n_fft) # Reshape so that the window can be broadcast fft_window = fft_window.reshape((-1, 1)) @@ -333,7 +380,7 @@ def stft(x: array, ) # Window the time series. - x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length) + x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) # Pre-allocate the STFT matrix stft_matrix = np.empty( (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") @@ -352,16 +399,20 @@ def stft(x: array, return stft_matrix -def power_to_db(spect: array, +def power_to_db(spect: np.ndarray, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> array: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units + top_db: Optional[float]=80.0) -> np.ndarray: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. This computes the scaling `10 * log10(spect / ref)` in a numerically stable way. - This computes the scaling ``10 * log10(spect / ref)`` in a numerically - stable way. + Args: + spect (np.ndarray): STFT power spectrogram of an input waveform. + ref (float, optional): Scaling factor of spectrogram. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. - This function is aligned with librosa. + Returns: + np.ndarray: Power spectrogram in db scale. 
""" spect = np.asarray(spect) @@ -394,49 +445,27 @@ def power_to_db(spect: array, return log_spec -def mfcc(x, +def mfcc(x: np.ndarray, sr: int=16000, - spect: Optional[array]=None, + spect: Optional[np.ndarray]=None, n_mfcc: int=20, dct_type: int=2, norm: str="ortho", lifter: int=0, - **kwargs) -> array: + **kwargs) -> np.ndarray: """Mel-frequency cepstral coefficients (MFCCs) - This function is NOT strictly aligned with librosa. The following example shows how to get the - same result with librosa: - - # mfcc: - kwargs = { - 'window_size':512, - 'hop_length':320, - 'mel_bins':64, - 'fmin':50, - 'to_db':False} - a = mfcc(x, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - - # librosa mfcc: - spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, fmin=50) - b = librosa.feature.mfcc(y=x, - sr=16000, - S=spect, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0) - - assert np.mean( (a-b)**2) < 1e-8 + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. + dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. + norm (str, optional): Type of normalization. Defaults to "ortho". + lifter (int, optional): Cepstral filtering. Defaults to 0. + Returns: + np.ndarray: A mel frequency cepstral coefficients tensor with shape `(n_mfcc, num_frames)`. 
""" if spect is None: spect = melspectrogram(x, sr=sr, **kwargs) @@ -454,12 +483,12 @@ def mfcc(x, f"MFCC lifter={lifter} must be a non-negative number") -def melspectrogram(x: array, +def melspectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, n_mels: int=64, - fmin: int=50, + fmin: float=50.0, fmax: Optional[float]=None, window: str='hann', center: bool=True, @@ -468,27 +497,28 @@ def melspectrogram(x: array, to_db: bool=True, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> array: + top_db: Optional[float]=None) -> np.ndarray: """Compute mel-spectrogram. - Parameters: - x: numpy.ndarray - The input wavform is a numpy array [shape=(n,)] - - window_size: int, typically 512, 1024, 2048, etc. - The window size for framing, also used as n_fft for stft - + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + n_mels (int, optional): Number of mel bins. Defaults to 64. + fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. + to_db (bool, optional): Enable db scale. Defaults to True. + ref (float, optional): Scaling factor of spectrogram. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. 
+ top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. Returns: - The mel-spectrogram in power scale or db scale(default) - - - Notes: - 1. sr is default to 16000, which is commonly used in speech/speaker processing. - 2. when fmax is None, it is set to sr//2. - 3. this function will convert mel spectgrum to db scale by default. This is different - that of librosa. - + np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`. """ _check_audio(x, mono=True) if len(x) <= 0: @@ -518,18 +548,28 @@ def melspectrogram(x: array, return mel_spect -def spectrogram(x: array, +def spectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, window: str='hann', center: bool=True, pad_mode: str='reflect', - power: float=2.0) -> array: - """Compute spectrogram from an input waveform. + power: float=2.0) -> np.ndarray: + """Compute spectrogram. + + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - This function is a wrapper for librosa.feature.stft, with addition step to - compute the magnitude of the complex spectrogram. + Returns: + np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`. 
""" s = stft( @@ -544,18 +584,16 @@ def spectrogram(x: array, return np.abs(s)**power -def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law encoding. - - Compute the mu-law decoding given an input code. - When quantized is True, the result will be converted to - integer in range [0,mu-1]. Otherwise, the resulting signal - is in range [-1,1] - +def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`. - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + x (np.ndarray): The input waveform to encode. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True. + Returns: + np.ndarray: The mu-law encoded waveform. """ mu = 255 y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) @@ -564,17 +602,16 @@ def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: return y -def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law decoding. - - Compute the mu-law decoding given an input code. +def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when quantize is True and `[-1,1]` otherwise. - it assumes that the input y is in - range [0,mu-1] when quantize is True and [-1,1] otherwise - - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + y (np.ndarray): The encoded waveform. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. 
Defaults to True. + Returns: + np.ndarray: The mu-law decoded waveform. """ if mu < 1: raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...') @@ -586,7 +623,7 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: return x -def randint(high: int) -> int: +def _randint(high: int) -> int: """Generate one random integer in range [0 high) This is a helper function for random data augmentaiton @@ -594,20 +631,18 @@ def randint(high: int) -> int: return int(np.random.randint(0, high=high)) -def rand() -> float: - """Generate one floating-point number in range [0 1) - - This is a helper function for random data augmentaiton - """ - return float(np.random.rand(1)) - - -def depth_augment(y: array, +def depth_augment(y: np.ndarray, choices: List=['int8', 'int16'], - probs: List[float]=[0.5, 0.5]) -> array: - """ Audio depth augmentation + probs: List[float]=[0.5, 0.5]) -> np.ndarray: + """ Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16']. + probs (List[float], optional): Probabilities to depth conversion. Defaults to [0.5, 0.5]. - Do audio depth augmentation to simulate the distortion brought by quantization. + Returns: + np.ndarray: The augmented waveform. """ assert len(probs) == len( choices @@ -621,13 +656,18 @@ def depth_augment(y: array, return y2 -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation +def adaptive_spect_augment(spect: np.ndarray, + tempo_axis: int=0, + level: float=0.1) -> np.ndarray: + """Do adaptive spectrogram augmentation. The level of the augmentation is governed by the parameter `level`, ranging from 0 to 1, where 0 represents no augmentation. 
- The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + level (float, optional): The level factor of masking. Defaults to 0.1. + Returns: + np.ndarray: The augmented spectrogram. """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -643,32 +683,40 @@ def adaptive_spect_augment(spect: array, tempo_axis: int=0, if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def spect_augment(spect: array, +def spect_augment(spect: np.ndarray, tempo_axis: int=0, max_time_mask: int=3, max_freq_mask: int=3, max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis + max_freq_mask_width: int=20) -> np.ndarray: + """Do spectrogram augmentation in both time and freq axis. - Reference: + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + max_time_mask (int, optional): Maximum number of time masking. Defaults to 3. + max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3. + max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30. 
+ max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20. + Returns: + np.ndarray: The augmented spectrogram. """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -676,52 +724,64 @@ def spect_augment(spect: array, else: nf, nt = spect.shape - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) + num_time_mask = _randint(max_time_mask) + num_freq_mask = _randint(max_freq_mask) - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) + time_mask_width = _randint(max_time_mask_width) + freq_mask_width = _randint(max_freq_mask_width) if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal +def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray: + """ Random cropping on an input waveform. - The input is a 1d signal, typically a sound waveform + Args: + y (np.ndarray): Input waveform array in 1D. + crop_len (int): Length of waveform to crop. + + Returns: + np.ndarray: The cropped waveform. 
""" if y.ndim != 1: 'only accept 1d tensor or numpy array' n = len(y) - idx = randint(n - crop_len) + idx = _randint(n - crop_len) return y[idx:idx + crop_len] -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. +def random_crop2d(s: np.ndarray, crop_len: int, + tempo_axis: int=0) -> np.ndarray: + """ Random cropping on a spectrogram. - The cropping is done in temporal direction on the time-freq input signal. + Args: + s (np.ndarray): Input spectrogram in 2D. + crop_len (int): Length of spectrogram to crop. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + + Returns: + np.ndarray: The cropped spectrogram. """ if tempo_axis >= s.ndim: raise ParameterError('axis out of range') n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) + idx = _randint(high=n - crop_len) sli = [slice(None) for i in range(s.ndim)] sli[tempo_axis] = slice(idx, idx + crop_len) out = s[tuple(sli)] diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 6afd234a..877a5ae8 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -44,29 +44,16 @@ class Spectrogram(nn.Layer): """Compute spectrogram of a given signal, typically an audio waveform. The spectorgram is defined as the complex norm of the short-time Fourier transformation. - Parameters: - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. 
- The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. The default value is 'reflect'. - dtype (str): the data type of input and window. - Notes: - The Spectrogram transform relies on STFT transform to compute the spectrogram. - By default, the weights are not learnable. To fine-tune the Fourier coefficients, - set stop_gradient=False before training. - For more information, see STFT(). + + Args: + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + dtype (str, optional): Data type of input and window. Defaults to paddle.float32. 
""" super(Spectrogram, self).__init__() From 43a9e39c05d43f4644e29735e2250cceb5f58951 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 17 Mar 2022 20:18:36 +0800 Subject: [PATCH 2/3] Add paddleaudio doc. --- paddleaudio/paddleaudio/compliance/librosa.py | 4 +- paddleaudio/paddleaudio/features/layers.py | 239 +++++++++--------- 2 files changed, 117 insertions(+), 126 deletions(-) diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py index d7ceb2b4..1342b251 100644 --- a/paddleaudio/paddleaudio/compliance/librosa.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -346,7 +346,7 @@ def stft(x: np.ndarray, pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". Returns: - np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)` + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. """ _check_audio(x) @@ -465,7 +465,7 @@ def mfcc(x: np.ndarray, lifter (int, optional): Cepstral filtering. Defaults to 0. Returns: - np.ndarray: A mel frequency cepstral coefficients tensor with shape `(n_mfcc, num_frames)`. + np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. """ if spect is None: spect = melspectrogram(x, sr=sr, **kwargs) diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 877a5ae8..ad990b78 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -17,6 +17,7 @@ from typing import Union import paddle import paddle.nn as nn +from paddle import Tensor from ..functional import compute_fbank_matrix from ..functional import create_dct @@ -32,6 +33,20 @@ __all__ = [ class Spectrogram(nn.Layer): + """Compute spectrogram of given signals, typically audio waveforms. + The spectorgram is defined as the complex norm of the short-time Fourier transformation. 
+ + Args: + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, n_fft: int=512, hop_length: Optional[int]=None, @@ -40,21 +55,7 @@ class Spectrogram(nn.Layer): power: float=2.0, center: bool=True, pad_mode: str='reflect', - dtype: str=paddle.float32): - """Compute spectrogram of a given signal, typically an audio waveform. - The spectorgram is defined as the complex norm of the short-time - Fourier transformation. - - Args: - n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. - hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. - win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. - window (str, optional): The window function applied to the single before the Fourier transform. 
Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. - power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. - dtype (str, optional): Data type of input and window. Defaults to paddle.float32. - """ + dtype: str='float32') -> None: super(Spectrogram, self).__init__() assert power > 0, 'Power of spectrogram must be > 0.' @@ -75,13 +76,39 @@ class Spectrogram(nn.Layer): pad_mode=pad_mode) self.register_buffer('fft_window', self.fft_window) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`. + """ stft = self._stft(x) spectrogram = paddle.pow(paddle.abs(stft), self.power) return spectrogram class MelSpectrogram(nn.Layer): + """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the single before the Fourier transform. 
Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_fft: int=512, @@ -96,39 +123,7 @@ class MelSpectrogram(nn.Layer): f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute the melspectrogram of a given signal, typically an audio waveform. - The melspectrogram is also known as filterbank or fbank feature in audio community. - It is computed by multiplying spectrogram with Mel filter bank matrix. - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. 
- window(str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zeros. - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str='float32') -> None: super(MelSpectrogram, self).__init__() self._spectrogram = Spectrogram( @@ -158,13 +153,42 @@ class MelSpectrogram(nn.Layer): dtype=dtype) # float64 for better numerical results self.register_buffer('fbank_matrix', self.fbank_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`. 
+ """ spect_feature = self._spectrogram(x) mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) return mel_feature class LogMelSpectrogram(nn.Layer): + """Compute log-mel-spectrogram feature of given signals, typically audio waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. 
+ amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_fft: int=512, @@ -182,41 +206,7 @@ class LogMelSpectrogram(nn.Layer): ref_value: float=1.0, amin: float=1e-10, top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, - typically an audio waveform. - Parameters: - sr (int): the audio sample rate. - The default value is 22050. - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. 
Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. - amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin). - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str='float32') -> None: super(LogMelSpectrogram, self).__init__() self._melspectrogram = MelSpectrogram( @@ -239,7 +229,14 @@ class LogMelSpectrogram(nn.Layer): self.amin = amin self.top_db = top_db - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`. + """ mel_feature = self._melspectrogram(x) log_mel_feature = power_to_db( mel_feature, @@ -250,6 +247,29 @@ class LogMelSpectrogram(nn.Layer): class MFCC(nn.Layer): + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the single before the Fourier transform.
Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_mfcc: int=40, @@ -268,43 +288,7 @@ class MFCC(nn.Layer): ref_value: float=1.0, amin: float=1e-10, top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. - - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. - n_fft (int): the number of frequency components of the discrete Fourier transform. 
- The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. - amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin). - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. 
Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str=paddle.float32) -> None: super(MFCC, self).__init__() assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) @@ -329,7 +313,14 @@ class MFCC(nn.Layer): self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) self.register_buffer('dct_matrix', self.dct_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. + """ log_mel_feature = self._log_melspectrogram(x) mfcc = paddle.matmul( log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( From 831cadacc74b99a425d8b0d151863fca21f188a4 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 17 Mar 2022 21:17:41 +0800 Subject: [PATCH 3/3] Add paddleaudio doc. --- paddleaudio/paddleaudio/compliance/librosa.py | 6 +- paddleaudio/paddleaudio/features/layers.py | 8 +- .../paddleaudio/functional/functional.py | 139 ++++++------- paddleaudio/paddleaudio/functional/window.py | 186 +++++------------- paddleaudio/paddleaudio/metric/dtw.py | 4 +- paddlespeech/cli/executor.py | 3 +- 6 files changed, 127 insertions(+), 219 deletions(-) diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py index 1342b251..740584ca 100644 --- a/paddleaudio/paddleaudio/compliance/librosa.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -403,11 +403,11 @@ def power_to_db(spect: np.ndarray, ref: float=1.0, amin: float=1e-10, top_db: Optional[float]=80.0) -> np.ndarray: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. This computes the scaling `10 * log10(spect / ref)` in a numerically stable way. + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. 
The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. Args: spect (np.ndarray): STFT power spectrogram of an input waveform. - ref (float, optional): Scaling factor of spectrogram. Defaults to 1.0. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. amin (float, optional): Minimum threshold. Defaults to 1e-10. top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. @@ -513,7 +513,7 @@ def melspectrogram(x: np.ndarray, pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. to_db (bool, optional): Enable db scale. Defaults to True. - ref (float, optional): Scaling factor of spectrogram. Defaults to 1.0. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. amin (float, optional): Minimum threshold. Defaults to 1e-10. top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index ad990b78..09037255 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -40,7 +40,7 @@ class Spectrogram(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. 
- window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. @@ -97,7 +97,7 @@ class MelSpectrogram(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. - window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. 
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. @@ -174,7 +174,7 @@ class LogMelSpectrogram(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. - window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. @@ -255,7 +255,7 @@ class MFCC(nn.Layer): n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. 
- window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index c5ab3045..19c63a9a 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -17,6 +17,7 @@ from typing import Optional from typing import Union import paddle +from paddle import Tensor __all__ = [ 'hz_to_mel', @@ -29,19 +30,20 @@ __all__ = [ ] -def hz_to_mel(freq: Union[paddle.Tensor, float], - htk: bool=False) -> Union[paddle.Tensor, float]: +def hz_to_mel(freq: Union[Tensor, float], + htk: bool=False) -> Union[Tensor, float]: """Convert Hz to Mels. - Parameters: - freq: the input tensor of arbitrary shape, or a single floating point number. - htk: use HTK formula to do the conversion. - The default value is False. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in Mel-scale. + Union[Tensor, float]: Frequency in mels. 
""" if htk: - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): return 2595.0 * paddle.log10(1.0 + freq / 700.0) else: return 2595.0 * math.log10(1.0 + freq / 700.0) @@ -58,7 +60,7 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): target = min_log_mel + paddle.log( freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 mask = (freq > min_log_hz).astype(freq.dtype) @@ -71,14 +73,16 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], return mels -def mel_to_hz(mel: Union[float, paddle.Tensor], - htk: bool=False) -> Union[float, paddle.Tensor]: +def mel_to_hz(mel: Union[float, Tensor], + htk: bool=False) -> Union[float, Tensor]: """Convert mel bin numbers to frequencies. - Parameters: - mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. - htk: use HTK formula to do the conversion. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in hz. + Union[float, Tensor]: Frequencies in Hz. 
""" if htk: return 700.0 * (10.0**(mel / 2595.0) - 1.0) @@ -90,7 +94,7 @@ def mel_to_hz(mel: Union[float, paddle.Tensor], min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, paddle.Tensor): + if isinstance(mel, Tensor): target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) mask = (mel > min_log_mel).astype(mel.dtype) freqs = target * mask + freqs * ( @@ -106,16 +110,18 @@ def mel_frequencies(n_mels: int=64, f_min: float=0.0, f_max: float=11025.0, htk: bool=False, - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute mel frequencies. - Parameters: - n_mels(int): number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk(bool): whether to use htk formula. - dtype(str): the datatype of the return frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in Mel-scale + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(f_min, htk=htk) @@ -125,14 +131,16 @@ def mel_frequencies(n_mels: int=64, return freqs -def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): +def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: """Compute fourier frequencies. - Parameters: - sr(int): the audio sample rate. - n_fft(float): the number of fft bins. - dtype(str): the datatype of the return frequencies. 
+ + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in hz. + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. """ return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) @@ -144,23 +152,21 @@ def compute_fbank_matrix(sr: int, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute fbank matrix. - Parameters: - sr(int): the audio sample rate. - n_fft(int): the number of fft bins. - n_mels(int): the number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk: whether to use htk formula. - return_complex(bool): whether to return complex matrix. If True, the matrix will - be complex type. Otherwise, the real and image part will be stored in the last - axis of returned tensor. - dtype(str): the datatype of the returned fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - The fbank matrix of shape (n_mels, int(1+n_fft//2)). - Shape: - output: (n_mels, int(1+n_fft//2)) + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. 
""" if f_max is None: @@ -199,27 +205,20 @@ def compute_fbank_matrix(sr: int, return weights -def power_to_db(magnitude: paddle.Tensor, +def power_to_db(spect: Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> paddle.Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. - The function computes the scaling ``10 * log10(x / ref)`` in a numerically - stable way. - Parameters: - magnitude(Tensor): the input magnitude tensor of any shape. - ref_value(float): the reference value. If smaller than 1.0, the db level - of the signal will be pulled up accordingly. Otherwise, the db level - is pushed down. - amin(float): the minimum value of input magnitude, below which the input - magnitude is clipped(to amin). - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). + top_db: Optional[float]=None) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. + Returns: - The spectrogram in log-scale. - shape: - input: any shape - output: same as input + Tensor: Power spectrogram in db scale. 
""" if amin <= 0: raise Exception("amin must be strictly positive") @@ -227,8 +226,8 @@ def power_to_db(magnitude: paddle.Tensor, if ref_value <= 0: raise Exception("ref_value must be strictly positive") - ones = paddle.ones_like(magnitude) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) log_spec -= 10.0 * math.log10(max(ref_value, amin)) if top_db is not None: @@ -242,15 +241,17 @@ def power_to_db(magnitude: paddle.Tensor, def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]='ortho', - dtype: Optional[str]=paddle.float32) -> paddle.Tensor: + dtype: str='float32') -> Tensor: """Create a discrete cosine transform(DCT) matrix. - Parameters: + Args: n_mfcc (int): Number of mel frequency cepstral coefficients. n_mels (int): Number of mel filterbanks. - norm (str, optional): Normalizaiton type. Defaults to 'ortho'. + norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - Tensor: The DCT matrix with shape (n_mels, n_mfcc). + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. 
""" n = paddle.arange(n_mels, dtype=dtype) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) diff --git a/paddleaudio/paddleaudio/functional/window.py b/paddleaudio/paddleaudio/functional/window.py index f321b38e..c99d5046 100644 --- a/paddleaudio/paddleaudio/functional/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,24 +20,11 @@ from paddle import Tensor __all__ = [ 'get_window', - - # windows - 'taylor', - 'hamming', - 'hann', - 'tukey', - 'kaiser', - 'gaussian', - 'exponential', - 'triang', - 'bohman', - 'blackman', - 'cosine', ] -def _cat(a: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_a, data_type) for _a in a] +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] return paddle.concat(l) @@ -48,7 +35,7 @@ def _acosh(x: Union[Tensor, float]) -> Tensor: def _extend(M: int, sym: bool) -> bool: - """Extend window by 1 sample if needed for DFT-even symmetry""" + """Extend window by 1 sample if needed for DFT-even symmetry. """ if not sym: return M + 1, True else: @@ -56,7 +43,7 @@ def _extend(M: int, sym: bool) -> bool: def _len_guards(M: int) -> bool: - """Handle small or incorrect window lengths""" + """Handle small or incorrect window lengths. """ if int(M) != M or M < 0: raise ValueError('Window length M must be a non-negative integer') @@ -64,15 +51,15 @@ def _len_guards(M: int) -> bool: def _truncate(w: Tensor, needed: bool) -> Tensor: - """Truncate window by 1 sample if needed for DFT-even symmetry""" + """Truncate window by 1 sample if needed for DFT-even symmetry. """ if needed: return w[:-1] else: return w -def general_gaussian(M: int, p, sig, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a window with a generalized Gaussian shape. This function is consistent with scipy.signal.windows.general_gaussian(). 
""" @@ -86,8 +73,8 @@ def general_gaussian(M: int, p, sig, sym: bool=True, return _truncate(w, needs_trunc) -def general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generic weighted sum of cosine terms window. This function is consistent with scipy.signal.windows.general_cosine(). """ @@ -101,31 +88,23 @@ def general_cosine(M: int, a: float, sym: bool=True, return _truncate(w, needs_trunc) -def general_hamming(M: int, alpha: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generalized Hamming window. This function is consistent with scipy.signal.windows.general_hamming() """ - return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) + return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) -def taylor(M: int, - nbar=4, - sll=30, - norm=True, - sym: bool=True, - dtype: str='float64') -> Tensor: +def _taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Taylor window. The Taylor window taper function approximates the Dolph-Chebyshev window's constant sidelobe level for a parameterized number of near-in sidelobes. - Parameters: - M(int): window size - nbar, sil, norm: the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -171,46 +150,25 @@ def taylor(M: int, return _truncate(w, needs_trunc) -def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hamming window. 
The Hamming window is a taper formed by using a raised cosine with non-zero endpoints, optimized to minimize the nearest side lobe. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.54, sym, dtype=dtype) + return _general_hamming(M, 0.54, sym, dtype=dtype) -def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hann window. The Hann window is a taper formed by using a raised cosine or sine-squared with ends that touch zero. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.5, sym, dtype=dtype) + return _general_hamming(M, 0.5, sym, dtype=dtype) -def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: +def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Tukey window. The Tukey window is also known as a tapered cosine window. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -237,32 +195,18 @@ def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: +def _kaiser(M: int, beta: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Kaiser window. The Kaiser window is a taper formed by using a Bessel function. - Parameters: - M(int): window size. - beta(float): the window-specific parameter. 
- sym(bool):whether to return symmetric window. - The default value is True - Returns: - Tensor: the window tensor """ raise NotImplementedError() -def gaussian(M: int, std: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Gaussian window. The Gaussian widows has a Gaussian shape defined by the standard deviation(std). - Parameters: - M(int): window size. - std(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -275,21 +219,12 @@ def gaussian(M: int, std: float, sym: bool=True, return _truncate(w, needs_trunc) -def exponential(M: int, - center=None, - tau=1., - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute an exponential (or Poisson) window. - Parameters: - M(int): window size. - tau(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor - """ +def _exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. """ if sym and center is not None: raise ValueError("If sym==True, center must be None.") if _len_guards(M): @@ -305,15 +240,8 @@ def exponential(M: int, return _truncate(w, needs_trunc) -def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a triangular window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. 
- Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -330,16 +258,9 @@ def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Bohman window. The Bohman window is the autocorrelation of a cosine window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -353,32 +274,18 @@ def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Blackman window. The Blackman window is a taper formed by using the first three terms of a summation of cosines. It was designed to have close to the minimal leakage possible. It is close to optimal, only slightly worse than a Kaiser window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) -def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a window with a simple cosine shape. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. 
- Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -388,19 +295,20 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -## factory function def get_window(window: Union[str, Tuple[str, float]], win_length: int, fftbins: bool=True, dtype: str='float64') -> Tensor: """Return a window of a given length and type. - Parameters: - window(str|(str,float)): the type of window to create. - win_length(int): the number of samples in the window. - fftbins(bool): If True, create a "periodic" window. Otherwise, - create a "symmetric" window, for use in filter design. + + Args: + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + win_length (int): Number of samples. + fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. + dtype (str, optional): The data type of the return window. Defaults to 'float64'. + Returns: - The window represented as a tensor. + Tensor: The window represented as a tensor. """ sym = not fftbins @@ -420,7 +328,7 @@ def get_window(window: Union[str, Tuple[str, float]], str(type(window))) try: - winfunc = eval(winstr) + winfunc = eval('_' + winstr) except KeyError as e: raise ValueError("Unknown window type.") from e diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py index d27f56e2..c4dc7a28 100644 --- a/paddleaudio/paddleaudio/metric/dtw.py +++ b/paddleaudio/paddleaudio/metric/dtw.py @@ -20,9 +20,7 @@ __all__ = [ def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float: - """dtw distance - - Dynamic Time Warping. + """Dynamic Time Warping. This function keeps a compact matrix, not the full warping paths matrix. 
Uses dynamic programming to compute: diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index d77d27b0..064939a8 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -178,7 +178,8 @@ class BaseExecutor(ABC): Returns: bool: return `True` for job input, `False` otherwise. """ - return input_ and os.path.isfile(input_) and input_.endswith('.job') + return input_ and os.path.isfile(input_) and (input_.endswith('.job') or + input_.endswith('.txt')) def _get_job_contents( self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: