diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py index 2b920284..c1155654 100644 --- a/paddleaudio/paddleaudio/backends/soundfile_backend.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import warnings from typing import Optional from typing import Tuple @@ -19,7 +20,6 @@ from typing import Union import numpy as np import resampy import soundfile as sf -from numpy import ndarray as array from scipy.io import wavfile from ..utils import ParameterError @@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] EPS = 1e-8 -def resample(y: array, src_sr: int, target_sr: int, - mode: str='kaiser_fast') -> array: - """ Audio resampling - This function is the same as using resampy.resample(). - Notes: - The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' - """ +def resample(y: np.ndarray, + src_sr: int, + target_sr: int, + mode: str='kaiser_fast') -> np.ndarray: + """Audio resampling. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + src_sr (int): Source sample rate. + target_sr (int): Target sample rate. + mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. 
+ + Returns: + np.ndarray: `y` resampled to `target_sr` + """ if mode == 'kaiser_best': warnings.warn( @@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int, if not isinstance(y, np.ndarray): raise ParameterError( - 'Only support numpy array, but received y in {type(y)}') + 'Only support numpy np.ndarray, but received y in {type(y)}') if mode not in RESAMPLE_MODES: raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') @@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int, return resampy.resample(y, src_sr, target_sr, filter=mode) -def to_mono(y: array, merge_type: str='average') -> array: - """ convert sterior audio to mono +def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray: + """Convert sterior audio to mono. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'. + + Returns: + np.ndarray: `y` with mono channel. """ + if merge_type not in MERGE_TYPES: raise ParameterError( f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' @@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array: return y_out -def _safe_cast(y: array, dtype: Union[type, str]) -> array: - """ data type casting in a safe way, i.e., prevent overflow or underflow - This function is used internally. +def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Data type casting in a safe way, i.e., prevent overflow or underflow. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. 
""" - return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) + if 'float' in str(y.dtype): + return np.clip(y, np.finfo(dtype).min, + np.finfo(dtype).max).astype(dtype) + else: + return np.clip(y, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) -def depth_convert(y: array, dtype: Union[type, str], - dithering: bool=True) -> array: - """Convert audio array to target dtype safely - This function convert audio waveform to a target dtype, with addition steps of +def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of preventing overflow/underflow and preserving audio range. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. """ SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] @@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str], return y -def sound_file_load(file: str, +def sound_file_load(file: os.PathLike, offset: Optional[float]=None, dtype: str='int16', - duration: Optional[int]=None) -> Tuple[array, int]: - """Load audio using soundfile library - This function load audio file using libsndfile. - Reference: - http://www.mega-nerd.com/libsndfile/#Features + duration: Optional[int]=None) -> Tuple[np.ndarray, int]: + """Load audio using soundfile library. This function load audio file using libsndfile. + + Args: + file (os.PathLike): File of waveform. + offset (Optional[float], optional): Offset to the start of waveform. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'int16'. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. 
""" with sf.SoundFile(file) as sf_desc: sr_native = sf_desc.samplerate @@ -179,9 +217,17 @@ def sound_file_load(file: str, return y, sf_desc.samplerate -def normalize(y: array, norm_type: str='linear', - mul_factor: float=1.0) -> array: - """ normalize an input audio with additional multiplier. +def normalize(y: np.ndarray, norm_type: str='linear', + mul_factor: float=1.0) -> np.ndarray: + """Normalize an input audio with additional multiplier. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + mul_factor (float, optional): Scaling factor. Defaults to 1.0. + + Returns: + np.ndarray: `y` after normalization. """ if norm_type == 'linear': @@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear', return y -def save(y: array, sr: int, file: str) -> None: - """Save audio file to disk. - This function saves audio to disk using scipy.io.wavfile, with additional step - to convert input waveform to int16 unless it already is int16 - Notes: - It only support raw wav format. +def save(y: np.ndarray, sr: int, file: os.PathLike) -> None: + """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + sr (int): Sample rate. + file (os.PathLike): Path of auido file to save. """ if not file.endswith('.wav'): raise ParameterError( @@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None: def load( - file: str, + file: os.PathLike, sr: Optional[int]=None, mono: bool=True, merge_type: str='average', # ch0,ch1,random,average @@ -236,11 +283,24 @@ def load( offset: float=0.0, duration: Optional[int]=None, dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[array, int]: - """Load audio file from disk. - This function loads audio from disk using using audio beackend. 
- Parameters: - Notes: + resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: + """Load audio file from disk. This function loads audio from disk using the audio backend. + + Args: + file (os.PathLike): Path of audio file to load. + sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. + mono (bool, optional): Return waveform with mono channel. Defaults to True. + merge_type (str, optional): Merge type of multi-channel waveform. Defaults to 'average'. + normal (bool, optional): Waveform normalization. Defaults to True. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. + offset (float, optional): Offset to the start of waveform. Defaults to 0.0. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'float32'. + resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. """ y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index 8cb9b666..538be019 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor, """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0.
@@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames depends on frame_length and frame_shift. """ dtype = waveform.dtype @@ -422,7 +422,7 @@ def fbank(waveform: Tensor, """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -451,7 +451,7 @@ def fbank(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A filter banks tensor with shape (m, n_mels). + Tensor: A filter banks tensor with shape `(m, n_mels)`. """ dtype = waveform.dtype @@ -542,7 +542,7 @@ def mfcc(waveform: Tensor, identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. channel (int, optional): Select the channel of waveform. Defaults to -1. @@ -571,7 +571,7 @@ def mfcc(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). + Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. 
""" assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py index 167795c3..740584ca 100644 --- a/paddleaudio/paddleaudio/compliance/librosa.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -19,7 +19,6 @@ from typing import Union import numpy as np import scipy -from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided from scipy import signal @@ -32,7 +31,6 @@ __all__ = [ 'mfcc', 'hz_to_mel', 'mel_to_hz', - 'split_frames', 'mel_frequencies', 'power_to_db', 'compute_fbank_matrix', @@ -49,7 +47,8 @@ __all__ = [ ] -def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: +def _pad_center(data: np.ndarray, size: int, axis: int=-1, + **kwargs) -> np.ndarray: """Pad an array to a target length along a target axis. This differs from `np.pad` by centering the data prior to padding, @@ -69,8 +68,10 @@ def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: return np.pad(data, lengths, **kwargs) -def split_frames(x: array, frame_length: int, hop_length: int, - axis: int=-1) -> array: +def _split_frames(x: np.ndarray, + frame_length: int, + hop_length: int, + axis: int=-1) -> np.ndarray: """Slice a data array into (overlapping) frames. This function is aligned with librosa.frame @@ -142,11 +143,16 @@ def _check_audio(y, mono=True) -> bool: return True -def hz_to_mel(frequencies: Union[float, List[float], array], - htk: bool=False) -> array: - """Convert Hz to Mels +def hz_to_mel(frequencies: Union[float, List[float], np.ndarray], + htk: bool=False) -> np.ndarray: + """Convert Hz to Mels. - This function is aligned with librosa. + Args: + frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequency in mels. 
""" freq = np.asanyarray(frequencies) @@ -177,10 +183,16 @@ def hz_to_mel(frequencies: Union[float, List[float], array], return mels -def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: +def mel_to_hz(mels: Union[float, List[float], np.ndarray], + htk: int=False) -> np.ndarray: """Convert mel bin numbers to frequencies. - This function is aligned with librosa. + Args: + mels (Union[float, List[float], np.ndarray]): Frequency in mels. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequencies in Hz. """ mel_array = np.asanyarray(mels) @@ -212,10 +224,17 @@ def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: def mel_frequencies(n_mels: int=128, fmin: float=0.0, fmax: float=11025.0, - htk: bool=False) -> array: - """Compute mel frequencies + htk: bool=False) -> np.ndarray: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. - This function is aligned with librosa. + Returns: + np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`. """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(fmin, htk=htk) @@ -226,10 +245,15 @@ def mel_frequencies(n_mels: int=128, return mel_to_hz(mels, htk=htk) -def fft_frequencies(sr: int, n_fft: int) -> array: +def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: """Compute fourier frequencies. - This function is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + + Returns: + np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. 
""" return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) @@ -241,10 +265,22 @@ def compute_fbank_matrix(sr: int, fmax: Optional[float]=None, htk: bool=False, norm: str="slaney", - dtype: type=np.float32): + dtype: type=np.float32) -> np.ndarray: """Compute fbank matrix. - This funciton is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (str, optional): Type of normalization. Defaults to "slaney". + dtype (type, optional): Data type. Defaults to np.float32. + + + Returns: + np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. """ if norm != "slaney": raise ParameterError('norm must set to slaney') @@ -289,17 +325,28 @@ def compute_fbank_matrix(sr: int, return weights -def stft(x: array, +def stft(x: np.ndarray, n_fft: int=2048, hop_length: Optional[int]=None, win_length: Optional[int]=None, window: str="hann", center: bool=True, dtype: type=np.complex64, - pad_mode: str="reflect") -> array: + pad_mode: str="reflect") -> np.ndarray: """Short-time Fourier transform (STFT). - This function is aligned with librosa. + Args: + x (np.ndarray): Input waveform in one dimension. + n_fft (int, optional): FFT size. Defaults to 2048. + hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. + win_length (Optional[int], optional): The size of window. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
+ pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + + Returns: + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. """ _check_audio(x) @@ -314,7 +361,7 @@ def stft(x: array, fft_window = signal.get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size - fft_window = pad_center(fft_window, n_fft) + fft_window = _pad_center(fft_window, n_fft) # Reshape so that the window can be broadcast fft_window = fft_window.reshape((-1, 1)) @@ -333,7 +380,7 @@ def stft(x: array, ) # Window the time series. - x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length) + x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) # Pre-allocate the STFT matrix stft_matrix = np.empty( (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") @@ -352,16 +399,20 @@ def stft(x: array, return stft_matrix -def power_to_db(spect: array, +def power_to_db(spect: np.ndarray, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> array: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units + top_db: Optional[float]=80.0) -> np.ndarray: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - This computes the scaling ``10 * log10(spect / ref)`` in a numerically - stable way. + Args: + spect (np.ndarray): STFT power spectrogram of an input waveform. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. - This function is aligned with librosa. + Returns: + np.ndarray: Power spectrogram in db scale. 
""" spect = np.asarray(spect) @@ -394,49 +445,27 @@ def power_to_db(spect: array, return log_spec -def mfcc(x, +def mfcc(x: np.ndarray, sr: int=16000, - spect: Optional[array]=None, + spect: Optional[np.ndarray]=None, n_mfcc: int=20, dct_type: int=2, norm: str="ortho", lifter: int=0, - **kwargs) -> array: + **kwargs) -> np.ndarray: """Mel-frequency cepstral coefficients (MFCCs) - This function is NOT strictly aligned with librosa. The following example shows how to get the - same result with librosa: - - # mfcc: - kwargs = { - 'window_size':512, - 'hop_length':320, - 'mel_bins':64, - 'fmin':50, - 'to_db':False} - a = mfcc(x, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - - # librosa mfcc: - spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, fmin=50) - b = librosa.feature.mfcc(y=x, - sr=16000, - S=spect, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0) - - assert np.mean( (a-b)**2) < 1e-8 + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. + dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. + norm (str, optional): Type of normalization. Defaults to "ortho". + lifter (int, optional): Cepstral filtering. Defaults to 0. + Returns: + np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. 
""" if spect is None: spect = melspectrogram(x, sr=sr, **kwargs) @@ -454,12 +483,12 @@ def mfcc(x, f"MFCC lifter={lifter} must be a non-negative number") -def melspectrogram(x: array, +def melspectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, n_mels: int=64, - fmin: int=50, + fmin: float=50.0, fmax: Optional[float]=None, window: str='hann', center: bool=True, @@ -468,27 +497,28 @@ def melspectrogram(x: array, to_db: bool=True, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> array: + top_db: Optional[float]=None) -> np.ndarray: """Compute mel-spectrogram. - Parameters: - x: numpy.ndarray - The input wavform is a numpy array [shape=(n,)] - - window_size: int, typically 512, 1024, 2048, etc. - The window size for framing, also used as n_fft for stft - + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + n_mels (int, optional): Number of mel bins. Defaults to 64. + fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. + to_db (bool, optional): Enable db scale. Defaults to True. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. 
+ amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. Returns: - The mel-spectrogram in power scale or db scale(default) - - - Notes: - 1. sr is default to 16000, which is commonly used in speech/speaker processing. - 2. when fmax is None, it is set to sr//2. - 3. this function will convert mel spectgrum to db scale by default. This is different - that of librosa. - + np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`. """ _check_audio(x, mono=True) if len(x) <= 0: @@ -518,18 +548,28 @@ def melspectrogram(x: array, return mel_spect -def spectrogram(x: array, +def spectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, window: str='hann', center: bool=True, pad_mode: str='reflect', - power: float=2.0) -> array: - """Compute spectrogram from an input waveform. + power: float=2.0) -> np.ndarray: + """Compute spectrogram. + + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - This function is a wrapper for librosa.feature.stft, with addition step to - compute the magnitude of the complex spectrogram. + Returns: + np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`. 
""" s = stft( @@ -544,18 +584,16 @@ def spectrogram(x: array, return np.abs(s)**power -def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law encoding. - - Compute the mu-law decoding given an input code. - When quantized is True, the result will be converted to - integer in range [0,mu-1]. Otherwise, the resulting signal - is in range [-1,1] - +def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`. - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + x (np.ndarray): The input waveform to encode. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True. + Returns: + np.ndarray: The mu-law encoded waveform. """ mu = 255 y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) @@ -564,17 +602,16 @@ def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: return y -def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law decoding. - - Compute the mu-law decoding given an input code. +def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when quantize is True and `[-1,1]` otherwise. - it assumes that the input y is in - range [0,mu-1] when quantize is True and [-1,1] otherwise - - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + y (np.ndarray): The encoded waveform. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. 
Defaults to True. + Returns: + np.ndarray: The mu-law decoded waveform. """ if mu < 1: raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...') @@ -586,7 +623,7 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: return x -def randint(high: int) -> int: +def _randint(high: int) -> int: """Generate one random integer in range [0 high) This is a helper function for random data augmentaiton @@ -594,20 +631,18 @@ def randint(high: int) -> int: return int(np.random.randint(0, high=high)) -def rand() -> float: - """Generate one floating-point number in range [0 1) - - This is a helper function for random data augmentaiton - """ - return float(np.random.rand(1)) - - -def depth_augment(y: array, +def depth_augment(y: np.ndarray, choices: List=['int8', 'int16'], - probs: List[float]=[0.5, 0.5]) -> array: - """ Audio depth augmentation + probs: List[float]=[0.5, 0.5]) -> np.ndarray: + """ Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + choices (List, optional): A list of data types for depth conversion. Defaults to ['int8', 'int16']. + probs (List[float], optional): Probabilities for depth conversion. Defaults to [0.5, 0.5]. - Do audio depth augmentation to simulate the distortion brought by quantization. + Returns: + np.ndarray: The augmented waveform. """ assert len(probs) == len( choices @@ -621,13 +656,18 @@ def depth_augment(y: array, return y2 -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation +def adaptive_spect_augment(spect: np.ndarray, + tempo_axis: int=0, + level: float=0.1) -> np.ndarray: + """Do adaptive spectrogram augmentation. The level of the augmentation is governed by the parameter `level`, ranging from 0 to 1, with 0 representing no augmentation.
- The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + level (float, optional): The level factor of masking. Defaults to 0.1. + Returns: + np.ndarray: The augmented spectrogram. """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -643,32 +683,40 @@ def adaptive_spect_augment(spect: array, tempo_axis: int=0, if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def spect_augment(spect: array, +def spect_augment(spect: np.ndarray, tempo_axis: int=0, max_time_mask: int=3, max_freq_mask: int=3, max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis + max_freq_mask_width: int=20) -> np.ndarray: + """Do spectrogram augmentation in both time and freq axis. - Reference: + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + max_time_mask (int, optional): Maximum number of time masks. Defaults to 3. + max_freq_mask (int, optional): Maximum number of frequency masks. Defaults to 3. + max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
+ max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20. + Returns: + np.ndarray: The augmented spectrogram. """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -676,52 +724,64 @@ def spect_augment(spect: array, else: nf, nt = spect.shape - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) + num_time_mask = _randint(max_time_mask) + num_freq_mask = _randint(max_freq_mask) - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) + time_mask_width = _randint(max_time_mask_width) + freq_mask_width = _randint(max_freq_mask_width) if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal +def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray: + """ Random cropping on an input waveform. - The input is a 1d signal, typically a sound waveform + Args: + y (np.ndarray): Input waveform array in 1D. + crop_len (int): Length of waveform to crop. + + Returns: + np.ndarray: The cropped waveform.
""" if y.ndim != 1: 'only accept 1d tensor or numpy array' n = len(y) - idx = randint(n - crop_len) + idx = _randint(n - crop_len) return y[idx:idx + crop_len] -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. +def random_crop2d(s: np.ndarray, crop_len: int, + tempo_axis: int=0) -> np.ndarray: + """ Random cropping on a spectrogram. - The cropping is done in temporal direction on the time-freq input signal. + Args: + s (np.ndarray): Input spectrogram in 2D. + crop_len (int): Length of spectrogram to crop. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + + Returns: + np.ndarray: The cropped spectrogram. """ if tempo_axis >= s.ndim: raise ParameterError('axis out of range') n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) + idx = _randint(high=n - crop_len) sli = [slice(None) for i in range(s.ndim)] sli[tempo_axis] = slice(idx, idx + crop_len) out = s[tuple(sli)] diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 6afd234a..09037255 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -17,6 +17,7 @@ from typing import Union import paddle import paddle.nn as nn +from paddle import Tensor from ..functional import compute_fbank_matrix from ..functional import create_dct @@ -32,6 +33,20 @@ __all__ = [ class Spectrogram(nn.Layer): + """Compute spectrogram of given signals, typically audio waveforms. + The spectorgram is defined as the complex norm of the short-time Fourier transformation. + + Args: + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. 
If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \\times hop\\_length`. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, n_fft: int=512, hop_length: Optional[int]=None, @@ -40,34 +55,7 @@ class Spectrogram(nn.Layer): power: float=2.0, center: bool=True, pad_mode: str='reflect', - dtype: str=paddle.float32): - """Compute spectrogram of a given signal, typically an audio waveform. - The spectorgram is defined as the complex norm of the short-time - Fourier transformation. - Parameters: - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
- If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. The default value is 'reflect'. - dtype (str): the data type of input and window. - Notes: - The Spectrogram transform relies on STFT transform to compute the spectrogram. - By default, the weights are not learnable. To fine-tune the Fourier coefficients, - set stop_gradient=False before training. - For more information, see STFT(). - """ + dtype: str='float32') -> None: super(Spectrogram, self).__init__() assert power > 0, 'Power of spectrogram must be > 0.' @@ -88,13 +76,39 @@ class Spectrogram(nn.Layer): pad_mode=pad_mode) self.register_buffer('fft_window', self.fft_window) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`. + """ stft = self._stft(x) spectrogram = paddle.pow(paddle.abs(stft), self.power) return spectrogram class MelSpectrogram(nn.Layer): + """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. 
+ power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \\times hop\\_length`. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_fft: int=512, @@ -109,39 +123,7 @@ class MelSpectrogram(nn.Layer): f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute the melspectrogram of a given signal, typically an audio waveform. - The melspectrogram is also known as filterbank or fbank feature in audio community. - It is computed by multiplying spectrogram with Mel filter bank matrix. - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform.
- The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zeros. - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str='float32') -> None: super(MelSpectrogram, self).__init__() self._spectrogram = Spectrogram( @@ -171,13 +153,42 @@ class MelSpectrogram(nn.Layer): dtype=dtype) # float64 for better numerical results self.register_buffer('fbank_matrix', self.fbank_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`. + """ spect_feature = self._spectrogram(x) mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) return mel_feature class LogMelSpectrogram(nn.Layer): + """Compute log-mel-spectrogram feature of given signals, typically audio waveforms. 
+ + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \\times hop\\_length`. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+ """ + def __init__(self, sr: int=22050, n_fft: int=512, @@ -195,41 +206,7 @@ class LogMelSpectrogram(nn.Layer): ref_value: float=1.0, amin: float=1e-10, top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, - typically an audio waveform. - Parameters: - sr (int): the audio sample rate. - The default value is 22050. - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. 
- amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin). - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str='float32') -> None: super(LogMelSpectrogram, self).__init__() self._melspectrogram = MelSpectrogram( @@ -252,7 +229,14 @@ class LogMelSpectrogram(nn.Layer): self.amin = amin self.top_db = top_db - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`. + """ mel_feature = self._melspectrogram(x) log_mel_feature = power_to_db( mel_feature, @@ -263,6 +247,29 @@ class LogMelSpectrogram(nn.Layer): class MFCC(nn.Layer): + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+ center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \\times hop\\_length`. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_mfcc: int=40, @@ -281,43 +288,7 @@ class MFCC(nn.Layer): ref_value: float=1.0, amin: float=1e-10, top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. - - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None.
- window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - power (float): Exponent for the magnitude spectrogram. The default value is 2.0. - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. - amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin). - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
- """ + dtype: str=paddle.float32) -> None: super(MFCC, self).__init__() assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) @@ -342,7 +313,14 @@ class MFCC(nn.Layer): self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) self.register_buffer('dct_matrix', self.dct_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. + """ log_mel_feature = self._log_melspectrogram(x) mfcc = paddle.matmul( log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index c5ab3045..19c63a9a 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -17,6 +17,7 @@ from typing import Optional from typing import Union import paddle +from paddle import Tensor __all__ = [ 'hz_to_mel', @@ -29,19 +30,20 @@ __all__ = [ ] -def hz_to_mel(freq: Union[paddle.Tensor, float], - htk: bool=False) -> Union[paddle.Tensor, float]: +def hz_to_mel(freq: Union[Tensor, float], + htk: bool=False) -> Union[Tensor, float]: """Convert Hz to Mels. - Parameters: - freq: the input tensor of arbitrary shape, or a single floating point number. - htk: use HTK formula to do the conversion. - The default value is False. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in Mel-scale. + Union[Tensor, float]: Frequency in mels. 
""" if htk: - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): return 2595.0 * paddle.log10(1.0 + freq / 700.0) else: return 2595.0 * math.log10(1.0 + freq / 700.0) @@ -58,7 +60,7 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): target = min_log_mel + paddle.log( freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 mask = (freq > min_log_hz).astype(freq.dtype) @@ -71,14 +73,16 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], return mels -def mel_to_hz(mel: Union[float, paddle.Tensor], - htk: bool=False) -> Union[float, paddle.Tensor]: +def mel_to_hz(mel: Union[float, Tensor], + htk: bool=False) -> Union[float, Tensor]: """Convert mel bin numbers to frequencies. - Parameters: - mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. - htk: use HTK formula to do the conversion. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in hz. + Union[float, Tensor]: Frequencies in Hz. 
""" if htk: return 700.0 * (10.0**(mel / 2595.0) - 1.0) @@ -90,7 +94,7 @@ def mel_to_hz(mel: Union[float, paddle.Tensor], min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, paddle.Tensor): + if isinstance(mel, Tensor): target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) mask = (mel > min_log_mel).astype(mel.dtype) freqs = target * mask + freqs * ( @@ -106,16 +110,18 @@ def mel_frequencies(n_mels: int=64, f_min: float=0.0, f_max: float=11025.0, htk: bool=False, - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute mel frequencies. - Parameters: - n_mels(int): number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk(bool): whether to use htk formula. - dtype(str): the datatype of the return frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in Mel-scale + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(f_min, htk=htk) @@ -125,14 +131,16 @@ def mel_frequencies(n_mels: int=64, return freqs -def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): +def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: """Compute fourier frequencies. - Parameters: - sr(int): the audio sample rate. - n_fft(float): the number of fft bins. - dtype(str): the datatype of the return frequencies. 
+ + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in hz. + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. """ return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) @@ -144,23 +152,21 @@ def compute_fbank_matrix(sr: int, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute fbank matrix. - Parameters: - sr(int): the audio sample rate. - n_fft(int): the number of fft bins. - n_mels(int): the number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk: whether to use htk formula. - return_complex(bool): whether to return complex matrix. If True, the matrix will - be complex type. Otherwise, the real and image part will be stored in the last - axis of returned tensor. - dtype(str): the datatype of the returned fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - The fbank matrix of shape (n_mels, int(1+n_fft//2)). - Shape: - output: (n_mels, int(1+n_fft//2)) + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. 
""" if f_max is None: @@ -199,27 +205,20 @@ def compute_fbank_matrix(sr: int, return weights -def power_to_db(magnitude: paddle.Tensor, +def power_to_db(spect: Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> paddle.Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. - The function computes the scaling ``10 * log10(x / ref)`` in a numerically - stable way. - Parameters: - magnitude(Tensor): the input magnitude tensor of any shape. - ref_value(float): the reference value. If smaller than 1.0, the db level - of the signal will be pulled up accordingly. Otherwise, the db level - is pushed down. - amin(float): the minimum value of input magnitude, below which the input - magnitude is clipped(to amin). - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). + top_db: Optional[float]=None) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. + Returns: - The spectrogram in log-scale. - shape: - input: any shape - output: same as input + Tensor: Power spectrogram in db scale. 
""" if amin <= 0: raise Exception("amin must be strictly positive") @@ -227,8 +226,8 @@ def power_to_db(magnitude: paddle.Tensor, if ref_value <= 0: raise Exception("ref_value must be strictly positive") - ones = paddle.ones_like(magnitude) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) log_spec -= 10.0 * math.log10(max(ref_value, amin)) if top_db is not None: @@ -242,15 +241,17 @@ def power_to_db(magnitude: paddle.Tensor, def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]='ortho', - dtype: Optional[str]=paddle.float32) -> paddle.Tensor: + dtype: str='float32') -> Tensor: """Create a discrete cosine transform(DCT) matrix. - Parameters: + Args: n_mfcc (int): Number of mel frequency cepstral coefficients. n_mels (int): Number of mel filterbanks. - norm (str, optional): Normalizaiton type. Defaults to 'ortho'. + norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - Tensor: The DCT matrix with shape (n_mels, n_mfcc). + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. 
""" n = paddle.arange(n_mels, dtype=dtype) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) diff --git a/paddleaudio/paddleaudio/functional/window.py b/paddleaudio/paddleaudio/functional/window.py index f321b38e..c99d5046 100644 --- a/paddleaudio/paddleaudio/functional/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,24 +20,11 @@ from paddle import Tensor __all__ = [ 'get_window', - - # windows - 'taylor', - 'hamming', - 'hann', - 'tukey', - 'kaiser', - 'gaussian', - 'exponential', - 'triang', - 'bohman', - 'blackman', - 'cosine', ] -def _cat(a: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_a, data_type) for _a in a] +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] return paddle.concat(l) @@ -48,7 +35,7 @@ def _acosh(x: Union[Tensor, float]) -> Tensor: def _extend(M: int, sym: bool) -> bool: - """Extend window by 1 sample if needed for DFT-even symmetry""" + """Extend window by 1 sample if needed for DFT-even symmetry. """ if not sym: return M + 1, True else: @@ -56,7 +43,7 @@ def _extend(M: int, sym: bool) -> bool: def _len_guards(M: int) -> bool: - """Handle small or incorrect window lengths""" + """Handle small or incorrect window lengths. """ if int(M) != M or M < 0: raise ValueError('Window length M must be a non-negative integer') @@ -64,15 +51,15 @@ def _len_guards(M: int) -> bool: def _truncate(w: Tensor, needed: bool) -> Tensor: - """Truncate window by 1 sample if needed for DFT-even symmetry""" + """Truncate window by 1 sample if needed for DFT-even symmetry. """ if needed: return w[:-1] else: return w -def general_gaussian(M: int, p, sig, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a window with a generalized Gaussian shape. This function is consistent with scipy.signal.windows.general_gaussian(). 
""" @@ -86,8 +73,8 @@ def general_gaussian(M: int, p, sig, sym: bool=True, return _truncate(w, needs_trunc) -def general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generic weighted sum of cosine terms window. This function is consistent with scipy.signal.windows.general_cosine(). """ @@ -101,31 +88,23 @@ def general_cosine(M: int, a: float, sym: bool=True, return _truncate(w, needs_trunc) -def general_hamming(M: int, alpha: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generalized Hamming window. This function is consistent with scipy.signal.windows.general_hamming() """ - return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) + return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) -def taylor(M: int, - nbar=4, - sll=30, - norm=True, - sym: bool=True, - dtype: str='float64') -> Tensor: +def _taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Taylor window. The Taylor window taper function approximates the Dolph-Chebyshev window's constant sidelobe level for a parameterized number of near-in sidelobes. - Parameters: - M(int): window size - nbar, sil, norm: the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -171,46 +150,25 @@ def taylor(M: int, return _truncate(w, needs_trunc) -def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hamming window. 
The Hamming window is a taper formed by using a raised cosine with non-zero endpoints, optimized to minimize the nearest side lobe. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.54, sym, dtype=dtype) + return _general_hamming(M, 0.54, sym, dtype=dtype) -def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hann window. The Hann window is a taper formed by using a raised cosine or sine-squared with ends that touch zero. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.5, sym, dtype=dtype) + return _general_hamming(M, 0.5, sym, dtype=dtype) -def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: +def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Tukey window. The Tukey window is also known as a tapered cosine window. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -237,32 +195,18 @@ def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: +def _kaiser(M: int, beta: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Kaiser window. The Kaiser window is a taper formed by using a Bessel function. - Parameters: - M(int): window size. - beta(float): the window-specific parameter. 
- sym(bool):whether to return symmetric window. - The default value is True - Returns: - Tensor: the window tensor """ raise NotImplementedError() -def gaussian(M: int, std: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Gaussian window. The Gaussian widows has a Gaussian shape defined by the standard deviation(std). - Parameters: - M(int): window size. - std(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -275,21 +219,12 @@ def gaussian(M: int, std: float, sym: bool=True, return _truncate(w, needs_trunc) -def exponential(M: int, - center=None, - tau=1., - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute an exponential (or Poisson) window. - Parameters: - M(int): window size. - tau(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor - """ +def _exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. """ if sym and center is not None: raise ValueError("If sym==True, center must be None.") if _len_guards(M): @@ -305,15 +240,8 @@ def exponential(M: int, return _truncate(w, needs_trunc) -def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a triangular window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. 
- Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -330,16 +258,9 @@ def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Bohman window. The Bohman window is the autocorrelation of a cosine window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -353,32 +274,18 @@ def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Blackman window. The Blackman window is a taper formed by using the first three terms of a summation of cosines. It was designed to have close to the minimal leakage possible. It is close to optimal, only slightly worse than a Kaiser window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) -def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a window with a simple cosine shape. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. 
- Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -388,19 +295,20 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -## factory function def get_window(window: Union[str, Tuple[str, float]], win_length: int, fftbins: bool=True, dtype: str='float64') -> Tensor: """Return a window of a given length and type. - Parameters: - window(str|(str,float)): the type of window to create. - win_length(int): the number of samples in the window. - fftbins(bool): If True, create a "periodic" window. Otherwise, - create a "symmetric" window, for use in filter design. + + Args: + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + win_length (int): Number of samples. + fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. + dtype (str, optional): The data type of the return window. Defaults to 'float64'. + Returns: - The window represented as a tensor. + Tensor: The window represented as a tensor. """ sym = not fftbins @@ -420,7 +328,7 @@ def get_window(window: Union[str, Tuple[str, float]], str(type(window))) try: - winfunc = eval(winstr) + winfunc = eval('_' + winstr) except KeyError as e: raise ValueError("Unknown window type.") from e diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py index d27f56e2..c4dc7a28 100644 --- a/paddleaudio/paddleaudio/metric/dtw.py +++ b/paddleaudio/paddleaudio/metric/dtw.py @@ -20,9 +20,7 @@ __all__ = [ def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float: - """dtw distance - - Dynamic Time Warping. + """Dynamic Time Warping. This function keeps a compact matrix, not the full warping paths matrix. 
Uses dynamic programming to compute: diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index d77d27b0..064939a8 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -178,7 +178,8 @@ class BaseExecutor(ABC): Returns: bool: return `True` for job input, `False` otherwise. """ - return input_ and os.path.isfile(input_) and input_.endswith('.job') + return input_ and os.path.isfile(input_) and (input_.endswith('.job') or + input_.endswith('.txt')) def _get_job_contents( self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: