Add paddleaudio doc.

3 years ago · 43a9e39c05
parent 8dcaef9ae9
commit 43a9e39c05
2 changed files with 117 additions and 126 deletions
--- a/paddleaudio/paddleaudio/compliance/librosa.py
+++ b/paddleaudio/paddleaudio/compliance/librosa.py
@ -346,7 +346,7 @@ def stft(x: np.ndarray,
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
    Returns:
-        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`
+        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
    """
    _check_audio(x)
@ -465,7 +465,7 @@ def mfcc(x: np.ndarray,
        lifter (int, optional): Cepstral filtering. Defaults to 0.
    Returns:
-        np.ndarray: A mel frequency cepstral coefficients tensor with shape `(n_mfcc, num_frames)`.
+        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
    """
    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)
--- a/paddleaudio/paddleaudio/features/layers.py
+++ b/paddleaudio/paddleaudio/features/layers.py
@ -17,6 +17,7 @@ from typing import Union
 import paddle
 import paddle.nn as nn
 from paddle import Tensor
 from ..functional import compute_fbank_matrix
 from ..functional import create_dct
@ -32,6 +33,20 @@ __all__ = [
 class Spectrogram(nn.Layer):
    """Compute spectrogram of given signals, typically audio waveforms.
    The spectrogram is defined as the complex norm of the short-time Fourier transformation.
    Args:
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
@ -40,21 +55,7 @@ class Spectrogram(nn.Layer):
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
-                 dtype: str=paddle.float32):
+                 dtype: str='float32') -> None:
        """Compute spectrogram of a given signal, typically an audio waveform.
        The spectorgram is defined as the complex norm of the short-time
        Fourier transformation.
        Args:
            n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
            hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
            win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
            window (str, optional): The window function applied to the single before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
            power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
            center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
            pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
            dtype (str, optional): Data type of input and window. Defaults to paddle.float32.
        """
        super(Spectrogram, self).__init__()
        assert power > 0, 'Power of spectrogram must be > 0.'
@ -75,13 +76,39 @@ class Spectrogram(nn.Layer):
            pad_mode=pad_mode)
        self.register_buffer('fft_window', self.fft_window)
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`
        Returns:
            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
        """
        stft = self._stft(x)
        spectrogram = paddle.pow(paddle.abs(stft), self.power)
        return spectrogram
 class MelSpectrogram(nn.Layer):
    """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix.
    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
@ -96,39 +123,7 @@ class MelSpectrogram(nn.Layer):
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
-                 dtype: str=paddle.float32):
+                 dtype: str='float32') -> None:
        """Compute the melspectrogram of a given signal, typically an audio waveform.
        The melspectrogram is also known as filterbank or fbank feature in audio community.
        It is computed by multiplying spectrogram with Mel filter bank matrix.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zeros.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix.  Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(MelSpectrogram, self).__init__()
        self._spectrogram = Spectrogram(
@ -158,13 +153,42 @@ class MelSpectrogram(nn.Layer):
            dtype=dtype)  # float64 for better numerical results
        self.register_buffer('fbank_matrix', self.fbank_matrix)
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`
        Returns:
            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        spect_feature = self._spectrogram(x)
        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
        return mel_feature
 class LogMelSpectrogram(nn.Layer):
    """Compute log-mel-spectrogram feature of given signals, typically audio waveforms.
    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
@ -182,41 +206,7 @@ class LogMelSpectrogram(nn.Layer):
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
-                 dtype: str=paddle.float32):
+                 dtype: str='float32') -> None:
        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
        typically an audio waveform.
        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window (str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels (int): the mel bins.
            f_min (float): the lower cut-off frequency, below which the filter response is zero.
            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(LogMelSpectrogram, self).__init__()
        self._melspectrogram = MelSpectrogram(
@ -239,7 +229,14 @@ class LogMelSpectrogram(nn.Layer):
        self.amin = amin
        self.top_db = top_db
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`
        Returns:
            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        mel_feature = self._melspectrogram(x)
        log_mel_feature = power_to_db(
            mel_feature,
@ -250,6 +247,29 @@ class LogMelSpectrogram(nn.Layer):
 class MFCC(nn.Layer):
    """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """
    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
@ -268,43 +288,7 @@ class MFCC(nn.Layer):
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
-                 dtype: str=paddle.float32):
+                 dtype: str=paddle.float32) -> None:
        """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window (str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels (int): the mel bins.
            f_min (float): the lower cut-off frequency, below which the filter response is zero.
            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)
@ -329,7 +313,14 @@ class MFCC(nn.Layer):
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`
        Returns:
            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
        """
        log_mel_feature = self._log_melspectrogram(x)
        mfcc = paddle.matmul(
            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(