Merge pull request #1494 from PaddlePaddle/audio

[audio] refactor audio arch
3 years ago · d0bca1982e
parent 2886ab9373 d70bcb8f7a
commit d0bca1982e
35 changed files with 1675 additions and 717 deletions
--- a/.gitignore
+++ b/.gitignore
@ -14,6 +14,7 @@
 *.whl
 *.egg-info
 build
 *output/
 docs/build/
 docs/topic/ctc/warp-ctc/
@ -34,5 +35,3 @@ tools/miniconda.sh
 tools/CRF++-0.58/
 speechx/fc_patch/
 *output/
--- a/paddleaudio/CHANGELOG.md
+++ b/paddleaudio/CHANGELOG.md
@ -1 +1,5 @@
 # Changelog
 Date: 2022-2-25, Author: Hui Zhang.
  - Refactor architecture.
  - dtw distance and mcd style dtw
--- a/paddleaudio/features/augment.py
+++ b/paddleaudio/features/augment.py
@ -1,170 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 import numpy as np
 from numpy import ndarray as array
 from ..backends import depth_convert
 from ..utils import ParameterError
 __all__ = [
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
 ]
 def randint(high: int) -> int:
    """Generate one random integer in range [0 high)
     This is a helper function for random data augmentaiton
    """
    return int(np.random.randint(0, high=high))
 def rand() -> float:
    """Generate one floating-point number in range [0 1)
    This is a helper function for random data augmentaiton
    """
    return float(np.random.rand(1))
 def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """ Audio depth augmentation
    Do audio depth augmentation to simulate the distortion brought by quantization.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))
    depth = np.random.choice(choices, p=probs)
    src_depth = y.dtype
    y1 = depth_convert(y, depth)
    y2 = depth_convert(y1, src_depth)
    return y2
 def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adpative spectrogram augmentation
    The level of the augmentation is gowern by the paramter level,
    ranging from 0 to 1, with 0 represents no augmentation。
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape
    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)
    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0
    return spect
 def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation in both time and freq axis
    Reference:
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape
    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)
    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)
    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0
    return spect
 def random_crop1d(y: array, crop_len: int) -> array:
    """ Do random cropping on 1d input signal
    The input is a 1d signal, typically a sound waveform
    """
    if y.ndim != 1:
        'only accept 1d tensor or numpy array'
    n = len(y)
    idx = randint(n - crop_len)
    return y[idx:idx + crop_len]
 def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """ Do random cropping for 2D array, typically a spectrogram.
    The cropping is done in temporal direction on the time-freq input signal.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')
    n = s.shape[tempo_axis]
    idx = randint(high=n - crop_len)
    sli = [slice(None) for i in range(s.ndim)]
    sli[tempo_axis] = slice(idx, idx + crop_len)
    out = s[tuple(sli)]
    return out
--- a/paddleaudio/features/spectrum.py
+++ b/paddleaudio/features/spectrum.py
@ -1,461 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 from functools import partial
 from typing import Optional
 from typing import Union
 import paddle
 import paddle.nn as nn
 from .window import get_window
 __all__ = [
    'Spectrogram',
    'MelSpectrogram',
    'LogMelSpectrogram',
 ]
 def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert Hz to Mels.
    Parameters:
        freq: the input tensor of arbitrary shape, or a single floating point number.
        htk: use HTK formula to do the conversion.
            The default value is False.
    Returns:
        The frequencies represented in Mel-scale.
    """
    if htk:
        if isinstance(freq, paddle.Tensor):
            return 2595.0 * paddle.log10(1.0 + freq / 700.0)
        else:
            return 2595.0 * math.log10(1.0 + freq / 700.0)
    # Fill in the linear part
    f_min = 0.0
    f_sp = 200.0 / 3
    mels = (freq - f_min) / f_sp
    # Fill in the log-scale part
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region
    if isinstance(freq, paddle.Tensor):
        target = min_log_mel + paddle.log(
            freq / min_log_hz + 1e-10) / logstep  # prevent nan with 1e-10
        mask = (freq > min_log_hz).astype(freq.dtype)
        mels = target * mask + mels * (
            1 - mask)  # will replace by masked_fill OP in future
    else:
        if freq >= min_log_hz:
            mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return mels
 def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert mel bin numbers to frequencies.
    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
        htk: use HTK formula to do the conversion.
    Returns:
        The frequencies represented in hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel
    # And now the nonlinear scale
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region
    if isinstance(mel, paddle.Tensor):
        target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        mask = (mel > min_log_mel).astype(mel.dtype)
        freqs = target * mask + freqs * (
            1 - mask)  # will replace by masked_fill OP in future
    else:
        if mel >= min_log_mel:
            freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return freqs
 def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute mel frequencies.
    Parameters:
        n_mels(int): number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk(bool): whether to use htk formula.
        dtype(str): the datatype of the return frequencies.
    Returns:
        The frequencies represented in Mel-scale
    """
    # 'Center freqs' of mel bands - uniformly spaced between limits
    min_mel = hz_to_mel(f_min, htk=htk)
    max_mel = hz_to_mel(f_max, htk=htk)
    mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
    freqs = mel_to_hz(mels, htk=htk)
    return freqs
 def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute fourier frequencies.
    Parameters:
        sr(int): the audio sample rate.
        n_fft(float): the number of fft bins.
        dtype(str): the datatype of the return frequencies.
    Returns:
        The frequencies represented in hz.
    """
    return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
 def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute fbank matrix.
    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk: whether to use htk formula.
        return_complex(bool): whether to return complex matrix. If True, the matrix will
            be complex type. Otherwise, the real and image part will be stored in the last
            axis of returned tensor.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2
    # Initialize the weights
    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
    fdiff = mel_f[1:] - mel_f[:-1]  #np.diff(mel_f)
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
    #ramps = np.subtract.outer(mel_f, fftfreqs)
    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]
        # .. then intersect them with each other and zero
        weights[i] = paddle.maximum(
            paddle.zeros_like(lower), paddle.minimum(lower, upper))
    # Slaney-style mel is scaled to be approx constant energy per channel
    if norm == 'slaney':
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, int) or isinstance(norm, float):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
    return weights
 def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.
    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db level
            of the signal will be pulled up accordingly. Otherwise, the db level
            is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin).
        top_db(float): the maximum db value of resulting spectrum, above which the
            spectrum is clipped(to top_db).
    Returns:
        The spectrogram in log-scale.
    shape:
        input: any shape
        output: same as input
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")
    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")
    ones = paddle.ones_like(magnitude)
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))
    if top_db is not None:
        if top_db < 0:
            raise Exception("top_db must be non-negative")
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
    return log_spec
 class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute spectrogram of a given signal, typically an audio waveform.
        The spectorgram is defined as the complex norm of the short-time
        Fourier transformation.
        Parameters:
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'. The default value is 'reflect'.
            dtype(str): the data type of input and window.
        Notes:
            The Spectrogram transform relies on STFT transform to compute the spectrogram.
            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
            set stop_gradient=False before training.
            For more information, see STFT().
        """
        super(Spectrogram, self).__init__()
        if win_length is None:
            win_length = n_fft
        fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
        self._stft = partial(
            paddle.signal.stft,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=fft_window,
            center=center,
            pad_mode=pad_mode)
    def forward(self, x):
        stft = self._stft(x)
        spectrogram = paddle.square(paddle.abs(stft))
        return spectrogram
 class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Compute the melspectrogram of a given signal, typically an audio waveform.
        The melspectrogram is also known as filterbank or fbank feature in audio community.
        It is computed by multiplying spectrogram with Mel filter bank matrix.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zeros.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix.  Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(MelSpectrogram, self).__init__()
        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm
        if f_max is None:
            f_max = sr // 2
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)  # float64 for better numerical results
        self.register_buffer('fbank_matrix', self.fbank_matrix)
    def forward(self, x):
        spect_feature = self._spectrogram(x)
        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
        return mel_feature
 class LogMelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
        typically an audio waveform.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zeros.
            ref_value(float): the reference value. If smaller than 1.0, the db level
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
            amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
                Otherwise, the db level is pushed down.
                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db(float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
        """
        super(LogMelSpectrogram, self).__init__()
        self._melspectrogram = MelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db
    def forward(self, x):
        # import ipdb; ipdb.set_trace()
        mel_feature = self._melspectrogram(x)
        log_mel_feature = power_to_db(
            mel_feature,
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
        return log_mel_feature
--- a/paddleaudio/paddleaudio/init.py
+++ b/paddleaudio/paddleaudio/init.py
@ -0,0 +1,22 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from . import compliance
 from . import datasets
 from . import features
 from . import functional
 from . import io
 from . import metric
 from . import sox_effects
 from .backends import load
 from .backends import save
--- a/paddleaudio/paddleaudio/backends/init.py
+++ b/paddleaudio/paddleaudio/backends/init.py
@ -0,0 +1,19 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .soundfile_backend import depth_convert
 from .soundfile_backend import load
 from .soundfile_backend import normalize
 from .soundfile_backend import resample
 from .soundfile_backend import save
 from .soundfile_backend import to_mono
--- a/paddleaudio/paddleaudio/backends/soundfile_backend.py
+++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -29,7 +29,7 @@ __all__ = [
    'to_mono',
    'depth_convert',
    'normalize',
-    'save_wav',
+    'save',
    'load',
 ]
 NORMALMIZE_TYPES = ['linear', 'gaussian']
@ -41,12 +41,9 @@ EPS = 1e-8
 def resample(y: array, src_sr: int, target_sr: int,
             mode: str='kaiser_fast') -> array:
    """ Audio resampling
     This function is the same as using resampy.resample().
     Notes:
        The default mode is kaiser_fast.  For better audio quality, use mode = 'kaiser_fast'
     """
    if mode == 'kaiser_best':
@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array:
 def _safe_cast(y: array, dtype: Union[type, str]) -> array:
    """ data type casting in a safe way, i.e., prevent overflow or underflow
    This function is used internally.
    """
    return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array:
 def depth_convert(y: array, dtype: Union[type, str],
                  dithering: bool=True) -> array:
    """Convert audio array to target dtype safely
    This function convert audio waveform to a target dtype, with addition steps of
    preventing overflow/underflow and preserving audio range.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
@ -168,12 +162,9 @@ def sound_file_load(file: str,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[array, int]:
    """Load audio using soundfile library
    This function load audio file using libsndfile.
    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    """
    with sf.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
@ -188,33 +179,9 @@ def sound_file_load(file: str,
    return y, sf_desc.samplerate
 def audio_file_load():
    """Load audio using audiofile library
    This function load audio file using audiofile.
    Reference:
        https://audiofile.68k.org/
    """
    raise NotImplementedError()
 def sox_file_load():
    """Load audio using sox library
    This function load audio file using sox.
    Reference:
        http://sox.sourceforge.net/
    """
    raise NotImplementedError()
 def normalize(y: array, norm_type: str='linear',
              mul_factor: float=1.0) -> array:
    """ normalize an input audio with additional multiplier.
    """
    if norm_type == 'linear':
@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear',
    return y
-def save_wav(y: array, sr: int, file: str) -> None:
+def save(y: array, sr: int, file: str) -> None:
    """Save audio file to disk.
    This function saves audio to disk using scipy.io.wavfile, with additional step
    to convert input waveform to int16 unless it already is int16
    Notes:
        It only support raw wav format.
    """
    if not file.endswith('.wav'):
        raise ParameterError(
@ -274,11 +239,8 @@ def load(
        resample_mode: str='kaiser_fast') -> Tuple[array, int]:
    """Load audio file from disk.
    This function loads audio from disk using using audio beackend.
    Parameters:
    Notes:
    """
    y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
--- a/paddleaudio/paddleaudio/backends/sox_backend.py
+++ b/paddleaudio/paddleaudio/backends/sox_backend.py
@ -0,0 +1,13 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddleaudio/paddleaudio/compliance/init.py
+++ b/paddleaudio/paddleaudio/compliance/init.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .backends import *
 from .features import *
--- a/paddleaudio/paddleaudio/compliance/kaldi.py
+++ b/paddleaudio/paddleaudio/compliance/kaldi.py
@ -0,0 +1,638 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from torchaudio(https://github.com/pytorch/audio)
 import math
 from typing import Tuple
 import paddle
 from paddle import Tensor
 from ..functional import create_dct
 from ..functional.window import get_window
 __all__ = [
    'spectrogram',
    'fbank',
    'mfcc',
 ]
 # window types
 HANNING = 'hann'
 HAMMING = 'hamming'
 POVEY = 'povey'
 RECTANGULAR = 'rect'
 BLACKMAN = 'blackman'
 def _get_epsilon(dtype):
    return paddle.to_tensor(1e-07, dtype=dtype)
 def _next_power_of_2(x: int) -> int:
    return 1 if x == 0 else 2**(x - 1).bit_length()
 def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    assert waveform.dim() == 1
    num_samples = waveform.shape[0]
    if snip_edges:
        if num_samples < window_size:
            return paddle.empty((0, 0), dtype=waveform.dtype)
        else:
            m = 1 + (num_samples - window_size) // window_shift
    else:
        reversed_waveform = paddle.flip(waveform, [0])
        m = (num_samples + (window_shift // 2)) // window_shift
        pad = window_size // 2 - window_shift // 2
        pad_right = reversed_waveform
        if pad > 0:
            pad_left = reversed_waveform[-pad:]
            waveform = paddle.concat((pad_left, waveform, pad_right), axis=0)
        else:
            waveform = paddle.concat((waveform[-pad:], pad_right), axis=0)
    return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T
 def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)
    elif window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
    elif window_type == POVEY:
        return get_window(
            'hann', window_size, fftbins=False, dtype=dtype).pow(0.85)
    elif window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)
    elif window_type == BLACKMAN:
        a = 2 * math.pi / (window_size - 1)
        window_function = paddle.arange(window_size, dtype=dtype)
        return (blackman_coeff - 0.5 * paddle.cos(a * window_function) +
                (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function)
                ).astype(dtype)
    else:
        raise Exception('Invalid window type ' + window_type)
 def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log()
    if energy_floor == 0.0:
        return log_energy
    return paddle.maximum(
        log_energy,
        paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype))
 def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    channel = max(channel, 0)
    assert channel < waveform.shape[0], (
        'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
    waveform = waveform[channel, :]  # size (n)
    window_shift = int(
        sr * frame_shift *
        0.001)  # pass frame_shift and frame_length in milliseconds
    window_size = int(sr * frame_length * 0.001)
    padded_window_size = _next_power_of_2(
        window_size) if round_to_power_of_two else window_size
    assert 2 <= window_size <= len(waveform), (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         len(waveform)))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
                                        ' use `round_to_power_of_two` or change `frame_length`'
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
 def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)
    # (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift,
                                 snip_edges)
    if dither != 0.0:
        x = paddle.maximum(epsilon,
                           paddle.rand(strided_input.shape, dtype=dtype))
        rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
        strided_input = strided_input + rand_gauss * dither
    if remove_dc_offset:
        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
        strided_input = strided_input - row_means
    if raw_energy:
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)
    if preemphasis_coefficient != 0.0:
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
                                                                                       -1]
    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    strided_input = strided_input * window_function  # (m, window_size)
    # (m, padded_window_size)
    if padded_window_size != window_size:
        padding_right = padded_window_size - window_size
        strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)
    if not raw_energy:
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # size (m)
    return strided_input, signal_log_energy
 def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    if subtract_mean:
        col_means = paddle.mean(tensor, axis=0).unsqueeze(0)
        tensor = tensor - col_means
    return tensor
 def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant . Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
    Returns:
        Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames
            depends on frame_length and frame_shift.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)
    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
        preemphasis_coefficient)
    strided_input, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)
    # (m, padded_window_size // 2 + 1, 2)
    fft = paddle.fft.rfft(strided_input)
    power_spectrum = paddle.maximum(
        fft.abs().pow(2.), epsilon).log()  # (m, padded_window_size // 2 + 1)
    power_spectrum[:, 0] = signal_log_energy
    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
    return power_spectrum
 def _inverse_mel_scale_scalar(mel_freq: float) -> float:
    return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
 def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
 def _mel_scale_scalar(freq: float) -> float:
    return 1127.0 * math.log(1.0 + freq / 700.0)
 def _mel_scale(freq: Tensor) -> Tensor:
    return 1127.0 * (1.0 + freq / 700.0).log()
 def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    Fl = scale * l
    Fh = scale * h
    assert l > low_freq and h < high_freq
    scale_left = (Fl - low_freq) / (l - low_freq)
    scale_right = (high_freq - Fh) / (high_freq - h)
    res = paddle.empty_like(freq)
    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
    before_l = paddle.less_than(freq, paddle.to_tensor(l))
    before_h = paddle.less_than(freq, paddle.to_tensor(h))
    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
    res[before_h] = scale * freq[before_h]
    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
    res[outside_low_high_freq] = freq[outside_low_high_freq]
    return res
 def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    return _mel_scale(
        _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
                        vtln_warp_factor, _inverse_mel_scale(mel_freq)))
 def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    assert num_bins > 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq
    if high_freq <= 0.0:
        high_freq += nyquist
    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))
    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
    if vtln_high < 0.0:
        vtln_high += nyquist
    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))
    bin = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)
    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
    # (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
    # (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)
    if vtln_warp_factor == 1.0:
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]
    return bins, center_freqs
 def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant . Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
        use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
    Returns:
        Tensor: A filter banks tensor with shape (m, n_mels).
    """
    dtype = waveform.dtype
    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
        preemphasis_coefficient)
    strided_input, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)
    # (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(strided_input).abs()
    if use_power:
        spectrum = spectrum.pow(2.)
    # (n_mels, padded_window_size // 2)
    mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                     high_freq, vtln_low, vtln_high, vtln_warp)
    mel_energies = mel_energies.astype(dtype)
    # (n_mels, padded_window_size // 2 + 1)
    mel_energies = paddle.nn.functional.pad(
        mel_energies.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)
    # (m, n_mels)
    mel_energies = paddle.mm(spectrum, mel_energies.T)
    if use_log_fbank:
        mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()
    if use_energy:
        signal_log_energy = signal_log_energy.unsqueeze(1)
        if htk_compat:
            mel_energies = paddle.concat(
                (mel_energies, signal_log_energy), axis=1)
        else:
            mel_energies = paddle.concat(
                (signal_log_energy, mel_energies), axis=1)
    # (m, n_mels + 1)
    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
    return mel_energies
 def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    dct_matrix = create_dct(n_mels, n_mels, 'ortho')
    dct_matrix[:, 0] = math.sqrt(1 / float(n_mels))
    dct_matrix = dct_matrix[:, :n_mfcc]  # (n_mels, n_mfcc)
    return dct_matrix
 def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    i = paddle.arange(n_mfcc)
    return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i /
                                                    cepstral_lifter)
 def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute and return mel frequency cepstral coefficients from a waveform. The output is
            identical to Kaldi's.
    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
        cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant . Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
        use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc).
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)
    dtype = waveform.dtype
    # (m, n_mels + use_energy)
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)
    if use_energy:
        # (m)
        signal_log_energy = feature[:, n_mels if htk_compat else 0]
        mel_offset = int(not htk_compat)
        feature = feature[:, mel_offset:(n_mels + mel_offset)]
    # (n_mels, n_mfcc)
    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
    # (m, n_mfcc)
    feature = feature.matmul(dct_matrix)
    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.astype(dtype=dtype)
    if use_energy:
        feature[:, 0] = signal_log_energy
    if htk_compat:
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            energy *= math.sqrt(2)
        feature = paddle.concat((feature, energy), axis=1)
    feature = _subtract_column_mean(feature, subtract_mean)
    return feature
--- a/paddleaudio/paddleaudio/compliance/librosa.py
+++ b/paddleaudio/paddleaudio/compliance/librosa.py
@ -21,11 +21,13 @@ import numpy as np
 import scipy
 from numpy import ndarray as array
 from numpy.lib.stride_tricks import as_strided
-from scipy.signal import get_window
+from scipy import signal
 from ..backends import depth_convert
 from ..utils import ParameterError
 __all__ = [
    # dsp
    'stft',
    'mfcc',
    'hz_to_mel',
@ -38,6 +40,12 @@ __all__ = [
    'spectrogram',
    'mu_encode',
    'mu_decode',
    # augmentation
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
 ]
@ -303,7 +311,7 @@ def stft(x: array,
    if hop_length is None:
        hop_length = int(win_length // 4)
-    fft_window = get_window(window, win_length, fftbins=True)
+    fft_window = signal.get_window(window, win_length, fftbins=True)
    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)
@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
 def randint(high: int) -> int:
    """Generate one random integer in range [0 high)
     This is a helper function for random data augmentaiton
    """
    return int(np.random.randint(0, high=high))
 def rand() -> float:
    """Generate one floating-point number in range [0 1)
    This is a helper function for random data augmentaiton
    """
    return float(np.random.rand(1))
 def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """ Audio depth augmentation
    Do audio depth augmentation to simulate the distortion brought by quantization.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))
    depth = np.random.choice(choices, p=probs)
    src_depth = y.dtype
    y1 = depth_convert(y, depth)
    y2 = depth_convert(y1, src_depth)
    return y2
 def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adpative spectrogram augmentation
    The level of the augmentation is gowern by the paramter level,
    ranging from 0 to 1, with 0 represents no augmentation。
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape
    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)
    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0
    return spect
 def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation in both time and freq axis
    Reference:
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape
    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)
    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)
    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0
    return spect
 def random_crop1d(y: array, crop_len: int) -> array:
    """ Do random cropping on 1d input signal
    The input is a 1d signal, typically a sound waveform
    """
    if y.ndim != 1:
        'only accept 1d tensor or numpy array'
    n = len(y)
    idx = randint(n - crop_len)
    return y[idx:idx + crop_len]
 def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """ Do random cropping for 2D array, typically a spectrogram.
    The cropping is done in temporal direction on the time-freq input signal.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')
    n = s.shape[tempo_axis]
    idx = randint(high=n - crop_len)
    sli = [slice(None) for i in range(s.ndim)]
    sli[tempo_axis] = slice(idx, idx + crop_len)
    out = s[tuple(sli)]
    return out
--- a/paddleaudio/paddleaudio/datasets/init.py
+++ b/paddleaudio/paddleaudio/datasets/init.py
@ -15,10 +15,3 @@ from .esc50 import ESC50
 from .gtzan import GTZAN
 from .tess import TESS
 from .urban_sound import UrbanSound8K
 __all__ = [
    'ESC50',
    'UrbanSound8K',
    'GTZAN',
    'TESS',
 ]
--- a/paddleaudio/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/paddleaudio/datasets/dataset.py
@ -17,8 +17,8 @@ import numpy as np
 import paddle
 from ..backends import load as load_audio
-from ..features import melspectrogram
+from ..compliance.librosa import melspectrogram
-from ..features import mfcc
+from ..compliance.librosa import mfcc
 feat_funcs = {
    'raw': None,
--- a/paddleaudio/paddleaudio/datasets/esc50.py
+++ b/paddleaudio/paddleaudio/datasets/esc50.py
--- a/paddleaudio/paddleaudio/datasets/gtzan.py
+++ b/paddleaudio/paddleaudio/datasets/gtzan.py
--- a/paddleaudio/paddleaudio/datasets/tess.py
+++ b/paddleaudio/paddleaudio/datasets/tess.py
--- a/paddleaudio/paddleaudio/datasets/urban_sound.py
+++ b/paddleaudio/paddleaudio/datasets/urban_sound.py
--- a/paddleaudio/paddleaudio/features/init.py
+++ b/paddleaudio/paddleaudio/features/init.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .augment import *
+from .layers import LogMelSpectrogram
-from .core import *
+from .layers import MelSpectrogram
-from .spectrum import *
+from .layers import MFCC
 from .layers import Spectrogram
--- a/paddleaudio/paddleaudio/features/layers.py
+++ b/paddleaudio/paddleaudio/features/layers.py
@ -0,0 +1,344 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from functools import partial
 from typing import Optional
 from typing import Union
 import paddle
 import paddle.nn as nn
 from ..functional import compute_fbank_matrix
 from ..functional import create_dct
 from ..functional import power_to_db
 from ..functional.window import get_window
 __all__ = [
    'Spectrogram',
    'MelSpectrogram',
    'LogMelSpectrogram',
    'MFCC',
 ]
 class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute spectrogram of a given signal, typically an audio waveform.
        The spectorgram is defined as the complex norm of the short-time
        Fourier transformation.
        Parameters:
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window (str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'. The default value is 'reflect'.
            dtype (str): the data type of input and window.
        Notes:
            The Spectrogram transform relies on STFT transform to compute the spectrogram.
            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
            set stop_gradient=False before training.
            For more information, see STFT().
        """
        super(Spectrogram, self).__init__()
        if win_length is None:
            win_length = n_fft
        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)
        self._stft = partial(
            paddle.signal.stft,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self.register_buffer('fft_window', self.fft_window)
    def forward(self, x):
        stft = self._stft(x)
        spectrogram = paddle.square(paddle.abs(stft))
        return spectrogram
 class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Compute the melspectrogram of a given signal, typically an audio waveform.
        The melspectrogram is also known as filterbank or fbank feature in audio community.
        It is computed by multiplying spectrogram with Mel filter bank matrix.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zeros.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix.  Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(MelSpectrogram, self).__init__()
        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm
        if f_max is None:
            f_max = sr // 2
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)  # float64 for better numerical results
        self.register_buffer('fbank_matrix', self.fbank_matrix)
    def forward(self, x):
        spect_feature = self._spectrogram(x)
        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
        return mel_feature
 class LogMelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
        typically an audio waveform.
        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window (str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels (int): the mel bins.
            f_min (float): the lower cut-off frequency, below which the filter response is zero.
            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value (float): the reference value. If smaller than 1.0, the db level
            amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
                Otherwise, the db level is pushed down.
                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(LogMelSpectrogram, self).__init__()
        self._melspectrogram = MelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db
    def forward(self, x):
        # import ipdb; ipdb.set_trace()
        mel_feature = self._melspectrogram(x)
        log_mel_feature = power_to_db(
            mel_feature,
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
        return log_mel_feature
 class MFCC(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 2048,
            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
                The default value is None.
            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
                The default value is None.
            window (str): the name of the window function applied to the single before the Fourier transform.
                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
                and 'constant'.
                The default value is 'reflect'.
            n_mels (int): the mel bins.
            f_min (float): the lower cut-off frequency, below which the filter response is zero.
            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value (float): the reference value. If smaller than 1.0, the db level
            amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
                Otherwise, the db level is pushed down.
                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
        """
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)
        self._log_melspectrogram = LogMelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)
    def forward(self, x):
        log_mel_feature = self._log_melspectrogram(x)
        mfcc = paddle.matmul(
            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
                (0, 2, 1))  # (B, n_mels, L)
        return mfcc
--- a/paddleaudio/paddleaudio/functional/init.py
+++ b/paddleaudio/paddleaudio/functional/init.py
@ -0,0 +1,20 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .functional import compute_fbank_matrix
 from .functional import create_dct
 from .functional import fft_frequencies
 from .functional import hz_to_mel
 from .functional import mel_frequencies
 from .functional import mel_to_hz
 from .functional import power_to_db
--- a/paddleaudio/paddleaudio/functional/functional.py
+++ b/paddleaudio/paddleaudio/functional/functional.py
@ -0,0 +1,265 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from librosa(https://github.com/librosa/librosa)
 import math
 from typing import Optional
 from typing import Union
 import paddle
 __all__ = [
    'hz_to_mel',
    'mel_to_hz',
    'mel_frequencies',
    'fft_frequencies',
    'compute_fbank_matrix',
    'power_to_db',
    'create_dct',
 ]
 def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert Hz to Mels.
    Parameters:
        freq: the input tensor of arbitrary shape, or a single floating point number.
        htk: use HTK formula to do the conversion.
            The default value is False.
    Returns:
        The frequencies represented in Mel-scale.
    """
    if htk:
        if isinstance(freq, paddle.Tensor):
            return 2595.0 * paddle.log10(1.0 + freq / 700.0)
        else:
            return 2595.0 * math.log10(1.0 + freq / 700.0)
    # Fill in the linear part
    f_min = 0.0
    f_sp = 200.0 / 3
    mels = (freq - f_min) / f_sp
    # Fill in the log-scale part
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region
    if isinstance(freq, paddle.Tensor):
        target = min_log_mel + paddle.log(
            freq / min_log_hz + 1e-10) / logstep  # prevent nan with 1e-10
        mask = (freq > min_log_hz).astype(freq.dtype)
        mels = target * mask + mels * (
            1 - mask)  # will replace by masked_fill OP in future
    else:
        if freq >= min_log_hz:
            mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return mels
 def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert mel bin numbers to frequencies.
    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
        htk: use HTK formula to do the conversion.
    Returns:
        The frequencies represented in hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel
    # And now the nonlinear scale
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region
    if isinstance(mel, paddle.Tensor):
        target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        mask = (mel > min_log_mel).astype(mel.dtype)
        freqs = target * mask + freqs * (
            1 - mask)  # will replace by masked_fill OP in future
    else:
        if mel >= min_log_mel:
            freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return freqs
 def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute mel frequencies.
    Parameters:
        n_mels(int): number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk(bool): whether to use htk formula.
        dtype(str): the datatype of the return frequencies.
    Returns:
        The frequencies represented in Mel-scale
    """
    # 'Center freqs' of mel bands - uniformly spaced between limits
    min_mel = hz_to_mel(f_min, htk=htk)
    max_mel = hz_to_mel(f_max, htk=htk)
    mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
    freqs = mel_to_hz(mels, htk=htk)
    return freqs
 def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute fourier frequencies.
    Parameters:
        sr(int): the audio sample rate.
        n_fft(float): the number of fft bins.
        dtype(str): the datatype of the return frequencies.
    Returns:
        The frequencies represented in hz.
    """
    return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
 def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute fbank matrix.
    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk: whether to use htk formula.
        return_complex(bool): whether to return complex matrix. If True, the matrix will
            be complex type. Otherwise, the real and image part will be stored in the last
            axis of returned tensor.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2
    # Initialize the weights
    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
    fdiff = mel_f[1:] - mel_f[:-1]  #np.diff(mel_f)
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
    #ramps = np.subtract.outer(mel_f, fftfreqs)
    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]
        # .. then intersect them with each other and zero
        weights[i] = paddle.maximum(
            paddle.zeros_like(lower), paddle.minimum(lower, upper))
    # Slaney-style mel is scaled to be approx constant energy per channel
    if norm == 'slaney':
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, int) or isinstance(norm, float):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
    return weights
 def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.
    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db level
            of the signal will be pulled up accordingly. Otherwise, the db level
            is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin).
        top_db(float): the maximum db value of resulting spectrum, above which the
            spectrum is clipped(to top_db).
    Returns:
        The spectrogram in log-scale.
    shape:
        input: any shape
        output: same as input
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")
    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")
    ones = paddle.ones_like(magnitude)
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))
    if top_db is not None:
        if top_db < 0:
            raise Exception("top_db must be non-negative")
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
    return log_spec
 def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
    """Create a discrete cosine transform(DCT) matrix.
    Parameters:
        n_mfcc (int): Number of mel frequency cepstral coefficients. 
        n_mels (int): Number of mel filterbanks.
        norm (str, optional): Normalizaiton type. Defaults to 'ortho'.
    Returns:
        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
    """
    n = paddle.arange(n_mels, dtype=dtype)
    k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
                     k)  # size (n_mfcc, n_mels)
    if norm is None:
        dct *= 2.0
    else:
        assert norm == "ortho"
        dct[0] *= 1.0 / math.sqrt(2.0)
        dct *= math.sqrt(2.0 / float(n_mels))
    return dct.T
--- a/paddleaudio/paddleaudio/functional/window.py
+++ b/paddleaudio/paddleaudio/functional/window.py
@ -20,6 +20,19 @@ from paddle import Tensor
 __all__ = [
    'get_window',
    # windows
    'taylor',
    'hamming',
    'hann',
    'tukey',
    'kaiser',
    'gaussian',
    'exponential',
    'triang',
    'bohman',
    'blackman',
    'cosine',
 ]
@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
    return _truncate(w, needs_trunc)
 def general_cosine(M: int, a: float, sym: bool=True,
                   dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    w = paddle.zeros((M, ), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
 def general_hamming(M: int, alpha: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.
@ -143,21 +171,6 @@ def taylor(M: int,
    return _truncate(w, needs_trunc)
 def general_cosine(M: int, a: float, sym: bool=True,
                   dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    w = paddle.zeros((M, ), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
 def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)
 ## factory function
 def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,
--- a/paddleaudio/paddleaudio/io/init.py
+++ b/paddleaudio/paddleaudio/io/init.py
@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .audio import *
--- a/paddleaudio/paddleaudio/metric/init.py
+++ b/paddleaudio/paddleaudio/metric/init.py
@ -1,6 +1,6 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License"
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@ -11,8 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .download import *
+from .dtw import dtw_distance
-from .env import *
+from .mcd import mcd_distance
 from .error import *
 from .log import *
 from .time import *
--- a/paddleaudio/paddleaudio/metric/dtw.py
+++ b/paddleaudio/paddleaudio/metric/dtw.py
@ -0,0 +1,42 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 from dtaidistance import dtw_ndim
 __all__ = [
    'dtw_distance',
 ]
 def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
    """dtw distance
    Dynamic Time Warping.
    This function keeps a compact matrix, not the full warping paths matrix.
    Uses dynamic programming to compute:
    wps[i, j] = (s1[i]-s2[j])**2 + min(
                    wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
                    wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
                    wps[i-1, j-1])            // diagonal   / match
    dtw = sqrt(wps[-1, -1])
    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]
    Returns:
        float: dtw distance
    """
    return dtw_ndim.distance(xs, ys)
--- a/paddleaudio/paddleaudio/metric/mcd.py
+++ b/paddleaudio/paddleaudio/metric/mcd.py
@ -0,0 +1,48 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import mcd.metrics_fast as mt
 import numpy as np
 from mcd import dtw
 __all__ = [
    'mcd_distance',
 ]
 def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
    """Mel cepstral distortion (MCD), dtw distance.
    Dynamic Time Warping.
    Uses dynamic programming to compute:
    wps[i, j] = cost_fn(xs[i], ys[j]) + min(
                    wps[i-1, j  ],  // vertical   / insertion / expansion
                    wps[i  , j-1],  // horizontal / deletion  / compression
                    wps[i-1, j-1])  // diagonal   / match
    dtw = sqrt(wps[-1, -1])
    Cost Function:
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
    def logSpecDbDist(x, y):
        diff = x - y
        return logSpecDbConst * math.sqrt(np.inner(diff, diff))
    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]
    Returns:
        float: dtw distance
    """
    min_cost, path = dtw.dtw(xs, ys, cost_fn)
    return min_cost
--- a/paddleaudio/paddleaudio/sox_effects/init.py
+++ b/paddleaudio/paddleaudio/sox_effects/init.py
@ -0,0 +1,13 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddleaudio/paddleaudio/utils/init.py
+++ b/paddleaudio/paddleaudio/utils/init.py
@ -0,0 +1,25 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
 from .env import DATA_HOME
 from .env import MODEL_HOME
 from .env import PPAUDIO_HOME
 from .env import USER_HOME
 from .error import ParameterError
 from .log import Logger
 from .log import logger
 from .time import seconds_to_hms
 from .time import Timer
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
@ -22,6 +22,12 @@ from .log import logger
 download.logger = logger
 __all__ = [
    'decompress',
    'download_and_decompress',
    'load_state_dict_from_url',
 ]
 def decompress(file: str):
    """
--- a/paddleaudio/paddleaudio/utils/env.py
+++ b/paddleaudio/paddleaudio/utils/env.py
@ -20,6 +20,13 @@ PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. D
 '''
 import os
 __all__ = [
    'USER_HOME',
    'PPAUDIO_HOME',
    'MODEL_HOME',
    'DATA_HOME',
 ]
 def _get_user_home():
    return os.path.expanduser('~')
--- a/paddleaudio/paddleaudio/utils/error.py
+++ b/paddleaudio/paddleaudio/utils/error.py
--- a/paddleaudio/paddleaudio/utils/log.py
+++ b/paddleaudio/paddleaudio/utils/log.py
@ -19,7 +19,10 @@ import time
 import colorlog
-loggers = {}
+__all__ = [
    'Logger',
    'logger',
 ]
 log_config = {
    'DEBUG': {
--- a/paddleaudio/paddleaudio/utils/time.py
+++ b/paddleaudio/paddleaudio/utils/time.py
@ -14,6 +14,11 @@
 import math
 import time
 __all__ = [
    'Timer',
    'seconds_to_hms',
 ]
 class Timer(object):
    '''Calculate runing speed and estimated time of arrival(ETA)'''
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -14,7 +14,7 @@
 import setuptools
 # set the version here
-VERSION = '0.1.0'
+VERSION = '0.2.0'
 def write_version_py(filename='paddleaudio/__init__.py'):
@ -59,6 +59,8 @@ setuptools.setup(
        'resampy >= 0.2.2',
        'soundfile >= 0.9.0',
        'colorlog',
        'dtaidistance >= 2.3.6',
        'mcd >= 0.4',
    ], )
 remove_version_py()
--- a/paddleaudio/tests/.gitkeep
+++ b/paddleaudio/tests/.gitkeep