split paddleaudio

3 years ago · 4df081b954
parent cfd32d00de
commit 4df081b954
39 changed files with 5514 additions and 3 deletions
--- a/audio/paddleaudio/init.py
+++ b/audio/paddleaudio/init.py
@ -0,0 +1,21 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import compliance
+from . import datasets
+from . import features
+from . import functional
+from . import io
+from . import metric
+from . import sox_effects
+from . import backends
--- a/audio/paddleaudio/_internal/init.py
+++ b/audio/paddleaudio/_internal/init.py
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@ -0,0 +1,148 @@
+import importlib.util
+import warnings
+from functools import wraps
+from typing import Optional
+
+#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
+
+
+def is_module_available(*modules: str) -> bool:
+    r"""Return whether every named module can be found *without* importing it.
+
+    This is generally safer than a try/except block around ``import X``. It
+    avoids third party libraries breaking assumptions of some of our tests,
+    e.g., setting multiprocessing start method when imported
+    (see librosa/#747, torchvision/#544).
+
+    Args:
+        *modules (str): Names of the modules to look up. For a dotted name
+            ``importlib`` has to import the parent package to locate the
+            submodule.
+
+    Returns:
+        bool: ``True`` when all modules are found (also when no name is
+        given), ``False`` as soon as one of them is missing.
+    """
+    for name in modules:
+        try:
+            spec = importlib.util.find_spec(name)
+        except ModuleNotFoundError:
+            # Raised for a dotted name whose parent package is missing or is
+            # not a package; either way the module is not available.
+            return False
+        if spec is None:
+            return False
+    return True
+
+
+def requires_module(*modules: str):
+    """Build a decorator that guards a function behind optional modules.
+
+    Availability is checked once, when the decorator is created. If every
+    module is available the decorated function is returned untouched.
+    Otherwise it is replaced by a stub raising ``RuntimeError`` that names the
+    missing module(s), which is a better error message than a
+    ``NameError:  name 'module' is not defined`` raised at random places.
+    """
+    unavailable = [name for name in modules if not is_module_available(name)]
+
+    if unavailable:
+        if len(unavailable) == 1:
+            req = f"module: {unavailable[0]}"
+        else:
+            req = f"modules: {unavailable}"
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires {req}")
+
+            return wrapped
+
+        return decorator
+
+    def decorator(func):
+        # Nothing is missing, so there is no need to wrap the function.
+        return func
+
+    return decorator
+
+
+def deprecated(direction: str, version: Optional[str]=None):
+    """Build a decorator that emits a deprecation message on every call.
+    Args:
+        direction (str): Migration steps to be given to users.
+        version (str or int): The version when the object will be removed.
+            When ``None`` the message refers to a "future" release.
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            release = "future" if version is None else version
+            message = (
+                f"{func.__module__}.{func.__name__} has been deprecated "
+                f"and will be removed from {release} release. "
+                f"{direction}")
+            # stacklevel=2 attributes the warning to the caller of the
+            # deprecated function instead of this wrapper.
+            warnings.warn(message, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapped
+
+    return decorator
+
+
+def is_kaldi_available():
+    """Return whether the ``paddleaudio._paddleaudio`` module can be found."""
+    kaldi_found = is_module_available("paddleaudio._paddleaudio")
+    return kaldi_found
+
+
+def requires_kaldi():
+    """Build a decorator that makes a function fail clearly without kaldi.
+
+    Availability is checked once, when the decorator is created. The decorated
+    function is returned unchanged when kaldi is available; otherwise it is
+    replaced by a stub that raises ``RuntimeError`` when called.
+    """
+    available = is_kaldi_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires kaldi")
+
+        return wrapped
+
+    return decorator
+
+
+def _check_soundfile_importable():
+    """Return whether ``soundfile`` is installed and can actually be imported.
+
+    A warning is emitted when the module is found but importing it fails.
+    """
+    if is_module_available("soundfile"):
+        try:
+            import soundfile  # noqa: F401
+        except Exception:
+            warnings.warn(
+                "Failed to import soundfile. 'soundfile' backend is not available.")
+            return False
+        return True
+    return False
+
+
+# Evaluated once at import time so later availability queries do not retry
+# the import.
+_is_soundfile_importable = _check_soundfile_importable()
+
+
+def is_soundfile_available():
+    """Return the result of the import-time ``soundfile`` availability check."""
+    soundfile_ok = _is_soundfile_importable
+    return soundfile_ok
+
+
+def requires_soundfile():
+    """Build a decorator that makes a function fail clearly without soundfile.
+
+    Availability is checked once, when the decorator is created. The decorated
+    function is returned unchanged when soundfile is available; otherwise it
+    is replaced by a stub that raises ``RuntimeError`` when called.
+    """
+    available = is_soundfile_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires soundfile")
+
+        return wrapped
+
+    return decorator
+
+
+def is_sox_available():
+    """Return whether the ``paddleaudio._paddleaudio`` module can be found."""
+    sox_found = is_module_available("paddleaudio._paddleaudio")
+    return sox_found
+
+
+def requires_sox():
+    """Build a decorator that makes a function fail clearly without sox.
+
+    Availability is checked once, when the decorator is created. The decorated
+    function is returned unchanged when sox is available; otherwise it is
+    replaced by a stub that raises ``RuntimeError`` when called.
+    """
+    available = is_sox_available()
+
+    def decorator(func):
+        if available:
+            return func
+
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            raise RuntimeError(
+                f"{func.__module__}.{func.__name__} requires sox")
+
+        return wrapped
+
+    return decorator
--- a/audio/paddleaudio/backends/init.py
+++ b/audio/paddleaudio/backends/init.py
@ -0,0 +1,26 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .soundfile_backend import depth_convert
+from .soundfile_backend import soundfile_load
+from .soundfile_backend import normalize
+from .soundfile_backend import resample
+from .soundfile_backend import soundfile_save
+from .soundfile_backend import to_mono
+
+from . import utils
+from .utils import get_audio_backend
+from .utils import list_audio_backends
+from .utils import set_audio_backend
+
+utils._init_audio_backend()
--- a/audio/paddleaudio/backends/common.py
+++ b/audio/paddleaudio/backends/common.py
@ -0,0 +1,55 @@
+# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modifications.
+
+class AudioInfo:
+    """Return type of the ``info`` function.
+
+    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
+    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+            * ``PCM_S``: Signed integer linear PCM
+            * ``PCM_U``: Unsigned integer linear PCM
+            * ``PCM_F``: Floating point linear PCM
+            * ``FLAC``: Flac, Free Lossless Audio Codec
+            * ``ULAW``: Mu-law
+            * ``ALAW``: A-law
+            * ``MP3`` : MP3, MPEG-1 Audio Layer III
+            * ``VORBIS``: OGG Vorbis
+            * ``AMR_WB``: Adaptive Multi-Rate Wideband
+            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+            * ``OPUS``: Opus
+            * ``HTK``: Single channel 16-bit PCM
+            * ``UNKNOWN`` : None of above
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        num_frames: int,
+        num_channels: int,
+        bits_per_sample: int,
+        encoding: str,
+    ):
+        """Store the given metadata values as attributes, unchanged."""
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        """Return a one-line, human readable summary of all fields."""
+        # Use the real class name; the text previously said "AudioMetaData",
+        # which is not the name of this class.
+        return (
+            f"AudioInfo("
+            f"sample_rate={self.sample_rate}, "
+            f"num_frames={self.num_frames}, "
+            f"num_channels={self.num_channels}, "
+            f"bits_per_sample={self.bits_per_sample}, "
+            f"encoding={self.encoding}"
+            f")"
+        )
--- a/audio/paddleaudio/backends/no_backend.py
+++ b/audio/paddleaudio/backends/no_backend.py
@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import Callable
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from paddle import Tensor
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
+
+
+def load(
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Placeholder ``load`` used when no audio I/O backend is available.
+
+    Raises:
+        RuntimeError: Always; every argument is ignored.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
+
+
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
+    """Placeholder ``save`` used when no audio I/O backend is available.
+
+    Raises:
+        RuntimeError: Always; every argument is ignored.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
+
+
+def info(filepath: str) -> None:
+    """Placeholder ``info`` used when no audio I/O backend is available.
+
+    Raises:
+        RuntimeError: Always; ``filepath`` is ignored.
+    """
+    message = "No audio I/O backend is available."
+    raise RuntimeError(message)
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@ -0,0 +1,661 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import warnings
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import resampy
+import soundfile
+from scipy.io import wavfile
+
+from ..utils import depth_convert
+from ..utils import ParameterError
+from .common import AudioInfo
+
+# Names exported by ``from ... import *``.
+__all__ = [
+    'resample',
+    'to_mono',
+    'normalize',
+    'save',
+    'soundfile_save',
+    'load',
+    'soundfile_load',
+    'info',
+]
+# Normalization types; only used in the error message of ``normalize``.
+NORMALMIZE_TYPES = ['linear', 'gaussian']
+# Channel merge strategies accepted by ``to_mono``.
+MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
+# Filter names accepted by ``resample``.
+RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
+# Small constant guarding against division by zero in ``normalize``.
+EPS = 1e-8
+
+
+def resample(y: np.ndarray,
+             src_sr: int,
+             target_sr: int,
+             mode: str='kaiser_fast') -> np.ndarray:
+    """Audio resampling.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        src_sr (int): Source sample rate.
+        target_sr (int): Target sample rate.
+        mode (str, optional): The resampling filter to use, one of
+            ``RESAMPLE_MODES``. Defaults to 'kaiser_fast'.
+
+    Returns:
+        np.ndarray: `y` resampled to `target_sr`
+
+    Raises:
+        ParameterError: If `y` is not a ``np.ndarray`` or `mode` is not
+            supported.
+    """
+
+    if mode == 'kaiser_best':
+        warnings.warn(
+            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
+        we recommend the mode kaiser_fast in large scale audio trainning')
+
+    if not isinstance(y, np.ndarray):
+        # The f prefix was missing, so the message showed a literal
+        # "{type(y)}" instead of the received type.
+        raise ParameterError(
+            f'Only support numpy np.ndarray, but received y in {type(y)}')
+
+    if mode not in RESAMPLE_MODES:
+        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
+
+    return resampy.resample(y, src_sr, target_sr, filter=mode)
+
+
+def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
+    """Convert stereo audio to mono.
+
+    Only the first two channels (``y[0]`` and ``y[1]``) are considered.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        merge_type (str, optional): Merge type to generate mono waveform, one
+            of ``MERGE_TYPES``. Defaults to 'average'.
+
+    Returns:
+        np.ndarray: `y` with mono channel. A 1D input is returned as is.
+
+    Raises:
+        ParameterError: If `merge_type` is unknown, `y` has more than two
+            dimensions, or the dtype cannot be averaged.
+    """
+
+    if merge_type not in MERGE_TYPES:
+        raise ParameterError(
+            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
+        )
+    if y.ndim > 2:
+        raise ParameterError(
+            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
+    if y.ndim == 1:  # nothing to merge
+        return y
+
+    if merge_type == 'ch0':
+        return y[0]
+    if merge_type == 'ch1':
+        return y[1]
+    if merge_type == 'random':
+        return y[np.random.randint(0, 2)]
+
+    # need to do averaging according to dtype
+
+    if np.issubdtype(y.dtype, np.floating):
+        # Any float dtype (float32 as before, but also e.g. float64) can be
+        # averaged directly; the result keeps the input dtype.
+        y_out = (y[0] + y[1]) * 0.5
+    elif y.dtype == 'int16':
+        # Widen before adding so the sum of two samples cannot overflow.
+        y_out = y.astype('int32')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+
+    elif y.dtype == 'int8':
+        # Widen before adding so the sum of two samples cannot overflow.
+        y_out = y.astype('int16')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+    else:
+        raise ParameterError(f'Unsupported dtype: {y.dtype}')
+    return y_out
+
+
+def soundfile_load_(file: os.PathLike,
+                    offset: Optional[float]=None,
+                    dtype: str='int16',
+                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
+    """Load audio using soundfile library. This function loads an audio file using libsndfile.
+
+    Args:
+        file (os.PathLike): File of waveform.
+        offset (Optional[float], optional): Offset to the start of waveform; it is
+            multiplied by the native sample rate to get a frame index. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
+        duration (Optional[int], optional): Duration of waveform to read; it is
+            multiplied by the native sample rate to get a frame count. Defaults to None.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray (transposed from what
+        soundfile returns) and its samplerate.
+    """
+    with soundfile.SoundFile(file) as sf_desc:
+        sr_native = sf_desc.samplerate
+        if offset:
+            sf_desc.seek(int(offset * sr_native))
+        if duration is not None:
+            frame_duration = int(duration * sr_native)
+        else:
+            # -1 asks soundfile to read until the end of the file.
+            frame_duration = -1
+        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
+
+    # Return the value captured while the file was open rather than reading an
+    # attribute of the already closed SoundFile object.
+    return y, sr_native
+
+
+def normalize(y: np.ndarray, norm_type: str='linear',
+              mul_factor: float=1.0) -> np.ndarray:
+    """Normalize an input audio with additional multiplier.
+
+    ``'linear'`` divides by the peak absolute value (plus ``EPS``);
+    ``'gaussian'`` subtracts the mean and divides by the standard deviation
+    (at least ``EPS``). The result is then scaled by `mul_factor`.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+
+    Returns:
+        np.ndarray: `y` after normalization.
+
+    Raises:
+        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.
+    """
+
+    if norm_type == 'gaussian':
+        mean = np.mean(y)
+        std = max(np.std(y), EPS)
+        return mul_factor * (y - mean) / std
+
+    if norm_type == 'linear':
+        peak = np.max(np.abs(y))
+        scale = 1.0 / (peak + EPS)
+        return y * scale * mul_factor
+
+    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
+
+
+def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
+    """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        sr (int): Sample rate.
+        file (os.PathLike): Path of audio file to save; must end with '.wav'.
+
+    Raises:
+        ParameterError: If `file` does not end with '.wav' or `sr` is not
+            positive.
+    """
+    # Accept pathlib.Path (and other os.PathLike objects) as the annotation
+    # promises; ``endswith`` only exists on plain strings.
+    file = os.fspath(file)
+    if not file.endswith('.wav'):
+        raise ParameterError(
+            f'only .wav file supported, but dst file name is: {file}')
+
+    if sr <= 0:
+        raise ParameterError(
+            f'Sample rate should be larger than 0, recieved sr = {sr}')
+
+    if y.dtype not in ['int16', 'int8']:
+        warnings.warn(
+            f'input data type is {y.dtype}, will convert data to int16 format before saving'
+        )
+        y_out = depth_convert(y, 'int16')
+    else:
+        y_out = y
+
+    wavfile.write(file, sr, y_out)
+
+def soundfile_load(
+        file: os.PathLike,
+        sr: Optional[int]=None,
+        mono: bool=True,
+        merge_type: str='average',  # ch0,ch1,random,average
+        normal: bool=True,
+        norm_type: str='linear',
+        norm_mul_factor: float=1.0,
+        offset: float=0.0,
+        duration: Optional[int]=None,
+        dtype: str='float32',
+        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
+    """Load audio file from disk using the soundfile backend.
+
+    The waveform is read, optionally merged to mono, optionally resampled,
+    normalized and finally converted to `dtype`.
+
+    Args:
+        file (os.PathLike): Path of audio file to load.
+        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
+        mono (bool, optional): Return waveform with mono channel. Defaults to True.
+        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
+        normal (bool, optional): Waveform normalization. Defaults to True.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
+        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
+
+    Raises:
+        ParameterError: If the file contains no samples.
+    """
+
+    waveform, native_sr = soundfile_load_(
+        file, offset=offset, dtype=dtype, duration=duration)
+
+    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
+        waveform.ndim == 2 and len(waveform[0]) > 0)
+    if not has_samples:
+        raise ParameterError(f'audio file {file} looks empty')
+
+    if mono:
+        waveform = to_mono(waveform, merge_type)
+
+    out_sr = native_sr
+    if sr is not None and sr != native_sr:
+        waveform = resample(waveform, native_sr, sr, mode=resample_mode)
+        out_sr = sr
+
+    if normal:
+        waveform = normalize(waveform, norm_type, norm_mul_factor)
+    elif dtype in ['int8', 'int16']:
+        # Integer targets are still normalized before the depth conversion.
+        waveform = normalize(waveform, 'linear', 1.0)
+
+    return depth_convert(waveform, dtype), out_sr
+
+# The code below is taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modifications.
+
+def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for saving a WAV file.
+
+    Without `encoding` the subtype follows `bits_per_sample`, or the tensor
+    `dtype` when that is not given either. With `encoding`, only the bit
+    depths valid for that encoding are accepted.
+
+    Raises:
+        ValueError: For an unsupported dtype, encoding or bit depth.
+    """
+    if not encoding:
+        if bits_per_sample:
+            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
+        dtype_to_subtype = {
+            paddle.uint8: "PCM_U8",
+            paddle.int16: "PCM_16",
+            paddle.int32: "PCM_32",
+            paddle.float32: "FLOAT",
+            paddle.float64: "DOUBLE",
+        }
+        subtype = dtype_to_subtype.get(dtype)
+        if not subtype:
+            raise ValueError(f"Unsupported dtype for wav: {dtype}")
+        return subtype
+
+    if encoding == "PCM_S":
+        if not bits_per_sample:
+            return "PCM_32"
+        if bits_per_sample == 8:
+            raise ValueError("wav does not support 8-bit signed PCM encoding.")
+        return f"PCM_{bits_per_sample}"
+
+    if encoding == "PCM_U":
+        if bits_per_sample not in (None, 8):
+            raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
+        return "PCM_U8"
+
+    if encoding == "PCM_F":
+        if bits_per_sample in (None, 32):
+            return "FLOAT"
+        if bits_per_sample == 64:
+            return "DOUBLE"
+        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
+
+    if encoding == "ULAW":
+        if bits_per_sample not in (None, 8):
+            raise ValueError("wav only supports 8-bit mu-law encoding.")
+        return "ULAW"
+
+    if encoding == "ALAW":
+        if bits_per_sample not in (None, 8):
+            raise ValueError("wav only supports 8-bit a-law encoding.")
+        return "ALAW"
+
+    raise ValueError(f"wav does not support {encoding}.")
+
+
+def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for saving a SPHERE (sph) file.
+
+    Signed PCM is the default (32-bit when no depth is given); mu-law only
+    accepts 8 bits, a-law is returned regardless of the bit depth.
+
+    Raises:
+        ValueError: For an unsupported encoding or mu-law bit depth.
+    """
+    if encoding in (None, "PCM_S"):
+        if bits_per_sample:
+            return f"PCM_{bits_per_sample}"
+        return "PCM_32"
+
+    if encoding in ("PCM_U", "PCM_F"):
+        raise ValueError(f"sph does not support {encoding} encoding.")
+
+    if encoding == "ULAW":
+        if bits_per_sample not in (None, 8):
+            raise ValueError("sph only supports 8-bit for mu-law encoding.")
+        return "ULAW"
+
+    if encoding == "ALAW":
+        return "ALAW"
+
+    raise ValueError(f"sph does not support {encoding}.")
+
+
+def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype for the given output format.
+
+    Raises:
+        ValueError: For an unsupported format, or an encoding/bit depth the
+            format does not accept.
+    """
+    if format == "wav":
+        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
+
+    if format == "sph":
+        return _get_subtype_for_sphere(encoding, bits_per_sample)
+
+    if format in ("nis", "nist"):
+        return "PCM_16"
+
+    if format in ("ogg", "vorbis"):
+        if encoding or bits_per_sample:
+            raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
+        return "VORBIS"
+
+    if format == "flac":
+        if encoding:
+            raise ValueError("flac does not support encoding.")
+        if not bits_per_sample:
+            return "PCM_16"
+        if bits_per_sample > 24:
+            raise ValueError("flac does not support bits_per_sample > 24.")
+        if bits_per_sample == 8:
+            return "PCM_S8"
+        return f"PCM_{bits_per_sample}"
+
+    raise ValueError(f"Unsupported format: {format}")
+
+def save(
+    filepath: str,
+    src: paddle.Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    compression: Optional[float] = None,
+    format: Optional[str] = None,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = None,
+):
+    """Save audio data to file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats:
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (str or pathlib.Path): Path to audio file.
+        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
+        sample_rate (int): Sampling rate.
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
+        compression (float or None, optional): Not used.
+            It is here only for interface compatibility reasons with the "sox_io" backend.
+        format (str or None, optional): Override the audio format.
+            When ``filepath`` argument is path-like object, audio format is
+            inferred from file extension. If the file extension is missing or
+            different, you can specify the correct format with this argument.
+
+            When ``filepath`` argument is file-like object,
+            this argument is required.
+
+            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
+            ``"flac"`` and ``"sph"``.
+        encoding (str or None, optional): Changes the encoding for supported formats.
+            This argument is effective only for supported formats, such as
+            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are:
+
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
+
+        bits_per_sample (int or None, optional): Changes the bit depth for the
+            supported formats.
+            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
+            you can change the bit depth.
+            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
+
+    Raises:
+        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is not one of the
+            valid values, or the format/encoding/bit depth combination is
+            rejected by ``_get_subtype``.
+        RuntimeError: If ``filepath`` is a file-like object and ``format`` is
+            not given.
+
+    Supported formats/encodings/bit depth/compression are:
+
+    ``"wav"``
+        - 32-bit floating-point PCM
+        - 32-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 8-bit unsigned integer PCM
+        - 8-bit mu-law
+        - 8-bit a-law
+
+        Note:
+            Default encoding/bit depth is determined by the dtype of
+            the input Tensor.
+
+    ``"flac"``
+        - 8-bit
+        - 16-bit (default)
+        - 24-bit
+
+    ``"ogg"``, ``"vorbis"``
+        - Doesn't accept changing configuration.
+
+    ``"sph"``
+        - 8-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 32-bit signed integer PCM (default)
+        - 8-bit mu-law
+        - 8-bit a-law
+        - 16-bit a-law
+        - 24-bit a-law
+        - 32-bit a-law
+
+    """
+    if src.ndim != 2:
+        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
+    if compression is not None:
+        warnings.warn(
+            '`save` function of "soundfile" backend does not support "compression" parameter. '
+            "The argument is silently ignored."
+        )
+    # An object with a ``write`` attribute is treated as a file-like object,
+    # which has no extension to infer the format from.
+    if hasattr(filepath, "write"):
+        if format is None:
+            raise RuntimeError("`format` is required when saving to file object.")
+        ext = format.lower()
+    else:
+        # NOTE: for a path the extension, not ``format``, selects the subtype.
+        ext = str(filepath).split(".")[-1].lower()
+
+    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
+        raise ValueError("Invalid bits_per_sample.")
+    if bits_per_sample == 24:
+        warnings.warn(
+            "Saving audio with 24 bits per sample might warp samples near -1. "
+            "Using 16 bits per sample might be able to avoid this."
+        )
+    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
+
+    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
+    # so we extend the extensions manually here
+    if ext in ["nis", "nist", "sph"] and format is None:
+        format = "NIST"
+
+    # The data is passed to soundfile as [time, channel].
+    if channels_first:
+        src = src.t()
+
+    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
+
+# Mapping from soundfile subtype to the dtype name ``load`` reads with when
+# normalization is off for a WAV file.
+_SUBTYPE2DTYPE = {
+    "PCM_S8": "int8",
+    "PCM_U8": "uint8",
+    "PCM_16": "int16",
+    "PCM_32": "int32",
+    "FLOAT": "float32",
+    "DOUBLE": "float64",
+}
+
+def load(
+    filepath: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[paddle.Tensor, int]:
+    """Load audio data from file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats:
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+    ``float32`` dtype and the shape of `[channel, time]`.
+    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
+
+    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
+    by providing ``normalize=False``, this function can return integer Tensor, where the samples
+    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
+    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
+
+    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
+    ``flac`` and ``mp3``.
+    For these formats, this function always returns ``float32`` Tensor with values normalized to
+    ``[-1.0, 1.0]``.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        frame_offset (int, optional):
+            Number of frames to skip before start reading data.
+        num_frames (int, optional):
+            Maximum number of frames to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+            This function may return fewer frames if there are not enough
+            frames in the given file.
+        normalize (bool, optional):
+            When ``True``, this function always returns ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type.
+            This argument has no effect for formats other than integer WAV type.
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        (paddle.Tensor, int): Resulting Tensor and sample rate.
+            If the input file has integer wav format and normalization is off, then it has
+            integer type, else ``float32`` type. If ``channels_first=True``, it has
+            `[channel, time]` else `[time, channel]`.
+
+    Raises:
+        ValueError: If ``normalize=False`` and the WAV subtype is not listed in
+            ``_SUBTYPE2DTYPE``.
+    """
+    with soundfile.SoundFile(filepath, "r") as file_:
+        # Only non-normalized WAV input is read with its native dtype.
+        if file_.format != "WAV" or normalize:
+            dtype = "float32"
+        elif file_.subtype not in _SUBTYPE2DTYPE:
+            raise ValueError(f"Unsupported subtype: {file_.subtype}")
+        else:
+            dtype = _SUBTYPE2DTYPE[file_.subtype]
+
+        # NOTE(review): ``_prepare_read`` is a private SoundFile method;
+        # presumably it seeks to ``frame_offset`` and returns the number of
+        # frames to read -- confirm against the installed soundfile version.
+        frames = file_._prepare_read(frame_offset, None, num_frames)
+        waveform = file_.read(frames, dtype, always_2d=True)
+        sample_rate = file_.samplerate
+
+    waveform = paddle.to_tensor(waveform)
+    if channels_first:
+        # Swap the two axes of the array read from soundfile.
+        waveform = paddle.transpose(waveform, perm=[1,0])
+    return waveform, sample_rate
+
+
+# Mapping from soundfile subtype to number of bits per sample.
+# This is mostly heuristic, and the value is set to 0 when it is irrelevant
+# (lossy formats) or when it can't be inferred.
+# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
+# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
+# the default seems to be 8 bits but it can be compressed further to 4 bits.
+# The dict is inspired by
+# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
+_SUBTYPE_TO_BITS_PER_SAMPLE = {
+    "PCM_S8": 8,  # Signed 8 bit data
+    "PCM_16": 16,  # Signed 16 bit data
+    "PCM_24": 24,  # Signed 24 bit data
+    "PCM_32": 32,  # Signed 32 bit data
+    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
+    "FLOAT": 32,  # 32 bit float data
+    "DOUBLE": 64,  # 64 bit float data
+    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "IMA_ADPCM": 0,  # IMA ADPCM.
+    "MS_ADPCM": 0,  # Microsoft ADPCM.
+    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
+    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
+    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
+    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
+    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
+    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
+    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
+    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
+    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
+    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
+    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
+    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
+    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
+    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
+    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
+    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
+}
+
+def _get_bit_depth(subtype):
+    """Return the bits per sample for a soundfile subtype.
+
+    Unknown subtypes yield 0 and trigger a warning.
+    """
+    bits = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
+    if bits is not None:
+        return bits
+
+    warnings.warn(
+        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
+        "attribute will be set to 0. If you are seeing this warning, please "
+        "report by opening an issue on github (after checking for existing/closed ones). "
+        "You may otherwise ignore this warning."
+    )
+    return 0
+
+# Mapping from soundfile subtype to the encoding name reported by ``info``;
+# subtypes not listed here are reported as "UNKNOWN" by ``_get_encoding``.
+_SUBTYPE_TO_ENCODING = {
+    "PCM_S8": "PCM_S",
+    "PCM_16": "PCM_S",
+    "PCM_24": "PCM_S",
+    "PCM_32": "PCM_S",
+    "PCM_U8": "PCM_U",
+    "FLOAT": "PCM_F",
+    "DOUBLE": "PCM_F",
+    "ULAW": "ULAW",
+    "ALAW": "ALAW",
+    "VORBIS": "VORBIS",
+}
+
+def _get_encoding(format: str, subtype: str):
+    """Return the encoding name for a soundfile format/subtype pair.
+
+    FLAC files are always reported as "FLAC"; for any other format the subtype
+    is looked up, falling back to "UNKNOWN".
+    """
+    is_flac = format == "FLAC"
+    return "FLAC" if is_flac else _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
+
+def info(filepath: str, format: Optional[str] = None) -> AudioInfo:
+    """Get signal information of an audio file.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        AudioInfo: meta data of the given audio.
+
+    """
+    sinfo = soundfile.info(filepath)
+    bit_depth = _get_bit_depth(sinfo.subtype)
+    encoding = _get_encoding(sinfo.format, sinfo.subtype)
+    return AudioInfo(
+        sinfo.samplerate,
+        sinfo.frames,
+        sinfo.channels,
+        bits_per_sample=bit_depth,
+        encoding=encoding,
+    )
--- a/audio/paddleaudio/backends/sox_io_backend.py
+++ b/audio/paddleaudio/backends/sox_io_backend.py
@ -0,0 +1,101 @@
+from pathlib import Path
+from typing import Callable
+from typing import Optional, Tuple, Union
+
+import paddle
+import paddleaudio
+from paddle import Tensor
+from .common import AudioInfo
+import os
+
+from paddleaudio._internal import module_utils  as _mod_utils
+
+#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
+
+def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
+    """Fallback for path-based ``info``: always raises ``RuntimeError``.
+
+    Args:
+        filepath: Path that could not be inspected; included in the message.
+        format: Unused; present to mirror the ``info`` call signature.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "Failed to fetch metadata from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
+    """Fallback for file-object ``info``: always raises ``RuntimeError``.
+
+    Args:
+        fileobj: Object that could not be inspected; included in the message.
+        format: Unused; present to mirror the ``info`` call signature.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "Failed to fetch metadata from {}".format(fileobj)
+    raise RuntimeError(message)
+
+
+# Note: need to comply TorchScript syntax -- need annotation and no f-string
+def _fail_load(
+    filepath: str,
+    frame_offset: int = 0,
+    num_frames: int = -1,
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[Tensor, int]:
+    """Fallback for path-based ``load``: always raises ``RuntimeError``.
+
+    Only ``filepath`` is used (in the error message); the remaining
+    parameters exist so the signature mirrors ``load``.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = "Failed to load audio from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_load_fileobj(fileobj, *args, **kwargs):
+    """Fallback for file-object ``load``: always raises ``RuntimeError``.
+
+    Args:
+        fileobj: Object that could not be decoded; included in the message.
+        *args: Ignored.
+        **kwargs: Ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    message = f"Failed to load audio from {fileobj}"
+    raise RuntimeError(message)
+
+# Fallbacks used by `load` / `info` when the native binding returns None.
+_fallback_info = _fail_info
+_fallback_info_fileobj = _fail_info_fileobj
+_fallback_load = _fail_load
+# `load` refers to `_fallback_load_fileobj`; the name was previously misspelled
+# here, which turned the file-object failure path into a NameError.
+_fallback_load_fileobj = _fail_load_fileobj
+# Misspelled alias kept so any existing reference to it keeps working.
+_fallback_load_filebj = _fallback_load_fileobj
+
+@_mod_utils.requires_sox()
+def load(
+        filepath: str,
+        frame_offset: int = 0,
+        num_frames: int=-1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Load audio through the sox bindings.
+
+    Args:
+        filepath: A path-like object, or an object exposing ``read`` which is
+            then handled as a file object.
+        frame_offset: Forwarded unchanged to the native loader.
+        num_frames: Forwarded unchanged to the native loader.
+        normalize: Forwarded unchanged to the native loader.
+        channels_first: Forwarded unchanged to the native loader.
+        format: Forwarded unchanged to the native loader.
+
+    Returns:
+        Tuple[Tensor, int]: The first element of the native result converted
+        with ``paddle.to_tensor`` and the second element as returned.
+    """
+    options = (frame_offset, num_frames, normalize, channels_first, format)
+
+    if not hasattr(filepath, "read"):
+        # Path-like input: normalise to a plain string path first.
+        filepath = os.fspath(filepath)
+        decoded = paddleaudio._paddleaudio.sox_io_load_audio_file(
+            filepath, *options)
+        if decoded is None:
+            return _fallback_load(filepath, *options)
+    else:
+        decoded = paddleaudio._paddleaudio.load_audio_fileobj(
+            filepath, *options)
+        if decoded is None:
+            return _fallback_load_fileobj(filepath, *options)
+
+    waveform = paddle.to_tensor(decoded[0])
+    return (waveform, decoded[1])
+
+
+@_mod_utils.requires_sox()
+def save(filepath: str,
+    src: Tensor,
+    sample_rate: int,
+    channels_first: bool = True,
+    compression: Optional[float] = None,
+    format: Optional[str] = None,
+    encoding: Optional[str] = None,
+    bits_per_sample: Optional[int] = None,
+):
+    """Write a tensor as audio through the sox bindings.
+
+    Args:
+        filepath: A path-like object, or an object exposing ``write`` which is
+            then handled as a file object.
+        src: Tensor to save; converted with ``src.numpy()`` before the call.
+        sample_rate: Forwarded unchanged to the native writer.
+        channels_first: Forwarded unchanged to the native writer.
+        compression: Forwarded unchanged to the native writer.
+        format: Forwarded unchanged to the native writer.
+        encoding: Forwarded unchanged to the native writer.
+        bits_per_sample: Forwarded unchanged to the native writer.
+    """
+    samples = src.numpy()
+    writer_args = (samples, sample_rate, channels_first, compression, format,
+                   encoding, bits_per_sample)
+
+    if not hasattr(filepath, "write"):
+        # Path-like input: normalise to a plain string path first.
+        path = os.fspath(filepath)
+        paddleaudio._paddleaudio.sox_io_save_audio_file(path, *writer_args)
+        return
+
+    paddleaudio._paddleaudio.save_audio_fileobj(filepath, *writer_args)
+
+@_mod_utils.requires_sox()
+def info(filepath: str, format: Optional[str] = None,) -> AudioInfo:
+    """Fetch audio metadata through the sox bindings.
+
+    Args:
+        filepath: A path-like object, or an object exposing ``read`` which is
+            then handled as a file object.
+        format: Forwarded unchanged to the native call.
+
+    Returns:
+        AudioInfo: Built by unpacking the native result. When the native call
+        returns ``None`` the matching fallback is invoked instead.
+    """
+    if not hasattr(filepath, "read"):
+        # Path-like input: normalise to a plain string path first.
+        path = os.fspath(filepath)
+        fields = paddleaudio._paddleaudio.get_info_file(path, format)
+        if fields is None:
+            return _fallback_info(path, format)
+        return AudioInfo(*fields)
+
+    fields = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
+    if fields is None:
+        return _fallback_info_fileobj(filepath, format)
+    return AudioInfo(*fields)
--- a/audio/paddleaudio/backends/utils.py
+++ b/audio/paddleaudio/backends/utils.py
@ -0,0 +1,81 @@
+"""Defines utilities for switching audio backends"""
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
+
+import warnings
+from typing import List
+from typing import Optional
+
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+
+from . import no_backend, soundfile_backend, sox_io_backend
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
+
+
+def list_audio_backends() -> List[str]:
+    """Report which audio backends can be used on this system.
+
+    Returns:
+        List[str]: ``"soundfile"`` when that module can be found and
+        ``"sox_io"`` when sox is reported available, in that order.
+    """
+    found = ["soundfile"] if _mod_utils.is_module_available("soundfile") else []
+    if _mod_utils.is_sox_available():
+        found += ["sox_io"]
+    return found
+
+
+def set_audio_backend(backend: Optional[str]):
+    """Select the backend that provides ``paddleaudio.save/load/info``.
+
+    Args:
+        backend (str or None): Name of the backend, one of ``"sox_io"`` or
+            ``"soundfile"`` depending on what is available on the system.
+            ``None`` installs the functions of the ``no_backend`` module.
+
+    Raises:
+        RuntimeError: If ``backend`` is not in ``list_audio_backends()``.
+        NotImplementedError: If ``backend`` is listed as available but has no
+            module associated with it here.
+    """
+    if backend is None:
+        module = no_backend
+    else:
+        if backend not in list_audio_backends():
+            raise RuntimeError(f'Backend "{backend}" is not one of '
+                               f"available backends: {list_audio_backends()}.")
+        for known_name, known_module in (("sox_io", sox_io_backend),
+                                         ("soundfile", soundfile_backend)):
+            if backend == known_name:
+                module = known_module
+                break
+        else:
+            raise NotImplementedError(f'Unexpected backend "{backend}"')
+
+    # Re-bind the package-level I/O entry points to the chosen module.
+    for attr in ("save", "load", "info"):
+        setattr(paddleaudio, attr, getattr(module, attr))
+
+def _init_audio_backend():
+    """Pick the default backend: soundfile first, then sox_io, else none.
+
+    A warning is emitted when neither backend is available, and the backend
+    is then set to ``None``.
+    """
+    available = list_audio_backends()
+    for preferred in ("soundfile", "sox_io"):
+        if preferred in available:
+            set_audio_backend(preferred)
+            return
+
+    warnings.warn("No audio backend is available.")
+    set_audio_backend(None)
+
+
+def get_audio_backend() -> Optional[str]:
+    """Identify the current backend by inspecting ``paddleaudio.load``.
+
+    Returns:
+        Optional[str]: ``"sox_io"`` or ``"soundfile"``, or ``None`` when the
+        ``no_backend`` loader is installed.
+
+    Raises:
+        ValueError: If ``paddleaudio.load`` matches none of the known loaders.
+    """
+    active_load = paddleaudio.load
+    candidates = (
+        (no_backend, None),
+        (sox_io_backend, "sox_io"),
+        (soundfile_backend, "soundfile"),
+    )
+    for module, name in candidates:
+        if active_load == module.load:
+            return name
+    raise ValueError("Unknown backend.")
--- a/audio/paddleaudio/compliance/init.py
+++ b/audio/paddleaudio/compliance/init.py
@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import kaldi
+from . import librosa
--- a/audio/paddleaudio/compliance/kaldi.py
+++ b/audio/paddleaudio/compliance/kaldi.py
@ -0,0 +1,638 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from torchaudio(https://github.com/pytorch/audio)
+import math
+from typing import Tuple
+
+import paddle
+from paddle import Tensor
+
+from ..functional import create_dct
+from ..functional.window import get_window
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    'spectrogram',
+    'fbank',
+    'mfcc',
+]
+
+# Window type names; `_feature_window_function` compares its `window_type`
+# argument against these, and POVEY is the default of the public functions.
+HANNING = 'hann'
+HAMMING = 'hamming'
+POVEY = 'povey'
+RECTANGULAR = 'rect'
+BLACKMAN = 'blackman'
+
+
+def _get_epsilon(dtype):
+    """Return the constant 1e-07 as a tensor of the given dtype.
+
+    Callers in this module use it as a lower bound before taking logarithms.
+    """
+    tiny = 1e-07
+    return paddle.to_tensor(tiny, dtype=dtype)
+
+
+def _next_power_of_2(x: int) -> int:
+    """Return the smallest power of two that is >= ``x`` (1 when ``x`` is 0)."""
+    if x == 0:
+        return 1
+    # bit_length of (x - 1) is the exponent of the next power of two.
+    exponent = (x - 1).bit_length()
+    return 1 << exponent
+
+
+def _get_strided(waveform: Tensor,
+                 window_size: int,
+                 window_shift: int,
+                 snip_edges: bool) -> Tensor:
+    """Cut a 1-D waveform into overlapping frames.
+
+    Args:
+        waveform (Tensor): 1-D tensor of samples (asserted below).
+        window_size (int): Number of samples per frame.
+        window_shift (int): Number of samples between the starts of adjacent frames.
+        snip_edges (bool): If True, only frames that fit entirely inside the
+            waveform are produced. If False, the waveform is extended with
+            flipped copies of itself before framing.
+
+    Returns:
+        Tensor: The framed waveform; the caller treats it as `(m, window_size)`.
+            An empty `(0, 0)` tensor is returned when `snip_edges` is True and
+            the waveform is shorter than one window.
+    """
+    assert waveform.dim() == 1
+    num_samples = waveform.shape[0]
+
+    if snip_edges:
+        if num_samples < window_size:
+            return paddle.empty((0, 0), dtype=waveform.dtype)
+        else:
+            # Number of complete windows that fit inside the waveform.
+            m = 1 + (num_samples - window_size) // window_shift
+    else:
+        reversed_waveform = paddle.flip(waveform, [0])
+        # Roughly num_samples / window_shift rounded to the nearest integer.
+        m = (num_samples + (window_shift // 2)) // window_shift
+        pad = window_size // 2 - window_shift // 2
+        pad_right = reversed_waveform
+        if pad > 0:
+            # Prepend the mirrored first `pad` samples and append the whole
+            # mirrored signal; surplus frames are cut off by `[:, :m]` below.
+            pad_left = reversed_waveform[-pad:]
+            waveform = paddle.concat((pad_left, waveform, pad_right), axis=0)
+        else:
+            # pad <= 0: drop the first `-pad` samples instead of left-padding.
+            waveform = paddle.concat((waveform[-pad:], pad_right), axis=0)
+
+    # Keep the first m frames and transpose.
+    # assumes paddle.signal.frame returns (window_size, num_frames) - TODO confirm
+    return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T
+
+
+def _feature_window_function(
+        window_type: str,
+        window_size: int,
+        blackman_coeff: float,
+        dtype: int, ) -> Tensor:
+    """Build the analysis window of the requested type.
+
+    Args:
+        window_type (str): One of the module-level window type names.
+        window_size (int): Number of samples in the window.
+        blackman_coeff (float): Coefficient used only by the Blackman window.
+        dtype: Data type of the returned tensor.
+
+    Returns:
+        Tensor: A 1-D window of length `window_size`.
+
+    Raises:
+        Exception: If `window_type` is not one of the supported names.
+    """
+    if window_type == HANNING:
+        return get_window('hann', window_size, fftbins=False, dtype=dtype)
+    if window_type == HAMMING:
+        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
+    if window_type == POVEY:
+        # Povey window: a Hann window raised to the power 0.85.
+        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
+        return hann.pow(0.85)
+    if window_type == RECTANGULAR:
+        return paddle.ones([window_size], dtype=dtype)
+    if window_type != BLACKMAN:
+        raise Exception('Invalid window type ' + window_type)
+
+    # Blackman: coeff - 0.5 * cos(a * n) + (0.5 - coeff) * cos(2 * a * n)
+    step = 2 * math.pi / (window_size - 1)
+    positions = paddle.arange(window_size, dtype=dtype)
+    first_term = 0.5 * paddle.cos(step * positions)
+    second_term = (0.5 - blackman_coeff) * paddle.cos(2 * step * positions)
+    return (blackman_coeff - first_term + second_term).astype(dtype)
+
+
+def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
+                    energy_floor: float) -> Tensor:
+    """Compute the log energy of every frame.
+
+    Args:
+        strided_input (Tensor): Frames; squared values are summed over axis 1.
+        epsilon (Tensor): Lower bound applied to the energy before the log.
+        energy_floor (float): When non-zero, `log(energy_floor)` is applied as
+            a lower bound on the result.
+
+    Returns:
+        Tensor: One log-energy value per frame.
+    """
+    frame_energy = strided_input.pow(2).sum(1)
+    log_energy = paddle.maximum(frame_energy, epsilon).log()
+    if energy_floor != 0.0:
+        floor = paddle.to_tensor(
+            math.log(energy_floor), dtype=strided_input.dtype)
+        log_energy = paddle.maximum(log_energy, floor)
+    return log_energy
+
+
+def _get_waveform_and_window_properties(
+        waveform: Tensor,
+        channel: int,
+        sr: int,
+        frame_shift: float,
+        frame_length: float,
+        round_to_power_of_two: bool,
+        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
+    """Select one channel and convert the frame settings from milliseconds to samples.
+
+    Args:
+        waveform (Tensor): Input indexed as `waveform[channel, :]`.
+        channel (int): Channel index; negative values are treated as 0.
+        sr (int): Sample rate of the waveform.
+        frame_shift (float): Shift between adjacent frames in milliseconds.
+        frame_length (float): Frame length in milliseconds.
+        round_to_power_of_two (bool): Whether the padded window size is rounded
+            up to a power of two.
+        preemphasis_coefficient (float): Only validated here; must lie in [0, 1].
+
+    Returns:
+        Tuple[Tensor, int, int, int]: `(waveform, window_shift, window_size,
+            padded_window_size)` where `waveform` is the selected channel and the
+            three sizes are expressed in samples.
+
+    Raises:
+        AssertionError: If any of the checks below fails.
+    """
+    # Negative indices (including the public default of -1) select channel 0.
+    channel = max(channel, 0)
+    assert channel < waveform.shape[0], (
+        'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
+    waveform = waveform[channel, :]  # size (n)
+    window_shift = int(
+        sr * frame_shift *
+        0.001)  # pass frame_shift and frame_length in milliseconds
+    window_size = int(sr * frame_length * 0.001)
+    padded_window_size = _next_power_of_2(
+        window_size) if round_to_power_of_two else window_size
+
+    assert 2 <= window_size <= len(waveform), (
+        'choose a window size {} that is [2, {}]'.format(window_size,
+                                                         len(waveform)))
+    assert 0 < window_shift, '`window_shift` must be greater than 0'
+    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
+                                        ' use `round_to_power_of_two` or change `frame_length`'
+    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
+    # NOTE(review): `sr` is validated only after it has been used above.
+    assert sr > 0, '`sr` must be greater than zero'
+    return waveform, window_shift, window_size, padded_window_size
+
+
+def _get_window(waveform: Tensor,
+                padded_window_size: int,
+                window_size: int,
+                window_shift: int,
+                window_type: str,
+                blackman_coeff: float,
+                snip_edges: bool,
+                raw_energy: bool,
+                energy_floor: float,
+                dither: float,
+                remove_dc_offset: bool,
+                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
+    """Frame the waveform and apply the per-frame preprocessing steps.
+
+    The steps run in this order: framing, optional dithering, optional DC
+    removal, optional pre-emphasis, windowing, and zero-padding up to
+    `padded_window_size`. The log energy is taken before pre-emphasis when
+    `raw_energy` is True and after zero-padding otherwise.
+
+    Args:
+        waveform (Tensor): 1-D waveform.
+        padded_window_size (int): Frame length after right zero-padding.
+        window_size (int): Frame length in samples.
+        window_shift (int): Hop between frames in samples.
+        window_type (str): Window type name.
+        blackman_coeff (float): Coefficient for the Blackman window.
+        snip_edges (bool): Forwarded to `_get_strided`.
+        raw_energy (bool): Selects where the log energy is computed (see above).
+        energy_floor (float): Forwarded to `_get_log_energy`.
+        dither (float): Scale of the random noise added to every frame; 0 disables it.
+        remove_dc_offset (bool): Whether to subtract each frame's mean.
+        preemphasis_coefficient (float): Pre-emphasis coefficient; 0 disables it.
+
+    Returns:
+        Tuple[Tensor, Tensor]: The processed frames `(m, padded_window_size)`
+            and the per-frame log energy `(m)`.
+    """
+    dtype = waveform.dtype
+    epsilon = _get_epsilon(dtype)
+
+    # (m, window_size)
+    strided_input = _get_strided(waveform, window_size, window_shift,
+                                 snip_edges)
+
+    if dither != 0.0:
+        # Uniform samples are clamped to >= epsilon so the log below is finite.
+        x = paddle.maximum(epsilon,
+                           paddle.rand(strided_input.shape, dtype=dtype))
+        # NOTE(review): the same uniform sample `x` feeds both the sqrt(-2 log x)
+        # and cos(2 pi x) factors - confirm this is the intended noise shape.
+        rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
+        strided_input = strided_input + rand_gauss * dither
+
+    if remove_dc_offset:
+        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
+        strided_input = strided_input - row_means
+
+    if raw_energy:
+        signal_log_energy = _get_log_energy(strided_input, epsilon,
+                                            energy_floor)  # (m)
+
+    if preemphasis_coefficient != 0.0:
+        # Replicate the first sample on the left so that dropping the last
+        # column yields each frame delayed by one sample:
+        # out[i] = x[i] - coeff * x[max(i - 1, 0)].
+        offset_strided_input = paddle.nn.functional.pad(
+            strided_input.unsqueeze(0), (1, 0),
+            data_format='NCL',
+            mode='replicate').squeeze(0)  # (m, window_size + 1)
+        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
+                                                                                       -1]
+
+    window_function = _feature_window_function(
+        window_type, window_size, blackman_coeff,
+        dtype).unsqueeze(0)  # (1, window_size)
+    strided_input = strided_input * window_function  # (m, window_size)
+
+    # (m, padded_window_size)
+    if padded_window_size != window_size:
+        # Zero-pad every frame on the right up to the padded length.
+        padding_right = padded_window_size - window_size
+        strided_input = paddle.nn.functional.pad(
+            strided_input.unsqueeze(0), (0, padding_right),
+            data_format='NCL',
+            mode='constant',
+            value=0).squeeze(0)
+
+    if not raw_energy:
+        signal_log_energy = _get_log_energy(strided_input, epsilon,
+                                            energy_floor)  # size (m)
+
+    return strided_input, signal_log_energy
+
+
+def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
+    """Optionally remove the mean over axis 0 from every column.
+
+    Args:
+        tensor (Tensor): Input features.
+        subtract_mean (bool): When False the input is returned unchanged.
+
+    Returns:
+        Tensor: `tensor` minus its mean over axis 0, or `tensor` itself.
+    """
+    if not subtract_mean:
+        return tensor
+    column_means = paddle.mean(tensor, axis=0).unsqueeze(0)
+    return tensor - column_means
+
+
+def spectrogram(waveform: Tensor,
+                blackman_coeff: float=0.42,
+                channel: int=-1,
+                dither: float=0.0,
+                energy_floor: float=1.0,
+                frame_length: float=25.0,
+                frame_shift: float=10.0,
+                preemphasis_coefficient: float=0.97,
+                raw_energy: bool=True,
+                remove_dc_offset: bool=True,
+                round_to_power_of_two: bool=True,
+                sr: int=16000,
+                snip_edges: bool=True,
+                subtract_mean: bool=False,
+                window_type: str=POVEY) -> Tensor:
+    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute the energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that cannot fit a single frame when it
+            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames,
+            which depends on frame_length and frame_shift.
+    """
+    dtype = waveform.dtype
+    epsilon = _get_epsilon(dtype)
+
+    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
+        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
+        preemphasis_coefficient)
+
+    strided_input, signal_log_energy = _get_window(
+        waveform, padded_window_size, window_size, window_shift, window_type,
+        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
+        remove_dc_offset, preemphasis_coefficient)
+
+    # Real FFT of every frame.
+    # presumably a complex tensor of shape (m, padded_window_size // 2 + 1) - TODO confirm
+    fft = paddle.fft.rfft(strided_input)
+
+    # Log power spectrum, floored at epsilon so the log stays finite.
+    power_spectrum = paddle.maximum(
+        fft.abs().pow(2.), epsilon).log()  # (m, padded_window_size // 2 + 1)
+    # The first column is overwritten with the per-frame log energy.
+    power_spectrum[:, 0] = signal_log_energy
+
+    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
+    return power_spectrum
+
+
+def _inverse_mel_scale_scalar(mel_freq: float) -> float:
+    """Convert a mel value to Hz: 700 * (exp(mel / 1127) - 1)."""
+    ratio = math.exp(mel_freq / 1127.0)
+    return 700.0 * (ratio - 1.0)
+
+
+def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
+    """Element-wise mel to Hz conversion: 700 * (exp(mel / 1127) - 1)."""
+    ratio = (mel_freq / 1127.0).exp()
+    return 700.0 * (ratio - 1.0)
+
+
+def _mel_scale_scalar(freq: float) -> float:
+    """Convert a frequency in Hz to mel: 1127 * ln(1 + freq / 700)."""
+    scaled = 1.0 + freq / 700.0
+    return 1127.0 * math.log(scaled)
+
+
+def _mel_scale(freq: Tensor) -> Tensor:
+    """Element-wise Hz to mel conversion: 1127 * ln(1 + freq / 700)."""
+    scaled = 1.0 + freq / 700.0
+    return 1127.0 * scaled.log()
+
+
+def _vtln_warp_freq(vtln_low_cutoff: float,
+                    vtln_high_cutoff: float,
+                    low_freq: float,
+                    high_freq: float,
+                    vtln_warp_factor: float,
+                    freq: Tensor) -> Tensor:
+    """Apply a piecewise-linear VTLN warp to a tensor of frequencies.
+
+    Frequencies between the two inflection points `l` and `h` are scaled by
+    `1 / vtln_warp_factor`; the segments `[low_freq, l)` and `[h, high_freq]`
+    are linear pieces anchored at `low_freq` and `high_freq` respectively;
+    frequencies outside `[low_freq, high_freq]` are returned unchanged.
+
+    Args:
+        vtln_low_cutoff (float): Lower cutoff; must be greater than `low_freq`.
+        vtln_high_cutoff (float): Upper cutoff; must be less than `high_freq`.
+        low_freq (float): Lower edge of the warped range.
+        high_freq (float): Upper edge of the warped range.
+        vtln_warp_factor (float): Warp factor.
+        freq (Tensor): Frequencies to warp.
+
+    Returns:
+        Tensor: Warped frequencies with the same shape as `freq`.
+
+    Raises:
+        AssertionError: If the cutoffs are not strictly inside `(low_freq, high_freq)`.
+    """
+    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
+    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
+    # Inflection points of the warp in the input frequency domain.
+    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
+    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
+    scale = 1.0 / vtln_warp_factor
+    # Values of the warped function at the two inflection points.
+    Fl = scale * l
+    Fh = scale * h
+    assert l > low_freq and h < high_freq
+    # Slopes of the outer segments.
+    scale_left = (Fl - low_freq) / (l - low_freq)
+    scale_right = (high_freq - Fh) / (high_freq - h)
+    res = paddle.empty_like(freq)
+
+    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
+        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
+    before_l = paddle.less_than(freq, paddle.to_tensor(l))
+    before_h = paddle.less_than(freq, paddle.to_tensor(h))
+    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
+
+    # Order matters: the masks overlap, so each later assignment overrides the
+    # earlier ones (`before_l` is a subset of `before_h`, and the outside mask
+    # overlaps `before_l` and `after_h`).
+    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
+    res[before_h] = scale * freq[before_h]
+    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
+    res[outside_low_high_freq] = freq[outside_low_high_freq]
+
+    return res
+
+
+def _vtln_warp_mel_freq(vtln_low_cutoff: float,
+                        vtln_high_cutoff: float,
+                        low_freq,
+                        high_freq: float,
+                        vtln_warp_factor: float,
+                        mel_freq: Tensor) -> Tensor:
+    """Apply the VTLN warp to frequencies given on the mel scale.
+
+    The input is converted from mel to Hz, warped with `_vtln_warp_freq`,
+    and converted back to mel.
+    """
+    freq_hz = _inverse_mel_scale(mel_freq)
+    warped_hz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
+                                high_freq, vtln_warp_factor, freq_hz)
+    return _mel_scale(warped_hz)
+
+
+def _get_mel_banks(num_bins: int,
+                   window_length_padded: int,
+                   sample_freq: float,
+                   low_freq: float,
+                   high_freq: float,
+                   vtln_low: float,
+                   vtln_high: float,
+                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
+    """Build the triangular mel filter bank.
+
+    Args:
+        num_bins (int): Number of mel filters.
+        window_length_padded (int): FFT length; must be even.
+        sample_freq (float): Sample rate of the signal.
+        low_freq (float): Lower edge of the filter bank in Hz.
+        high_freq (float): Upper edge in Hz; values <= 0 are offsets from Nyquist.
+        vtln_low (float): Low inflection point of the VTLN warp.
+        vtln_high (float): High inflection point; negative values are offsets from Nyquist.
+        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping.
+
+    Returns:
+        Tuple[Tensor, Tensor]: `(bins, center_freqs)` where `bins` holds the
+            filter weights with shape `(num_bins, num_fft_bins)` and
+            `center_freqs` holds the centre frequency of every filter in Hz.
+
+    Raises:
+        AssertionError: If the options are inconsistent.
+    """
+    # NOTE(review): the condition requires more than 3 bins while the message
+    # says "at least 3" - confirm which one is intended.
+    assert num_bins > 3, 'Must have at least 3 mel bins'
+    assert window_length_padded % 2 == 0
+    # NOTE(review): `/` yields a float, so `paddle.arange(num_fft_bins)` below
+    # receives a float end value - confirm that this is intended.
+    num_fft_bins = window_length_padded / 2
+    nyquist = 0.5 * sample_freq
+
+    if high_freq <= 0.0:
+        high_freq += nyquist
+
+    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
+        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))
+
+    fft_bin_width = sample_freq / window_length_padded
+    mel_low_freq = _mel_scale_scalar(low_freq)
+    mel_high_freq = _mel_scale_scalar(high_freq)
+
+    # Spacing between consecutive filter edges on the mel scale.
+    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
+
+    if vtln_high < 0.0:
+        vtln_high += nyquist
+
+    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
+                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
+        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
+         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))
+
+    # Left edge, centre and right edge of every triangle, in mel.
+    bin = paddle.arange(num_bins).unsqueeze(1)
+    left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
+    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
+    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
+
+    if vtln_warp_factor != 1.0:
+        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
+                                       vtln_warp_factor, left_mel)
+        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
+                                         high_freq, vtln_warp_factor,
+                                         center_mel)
+        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
+                                        high_freq, vtln_warp_factor, right_mel)
+
+    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
+    # Mel value of every FFT bin frequency.
+    # (1, num_fft_bins)
+    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
+
+    # Rising and falling edges of every triangle.
+    # (num_bins, num_fft_bins)
+    up_slope = (mel - left_mel) / (center_mel - left_mel)
+    down_slope = (right_mel - mel) / (right_mel - center_mel)
+
+    if vtln_warp_factor == 1.0:
+        # Triangle = min of the two slopes, clipped at zero.
+        bins = paddle.maximum(
+            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
+    else:
+        # With warping, each slope is written only inside its own interval.
+        bins = paddle.zeros_like(up_slope)
+        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
+            mel, center_mel)
+        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
+            mel, right_mel)
+        bins[up_idx] = up_slope[up_idx]
+        bins[down_idx] = down_slope[down_idx]
+
+    return bins, center_freqs
+
+
+def fbank(waveform: Tensor,
+          blackman_coeff: float=0.42,
+          channel: int=-1,
+          dither: float=0.0,
+          energy_floor: float=1.0,
+          frame_length: float=25.0,
+          frame_shift: float=10.0,
+          high_freq: float=0.0,
+          htk_compat: bool=False,
+          low_freq: float=20.0,
+          n_mels: int=23,
+          preemphasis_coefficient: float=0.97,
+          raw_energy: bool=True,
+          remove_dc_offset: bool=True,
+          round_to_power_of_two: bool=True,
+          sr: int=16000,
+          snip_edges: bool=True,
+          subtract_mean: bool=False,
+          use_energy: bool=False,
+          use_log_fbank: bool=True,
+          use_power: bool=True,
+          vtln_high: float=-500.0,
+          vtln_low: float=100.0,
+          vtln_warp: float=1.0,
+          window_type: str=POVEY) -> Tensor:
+    """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
+        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
+        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
+        n_mels (int, optional): Number of output mel bins. Defaults to 23.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute the energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that cannot fit a single frame when it
+            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
+        use_energy (bool, optional): Add a dimension with energy of spectrogram to the output. Defaults to False.
+        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
+        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
+        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
+        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
+        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A filter banks tensor with shape `(m, n_mels)`, or `(m, n_mels + 1)` when `use_energy` is True.
+    """
+    dtype = waveform.dtype
+
+    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
+        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
+        preemphasis_coefficient)
+
+    strided_input, signal_log_energy = _get_window(
+        waveform, padded_window_size, window_size, window_shift, window_type,
+        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
+        remove_dc_offset, preemphasis_coefficient)
+
+    # Magnitude spectrum, squared below when `use_power` is set.
+    # (m, padded_window_size // 2 + 1)
+    spectrum = paddle.fft.rfft(strided_input).abs()
+    if use_power:
+        spectrum = spectrum.pow(2.)
+
+    # (n_mels, padded_window_size // 2)
+    mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
+                                     high_freq, vtln_low, vtln_high, vtln_warp)
+    mel_energies = mel_energies.astype(dtype)
+
+    # Append one zero column so the filter bank has as many columns as the
+    # spectrum has frequency bins.
+    # (n_mels, padded_window_size // 2 + 1)
+    mel_energies = paddle.nn.functional.pad(
+        mel_energies.unsqueeze(0), (0, 1),
+        data_format='NCL',
+        mode='constant',
+        value=0).squeeze(0)
+
+    # Project the spectrum onto the mel filters.
+    # (m, n_mels)
+    mel_energies = paddle.mm(spectrum, mel_energies.T)
+    if use_log_fbank:
+        # Floor at epsilon so the log stays finite.
+        mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()
+
+    if use_energy:
+        signal_log_energy = signal_log_energy.unsqueeze(1)
+        # The energy column goes last with `htk_compat`, first otherwise.
+        if htk_compat:
+            mel_energies = paddle.concat(
+                (mel_energies, signal_log_energy), axis=1)
+        else:
+            mel_energies = paddle.concat(
+                (signal_log_energy, mel_energies), axis=1)
+
+    # (m, n_mels), or (m, n_mels + 1) when use_energy is set
+    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
+    return mel_energies
+
+
+def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
+    """Build the DCT matrix used to turn mel energies into cepstra.
+
+    Args:
+        n_mfcc (int): Number of columns kept from the full matrix.
+        n_mels (int): Size of the square DCT matrix that is created first.
+
+    Returns:
+        Tensor: A matrix of shape `(n_mels, n_mfcc)`.
+    """
+    full_matrix = create_dct(n_mels, n_mels, 'ortho')
+    # The first column is replaced by the constant sqrt(1 / n_mels).
+    first_column_value = math.sqrt(1 / float(n_mels))
+    full_matrix[:, 0] = first_column_value
+    truncated = full_matrix[:, :n_mfcc]  # (n_mels, n_mfcc)
+    return truncated
+
+
+def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
+    """Return the liftering weights 1 + 0.5 * L * sin(pi * i / L) for i in [0, n_mfcc).
+
+    Args:
+        n_mfcc (int): Number of coefficients.
+        cepstral_lifter (float): Liftering parameter `L`; must be non-zero.
+
+    Returns:
+        Tensor: A 1-D tensor with `n_mfcc` weights.
+    """
+    indices = paddle.arange(n_mfcc)
+    sine = paddle.sin(math.pi * indices / cepstral_lifter)
+    return 1.0 + 0.5 * cepstral_lifter * sine
+
+
+def mfcc(waveform: Tensor,
+         blackman_coeff: float=0.42,
+         cepstral_lifter: float=22.0,
+         channel: int=-1,
+         dither: float=0.0,
+         energy_floor: float=1.0,
+         frame_length: float=25.0,
+         frame_shift: float=10.0,
+         high_freq: float=0.0,
+         htk_compat: bool=False,
+         low_freq: float=20.0,
+         n_mfcc: int=13,
+         n_mels: int=23,
+         preemphasis_coefficient: float=0.97,
+         raw_energy: bool=True,
+         remove_dc_offset: bool=True,
+         round_to_power_of_two: bool=True,
+         sr: int=16000,
+         snip_edges: bool=True,
+         subtract_mean: bool=False,
+         use_energy: bool=False,
+         vtln_high: float=-500.0,
+         vtln_low: float=100.0,
+         vtln_warp: float=1.0,
+         window_type: str=POVEY) -> Tensor:
+    """Compute and return mel frequency cepstral coefficients from a waveform. The output is
+            identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
+        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
+        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
+        n_mels (int, optional): Number of output mel bins. Defaults to 23.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute the energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that cannot fit a single frame when it
+            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
+        use_energy (bool, optional): Add a dimension with energy of spectrogram to the output. Defaults to False.
+        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
+        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
+        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
+    """
+    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
+        n_mfcc, n_mels)
+
+    dtype = waveform.dtype
+
+    # Log power mel filter bank; mean subtraction is deferred to the end of
+    # this function, so it is disabled here.
+    # (m, n_mels + use_energy)
+    feature = fbank(
+        waveform=waveform,
+        blackman_coeff=blackman_coeff,
+        channel=channel,
+        dither=dither,
+        energy_floor=energy_floor,
+        frame_length=frame_length,
+        frame_shift=frame_shift,
+        high_freq=high_freq,
+        htk_compat=htk_compat,
+        low_freq=low_freq,
+        n_mels=n_mels,
+        preemphasis_coefficient=preemphasis_coefficient,
+        raw_energy=raw_energy,
+        remove_dc_offset=remove_dc_offset,
+        round_to_power_of_two=round_to_power_of_two,
+        sr=sr,
+        snip_edges=snip_edges,
+        subtract_mean=False,
+        use_energy=use_energy,
+        use_log_fbank=True,
+        use_power=True,
+        vtln_high=vtln_high,
+        vtln_low=vtln_low,
+        vtln_warp=vtln_warp,
+        window_type=window_type)
+
+    if use_energy:
+        # Split the energy column (last with htk_compat, first otherwise)
+        # from the n_mels filter bank columns.
+        # (m)
+        signal_log_energy = feature[:, n_mels if htk_compat else 0]
+        mel_offset = int(not htk_compat)
+        feature = feature[:, mel_offset:(n_mels + mel_offset)]
+
+    # (n_mels, n_mfcc)
+    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
+
+    # (m, n_mfcc)
+    feature = feature.matmul(dct_matrix)
+
+    if cepstral_lifter != 0.0:
+        # (1, n_mfcc)
+        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
+        feature *= lifter_coeffs.astype(dtype=dtype)
+
+    if use_energy:
+        # The first coefficient is replaced by the log energy.
+        feature[:, 0] = signal_log_energy
+
+    if htk_compat:
+        # Move the first column to the end.
+        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
+        feature = feature[:, 1:]  # (m, n_mfcc - 1)
+        if not use_energy:
+            # NOTE(review): presumably matches HTK's scaling of C0 - confirm.
+            energy *= math.sqrt(2)
+
+        feature = paddle.concat((feature, energy), axis=1)
+
+    feature = _subtract_column_mean(feature, subtract_mean)
+    return feature
--- a/audio/paddleaudio/compliance/librosa.py
+++ b/audio/paddleaudio/compliance/librosa.py
@ -0,0 +1,788 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from librosa(https://github.com/librosa/librosa)
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import scipy
+from numpy.lib.stride_tricks import as_strided
+from scipy import signal
+
+from ..backends import depth_convert
+from ..utils import ParameterError
+
+# Names exported by `from <module> import *`. Underscore-prefixed helpers and
+# `fft_frequencies` are defined in this module but are not listed here.
+__all__ = [
+    # dsp: transforms, mel-scale conversions and mu-law companding
+    'stft',
+    'mfcc',
+    'hz_to_mel',
+    'mel_to_hz',
+    'mel_frequencies',
+    'power_to_db',
+    'compute_fbank_matrix',
+    'melspectrogram',
+    'spectrogram',
+    'mu_encode',
+    'mu_decode',
+    # augmentation: bit-depth degradation, masking and random cropping
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
+]
+
+
+def _pad_center(data: np.ndarray, size: int, axis: int=-1,
+                **kwargs) -> np.ndarray:
+    """Pad an array to a target length along a target axis.
+
+    This differs from `np.pad` by centering the data prior to padding,
+    analogous to `str.center`. When the amount of padding is odd, the extra
+    element is placed on the trailing side.
+
+    Args:
+        data (np.ndarray): Array to pad.
+        size (int): Target length of `data` along `axis`.
+        axis (int, optional): Axis to pad. Defaults to -1.
+        **kwargs: Forwarded to `np.pad`; `mode` defaults to "constant".
+
+    Returns:
+        np.ndarray: Padded array whose length along `axis` is `size`.
+
+    Raises:
+        ParameterError: If `size` is smaller than the current length along `axis`.
+    """
+    kwargs.setdefault("mode", "constant")
+    n = data.shape[axis]
+    lpad = int((size - n) // 2)
+
+    # Floor division makes lpad negative exactly when size < n. Validate
+    # before building the pad widths so the error reports the real sizes.
+    if lpad < 0:
+        raise ParameterError(f"Target size ({size}) must be "
+                             f"at least input size ({n})")
+
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    return np.pad(data, lengths, **kwargs)
+
+
+def _split_frames(x: np.ndarray,
+                  frame_length: int,
+                  hop_length: int,
+                  axis: int=-1) -> np.ndarray:
+    """Slice a data array into (overlapping) frames.
+
+    This function is aligned with librosa.frame. The frames are produced with
+    `as_strided`, i.e. by re-describing the shape and strides of `x` rather
+    than by gathering samples in a Python loop.
+
+    Args:
+        x (np.ndarray): Array to frame.
+        frame_length (int): Number of samples in each frame.
+        hop_length (int): Number of samples between the starts of adjacent frames.
+        axis (int, optional): Axis to frame along; only 0 and -1 are accepted. Defaults to -1.
+
+    Returns:
+        np.ndarray: Framed array. For `axis=-1` the trailing dimensions are
+            `(frame_length, n_frames)`; for `axis=0` the leading dimensions are
+            `(n_frames, frame_length)`.
+
+    Raises:
+        ParameterError: If `x` is not an ndarray, `x` is shorter than
+            `frame_length` along `axis`, `hop_length < 1`, or `axis` is
+            neither 0 nor -1.
+    """
+
+    if not isinstance(x, np.ndarray):
+        raise ParameterError(
+            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
+
+    if x.shape[axis] < frame_length:
+        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
+                             f" for frame_length={frame_length:d}")
+
+    if hop_length < 1:
+        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+    # The stride computation below relies on a specific memory layout per axis:
+    # Fortran order when framing the last axis, C order when framing the first.
+    # Inputs in another layout are converted (which copies) with a warning.
+    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.asfortranarray(x)
+    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.ascontiguousarray(x)
+
+    # Number of complete frames that fit; a trailing partial frame is dropped.
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    # Product of the positive per-axis strides (in items), converted back to
+    # bytes; multiplied by hop_length below to step from one frame to the next.
+    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+
+    if axis == -1:
+        # Append the frame index as a new trailing axis.
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+
+    elif axis == 0:
+        # Prepend the frame index as a new leading axis.
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * new_stride] + list(strides)
+
+    else:
+        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
+
+    return as_strided(x, shape=shape, strides=strides)
+
+
+def _check_audio(y, mono=True) -> bool:
+    """Determine whether a variable contains valid audio data.
+
+    The audio `y` must be a floating-point np.ndarray with one or two
+    dimensions, contain at least one sample and hold only finite values.
+
+    Args:
+        y (np.ndarray): Candidate audio buffer.
+        mono (bool, optional): If True, reject two-dimensional input. Defaults to True.
+
+    Returns:
+        bool: Always True; every failed check raises instead of returning False.
+
+    Raises:
+        ParameterError: If `y` is not an ndarray, has an unsupported number of
+            dimensions, is empty, is not floating-point, or contains NaN/Inf.
+    """
+    if not isinstance(y, np.ndarray):
+        raise ParameterError("Audio data must be of type numpy.ndarray")
+
+    # 0-d arrays carry no time axis; reject them here instead of letting a
+    # later length lookup fail with an unrelated TypeError.
+    if y.ndim < 1 or y.ndim > 2:
+        raise ParameterError(
+            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if mono and y.ndim == 2:
+        raise ParameterError(
+            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
+
+    # `size` is 0 whenever any dimension is 0, which covers both the mono and
+    # the multi-channel layout without indexing a dimension that may not exist.
+    if y.size == 0:
+        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
+
+    if not np.issubdtype(y.dtype, np.floating):
+        raise ParameterError("Audio data must be floating-point")
+
+    if not np.isfinite(y).all():
+        raise ParameterError("Audio buffer is not finite everywhere")
+
+    return True
+
+
+def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert Hz to Mels.
+
+    Without `htk`, the scale is linear below 1000 Hz (200/3 Hz per mel) and
+    logarithmic above it; with `htk`, the formula `2595 * log10(1 + f / 700)`
+    is used.
+
+    Args:
+        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequency in mels.
+    """
+    hz = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + hz / 700.0)
+
+    # Constants of the piecewise scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    # Linear mapping everywhere first; the log region is overwritten below
+    mels = (hz - f_min) / f_sp
+
+    if hz.ndim == 0:
+        # Scalar input: a single comparison decides the region
+        if hz >= min_log_hz:
+            return min_log_mel + np.log(hz / min_log_hz) / logstep
+        return mels
+
+    # Array input: patch only the entries at or above the breakpoint
+    in_log_region = hz >= min_log_hz
+    mels[in_log_region] = min_log_mel + \
+        np.log(hz[in_log_region] / min_log_hz) / logstep
+    return mels
+
+
+def mel_to_hz(mels: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert mel bin numbers to frequencies.
+
+    Without `htk`, the scale is linear up to the mel value corresponding to
+    1000 Hz and exponential above it; with `htk`, the formula
+    `700 * (10 ** (m / 2595) - 1)` is used.
+
+    Args:
+        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequencies in Hz.
+    """
+    mel_array = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
+
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mel_array
+
+    # And now the nonlinear scale
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if mel_array.ndim:
+        # If we have vector data, vectorize
+        log_t = mel_array >= min_log_mel
+        freqs[log_t] = min_log_hz * \
+            np.exp(logstep * (mel_array[log_t] - min_log_mel))
+    elif mel_array >= min_log_mel:
+        # If we have scalar data, check directly
+        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
+
+    return freqs
+
+
+def mel_frequencies(n_mels: int=128,
+                    fmin: float=0.0,
+                    fmax: float=11025.0,
+                    htk: bool=False) -> np.ndarray:
+    """Compute mel frequencies.
+
+    The band limits are converted to mels, `n_mels` points are spaced evenly
+    between them on the mel axis, and the points are converted back to Hz.
+
+    Args:
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
+    """
+    # Band limits expressed on the mel axis
+    mel_lo, mel_hi = [hz_to_mel(edge, htk=htk) for edge in (fmin, fmax)]
+
+    # Uniform grid in mels, mapped back to Hz
+    mel_grid = np.linspace(mel_lo, mel_hi, n_mels)
+    return mel_to_hz(mel_grid, htk=htk)
+
+
+def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
+    """Compute fourier frequencies.
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): FFT size.
+
+    Returns:
+        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`,
+            evenly spaced from 0 up to and including `sr / 2`.
+    """
+    nyquist = float(sr) / 2
+    n_bins = int(1 + n_fft // 2)
+    return np.linspace(0, nyquist, n_bins, endpoint=True)
+
+
+def compute_fbank_matrix(sr: int,
+                         n_fft: int,
+                         n_mels: int=128,
+                         fmin: float=0.0,
+                         fmax: Optional[float]=None,
+                         htk: bool=False,
+                         norm: str="slaney",
+                         dtype: type=np.float32) -> np.ndarray:
+    """Compute fbank matrix.
+
+    Each row is a triangular filter over the FFT bin frequencies, scaled so
+    that every band has approximately constant energy (Slaney normalization).
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): FFT size.
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        fmax (Optional[float], optional): Maximum frequency in Hz; `sr / 2` is used when None. Defaults to None.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        norm (str, optional): Type of normalization; only "slaney" is accepted. Defaults to "slaney".
+        dtype (type, optional): Data type. Defaults to np.float32.
+
+    Returns:
+        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
+
+    Raises:
+        ParameterError: If `norm` is not "slaney".
+    """
+    if norm != "slaney":
+        raise ParameterError('norm must set to slaney')
+
+    upper_hz = float(sr) / 2 if fmax is None else fmax
+
+    n_mels = int(n_mels)
+    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Frequency of every FFT bin, and the n_mels + 2 band edges in Hz
+    bin_hz = fft_frequencies(sr=sr, n_fft=n_fft)
+    edge_hz = mel_frequencies(n_mels + 2, fmin=fmin, fmax=upper_hz, htk=htk)
+
+    edge_gap = np.diff(edge_hz)
+    ramps = np.subtract.outer(edge_hz, bin_hz)
+
+    # Rising and falling slope of every band at once: band i rises from edge i
+    # and falls towards edge i + 2.
+    rising = -ramps[:n_mels] / edge_gap[:n_mels, np.newaxis]
+    falling = ramps[2:n_mels + 2] / edge_gap[1:n_mels + 1, np.newaxis]
+
+    # Triangle = pointwise minimum of both slopes, clipped at zero
+    weights[:] = np.maximum(0, np.minimum(rising, falling))
+
+    # Slaney-style mel is scaled to be approx constant energy per channel
+    band_scale = 2.0 / (edge_hz[2:n_mels + 2] - edge_hz[:n_mels])
+    weights *= band_scale[:, np.newaxis]
+
+    # A band is acceptable if its lower edge is 0 Hz or it has a positive peak
+    starts_at_zero = edge_hz[:-2] == 0
+    has_response = weights.max(axis=1) > 0
+    if not np.all(starts_at_zero | has_response):
+        # This means we have an empty channel somewhere
+        warnings.warn("Empty filters detected in mel frequency basis. "
+                      "Some channels will produce empty responses. "
+                      "Try increasing your sampling rate (and fmax) or "
+                      "reducing n_mels.")
+
+    return weights
+
+
+def stft(x: np.ndarray,
+         n_fft: int=2048,
+         hop_length: Optional[int]=None,
+         win_length: Optional[int]=None,
+         window: str="hann",
+         center: bool=True,
+         dtype: type=np.complex64,
+         pad_mode: str="reflect") -> np.ndarray:
+    """Short-time Fourier transform (STFT).
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        n_fft (int, optional): FFT size. Defaults to 2048.
+        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows; `win_length // 4` is used when None. Defaults to None.
+        win_length (Optional[int], optional): The size of window; `n_fft` is used when None. Defaults to None.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
+        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+
+    Returns:
+        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
+
+    Raises:
+        ParameterError: If `x` fails audio validation, or if `center` is False
+            and `n_fft` is larger than the signal length.
+    """
+    _check_audio(x)
+
+    # By default, use the entire frame
+    if win_length is None:
+        win_length = n_fft
+
+    # Set the default hop, if it's not already specified
+    if hop_length is None:
+        hop_length = int(win_length // 4)
+
+    fft_window = signal.get_window(window, win_length, fftbins=True)
+
+    # Pad the window out to n_fft size
+    fft_window = _pad_center(fft_window, n_fft)
+
+    # Reshape so that the window can be broadcast
+    fft_window = fft_window.reshape((-1, 1))
+
+    # Pad the time series so that frames are centered
+    if center:
+        if n_fft > x.shape[-1]:
+            # Still computable thanks to the padding below, but every frame
+            # will consist mostly of padded samples.
+            warnings.warn(
+                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
+            )
+        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
+
+    elif n_fft > x.shape[-1]:
+        # Without padding not even a single full frame fits in the signal.
+        raise ParameterError(
+            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
+        )
+
+    # Window the time series.
+    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
+    # Pre-allocate the STFT matrix
+    stft_matrix = np.empty(
+        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
+    fft = np.fft  # use numpy fft as default
+    # Constrain STFT block sizes to 256 KB
+    MAX_MEM_BLOCK = 2**8 * 2**10
+    # how many columns can we fit within MAX_MEM_BLOCK?
+    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
+    n_columns = max(n_columns, 1)
+
+    # Transform the frames block by block to bound the size of the temporaries
+    for bl_s in range(0, stft_matrix.shape[1], n_columns):
+        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
+        stft_matrix[:, bl_s:bl_t] = fft.rfft(
+            fft_window * x_frames[:, bl_s:bl_t], axis=0)
+
+    return stft_matrix
+
+
+def power_to_db(spect: np.ndarray,
+                ref: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=80.0) -> np.ndarray:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
+
+    Args:
+        spect (np.ndarray): STFT power spectrogram of an input waveform.
+        ref (float, optional): The reference value, or a callable that computes it from the spectrogram. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0.
+
+    Returns:
+        np.ndarray: Power spectrogram in db scale.
+
+    Raises:
+        ParameterError: If `amin` is not strictly positive or `top_db` is negative.
+    """
+    power = np.asarray(spect)
+
+    if amin <= 0:
+        raise ParameterError("amin must be strictly positive")
+
+    if np.issubdtype(power.dtype, np.complexfloating):
+        warnings.warn(
+            "power_to_db was called on complex input so phase "
+            "information will be discarded. To suppress this warning, "
+            "call power_to_db(np.abs(D)**2) instead.")
+        power = np.abs(power)
+
+    # A callable reference is evaluated on the (real-valued) spectrogram
+    ref_value = ref(power) if callable(ref) else np.abs(ref)
+
+    # 10 * log10(x / ref), with both terms floored at amin before the log
+    log_spec = 10.0 * np.log10(np.maximum(amin, power))
+    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+    if top_db is None:
+        return log_spec
+
+    if top_db < 0:
+        raise ParameterError("top_db must be non-negative")
+
+    # Limit the dynamic range to top_db below the peak
+    return np.maximum(log_spec, log_spec.max() - top_db)
+
+
+def mfcc(x: np.ndarray,
+         sr: int=16000,
+         spect: Optional[np.ndarray]=None,
+         n_mfcc: int=20,
+         dct_type: int=2,
+         norm: str="ortho",
+         lifter: int=0,
+         **kwargs) -> np.ndarray:
+    """Mel-frequency cepstral coefficients (MFCCs)
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension. Ignored when `spect` is given.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. When None, it is computed from `x` with `melspectrogram`. Defaults to None.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
+        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
+        norm (str, optional): Type of normalization. Defaults to "ortho".
+        lifter (int, optional): Cepstral filtering. If positive, row `n` (0-based) is scaled by `1 + (lifter / 2) * sin(pi * (n + 1) / lifter)`. Defaults to 0.
+        **kwargs: Forwarded to `melspectrogram` when `spect` is None.
+
+    Returns:
+        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
+
+    Raises:
+        ParameterError: If `lifter` is negative.
+    """
+    # Imported explicitly: a bare `import scipy` does not guarantee that the
+    # `fftpack` subpackage is loaded on every SciPy version.
+    from scipy.fftpack import dct
+
+    # Reject an invalid lifter before doing any expensive work
+    if lifter < 0:
+        raise ParameterError(
+            f"MFCC lifter={lifter} must be a non-negative number")
+
+    if spect is None:
+        spect = melspectrogram(x, sr=sr, **kwargs)
+
+    M = dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
+
+    if lifter == 0:
+        return M
+
+    # Sinusoidal liftering as defined by librosa/HTK. Sized from M itself so
+    # that an input with fewer than n_mfcc rows still broadcasts.
+    index = np.arange(1, 1 + M.shape[0], dtype=M.dtype)
+    factor = 1 + (lifter / 2) * np.sin(np.pi * index / lifter)
+    return M * factor[:, np.newaxis]
+
+
+def melspectrogram(x: np.ndarray,
+                   sr: int=16000,
+                   window_size: int=512,
+                   hop_length: int=320,
+                   n_mels: int=64,
+                   fmin: float=50.0,
+                   fmax: Optional[float]=None,
+                   window: str='hann',
+                   center: bool=True,
+                   pad_mode: str='reflect',
+                   power: float=2.0,
+                   to_db: bool=True,
+                   ref: float=1.0,
+                   amin: float=1e-10,
+                   top_db: Optional[float]=None) -> np.ndarray:
+    """Compute mel-spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        fmax (Optional[float], optional): Maximum frequency in Hz; `sr // 2` is used when None. Defaults to None.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
+        to_db (bool, optional): Enable db scale. Defaults to True.
+        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
+
+    Returns:
+        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
+
+    Raises:
+        ParameterError: If `x` fails audio validation (which includes being
+            empty) or the band limits do not satisfy `0 <= fmin < fmax`.
+    """
+    # Validation already rejects empty input, so no separate length check.
+    _check_audio(x, mono=True)
+
+    if fmax is None:
+        fmax = sr // 2
+    if fmin < 0 or fmin >= fmax:
+        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    # Magnitude spectrum raised to `power`, then projected onto the mel bands
+    spect_power = np.abs(s)**power
+    fb_matrix = compute_fbank_matrix(
+        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
+    mel_spect = np.matmul(fb_matrix, spect_power)
+
+    if to_db:
+        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
+    return mel_spect
+
+
+def spectrogram(x: np.ndarray,
+                sr: int=16000,
+                window_size: int=512,
+                hop_length: int=320,
+                window: str='hann',
+                center: bool=True,
+                pad_mode: str='reflect',
+                power: float=2.0) -> np.ndarray:
+    """Compute spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Not used by the computation. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent applied to the STFT magnitude. Defaults to 2.0.
+
+    Returns:
+        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
+    """
+
+    # The FFT size and the window length are both tied to window_size
+    stft_options = dict(
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    magnitude = np.abs(stft(x, **stft_options))
+    return magnitude**power
+
+
+def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result is rounded to integer-valued codes in range `[0, mu]`. Otherwise, the resulting waveform is in range `[-1,1]`.
+
+    Args:
+        x (np.ndarray): The input waveform to encode, expected in range `[-1,1]`.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law encoded waveform.
+
+    Raises:
+        ParameterError: If `mu` is smaller than 1.
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    # Compand with the caller's mu (it must not be overwritten here)
+    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
+    if quantized:
+        # Map [-1, 1] onto [0, mu] and round to the nearest code
+        y = np.floor((y + 1) / 2 * mu + 0.5)
+    return y
+
+
+def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0, mu]` when quantized is True and `[-1,1]` otherwise.
+
+    The expansion `sign(y) / mu * ((1 + mu) ** |y| - 1)` is the exact inverse
+    of the companding `sign(x) * log(1 + mu * |x|) / log(1 + mu)` for the same
+    `mu`.
+
+    Args:
+        y (np.ndarray): The encoded waveform.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law decoded waveform.
+
+    Raises:
+        ParameterError: If `mu` is smaller than 1.
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    # Use mu unchanged: decrementing it would expand with a different constant
+    # than the one used for companding and break the round trip.
+    if quantized:  # undo the quantization: [0, mu] -> [-1, 1]
+        y = y * 2 / mu - 1
+    return np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
+
+
+def _randint(high: int) -> int:
+    """Draw one random integer from the half-open range [0, high).
+
+    Helper for the random data augmentation functions; it uses NumPy's global
+    random state and returns a built-in `int`.
+    """
+    draw = np.random.randint(high)
+    return int(draw)
+
+
+def depth_augment(y: np.ndarray,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
+    """ Audio depth augmentation. Simulate the distortion brought by quantization by converting the waveform to a randomly chosen depth and back to its original dtype.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16'].
+        probs (List[float], optional): Probabilities to depth conversion, one per entry of `choices`. Defaults to [0.5, 0.5].
+
+    Returns:
+        np.ndarray: The augmented waveform.
+    """
+    n_choices, n_probs = len(choices), len(probs)
+    assert n_probs == n_choices, \
+        'number of choices {} must be equal to size of probs {}'.format(
+            n_choices, n_probs)
+
+    # Pick the intermediate depth, then remember where to convert back to
+    target_depth = np.random.choice(choices, p=probs)
+    original_depth = y.dtype
+
+    # Round trip through the lower depth to pick up its quantization error
+    degraded = depth_convert(y, target_depth)
+    return depth_convert(degraded, original_depth)
+
+
+def adaptive_spect_augment(spect: np.ndarray,
+                           tempo_axis: int=0,
+                           level: float=0.1) -> np.ndarray:
+    """Do adaptive spectrogram augmentation. The strength of the augmentation is governed by the parameter level, ranging from 0 to 1, where 0 means no augmentation.
+
+    Both the number of masks (`int(10 * level)` per axis) and their width
+    (`level * 0.5` of the axis length) grow with `level`. Masked regions are
+    set to 0 directly in `spect`, which is also the returned object.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        level (float, optional): The level factor of masking. Defaults to 0.1.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+
+    # Any tempo_axis other than 0 means time runs along the second axis
+    time_axis = 0 if tempo_axis == 0 else 1
+    freq_axis = 1 - time_axis
+    nt = spect.shape[time_axis]
+    nf = spect.shape[freq_axis]
+
+    mask_count = int(10 * level)
+    # (axis, axis length, mask width); time masks are drawn before freq masks
+    plans = (
+        (time_axis, nt, int(nt * level * 0.5)),
+        (freq_axis, nf, int(nf * level * 0.5)), )
+
+    for axis, length, width in plans:
+        for _ in range(mask_count):
+            begin = _randint(length - width)
+            region = [slice(None), slice(None)]
+            region[axis] = slice(begin, begin + width)
+            spect[tuple(region)] = 0
+
+    return spect
+
+
+def spect_augment(spect: np.ndarray,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> np.ndarray:
+    """Do spectrogram augmentation in both time and freq axis.
+
+    The number of masks and the mask width are drawn once per axis from
+    `[0, max)`, i.e. the given maxima are exclusive. A maximum of 0 (or less)
+    disables the corresponding masking. A drawn width that does not fit is
+    clamped to one less than the axis length. Masked regions are set to 0
+    directly in `spect`, which is also the returned object.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
+        max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3.
+        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
+        max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+
+    # Any tempo_axis other than 0 means time runs along the second axis
+    time_axis = 0 if tempo_axis == 0 else 1
+    freq_axis = 1 - time_axis
+    nt = spect.shape[time_axis]
+    nf = spect.shape[freq_axis]
+
+    def _draw(upper: int) -> int:
+        # `_randint` rejects an empty range; treat it as "nothing to draw"
+        return _randint(upper) if upper > 0 else 0
+
+    # Draw order is kept (counts first, then widths) so that seeded runs on
+    # inputs that worked before produce the same masks.
+    num_time_mask = _draw(max_time_mask)
+    num_freq_mask = _draw(max_freq_mask)
+    time_mask_width = _draw(max_time_mask_width)
+    freq_mask_width = _draw(max_freq_mask_width)
+
+    plans = (
+        (time_axis, nt, num_time_mask, time_mask_width),
+        (freq_axis, nf, num_freq_mask, freq_mask_width), )
+
+    for axis, length, count, width in plans:
+        if length < 1:
+            # Nothing to mask on an empty axis
+            continue
+        # A mask at least as long as the axis would leave no valid start index
+        width = min(width, length - 1)
+        for _ in range(count):
+            begin = _randint(length - width)
+            region = [slice(None), slice(None)]
+            region[axis] = slice(begin, begin + width)
+            spect[tuple(region)] = 0
+
+    return spect
+
+
+def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
+    """ Random cropping on a input waveform.
+
+    The start index is drawn uniformly from every position at which a window
+    of `crop_len` samples fits, including the one that ends on the last sample.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D.
+        crop_len (int): Length of waveform to crop.
+
+    Returns:
+        np.ndarray: The cropped waveform.
+
+    Raises:
+        ParameterError: If `y` is not one-dimensional.
+        ValueError: Raised by NumPy's random integer draw when `crop_len`
+            exceeds `len(y)`.
+    """
+    if y.ndim != 1:
+        raise ParameterError('only accept 1d tensor or numpy array')
+
+    n = len(y)
+    # Valid starts are 0 .. n - crop_len inclusive; `_randint` excludes its
+    # upper bound, hence the + 1.
+    idx = _randint(n - crop_len + 1)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: np.ndarray, crop_len: int,
+                  tempo_axis: int=0) -> np.ndarray:
+    """ Random cropping on a spectrogram.
+
+    The start index is drawn uniformly from every position at which a window
+    of `crop_len` frames fits along `tempo_axis`, including the one that ends
+    on the last frame. All other axes are kept whole.
+
+    Args:
+        s (np.ndarray): Input spectrogram in 2D.
+        crop_len (int): Length of spectrogram to crop.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+
+    Returns:
+        np.ndarray: The cropped spectrogram.
+
+    Raises:
+        ParameterError: If `tempo_axis` is not a valid axis of `s`.
+        ValueError: Raised by NumPy's random integer draw when `crop_len`
+            exceeds the length of `s` along `tempo_axis`.
+    """
+    # Negative axes are fine as long as they stay within -ndim
+    if tempo_axis >= s.ndim or tempo_axis < -s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    # Valid starts are 0 .. n - crop_len inclusive; `_randint` excludes its
+    # upper bound, hence the + 1.
+    idx = _randint(high=n - crop_len + 1)
+
+    # Full slice on every axis except the one being cropped
+    sli = [slice(None)] * s.ndim
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    return s[tuple(sli)]
--- a/audio/paddleaudio/datasets/init.py
+++ b/audio/paddleaudio/datasets/init.py
@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .esc50 import ESC50
+from .gtzan import GTZAN
+from .hey_snips import HeySnips
+from .rirs_noises import OpenRIRNoise
+from .tess import TESS
+from .urban_sound import UrbanSound8K
+from .voxceleb import VoxCeleb
--- a/audio/paddleaudio/datasets/dataset.py
+++ b/audio/paddleaudio/datasets/dataset.py
@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..compliance.kaldi import fbank as kaldi_fbank
+from ..compliance.kaldi import mfcc as kaldi_mfcc
+from ..compliance.librosa import melspectrogram
+from ..compliance.librosa import mfcc
+
+# Maps each supported `feat_type` name to the function that extracts it.
+# 'raw' maps to None, meaning the loaded waveform is used without extraction.
+feat_funcs = {
+    'raw': None,
+    'melspectrogram': melspectrogram,
+    'mfcc': mfcc,
+    'kaldi_fbank': kaldi_fbank,
+    'kaldi_mfcc': kaldi_mfcc,
+}
+
+
+class AudioClassificationDataset(paddle.io.Dataset):
+    """
+    Base class of audio classification dataset.
+
+    Audio is loaded from disk each time an item is accessed and, unless
+    `feat_type` is 'raw', converted with the matching function of `feat_funcs`.
+    """
+
+    def __init__(self,
+                 files: List[str],
+                 labels: List[int],
+                 feat_type: str='raw',
+                 sample_rate: int=None,
+                 **kwargs):
+        """
+        Args:
+            files (:obj:`List[str]`): A list of absolute path of audio files.
+            labels (:obj:`List[int]`): Labels of audio files.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extract of an audio file.
+            sample_rate (:obj:`int`, `optional`, defaults to `None`):
+                Sample rate handed to the audio loader. When `None`, the loader is called without it.
+            **kwargs: Extra keyword arguments forwarded to the feature function.
+
+        Raises:
+            RuntimeError: If `feat_type` is not a key of `feat_funcs`.
+        """
+        super(AudioClassificationDataset, self).__init__()
+
+        if feat_type not in feat_funcs.keys():
+            raise RuntimeError(
+                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
+            )
+
+        self.files = files
+        self.labels = labels
+
+        self.feat_type = feat_type
+        self.sample_rate = sample_rate
+        self.feat_config = kwargs  # Pass keyword arguments to customize feature config
+
+    def _get_data(self, input_file: str):
+        """Hook for subclasses; the base class provides no implementation."""
+        raise NotImplementedError
+
+    def _convert_to_record(self, idx):
+        """Load the `idx`-th file and return a dict with keys 'feat' and 'label'."""
+        file, label = self.files[idx], self.labels[idx]
+
+        if self.sample_rate is None:
+            waveform, sample_rate = load_audio(file)
+        else:
+            waveform, sample_rate = load_audio(file, sr=self.sample_rate)
+
+        feat_func = feat_funcs[self.feat_type]
+
+        record = {}
+        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
+            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
+            # Pass the rate returned by the loader, as the non-kaldi branch
+            # does: `self.sample_rate` is None when no rate was requested.
+            record['feat'] = feat_func(
+                waveform=waveform, sr=sample_rate, **self.feat_config)
+        elif feat_func:
+            record['feat'] = feat_func(waveform, sample_rate,
+                                       **self.feat_config)
+        else:
+            # 'raw': hand back the waveform untouched
+            record['feat'] = waveform
+        record['label'] = label
+        return record
+
+    def __getitem__(self, idx):
+        """Return the `idx`-th example.
+
+        For kaldi features the result is `(key, feat, label)`; otherwise it is
+        `(feat transposed as np.ndarray, label as int64 np.ndarray)`.
+        """
+        record = self._convert_to_record(idx)
+        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
+            # NOTE(review): `self.keys` is not assigned anywhere in this base
+            # class; presumably subclasses using kaldi features set it - confirm.
+            return self.keys[idx], record['feat'], record['label']
+        else:
+            return np.array(record['feat']).transpose(), np.array(
+                record['label'], dtype=np.int64)
+
+    def __len__(self):
+        """Number of audio files in the dataset."""
+        return len(self.files)
--- a/audio/paddleaudio/datasets/esc50.py
+++ b/audio/paddleaudio/datasets/esc50.py
@ -0,0 +1,152 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['ESC50']
+
+
+class ESC50(AudioClassificationDataset):
+    """
+    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
+    suitable for benchmarking methods of environmental sound classification. The dataset
+    consists of 5-second-long recordings organized into 50 semantic classes (with
+    40 examples per class).
+
+    Reference:
+        ESC: Dataset for Environmental Sound Classification
+        http://dx.doi.org/10.1145/2733373.2806390
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
+            'md5': '7771e4b9d86d0945acce719c7a59305a',
+        },
+    ]
+    label_list = [
+        # Animals
+        'Dog',
+        'Rooster',
+        'Pig',
+        'Cow',
+        'Frog',
+        'Cat',
+        'Hen',
+        'Insects (flying)',
+        'Sheep',
+        'Crow',
+        # Natural soundscapes & water sounds
+        'Rain',
+        'Sea waves',
+        'Crackling fire',
+        'Crickets',
+        'Chirping birds',
+        'Water drops',
+        'Wind',
+        'Pouring water',
+        'Toilet flush',
+        'Thunderstorm',
+        # Human, non-speech sounds
+        'Crying baby',
+        'Sneezing',
+        'Clapping',
+        'Breathing',
+        'Coughing',
+        'Footsteps',
+        'Laughing',
+        'Brushing teeth',
+        'Snoring',
+        'Drinking, sipping',
+        # Interior/domestic sounds
+        'Door knock',
+        'Mouse click',
+        'Keyboard typing',
+        'Door, wood creaks',
+        'Can opening',
+        'Washing machine',
+        'Vacuum cleaner',
+        'Clock alarm',
+        'Clock tick',
+        'Glass breaking',
+        # Exterior/urban noises
+        'Helicopter',
+        'Chainsaw',
+        'Siren',
+        'Car horn',
+        'Engine',
+        'Train',
+        'Church bells',
+        'Airplane',
+        'Fireworks',
+        'Hand saw',
+    ]
+    # Paths below are relative to DATA_HOME.
+    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO',
+        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
+    audio_path = os.path.join('ESC-50-master', 'audio')
+
+    def __init__(self,
+                 mode: str='train',
+                 split: int=1,
+                 feat_type: str='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                Dataset mode. `train` keeps every fold except `split`; any other
+                value keeps only fold `split`.
+            split (:obj:`int`, `optional`, defaults to 1):
+                The fold that is held out as the dev set.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                The feature type to extract from each audio file.
+            **kwargs:
+                Forwarded unchanged to the parent class constructor.
+        """
+        wav_files, targets = self._get_data(mode, split)
+        super(ESC50, self).__init__(
+            files=wav_files, labels=targets, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self) -> List[collections.namedtuple]:
+        """Parse the metadata csv (header row skipped) into `meta_info` records."""
+        meta_file = os.path.join(DATA_HOME, self.meta)
+        with open(meta_file, 'r') as rf:
+            rows = rf.readlines()[1:]
+        return [self.meta_info(*row.strip().split(',')) for row in rows]
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        """Download the archive if it is missing, then select files and labels for `mode`.
+
+        Returns:
+            Two parallel lists: audio file paths and their integer targets.
+        """
+        audio_dir = os.path.join(DATA_HOME, self.audio_path)
+        meta_file = os.path.join(DATA_HOME, self.meta)
+        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        want_train = mode == 'train'
+        files = []
+        labels = []
+        for record in self._get_meta_info():
+            in_dev_fold = int(record.fold) == split
+            # train keeps everything outside the dev fold; any other mode keeps
+            # exactly the dev fold.
+            if want_train != in_dev_fold:
+                files.append(os.path.join(audio_dir, record.filename))
+                labels.append(int(record.target))
+
+        return files, labels
--- a/audio/paddleaudio/datasets/gtzan.py
+++ b/audio/paddleaudio/datasets/gtzan.py
@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['GTZAN']
+
+
+class GTZAN(AudioClassificationDataset):
+    """
+    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
+    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
+    in machine listening research for music genre recognition (MGR).
+
+    Reference:
+        Musical genre classification of audio signals
+        https://ieeexplore.ieee.org/document/1021072/
+    """
+
+    archieves = [
+        {
+            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
+            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
+        },
+    ]
+    label_list = [
+        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
+        'pop', 'reggae', 'rock'
+    ]
+    # Paths below are relative to DATA_HOME.
+    meta = os.path.join('genres', 'input.mf')
+    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
+    audio_path = 'genres'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                Dataset mode. `train` keeps every fold except `split`; any other
+                value keeps only fold `split`.
+            seed (:obj:`int`, `optional`, defaults to 0):
+                The random seed used to shuffle samples before folding.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                The fold (1-based) that is held out as the dev set.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                The feature type to extract from each audio file.
+            **kwargs:
+                Forwarded unchanged to the parent class constructor.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(GTZAN, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self) -> List[collections.namedtuple]:
+        """Parse the tab-separated metadata file into `meta_info` records."""
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf:
+                line = line.strip()
+                if not line:
+                    # A blank line would otherwise fail the namedtuple construction.
+                    continue
+                ret.append(self.meta_info(*line.split('\t')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        """Download the archive if it is missing, then select files and labels for `mode`.
+
+        Samples are shuffled with `seed` and cut into `n_folds` consecutive
+        folds; fold `split` is the dev set and the rest is the train set.
+
+        Returns:
+            Two parallel lists: audio file paths and their label indices.
+        """
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+        # NOTE: this reseeds the process-wide `random` generator.
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            meta_info
+        )  # make sure using the same seed to create train and dev dataset
+
+        files = []
+        labels = []
+        # At least 1, so a dataset smaller than n_folds cannot divide by zero.
+        n_samples_per_fold = max(len(meta_info) // n_folds, 1)
+        for idx, sample in enumerate(meta_info):
+            file_path, label = sample
+            filename = os.path.basename(file_path)
+            target = self.label_list.index(label)
+            # Clamp to n_folds: when the dataset size is not a multiple of
+            # n_folds the remainder samples join the last fold instead of
+            # receiving a fold id that no dev split could ever select.
+            fold = min(idx // n_samples_per_fold + 1, n_folds)
+
+            in_dev_fold = fold == split
+            if (mode == 'train') != in_dev_fold:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, label, filename))
+                labels.append(target)
+
+        return files, labels
--- a/audio/paddleaudio/datasets/hey_snips.py
+++ b/audio/paddleaudio/datasets/hey_snips.py
@ -0,0 +1,74 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import json
+import os
+from typing import List
+from typing import Tuple
+
+from .dataset import AudioClassificationDataset
+
+__all__ = ['HeySnips']
+
+
+class HeySnips(AudioClassificationDataset):
+    """Audio classification dataset described by a `<mode>.json` manifest under `data_dir`.
+
+    Each manifest entry is expected to provide the keys `id`, `is_hotword`,
+    `duration` and `audio_file_path`. Entries whose duration is not positive are
+    dropped. The label is 0 when `is_hotword` equals 1 and -1 otherwise.
+    """
+
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('key', 'label', 'duration', 'wav'))
+
+    def __init__(self,
+                 data_dir: os.PathLike,
+                 mode: str='train',
+                 feat_type: str='kaldi_fbank',
+                 sample_rate: int=16000,
+                 **kwargs):
+        """
+        Args:
+            data_dir: Directory that holds the json manifests; audio paths in
+                the manifest are joined onto it.
+            mode: Manifest name without the `.json` extension.
+            feat_type: Feature type forwarded to the parent class.
+            sample_rate: Sample rate forwarded to the parent class.
+            **kwargs: Forwarded unchanged to the parent class constructor.
+        """
+        self.data_dir = data_dir
+        wav_files, targets = self._get_data(mode)
+        super(HeySnips, self).__init__(
+            files=wav_files,
+            labels=targets,
+            feat_type=feat_type,
+            sample_rate=sample_rate,
+            **kwargs)
+
+    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
+        """Read the manifest for `mode` and return one `meta_info` per usable entry."""
+        manifest = os.path.join(self.data_dir, '{}.json'.format(mode))
+        with open(manifest, 'r') as f:
+            entries = json.load(f)
+
+        ret = []
+        for entry in entries:
+            if not (entry['duration'] > 0):
+                continue
+            ret.append(
+                self.meta_info(
+                    key=entry['id'],
+                    label=0 if entry['is_hotword'] == 1 else -1,
+                    duration=entry['duration'],
+                    wav=os.path.join(self.data_dir, entry['audio_file_path'])))
+        return ret
+
+    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
+        """Collect files and labels, and record `self.keys` / `self.durations` alongside them."""
+        samples = self._get_meta_info(mode)
+
+        files = [sample.wav for sample in samples]
+        labels = [int(sample.label) for sample in samples]
+        self.keys = [sample.key for sample in samples]
+        self.durations = [float(sample.duration) for sample in samples]
+
+        return files, labels
--- a/audio/paddleaudio/datasets/rirs_noises.py
+++ b/audio/paddleaudio/datasets/rirs_noises.py
@ -0,0 +1,201 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import os
+import random
+from typing import List
+
+from paddle.io import Dataset
+from tqdm import tqdm
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..backends.soundfile_backend import soundfile_save as save_wav
+from ..utils import DATA_HOME
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
+
+__all__ = ['OpenRIRNoise']
+
+
+class OpenRIRNoise(Dataset):
+    """Dataset built from the OpenSLR `rirs_noises.zip` archive.
+
+    Two subsets are available, `rir` and `noise`. On first use the archive is
+    downloaded, recordings longer than `chunk_duration` seconds are cut into
+    chunk files, and one csv manifest per subset is written under `csv_path`.
+    """
+
+    archieves = [
+        {
+            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
+            'md5': 'e6f48e257286e05de56413b4779d8ffb',
+        },
+    ]
+
+    sample_rate = 16000
+    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
+    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
+    wav_path = os.path.join(base_path, 'RIRS_NOISES')
+    csv_path = os.path.join(base_path, 'csv')
+    subsets = ['rir', 'noise']
+
+    def __init__(self,
+                 subset: str='rir',
+                 feat_type: str='raw',
+                 target_dir=None,
+                 random_chunk: bool=True,
+                 chunk_duration: float=3.0,
+                 seed: int=0,
+                 **kwargs):
+        """
+        Args:
+            subset: Which manifest to load; must be one of `subsets`.
+            feat_type: Key into `feat_funcs`; it is validated when an item is fetched.
+            target_dir: When given, the csv manifests live under
+                `<target_dir>/open_rir_noise/csv` instead of the default `csv_path`.
+            random_chunk: Stored on the instance; not otherwise used by this class.
+            chunk_duration: Chunk length in seconds used when the manifests are prepared.
+            seed: Accepted but currently not applied; the shuffle in `_get_data`
+                uses whatever state the global `random` generator is in.
+            **kwargs: Forwarded unchanged to the feature function.
+        """
+
+        assert subset in self.subsets, \
+            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
+
+        self.subset = subset
+        self.feat_type = feat_type
+        self.feat_config = kwargs
+        self.random_chunk = random_chunk
+        self.chunk_duration = chunk_duration
+
+        # Keep the override on the instance: assigning to the class attribute
+        # would silently redirect every other OpenRIRNoise instance as well.
+        if target_dir:
+            self.csv_path = os.path.join(target_dir, "open_rir_noise", "csv")
+        self._data = self._get_data()
+        super(OpenRIRNoise, self).__init__()
+
+    def _get_data(self):
+        """Ensure the audio and the csv manifest exist, then load and shuffle the manifest."""
+        # Download audio files.
+        print(f"rirs noises base path: {self.base_path}")
+        if not os.path.isdir(self.base_path):
+            download_and_decompress(
+                self.archieves, self.base_path, decompress=True)
+        else:
+            print(
+                f"{self.base_path} already exists, we will not download and decompress again"
+            )
+
+        # Data preparation.
+        print(f"prepare the csv to {self.csv_path}")
+        csv_file = os.path.join(self.csv_path, f'{self.subset}.csv')
+        # Check the manifest itself rather than its directory, so an earlier
+        # run that created the directory but not the csv is repaired.
+        if not os.path.isfile(csv_file):
+            os.makedirs(self.csv_path, exist_ok=True)
+            self.prepare_data()
+
+        data = []
+        # Parse with the csv module so quoting applied by csv.writer in
+        # generate_csv (e.g. a comma inside a path) round-trips correctly.
+        with open(csv_file, 'r', newline='') as rf:
+            reader = csv.reader(rf)
+            next(reader, None)  # skip the header row
+            for row in reader:
+                if not row:
+                    continue
+                audio_id, duration, wav = row
+                data.append(self.meta_info(audio_id, float(duration), wav))
+
+        random.shuffle(data)
+        return data
+
+    def _convert_to_record(self, idx: int):
+        """Return the manifest fields of sample `idx` plus its `feat` entry as a dict."""
+        sample = self._data[idx]
+
+        # Copy every namedtuple field (id, duration, wav) into the record.
+        record = {field: getattr(sample, field) for field in sample._fields}
+
+        waveform, sr = load_audio(record['wav'])
+
+        assert self.feat_type in feat_funcs.keys(), \
+            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sr=sr, **self.feat_config) if feat_func else waveform
+
+        record.update({'feat': feat})
+        return record
+
+    @staticmethod
+    def _get_chunks(seg_dur, audio_id, audio_duration):
+        """Build one `<audio_id>_<start>_<end>` id per full chunk of `seg_dur`.
+
+        `seg_dur` and `audio_duration` share one unit; the caller in this class
+        passes seconds. A trailing remainder shorter than `seg_dur` is dropped.
+        """
+        num_chunks = int(audio_duration / seg_dur)
+
+        chunk_lst = [
+            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+            for i in range(num_chunks)
+        ]
+        return chunk_lst
+
+    def _get_audio_info(self, wav_file: str,
+                        split_chunks: bool) -> List[List[str]]:
+        """Describe one audio file as a list of `[id, duration, wav]` manifest rows.
+
+        When `split_chunks` is true and the file is longer than
+        `self.chunk_duration`, every chunk is saved as its own wav file under
+        `self.base_path` and gets its own row; otherwise a single row for the
+        whole file is returned.
+        """
+        waveform, sr = load_audio(wav_file)
+        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
+        audio_duration = waveform.shape[0] / sr
+
+        ret = []
+        if split_chunks and audio_duration > self.chunk_duration:  # Split into pieces of self.chunk_duration seconds.
+            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+                                                audio_duration)
+
+            for idx, chunk in enumerate(uniq_chunks_list):
+                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
+                start_sample = int(float(s) * sr)
+                end_sample = int(float(e) * sr)
+                new_wav_file = os.path.join(self.base_path,
+                                            audio_id + f'_chunk_{idx+1:02}.wav')
+                save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+                # id, duration, new_wav
+                ret.append([chunk, self.chunk_duration, new_wav_file])
+        else:  # Keep whole audio.
+            ret.append([audio_id, audio_duration, wav_file])
+        return ret
+
+    def generate_csv(self,
+                     wav_files: List[str],
+                     output_file: str,
+                     split_chunks: bool=True):
+        """Write the manifest rows of `wav_files` to `output_file` with an `id,duration,wav` header."""
+        print(f'Generating csv: {output_file}')
+        header = ["id", "duration", "wav"]
+
+        infos = list(
+            tqdm(
+                map(self._get_audio_info, wav_files, [split_chunks] * len(
+                    wav_files)),
+                total=len(wav_files)))
+
+        csv_lines = []
+        for info in infos:
+            csv_lines.extend(info)
+
+        # newline='' is what the csv module requires of files it writes to;
+        # without it some platforms emit an extra blank line after every row.
+        with open(output_file, mode="w", newline='') as csv_f:
+            csv_writer = csv.writer(
+                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+            csv_writer.writerow(header)
+            csv_writer.writerows(csv_lines)
+
+    def _read_file_list(self, list_file: str) -> List[str]:
+        """Return the last space-separated token of each line of `list_file`, joined onto `base_path`."""
+        wav_files = []
+        with open(list_file, 'r') as f:
+            for line in f.readlines():
+                wav_files.append(
+                    os.path.join(self.base_path, line.strip().split(' ')[-1]))
+        return wav_files
+
+    def prepare_data(self):
+        """Generate `rir.csv` and `noise.csv` under `csv_path` from the archive's list files."""
+        rir_files = self._read_file_list(
+            os.path.join(self.wav_path, "real_rirs_isotropic_noises",
+                         "rir_list"))
+        noise_files = self._read_file_list(
+            os.path.join(self.wav_path, "pointsource_noises", "noise_list"))
+
+        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
+        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
+
+    def __getitem__(self, idx):
+        return self._convert_to_record(idx)
+
+    def __len__(self):
+        return len(self._data)
--- a/audio/paddleaudio/datasets/tess.py
+++ b/audio/paddleaudio/datasets/tess.py
@ -0,0 +1,126 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['TESS']
+
+
+class TESS(AudioClassificationDataset):
+    """
+    TESS is a set of 200 target words that were spoken in the carrier phrase
+    "Say the word _____" by two actresses (aged 26 and 64 years), and
+    recordings were made of the set portraying each of seven emotions (anger,
+    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
+    There are 2800 stimuli in total.
+
+    Reference:
+        Toronto emotional speech set (TESS)
+        https://doi.org/10.5683/SP2/E8H2MF
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
+            'md5':
+            '1465311b24d1de704c4c63e4ccc470c7',
+        },
+    ]
+    label_list = [
+        'angry',
+        'disgust',
+        'fear',
+        'happy',
+        'neutral',
+        'ps',  # pleasant surprise
+        'sad',
+    ]
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('speaker', 'word', 'emotion'))
+    # Relative to DATA_HOME.
+    audio_path = 'TESS_Toronto_emotional_speech_set'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                Dataset mode. `train` keeps every fold except `split`; any other
+                value keeps only fold `split`.
+            seed (:obj:`int`, `optional`, defaults to 0):
+                The random seed used to shuffle samples before folding.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                The fold (1-based) that is held out as the dev set.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                The feature type to extract from each audio file.
+            **kwargs:
+                Forwarded unchanged to the parent class constructor.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(TESS, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, files) -> List[collections.namedtuple]:
+        """Split each file's base name (extension removed) on `_` into a `meta_info` record."""
+        ret = []
+        for file in files:
+            stem = os.path.splitext(os.path.basename(file))[0]
+            ret.append(self.meta_info(*stem.split('_')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        """Download the archive if it is missing, then select files and labels for `mode`.
+
+        The wav files are shuffled with `seed` and cut into `n_folds`
+        consecutive folds; fold `split` is the dev set and the rest is the
+        train set.
+
+        Returns:
+            Two parallel lists: audio file paths and their label indices.
+        """
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        wav_files = []
+        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+
+        # Directory listing order is arbitrary, so sort before the seeded
+        # shuffle; otherwise the same seed could yield different train/dev
+        # partitions from one run or machine to the next.
+        wav_files.sort()
+
+        # NOTE: this reseeds the process-wide `random` generator.
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            wav_files
+        )  # make sure using the same seed to create train and dev dataset
+        meta_info = self._get_meta_info(wav_files)
+
+        files = []
+        labels = []
+        # At least 1, so a dataset smaller than n_folds cannot divide by zero.
+        n_samples_per_fold = max(len(meta_info) // n_folds, 1)
+        for idx, sample in enumerate(meta_info):
+            _, _, emotion = sample
+            target = self.label_list.index(emotion)
+            # Clamp to n_folds: when the dataset size is not a multiple of
+            # n_folds the remainder samples join the last fold instead of
+            # receiving a fold id that no dev split could ever select.
+            fold = min(idx // n_samples_per_fold + 1, n_folds)
+
+            in_dev_fold = fold == split
+            if (mode == 'train') != in_dev_fold:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+        return files, labels
--- a/audio/paddleaudio/datasets/urban_sound.py
+++ b/audio/paddleaudio/datasets/urban_sound.py
@ -0,0 +1,104 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['UrbanSound8K']
+
+
+class UrbanSound8K(AudioClassificationDataset):
+    """
+    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
+    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
+    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
+    classes are drawn from the urban sound taxonomy.
+
+    Reference:
+        A Dataset and Taxonomy for Urban Sound Research
+        https://dl.acm.org/doi/10.1145/2647868.2655045
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
+            'md5': '9aa69802bbf37fb986f71ec1483a196e',
+        },
+    ]
+    label_list = [
+        "air_conditioner", "car_horn", "children_playing", "dog_bark",
+        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
+        "street_music"
+    ]
+    # Paths below are relative to DATA_HOME.
+    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
+                      'class_id', 'label'))
+    audio_path = os.path.join('UrbanSound8K', 'audio')
+
+    def __init__(self,
+                 mode: str='train',
+                 split: int=1,
+                 feat_type: str='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                Dataset mode. `train` keeps every fold except `split`; any other
+                value keeps only fold `split`.
+            split (:obj:`int`, `optional`, defaults to 1):
+                The fold that is held out as the dev set.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                The feature type to extract from each audio file.
+            **kwargs:
+                Forwarded unchanged to the parent class constructor.
+        """
+        # The docstring has to be the first statement of the function; placed
+        # after the code it is only a discarded string expression.
+        files, labels = self._get_data(mode, split)
+        super(UrbanSound8K, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self):
+        """Parse the metadata csv (header row skipped) into `meta_info` records."""
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                ret.append(self.meta_info(*line.strip().split(',')))
+        return ret
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        """Download the archive if it is missing, then select files and labels for `mode`.
+
+        Returns:
+            Two parallel lists: audio file paths (inside their `fold<k>`
+            directory) and their integer class ids.
+        """
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, _, _, _, _, fold, target, _ = sample
+            in_dev_fold = int(fold) == split
+            # train keeps everything outside the dev fold; any other mode keeps
+            # exactly the dev fold.
+            if (mode == 'train') != in_dev_fold:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
+                                 filename))
+                labels.append(int(target))
+
+        return files, labels
--- a/audio/paddleaudio/datasets/voxceleb.py
+++ b/audio/paddleaudio/datasets/voxceleb.py
@ -0,0 +1,356 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import glob
+import os
+import random
+from multiprocessing import cpu_count
+from typing import List
+
+from paddle.io import Dataset
+from pathos.multiprocessing import Pool
+from tqdm import tqdm
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..utils import DATA_HOME
+from ..utils import decompress
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
+
+__all__ = ['VoxCeleb']
+
+
+class VoxCeleb(Dataset):
+    """VoxCeleb1 speaker dataset, optionally extended with VoxCeleb2 audio.
+
+    On construction the audio archives and the verification trial list are
+    downloaded when missing, csv manifests for the `train`, `dev`, `enroll`
+    and `test` subsets are generated when the csv directory does not exist,
+    and the manifest of the requested subset is loaded.
+
+    Each item is a dict with the manifest fields (`id`, `duration`, `wav`,
+    `start`, `stop`, `spk_id`), the extracted `feat` and, for the `train` and
+    `dev` subsets, the integer speaker `label`.
+    """
+    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
+    archieves_audio_dev = [
+        {
+            'url': source_url + 'vox1_dev_wav_partaa',
+            'md5': 'e395d020928bc15670b570a21695ed96',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partab',
+            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partac',
+            'md5': '017d579a2a96a077f40042ec33e51512',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partad',
+            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
+        },
+    ]
+    archieves_audio_test = [
+        {
+            'url': source_url + 'vox1_test_wav.zip',
+            'md5': '185fdc63c3c739954633d50379a3d102',
+        },
+    ]
+    archieves_meta = [
+        {
+            'url':
+            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
+            'md5':
+            'b73110731c9223c1461fe49cb48dddfc',
+        },
+    ]
+
+    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+    sample_rate = 16000
+    meta_info = collections.namedtuple(
+        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
+    base_path = os.path.join(DATA_HOME, 'vox1')
+    wav_path = os.path.join(base_path, 'wav')
+    meta_path = os.path.join(base_path, 'meta')
+    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
+    csv_path = os.path.join(base_path, 'csv')
+    subsets = ['train', 'dev', 'enroll', 'test']
+
+    def __init__(
+            self,
+            subset: str='train',
+            feat_type: str='raw',
+            random_chunk: bool=True,
+            chunk_duration: float=3.0,  # seconds
+            split_ratio: float=0.9,  # train split ratio
+            seed: int=0,
+            target_dir: str=None,
+            vox2_base_path=None,
+            **kwargs):
+        """VoxCeleb data prepare and get the specific dataset audio info
+
+        Args:
+            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
+            feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.
+            random_chunk (bool, optional): random select a duration from audio. Defaults to True.
+            chunk_duration (float, optional): chunk duration in seconds. It is used for random chunking
+                and for splitting the train/dev audio when the csv manifests are generated. Defaults to 3.0.
+            split_ratio (float, optional): fraction of the non-test audio files that goes to the train
+                manifest; the rest goes to dev. Defaults to 0.9.
+            seed (int, optional): currently unused, the random module is not seeded here. Defaults to 0.
+            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
+            vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
+            **kwargs: extra keyword arguments forwarded to the feature function.
+
+        Raises:
+            AssertionError: if `subset` is not one of `VoxCeleb.subsets`.
+        """
+        assert subset in self.subsets, \
+            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
+
+        self.subset = subset
+        self.spk_id2label = {}
+        self.feat_type = feat_type
+        self.feat_config = kwargs
+        self.random_chunk = random_chunk
+        self.chunk_duration = chunk_duration
+        self.split_ratio = split_ratio
+        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
+        self.vox2_base_path = vox2_base_path
+
+        # if we set the target dir, we will change the vox data info data from base path to target dir
+        # NOTE: these are class attributes, so the change is visible to every
+        # VoxCeleb instance created afterwards.
+        VoxCeleb.csv_path = os.path.join(
+            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
+        VoxCeleb.meta_path = os.path.join(
+            target_dir, "voxceleb",
+            'meta') if target_dir else VoxCeleb.meta_path
+        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
+                                               'veri_test2.txt')
+        self._data = self._get_data()
+        super(VoxCeleb, self).__init__()
+
+    def _get_data(self):
+        """Make sure audio, meta data and csv manifests exist, then load a subset.
+
+        Returns:
+            list: `meta_info` records of the `self.subset` manifest. As a side
+            effect `self.spk_id2label` is filled from `spk_id2label.txt`.
+        """
+        # Local import: only needed for merging the downloaded archive parts.
+        import shutil
+
+        # Download audio files.
+        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
+        # so, we check the vox1/wav dir status
+        print(f"wav base path: {self.wav_path}")
+        if not os.path.isdir(self.wav_path):
+            print("start to download the voxceleb1 dataset")
+            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
+                self.archieves_audio_dev,
+                self.base_path,
+                decompress=False)
+            download_and_decompress(  # download the vox1_test_wav.zip and unzip
+                self.archieves_audio_test,
+                self.base_path,
+                decompress=True)
+
+            # Concatenate all downloaded parts into one zip file. This is done
+            # in Python instead of a shell `cat` so that paths with spaces and
+            # platforms without a POSIX shell work as well.
+            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
+            print(f'Concatenating all parts to: {dev_zipfile}')
+            part_files = sorted(
+                glob.glob(os.path.join(self.base_path, 'vox1_dev_wav_parta*')))
+            with open(dev_zipfile, 'wb') as merged:
+                for part_file in part_files:
+                    with open(part_file, 'rb') as part:
+                        shutil.copyfileobj(part, merged)
+
+            # Extract all audio files of dev and test set.
+            decompress(dev_zipfile, self.base_path)
+
+        # Download meta files.
+        if not os.path.isdir(self.meta_path):
+            print("prepare the meta data")
+            download_and_decompress(
+                self.archieves_meta, self.meta_path, decompress=False)
+
+        # Data preparation.
+        if not os.path.isdir(self.csv_path):
+            os.makedirs(self.csv_path)
+            self.prepare_data()
+
+        data = []
+        subset_csv = os.path.join(self.csv_path, f'{self.subset}.csv')
+        print(f"read the {self.subset} from {subset_csv}")
+        # The manifest is written by csv.writer, so it is parsed with csv.reader
+        # to handle quoted fields (e.g. a wav path containing a comma).
+        with open(subset_csv, 'r', newline='') as rf:
+            reader = csv.reader(rf)
+            next(reader, None)  # skip the header row
+            for row in reader:
+                if not row:  # tolerate blank lines
+                    continue
+                audio_id, duration, wav, start, stop, spk_id = row
+                data.append(
+                    self.meta_info(audio_id,
+                                   float(duration), wav,
+                                   int(start), int(stop), spk_id))
+
+        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
+            for line in f.readlines():
+                spk_id, label = line.strip().split(' ')
+                self.spk_id2label[spk_id] = int(label)
+
+        return data
+
+    def _convert_to_record(self, idx: int):
+        """Build the sample dict for manifest entry `idx`.
+
+        The audio is loaded and cut either to a randomly placed chunk of
+        `self.chunk_duration` seconds (`self.random_chunk`) or to the
+        `start`/`stop` samples stored in the manifest, then passed through the
+        feature function selected by `self.feat_type`.
+
+        Returns:
+            dict: manifest fields plus `feat`, and `label` for train/dev.
+
+        Raises:
+            AssertionError: if `self.feat_type` is not a key of `feat_funcs`.
+        """
+        sample = self._data[idx]
+
+        # One dict entry per namedtuple field.
+        record = dict(sample._asdict())
+
+        waveform, sr = load_audio(record['wav'])
+
+        # random select a chunk audio samples from the audio
+        if self.random_chunk:
+            num_wav_samples = waveform.shape[0]
+            num_chunk_samples = int(self.chunk_duration * sr)
+            # Clamp the upper bound: for audio that is not longer than the
+            # chunk, random.randint would raise ValueError on an empty range.
+            # In that case the chunk starts at 0 and covers the whole audio.
+            max_start = max(0, num_wav_samples - num_chunk_samples - 1)
+            start = random.randint(0, max_start)
+            stop = start + num_chunk_samples
+        else:
+            start = record['start']
+            stop = record['stop']
+
+        waveform = waveform[start:stop]
+
+        assert self.feat_type in feat_funcs.keys(), \
+            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sr=sr, **self.feat_config) if feat_func else waveform
+
+        record['feat'] = feat
+        if self.subset in ['train',
+                           'dev']:  # Labels are available in train and dev.
+            record['label'] = self.spk_id2label[record['spk_id']]
+
+        return record
+
+    @staticmethod
+    def _get_chunks(seg_dur, audio_id, audio_duration):
+        """Name the consecutive full-length chunks of an audio.
+
+        Args:
+            seg_dur: chunk length, in the same unit as `audio_duration`
+                (seconds when called from `_get_audio_info`).
+            audio_id (str): prefix of every chunk name.
+            audio_duration: total length of the audio.
+
+        Returns:
+            list: names of the form `<audio_id>_<start>_<end>`; a trailing
+            remainder shorter than `seg_dur` is dropped.
+        """
+        num_chunks = int(audio_duration / seg_dur)
+
+        chunk_lst = [
+            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+            for i in range(num_chunks)
+        ]
+        return chunk_lst
+
+    def _get_audio_info(self, wav_file: str,
+                        split_chunks: bool) -> List[List[str]]:
+        """Create the manifest rows for a single wav file.
+
+        Args:
+            wav_file (str): '/'-separated path ending in
+                `<spk_id>/<session_id>/<utterance>.wav`.
+            split_chunks (bool): when True one row per `self.chunk_duration`
+                second chunk is produced, otherwise one row for the whole audio.
+
+        Returns:
+            List[List[str]]: rows of `id, duration, wav, start, stop, spk_id`.
+        """
+        waveform, sr = load_audio(wav_file)
+        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
+        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
+        audio_duration = waveform.shape[0] / sr
+
+        ret = []
+        if split_chunks:  # Split into pieces of self.chunk_duration seconds.
+            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+                                                audio_duration)
+
+            for chunk in uniq_chunks_list:
+                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
+                start_sample = int(float(s) * sr)
+                end_sample = int(float(e) * sr)
+                # id, duration, wav, start, stop, spk_id
+                ret.append([
+                    chunk, audio_duration, wav_file, start_sample, end_sample,
+                    spk_id
+                ])
+        else:  # Keep whole audio.
+            ret.append([
+                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
+            ])
+        return ret
+
+    def generate_csv(self,
+                     wav_files: List[str],
+                     output_file: str,
+                     split_chunks: bool=True):
+        """Write the csv manifest of `wav_files` to `output_file`.
+
+        Args:
+            wav_files (List[str]): audio files to describe.
+            output_file (str): path of the csv file to create or overwrite.
+            split_chunks (bool, optional): forwarded to `_get_audio_info`.
+                Defaults to True.
+        """
+        print(f'Generating csv: {output_file}')
+        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
+        # Note: this may raise a C++ exception, but the program still runs
+        # fine, so the exception can be ignored.
+        with Pool(cpu_count()) as p:
+            infos = list(
+                tqdm(
+                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
+                           wav_files),
+                    total=len(wav_files)))
+
+        # Every file contributes a list of rows; flatten them.
+        csv_lines = [row for info in infos for row in info]
+
+        # newline='' lets the csv module control line endings itself.
+        with open(output_file, mode="w", newline='') as csv_f:
+            csv_writer = csv.writer(
+                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+            csv_writer.writerow(header)
+            csv_writer.writerows(csv_lines)
+
+    def prepare_data(self):
+        """Generate `spk_id2label.txt` and the train/dev/enroll/test manifests.
+
+        Enroll and test files come from the verification trial list. All other
+        wav files found under `self.wav_path` and `self.vox2_base_path` whose
+        speaker does not occur in the trial list are shuffled and split into
+        train and dev according to `self.split_ratio`.
+        """
+        # Audio of speakers in veri_test_file should not be included in training set.
+        print("start to prepare the data csv file")
+        enroll_files = set()
+        test_files = set()
+        # get the enroll and test audio file path
+        with open(self.veri_test_file, 'r') as f:
+            for line in f.readlines():
+                _, enrol_file, test_file = line.strip().split(' ')
+                enroll_files.add(os.path.join(self.wav_path, enrol_file))
+                test_files.add(os.path.join(self.wav_path, test_file))
+            enroll_files = sorted(enroll_files)
+            test_files = sorted(test_files)
+
+        # get the enroll and test speakers
+        test_spks = set()
+        for file in (enroll_files + test_files):
+            spk = file.split('/wav/')[1].split('/')[0]
+            test_spks.add(spk)
+
+        # get all the train and dev audios file path
+        audio_files = []
+        speakers = set()
+        print("Getting file list...")
+        for path in [self.wav_path, self.vox2_base_path]:
+            # if vox2 directory is not set and vox2 is not a directory
+            # we will not process this directory
+            if not path or not os.path.exists(path):
+                print(f"{path} is an invalid path, please check again, "
+                      "and we will ignore the vox2 base path")
+                continue
+            for file in glob.glob(
+                    os.path.join(path, "**", "*.wav"), recursive=True):
+                spk = file.split('/wav/')[1].split('/')[0]
+                if spk in test_spks:
+                    continue
+                speakers.add(spk)
+                audio_files.append(file)
+
+        print(
+            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
+        )
+        # encode the train and dev speakers label to spk_id2label.txt
+        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
+            for label, spk_id in enumerate(
+                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
+                f.write(f'{spk_id} {label}\n')
+
+        audio_files = sorted(audio_files)
+        # NOTE: the shuffle uses the global, unseeded random state, so the
+        # train/dev split differs between runs.
+        random.shuffle(audio_files)
+        split_idx = int(self.split_ratio * len(audio_files))
+        # split_ratio to train
+        train_files, dev_files = audio_files[:split_idx], audio_files[
+            split_idx:]
+
+        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
+        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
+
+        self.generate_csv(
+            enroll_files,
+            os.path.join(self.csv_path, 'enroll.csv'),
+            split_chunks=False)
+        self.generate_csv(
+            test_files,
+            os.path.join(self.csv_path, 'test.csv'),
+            split_chunks=False)
+
+    def __getitem__(self, idx):
+        """Return the sample dict for index `idx`."""
+        return self._convert_to_record(idx)
+
+    def __len__(self):
+        """Return the number of entries in the loaded manifest."""
+        return len(self._data)
--- a/audio/paddleaudio/features/__init__.py
+++ b/audio/paddleaudio/features/__init__.py
@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .layers import LogMelSpectrogram
+from .layers import MelSpectrogram
+from .layers import MFCC
+from .layers import Spectrogram
--- a/audio/paddleaudio/features/layers.py
+++ b/audio/paddleaudio/features/layers.py
@ -0,0 +1,328 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+from typing import Optional
+from typing import Union
+
+import paddle
+import paddle.nn as nn
+from paddle import Tensor
+
+from ..functional import compute_fbank_matrix
+from ..functional import create_dct
+from ..functional import power_to_db
+from ..functional.window import get_window
+
+__all__ = [
+    'Spectrogram',
+    'MelSpectrogram',
+    'LogMelSpectrogram',
+    'MFCC',
+]
+
+
+class Spectrogram(nn.Layer):
+    r"""Layer that converts waveforms into spectrograms.
+
+    The spectrogram is the magnitude of the short-time Fourier transform (STFT)
+    of the signal, raised to `power`.
+
+    Args:
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Must be positive. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
+    def __init__(self,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 dtype: str='float32') -> None:
+        super().__init__()
+
+        assert power > 0, 'Power of spectrogram must be > 0.'
+        self.power = power
+
+        # Without an explicit length the window spans the whole FFT frame.
+        win_length = n_fft if win_length is None else win_length
+
+        self.fft_window = get_window(
+            window, win_length, fftbins=True, dtype=dtype)
+        # Freeze every STFT option so that forward only has to pass the signal.
+        stft_options = dict(
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=self.fft_window,
+            center=center,
+            pad_mode=pad_mode)
+        self._stft = partial(paddle.signal.stft, **stft_options)
+        self.register_buffer('fft_window', self.fft_window)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
+        """
+        magnitude = paddle.abs(self._stft(x))
+        return paddle.pow(magnitude, self.power)
+
+
+class MelSpectrogram(nn.Layer):
+    r"""Layer that converts waveforms into mel spectrograms.
+
+    The spectrogram of the signal is multiplied from the left with a Mel filter
+    bank matrix.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. If `None`, `sr // 2` is used for the filter bank. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=50.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 dtype: str='float32') -> None:
+        super().__init__()
+
+        spectrogram_options = dict(
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            power=power,
+            center=center,
+            pad_mode=pad_mode,
+            dtype=dtype)
+        self._spectrogram = Spectrogram(**spectrogram_options)
+
+        # The attributes keep the values exactly as passed in, so `self.f_max`
+        # stays `None` when no maximum frequency was given.
+        self.norm = norm
+        self.htk = htk
+        self.f_max = f_max
+        self.f_min = f_min
+        self.n_mels = n_mels
+
+        # Only the filter bank falls back to half the sample rate.
+        upper_freq = sr // 2 if f_max is None else f_max
+        # The filter bank is created with the same `dtype` as the window.
+        self.fbank_matrix = compute_fbank_matrix(
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=upper_freq,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)
+        self.register_buffer('fbank_matrix', self.fbank_matrix)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
+        return paddle.matmul(self.fbank_matrix, self._spectrogram(x))
+
+
+class LogMelSpectrogram(nn.Layer):
+    r"""Layer that converts waveforms into log-mel spectrograms.
+
+    A mel spectrogram is computed first and then converted to decibels with
+    `power_to_db`.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=50.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=None,
+                 dtype: str='float32') -> None:
+        super().__init__()
+
+        # Everything except the decibel settings belongs to the mel front end.
+        mel_options = dict(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            power=power,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)
+        self._melspectrogram = MelSpectrogram(**mel_options)
+
+        # Decibel conversion settings, applied in forward.
+        self.top_db = top_db
+        self.amin = amin
+        self.ref_value = ref_value
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
+        return power_to_db(
+            self._melspectrogram(x),
+            ref_value=self.ref_value,
+            amin=self.amin,
+            top_db=self.top_db)
+
+
+class MFCC(nn.Layer):
+    r"""Layer that converts waveforms into mel frequency cepstral coefficients (MFCCs).
+
+    A log-mel spectrogram is computed first and then projected along its mel
+    axis with the matrix returned by `create_dct`.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_mfcc (int, optional): Number of coefficients in the output. Must not be larger than `n_mels`. Defaults to 40.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to `paddle.float32`.
+    """
+
+    def __init__(self,
+                 sr: int=22050,
+                 n_mfcc: int=40,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=50.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=None,
+                 dtype: str=paddle.float32) -> None:
+        super().__init__()
+        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
+            n_mfcc, n_mels)
+
+        # All options except `n_mfcc` configure the log-mel front end.
+        log_mel_options = dict(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            power=power,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            ref_value=ref_value,
+            amin=amin,
+            top_db=top_db,
+            dtype=dtype)
+        self._log_melspectrogram = LogMelSpectrogram(**log_mel_options)
+
+        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
+        self.register_buffer('dct_matrix', self.dct_matrix)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
+        """
+        log_mel_feature = self._log_melspectrogram(x)
+        # Put the mel axis last so that the matmul projects along it.
+        frames_first = log_mel_feature.transpose((0, 2, 1))
+        projected = paddle.matmul(frames_first, self.dct_matrix)
+        # Swap back so that the coefficient axis comes before the frame axis.
+        return projected.transpose((0, 2, 1))
--- a/audio/paddleaudio/functional/__init__.py
+++ b/audio/paddleaudio/functional/__init__.py
@ -0,0 +1,20 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .functional import compute_fbank_matrix
+from .functional import create_dct
+from .functional import fft_frequencies
+from .functional import hz_to_mel
+from .functional import mel_frequencies
+from .functional import mel_to_hz
+from .functional import power_to_db
--- a/audio/paddleaudio/functional/functional.py
+++ b/audio/paddleaudio/functional/functional.py
@ -0,0 +1,266 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from librosa(https://github.com/librosa/librosa)
+import math
+from typing import Optional
+from typing import Union
+
+import paddle
+from paddle import Tensor
+
+__all__ = [
+    'hz_to_mel',
+    'mel_to_hz',
+    'mel_frequencies',
+    'fft_frequencies',
+    'compute_fbank_matrix',
+    'power_to_db',
+    'create_dct',
+]
+
+
+def hz_to_mel(freq: Union[Tensor, float],
+              htk: bool=False) -> Union[Tensor, float]:
+    """Map frequencies in Hz onto the mel scale.
+
+    Args:
+        freq (Union[Tensor, float]): Frequency in Hz, either a python number or a tensor with arbitrary shape.
+        htk (bool, optional): Use the htk formula instead of the piecewise linear/log one. Defaults to False.
+
+    Returns:
+        Union[Tensor, float]: Frequency in mels, same kind (number or tensor) as the input.
+    """
+    is_tensor = isinstance(freq, Tensor)
+
+    if htk:
+        log10 = paddle.log10 if is_tensor else math.log10
+        return 2595.0 * log10(1.0 + freq / 700.0)
+
+    # Constants of the piecewise scale: linear below 1000 Hz, logarithmic above.
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = math.log(6.4) / 27.0  # step size for log region
+
+    linear_mels = (freq - f_min) / f_sp
+
+    if not is_tensor:
+        if freq >= min_log_hz:
+            return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
+        return linear_mels
+
+    # The 1e-10 offset keeps the log finite for zero-valued entries.
+    log_mels = min_log_mel + paddle.log(
+        freq / min_log_hz + 1e-10) / logstep
+    # Blend both branches with a 0/1 mask (stand-in for a masked_fill op).
+    in_log_region = (freq > min_log_hz).astype(freq.dtype)
+    return log_mels * in_log_region + linear_mels * (1 - in_log_region)
+
+
+def mel_to_hz(mel: Union[float, Tensor],
+              htk: bool=False) -> Union[float, Tensor]:
+    """Map mel values back to frequencies in Hz.
+
+    Args:
+        mel (Union[float, Tensor]): Mel value(s), either a python number or a tensor with arbitrary shape.
+        htk (bool, optional): Use the htk formula instead of the piecewise linear/log one. Defaults to False.
+
+    Returns:
+        Union[float, Tensor]: Frequencies in Hz, same kind (number or tensor) as the input.
+    """
+    if htk:
+        return 700.0 * (10.0**(mel / 2595.0) - 1.0)
+
+    # Constants of the piecewise scale: linear below 1000 Hz, logarithmic above.
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = math.log(6.4) / 27.0  # step size for log region
+
+    linear_hz = f_min + f_sp * mel
+
+    if not isinstance(mel, Tensor):
+        if mel >= min_log_mel:
+            return min_log_hz * math.exp(logstep * (mel - min_log_mel))
+        return linear_hz
+
+    log_hz = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
+    # Blend both branches with a 0/1 mask (stand-in for a masked_fill op).
+    in_log_region = (mel > min_log_mel).astype(mel.dtype)
+    return log_hz * in_log_region + linear_hz * (1 - in_log_region)
+
+
+def mel_frequencies(n_mels: int=64,
+                    f_min: float=0.0,
+                    f_max: float=11025.0,
+                    htk: bool=False,
+                    dtype: str='float32') -> Tensor:
+    """Return frequencies that are evenly spaced on the mel scale.
+
+    Args:
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
+
+    Returns:
+        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
+    """
+    # Convert the limits to mels, space the points uniformly there, then
+    # map every point back to Hz.
+    lowest_mel = hz_to_mel(f_min, htk=htk)
+    highest_mel = hz_to_mel(f_max, htk=htk)
+    mel_points = paddle.linspace(lowest_mel, highest_mel, n_mels, dtype=dtype)
+    return mel_to_hz(mel_points, htk=htk)
+
+
+def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
+    """Return the center frequency of every non-negative FFT bin.
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): Number of fft bins.
+        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
+
+    Returns:
+        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
+    """
+    num_bins = int(1 + n_fft // 2)
+    highest_hz = float(sr) / 2
+    return paddle.linspace(0, highest_hz, num_bins, dtype=dtype)
+
+
+def compute_fbank_matrix(sr: int,
+                         n_fft: int,
+                         n_mels: int=64,
+                         f_min: float=0.0,
+                         f_max: Optional[float]=None,
+                         htk: bool=False,
+                         norm: Union[str, float]='slaney',
+                         dtype: str='float32') -> Tensor:
+    """Build the triangular mel filterbank matrix.
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): Number of fft bins.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. When None, `sr / 2` is used. Defaults to None.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        norm (Union[str, float], optional): 'slaney' scales each filter by 2 / (band width in Hz); a number is used as
+            the order `p` of a p-norm normalization along the frequency axis; any other value leaves the filters unscaled.
+            Defaults to 'slaney'.
+        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
+
+    Returns:
+        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
+    """
+    upper_hz = float(sr) / 2 if f_max is None else f_max
+
+    # Center frequency of each FFT bin.
+    bin_hz = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
+
+    # n_mels + 2 band edges: every filter spans three consecutive edges.
+    edges_hz = mel_frequencies(
+        n_mels + 2, f_min=f_min, f_max=upper_hz, htk=htk, dtype=dtype)
+
+    edge_gaps = edges_hz[1:] - edges_hz[:-1]
+    # Difference between every band edge and every FFT bin frequency.
+    offsets = edges_hz.unsqueeze(1) - bin_hz.unsqueeze(0)
+
+    fbank = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+    for band in range(n_mels):
+        # Rising and falling slopes of the triangle, evaluated at all bins.
+        rising = -offsets[band] / edge_gaps[band]
+        falling = offsets[band + 2] / edge_gaps[band + 1]
+
+        # The filter is the lower of the two slopes, clipped at zero.
+        triangle = paddle.minimum(rising, falling)
+        fbank[band] = paddle.maximum(paddle.zeros_like(rising), triangle)
+
+    if norm == 'slaney':
+        # Slaney-style mel is scaled to be approx constant energy per channel.
+        band_scale = 2.0 / (edges_hz[2:n_mels + 2] - edges_hz[:n_mels])
+        fbank *= band_scale.unsqueeze(1)
+    elif isinstance(norm, (int, float)):
+        fbank = paddle.nn.functional.normalize(fbank, p=norm, axis=-1)
+
+    return fbank
+
+
+def power_to_db(spect: Tensor,
+                ref_value: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=None) -> Tensor:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
+
+    Values below `amin` are clamped to `amin` before the logarithm is taken,
+    and `10 * log10(max(ref_value, amin))` is subtracted from the result.
+
+    Args:
+        spect (Tensor): STFT power spectrogram.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
+
+    Returns:
+        Tensor: Power spectrogram in db scale.
+
+    Raises:
+        Exception: If `amin` or `ref_value` is not strictly positive, or if `top_db` is negative.
+    """
+    if amin <= 0:
+        raise Exception("amin must be strictly positive")
+
+    if ref_value <= 0:
+        raise Exception("ref_value must be strictly positive")
+
+    ones = paddle.ones_like(spect)
+    clamped = paddle.maximum(ones * amin, spect)
+    reference_db = 10.0 * math.log10(max(ref_value, amin))
+    log_spec = 10.0 * paddle.log10(clamped)
+    log_spec -= reference_db
+
+    if top_db is None:
+        return log_spec
+
+    if top_db < 0:
+        raise Exception("top_db must be non-negative")
+    # Nothing may fall more than top_db below the global maximum.
+    floor_db = ones * (log_spec.max() - top_db)
+    return paddle.maximum(log_spec, floor_db)
+
+
+def create_dct(n_mfcc: int,
+               n_mels: int,
+               norm: Optional[str]='ortho',
+               dtype: str='float32') -> Tensor:
+    """Build a discrete cosine transform (DCT) matrix.
+
+    Args:
+        n_mfcc (int): Number of mel frequency cepstral coefficients.
+        n_mels (int): Number of mel filterbanks.
+        norm (Optional[str], optional): Normalization type, either None or 'ortho'. Defaults to 'ortho'.
+        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
+
+    Returns:
+        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
+    """
+    mel_index = paddle.arange(n_mels, dtype=dtype)
+    coef_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
+    phase = math.pi / float(n_mels) * (mel_index + 0.5)
+    basis = paddle.cos(phase * coef_index)  # size (n_mfcc, n_mels)
+
+    if norm is None:
+        basis *= 2.0
+        return basis.T
+
+    assert norm == "ortho"
+    # Orthonormal scaling: the first row gets an extra 1/sqrt(2) factor.
+    basis[0] *= 1.0 / math.sqrt(2.0)
+    basis *= math.sqrt(2.0 / float(n_mels))
+    return basis.T
--- a/audio/paddleaudio/functional/window.py
+++ b/audio/paddleaudio/functional/window.py
@ -0,0 +1,337 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import paddle
+from paddle import Tensor
+
+__all__ = [
+    'get_window',
+]
+
+
+def _cat(x: List[Tensor], data_type: str) -> Tensor:
+    """Convert every item of `x` to a tensor of `data_type` and concatenate them."""
+    pieces = [paddle.to_tensor(item, data_type) for item in x]
+    return paddle.concat(pieces)
+
+
+def _acosh(x: Union[Tensor, float]) -> Tensor:
+    """Inverse hyperbolic cosine, computed as `log(x + sqrt(x^2 - 1))`.
+
+    A python float is handled with `math`; anything else goes through the
+    paddle ops.
+    """
+    if not isinstance(x, float):
+        return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
+    return math.log(x + math.sqrt(x**2 - 1))
+
+
+def _extend(M: int, sym: bool) -> Tuple[int, bool]:
+    """Extend window by 1 sample if needed for DFT-even symmetry.
+
+    Args:
+        M (int): Requested window length.
+        sym (bool): Whether a symmetric window is requested.
+
+    Returns:
+        Tuple[int, bool]: The length to generate and a flag telling whether
+            the generated window must be truncated by one sample afterwards.
+            A non-symmetric request yields `(M + 1, True)`, a symmetric one
+            `(M, False)`.
+    """
+    if sym:
+        return M, False
+    return M + 1, True
+
+
+def _len_guards(M: int) -> bool:
+    """Validate a window length and report whether it is trivially short.
+
+    Returns True when `M <= 1`; raises ValueError when `M` is negative or
+    not integral.
+    """
+    is_integral = int(M) == M
+    if not is_integral or M < 0:
+        raise ValueError('Window length M must be a non-negative integer')
+
+    is_trivial = M <= 1
+    return is_trivial
+
+
+def _truncate(w: Tensor, needed: bool) -> Tensor:
+    """Drop the last sample of `w` when `needed` is true; otherwise return `w` unchanged."""
+    return w[:-1] if needed else w
+
+
+def _general_gaussian(M: int, p, sig, sym: bool=True,
+                      dtype: str='float64') -> Tensor:
+    """Compute a window with a generalized Gaussian shape.
+    The window is `exp(-0.5 * |n / sig| ** (2 * p))` with `n` centered on zero.
+    This function is consistent with scipy.signal.windows.general_gaussian().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    # Sample positions centered on the middle of the window.
+    half_span = (length - 1.0) / 2.0
+    positions = paddle.arange(0, length, dtype=dtype) - half_span
+    scaled = paddle.abs(positions / sig)
+    window = paddle.exp(-0.5 * scaled**(2 * p))
+
+    return _truncate(window, drop_last)
+
+
+def _general_cosine(M: int, a: float, sym: bool=True,
+                    dtype: str='float64') -> Tensor:
+    """Compute a generic weighted sum of cosine terms window.
+    `a` is a sequence of weights; the k-th weight multiplies `cos(k * x)`
+    with `x` running from -pi to pi over the window.
+    This function is consistent with scipy.signal.windows.general_cosine().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    grid = paddle.linspace(-math.pi, math.pi, length, dtype=dtype)
+    window = paddle.zeros((length, ), dtype=dtype)
+    for order, weight in enumerate(a):
+        window += weight * paddle.cos(order * grid)
+
+    return _truncate(window, drop_last)
+
+
+def _general_hamming(M: int, alpha: float, sym: bool=True,
+                     dtype: str='float64') -> Tensor:
+    """Compute a generalized Hamming window.
+    It is the two-term cosine window with weights `alpha` and `1 - alpha`.
+    This function is consistent with scipy.signal.windows.general_hamming()
+    """
+    cosine_weights = [alpha, 1. - alpha]
+    return _general_cosine(M, cosine_weights, sym, dtype=dtype)
+
+
+def _taylor(M: int,
+            nbar=4,
+            sll=30,
+            norm=True,
+            sym: bool=True,
+            dtype: str='float64') -> Tensor:
+    """Compute a Taylor window.
+    The Taylor window taper function approximates the Dolph-Chebyshev window's
+    constant sidelobe level for a parameterized number of near-in sidelobes.
+
+    Args:
+        M (int): Number of samples in the window.
+        nbar (int, optional): The window is a sum of `nbar - 1` weighted cosine terms. Defaults to 4.
+        sll (float, optional): Sidelobe level in dB, given as a positive number. Defaults to 30.
+        norm (bool, optional): If True, divide the window by its value at position `(M - 1) / 2`. Defaults to True.
+        sym (bool, optional): If False, the window is generated one sample longer and then truncated. Defaults to True.
+        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
+
+    Returns:
+        Tensor: The window represented as a tensor.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    # Original text uses a negative sidelobe level parameter and then negates
+    # it in the calculation of B. To keep consistent with other methods we
+    # assume the sidelobe level parameter to be positive.
+    B = 10**(sll / 20)
+    A = _acosh(B) / math.pi
+    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    # Term indices 1 .. nbar-1.
+    ma = paddle.arange(1, nbar, dtype=dtype)
+
+    # Fm[mi] will hold the weight of the (mi + 1)-th cosine term.
+    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    # Alternating signs: +1 at even positions, -1 at odd positions.
+    signs = paddle.empty_like(ma)
+    signs[::2] = 1
+    signs[1::2] = -1
+    m2 = ma * ma
+    for mi in range(len(ma)):
+        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
+                                                           ))
+        # The denominator is a product over every other index; the first and
+        # last index have only one side to multiply over.
+        if mi == 0:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+        elif mi == len(ma) - 1:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
+        else:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
+                mi] / m2[mi + 1:])
+
+        Fm[mi] = numer / denom
+
+    def W(n):
+        # Evaluate the weighted cosine sum at position(s) n.
+        return 1 + 2 * paddle.matmul(
+            Fm.unsqueeze(0),
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+
+    w = W(paddle.arange(0, M, dtype=dtype))
+
+    # normalize (Note that this is not described in the original text [1])
+    if norm:
+        scale = 1.0 / W((M - 1) / 2)
+        w *= scale
+    # W returns a matrix with a leading axis of size 1; drop it.
+    w = w.squeeze()
+    return _truncate(w, needs_trunc)
+
+
+def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Hamming window.
+    The Hamming window is a raised-cosine taper with non-zero endpoints,
+    obtained here as the generalized Hamming window with alpha = 0.54.
+    """
+    hamming_alpha = 0.54
+    return _general_hamming(M, hamming_alpha, sym, dtype=dtype)
+
+
+def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Hann window.
+    The Hann window is a raised-cosine taper whose ends touch zero,
+    obtained here as the generalized Hamming window with alpha = 0.5.
+    """
+    hann_alpha = 0.5
+    return _general_hamming(M, hann_alpha, sym, dtype=dtype)
+
+
+def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Tukey window.
+    The Tukey window is also known as a tapered cosine window.
+
+    Args:
+        M (int): Number of samples in the window.
+        alpha (float, optional): Controls the width of the cosine-tapered edges. Values `<= 0` give an all-ones
+            window and values `>= 1` give a Hann window. Defaults to 0.5.
+        sym (bool, optional): If False, the window is generated one sample longer and then truncated. Defaults to True.
+        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
+
+    Returns:
+        Tensor: The window represented as a tensor.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+
+    if alpha <= 0:
+        return paddle.ones((M, ), dtype=dtype)
+    elif alpha >= 1.0:
+        # Delegate to the module's Hann helper and keep the requested dtype.
+        return _hann(M, sym=sym, dtype=dtype)
+
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(0, M, dtype=dtype)
+    # Number of samples in each tapered edge (minus one).
+    width = int(alpha * (M - 1) / 2.0)
+    # Split the sample positions into rising edge, flat top and falling edge.
+    n1 = n[0:width + 1]
+    n2 = n[width + 1:M - width - 1]
+    n3 = n[M - width - 1:]
+
+    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
+    w2 = paddle.ones(n2.shape, dtype=dtype)
+    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
+                                          (M - 1))))
+    w = paddle.concat([w1, w2, w3])
+
+    return _truncate(w, needs_trunc)
+
+
+def _kaiser(M: int, beta: float, sym: bool=True,
+            dtype: str='float64') -> Tensor:
+    """Compute a Kaiser window.
+    The Kaiser window is a taper formed by using a Bessel function.
+    Not implemented: calling this function always raises NotImplementedError.
+    """
+    raise NotImplementedError()
+
+
+def _gaussian(M: int, std: float, sym: bool=True,
+              dtype: str='float64') -> Tensor:
+    """Compute a Gaussian window.
+    The window is `exp(-n^2 / (2 * std^2))` with `n` centered on zero, so its
+    shape is set by the standard deviation (std).
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    # Sample positions centered on the middle of the window.
+    half_span = (length - 1.0) / 2.0
+    positions = paddle.arange(0, length, dtype=dtype) - half_span
+    two_variance = 2 * std * std
+    window = paddle.exp(-positions**2 / two_variance)
+
+    return _truncate(window, drop_last)
+
+
+def _exponential(M: int,
+                 center=None,
+                 tau=1.,
+                 sym: bool=True,
+                 dtype: str='float64') -> Tensor:
+    """Compute an exponential (or Poisson) window.
+    The window is `exp(-|n - center| / tau)`. `center` defaults to the middle
+    of the window and must be left as None when `sym` is True.
+    """
+    if sym and center is not None:
+        raise ValueError("If sym==True, center must be None.")
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    peak = (length - 1) / 2 if center is None else center
+
+    positions = paddle.arange(0, length, dtype=dtype)
+    distance = paddle.abs(positions - peak)
+    window = paddle.exp(-distance / tau)
+
+    return _truncate(window, drop_last)
+
+
+def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a triangular window.
+    The rising half is built first and then mirrored; for an odd length the
+    peak sample is not repeated.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    steps = paddle.arange(1, (length + 1) // 2 + 1, dtype=dtype)
+    if length % 2 == 0:
+        rising = (2 * steps - 1.0) / length
+        falling = rising[::-1]
+    else:
+        rising = 2 * steps / (length + 1.0)
+        falling = rising[-2::-1]
+    window = paddle.concat([rising, falling])
+
+    return _truncate(window, drop_last)
+
+
+def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Bohman window.
+    The Bohman window is the autocorrelation of a cosine window.
+    The interior samples are computed and both end samples are set to zero.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    # Absolute positions in (-1, 1), end points excluded.
+    grid = paddle.linspace(-1, 1, length, dtype=dtype)
+    fac = paddle.abs(grid[1:-1])
+    interior = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
+        math.pi * fac)
+    window = _cat([0, interior, 0], dtype)
+
+    return _truncate(window, drop_last)
+
+
+def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Blackman window.
+    The Blackman window is a taper formed by the first three terms of a
+    summation of cosines, with weights 0.42, 0.50 and 0.08. It was designed
+    to have close to the minimal leakage possible and is only slightly worse
+    than a Kaiser window.
+    """
+    blackman_weights = [0.42, 0.50, 0.08]
+    return _general_cosine(M, blackman_weights, sym, dtype=dtype)
+
+
+def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a window with a simple cosine shape.
+    The window is `sin(pi / M * (n + 0.5))` for sample index `n`.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    length, drop_last = _extend(M, sym)
+
+    half_shifted = paddle.arange(0, length, dtype=dtype) + .5
+    window = paddle.sin(math.pi / length * half_shifted)
+
+    return _truncate(window, drop_last)
+
+
+def get_window(window: Union[str, Tuple[str, float]],
+               win_length: int,
+               fftbins: bool=True,
+               dtype: str='float64') -> Tensor:
+    """Return a window of a given length and type.
+
+    Args:
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+            Pass a tuple `(name, param, ...)` to forward extra positional parameters to the window function.
+        win_length (int): Number of samples.
+        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
+        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
+
+    Returns:
+        Tensor: The window represented as a tensor.
+
+    Raises:
+        ValueError: If `window` is neither a string nor a tuple, if the window name is unknown, or if
+            'gaussian' / 'exponential' is requested as a plain string without parameters.
+    """
+    sym = not fftbins
+
+    args = ()
+    if isinstance(window, tuple):
+        winstr = window[0]
+        if len(window) > 1:
+            args = window[1:]
+    elif isinstance(window, str):
+        if window in ['gaussian', 'exponential']:
+            raise ValueError("The '" + window + "' window needs one or "
+                             "more parameters -- pass a tuple.")
+        else:
+            winstr = window
+    else:
+        raise ValueError("%s as window type is not supported." %
+                         str(type(window)))
+
+    # Explicit name -> function table: never evaluate caller-supplied text,
+    # and make an unknown name reliably end up as a ValueError.
+    window_functions = {
+        'general_gaussian': _general_gaussian,
+        'general_cosine': _general_cosine,
+        'general_hamming': _general_hamming,
+        'taylor': _taylor,
+        'hamming': _hamming,
+        'hann': _hann,
+        'tukey': _tukey,
+        'kaiser': _kaiser,
+        'gaussian': _gaussian,
+        'exponential': _exponential,
+        'triang': _triang,
+        'bohman': _bohman,
+        'blackman': _blackman,
+        'cosine': _cosine,
+    }
+    winfunc = window_functions.get(winstr) if isinstance(winstr,
+                                                         str) else None
+    if winfunc is None:
+        raise ValueError("Unknown window type.")
+
+    params = (win_length, ) + args
+    kwargs = {'sym': sym}
+    return winfunc(*params, dtype=dtype, **kwargs)
--- a/audio/paddleaudio/io/init.py
+++ b/audio/paddleaudio/io/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/audio/paddleaudio/metric/init.py
+++ b/audio/paddleaudio/metric/init.py
@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .eer import compute_eer
+from .eer import compute_minDCF
--- a/audio/paddleaudio/metric/eer.py
+++ b/audio/paddleaudio/metric/eer.py
@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
+    """Compute the equal error rate (EER) and the score threshold at which it occurs.
+
+    Args:
+        labels (np.ndarray): the trial labels, one-dimensional with shape [N], where N is the number of samples
+        scores (np.ndarray): the trial scores, one-dimensional with shape [N], where N is the number of samples
+
+    Returns:
+        List[float]: eer and the specific threshold, returned as the pair `(eer, eer_threshold)`
+    """
+    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+    fnr = 1 - tpr
+    # Operating point where false-negative and false-positive rates are closest.
+    gap = np.absolute((fnr - fpr))
+    eer_index = np.nanargmin(gap)
+    return fpr[eer_index], threshold[eer_index]
+
+
+def compute_minDCF(positive_scores,
+                   negative_scores,
+                   c_miss=1.0,
+                   c_fa=1.0,
+                   p_target=0.01):
+    """
+    This is modified from SpeechBrain
+    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
+    Computes the minDCF metric normally used to evaluate speaker verification
+    systems. The min_DCF is the minimum of the following C_det function computed
+    within the defined threshold range:
+
+    C_det =  c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)
+
+    where p_miss is the missing probability and p_fa is the probability of having
+    a false alarm.
+
+    Args:
+        positive_scores (Paddle.Tensor): The scores from entries of the same class.
+        negative_scores (Paddle.Tensor): The scores from entries of different classes.
+        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
+        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
+        p_target (float, optional): Prior probability of having a target (default 0.01).
+
+    Returns:
+        Tuple[float, float]: min dcf and the threshold at which it is reached
+    """
+    # Computing candidate thresholds
+    # Inputs with more than one axis are squeezed first
+    # (assumes the result is one-dimensional -- TODO confirm against callers).
+    if len(positive_scores.shape) > 1:
+        positive_scores = positive_scores.squeeze()
+
+    if len(negative_scores.shape) > 1:
+        negative_scores = negative_scores.squeeze()
+
+    # Every distinct score is a candidate threshold.
+    thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores]))
+    thresholds = paddle.unique(thresholds)
+
+    # Adding intermediate thresholds
+    # (the midpoints between consecutive distinct scores)
+    interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2
+    thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds]))
+
+    # Computing False Rejection Rate (miss detection)
+    # The scores are repeated once per threshold so that, after the transpose,
+    # each score is compared against each threshold by broadcasting.
+    positive_scores = paddle.concat(
+        len(thresholds) * [positive_scores.unsqueeze(0)])
+    pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds
+    p_miss = (pos_scores_threshold.sum(0)
+              ).astype("float32") / positive_scores.shape[1]
+    # Drop the large temporaries before the next ones are allocated.
+    del positive_scores
+    del pos_scores_threshold
+
+    # Computing False Acceptance Rate (false alarm)
+    negative_scores = paddle.concat(
+        len(thresholds) * [negative_scores.unsqueeze(0)])
+    neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds
+    p_fa = (neg_scores_threshold.sum(0)
+            ).astype("float32") / negative_scores.shape[1]
+    del negative_scores
+    del neg_scores_threshold
+
+    # Detection cost per threshold; report the minimum and where it occurs.
+    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+    c_min = paddle.min(c_det, axis=0)
+    min_index = paddle.argmin(c_det, axis=0)
+    return float(c_min), float(thresholds[min_index])
--- a/audio/paddleaudio/sox_effects/init.py
+++ b/audio/paddleaudio/sox_effects/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/audio/paddleaudio/utils/init.py
+++ b/audio/paddleaudio/utils/init.py
@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .download import decompress
+from .download import download_and_decompress
+from .download import load_state_dict_from_url
+from .env import DATA_HOME
+from .env import MODEL_HOME
+from .env import PPAUDIO_HOME
+from .env import USER_HOME
+from .error import ParameterError
+from .log import Logger
+from .log import logger
+from .time import seconds_to_hms
+from .time import Timer
+from .numeric import depth_convert
+from .numeric import pcm16to32
--- a/audio/paddleaudio/utils/download.py
+++ b/audio/paddleaudio/utils/download.py
@ -0,0 +1,64 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+
+from .log import logger
+
+# Point the `logger` attribute of paddle.utils.download at this package's logger.
+download.logger = logger
+
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
+
+
+def decompress(file: str):
+    """
+    Extract all files from the compressed file at path `file`.
+    The path must point to an existing regular file.
+    """
+    is_regular_file = os.path.isfile(file)
+    assert is_regular_file, "File: {} not exists.".format(file)
+    download._decompress(file)
+
+
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            decompress: bool=True):
+    """
+    Download archives and decompress them to a specific path.
+
+    Args:
+        archives (List[Dict[str, str]]): One dictionary per archive; each must provide the keys "url" and "md5".
+        path (str): Target directory. It is created when it does not exist yet.
+        decompress (bool, optional): Forwarded to `download.get_path_from_url` as `decompress`. Defaults to True.
+    """
+    # exist_ok avoids failing when the directory appears between a check and
+    # the creation (e.g. several processes downloading at once).
+    os.makedirs(path, exist_ok=True)
+
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(
+                list(archive.keys()))
+        download.get_path_from_url(
+            archive['url'], path, archive['md5'], decompress=decompress)
+
+
+def load_state_dict_from_url(url: str, path: str, md5: str=None):
+    """
+    Download the file at `url` into directory `path` and load it as a state dict.
+    The directory is created when missing; the file is loaded from `path`
+    joined with the base name of `url`.
+    """
+    directory_exists = os.path.isdir(path)
+    if not directory_exists:
+        os.makedirs(path)
+
+    download.get_path_from_url(url, path, md5)
+    local_file = os.path.join(path, os.path.basename(url))
+    return load_state_dict(local_file)
--- a/audio/paddleaudio/utils/env.py
+++ b/audio/paddleaudio/utils/env.py
@ -0,0 +1,60 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
+├                            default value through the PPAUDIO_HOME environment variable.
+├─ MODEL_HOME    -->  Store model files.
+└─ DATA_HOME     -->  Store automatically downloaded datasets.
+'''
+import os
+
+# Names exported by `from <this module> import *`; each is a module-level
+# constant assigned at the bottom of this module.
+__all__ = [
+    'USER_HOME',
+    'PPAUDIO_HOME',
+    'MODEL_HOME',
+    'DATA_HOME',
+]
+
+
+def _get_user_home():
+    """Return the current user's home directory (expansion of '~')."""
+    user_home = os.path.expanduser('~')
+    return user_home
+
+
+def _get_ppaudio_home():
+    """Resolve the PaddleAudio root directory.
+
+    The PPAUDIO_HOME environment variable wins when it is set; otherwise the
+    root is `.paddleaudio` under the user's home directory.
+
+    Returns:
+        str: Path of the root directory (it may not exist yet).
+
+    Raises:
+        RuntimeError: If PPAUDIO_HOME points to something that exists but is
+            not a directory.
+    """
+    configured = os.environ.get('PPAUDIO_HOME')
+    if configured is None:
+        return os.path.join(_get_user_home(), '.paddleaudio')
+
+    # A not-yet-existing path is accepted as is; only an existing
+    # non-directory is rejected.
+    if os.path.exists(configured) and not os.path.isdir(configured):
+        raise RuntimeError(
+            'The environment variable PPAUDIO_HOME {} is not a directory.'.
+            format(configured))
+    return configured
+
+
+def _get_sub_home(directory):
+    """Return `<PaddleAudio root>/<directory>`, creating it when missing.
+
+    Args:
+        directory (str): Name of the sub-directory under the root.
+
+    Returns:
+        str: Path of the (now existing) sub-directory.
+    """
+    home = os.path.join(_get_ppaudio_home(), directory)
+    # exist_ok=True instead of an exists() pre-check: this runs at import
+    # time, and several processes importing at once could otherwise race
+    # between the check and the creation and fail with FileExistsError.
+    os.makedirs(home, exist_ok=True)
+    return home
+
+
+# Resolved once, when this module is imported.
+USER_HOME = _get_user_home()
+PPAUDIO_HOME = _get_ppaudio_home()
+# _get_sub_home creates the directory when it does not exist, so importing
+# this module has a filesystem side effect.
+MODEL_HOME = _get_sub_home('models')
+DATA_HOME = _get_sub_home('datasets')
--- a/audio/paddleaudio/utils/error.py
+++ b/audio/paddleaudio/utils/error.py
@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['ParameterError']
+
+
+class ParameterError(Exception):
+    """Raised when a parameter fails validation.
+
+    Adds nothing to `Exception`; it exists so callers can catch parameter
+    problems specifically.
+    """
--- a/audio/paddleaudio/utils/log.py
+++ b/audio/paddleaudio/utils/log.py
@ -0,0 +1,139 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import functools
+import logging
+import threading
+import time
+
+import colorlog
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    'Logger',
+    'logger',
+]
+
+# Level table used by `Logger`: level name -> numeric level and the color
+# handed to colorlog for that level. TRAIN (21) and EVAL (22) are extra
+# levels placed between INFO (20) and WARNING (30).
+log_config = {
+    'DEBUG': {
+        'level': 10,
+        'color': 'purple'
+    },
+    'INFO': {
+        'level': 20,
+        'color': 'green'
+    },
+    'TRAIN': {
+        'level': 21,
+        'color': 'cyan'
+    },
+    'EVAL': {
+        'level': 22,
+        'color': 'blue'
+    },
+    'WARNING': {
+        'level': 30,
+        'color': 'yellow'
+    },
+    'ERROR': {
+        'level': 40,
+        'color': 'red'
+    },
+    'CRITICAL': {
+        'level': 50,
+        'color': 'bold_red'
+    }
+}
+
+
+class Logger(object):
+    '''
+    Default logger in PaddleAudio.
+
+    For every entry of `log_config` the instance gets two callables, named
+    after the level in upper and lower case (e.g. `logger.INFO(msg)` and
+    `logger.info(msg)`), which log `msg` at that level.
+
+    Args:
+        name(str) : Logger name, default is 'PaddleAudio'
+    '''
+
+    def __init__(self, name: str=None):
+        name = 'PaddleAudio' if not name else name
+        self.logger = logging.getLogger(name)
+
+        for key, conf in log_config.items():
+            # Register the level name and expose per-level shortcuts.
+            logging.addLevelName(conf['level'], key)
+            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
+            self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                           conf['level'])
+
+        self.format = colorlog.ColoredFormatter(
+            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
+            log_colors={key: conf['color']
+                        for key, conf in log_config.items()})
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logLevel = 'DEBUG'
+        self.logger.setLevel(logging.DEBUG)
+        # Keep records out of the root logger's handlers.
+        self.logger.propagate = False
+        self._is_enable = True
+
+    def disable(self):
+        '''Turn logging through this object off.'''
+        self._is_enable = False
+
+    def enable(self):
+        '''Turn logging through this object back on.'''
+        self._is_enable = True
+
+    @property
+    def is_enable(self) -> bool:
+        '''Whether calls to this logger currently emit records.'''
+        return self._is_enable
+
+    def __call__(self, log_level: int, msg: str):
+        '''
+        Log `msg` at `log_level` unless the logger is disabled.
+        Args:
+            log_level(int): Numeric logging level (see `log_config`).
+            msg(str): Message to be logged.
+        '''
+        if not self.is_enable:
+            return
+
+        self.logger.log(log_level, msg)
+
+    @contextlib.contextmanager
+    def use_terminator(self, terminator: str):
+        '''
+        Temporarily replace the stream handler's line terminator.
+        Args:
+            terminator(str): Terminator used inside the `with` block.
+        '''
+        old_terminator = self.handler.terminator
+        self.handler.terminator = terminator
+        try:
+            yield
+        finally:
+            # Restore even when the body raises, so later records are not
+            # written with the temporary terminator.
+            self.handler.terminator = old_terminator
+
+    @contextlib.contextmanager
+    def processing(self, msg: str, interval: float=0.1):
+        '''
+        Continuously print a progress bar with rotating special effects.
+        Args:
+            msg(str): Message to be printed.
+            interval(float): Rotation interval. Default to 0.1.
+        '''
+        end = False
+
+        def _printer():
+            index = 0
+            flags = ['\\', '|', '/', '-']
+            while not end:
+                flag = flags[index % len(flags)]
+                with self.use_terminator('\r'):
+                    self.info('{}: {}'.format(msg, flag))
+                time.sleep(interval)
+                index += 1
+
+        t = threading.Thread(target=_printer)
+        t.start()
+        try:
+            yield
+        finally:
+            # Stop the spinner even when the body raises; otherwise the
+            # thread would keep printing forever. Joining ensures nothing
+            # is printed after the block has been left.
+            end = True
+            t.join()
+
+
+# Module-level logger instance, created with the default name 'PaddleAudio'.
+logger = Logger()
--- a/audio/paddleaudio/utils/numeric.py
+++ b/audio/paddleaudio/utils/numeric.py
@ -0,0 +1,107 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Union
+
+import numpy as np
+
+# Names exported by `from <this module> import *`; `_safe_cast` stays private.
+__all__ = ["pcm16to32", "depth_convert"]
+
+
+def pcm16to32(audio: np.ndarray) -> np.ndarray:
+    """Scale an int16 PCM waveform to float32.
+
+    Samples are divided by 2**15. Input of any other dtype is returned
+    untouched.
+
+    Args:
+        audio (np.ndarray): Waveform with dtype of int16.
+
+    Returns:
+        np.ndarray: Waveform with dtype of float32.
+    """
+    if audio.dtype != np.int16:
+        return audio
+
+    # 2**(16 - 1): magnitude of the most negative int16 sample.
+    full_scale = 2**(np.iinfo(np.int16).bits - 1)
+    return audio.astype("float32") / full_scale
+
+
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Data type casting in a safe way, i.e., prevent overflow or underflow.
+
+    Values are clipped to the representable range of the target `dtype`
+    before being cast.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+    target = np.dtype(dtype)
+    # The limits must come from the *target* dtype: np.finfo only accepts
+    # floating dtypes and np.iinfo only integer ones, so choosing by the
+    # source dtype fails for float -> int and int -> float casts.
+    if np.issubdtype(target, np.floating):
+        limits = np.finfo(target)
+    else:
+        limits = np.iinfo(target)
+    return np.clip(y, limits.min, limits.max).astype(target)
+
+
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert audio array to target dtype safely.
+
+    This function converts an audio waveform to a target dtype, with the
+    additional steps of preventing overflow/underflow and preserving the
+    audio range: integer <-> float conversions are rescaled by the maximum
+    value of the integer type, and int8 <-> int16 conversions are rescaled by
+    the ratio of the two maxima.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform. Must compare equal
+            to one of 'int16', 'int8', 'float32', 'float64'.
+
+    Returns:
+        np.ndarray: `y` after safe casting (`y` itself when it already has
+            the target dtype).
+
+    Raises:
+        ParameterError: If `y.dtype` or `dtype` is not supported.
+    """
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    # Small margin subtracted from the int8 -> int16 scale factor. This
+    # module never defined EPS, so that path raised NameError.
+    # NOTE(review): 1e-8 is an assumed value - confirm against the module
+    # this function was moved from.
+    EPS = 1e-8
+
+    if y.dtype not in SUPPORT_DTYPE:
+        # Imported here because this module does not import ParameterError
+        # at file level.
+        from .error import ParameterError
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        from .error import ParameterError
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    # float <-> float: only clipping to the target range is needed.
+    if dtype == 'float64' and y.dtype == 'float32':
+        return _safe_cast(y, dtype)
+    if dtype == 'float32' and y.dtype == 'float64':
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            # float -> int: scale by the integer maximum, then clip.
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+        else:
+            if dtype == 'int16' and y.dtype == 'int8':
+                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+                y = y.astype('float32') * factor
+                y = y.astype('int16')
+
+            else:  # dtype == 'int8' and y.dtype=='int16':
+                y = y.astype('int32') * np.iinfo('int8').max / \
+                    np.iinfo('int16').max
+                y = y.astype('int8')
+
+    # int -> float: float inputs were already returned above, so `y` is an
+    # integer array here.
+    if dtype in ['float32', 'float64']:
+        org_dtype = y.dtype
+        y = y.astype(dtype) / np.iinfo(org_dtype).max
+    return y
--- a/audio/paddleaudio/utils/time.py
+++ b/audio/paddleaudio/utils/time.py
@ -0,0 +1,72 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import time
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    'Timer',
+    'seconds_to_hms',
+]
+
+
+class Timer(object):
+    '''Calculate running speed and estimated time of arrival (ETA).
+
+    Call `start()` before reading `timing` or `eta`; both rely on the
+    timestamps that `start()` records.
+
+    Args:
+        total_step(int): Number of steps the timed job consists of.
+    '''
+
+    def __init__(self, total_step: int):
+        self.total_step = total_step
+        self.last_start_step = 0
+        self.current_step = 0
+        self._is_running = True
+
+    def start(self):
+        '''Record the start time of the whole run and of the first interval.'''
+        self.last_time = time.time()
+        self.start_time = time.time()
+
+    def stop(self):
+        '''Mark the timer as finished and record the end time.'''
+        self._is_running = False
+        self.end_time = time.time()
+
+    def count(self) -> int:
+        '''Advance by one step (never beyond `total_step`); return the step.'''
+        if self.current_step < self.total_step:
+            self.current_step += 1
+        return self.current_step
+
+    @property
+    def timing(self) -> float:
+        '''Steps per second since the previous read of this property.
+
+        Reading it starts a new measuring interval. Returns 0.0 when no
+        measurable time has passed.
+        '''
+        now = time.time()
+        run_steps = self.current_step - self.last_start_step
+        time_used = now - self.last_time
+        self.last_start_step = self.current_step
+        self.last_time = now
+        # Two reads closer together than the clock resolution would
+        # otherwise divide by zero.
+        if time_used <= 0:
+            return 0.0
+        return run_steps / time_used
+
+    @property
+    def is_running(self) -> bool:
+        '''False once `stop()` has been called.'''
+        return self._is_running
+
+    @property
+    def eta(self) -> str:
+        '''Estimated remaining time formatted as hh:mm:ss.
+
+        Returns '00:00:00' after `stop()`, and '--:--:--' while no step has
+        been counted yet (no speed estimate is available then).
+        '''
+        if not self.is_running:
+            return '00:00:00'
+        if self.current_step <= 0:
+            # Avoids a ZeroDivisionError before the first count().
+            return '--:--:--'
+        elapsed = time.time() - self.start_time
+        # Average time per finished step times the steps still to do.
+        # Scaling by total_step / current_step alone would give the
+        # projected *total* duration rather than the time that is left.
+        remaining_steps = self.total_step - self.current_step
+        remaining_time = elapsed * remaining_steps / self.current_step
+        return seconds_to_hms(remaining_time)
+
+
+def seconds_to_hms(seconds: int) -> str:
+    '''Format a duration given in seconds as hh:mm:ss.
+
+    Each field is left-padded with zeros to at least two characters;
+    fractional seconds are truncated.
+    '''
+    hours = math.floor(seconds / 3600)
+    after_hours = seconds - hours * 3600
+    minutes = math.floor(after_hours / 60)
+    secs = int(seconds - hours * 3600 - minutes * 60)
+    return f'{hours:0>2}:{minutes:0>2}:{secs:0>2}'
--- a/audio/setup.py
+++ b/audio/setup.py
@ -0,0 +1,99 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import os
+
+import setuptools
+from setuptools.command.install import install
+from setuptools.command.test import test
+
+# Set the version here. It is passed to setuptools.setup() and temporarily
+# appended to paddleaudio/__init__.py as `__version__` while setup runs.
+VERSION = '1.0.2'
+
+
+# Inspired by the example at https://pytest.org/latest/goodpractises.html
+class TestCommand(test):
+    """`setup.py test` command: run the benchmarks, then the nose tests."""
+
+    def finalize_options(self):
+        super().finalize_options()
+        self.test_args = []
+        self.test_suite = True
+
+    def run(self):
+        # Benchmarks run first, then the base class's run().
+        self.run_benchmark()
+        super().run()
+
+    def run_tests(self):
+        # Run nose ensuring that argv simulates running nosetests directly
+        import nose
+        nose_argv = ['nosetests', '-w', 'tests']
+        nose.run_exit(argv=nose_argv)
+
+    def run_benchmark(self):
+        """Invoke pytest once for every file matching tests/benchmark/*py."""
+        benchmark_files = glob.glob('tests/benchmark/*py')
+        for path in benchmark_files:
+            os.system(f'pytest {path}')
+
+
+class InstallCommand(install):
+    """`setup.py install` command; adds nothing to the base command."""
+
+    def run(self):
+        """Run the base `install` command unchanged."""
+        install.run(self)
+
+
+def write_version_py(filename='paddleaudio/__init__.py'):
+    """Append a `__version__ = '<VERSION>'` line to `filename`.
+
+    The assignment is always written on a line of its own.
+    `remove_version_py` deletes every line containing `__version__`, so if
+    the file did not end with a newline and the text were glued onto the
+    last line of code, that code would be deleted together with it.
+
+    Args:
+        filename (str): File to append to. It is created when missing.
+    """
+    # "a+" creates the file if needed, allows reading the current content,
+    # and still appends every write at the end of the file.
+    with open(filename, "a+") as f:
+        f.seek(0)
+        content = f.read()
+        prefix = '' if not content or content.endswith('\n') else '\n'
+        f.write(f"{prefix}__version__ = '{VERSION}'\n")
+
+
+def remove_version_py(filename='paddleaudio/__init__.py'):
+    """Rewrite `filename` without any line that contains `__version__`.
+
+    Args:
+        filename (str): File to clean up in place.
+    """
+    with open(filename, "r") as src:
+        kept = [row for row in src.readlines() if "__version__" not in row]
+    with open(filename, "w") as dst:
+        dst.writelines(kept)
+
+
+# Stamp the version into paddleaudio/__init__.py for the duration of the
+# build. Any stale stamp from an earlier run is removed first.
+remove_version_py()
+write_version_py()
+
+try:
+    setuptools.setup(
+        name="paddleaudio",
+        version=VERSION,
+        author="",
+        author_email="",
+        description="PaddleAudio, in development",
+        long_description="",
+        long_description_content_type="text/markdown",
+        url="",
+        packages=setuptools.find_packages(include=['paddleaudio*']),
+        classifiers=[
+            "Programming Language :: Python :: 3",
+            # The source files carry Apache-2.0 license headers.
+            "License :: OSI Approved :: Apache Software License",
+            "Operating System :: OS Independent",
+        ],
+        python_requires='>=3.6',
+        install_requires=[
+            'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
+            'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
+        ],
+        extras_require={
+            'test': [
+                'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
+                'torchaudio==0.10.2', 'pytest-benchmark'
+            ],
+        },
+        cmdclass={
+            'install': InstallCommand,
+            'test': TestCommand,
+        }, )
+finally:
+    # Restore __init__.py even when setup() fails or exits early.
+    remove_version_py()
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
@ -32,9 +32,9 @@ __all__ = [
    'to_mono',
    'normalize',
    'save',
-    'soudfile_save',
+    'soundfile_save',
    'load',
-    'load_old',
+    'soundfile_load',
    'info',
    'to_mono'
 ]
@ -659,4 +659,4 @@ def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
        sinfo.channels,
        bits_per_sample=_get_bit_depth(sinfo.subtype),
        encoding=_get_encoding(sinfo.format, sinfo.subtype),
-    )
+    )