clean paddlespeech/audio

3 years ago · 13ee17cdcb
parent d94996f222
commit 13ee17cdcb
76 changed files with 11 additions and 15141 deletions
--- a/paddlespeech/audio/CMakeLists.txt
+++ b/paddlespeech/audio/CMakeLists.txt
@@ -1,3 +0,0 @@
-
-add_subdirectory(third_party)
-add_subdirectory(src)
--- a/paddlespeech/audio/README.md
+++ b/paddlespeech/audio/README.md
@@ -1,31 +0,0 @@
-# PaddleAudio
-
-## Reference
-`csrc` code is reference of `torchaudio`.
-
-```text
-BSD 2-Clause License
-
-Copyright (c) [year], [fullname]
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-```
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
@@ -11,17 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from . import _extension
-from . import compliance
-from . import datasets
-from . import features
-from . import functional
-from . import io
-from . import metric
-from . import sox_effects
 from . import streamdata
 from . import text
 from . import transform
-from .backends import load
-from .backends import save
--- a/paddlespeech/audio/_extension.py
+++ b/paddlespeech/audio/_extension.py
@@ -1,164 +0,0 @@
-import os
-import warnings
-from pathlib import Path
-
-from ._internal import module_utils as _mod_utils  # noqa: F401
-
-
-import contextlib
-import ctypes
-import os
-import sys
-import types
-
-# Query `hasattr` only once.
-_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
-                                                               'setdlopenflags')
-
-
-@contextlib.contextmanager
-def dl_open_guard():
-    """
-    # https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
-    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
-    shared library to load custom operators.
-    """
-    if _SET_GLOBAL_FLAGS:
-        old_flags = sys.getdlopenflags()
-        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
-    yield
-    if _SET_GLOBAL_FLAGS:
-        sys.setdlopenflags(old_flags)
-
-
-def resolve_library_path(path: str) -> str:
-    return os.path.realpath(path)
-
-
-class _Ops(types.ModuleType):
-    #__file__ = '_ops.py'
-
-    def __init__(self):
-        super(_Ops, self).__init__('paddlespeech.ops')
-        self.loaded_libraries = set()
-
-    def load_library(self, path):
-        """
-        Loads a shared library from the given path into the current process.
-        This allows dynamically loading custom operators. For this, 
-        you should compile your operator and 
-        the static registration code into a shared library object, and then
-        call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
-        shared object.
-        After the library is loaded, it is added to the
-        ``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
-        for the paths of all libraries loaded using this function.
-        Args:
-            path (str): A path to a shared library to load.
-        """
-        path = resolve_library_path(path)
-        with dl_open_guard():
-            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
-            # Import the shared library into the process, thus running its
-            # static (global) initialization code in order to register custom
-            # operators with the JIT.
-            ctypes.CDLL(path)
-        self.loaded_libraries.add(path)
-
-
-_LIB_DIR = Path(__file__).parent / "lib"
-
-def _get_lib_path(lib: str):
-    suffix = "pyd" if os.name == "nt" else "so"
-    path = _LIB_DIR / f"{lib}.{suffix}"
-    return path
-
-
-def _load_lib(lib: str) -> bool:
-    """Load extension module
-    Note:
-        In case `paddleaudio` is deployed with `pex` format, the library file
-        is not in a standard location.
-        In this case, we expect that `libpaddlleaudio` is available somewhere
-        in the search path of dynamic loading mechanism, so that importing
-        `_paddlleaudio` will have library loader find and load `libpaddlleaudio`.
-        This is the reason why the function should not raising an error when the library
-        file is not found.
-    Returns:
-        bool:
-            True if the library file is found AND the library loaded without failure.
-            False if the library file is not found (like in the case where paddlleaudio
-            is deployed with pex format, thus the shared library file is
-            in a non-standard location.).
-            If the library file is found but there is an issue loading the library,
-            (such as missing dependency) then this function raises the exception as-is.
-    Raises:
-        Exception:
-            If the library file is found, but there is an issue loading the library file,
-            (when underlying `ctype.DLL` throws an exception), this function will pass
-            the exception as-is, instead of catching it and returning bool.
-            The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency
-            is not found.
-            This behavior was chosen because the expected failure case is not recoverable.
-            If a dependency is missing, then users have to install it.
-    """
-    path = _get_lib_path(lib)
-    if not path.exists():
-        warnings.warn("lib path is not exists:" + str(path))
-        return False
-    #paddlespeech.audio.ops.load_library(path)
-    ops.load_library(path)
-    return True
-
-
-_FFMPEG_INITIALIZED = False
-
-
-def _init_ffmpeg():
-    global _FFMPEG_INITIALIZED
-    if _FFMPEG_INITIALIZED:
-        return
-
-    if not paddlespeech.audio._paddlleaudio.is_ffmpeg_available():
-        raise RuntimeError(
-            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
-        )
-
-    try:
-        _load_lib("libpaddlleaudio_ffmpeg")
-    except OSError as err:
-        raise ImportError(
-            "FFmpeg libraries are not found. Please install FFmpeg.") from err
-
-    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
-
-    paddlespeech.audio._paddlleaudio.ffmpeg_init()
-    if paddlespeech.audio._paddlleaudio.ffmpeg_get_log_level() > 8:
-        paddlespeech.audio._paddlleaudio.ffmpeg_set_log_level(8)
-
-    _FFMPEG_INITIALIZED = True
-
-
-def _init_extension():
-    if not _mod_utils.is_module_available("paddlespeech.audio._paddleaudio"):
-        warnings.warn("paddlespeech C++ extension is not available.")
-        return
-
-    _load_lib("libpaddleaudio")
-    # This import is for initializing the methods registered via PyBind11
-    # This has to happen after the base library is loaded
-    from paddlespeech.audio import _paddleaudio  # noqa
-
-    # Because this part is executed as part of `import torchaudio`, we ignore the
-    # initialization failure.
-    # If the FFmpeg integration is not properly initialized, then detailed error
-    # will be raised when client code attempts to import the dedicated feature.
-    try:
-        _init_ffmpeg()
-    except Exception:
-        pass
-
-
-ops = _Ops()
-
-_init_extension()
--- a/paddlespeech/audio/_internal/__init__.py
+++ b/paddlespeech/audio/_internal/__init__.py
--- a/paddlespeech/audio/_internal/module_utils.py
+++ b/paddlespeech/audio/_internal/module_utils.py
@@ -1,148 +0,0 @@
-import importlib.util
-import warnings
-from functools import wraps
-from typing import Optional
-
-#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
-
-
-def is_module_available(*modules: str) -> bool:
-    r"""Returns if a top-level module with :attr:`name` exists *without**
-    importing it. This is generally safer than try-catch block around a
-    `import X`. It avoids third party libraries breaking assumptions of some of
-    our tests, e.g., setting multiprocessing start method when imported
-    (see librosa/#747, torchvision/#544).
-    """
-    return all(importlib.util.find_spec(m) is not None for m in modules)
-
-
-def requires_module(*modules: str):
-    """Decorate function to give error message if invoked without required optional modules.
-    This decorator is to give better error message to users rather
-    than raising ``NameError:  name 'module' is not defined`` at random places.
-    """
-    missing = [m for m in modules if not is_module_available(m)]
-
-    if not missing:
-        # fall through. If all the modules are available, no need to decorate
-        def decorator(func):
-            return func
-
-    else:
-        req = f"module: {missing[0]}" if len(
-            missing) == 1 else f"modules: {missing}"
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(
-                    f"{func.__module__}.{func.__name__} requires {req}")
-
-            return wrapped
-
-    return decorator
-
-
-def deprecated(direction: str, version: Optional[str]=None):
-    """Decorator to add deprecation message
-    Args:
-        direction (str): Migration steps to be given to users.
-        version (str or int): The version when the object will be removed
-    """
-
-    def decorator(func):
-        @wraps(func)
-        def wrapped(*args, **kwargs):
-            message = (
-                f"{func.__module__}.{func.__name__} has been deprecated "
-                f'and will be removed from {"future" if version is None else version} release. '
-                f"{direction}")
-            warnings.warn(message, stacklevel=2)
-            return func(*args, **kwargs)
-
-        return wrapped
-
-    return decorator
-
-
-def is_kaldi_available():
-    return is_module_available("paddlespeech.audio._paddleaudio")
-
-
-def requires_kaldi():
-    if is_kaldi_available():
-
-        def decorator(func):
-            return func
-
-    else:
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(
-                    f"{func.__module__}.{func.__name__} requires kaldi")
-
-            return wrapped
-
-    return decorator
-
-
-def _check_soundfile_importable():
-    if not is_module_available("soundfile"):
-        return False
-    try:
-        import soundfile  # noqa: F401
-
-        return True
-    except Exception:
-        warnings.warn(
-            "Failed to import soundfile. 'soundfile' backend is not available.")
-        return False
-
-
-_is_soundfile_importable = _check_soundfile_importable()
-
-
-def is_soundfile_available():
-    return _is_soundfile_importable
-
-
-def requires_soundfile():
-    if is_soundfile_available():
-
-        def decorator(func):
-            return func
-    else:
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(
-                    f"{func.__module__}.{func.__name__} requires soundfile")
-
-            return wrapped
-
-    return decorator
-
-
-def is_sox_available():
-    return is_module_available("paddlespeech.audio._paddleaudio")
-
-
-def requires_sox():
-    if is_sox_available():
-
-        def decorator(func):
-            return func
-    else:
-
-        def decorator(func):
-            @wraps(func)
-            def wrapped(*args, **kwargs):
-                raise RuntimeError(
-                    f"{func.__module__}.{func.__name__} requires sox")
-
-            return wrapped
-
-    return decorator
--- a/paddlespeech/audio/_ops.py
+++ b/paddlespeech/audio/_ops.py
@@ -1,63 +0,0 @@
-import contextlib
-import ctypes
-import os
-import sys
-import types
-
-# Query `hasattr` only once.
-_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
-                                                               'setdlopenflags')
-
-
-@contextlib.contextmanager
-def dl_open_guard():
-    """
-    # https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
-    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
-    shared library to load custom operators.
-    """
-    if _SET_GLOBAL_FLAGS:
-        old_flags = sys.getdlopenflags()
-        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
-    yield
-    if _SET_GLOBAL_FLAGS:
-        sys.setdlopenflags(old_flags)
-
-
-def resolve_library_path(path: str) -> str:
-    return os.path.realpath(path)
-
-
-class _Ops(types.ModuleType):
-    __file__ = '_ops.py'
-
-    def __init__(self):
-        super(_Ops, self).__init__('paddlespeech.ops')
-        self.loaded_libraries = set()
-
-    def load_library(self, path):
-        """
-        Loads a shared library from the given path into the current process.
-        This allows dynamically loading custom operators. For this, 
-        you should compile your operator and 
-        the static registration code into a shared library object, and then
-        call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
-        shared object.
-        After the library is loaded, it is added to the
-        ``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
-        for the paths of all libraries loaded using this function.
-        Args:
-            path (str): A path to a shared library to load.
-        """
-        path = resolve_library_path(path)
-        with dl_open_guard():
-            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
-            # Import the shared library into the process, thus running its
-            # static (global) initialization code in order to register custom
-            # operators with the JIT.
-            ctypes.CDLL(path)
-        self.loaded_libraries.add(path)
-
-
-# The ops "namespace"
-ops = _Ops()
--- a/paddlespeech/audio/backends/__init__.py
+++ b/paddlespeech/audio/backends/__init__.py
@@ -1,18 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-from . import utils
-from .utils import get_audio_backend
-from .utils import list_audio_backends
-from .utils import set_audio_backend
--- a/paddlespeech/audio/backends/common.py
+++ b/paddlespeech/audio/backends/common.py
@@ -1,55 +0,0 @@
-# code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py
-
-class AudioMetaData:
-    """Return type of ``torchaudio.info`` function.
-
-    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
-    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.
-
-    :ivar int sample_rate: Sample rate
-    :ivar int num_frames: The number of frames
-    :ivar int num_channels: The number of channels
-    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
-        or when it cannot be accurately inferred.
-    :ivar str encoding: Audio encoding
-        The values encoding can take are one of the following:
-
-            * ``PCM_S``: Signed integer linear PCM
-            * ``PCM_U``: Unsigned integer linear PCM
-            * ``PCM_F``: Floating point linear PCM
-            * ``FLAC``: Flac, Free Lossless Audio Codec
-            * ``ULAW``: Mu-law
-            * ``ALAW``: A-law
-            * ``MP3`` : MP3, MPEG-1 Audio Layer III
-            * ``VORBIS``: OGG Vorbis
-            * ``AMR_WB``: Adaptive Multi-Rate
-            * ``AMR_NB``: Adaptive Multi-Rate Wideband
-            * ``OPUS``: Opus
-            * ``HTK``: Single channel 16-bit PCM
-            * ``UNKNOWN`` : None of above
-    """
-
-    def __init__(
-        self,
-        sample_rate: int,
-        num_frames: int,
-        num_channels: int,
-        bits_per_sample: int,
-        encoding: str,
-    ):
-        self.sample_rate = sample_rate
-        self.num_frames = num_frames
-        self.num_channels = num_channels
-        self.bits_per_sample = bits_per_sample
-        self.encoding = encoding
-
-    def __str__(self):
-        return (
-            f"AudioMetaData("
-            f"sample_rate={self.sample_rate}, "
-            f"num_frames={self.num_frames}, "
-            f"num_channels={self.num_channels}, "
-            f"bits_per_sample={self.bits_per_sample}, "
-            f"encoding={self.encoding}"
-            f")"
-        )
--- a/paddlespeech/audio/backends/no_backend.py
+++ b/paddlespeech/audio/backends/no_backend.py
@@ -1,32 +0,0 @@
-from pathlib import Path
-from typing import Callable
-from typing import Optional
-from typing import Tuple
-from typing import Union
-
-from paddle import Tensor
-
-#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
-
-
-def load(
-        filepath: Union[str, Path],
-        out: Optional[Tensor]=None,
-        normalization: Union[bool, float, Callable]=True,
-        channels_first: bool=True,
-        num_frames: int=0,
-        offset: int=0,
-        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
-    raise RuntimeError("No audio I/O backend is available.")
-
-
-def save(filepath: str,
-         src: Tensor,
-         sample_rate: int,
-         precision: int=16,
-         channels_first: bool=True) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
-
-
-def info(filepath: str) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
--- a/paddlespeech/audio/backends/soundfile_backend.py
+++ b/paddlespeech/audio/backends/soundfile_backend.py
@@ -1,662 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import warnings
-from typing import Optional
-from typing import Tuple
-
-import numpy as np
-import paddle
-import resampy
-import soundfile
-from scipy.io import wavfile
-
-from ..utils import depth_convert
-from ..utils import ParameterError
-from .common import AudioMetaData
-
-__all__ = [
-    'resample',
-    'to_mono',
-    'normalize',
-    'save',
-    'soundfile_save',
-    'load',
-    'soundfile_load',
-    'info',
-    'to_mono'
-]
-NORMALMIZE_TYPES = ['linear', 'gaussian']
-MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
-RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
-EPS = 1e-8
-
-
-def resample(y: np.ndarray,
-             src_sr: int,
-             target_sr: int,
-             mode: str='kaiser_fast') -> np.ndarray:
-    """Audio resampling.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        src_sr (int): Source sample rate.
-        target_sr (int): Target sample rate.
-        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
-
-    Returns:
-        np.ndarray: `y` resampled to `target_sr`
-    """
-
-    if mode == 'kaiser_best':
-        warnings.warn(
-            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
-        we recommend the mode kaiser_fast in large scale audio trainning')
-
-    if not isinstance(y, np.ndarray):
-        raise ParameterError(
-            'Only support numpy np.ndarray, but received y in {type(y)}')
-
-    if mode not in RESAMPLE_MODES:
-        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
-
-    return resampy.resample(y, src_sr, target_sr, filter=mode)
-
-
-def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
-    """Convert sterior audio to mono.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
-
-    Returns:
-        np.ndarray: `y` with mono channel.
-    """
-
-    if merge_type not in MERGE_TYPES:
-        raise ParameterError(
-            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
-        )
-    if y.ndim > 2:
-        raise ParameterError(
-            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
-    if y.ndim == 1:  # nothing to merge
-        return y
-
-    if merge_type == 'ch0':
-        return y[0]
-    if merge_type == 'ch1':
-        return y[1]
-    if merge_type == 'random':
-        return y[np.random.randint(0, 2)]
-
-    # need to do averaging according to dtype
-
-    if y.dtype == 'float32':
-        y_out = (y[0] + y[1]) * 0.5
-    elif y.dtype == 'int16':
-        y_out = y.astype('int32')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-
-    elif y.dtype == 'int8':
-        y_out = y.astype('int16')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-    else:
-        raise ParameterError(f'Unsupported dtype: {y.dtype}')
-    return y_out
-
-
-def soundfile_load_(file: os.PathLike,
-                    offset: Optional[float]=None,
-                    dtype: str='int16',
-                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
-    """Load audio using soundfile library. This function load audio file using libsndfile.
-
-    Args:
-        file (os.PathLike): File of waveform.
-        offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
-        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
-        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
-
-    Returns:
-        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
-    """
-    with soundfile.SoundFile(file) as sf_desc:
-        sr_native = sf_desc.samplerate
-        if offset:
-            sf_desc.seek(int(offset * sr_native))
-        if duration is not None:
-            frame_duration = int(duration * sr_native)
-        else:
-            frame_duration = -1
-        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
-
-    return y, sf_desc.samplerate
-
-
-def normalize(y: np.ndarray, norm_type: str='linear',
-              mul_factor: float=1.0) -> np.ndarray:
-    """Normalize an input audio with additional multiplier.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
-        mul_factor (float, optional): Scaling factor. Defaults to 1.0.
-
-    Returns:
-        np.ndarray: `y` after normalization.
-    """
-
-    if norm_type == 'linear':
-        amax = np.max(np.abs(y))
-        factor = 1.0 / (amax + EPS)
-        y = y * factor * mul_factor
-    elif norm_type == 'gaussian':
-        amean = np.mean(y)
-        astd = np.std(y)
-        astd = max(astd, EPS)
-        y = mul_factor * (y - amean) / astd
-    else:
-        raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
-
-    return y
-
-
-def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
-    """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        sr (int): Sample rate.
-        file (os.PathLike): Path of auido file to save.
-    """
-    if not file.endswith('.wav'):
-        raise ParameterError(
-            f'only .wav file supported, but dst file name is: {file}')
-
-    if sr <= 0:
-        raise ParameterError(
-            f'Sample rate should be larger than 0, recieved sr = {sr}')
-
-    if y.dtype not in ['int16', 'int8']:
-        warnings.warn(
-            f'input data type is {y.dtype}, will convert data to int16 format before saving'
-        )
-        y_out = depth_convert(y, 'int16')
-    else:
-        y_out = y
-
-    wavfile.write(file, sr, y_out)
-
-def soundfile_load(
-        file: os.PathLike,
-        sr: Optional[int]=None,
-        mono: bool=True,
-        merge_type: str='average',  # ch0,ch1,random,average
-        normal: bool=True,
-        norm_type: str='linear',
-        norm_mul_factor: float=1.0,
-        offset: float=0.0,
-        duration: Optional[int]=None,
-        dtype: str='float32',
-        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
-    """Load audio file from disk. This function loads audio from disk using using audio beackend.
-
-    Args:
-        file (os.PathLike): Path of auido file to load.
-        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
-        mono (bool, optional): Return waveform with mono channel. Defaults to True.
-        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
-        normal (bool, optional): Waveform normalization. Defaults to True.
-        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
-        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
-        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
-        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
-        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
-        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
-
-    Returns:
-        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
-    """
-
-    y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration)
-
-    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
-        raise ParameterError(f'audio file {file} looks empty')
-
-    if mono:
-        y = to_mono(y, merge_type)
-
-    if sr is not None and sr != r:
-        y = resample(y, r, sr, mode=resample_mode)
-        r = sr
-
-    if normal:
-        y = normalize(y, norm_type, norm_mul_factor)
-    elif dtype in ['int8', 'int16']:
-        # still need to do normalization, before depth convertion
-        y = normalize(y, 'linear', 1.0)
-
-    y = depth_convert(y, dtype)
-    return y, r
-
-#the code below is form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py
-
-def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
-    if not encoding:
-        if not bits_per_sample:
-            subtype = {
-                paddle.uint8: "PCM_U8",
-                paddle.int16: "PCM_16",
-                paddle.int32: "PCM_32",
-                paddle.float32: "FLOAT",
-                paddle.float64: "DOUBLE",
-            }.get(dtype)
-            if not subtype:
-                raise ValueError(f"Unsupported dtype for wav: {dtype}")
-            return subtype
-        if bits_per_sample == 8:
-            return "PCM_U8"
-        return f"PCM_{bits_per_sample}"
-    if encoding == "PCM_S":
-        if not bits_per_sample:
-            return "PCM_32"
-        if bits_per_sample == 8:
-            raise ValueError("wav does not support 8-bit signed PCM encoding.")
-        return f"PCM_{bits_per_sample}"
-    if encoding == "PCM_U":
-        if bits_per_sample in (None, 8):
-            return "PCM_U8"
-        raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
-    if encoding == "PCM_F":
-        if bits_per_sample in (None, 32):
-            return "FLOAT"
-        if bits_per_sample == 64:
-            return "DOUBLE"
-        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
-    if encoding == "ULAW":
-        if bits_per_sample in (None, 8):
-            return "ULAW"
-        raise ValueError("wav only supports 8-bit mu-law encoding.")
-    if encoding == "ALAW":
-        if bits_per_sample in (None, 8):
-            return "ALAW"
-        raise ValueError("wav only supports 8-bit a-law encoding.")
-    raise ValueError(f"wav does not support {encoding}.")
-
-
-def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
-    if encoding in (None, "PCM_S"):
-        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
-    if encoding in ("PCM_U", "PCM_F"):
-        raise ValueError(f"sph does not support {encoding} encoding.")
-    if encoding == "ULAW":
-        if bits_per_sample in (None, 8):
-            return "ULAW"
-        raise ValueError("sph only supports 8-bit for mu-law encoding.")
-    if encoding == "ALAW":
-        return "ALAW"
-    raise ValueError(f"sph does not support {encoding}.")
-
-
-def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
-    if format == "wav":
-        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
-    if format == "flac":
-        if encoding:
-            raise ValueError("flac does not support encoding.")
-        if not bits_per_sample:
-            return "PCM_16"
-        if bits_per_sample > 24:
-            raise ValueError("flac does not support bits_per_sample > 24.")
-        return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
-    if format in ("ogg", "vorbis"):
-        if encoding or bits_per_sample:
-            raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
-        return "VORBIS"
-    if format == "sph":
-        return _get_subtype_for_sphere(encoding, bits_per_sample)
-    if format in ("nis", "nist"):
-        return "PCM_16"
-    raise ValueError(f"Unsupported format: {format}")
-
-def save(
-    filepath: str,
-    src: paddle.Tensor,
-    sample_rate: int,
-    channels_first: bool = True,
-    compression: Optional[float] = None,
-    format: Optional[str] = None,
-    encoding: Optional[str] = None,
-    bits_per_sample: Optional[int] = None,
-):
-    """Save audio data to file.
-
-    Note:
-        The formats this function can handle depend on the soundfile installation.
-        This function is tested on the following formats;
-
-        * WAV
-
-            * 32-bit floating-point
-            * 32-bit signed integer
-            * 16-bit signed integer
-            * 8-bit unsigned integer
-
-        * FLAC
-        * OGG/VORBIS
-        * SPHERE
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
-
-    Args:
-        filepath (str or pathlib.Path): Path to audio file.
-        src (paddle.Tensor): Audio data to save. must be 2D tensor.
-        sample_rate (int): sampling rate
-        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
-            otherwise `[time, channel]`.
-        compression (float of None, optional): Not used.
-            It is here only for interface compatibility reson with "sox_io" backend.
-        format (str or None, optional): Override the audio format.
-            When ``filepath`` argument is path-like object, audio format is
-            inferred from file extension. If the file extension is missing or
-            different, you can specify the correct format with this argument.
-
-            When ``filepath`` argument is file-like object,
-            this argument is required.
-
-            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
-            ``"flac"`` and ``"sph"``.
-        encoding (str or None, optional): Changes the encoding for supported formats.
-            This argument is effective only for supported formats, sush as
-            ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
-
-                - ``"PCM_S"`` (signed integer Linear PCM)
-                - ``"PCM_U"`` (unsigned integer Linear PCM)
-                - ``"PCM_F"`` (floating point PCM)
-                - ``"ULAW"`` (mu-law)
-                - ``"ALAW"`` (a-law)
-
-        bits_per_sample (int or None, optional): Changes the bit depth for the
-            supported formats.
-            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
-            you can change the bit depth.
-            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
-
-    Supported formats/encodings/bit depth/compression are:
-
-    ``"wav"``
-        - 32-bit floating-point PCM
-        - 32-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 8-bit unsigned integer PCM
-        - 8-bit mu-law
-        - 8-bit a-law
-
-        Note:
-            Default encoding/bit depth is determined by the dtype of
-            the input Tensor.
-
-    ``"flac"``
-        - 8-bit
-        - 16-bit (default)
-        - 24-bit
-
-    ``"ogg"``, ``"vorbis"``
-        - Doesn't accept changing configuration.
-
-    ``"sph"``
-        - 8-bit signed integer PCM
-        - 16-bit signed integer PCM
-        - 24-bit signed integer PCM
-        - 32-bit signed integer PCM (default)
-        - 8-bit mu-law
-        - 8-bit a-law
-        - 16-bit a-law
-        - 24-bit a-law
-        - 32-bit a-law
-
-    """
-    if src.ndim != 2:
-        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
-    if compression is not None:
-        warnings.warn(
-            '`save` function of "soundfile" backend does not support "compression" parameter. '
-            "The argument is silently ignored."
-        )
-    if hasattr(filepath, "write"):
-        if format is None:
-            raise RuntimeError("`format` is required when saving to file object.")
-        ext = format.lower()
-    else:
-        ext = str(filepath).split(".")[-1].lower()
-
-    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
-        raise ValueError("Invalid bits_per_sample.")
-    if bits_per_sample == 24:
-        warnings.warn(
-            "Saving audio with 24 bits per sample might warp samples near -1. "
-            "Using 16 bits per sample might be able to avoid this."
-        )
-    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
-
-    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
-    # so we extend the extensions manually here
-    if ext in ["nis", "nist", "sph"] and format is None:
-        format = "NIST"
-
-    if channels_first:
-        src = src.t()
-
-    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
-
-_SUBTYPE2DTYPE = {
-    "PCM_S8": "int8",
-    "PCM_U8": "uint8",
-    "PCM_16": "int16",
-    "PCM_32": "int32",
-    "FLOAT": "float32",
-    "DOUBLE": "float64",
-}
-
-def load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[paddle.Tensor, int]:
-    """Load audio data from file.
-
-    Note:
-        The formats this function can handle depend on the soundfile installation.
-        This function is tested on the following formats;
-
-        * WAV
-
-            * 32-bit floating-point
-            * 32-bit signed integer
-            * 16-bit signed integer
-            * 8-bit unsigned integer
-
-        * FLAC
-        * OGG/VORBIS
-        * SPHERE
-
-    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
-    ``float32`` dtype and the shape of `[channel, time]`.
-    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
-
-    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
-    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
-    by providing ``normalize=False``, this function can return integer Tensor, where the samples
-    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
-    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
-
-    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
-    ``flac`` and ``mp3``.
-    For these formats, this function always returns ``float32`` Tensor with values normalized to
-    ``[-1.0, 1.0]``.
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data.
-        frame_offset (int, optional):
-            Number of frames to skip before start reading data.
-        num_frames (int, optional):
-            Maximum number of frames to read. ``-1`` reads all the remaining samples,
-            starting from ``frame_offset``.
-            This function may return the less number of frames if there is not enough
-            frames in the given file.
-        normalize (bool, optional):
-            When ``True``, this function always return ``float32``, and sample values are
-            normalized to ``[-1.0, 1.0]``.
-            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
-            integer type.
-            This argument has no effect for formats other than integer WAV type.
-        channels_first (bool, optional):
-            When True, the returned Tensor has dimension `[channel, time]`.
-            Otherwise, the returned Tensor's dimension is `[time, channel]`.
-        format (str or None, optional):
-            Not used. PySoundFile does not accept format hint.
-
-    Returns:
-        (paddle.Tensor, int): Resulting Tensor and sample rate.
-            If the input file has integer wav format and normalization is off, then it has
-            integer type, else ``float32`` type. If ``channels_first=True``, it has
-            `[channel, time]` else `[time, channel]`.
-    """
-    with soundfile.SoundFile(filepath, "r") as file_:
-        if file_.format != "WAV" or normalize:
-            dtype = "float32"
-        elif file_.subtype not in _SUBTYPE2DTYPE:
-            raise ValueError(f"Unsupported subtype: {file_.subtype}")
-        else:
-            dtype = _SUBTYPE2DTYPE[file_.subtype]
-
-        frames = file_._prepare_read(frame_offset, None, num_frames)
-        waveform = file_.read(frames, dtype, always_2d=True)
-        sample_rate = file_.samplerate
-
-    waveform = paddle.to_tensor(waveform)
-    if channels_first:
-        waveform = paddle.transpose(waveform, perm=[1,0])
-    return waveform, sample_rate
-
-
-# Mapping from soundfile subtype to number of bits per sample.
-# This is mostly heuristical and the value is set to 0 when it is irrelevant
-# (lossy formats) or when it can't be inferred.
-# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
-# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
-# the default seems to be 8 bits but it can be compressed further to 4 bits.
-# The dict is inspired from
-# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
-_SUBTYPE_TO_BITS_PER_SAMPLE = {
-    "PCM_S8": 8,  # Signed 8 bit data
-    "PCM_16": 16,  # Signed 16 bit data
-    "PCM_24": 24,  # Signed 24 bit data
-    "PCM_32": 32,  # Signed 32 bit data
-    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
-    "FLOAT": 32,  # 32 bit float data
-    "DOUBLE": 64,  # 64 bit float data
-    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
-    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
-    "IMA_ADPCM": 0,  # IMA ADPCM.
-    "MS_ADPCM": 0,  # Microsoft ADPCM.
-    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
-    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
-    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
-    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
-    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
-    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
-    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
-    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
-    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
-    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
-    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
-    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
-    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
-    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
-    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
-    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
-}
-
-def _get_bit_depth(subtype):
-    if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
-        warnings.warn(
-            f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
-            "attribute will be set to 0. If you are seeing this warning, please "
-            "report by opening an issue on github (after checking for existing/closed ones). "
-            "You may otherwise ignore this warning."
-        )
-    return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
-
-_SUBTYPE_TO_ENCODING = {
-    "PCM_S8": "PCM_S",
-    "PCM_16": "PCM_S",
-    "PCM_24": "PCM_S",
-    "PCM_32": "PCM_S",
-    "PCM_U8": "PCM_U",
-    "FLOAT": "PCM_F",
-    "DOUBLE": "PCM_F",
-    "ULAW": "ULAW",
-    "ALAW": "ALAW",
-    "VORBIS": "VORBIS",
-}
-
-def _get_encoding(format: str, subtype: str):
-    if format == "FLAC":
-        return "FLAC"
-    return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
-
-def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
-    """Get signal information of an audio file.
-
-    Note:
-        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
-        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
-
-    Args:
-        filepath (path-like object or file-like object):
-            Source of audio data.
-        format (str or None, optional):
-            Not used. PySoundFile does not accept format hint.
-
-    Returns:
-        AudioMetaData: meta data of the given audio.
-
-    """
-    sinfo = soundfile.info(filepath)
-    return AudioMetaData(
-        sinfo.samplerate,
-        sinfo.frames,
-        sinfo.channels,
-        bits_per_sample=_get_bit_depth(sinfo.subtype),
-        encoding=_get_encoding(sinfo.format, sinfo.subtype),
-    )
--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
@ -1,101 +0,0 @@
-from pathlib import Path
-from typing import Callable
-from typing import Optional, Tuple, Union
-
-import paddle
-from paddle import Tensor
-from .common import AudioMetaData
-import os
-
-from paddlespeech.audio._internal import module_utils  as _mod_utils
-from paddlespeech.audio import _paddleaudio as paddleaudio 
-
-#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
-
-def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
-
-
-def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
-    raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
-
-
-# Note: need to comply TorchScript syntax -- need annotation and no f-string
-def _fail_load(
-    filepath: str,
-    frame_offset: int = 0,
-    num_frames: int = -1,
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[Tensor, int]:
-    raise RuntimeError("Failed to load audio from {}".format(filepath))
-
-
-def _fail_load_fileobj(fileobj, *args, **kwargs):
-    raise RuntimeError(f"Failed to load audio from {fileobj}")
-
-_fallback_info = _fail_info
-_fallback_info_fileobj = _fail_info_fileobj
-_fallback_load = _fail_load
-_fallback_load_filebj = _fail_load_fileobj
-
-@_mod_utils.requires_sox()
-def load(
-        filepath: str,
-        frame_offset: int = 0,
-        num_frames: int=-1,
-        normalize: bool = True,
-        channels_first: bool = True,
-        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
-    if hasattr(filepath, "read"):
-        ret = paddleaudio.load_audio_fileobj(
-            filepath, frame_offset, num_frames, normalize, channels_first, format
-        )
-        if ret is not None:
-            audio_tensor = paddle.to_tensor(ret[0])
-            return (audio_tensor, ret[1])
-        return _fallback_load_fileobj(filepath, frame_offset, num_frames, normalize, channels_first, format)
-    filepath = os.fspath(filepath)
-    ret = paddleaudio.sox_io_load_audio_file(
-        filepath, frame_offset, num_frames, normalize, channels_first, format
-    )
-    if ret is not None:
-        audio_tensor = paddle.to_tensor(ret[0])
-        return (audio_tensor, ret[1])
-    return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
-
-
-@_mod_utils.requires_sox()
-def save(filepath: str,
-    src: Tensor,
-    sample_rate: int,
-    channels_first: bool = True,
-    compression: Optional[float] = None,
-    format: Optional[str] = None,
-    encoding: Optional[str] = None,
-    bits_per_sample: Optional[int] = None,
-):
-    src_arr = src.numpy()
-    if hasattr(filepath, "write"):
-        paddleaudio.save_audio_fileobj(
-            filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample
-        )
-        return
-    filepath = os.fspath(filepath)
-    paddleaudio.sox_io_save_audio_file(
-        filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample
-    )
-
-@_mod_utils.requires_sox()
-def info(filepath: str, format: Optional[str] = None,) -> AudioMetaData:
-    if hasattr(filepath, "read"):
-        sinfo = paddleaudio.get_info_fileobj(filepath, format)
-        if sinfo is not None:
-            return AudioMetaData(*sinfo)
-        return _fallback_info_fileobj(filepath, format)
-    filepath = os.fspath(filepath)
-    sinfo = paddleaudio.get_info_file(filepath, format)
-    if sinfo is not None:
-        return AudioMetaData(*sinfo)
-    return _fallback_info(filepath, format)
--- a/paddlespeech/audio/backends/utils.py
+++ b/paddlespeech/audio/backends/utils.py
@ -1,93 +0,0 @@
-"""Defines utilities for switching audio backends"""
-#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
-
-import warnings
-from typing import List
-from typing import Optional
-
-import paddlespeech.audio
-from paddlespeech.audio._internal import module_utils as _mod_utils
-
-from . import no_backend, soundfile_backend, sox_io_backend
-
-__all__ = [
-    "list_audio_backends",
-    "get_audio_backend",
-    "set_audio_backend",
-]
-
-
-def list_audio_backends() -> List[str]:
-    """List available backends
-
-    Returns:
-        List[str]: The list of available backends.
-    """
-    backends = []
-    if _mod_utils.is_module_available("soundfile"):
-        backends.append("soundfile")
-    if _mod_utils.is_sox_available():
-        backends.append("sox_io")
-    return backends
-
-
-def set_audio_backend(backend: Optional[str]):
-    """Set the backend for I/O operation
-
-    Args:
-        backend (str or None): Name of the backend.
-            One of ``"sox_io"`` or ``"soundfile"`` based on availability
-            of the system. If ``None`` is provided the  current backend is unassigned.
-    """
-    if backend is not None and backend not in list_audio_backends():
-        raise RuntimeError(f'Backend "{backend}" is not one of '
-                           f"available backends: {list_audio_backends()}.")
-
-    if backend is None:
-        module = no_backend
-    elif backend == "sox_io":
-        module = sox_io_backend
-    elif backend == "soundfile":
-        module = soundfile_backend
-    else:
-        raise NotImplementedError(f'Unexpected backend "{backend}"')
-
-    for func in ["save", "load", "info"]:
-        setattr(paddlespeech.audio, func, getattr(module, func))
-
-
-# def _init_audio_backend():
-#     backends = list_audio_backends()
-#     if "sox_io" in backends:
-#         set_audio_backend("sox_io")
-#     elif "soundfile" in backends:
-#         set_audio_backend("soundfile")
-#     else:
-#         warnings.warn("No audio backend is available.")
-#         set_audio_backend(None)
-
-
-def _init_audio_backend():
-    backends = list_audio_backends()
-    if "soundfile" in backends:
-        set_audio_backend("soundfile")
-    elif "sox_io" in backends:
-        set_audio_backend("sox_io")
-    else:
-        warnings.warn("No audio backend is available.")
-        set_audio_backend(None)
-
-
-def get_audio_backend() -> Optional[str]:
-    """Get the name of the current backend
-
-    Returns:
-        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.
-    """
-    if paddlespeech.audio.load == no_backend.load:
-        return None
-    if paddlespeech.audio.load == sox_io_backend.load:
-        return "sox_io"
-    if paddlespeech.audio.load == soundfile_backend.load:
-        return "soundfile"
-    raise ValueError("Unknown backend.")
--- a/paddlespeech/audio/compliance/init.py
+++ b/paddlespeech/audio/compliance/init.py
@ -1,15 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from . import kaldi
-from . import librosa
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
@ -1,638 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from torchaudio(https://github.com/pytorch/audio)
-import math
-from typing import Tuple
-
-import paddle
-from paddle import Tensor
-
-from ..functional import create_dct
-from ..functional.window import get_window
-
-__all__ = [
-    'spectrogram',
-    'fbank',
-    'mfcc',
-]
-
-# window types
-HANNING = 'hann'
-HAMMING = 'hamming'
-POVEY = 'povey'
-RECTANGULAR = 'rect'
-BLACKMAN = 'blackman'
-
-
-def _get_epsilon(dtype):
-    return paddle.to_tensor(1e-07, dtype=dtype)
-
-
-def _next_power_of_2(x: int) -> int:
-    return 1 if x == 0 else 2**(x - 1).bit_length()
-
-
-def _get_strided(waveform: Tensor,
-                 window_size: int,
-                 window_shift: int,
-                 snip_edges: bool) -> Tensor:
-    assert waveform.dim() == 1
-    num_samples = waveform.shape[0]
-
-    if snip_edges:
-        if num_samples < window_size:
-            return paddle.empty((0, 0), dtype=waveform.dtype)
-        else:
-            m = 1 + (num_samples - window_size) // window_shift
-    else:
-        reversed_waveform = paddle.flip(waveform, [0])
-        m = (num_samples + (window_shift // 2)) // window_shift
-        pad = window_size // 2 - window_shift // 2
-        pad_right = reversed_waveform
-        if pad > 0:
-            pad_left = reversed_waveform[-pad:]
-            waveform = paddle.concat((pad_left, waveform, pad_right), axis=0)
-        else:
-            waveform = paddle.concat((waveform[-pad:], pad_right), axis=0)
-
-    return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T
-
-
-def _feature_window_function(
-        window_type: str,
-        window_size: int,
-        blackman_coeff: float,
-        dtype: int, ) -> Tensor:
-    if window_type == HANNING:
-        return get_window('hann', window_size, fftbins=False, dtype=dtype)
-    elif window_type == HAMMING:
-        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
-    elif window_type == POVEY:
-        return get_window(
-            'hann', window_size, fftbins=False, dtype=dtype).pow(0.85)
-    elif window_type == RECTANGULAR:
-        return paddle.ones([window_size], dtype=dtype)
-    elif window_type == BLACKMAN:
-        a = 2 * math.pi / (window_size - 1)
-        window_function = paddle.arange(window_size, dtype=dtype)
-        return (blackman_coeff - 0.5 * paddle.cos(a * window_function) +
-                (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function)
-                ).astype(dtype)
-    else:
-        raise Exception('Invalid window type ' + window_type)
-
-
-def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
-                    energy_floor: float) -> Tensor:
-    log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log()
-    if energy_floor == 0.0:
-        return log_energy
-    return paddle.maximum(
-        log_energy,
-        paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype))
-
-
-def _get_waveform_and_window_properties(
-        waveform: Tensor,
-        channel: int,
-        sr: int,
-        frame_shift: float,
-        frame_length: float,
-        round_to_power_of_two: bool,
-        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
-    channel = max(channel, 0)
-    assert channel < waveform.shape[0], (
-        'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
-    waveform = waveform[channel, :]  # size (n)
-    window_shift = int(
-        sr * frame_shift *
-        0.001)  # pass frame_shift and frame_length in milliseconds
-    window_size = int(sr * frame_length * 0.001)
-    padded_window_size = _next_power_of_2(
-        window_size) if round_to_power_of_two else window_size
-
-    assert 2 <= window_size <= len(waveform), (
-        'choose a window size {} that is [2, {}]'.format(window_size,
-                                                         len(waveform)))
-    assert 0 < window_shift, '`window_shift` must be greater than 0'
-    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
-                                        ' use `round_to_power_of_two` or change `frame_length`'
-    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
-    assert sr > 0, '`sr` must be greater than zero'
-    return waveform, window_shift, window_size, padded_window_size
-
-
-def _get_window(waveform: Tensor,
-                padded_window_size: int,
-                window_size: int,
-                window_shift: int,
-                window_type: str,
-                blackman_coeff: float,
-                snip_edges: bool,
-                raw_energy: bool,
-                energy_floor: float,
-                dither: float,
-                remove_dc_offset: bool,
-                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
-    dtype = waveform.dtype
-    epsilon = _get_epsilon(dtype)
-
-    # (m, window_size)
-    strided_input = _get_strided(waveform, window_size, window_shift,
-                                 snip_edges)
-
-    if dither != 0.0:
-        x = paddle.maximum(epsilon,
-                           paddle.rand(strided_input.shape, dtype=dtype))
-        rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
-        strided_input = strided_input + rand_gauss * dither
-
-    if remove_dc_offset:
-        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
-        strided_input = strided_input - row_means
-
-    if raw_energy:
-        signal_log_energy = _get_log_energy(strided_input, epsilon,
-                                            energy_floor)  # (m)
-
-    if preemphasis_coefficient != 0.0:
-        offset_strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (1, 0),
-            data_format='NCL',
-            mode='replicate').squeeze(0)  # (m, window_size + 1)
-        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
-                                                                                       -1]
-
-    window_function = _feature_window_function(
-        window_type, window_size, blackman_coeff,
-        dtype).unsqueeze(0)  # (1, window_size)
-    strided_input = strided_input * window_function  # (m, window_size)
-
-    # (m, padded_window_size)
-    if padded_window_size != window_size:
-        padding_right = padded_window_size - window_size
-        strided_input = paddle.nn.functional.pad(
-            strided_input.unsqueeze(0), (0, padding_right),
-            data_format='NCL',
-            mode='constant',
-            value=0).squeeze(0)
-
-    if not raw_energy:
-        signal_log_energy = _get_log_energy(strided_input, epsilon,
-                                            energy_floor)  # size (m)
-
-    return strided_input, signal_log_energy
-
-
-def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
-    if subtract_mean:
-        col_means = paddle.mean(tensor, axis=0).unsqueeze(0)
-        tensor = tensor - col_means
-    return tensor
-
-
-def spectrogram(waveform: Tensor,
-                blackman_coeff: float=0.42,
-                channel: int=-1,
-                dither: float=0.0,
-                energy_floor: float=1.0,
-                frame_length: float=25.0,
-                frame_shift: float=10.0,
-                preemphasis_coefficient: float=0.97,
-                raw_energy: bool=True,
-                remove_dc_offset: bool=True,
-                round_to_power_of_two: bool=True,
-                sr: int=16000,
-                snip_edges: bool=True,
-                subtract_mean: bool=False,
-                window_type: str=POVEY) -> Tensor:
-    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
-
-    Args:
-        waveform (Tensor): A waveform tensor with shape `(C, T)`.
-        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
-        channel (int, optional): Select the channel of waveform. Defaults to -1.
-        dither (float, optional): Dithering constant . Defaults to 0.0.
-        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
-        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
-        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
-        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
-        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
-        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
-        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
-            to FFT. Defaults to True.
-        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
-            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
-        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
-        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
-
-    Returns:
-        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames
-            depends on frame_length and frame_shift.
-    """
-    dtype = waveform.dtype
-    epsilon = _get_epsilon(dtype)
-
-    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
-        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
-        preemphasis_coefficient)
-
-    strided_input, signal_log_energy = _get_window(
-        waveform, padded_window_size, window_size, window_shift, window_type,
-        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
-        remove_dc_offset, preemphasis_coefficient)
-
-    # (m, padded_window_size // 2 + 1, 2)
-    fft = paddle.fft.rfft(strided_input)
-
-    power_spectrum = paddle.maximum(
-        fft.abs().pow(2.), epsilon).log()  # (m, padded_window_size // 2 + 1)
-    power_spectrum[:, 0] = signal_log_energy
-
-    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
-    return power_spectrum
-
-
-def _inverse_mel_scale_scalar(mel_freq: float) -> float:
-    return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
-
-
-def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
-    return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
-
-
-def _mel_scale_scalar(freq: float) -> float:
-    return 1127.0 * math.log(1.0 + freq / 700.0)
-
-
-def _mel_scale(freq: Tensor) -> Tensor:
-    return 1127.0 * (1.0 + freq / 700.0).log()
-
-
-def _vtln_warp_freq(vtln_low_cutoff: float,
-                    vtln_high_cutoff: float,
-                    low_freq: float,
-                    high_freq: float,
-                    vtln_warp_factor: float,
-                    freq: Tensor) -> Tensor:
-    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
-    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
-    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
-    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
-    scale = 1.0 / vtln_warp_factor
-    Fl = scale * l
-    Fh = scale * h
-    assert l > low_freq and h < high_freq
-    scale_left = (Fl - low_freq) / (l - low_freq)
-    scale_right = (high_freq - Fh) / (high_freq - h)
-    res = paddle.empty_like(freq)
-
-    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
-        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
-    before_l = paddle.less_than(freq, paddle.to_tensor(l))
-    before_h = paddle.less_than(freq, paddle.to_tensor(h))
-    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
-
-    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
-    res[before_h] = scale * freq[before_h]
-    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
-    res[outside_low_high_freq] = freq[outside_low_high_freq]
-
-    return res
-
-
-def _vtln_warp_mel_freq(vtln_low_cutoff: float,
-                        vtln_high_cutoff: float,
-                        low_freq,
-                        high_freq: float,
-                        vtln_warp_factor: float,
-                        mel_freq: Tensor) -> Tensor:
-    return _mel_scale(
-        _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
-                        vtln_warp_factor, _inverse_mel_scale(mel_freq)))
-
-
-def _get_mel_banks(num_bins: int,
-                   window_length_padded: int,
-                   sample_freq: float,
-                   low_freq: float,
-                   high_freq: float,
-                   vtln_low: float,
-                   vtln_high: float,
-                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
-    assert num_bins > 3, 'Must have at least 3 mel bins'
-    assert window_length_padded % 2 == 0
-    num_fft_bins = window_length_padded / 2
-    nyquist = 0.5 * sample_freq
-
-    if high_freq <= 0.0:
-        high_freq += nyquist
-
-    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
-        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))
-
-    fft_bin_width = sample_freq / window_length_padded
-    mel_low_freq = _mel_scale_scalar(low_freq)
-    mel_high_freq = _mel_scale_scalar(high_freq)
-
-    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
-
-    if vtln_high < 0.0:
-        vtln_high += nyquist
-
-    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
-                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
-        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
-         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))
-
-    bin = paddle.arange(num_bins).unsqueeze(1)
-    left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
-    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
-    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)
-
-    if vtln_warp_factor != 1.0:
-        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
-                                       vtln_warp_factor, left_mel)
-        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
-                                         high_freq, vtln_warp_factor,
-                                         center_mel)
-        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
-                                        high_freq, vtln_warp_factor, right_mel)
-
-    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
-    # (1, num_fft_bins)
-    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
-
-    # (num_bins, num_fft_bins)
-    up_slope = (mel - left_mel) / (center_mel - left_mel)
-    down_slope = (right_mel - mel) / (right_mel - center_mel)
-
-    if vtln_warp_factor == 1.0:
-        bins = paddle.maximum(
-            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
-    else:
-        bins = paddle.zeros_like(up_slope)
-        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
-            mel, center_mel)
-        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
-            mel, right_mel)
-        bins[up_idx] = up_slope[up_idx]
-        bins[down_idx] = down_slope[down_idx]
-
-    return bins, center_freqs
-
-
-def fbank(waveform: Tensor,
-          blackman_coeff: float=0.42,
-          channel: int=-1,
-          dither: float=0.0,
-          energy_floor: float=1.0,
-          frame_length: float=25.0,
-          frame_shift: float=10.0,
-          high_freq: float=0.0,
-          htk_compat: bool=False,
-          low_freq: float=20.0,
-          n_mels: int=23,
-          preemphasis_coefficient: float=0.97,
-          raw_energy: bool=True,
-          remove_dc_offset: bool=True,
-          round_to_power_of_two: bool=True,
-          sr: int=16000,
-          snip_edges: bool=True,
-          subtract_mean: bool=False,
-          use_energy: bool=False,
-          use_log_fbank: bool=True,
-          use_power: bool=True,
-          vtln_high: float=-500.0,
-          vtln_low: float=100.0,
-          vtln_warp: float=1.0,
-          window_type: str=POVEY) -> Tensor:
-    """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
-
-    Args:
-        waveform (Tensor): A waveform tensor with shape `(C, T)`.
-        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
-        channel (int, optional): Select the channel of waveform. Defaults to -1.
-        dither (float, optional): Dithering constant . Defaults to 0.0.
-        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
-        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
-        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
-        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
-        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
-        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
-        n_mels (int, optional): Number of output mel bins. Defaults to 23.
-        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
-        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
-        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
-        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
-            to FFT. Defaults to True.
-        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
-            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
-        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
-        use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
-        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
-        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
-        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
-        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
-        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
-        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
-
-    Returns:
-        Tensor: A filter banks tensor with shape `(m, n_mels)`.
-    """
-    dtype = waveform.dtype
-
-    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
-        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
-        preemphasis_coefficient)
-
-    strided_input, signal_log_energy = _get_window(
-        waveform, padded_window_size, window_size, window_shift, window_type,
-        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
-        remove_dc_offset, preemphasis_coefficient)
-
-    # (m, padded_window_size // 2 + 1)
-    spectrum = paddle.fft.rfft(strided_input).abs()
-    if use_power:
-        spectrum = spectrum.pow(2.)
-
-    # (n_mels, padded_window_size // 2)
-    mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
-                                     high_freq, vtln_low, vtln_high, vtln_warp)
-    mel_energies = mel_energies.astype(dtype)
-
-    # (n_mels, padded_window_size // 2 + 1)
-    mel_energies = paddle.nn.functional.pad(
-        mel_energies.unsqueeze(0), (0, 1),
-        data_format='NCL',
-        mode='constant',
-        value=0).squeeze(0)
-
-    # (m, n_mels)
-    mel_energies = paddle.mm(spectrum, mel_energies.T)
-    if use_log_fbank:
-        mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()
-
-    if use_energy:
-        signal_log_energy = signal_log_energy.unsqueeze(1)
-        if htk_compat:
-            mel_energies = paddle.concat(
-                (mel_energies, signal_log_energy), axis=1)
-        else:
-            mel_energies = paddle.concat(
-                (signal_log_energy, mel_energies), axis=1)
-
-    # (m, n_mels + 1)
-    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
-    return mel_energies
-
-
-def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
-    dct_matrix = create_dct(n_mels, n_mels, 'ortho')
-    dct_matrix[:, 0] = math.sqrt(1 / float(n_mels))
-    dct_matrix = dct_matrix[:, :n_mfcc]  # (n_mels, n_mfcc)
-    return dct_matrix
-
-
-def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
-    i = paddle.arange(n_mfcc)
-    return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i /
-                                                    cepstral_lifter)
-
-
-def mfcc(waveform: Tensor,
-         blackman_coeff: float=0.42,
-         cepstral_lifter: float=22.0,
-         channel: int=-1,
-         dither: float=0.0,
-         energy_floor: float=1.0,
-         frame_length: float=25.0,
-         frame_shift: float=10.0,
-         high_freq: float=0.0,
-         htk_compat: bool=False,
-         low_freq: float=20.0,
-         n_mfcc: int=13,
-         n_mels: int=23,
-         preemphasis_coefficient: float=0.97,
-         raw_energy: bool=True,
-         remove_dc_offset: bool=True,
-         round_to_power_of_two: bool=True,
-         sr: int=16000,
-         snip_edges: bool=True,
-         subtract_mean: bool=False,
-         use_energy: bool=False,
-         vtln_high: float=-500.0,
-         vtln_low: float=100.0,
-         vtln_warp: float=1.0,
-         window_type: str=POVEY) -> Tensor:
-    """Compute and return mel frequency cepstral coefficients from a waveform. The output is
-            identical to Kaldi's.
-
-    Args:
-        waveform (Tensor): A waveform tensor with shape `(C, T)`.
-        blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
-        cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
-        channel (int, optional): Select the channel of waveform. Defaults to -1.
-        dither (float, optional): Dithering constant . Defaults to 0.0.
-        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
-        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
-        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
-        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
-        htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
-        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
-        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
-        n_mels (int, optional): Number of output mel bins. Defaults to 23.
-        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
-        raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
-        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
-        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
-            to FFT. Defaults to True.
-        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
-            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
-        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
-        use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
-        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
-        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
-        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
-        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
-
-    Returns:
-        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
-    """
-    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
-        n_mfcc, n_mels)
-
-    dtype = waveform.dtype
-
-    # (m, n_mels + use_energy)
-    feature = fbank(
-        waveform=waveform,
-        blackman_coeff=blackman_coeff,
-        channel=channel,
-        dither=dither,
-        energy_floor=energy_floor,
-        frame_length=frame_length,
-        frame_shift=frame_shift,
-        high_freq=high_freq,
-        htk_compat=htk_compat,
-        low_freq=low_freq,
-        n_mels=n_mels,
-        preemphasis_coefficient=preemphasis_coefficient,
-        raw_energy=raw_energy,
-        remove_dc_offset=remove_dc_offset,
-        round_to_power_of_two=round_to_power_of_two,
-        sr=sr,
-        snip_edges=snip_edges,
-        subtract_mean=False,
-        use_energy=use_energy,
-        use_log_fbank=True,
-        use_power=True,
-        vtln_high=vtln_high,
-        vtln_low=vtln_low,
-        vtln_warp=vtln_warp,
-        window_type=window_type)
-
-    if use_energy:
-        # (m)
-        signal_log_energy = feature[:, n_mels if htk_compat else 0]
-        mel_offset = int(not htk_compat)
-        feature = feature[:, mel_offset:(n_mels + mel_offset)]
-
-    # (n_mels, n_mfcc)
-    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
-
-    # (m, n_mfcc)
-    feature = feature.matmul(dct_matrix)
-
-    if cepstral_lifter != 0.0:
-        # (1, n_mfcc)
-        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
-        feature *= lifter_coeffs.astype(dtype=dtype)
-
-    if use_energy:
-        feature[:, 0] = signal_log_energy
-
-    if htk_compat:
-        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
-        feature = feature[:, 1:]  # (m, n_mfcc - 1)
-        if not use_energy:
-            energy *= math.sqrt(2)
-
-        feature = paddle.concat((feature, energy), axis=1)
-
-    feature = _subtract_column_mean(feature, subtract_mean)
-    return feature
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
@ -1,788 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from librosa(https://github.com/librosa/librosa)
-import warnings
-from typing import List
-from typing import Optional
-from typing import Union
-
-import numpy as np
-import scipy
-from numpy.lib.stride_tricks import as_strided
-from scipy import signal
-
-from ..utils import depth_convert
-from ..utils import ParameterError
-
-__all__ = [
-    # dsp
-    'stft',
-    'mfcc',
-    'hz_to_mel',
-    'mel_to_hz',
-    'mel_frequencies',
-    'power_to_db',
-    'compute_fbank_matrix',
-    'melspectrogram',
-    'spectrogram',
-    'mu_encode',
-    'mu_decode',
-    # augmentation
-    'depth_augment',
-    'spect_augment',
-    'random_crop1d',
-    'random_crop2d',
-    'adaptive_spect_augment',
-]
-
-
-def _pad_center(data: np.ndarray, size: int, axis: int=-1,
-                **kwargs) -> np.ndarray:
-    """Pad an array to a target length along a target axis.
-
-    This differs from `np.pad` by centering the data prior to padding,
-    analogous to `str.center`
-    """
-
-    kwargs.setdefault("mode", "constant")
-    n = data.shape[axis]
-    lpad = int((size - n) // 2)
-    lengths = [(0, 0)] * data.ndim
-    lengths[axis] = (lpad, int(size - n - lpad))
-
-    if lpad < 0:
-        raise ParameterError(("Target size ({size:d}) must be "
-                              "at least input size ({n:d})"))
-
-    return np.pad(data, lengths, **kwargs)
-
-
-def _split_frames(x: np.ndarray,
-                  frame_length: int,
-                  hop_length: int,
-                  axis: int=-1) -> np.ndarray:
-    """Slice a data array into (overlapping) frames.
-
-    This function is aligned with librosa.frame
-    """
-
-    if not isinstance(x, np.ndarray):
-        raise ParameterError(
-            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
-
-    if x.shape[axis] < frame_length:
-        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
-                             f" for frame_length={frame_length:d}")
-
-    if hop_length < 1:
-        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
-
-    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
-        warnings.warn(f"librosa.util.frame called with axis={axis} "
-                      "on a non-contiguous input. This will result in a copy.")
-        x = np.asfortranarray(x)
-    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
-        warnings.warn(f"librosa.util.frame called with axis={axis} "
-                      "on a non-contiguous input. This will result in a copy.")
-        x = np.ascontiguousarray(x)
-
-    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
-    strides = np.asarray(x.strides)
-
-    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
-
-    if axis == -1:
-        shape = list(x.shape)[:-1] + [frame_length, n_frames]
-        strides = list(strides) + [hop_length * new_stride]
-
-    elif axis == 0:
-        shape = [n_frames, frame_length] + list(x.shape)[1:]
-        strides = [hop_length * new_stride] + list(strides)
-
-    else:
-        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
-
-    return as_strided(x, shape=shape, strides=strides)
-
-
-def _check_audio(y, mono=True) -> bool:
-    """Determine whether a variable contains valid audio data.
-
-    The audio y must be a np.ndarray, ether 1-channel or two channel
-    """
-    if not isinstance(y, np.ndarray):
-        raise ParameterError("Audio data must be of type numpy.ndarray")
-    if y.ndim > 2:
-        raise ParameterError(
-            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
-
-    if mono and y.ndim == 2:
-        raise ParameterError(
-            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
-
-    if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
-        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
-
-    if not np.issubdtype(y.dtype, np.floating):
-        raise ParameterError("Audio data must be floating-point")
-
-    if not np.isfinite(y).all():
-        raise ParameterError("Audio buffer is not finite everywhere")
-
-    return True
-
-
-def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
-              htk: bool=False) -> np.ndarray:
-    """Convert Hz to Mels.
-
-    Args:
-        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-
-    Returns:
-        np.ndarray: Frequency in mels.
-    """
-    freq = np.asanyarray(frequencies)
-
-    if htk:
-        return 2595.0 * np.log10(1.0 + freq / 700.0)
-
-    # Fill in the linear part
-    f_min = 0.0
-    f_sp = 200.0 / 3
-
-    mels = (freq - f_min) / f_sp
-
-    # Fill in the log-scale part
-
-    min_log_hz = 1000.0  # beginning of log region (Hz)
-    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
-    logstep = np.log(6.4) / 27.0  # step size for log region
-
-    if freq.ndim:
-        # If we have array data, vectorize
-        log_t = freq >= min_log_hz
-        mels[log_t] = min_log_mel + \
-            np.log(freq[log_t] / min_log_hz) / logstep
-    elif freq >= min_log_hz:
-        # If we have scalar data, heck directly
-        mels = min_log_mel + np.log(freq / min_log_hz) / logstep
-
-    return mels
-
-
-def mel_to_hz(mels: Union[float, List[float], np.ndarray],
-              htk: int=False) -> np.ndarray:
-    """Convert mel bin numbers to frequencies.
-
-    Args:
-        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-
-    Returns:
-        np.ndarray: Frequencies in Hz.
-    """
-    mel_array = np.asanyarray(mels)
-
-    if htk:
-        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
-
-    # Fill in the linear scale
-    f_min = 0.0
-    f_sp = 200.0 / 3
-    freqs = f_min + f_sp * mel_array
-
-    # And now the nonlinear scale
-    min_log_hz = 1000.0  # beginning of log region (Hz)
-    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
-    logstep = np.log(6.4) / 27.0  # step size for log region
-
-    if mel_array.ndim:
-        # If we have vector data, vectorize
-        log_t = mel_array >= min_log_mel
-        freqs[log_t] = min_log_hz * \
-            np.exp(logstep * (mel_array[log_t] - min_log_mel))
-    elif mel_array >= min_log_mel:
-        # If we have scalar data, check directly
-        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
-
-    return freqs
-
-
-def mel_frequencies(n_mels: int=128,
-                    fmin: float=0.0,
-                    fmax: float=11025.0,
-                    htk: bool=False) -> np.ndarray:
-    """Compute mel frequencies.
-
-    Args:
-        n_mels (int, optional): Number of mel bins. Defaults to 128.
-        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
-        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-
-    Returns:
-        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
-    """
-    # 'Center freqs' of mel bands - uniformly spaced between limits
-    min_mel = hz_to_mel(fmin, htk=htk)
-    max_mel = hz_to_mel(fmax, htk=htk)
-
-    mels = np.linspace(min_mel, max_mel, n_mels)
-
-    return mel_to_hz(mels, htk=htk)
-
-
-def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
-    """Compute fourier frequencies.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): FFT size.
-
-    Returns:
-        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
-    """
-    return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
-
-
-def compute_fbank_matrix(sr: int,
-                         n_fft: int,
-                         n_mels: int=128,
-                         fmin: float=0.0,
-                         fmax: Optional[float]=None,
-                         htk: bool=False,
-                         norm: str="slaney",
-                         dtype: type=np.float32) -> np.ndarray:
-    """Compute fbank matrix.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): FFT size.
-        n_mels (int, optional): Number of mel bins. Defaults to 128.
-        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
-        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-        norm (str, optional): Type of normalization. Defaults to "slaney".
-        dtype (type, optional): Data type. Defaults to np.float32.
-
-
-    Returns:
-        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
-    """
-    if norm != "slaney":
-        raise ParameterError('norm must set to slaney')
-
-    if fmax is None:
-        fmax = float(sr) / 2
-
-    # Initialize the weights
-    n_mels = int(n_mels)
-    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
-
-    # Center freqs of each FFT bin
-    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
-
-    # 'Center freqs' of mel bands - uniformly spaced between limits
-    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
-
-    fdiff = np.diff(mel_f)
-    ramps = np.subtract.outer(mel_f, fftfreqs)
-
-    for i in range(n_mels):
-        # lower and upper slopes for all bins
-        lower = -ramps[i] / fdiff[i]
-        upper = ramps[i + 2] / fdiff[i + 1]
-
-        # .. then intersect them with each other and zero
-        weights[i] = np.maximum(0, np.minimum(lower, upper))
-
-    if norm == "slaney":
-        # Slaney-style mel is scaled to be approx constant energy per channel
-        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
-        weights *= enorm[:, np.newaxis]
-
-    # Only check weights if f_mel[0] is positive
-    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
-        # This means we have an empty channel somewhere
-        warnings.warn("Empty filters detected in mel frequency basis. "
-                      "Some channels will produce empty responses. "
-                      "Try increasing your sampling rate (and fmax) or "
-                      "reducing n_mels.")
-
-    return weights
-
-
-def stft(x: np.ndarray,
-         n_fft: int=2048,
-         hop_length: Optional[int]=None,
-         win_length: Optional[int]=None,
-         window: str="hann",
-         center: bool=True,
-         dtype: type=np.complex64,
-         pad_mode: str="reflect") -> np.ndarray:
-    """Short-time Fourier transform (STFT).
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        n_fft (int, optional): FFT size. Defaults to 2048.
-        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
-        win_length (Optional[int], optional): The size of window. Defaults to None.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-
-    Returns:
-        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
-    """
-    _check_audio(x)
-
-    # By default, use the entire frame
-    if win_length is None:
-        win_length = n_fft
-
-    # Set the default hop, if it's not already specified
-    if hop_length is None:
-        hop_length = int(win_length // 4)
-
-    fft_window = signal.get_window(window, win_length, fftbins=True)
-
-    # Pad the window out to n_fft size
-    fft_window = _pad_center(fft_window, n_fft)
-
-    # Reshape so that the window can be broadcast
-    fft_window = fft_window.reshape((-1, 1))
-
-    # Pad the time series so that frames are centered
-    if center:
-        if n_fft > x.shape[-1]:
-            warnings.warn(
-                f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
-            )
-        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
-
-    elif n_fft > x.shape[-1]:
-        raise ParameterError(
-            f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
-        )
-
-    # Window the time series.
-    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
-    # Pre-allocate the STFT matrix
-    stft_matrix = np.empty(
-        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
-    fft = np.fft  # use numpy fft as default
-    # Constrain STFT block sizes to 256 KB
-    MAX_MEM_BLOCK = 2**8 * 2**10
-    # how many columns can we fit within MAX_MEM_BLOCK?
-    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
-    n_columns = max(n_columns, 1)
-
-    for bl_s in range(0, stft_matrix.shape[1], n_columns):
-        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
-        stft_matrix[:, bl_s:bl_t] = fft.rfft(
-            fft_window * x_frames[:, bl_s:bl_t], axis=0)
-
-    return stft_matrix
-
-
-def power_to_db(spect: np.ndarray,
-                ref: float=1.0,
-                amin: float=1e-10,
-                top_db: Optional[float]=80.0) -> np.ndarray:
-    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
-
-    Args:
-        spect (np.ndarray): STFT power spectrogram of an input waveform.
-        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): Minimum threshold. Defaults to 1e-10.
-        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0.
-
-    Returns:
-        np.ndarray: Power spectrogram in db scale.
-    """
-    spect = np.asarray(spect)
-
-    if amin <= 0:
-        raise ParameterError("amin must be strictly positive")
-
-    if np.issubdtype(spect.dtype, np.complexfloating):
-        warnings.warn(
-            "power_to_db was called on complex input so phase "
-            "information will be discarded. To suppress this warning, "
-            "call power_to_db(np.abs(D)**2) instead.")
-        magnitude = np.abs(spect)
-    else:
-        magnitude = spect
-
-    if callable(ref):
-        # User supplied a function to calculate reference power
-        ref_value = ref(magnitude)
-    else:
-        ref_value = np.abs(ref)
-
-    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
-    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
-
-    if top_db is not None:
-        if top_db < 0:
-            raise ParameterError("top_db must be non-negative")
-        log_spec = np.maximum(log_spec, log_spec.max() - top_db)
-
-    return log_spec
-
-
-def mfcc(x: np.ndarray,
-         sr: int=16000,
-         spect: Optional[np.ndarray]=None,
-         n_mfcc: int=20,
-         dct_type: int=2,
-         norm: str="ortho",
-         lifter: int=0,
-         **kwargs) -> np.ndarray:
-    """Mel-frequency cepstral coefficients (MFCCs)
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None.
-        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
-        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
-        norm (str, optional): Type of normalization. Defaults to "ortho".
-        lifter (int, optional): Cepstral filtering. Defaults to 0.
-
-    Returns:
-        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
-    """
-    if spect is None:
-        spect = melspectrogram(x, sr=sr, **kwargs)
-
-    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
-
-    if lifter > 0:
-        factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
-                        lifter)
-        return M * factor[:, np.newaxis]
-    elif lifter == 0:
-        return M
-    else:
-        raise ParameterError(
-            f"MFCC lifter={lifter} must be a non-negative number")
-
-
-def melspectrogram(x: np.ndarray,
-                   sr: int=16000,
-                   window_size: int=512,
-                   hop_length: int=320,
-                   n_mels: int=64,
-                   fmin: float=50.0,
-                   fmax: Optional[float]=None,
-                   window: str='hann',
-                   center: bool=True,
-                   pad_mode: str='reflect',
-                   power: float=2.0,
-                   to_db: bool=True,
-                   ref: float=1.0,
-                   amin: float=1e-10,
-                   top_db: Optional[float]=None) -> np.ndarray:
-    """Compute mel-spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
-        to_db (bool, optional): Enable db scale. Defaults to True.
-        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): Minimum threshold. Defaults to 1e-10.
-        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
-
-    Returns:
-        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
-    """
-    _check_audio(x, mono=True)
-    if len(x) <= 0:
-        raise ParameterError('The input waveform is empty')
-
-    if fmax is None:
-        fmax = sr // 2
-    if fmin < 0 or fmin >= fmax:
-        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    spect_power = np.abs(s)**power
-    fb_matrix = compute_fbank_matrix(
-        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
-    mel_spect = np.matmul(fb_matrix, spect_power)
-    if to_db:
-        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
-    else:
-        return mel_spect
-
-
-def spectrogram(x: np.ndarray,
-                sr: int=16000,
-                window_size: int=512,
-                hop_length: int=320,
-                window: str='hann',
-                center: bool=True,
-                pad_mode: str='reflect',
-                power: float=2.0) -> np.ndarray:
-    """Compute spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
-
-    Returns:
-        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
-    """
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    return np.abs(s)**power
-
-
-def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
-
-    Args:
-        x (np.ndarray): The input waveform to encode.
-        mu (int, optional): The endoceding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law encoded waveform.
-    """
-    mu = 255
-    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
-    if quantized:
-        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0 , mu-1]
-    return y
-
-
-def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when quantize is True and `[-1,1]` otherwise.
-
-    Args:
-        y (np.ndarray): The encoded waveform.
-        mu (int, optional): The endoceding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law decoded waveform.
-    """
-    if mu < 1:
-        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
-
-    mu = mu - 1
-    if quantized:  # undo the quantization
-        y = y * 2 / mu - 1
-    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
-    return x
-
-
-def _randint(high: int) -> int:
-    """Generate one random integer in range [0 high)
-
-     This is a helper function for random data augmentaiton
-    """
-    return int(np.random.randint(0, high=high))
-
-
-def depth_augment(y: np.ndarray,
-                  choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
-    """ Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16'].
-        probs (List[float], optional): Probabilities to depth conversion. Defaults to [0.5, 0.5].
-
-    Returns:
-        np.ndarray: The augmented waveform.
-    """
-    assert len(probs) == len(
-        choices
-    ), 'number of choices {} must be equal to size of probs {}'.format(
-        len(choices), len(probs))
-    depth = np.random.choice(choices, p=probs)
-    src_depth = y.dtype
-    y1 = depth_convert(y, depth)
-    y2 = depth_convert(y1, src_depth)
-
-    return y2
-
-
-def adaptive_spect_augment(spect: np.ndarray,
-                           tempo_axis: int=0,
-                           level: float=0.1) -> np.ndarray:
-    """Do adpative spectrogram augmentation. The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        level (float, optional): The level factor of masking. Defaults to 0.1.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    time_mask_width = int(nt * level * 0.5)
-    freq_mask_width = int(nf * level * 0.5)
-
-    num_time_mask = int(10 * level)
-    num_freq_mask = int(10 * level)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
-
-
-def spect_augment(spect: np.ndarray,
-                  tempo_axis: int=0,
-                  max_time_mask: int=3,
-                  max_freq_mask: int=3,
-                  max_time_mask_width: int=30,
-                  max_freq_mask_width: int=20) -> np.ndarray:
-    """Do spectrogram augmentation in both time and freq axis.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
-        max_freq_mask (int, optional): Maximum number of frenquence masking. Defaults to 3.
-        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
-        max_freq_mask_width (int, optional): Maximum width of frenquence masking. Defaults to 20.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    num_time_mask = _randint(max_time_mask)
-    num_freq_mask = _randint(max_freq_mask)
-
-    time_mask_width = _randint(max_time_mask_width)
-    freq_mask_width = _randint(max_freq_mask_width)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
-
-
-def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
-    """ Random cropping on a input waveform.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D.
-        crop_len (int): Length of waveform to crop.
-
-    Returns:
-        np.ndarray: The cropped waveform.
-    """
-    if y.ndim != 1:
-        'only accept 1d tensor or numpy array'
-    n = len(y)
-    idx = _randint(n - crop_len)
-    return y[idx:idx + crop_len]
-
-
-def random_crop2d(s: np.ndarray, crop_len: int,
-                  tempo_axis: int=0) -> np.ndarray:
-    """ Random cropping on a spectrogram.
-
-    Args:
-        s (np.ndarray): Input spectrogram in 2D.
-        crop_len (int): Length of spectrogram to crop.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-
-    Returns:
-        np.ndarray: The cropped spectrogram.
-    """
-    if tempo_axis >= s.ndim:
-        raise ParameterError('axis out of range')
-
-    n = s.shape[tempo_axis]
-    idx = _randint(high=n - crop_len)
-    sli = [slice(None) for i in range(s.ndim)]
-    sli[tempo_axis] = slice(idx, idx + crop_len)
-    out = s[tuple(sli)]
-    return out
--- a/paddlespeech/audio/datasets/init.py
+++ b/paddlespeech/audio/datasets/init.py
@ -1,20 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .esc50 import ESC50
-from .gtzan import GTZAN
-from .hey_snips import HeySnips
-from .rirs_noises import OpenRIRNoise
-from .tess import TESS
-from .urban_sound import UrbanSound8K
-from .voxceleb import VoxCeleb
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@ -1,100 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-
-import numpy as np
-import paddle
-
-from ..compliance.kaldi import fbank as kaldi_fbank
-from ..compliance.kaldi import mfcc as kaldi_mfcc
-from ..compliance.librosa import melspectrogram
-from ..compliance.librosa import mfcc
-
-feat_funcs = {
-    'raw': None,
-    'melspectrogram': melspectrogram,
-    'mfcc': mfcc,
-    'kaldi_fbank': kaldi_fbank,
-    'kaldi_mfcc': kaldi_mfcc,
-}
-
-
-class AudioClassificationDataset(paddle.io.Dataset):
-    """
-    Base class of audio classification dataset.
-    """
-
-    def __init__(self,
-                 files: List[str],
-                 labels: List[int],
-                 feat_type: str='raw',
-                 sample_rate: int=None,
-                 **kwargs):
-        """
-        Ags:
-            files (:obj:`List[str]`): A list of absolute path of audio files.
-            labels (:obj:`List[int]`): Labels of audio files.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that user wants to extrace of an audio file.
-        """
-        super(AudioClassificationDataset, self).__init__()
-
-        if feat_type not in feat_funcs.keys():
-            raise RuntimeError(
-                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
-            )
-
-        self.files = files
-        self.labels = labels
-
-        self.feat_type = feat_type
-        self.sample_rate = sample_rate
-        self.feat_config = kwargs  # Pass keyword arguments to customize feature config
-
-    def _get_data(self, input_file: str):
-        raise NotImplementedError
-
-    def _convert_to_record(self, idx):
-        file, label = self.files[idx], self.labels[idx]
-
-        if self.sample_rate is None:
-            waveform, sample_rate = paddlespeech.audio.load(file)
-        else:
-            waveform, sample_rate = paddlespeech.audio.load(
-                file, sr=self.sample_rate)
-
-        feat_func = feat_funcs[self.feat_type]
-
-        record = {}
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
-            record['feat'] = feat_func(
-                waveform=waveform, sr=self.sample_rate, **self.feat_config)
-        else:
-            record['feat'] = feat_func(
-                waveform, sample_rate,
-                **self.feat_config) if feat_func else waveform
-        record['label'] = label
-        return record
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            return self.keys[idx], record['feat'], record['label']
-        else:
-            return np.array(record['feat']).transpose(), np.array(
-                record['label'], dtype=np.int64)
-
-    def __len__(self):
-        return len(self.files)
--- a/paddlespeech/audio/datasets/esc50.py
+++ b/paddlespeech/audio/datasets/esc50.py
@ -1,152 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['ESC50']
-
-
-class ESC50(AudioClassificationDataset):
-    """
-    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
-    suitable for benchmarking methods of environmental sound classification. The dataset
-    consists of 5-second-long recordings organized into 50 semantical classes (with
-    40 examples per class)
-
-    Reference:
-        ESC: Dataset for Environmental Sound Classification
-        http://dx.doi.org/10.1145/2733373.2806390
-    """
-
-    archieves = [
-        {
-            'url':
-            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
-            'md5': '7771e4b9d86d0945acce719c7a59305a',
-        },
-    ]
-    label_list = [
-        # Animals
-        'Dog',
-        'Rooster',
-        'Pig',
-        'Cow',
-        'Frog',
-        'Cat',
-        'Hen',
-        'Insects (flying)',
-        'Sheep',
-        'Crow',
-        # Natural soundscapes & water sounds
-        'Rain',
-        'Sea waves',
-        'Crackling fire',
-        'Crickets',
-        'Chirping birds',
-        'Water drops',
-        'Wind',
-        'Pouring water',
-        'Toilet flush',
-        'Thunderstorm',
-        # Human, non-speech sounds
-        'Crying baby',
-        'Sneezing',
-        'Clapping',
-        'Breathing',
-        'Coughing',
-        'Footsteps',
-        'Laughing',
-        'Brushing teeth',
-        'Snoring',
-        'Drinking, sipping',
-        # Interior/domestic sounds
-        'Door knock',
-        'Mouse click',
-        'Keyboard typing',
-        'Door, wood creaks',
-        'Can opening',
-        'Washing machine',
-        'Vacuum cleaner',
-        'Clock alarm',
-        'Clock tick',
-        'Glass breaking',
-        # Exterior/urban noises
-        'Helicopter',
-        'Chainsaw',
-        'Siren',
-        'Car horn',
-        'Engine',
-        'Train',
-        'Church bells',
-        'Airplane',
-        'Fireworks',
-        'Hand saw',
-    ]
-    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
-    meta_info = collections.namedtuple(
-        'META_INFO',
-        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
-    audio_path = os.path.join('ESC-50-master', 'audio')
-
-    def __init__(self,
-                 mode: str='train',
-                 split: int=1,
-                 feat_type: str='raw',
-                 **kwargs):
-        """
-        Ags:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specify the fold of dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that user wants to extrace of an audio file.
-        """
-        files, labels = self._get_data(mode, split)
-        super(ESC50, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self) -> List[collections.namedtuple]:
-        ret = []
-        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                ret.append(self.meta_info(*line.strip().split(',')))
-        return ret
-
-    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        meta_info = self._get_meta_info()
-
-        files = []
-        labels = []
-        for sample in meta_info:
-            filename, fold, target, _, _, _, _ = sample
-            if mode == 'train' and int(fold) != split:
-                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
-                labels.append(int(target))
-
-            if mode != 'train' and int(fold) == split:
-                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
-                labels.append(int(target))
-
-        return files, labels
--- a/paddlespeech/audio/datasets/gtzan.py
+++ b/paddlespeech/audio/datasets/gtzan.py
@ -1,115 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-import random
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['GTZAN']
-
-
-class GTZAN(AudioClassificationDataset):
-    """
-    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
-    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
-    in machine listening research for music genre recognition (MGR).
-
-    Reference:
-        Musical genre classification of audio signals
-        https://ieeexplore.ieee.org/document/1021072/
-    """
-
-    archieves = [
-        {
-            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
-            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
-        },
-    ]
-    label_list = [
-        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
-        'pop', 'reggae', 'rock'
-    ]
-    meta = os.path.join('genres', 'input.mf')
-    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
-    audio_path = 'genres'
-
-    def __init__(self,
-                 mode='train',
-                 seed=0,
-                 n_folds=5,
-                 split=1,
-                 feat_type='raw',
-                 **kwargs):
-        """
-        Ags:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            seed (:obj:`int`, `optional`, defaults to 0):
-                Set the random seed to shuffle samples.
-            n_folds (:obj:`int`, `optional`, defaults to 5):
-                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specify the fold of dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that user wants to extrace of an audio file.
-        """
-        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
-        files, labels = self._get_data(mode, seed, n_folds, split)
-        super(GTZAN, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self) -> List[collections.namedtuple]:
-        ret = []
-        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
-            for line in rf.readlines():
-                ret.append(self.meta_info(*line.strip().split('\t')))
-        return ret
-
-    def _get_data(self, mode, seed, n_folds,
-                  split) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        meta_info = self._get_meta_info()
-        random.seed(seed)  # shuffle samples to split data
-        random.shuffle(
-            meta_info
-        )  # make sure using the same seed to create train and dev dataset
-
-        files = []
-        labels = []
-        n_samples_per_fold = len(meta_info) // n_folds
-        for idx, sample in enumerate(meta_info):
-            file_path, label = sample
-            filename = os.path.basename(file_path)
-            target = self.label_list.index(label)
-            fold = idx // n_samples_per_fold + 1
-
-            if mode == 'train' and int(fold) != split:
-                files.append(
-                    os.path.join(DATA_HOME, self.audio_path, label, filename))
-                labels.append(target)
-
-            if mode != 'train' and int(fold) == split:
-                files.append(
-                    os.path.join(DATA_HOME, self.audio_path, label, filename))
-                labels.append(target)
-
-        return files, labels
--- a/paddlespeech/audio/datasets/hey_snips.py
+++ b/paddlespeech/audio/datasets/hey_snips.py
@ -1,74 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import json
-import os
-from typing import List
-from typing import Tuple
-
-from .dataset import AudioClassificationDataset
-
-__all__ = ['HeySnips']
-
-
-class HeySnips(AudioClassificationDataset):
-    meta_info = collections.namedtuple('META_INFO',
-                                       ('key', 'label', 'duration', 'wav'))
-
-    def __init__(self,
-                 data_dir: os.PathLike,
-                 mode: str='train',
-                 feat_type: str='kaldi_fbank',
-                 sample_rate: int=16000,
-                 **kwargs):
-        self.data_dir = data_dir
-        files, labels = self._get_data(mode)
-        super(HeySnips, self).__init__(
-            files=files,
-            labels=labels,
-            feat_type=feat_type,
-            sample_rate=sample_rate,
-            **kwargs)
-
-    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
-        ret = []
-        with open(os.path.join(self.data_dir, '{}.json'.format(mode)),
-                  'r') as f:
-            data = json.load(f)
-            for item in data:
-                sample = collections.OrderedDict()
-                if item['duration'] > 0:
-                    sample['key'] = item['id']
-                    sample['label'] = 0 if item['is_hotword'] == 1 else -1
-                    sample['duration'] = item['duration']
-                    sample['wav'] = os.path.join(self.data_dir,
-                                                 item['audio_file_path'])
-                    ret.append(self.meta_info(*sample.values()))
-        return ret
-
-    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
-        meta_info = self._get_meta_info(mode)
-
-        files = []
-        labels = []
-        self.keys = []
-        self.durations = []
-        for sample in meta_info:
-            key, target, duration, wav = sample
-            files.append(wav)
-            labels.append(int(target))
-            self.keys.append(key)
-            self.durations.append(float(duration))
-
-        return files, labels
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@ -1,200 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import csv
-import os
-import random
-from typing import List
-
-from paddle.io import Dataset
-from tqdm import tqdm
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import feat_funcs
-
-__all__ = ['OpenRIRNoise']
-
-
-class OpenRIRNoise(Dataset):
-    archieves = [
-        {
-            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
-            'md5': 'e6f48e257286e05de56413b4779d8ffb',
-        },
-    ]
-
-    sample_rate = 16000
-    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
-    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
-    wav_path = os.path.join(base_path, 'RIRS_NOISES')
-    csv_path = os.path.join(base_path, 'csv')
-    subsets = ['rir', 'noise']
-
-    def __init__(self,
-                 subset: str='rir',
-                 feat_type: str='raw',
-                 target_dir=None,
-                 random_chunk: bool=True,
-                 chunk_duration: float=3.0,
-                 seed: int=0,
-                 **kwargs):
-
-        assert subset in self.subsets, \
-            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
-
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self.random_chunk = random_chunk
-        self.chunk_duration = chunk_duration
-
-        OpenRIRNoise.csv_path = os.path.join(
-            target_dir, "open_rir_noise",
-            "csv") if target_dir else self.csv_path
-        self._data = self._get_data()
-        super(OpenRIRNoise, self).__init__()
-
-        # Set up a seed to reproduce training or predicting result.
-        # random.seed(seed)
-
-    def _get_data(self):
-        # Download audio files.
-        print(f"rirs noises base path: {self.base_path}")
-        if not os.path.isdir(self.base_path):
-            download_and_decompress(
-                self.archieves, self.base_path, decompress=True)
-        else:
-            print(
-                f"{self.base_path} already exists, we will not download and decompress again"
-            )
-
-        # Data preparation.
-        print(f"prepare the csv to {self.csv_path}")
-        if not os.path.isdir(self.csv_path):
-            os.makedirs(self.csv_path)
-            self.prepare_data()
-
-        data = []
-        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                audio_id, duration, wav = line.strip().split(',')
-                data.append(self.meta_info(audio_id, float(duration), wav))
-
-        random.shuffle(data)
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] = getattr(sample, field)
-
-        waveform, sr = paddlespeech.audio.load(record['wav'])
-
-        assert self.feat_type in feat_funcs.keys(), \
-            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
-        feat_func = feat_funcs[self.feat_type]
-        feat = feat_func(
-            waveform, sr=sr, **self.feat_config) if feat_func else waveform
-
-        record.update({'feat': feat})
-        return record
-
-    @staticmethod
-    def _get_chunks(seg_dur, audio_id, audio_duration):
-        num_chunks = int(audio_duration / seg_dur)  # all in milliseconds
-
-        chunk_lst = [
-            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
-            for i in range(num_chunks)
-        ]
-        return chunk_lst
-
-    def _get_audio_info(self, wav_file: str,
-                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = paddlespeech.audio.load(wav_file)
-        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
-        audio_duration = waveform.shape[0] / sr
-
-        ret = []
-        if split_chunks and audio_duration > self.chunk_duration:  # Split into pieces of self.chunk_duration seconds.
-            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
-                                                audio_duration)
-
-            for idx, chunk in enumerate(uniq_chunks_list):
-                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
-                start_sample = int(float(s) * sr)
-                end_sample = int(float(e) * sr)
-                new_wav_file = os.path.join(self.base_path,
-                                            audio_id + f'_chunk_{idx+1:02}.wav')
-                paddlespeech.audio.save(waveform[start_sample:end_sample], sr,
-                                        new_wav_file)
-                # id, duration, new_wav
-                ret.append([chunk, self.chunk_duration, new_wav_file])
-        else:  # Keep whole audio.
-            ret.append([audio_id, audio_duration, wav_file])
-        return ret
-
-    def generate_csv(self,
-                     wav_files: List[str],
-                     output_file: str,
-                     split_chunks: bool=True):
-        print(f'Generating csv: {output_file}')
-        header = ["id", "duration", "wav"]
-
-        infos = list(
-            tqdm(
-                map(self._get_audio_info, wav_files, [split_chunks] * len(
-                    wav_files)),
-                total=len(wav_files)))
-
-        csv_lines = []
-        for info in infos:
-            csv_lines.extend(info)
-
-        with open(output_file, mode="w") as csv_f:
-            csv_writer = csv.writer(
-                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-            csv_writer.writerow(header)
-            for line in csv_lines:
-                csv_writer.writerow(line)
-
-    def prepare_data(self):
-        rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
-                                "rir_list")
-        rir_files = []
-        with open(rir_list, 'r') as f:
-            for line in f.readlines():
-                rir_file = line.strip().split(' ')[-1]
-                rir_files.append(os.path.join(self.base_path, rir_file))
-
-        noise_list = os.path.join(self.wav_path, "pointsource_noises",
-                                  "noise_list")
-        noise_files = []
-        with open(noise_list, 'r') as f:
-            for line in f.readlines():
-                noise_file = line.strip().split(' ')[-1]
-                noise_files.append(os.path.join(self.base_path, noise_file))
-
-        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
-        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
-
-    def __getitem__(self, idx):
-        return self._convert_to_record(idx)
-
-    def __len__(self):
-        return len(self._data)
--- a/paddlespeech/audio/datasets/tess.py
+++ b/paddlespeech/audio/datasets/tess.py
@ -1,126 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-import random
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['TESS']
-
-
-class TESS(AudioClassificationDataset):
-    """
-    TESS is a set of 200 target words were spoken in the carrier phrase
-    "Say the word _____' by two actresses (aged 26 and 64 years) and
-    recordings were made of the set portraying each of seven emotions(anger,
-    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
-    There are 2800 stimuli in total.
-
-    Reference:
-        Toronto emotional speech set (TESS)
-        https://doi.org/10.5683/SP2/E8H2MF
-    """
-
-    archieves = [
-        {
-            'url':
-            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
-            'md5':
-            '1465311b24d1de704c4c63e4ccc470c7',
-        },
-    ]
-    label_list = [
-        'angry',
-        'disgust',
-        'fear',
-        'happy',
-        'neutral',
-        'ps',  # pleasant surprise
-        'sad',
-    ]
-    meta_info = collections.namedtuple('META_INFO',
-                                       ('speaker', 'word', 'emotion'))
-    audio_path = 'TESS_Toronto_emotional_speech_set'
-
-    def __init__(self,
-                 mode='train',
-                 seed=0,
-                 n_folds=5,
-                 split=1,
-                 feat_type='raw',
-                 **kwargs):
-        """
-        Ags:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            seed (:obj:`int`, `optional`, defaults to 0):
-                Set the random seed to shuffle samples.
-            n_folds (:obj:`int`, `optional`, defaults to 5):
-                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specify the fold of dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that user wants to extrace of an audio file.
-        """
-        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
-        files, labels = self._get_data(mode, seed, n_folds, split)
-        super(TESS, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, files) -> List[collections.namedtuple]:
-        ret = []
-        for file in files:
-            basename_without_extend = os.path.basename(file)[:-4]
-            ret.append(self.meta_info(*basename_without_extend.split('_')))
-        return ret
-
-    def _get_data(self, mode, seed, n_folds,
-                  split) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        wav_files = []
-        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
-            for file in files:
-                if file.endswith('.wav'):
-                    wav_files.append(os.path.join(root, file))
-
-        random.seed(seed)  # shuffle samples to split data
-        random.shuffle(
-            wav_files
-        )  # make sure using the same seed to create train and dev dataset
-        meta_info = self._get_meta_info(wav_files)
-
-        files = []
-        labels = []
-        n_samples_per_fold = len(meta_info) // n_folds
-        for idx, sample in enumerate(meta_info):
-            _, _, emotion = sample
-            target = self.label_list.index(emotion)
-            fold = idx // n_samples_per_fold + 1
-
-            if mode == 'train' and int(fold) != split:
-                files.append(wav_files[idx])
-                labels.append(target)
-
-            if mode != 'train' and int(fold) == split:
-                files.append(wav_files[idx])
-                labels.append(target)
-
-        return files, labels
--- a/paddlespeech/audio/datasets/urban_sound.py
+++ b/paddlespeech/audio/datasets/urban_sound.py
@ -1,104 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['UrbanSound8K']
-
-
-class UrbanSound8K(AudioClassificationDataset):
-    """
-    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
-    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
-    drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The
-    classes are drawn from the urban sound taxonomy.
-
-    Reference:
-        A Dataset and Taxonomy for Urban Sound Research
-        https://dl.acm.org/doi/10.1145/2647868.2655045
-    """
-
-    archieves = [
-        {
-            'url':
-            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
-            'md5': '9aa69802bbf37fb986f71ec1483a196e',
-        },
-    ]
-    label_list = [
-        "air_conditioner", "car_horn", "children_playing", "dog_bark",
-        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
-        "street_music"
-    ]
-    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
-    meta_info = collections.namedtuple(
-        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
-                      'class_id', 'label'))
-    audio_path = os.path.join('UrbanSound8K', 'audio')
-
-    def __init__(self,
-                 mode: str='train',
-                 split: int=1,
-                 feat_type: str='raw',
-                 **kwargs):
-        files, labels = self._get_data(mode, split)
-        super(UrbanSound8K, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-        """
-        Ags:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specify the fold of dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that user wants to extrace of an audio file.
-        """
-
-    def _get_meta_info(self):
-        ret = []
-        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                ret.append(self.meta_info(*line.strip().split(',')))
-        return ret
-
-    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        meta_info = self._get_meta_info()
-
-        files = []
-        labels = []
-        for sample in meta_info:
-            filename, _, _, _, _, fold, target, _ = sample
-            if mode == 'train' and int(fold) != split:
-                files.append(
-                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
-                                 filename))
-                labels.append(int(target))
-
-            if mode != 'train' and int(fold) == split:
-                files.append(
-                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
-                                 filename))
-                labels.append(int(target))
-
-        return files, labels
--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@ -1,355 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import csv
-import glob
-import os
-import random
-from multiprocessing import cpu_count
-from typing import List
-
-from paddle.io import Dataset
-from pathos.multiprocessing import Pool
-from tqdm import tqdm
-
-from ..utils import DATA_HOME
-from ..utils import decompress
-from ..utils.download import download_and_decompress
-from .dataset import feat_funcs
-
-__all__ = ['VoxCeleb']
-
-
-class VoxCeleb(Dataset):
-    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
-    archieves_audio_dev = [
-        {
-            'url': source_url + 'vox1_dev_wav_partaa',
-            'md5': 'e395d020928bc15670b570a21695ed96',
-        },
-        {
-            'url': source_url + 'vox1_dev_wav_partab',
-            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
-        },
-        {
-            'url': source_url + 'vox1_dev_wav_partac',
-            'md5': '017d579a2a96a077f40042ec33e51512',
-        },
-        {
-            'url': source_url + 'vox1_dev_wav_partad',
-            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
-        },
-    ]
-    archieves_audio_test = [
-        {
-            'url': source_url + 'vox1_test_wav.zip',
-            'md5': '185fdc63c3c739954633d50379a3d102',
-        },
-    ]
-    archieves_meta = [
-        {
-            'url':
-            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
-            'md5':
-            'b73110731c9223c1461fe49cb48dddfc',
-        },
-    ]
-
-    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
-    sample_rate = 16000
-    meta_info = collections.namedtuple(
-        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
-    base_path = os.path.join(DATA_HOME, 'vox1')
-    wav_path = os.path.join(base_path, 'wav')
-    meta_path = os.path.join(base_path, 'meta')
-    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
-    csv_path = os.path.join(base_path, 'csv')
-    subsets = ['train', 'dev', 'enroll', 'test']
-
-    def __init__(
-            self,
-            subset: str='train',
-            feat_type: str='raw',
-            random_chunk: bool=True,
-            chunk_duration: float=3.0,  # seconds
-            split_ratio: float=0.9,  # train split ratio
-            seed: int=0,
-            target_dir: str=None,
-            vox2_base_path=None,
-            **kwargs):
-        """VoxCeleb data prepare and get the specific dataset audio info
-
-        Args:
-            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
-            feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.
-            random_chunk (bool, optional): random select a duration from audio. Defaults to True.
-            chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0.
-            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
-            vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
-        """
-        assert subset in self.subsets, \
-            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
-
-        self.subset = subset
-        self.spk_id2label = {}
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self.random_chunk = random_chunk
-        self.chunk_duration = chunk_duration
-        self.split_ratio = split_ratio
-        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
-        self.vox2_base_path = vox2_base_path
-
-        # if we set the target dir, we will change the vox data info data from base path to target dir
-        VoxCeleb.csv_path = os.path.join(
-            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
-        VoxCeleb.meta_path = os.path.join(
-            target_dir, "voxceleb",
-            'meta') if target_dir else VoxCeleb.meta_path
-        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
-                                               'veri_test2.txt')
-        # self._data = self._get_data()[:1000]  # KP: Small dataset test.
-        self._data = self._get_data()
-        super(VoxCeleb, self).__init__()
-
-        # Set up a seed to reproduce training or predicting result.
-        # random.seed(seed)
-
-    def _get_data(self):
-        # Download audio files.
-        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
-        # so, we check the vox1/wav dir status
-        print(f"wav base path: {self.wav_path}")
-        if not os.path.isdir(self.wav_path):
-            print("start to download the voxceleb1 dataset")
-            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
-                self.archieves_audio_dev,
-                self.base_path,
-                decompress=False)
-            download_and_decompress(  # download the vox1_test_wav.zip and unzip
-                self.archieves_audio_test,
-                self.base_path,
-                decompress=True)
-
-            # Download all parts and concatenate the files into one zip file.
-            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
-            print(f'Concatenating all parts to: {dev_zipfile}')
-            os.system(
-                f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'
-            )
-
-            # Extract all audio files of dev and test set.
-            decompress(dev_zipfile, self.base_path)
-
-        # Download meta files.
-        if not os.path.isdir(self.meta_path):
-            print("prepare the meta data")
-            download_and_decompress(
-                self.archieves_meta, self.meta_path, decompress=False)
-
-        # Data preparation.
-        if not os.path.isdir(self.csv_path):
-            os.makedirs(self.csv_path)
-            self.prepare_data()
-
-        data = []
-        print(
-            f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
-        )
-        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                audio_id, duration, wav, start, stop, spk_id = line.strip(
-                ).split(',')
-                data.append(
-                    self.meta_info(audio_id,
-                                   float(duration), wav,
-                                   int(start), int(stop), spk_id))
-
-        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
-            for line in f.readlines():
-                spk_id, label = line.strip().split(' ')
-                self.spk_id2label[spk_id] = int(label)
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] = getattr(sample, field)
-
-        waveform, sr = paddlespeech.audio.load(record['wav'])
-
-        # random select a chunk audio samples from the audio
-        if self.random_chunk:
-            num_wav_samples = waveform.shape[0]
-            num_chunk_samples = int(self.chunk_duration * sr)
-            start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
-            stop = start + num_chunk_samples
-        else:
-            start = record['start']
-            stop = record['stop']
-
-        waveform = waveform[start:stop]
-
-        assert self.feat_type in feat_funcs.keys(), \
-            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
-        feat_func = feat_funcs[self.feat_type]
-        feat = feat_func(
-            waveform, sr=sr, **self.feat_config) if feat_func else waveform
-
-        record.update({'feat': feat})
-        if self.subset in ['train',
-                           'dev']:  # Labels are available in train and dev.
-            record.update({'label': self.spk_id2label[record['spk_id']]})
-
-        return record
-
-    @staticmethod
-    def _get_chunks(seg_dur, audio_id, audio_duration):
-        num_chunks = int(audio_duration / seg_dur)  # all in milliseconds
-
-        chunk_lst = [
-            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
-            for i in range(num_chunks)
-        ]
-        return chunk_lst
-
-    def _get_audio_info(self, wav_file: str,
-                        split_chunks: bool) -> List[List[str]]:
-        waveform, sr = paddlespeech.audio.load(wav_file)
-        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
-        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
-        audio_duration = waveform.shape[0] / sr
-
-        ret = []
-        if split_chunks:  # Split into pieces of self.chunk_duration seconds.
-            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
-                                                audio_duration)
-
-            for chunk in uniq_chunks_list:
-                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
-                start_sample = int(float(s) * sr)
-                end_sample = int(float(e) * sr)
-                # id, duration, wav, start, stop, spk_id
-                ret.append([
-                    chunk, audio_duration, wav_file, start_sample, end_sample,
-                    spk_id
-                ])
-        else:  # Keep whole audio.
-            ret.append([
-                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
-            ])
-        return ret
-
-    def generate_csv(self,
-                     wav_files: List[str],
-                     output_file: str,
-                     split_chunks: bool=True):
-        print(f'Generating csv: {output_file}')
-        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
-        # Note: this may occurs c++ execption, but the program will execute fine
-        # so we can ignore the execption 
-        with Pool(cpu_count()) as p:
-            infos = list(
-                tqdm(
-                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
-                           wav_files),
-                    total=len(wav_files)))
-
-        csv_lines = []
-        for info in infos:
-            csv_lines.extend(info)
-
-        with open(output_file, mode="w") as csv_f:
-            csv_writer = csv.writer(
-                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-            csv_writer.writerow(header)
-            for line in csv_lines:
-                csv_writer.writerow(line)
-
-    def prepare_data(self):
-        # Audio of speakers in veri_test_file should not be included in training set.
-        print("start to prepare the data csv file")
-        enroll_files = set()
-        test_files = set()
-        # get the enroll and test audio file path
-        with open(self.veri_test_file, 'r') as f:
-            for line in f.readlines():
-                _, enrol_file, test_file = line.strip().split(' ')
-                enroll_files.add(os.path.join(self.wav_path, enrol_file))
-                test_files.add(os.path.join(self.wav_path, test_file))
-            enroll_files = sorted(enroll_files)
-            test_files = sorted(test_files)
-
-        # get the enroll and test speakers
-        test_spks = set()
-        for file in (enroll_files + test_files):
-            spk = file.split('/wav/')[1].split('/')[0]
-            test_spks.add(spk)
-
-        # get all the train and dev audios file path
-        audio_files = []
-        speakers = set()
-        print("Getting file list...")
-        for path in [self.wav_path, self.vox2_base_path]:
-            # if vox2 directory is not set and vox2 is not a directory 
-            # we will not process this directory
-            if not path or not os.path.exists(path):
-                print(f"{path} is an invalid path, please check again, "
-                      "and we will ignore the vox2 base path")
-                continue
-            for file in glob.glob(
-                    os.path.join(path, "**", "*.wav"), recursive=True):
-                spk = file.split('/wav/')[1].split('/')[0]
-                if spk in test_spks:
-                    continue
-                speakers.add(spk)
-                audio_files.append(file)
-
-        print(
-            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
-        )
-        # encode the train and dev speakers label to spk_id2label.txt
-        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
-            for label, spk_id in enumerate(
-                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
-                f.write(f'{spk_id} {label}\n')
-
-        audio_files = sorted(audio_files)
-        random.shuffle(audio_files)
-        split_idx = int(self.split_ratio * len(audio_files))
-        # split_ratio to train
-        train_files, dev_files = audio_files[:split_idx], audio_files[
-            split_idx:]
-
-        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
-        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
-
-        self.generate_csv(
-            enroll_files,
-            os.path.join(self.csv_path, 'enroll.csv'),
-            split_chunks=False)
-        self.generate_csv(
-            test_files,
-            os.path.join(self.csv_path, 'test.csv'),
-            split_chunks=False)
-
-    def __getitem__(self, idx):
-        return self._convert_to_record(idx)
-
-    def __len__(self):
-        return len(self._data)
--- a/paddlespeech/audio/features/init.py
+++ b/paddlespeech/audio/features/init.py
@ -1,17 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .layers import LogMelSpectrogram
-from .layers import MelSpectrogram
-from .layers import MFCC
-from .layers import Spectrogram
--- a/paddlespeech/audio/features/layers.py
+++ b/paddlespeech/audio/features/layers.py
@ -1,328 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from functools import partial
-from typing import Optional
-from typing import Union
-
-import paddle
-import paddle.nn as nn
-from paddle import Tensor
-
-from ..functional import compute_fbank_matrix
-from ..functional import create_dct
-from ..functional import power_to_db
-from ..functional.window import get_window
-
-__all__ = [
-    'Spectrogram',
-    'MelSpectrogram',
-    'LogMelSpectrogram',
-    'MFCC',
-]
-
-
-class Spectrogram(nn.Layer):
-    """Compute spectrogram of given signals, typically audio waveforms.
-    The spectorgram is defined as the complex norm of the short-time Fourier transformation.
-
-    Args:
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 dtype: str='float32') -> None:
-        super(Spectrogram, self).__init__()
-
-        assert power > 0, 'Power of spectrogram must be > 0.'
-        self.power = power
-
-        if win_length is None:
-            win_length = n_fft
-
-        self.fft_window = get_window(
-            window, win_length, fftbins=True, dtype=dtype)
-        self._stft = partial(
-            paddle.signal.stft,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=self.fft_window,
-            center=center,
-            pad_mode=pad_mode)
-        self.register_buffer('fft_window', self.fft_window)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`
-
-        Returns:
-            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
-        """
-        stft = self._stft(x)
-        spectrogram = paddle.pow(paddle.abs(stft), self.power)
-        return spectrogram
-
-
-class MelSpectrogram(nn.Layer):
-    """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 dtype: str='float32') -> None:
-        super(MelSpectrogram, self).__init__()
-
-        self._spectrogram = Spectrogram(
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            dtype=dtype)
-        self.n_mels = n_mels
-        self.f_min = f_min
-        self.f_max = f_max
-        self.htk = htk
-        self.norm = norm
-        if f_max is None:
-            f_max = sr // 2
-        self.fbank_matrix = compute_fbank_matrix(
-            sr=sr,
-            n_fft=n_fft,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)  # float64 for better numerical results
-        self.register_buffer('fbank_matrix', self.fbank_matrix)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`
-
-        Returns:
-            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        spect_feature = self._spectrogram(x)
-        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
-        return mel_feature
-
-
-class LogMelSpectrogram(nn.Layer):
-    """Compute log-mel-spectrogram feature of given signals, typically audio waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 ref_value: float=1.0,
-                 amin: float=1e-10,
-                 top_db: Optional[float]=None,
-                 dtype: str='float32') -> None:
-        super(LogMelSpectrogram, self).__init__()
-
-        self._melspectrogram = MelSpectrogram(
-            sr=sr,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)
-
-        self.ref_value = ref_value
-        self.amin = amin
-        self.top_db = top_db
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`
-
-        Returns:
-            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        mel_feature = self._melspectrogram(x)
-        log_mel_feature = power_to_db(
-            mel_feature,
-            ref_value=self.ref_value,
-            amin=self.amin,
-            top_db=self.top_db)
-        return log_mel_feature
-
-
-class MFCC(nn.Layer):
-    """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_mfcc (int, optional): [description]. Defaults to 40.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_mfcc: int=40,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 ref_value: float=1.0,
-                 amin: float=1e-10,
-                 top_db: Optional[float]=None,
-                 dtype: str=paddle.float32) -> None:
-        super(MFCC, self).__init__()
-        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
-            n_mfcc, n_mels)
-        self._log_melspectrogram = LogMelSpectrogram(
-            sr=sr,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            ref_value=ref_value,
-            amin=amin,
-            top_db=top_db,
-            dtype=dtype)
-        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
-        self.register_buffer('dct_matrix', self.dct_matrix)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`
-
-        Returns:
-            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
-        """
-        log_mel_feature = self._log_melspectrogram(x)
-        mfcc = paddle.matmul(
-            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
-                (0, 2, 1))  # (B, n_mels, L)
-        return mfcc
--- a/paddlespeech/audio/functional/init.py
+++ b/paddlespeech/audio/functional/init.py
@ -1,20 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .functional import compute_fbank_matrix
-from .functional import create_dct
-from .functional import fft_frequencies
-from .functional import hz_to_mel
-from .functional import mel_frequencies
-from .functional import mel_to_hz
-from .functional import power_to_db
--- a/paddlespeech/audio/functional/functional.py
+++ b/paddlespeech/audio/functional/functional.py
@ -1,266 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Modified from librosa(https://github.com/librosa/librosa)
-import math
-from typing import Optional
-from typing import Union
-
-import paddle
-from paddle import Tensor
-
-__all__ = [
-    'hz_to_mel',
-    'mel_to_hz',
-    'mel_frequencies',
-    'fft_frequencies',
-    'compute_fbank_matrix',
-    'power_to_db',
-    'create_dct',
-]
-
-
-def hz_to_mel(freq: Union[Tensor, float],
-              htk: bool=False) -> Union[Tensor, float]:
-    """Convert Hz to Mels.
-
-    Args:
-        freq (Union[Tensor, float]): The input tensor with arbitrary shape.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-
-    Returns:
-        Union[Tensor, float]: Frequency in mels.
-    """
-
-    if htk:
-        if isinstance(freq, Tensor):
-            return 2595.0 * paddle.log10(1.0 + freq / 700.0)
-        else:
-            return 2595.0 * math.log10(1.0 + freq / 700.0)
-
-    # Fill in the linear part
-    f_min = 0.0
-    f_sp = 200.0 / 3
-
-    mels = (freq - f_min) / f_sp
-
-    # Fill in the log-scale part
-
-    min_log_hz = 1000.0  # beginning of log region (Hz)
-    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
-    logstep = math.log(6.4) / 27.0  # step size for log region
-
-    if isinstance(freq, Tensor):
-        target = min_log_mel + paddle.log(
-            freq / min_log_hz + 1e-10) / logstep  # prevent nan with 1e-10
-        mask = (freq > min_log_hz).astype(freq.dtype)
-        mels = target * mask + mels * (
-            1 - mask)  # will replace by masked_fill OP in future
-    else:
-        if freq >= min_log_hz:
-            mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
-
-    return mels
-
-
-def mel_to_hz(mel: Union[float, Tensor],
-              htk: bool=False) -> Union[float, Tensor]:
-    """Convert mel bin numbers to frequencies.
-
-    Args:
-        mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-
-    Returns:
-        Union[float, Tensor]: Frequencies in Hz.
-    """
-    if htk:
-        return 700.0 * (10.0**(mel / 2595.0) - 1.0)
-
-    f_min = 0.0
-    f_sp = 200.0 / 3
-    freqs = f_min + f_sp * mel
-    # And now the nonlinear scale
-    min_log_hz = 1000.0  # beginning of log region (Hz)
-    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
-    logstep = math.log(6.4) / 27.0  # step size for log region
-    if isinstance(mel, Tensor):
-        target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
-        mask = (mel > min_log_mel).astype(mel.dtype)
-        freqs = target * mask + freqs * (
-            1 - mask)  # will replace by masked_fill OP in future
-    else:
-        if mel >= min_log_mel:
-            freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
-
-    return freqs
-
-
-def mel_frequencies(n_mels: int=64,
-                    f_min: float=0.0,
-                    f_max: float=11025.0,
-                    htk: bool=False,
-                    dtype: str='float32') -> Tensor:
-    """Compute mel frequencies.
-
-    Args:
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
-        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
-
-    Returns:
-        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
-    """
-    # 'Center freqs' of mel bands - uniformly spaced between limits
-    min_mel = hz_to_mel(f_min, htk=htk)
-    max_mel = hz_to_mel(f_max, htk=htk)
-    mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
-    freqs = mel_to_hz(mels, htk=htk)
-    return freqs
-
-
-def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
-    """Compute fourier frequencies.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): Number of fft bins.
-        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
-
-    Returns:
-        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
-    """
-    return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
-
-
-def compute_fbank_matrix(sr: int,
-                         n_fft: int,
-                         n_mels: int=64,
-                         f_min: float=0.0,
-                         f_max: Optional[float]=None,
-                         htk: bool=False,
-                         norm: Union[str, float]='slaney',
-                         dtype: str='float32') -> Tensor:
-    """Compute fbank matrix.
-
-    Args:
-        sr (int): Sample rate.
-        n_fft (int): Number of fft bins.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use htk scaling. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'.
-        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
-
-    Returns:
-        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
-    """
-
-    if f_max is None:
-        f_max = float(sr) / 2
-
-    # Initialize the weights
-    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
-
-    # Center freqs of each FFT bin
-    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
-
-    # 'Center freqs' of mel bands - uniformly spaced between limits
-    mel_f = mel_frequencies(
-        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
-
-    fdiff = mel_f[1:] - mel_f[:-1]  #np.diff(mel_f)
-    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
-    #ramps = np.subtract.outer(mel_f, fftfreqs)
-
-    for i in range(n_mels):
-        # lower and upper slopes for all bins
-        lower = -ramps[i] / fdiff[i]
-        upper = ramps[i + 2] / fdiff[i + 1]
-
-        # .. then intersect them with each other and zero
-        weights[i] = paddle.maximum(
-            paddle.zeros_like(lower), paddle.minimum(lower, upper))
-
-    # Slaney-style mel is scaled to be approx constant energy per channel
-    if norm == 'slaney':
-        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
-        weights *= enorm.unsqueeze(1)
-    elif isinstance(norm, int) or isinstance(norm, float):
-        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
-
-    return weights
-
-
-def power_to_db(spect: Tensor,
-                ref_value: float=1.0,
-                amin: float=1e-10,
-                top_db: Optional[float]=None) -> Tensor:
-    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
-
-    Args:
-        spect (Tensor): STFT power spectrogram.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
-        amin (float, optional): Minimum threshold. Defaults to 1e-10.
-        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
-
-    Returns:
-        Tensor: Power spectrogram in db scale.
-    """
-    if amin <= 0:
-        raise Exception("amin must be strictly positive")
-
-    if ref_value <= 0:
-        raise Exception("ref_value must be strictly positive")
-
-    ones = paddle.ones_like(spect)
-    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
-    log_spec -= 10.0 * math.log10(max(ref_value, amin))
-
-    if top_db is not None:
-        if top_db < 0:
-            raise Exception("top_db must be non-negative")
-        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
-
-    return log_spec
-
-
-def create_dct(n_mfcc: int,
-               n_mels: int,
-               norm: Optional[str]='ortho',
-               dtype: str='float32') -> Tensor:
-    """Create a discrete cosine transform(DCT) matrix.
-
-    Args:
-        n_mfcc (int): Number of mel frequency cepstral coefficients. 
-        n_mels (int): Number of mel filterbanks.
-        norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'.
-        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
-
-    Returns:
-        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
-    """
-    n = paddle.arange(n_mels, dtype=dtype)
-    k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
-    dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
-                     k)  # size (n_mfcc, n_mels)
-    if norm is None:
-        dct *= 2.0
-    else:
-        assert norm == "ortho"
-        dct[0] *= 1.0 / math.sqrt(2.0)
-        dct *= math.sqrt(2.0 / float(n_mels))
-    return dct.T
--- a/paddlespeech/audio/functional/window.py
+++ b/paddlespeech/audio/functional/window.py
@ -1,337 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-import math
-from typing import List
-from typing import Tuple
-from typing import Union
-
-import paddle
-from paddle import Tensor
-
-# Only `get_window` is exported by `from ... import *`; the underscore-prefixed
-# window builders below are looked up by name from inside `get_window`.
-__all__ = [
-    'get_window',
-]
-
-
-def _cat(x: List[Tensor], data_type: str) -> Tensor:
-    """Convert every item of `x` to a tensor of `data_type` and concatenate the results."""
-    return paddle.concat([paddle.to_tensor(item, data_type) for item in x])
-
-
-def _acosh(x: Union[Tensor, float]) -> Tensor:
-    """Inverse hyperbolic cosine, log(x + sqrt(x^2 - 1)).
-
-    A Python float is handled with `math`; any other input is passed to the
-    paddle operators.
-    """
-    if not isinstance(x, float):
-        return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
-    root = math.sqrt(x**2 - 1)
-    return math.log(x + root)
-
-
-def _extend(M: int, sym: bool) -> Tuple[int, bool]:
-    """Extend window by 1 sample if needed for DFT-even symmetry.
-
-    Args:
-        M (int): Requested window length.
-        sym (bool): True for a symmetric window, False for a periodic one.
-
-    Returns:
-        Tuple[int, bool]: `(M, False)` when `sym` is true, otherwise
-        `(M + 1, True)`. The flag is the value callers in this file pass to
-        `_truncate` to drop the extra sample again.
-    """
-    if sym:
-        return M, False
-    return M + 1, True
-
-
-def _len_guards(M: int) -> bool:
-    """Validate a window length and report whether it is trivially short.
-
-    Raises:
-        ValueError: If `M` is negative or not integer-valued.
-
-    Returns:
-        bool: True when `M` is 0 or 1, False for longer windows.
-    """
-    is_integral = int(M) == M
-    if not is_integral or M < 0:
-        raise ValueError('Window length M must be a non-negative integer')
-    return M <= 1
-
-
-def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Drop the last sample of `w` when `needed` is truthy, else return `w` unchanged."""
-    return w[:-1] if needed else w
-
-
-def _general_gaussian(M: int, p, sig, sym: bool=True,
-                      dtype: str='float64') -> Tensor:
-    """Compute a generalized Gaussian window, exp(-0.5 * |n / sig| ** (2 * p)).
-    This function is consistent with scipy.signal.windows.general_gaussian().
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    # Sample positions centred on zero.
-    offsets = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    exponent = -0.5 * paddle.abs(offsets / sig)**(2 * p)
-    return _truncate(paddle.exp(exponent), needs_trunc)
-
-
-def _general_cosine(M: int, a: List[float], sym: bool=True,
-                    dtype: str='float64') -> Tensor:
-    """Compute a generic weighted sum of cosine terms window.
-    This function is consistent with scipy.signal.windows.general_cosine().
-
-    Args:
-        M (int): Number of samples in the window.
-        a (List[float]): Weighting coefficients; `a[k]` multiplies `cos(k * fac)`.
-        sym (bool, optional): True for a symmetric window, False for a periodic one. Defaults to True.
-        dtype (str, optional): Data type of the returned window. Defaults to 'float64'.
-
-    Returns:
-        Tensor: The window, all ones when `M` is 0 or 1.
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
-    for k, coef in enumerate(a):
-        w += coef * paddle.cos(k * fac)
-    return _truncate(w, needs_trunc)
-
-
-def _general_hamming(M: int, alpha: float, sym: bool=True,
-                     dtype: str='float64') -> Tensor:
-    """Compute a generalized Hamming window.
-    It is the cosine-sum window with coefficients `[alpha, 1 - alpha]`.
-    This function is consistent with scipy.signal.windows.general_hamming()
-    """
-    coefficients = [alpha, 1. - alpha]
-    return _general_cosine(M, coefficients, sym, dtype=dtype)
-
-
-def _taylor(M: int,
-            nbar=4,
-            sll=30,
-            norm=True,
-            sym: bool=True,
-            dtype: str='float64') -> Tensor:
-    """Compute a Taylor window.
-    The Taylor window taper function approximates the Dolph-Chebyshev window's
-    constant sidelobe level for a parameterized number of near-in sidelobes.
-
-    Args:
-        M (int): Number of samples in the window.
-        nbar (int, optional): Number of cosine terms plus one; `nbar - 1` coefficients are computed. Presumably the number of near-in sidelobes mentioned above (confirm against scipy's `taylor`). Defaults to 4.
-        sll (float, optional): Sidelobe level, given as a positive number and converted with `10 ** (sll / 20)`. Defaults to 30.
-        norm (bool, optional): If True, scale the window by `1 / W((M - 1) / 2)`. Defaults to True.
-        sym (bool, optional): True for a symmetric window, False for a periodic one. Defaults to True.
-        dtype (str, optional): Data type of the returned window. Defaults to 'float64'.
-
-    Returns:
-        Tensor: The window, all ones when `M` is 0 or 1.
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-    # Original text uses a negative sidelobe level parameter and then negates
-    # it in the calculation of B. To keep consistent with other methods we
-    # assume the sidelobe level parameter to be positive.
-    B = 10**(sll / 20)
-    A = _acosh(B) / math.pi
-    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
-    ma = paddle.arange(1, nbar, dtype=dtype)
-
-    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
-    # Alternating signs +1, -1, +1, ... starting at index 0.
-    signs = paddle.empty_like(ma)
-    signs[::2] = 1
-    signs[1::2] = -1
-    m2 = ma * ma
-    for mi in range(len(ma)):
-        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
-                                                           ))
-        # The denominator multiplies over every index except `mi` itself (that
-        # factor would be zero). The first and last index are special-cased,
-        # presumably to avoid a product over an empty slice.
-        if mi == 0:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
-        elif mi == len(ma) - 1:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
-        else:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
-                mi] / m2[mi + 1:])
-
-        Fm[mi] = numer / denom
-
-    def W(n):
-        # 1 + 2 * sum_m Fm[m] * cos(2 * pi * m * (n - M / 2 + 0.5) / M)
-        return 1 + 2 * paddle.matmul(
-            Fm.unsqueeze(0),
-            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
-
-    w = W(paddle.arange(0, M, dtype=dtype))
-
-    # normalize (Note that this is not described in the original text [1])
-    if norm:
-        # Divide by the value of W at the centre position (M - 1) / 2.
-        scale = 1.0 / W((M - 1) / 2)
-        w *= scale
-    w = w.squeeze()
-    return _truncate(w, needs_trunc)
-
-
-def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a Hamming window.
-    This is the generalized Hamming window with `alpha = 0.54`: a raised
-    cosine with non-zero endpoints, optimized to minimize the nearest side lobe.
-    """
-    alpha = 0.54
-    return _general_hamming(M, alpha, sym, dtype=dtype)
-
-
-def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a Hann window.
-    This is the generalized Hamming window with `alpha = 0.5`: a raised cosine
-    (sine-squared) taper whose ends touch zero.
-    """
-    alpha = 0.5
-    return _general_hamming(M, alpha, sym, dtype=dtype)
-
-
-def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a Tukey window.
-    The Tukey window is also known as a tapered cosine window.
-
-    Args:
-        M (int): Number of samples in the window.
-        alpha (float, optional): Fraction of the window inside the cosine tapered region. `alpha <= 0` yields a rectangular window and `alpha >= 1` a Hann window. Defaults to 0.5.
-        sym (bool, optional): True for a symmetric window, False for a periodic one. Defaults to True.
-        dtype (str, optional): Data type of the returned window. Defaults to 'float64'.
-
-    Returns:
-        Tensor: The window, all ones when `M` is 0 or 1.
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-
-    if alpha <= 0:
-        return paddle.ones((M, ), dtype=dtype)
-    elif alpha >= 1.0:
-        # Fully tapered case. The module-level builder is `_hann`; the bare
-        # name `hann` does not exist here and raised NameError. `dtype` is
-        # forwarded so this branch honours it like the others.
-        return _hann(M, sym=sym, dtype=dtype)
-
-    M, needs_trunc = _extend(M, sym)
-
-    n = paddle.arange(0, M, dtype=dtype)
-    # Number of samples in each cosine taper, excluding the boundary sample.
-    width = int(alpha * (M - 1) / 2.0)
-    n1 = n[0:width + 1]
-    n2 = n[width + 1:M - width - 1]
-    n3 = n[M - width - 1:]
-
-    # Rising taper, flat top, falling taper.
-    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
-    w2 = paddle.ones(n2.shape, dtype=dtype)
-    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
-                                          (M - 1))))
-    w = paddle.concat([w1, w2, w3])
-
-    return _truncate(w, needs_trunc)
-
-
-def _kaiser(M: int, beta: float, sym: bool=True,
-            dtype: str='float64') -> Tensor:
-    """Placeholder for the Kaiser window (a taper formed with a Bessel function).
-
-    Raises:
-        NotImplementedError: Always; the window is not implemented here.
-    """
-    raise NotImplementedError
-
-
-def _gaussian(M: int, std: float, sym: bool=True,
-              dtype: str='float64') -> Tensor:
-    """Compute a Gaussian window, exp(-n^2 / (2 * std^2)).
-    The Gaussian window has a Gaussian shape defined by the standard deviation (std).
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    # Sample positions centred on zero.
-    offsets = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    two_var = 2 * std * std
-    return _truncate(paddle.exp(-offsets**2 / two_var), needs_trunc)
-
-
-def _exponential(M: int,
-                 center=None,
-                 tau=1.,
-                 sym: bool=True,
-                 dtype: str='float64') -> Tensor:
-    """Compute an exponential (or Poisson) window, exp(-|n - center| / tau).
-
-    `center` defaults to the middle of the (possibly extended) window and may
-    only be given explicitly when `sym` is False.
-    """
-    if center is not None and sym:
-        raise ValueError("If sym==True, center must be None.")
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    midpoint = (M - 1) / 2 if center is None else center
-    positions = paddle.arange(0, M, dtype=dtype)
-    decay = paddle.exp(-paddle.abs(positions - midpoint) / tau)
-
-    return _truncate(decay, needs_trunc)
-
-
-def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a triangular window.
-
-    The rising half is built first and then mirrored; for an odd length the
-    peak sample is not repeated.
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    ramp = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
-    if M % 2 != 0:
-        rising = 2 * ramp / (M + 1.0)
-        falling = rising[-2::-1]
-    else:
-        rising = (2 * ramp - 1.0) / M
-        falling = rising[::-1]
-
-    return _truncate(paddle.concat([rising, falling]), needs_trunc)
-
-
-def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a Bohman window.
-    The Bohman window is the autocorrelation of a cosine window.
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    # Interior sample positions in [-1, 1]; both end points are set to 0 below.
-    grid = paddle.linspace(-1, 1, M, dtype=dtype)
-    fac = paddle.abs(grid[1:-1])
-    angle = math.pi * fac
-    interior = (1 - fac) * paddle.cos(angle) + 1.0 / math.pi * paddle.sin(
-        angle)
-
-    return _truncate(_cat([0, interior, 0], dtype), needs_trunc)
-
-
-def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a Blackman window.
-    It is the cosine-sum window with coefficients `[0.42, 0.50, 0.08]`, i.e.
-    the first three terms of a summation of cosines, designed to have close to
-    the minimal leakage possible. It is close to optimal, only slightly worse
-    than a Kaiser window.
-    """
-    coefficients = [0.42, 0.50, 0.08]
-    return _general_cosine(M, coefficients, sym, dtype=dtype)
-
-
-def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a window with a simple cosine shape, sin(pi / M * (n + 0.5)).
-    """
-    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
-    M, needs_trunc = _extend(M, sym)
-
-    half_shifted = paddle.arange(0, M, dtype=dtype) + .5
-    window = paddle.sin(math.pi / M * half_shifted)
-    return _truncate(window, needs_trunc)
-
-
-def get_window(window: Union[str, Tuple[str, float]],
-               win_length: int,
-               fftbins: bool=True,
-               dtype: str='float64') -> Tensor:
-    """Return a window of a given length and type.
-
-    Args:
-        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform, given either as a name or as a tuple `(name, param, ...)` whose extra items are passed to the window builder. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Note that 'kaiser' currently raises NotImplementedError.
-        win_length (int): Number of samples.
-        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
-        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
-
-    Returns:
-        Tensor: The window represented as a tensor.
-
-    Raises:
-        ValueError: If `window` is neither a str nor a tuple, if 'gaussian' or 'exponential' is given without parameters, or if the window name is unknown.
-    """
-    sym = not fftbins
-
-    args = ()
-    if isinstance(window, tuple):
-        winstr = window[0]
-        if len(window) > 1:
-            args = window[1:]
-    elif isinstance(window, str):
-        if window in ['gaussian', 'exponential']:
-            raise ValueError("The '" + window + "' window needs one or "
-                             "more parameters -- pass a tuple.")
-        else:
-            winstr = window
-    else:
-        raise ValueError("%s as window type is not supported." %
-                         str(type(window)))
-
-    # Look the builder up by name instead of eval(): eval executed arbitrary
-    # expressions embedded in the window name, and an unknown name raised
-    # NameError, which the former `except KeyError` never turned into the
-    # intended ValueError.
-    winfunc = None
-    if isinstance(winstr, str) and winstr.isidentifier():
-        winfunc = globals().get('_' + winstr)
-    if not callable(winfunc):
-        raise ValueError("Unknown window type.")
-
-    params = (win_length, ) + args
-    kwargs = {'sym': sym}
-    return winfunc(*params, dtype=dtype, **kwargs)
--- a/paddlespeech/audio/io/init.py
+++ b/paddlespeech/audio/io/init.py
@ -1,13 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
--- a/paddlespeech/audio/kaldi/init.py
+++ b/paddlespeech/audio/kaldi/init.py
@ -1,15 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .kaldi import fbank
-from .kaldi import pitch
--- a/paddlespeech/audio/kaldi/kaldi.py
+++ b/paddlespeech/audio/kaldi/kaldi.py
@ -1,132 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddlespeech
-from paddlespeech.audio._internal import module_utils 
-
-# Names exported by `from ... import *`: the two feature extractors defined below.
-__all__ = [
-    'fbank',
-    'pitch',
-]
-
-
-@module_utils.requires_kaldi()
-def fbank(
-        wav,
-        samp_freq: int=16000,
-        frame_shift_ms: float=10.0,
-        frame_length_ms: float=25.0,
-        dither: float=0.0,
-        preemph_coeff: float=0.97,
-        remove_dc_offset: bool=True,
-        window_type: str='povey',
-        round_to_power_of_two: bool=True,
-        blackman_coeff: float=0.42,
-        snip_edges: bool=True,
-        allow_downsample: bool=False,
-        allow_upsample: bool=False,
-        max_feature_vectors: int=-1,
-        num_bins: int=23,
-        low_freq: float=20,
-        high_freq: float=0,
-        vtln_low: float=100,
-        vtln_high: float=-500,
-        debug_mel: bool=False,
-        htk_mode: bool=False,
-        use_energy: bool=False,  # fbank opts
-        energy_floor: float=0.0,
-        raw_energy: bool=True,
-        htk_compat: bool=False,
-        use_log_fbank: bool=True,
-        use_power: bool=True):
-    """Compute filterbank features through the `_paddleaudio` extension.
-
-    Every keyword argument is copied to the identically named field of one of
-    three option objects (frame extraction, mel banks, fbank), which are then
-    passed together with `wav` to `ComputeFbank`.
-
-    Args:
-        wav: Input audio, handed unchanged to `ComputeFbank`.
-
-    Returns:
-        The value returned by `ComputeFbank`.
-    """
-    ext = paddlespeech.audio._paddleaudio
-    frame_opts = ext.FrameExtractionOptions()
-    mel_opts = ext.MelBanksOptions()
-    fbank_opts = ext.FbankOptions()
-
-    frame_fields = {
-        'samp_freq': samp_freq,
-        'frame_shift_ms': frame_shift_ms,
-        'frame_length_ms': frame_length_ms,
-        'dither': dither,
-        'preemph_coeff': preemph_coeff,
-        'remove_dc_offset': remove_dc_offset,
-        'window_type': window_type,
-        'round_to_power_of_two': round_to_power_of_two,
-        'blackman_coeff': blackman_coeff,
-        'snip_edges': snip_edges,
-        'allow_downsample': allow_downsample,
-        'allow_upsample': allow_upsample,
-        'max_feature_vectors': max_feature_vectors,
-    }
-    mel_fields = {
-        'num_bins': num_bins,
-        'low_freq': low_freq,
-        'high_freq': high_freq,
-        'vtln_low': vtln_low,
-        'vtln_high': vtln_high,
-        'debug_mel': debug_mel,
-        'htk_mode': htk_mode,
-    }
-    fbank_fields = {
-        'use_energy': use_energy,
-        'energy_floor': energy_floor,
-        'raw_energy': raw_energy,
-        'htk_compat': htk_compat,
-        'use_log_fbank': use_log_fbank,
-        'use_power': use_power,
-    }
-    # Same assignments, in the same order, as writing `opts.field = value`.
-    for opts, fields in ((frame_opts, frame_fields), (mel_opts, mel_fields),
-                         (fbank_opts, fbank_fields)):
-        for field, value in fields.items():
-            setattr(opts, field, value)
-
-    return ext.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
-
-
-@module_utils.requires_kaldi()
-def pitch(wav,
-          samp_freq: int=16000,
-          frame_shift_ms: float=10.0,
-          frame_length_ms: float=25.0,
-          preemph_coeff: float=0.0,
-          min_f0: int=50,
-          max_f0: int=400,
-          soft_min_f0: float=10.0,
-          penalty_factor: float=0.1,
-          lowpass_cutoff: int=1000,
-          resample_freq: int=4000,
-          delta_pitch: float=0.005,
-          nccf_ballast: int=7000,
-          lowpass_filter_width: int=1,
-          upsample_filter_width: int=5,
-          max_frames_latency: int=0,
-          frames_per_chunk: int=0,
-          simulate_first_pass_online: bool=False,
-          recompute_frame: int=500,
-          nccf_ballast_online: bool=False,
-          snip_edges: bool=True):
-    """Compute Kaldi pitch features through the `_paddleaudio` extension.
-
-    Every keyword argument is copied to the identically named field of a
-    `PitchExtractionOptions` object, which is passed together with `wav` to
-    `ComputeKaldiPitch`.
-
-    Args:
-        wav: Input audio, handed unchanged to `ComputeKaldiPitch`.
-
-    Returns:
-        The value returned by `ComputeKaldiPitch`.
-    """
-    ext = paddlespeech.audio._paddleaudio
-    pitch_opts = ext.PitchExtractionOptions()
-
-    option_fields = {
-        'samp_freq': samp_freq,
-        'frame_shift_ms': frame_shift_ms,
-        'frame_length_ms': frame_length_ms,
-        'preemph_coeff': preemph_coeff,
-        'min_f0': min_f0,
-        'max_f0': max_f0,
-        'soft_min_f0': soft_min_f0,
-        'penalty_factor': penalty_factor,
-        'lowpass_cutoff': lowpass_cutoff,
-        'resample_freq': resample_freq,
-        'delta_pitch': delta_pitch,
-        'nccf_ballast': nccf_ballast,
-        'lowpass_filter_width': lowpass_filter_width,
-        'upsample_filter_width': upsample_filter_width,
-        'max_frames_latency': max_frames_latency,
-        'frames_per_chunk': frames_per_chunk,
-        'simulate_first_pass_online': simulate_first_pass_online,
-        'recompute_frame': recompute_frame,
-        'nccf_ballast_online': nccf_ballast_online,
-        'snip_edges': snip_edges,
-    }
-    # Same assignments, in the same order, as writing `pitch_opts.field = value`.
-    for field, value in option_fields.items():
-        setattr(pitch_opts, field, value)
-
-    return ext.ComputeKaldiPitch(pitch_opts, wav)
--- a/paddlespeech/audio/lib/.gitignore
+++ b/paddlespeech/audio/lib/.gitignore
--- a/paddlespeech/audio/metric/init.py
+++ b/paddlespeech/audio/metric/init.py
@ -1,15 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .eer import compute_eer
-from .eer import compute_minDCF
--- a/paddlespeech/audio/metric/eer.py
+++ b/paddlespeech/audio/metric/eer.py
@ -1,100 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-
-import numpy as np
-import paddle
-from sklearn.metrics import roc_curve
-
-
-def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
-    """Compute the equal error rate (EER) and the matching score threshold.
-
-    Args:
-        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, N is the number of samples
-        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, N is the number of samples
-
-    Returns:
-        List[float]: the pair `(eer, eer_threshold)`; despite the annotation it is returned as a tuple. `eer` is the false positive rate at the ROC point where the false negative and false positive rates are closest.
-    """
-    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
-    fnr = 1 - tpr
-    # Locate the operating point once and reuse the index for both lookups.
-    eer_index = np.nanargmin(np.absolute(fnr - fpr))
-    eer = fpr[eer_index]
-    eer_threshold = threshold[eer_index]
-    return eer, eer_threshold
-
-
-def compute_minDCF(positive_scores,
-                   negative_scores,
-                   c_miss=1.0,
-                   c_fa=1.0,
-                   p_target=0.01):
-    """Compute the minimum detection cost (minDCF) and its threshold.
-
-    This is modified from SpeechBrain
-    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
-    The minDCF metric is normally used to evaluate speaker verification
-    systems. It is the minimum, over the candidate thresholds, of
-
-    C_det =  c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)
-
-    where p_miss is the miss probability and p_fa is the false alarm
-    probability. Candidate thresholds are the unique scores plus the midpoints
-    between neighbouring unique scores.
-
-    Args:
-        positive_scores (Paddle.Tensor): The scores from entries of the same class.
-        negative_scores (Paddle.Tensor): The scores from entries of different classes.
-        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
-        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
-        p_target (float, optional): Prior probability of having a target (default 0.01).
-
-    Returns:
-        List[float]: min dcf and the specific threshold (returned as a tuple of floats)
-    """
-    # Flatten score tensors that carry extra dimensions.
-    if len(positive_scores.shape) > 1:
-        positive_scores = positive_scores.squeeze()
-    if len(negative_scores.shape) > 1:
-        negative_scores = negative_scores.squeeze()
-
-    # Candidate thresholds: every distinct score ...
-    all_scores = paddle.concat([positive_scores, negative_scores])
-    thresholds = paddle.unique(paddle.sort(all_scores))
-    # ... plus the midpoint between each pair of neighbours.
-    midpoints = (thresholds[0:-1] + thresholds[1:]) / 2
-    thresholds = paddle.sort(paddle.concat([thresholds, midpoints]))
-    num_thresholds = len(thresholds)
-
-    # Miss rate: fraction of positive scores at or below each threshold.
-    positive_scores = paddle.concat(
-        num_thresholds * [positive_scores.unsqueeze(0)])
-    missed = positive_scores.transpose(perm=[1, 0]) <= thresholds
-    p_miss = missed.sum(0).astype("float32") / positive_scores.shape[1]
-    del positive_scores
-    del missed
-
-    # False alarm rate: fraction of negative scores above each threshold.
-    negative_scores = paddle.concat(
-        num_thresholds * [negative_scores.unsqueeze(0)])
-    accepted = negative_scores.transpose(perm=[1, 0]) > thresholds
-    p_fa = accepted.sum(0).astype("float32") / negative_scores.shape[1]
-    del negative_scores
-    del accepted
-
-    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
-    best_cost = paddle.min(c_det, axis=0)
-    best_index = paddle.argmin(c_det, axis=0)
-    return float(best_cost), float(thresholds[best_index])
--- a/paddlespeech/audio/sox_effects/init.py
+++ b/paddlespeech/audio/sox_effects/init.py
@ -1,25 +0,0 @@
-from paddlespeech.audio._internal import module_utils as _mod_utils
-
-from .sox_effects import (
-    apply_effects_file,
-    apply_effects_tensor,
-    effect_names,
-    init_sox_effects,
-    shutdown_sox_effects,
-)
-
-
-# Import-time side effect: when sox support is available, initialize the sox
-# effects once and register the matching shutdown to run at interpreter exit.
-# Nothing is initialized when sox is unavailable.
-if _mod_utils.is_sox_available():
-    import atexit
-
-    init_sox_effects()
-    atexit.register(shutdown_sox_effects)
-
-# Names exported by `from ... import *`.
-__all__ = [
-    "init_sox_effects",
-    "shutdown_sox_effects",
-    "effect_names",
-    "apply_effects_tensor",
-    "apply_effects_file",
-]
-
--- a/paddlespeech/audio/sox_effects/sox_effects.py
+++ b/paddlespeech/audio/sox_effects/sox_effects.py
@ -1,238 +0,0 @@
-import os
-from typing import List, Optional, Tuple
-import paddle
-import numpy
-
-from paddlespeech.audio._internal import module_utils as _mod_utils
-from paddlespeech.audio.utils.sox_utils import list_effects
-from paddlespeech.audio import _paddleaudio as paddleaudio
-
-#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
-
-@_mod_utils.requires_sox()
-def init_sox_effects():
-    """Initialize resources required to use sox effects.
-
-    Thin wrapper around the extension's `sox_effects_initialize_sox_effects`.
-
-    Note:
-        You do not need to call this function manually. It is called automatically.
-
-    Once initialized, you do not need to call this function again across multiple uses of
-    sox effects, though it is safe to do so as long as :func:`shutdown_sox_effects` has not been called yet.
-    Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
-    again will result in an error.
-    """
-    paddleaudio.sox_effects_initialize_sox_effects()
-
-
-@_mod_utils.requires_sox()
-def shutdown_sox_effects():
-    """Clean up resources required to use sox effects.
-
-    Thin wrapper around the extension's `sox_effects_shutdown_sox_effects`.
-
-    Note:
-        You do not need to call this function manually. It is called automatically.
-
-    It is safe to call this function multiple times.
-    Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
-    initializing again will result in an error.
-    """
-    paddleaudio.sox_effects_shutdown_sox_effects()
-
-
-@_mod_utils.requires_sox()
-def effect_names() -> List[str]:
-    """Return the names of the sox effects that can be applied.
-
-    The names are the keys of the mapping returned by `list_effects()`.
-
-    Returns:
-        List[str]: list of available effect names.
-
-    Example
-        >>> paddleaudio.sox_effects.effect_names()
-        ['allpass', 'band', 'bandpass', ... ]
-    """
-    available = list_effects()
-    names = available.keys()
-    return list(names)
-
-
-@_mod_utils.requires_sox()
-def apply_effects_tensor(
-    tensor: paddle.Tensor,
-    sample_rate: int,
-    effects: List[List[str]],
-    channels_first: bool = True,
-) -> Tuple[paddle.Tensor, int]:
-    """Apply sox effects to the given Tensor.
-
-    .. devices:: CPU
-
-    Note:
-        This function only works on CPU Tensors.
-        It works in a way very similar to the ``sox`` command, but there are slight
-        differences. For example, the ``sox`` command adds certain effects automatically
-        (such as the ``rate`` effect after ``speed``, ``pitch`` and other effects), whereas
-        this function applies only the given effects. Therefore, to actually apply the
-        ``speed`` effect, you also need to give the ``rate`` effect with the desired
-        sampling rate.
-
-    Args:
-        tensor (paddle.Tensor): Input 2D CPU Tensor.
-        sample_rate (int): Sample rate
-        effects (List[List[str]]): List of effects.
-        channels_first (bool, optional): Indicates if the input Tensor's dimension is
-            `[channels, time]` or `[time, channels]`
-
-    Returns:
-        (Tensor, int): Resulting Tensor and sample rate.
-        The resulting Tensor has the same ``dtype`` as the input Tensor, and
-        the same channels order. The shape of the Tensor can be different based on the
-        effects applied. Sample rate can also be different based on the effects applied.
-
-    Raises:
-        RuntimeError: If the extension returns ``None`` instead of a result.
-
-    Example - Basic usage
-        >>>
-        >>> # Defines the effects to apply
-        >>> effects = [
-        ...     ['gain', '-n'],  # normalises to 0dB
-        ...     ['pitch', '5'],  # 5 cent pitch shift
-        ...     ['rate', '8000'],  # resample to 8000 Hz
-        ... ]
-        >>>
-        >>> # Generate pseudo wave:
-        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
-        >>> sample_rate = 16000
-        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
-        >>> waveform.shape
-        [2, 16000]
-        >>>
-        >>> # Apply effects
-        >>> waveform, sample_rate = apply_effects_tensor(
-        ...     waveform, sample_rate, effects, channels_first=True)
-        >>>
-        >>> # Check the result
-        >>> # The new waveform is sampling rate 8000, 1 second.
-        >>> # normalization and channel order are preserved
-        >>> waveform.shape
-        [2, 8000]
-        >>> sample_rate
-        8000
-
-    """
-    # The extension works on numpy arrays, so convert on the way in and out.
-    samples = tensor.numpy()
-    result = paddleaudio.sox_effects_apply_effects_tensor(
-        samples, sample_rate, effects, channels_first)
-    if result is None:
-        raise RuntimeError("Failed to apply sox effect")
-    return (paddle.to_tensor(result[0]), result[1])
-
-
-@_mod_utils.requires_sox()
-def apply_effects_file(
-    path: str,
-    effects: List[List[str]],
-    normalize: bool = True,
-    channels_first: bool = True,
-    format: Optional[str] = None,
-) -> Tuple[paddle.Tensor, int]:
-    """Apply sox effects to the audio file and load the resulting data as Tensor
-
-    Note:
-        This function works in a way very similar to the ``sox`` command, but there are
-        slight differences. For example, the ``sox`` command adds certain effects
-        automatically (such as the ``rate`` effect after ``speed``, ``pitch`` etc), whereas
-        this function only applies the given effects. Therefore, to actually apply the
-        ``speed`` effect, you also need to give the ``rate`` effect with the desired
-        sampling rate, because internally the ``speed`` effect only alters the sampling
-        rate and leaves the samples untouched.
-
-    Args:
-        path (path-like object or file-like object):
-            Source of the audio data. An object with a ``read`` attribute is treated as a
-            file-like object; anything else is converted with ``os.fspath``.
-        effects (List[List[str]]): List of effects.
-        normalize (bool, optional):
-            When ``True``, this function always returns ``float32``, and sample values are
-            normalized to ``[-1.0, 1.0]``.
-            If the input file is an integer WAV, giving ``False`` will change the resulting
-            Tensor type to an integer type. This argument has no effect for formats other
-            than integer WAV type.
-        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
-            Otherwise, the returned Tensor's dimension is `[time, channel]`.
-        format (str or None, optional):
-            Override the format detection with the given format.
-            Providing the argument might help when libsox can not infer the format
-            from the header or extension.
-
-    Returns:
-        (Tensor, int): Resulting Tensor and sample rate.
-        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
-        If ``normalize=False`` and the input audio file is an integer WAV file, then the
-        resulting Tensor has the corresponding integer type. (Note 24 bit integer type is not supported)
-        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
-        otherwise `[time, channel]`.
-
-    Raises:
-        RuntimeError: If the extension returns ``None`` instead of a result.
-
-    Example - Basic usage
-        >>>
-        >>> # Defines the effects to apply
-        >>> effects = [
-        ...     ['gain', '-n'],  # normalises to 0dB
-        ...     ['pitch', '5'],  # 5 cent pitch shift
-        ...     ['rate', '8000'],  # resample to 8000 Hz
-        ... ]
-        >>>
-        >>> # Apply effects and load data with channels_first=True
-        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
-        >>>
-        >>> # Check the result
-        >>> waveform.shape
-        [2, 8000]
-        >>> sample_rate
-        8000
-
-    Example - Apply random speed perturbation to dataset
-        >>>
-        >>> # Load data from file, apply random speed perturbation
-        >>> class RandomPerturbationFile(paddle.io.Dataset):
-        ...     \"\"\"Given flist, apply random speed perturbation
-        ...
-        ...     Suppose all the input files are at least one second long.
-        ...     \"\"\"
-        ...     def __init__(self, flist: List[str], sample_rate: int):
-        ...         super().__init__()
-        ...         self.flist = flist
-        ...         self.sample_rate = sample_rate
-        ...
-        ...     def __getitem__(self, index):
-        ...         speed = 0.5 + 1.5 * random.random()
-        ...         effects = [
-        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
-        ...             ['remix', '-'],  # merge all the channels
-        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
-        ...             ['rate', f'{self.sample_rate}'],
-        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
-        ...             ['trim', '0', '2'],  # get the first 2 seconds
-        ...         ]
-        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
-        ...             self.flist[index], effects)
-        ...         return waveform
-        ...
-        ...     def __len__(self):
-        ...         return len(self.flist)
-        ...
-        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
-        >>> loader = paddle.io.DataLoader(dataset, batch_size=32)
-        >>> for batch in loader:
-        >>>     pass
-    """
-    if hasattr(path, "read"):
-        # File-like object: the extension reads from the object itself.
-        result = paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format)
-    else:
-        path = os.fspath(path)
-        result = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
-    if result is None:
-        raise RuntimeError("Failed to load audio from {}".format(path))
-    return (paddle.to_tensor(result[0]), result[1])
--- a/paddlespeech/audio/src/CMakeLists.txt
+++ b/paddlespeech/audio/src/CMakeLists.txt
@ -1,201 +0,0 @@
-if (MSVC)
-  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-endif()
-
-################################################################################
-# libpaddleaudio
-################################################################################
-set(
-  LIBPADDLEAUDIO_SOURCES
-  utils.cpp
-  )
-
-set(
-  LIBPADDLEAUDIO_INCLUDE_DIRS
-  ${PROJECT_SOURCE_DIR}
-  )
-
-set(
-  LIBPADDLEAUDIO_LINK_LIBRARIES
-  )
-
-set(
-  LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
-
-#------------------------------------------------------------------------------#
-# START OF CUSTOMIZATION LOGICS
-#------------------------------------------------------------------------------#
-
-if(BUILD_SOX)
-  list(
-    APPEND
-    LIBPADDLEAUDIO_LINK_LIBRARIES
-    libsox
-    )
-  list(
-    APPEND
-    LIBPADDLEAUDIO_SOURCES
-    #sox/io.cpp
-    #sox/utils.cpp
-    #sox/effects.cpp
-    #sox/effects_chain.cpp
-    #sox/types.cpp
-    )
-  list(
-    APPEND
-    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
-    INCLUDE_SOX
-    )
-endif()
-
-
-if(BUILD_KALDI)
-  list(
-    APPEND
-    LIBPADDLEAUDIO_LINK_LIBRARIES
-    libkaldi
-  )
-  list(
-    APPEND
-    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
-    INCLUDE_KALDI
-    COMPILE_WITHOUT_OPENFST
-  )
-endif()
-
-#------------------------------------------------------------------------------#
-# END OF CUSTOMIZATION LOGICS
-#------------------------------------------------------------------------------#
-
-function (define_library name source include_dirs link_libraries compile_defs)  # Build and install shared library ${name} from the given sources, include dirs, link libraries and compile definitions.
-  add_library(${name} SHARED ${source})
-  target_include_directories(${name} PRIVATE ${include_dirs})
-  target_link_libraries(${name} ${link_libraries})
-  target_compile_definitions(${name} PRIVATE ${compile_defs})
-  set_target_properties(${name} PROPERTIES PREFIX "")  # empty prefix: the output file name is exactly ${name} plus the suffix
-  if (MSVC)
-    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")  # MSVC builds get a .pyd suffix
-  endif(MSVC)
-  install(
-    TARGETS ${name}
-    LIBRARY DESTINATION lib
-    RUNTIME DESTINATION lib  # For Windows
-    )
-endfunction()
-
-
-define_library(
-  libpaddleaudio
-  "${LIBPADDLEAUDIO_SOURCES}"
-  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
-  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
-  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
-)
-
-if (APPLE)
-  set(TORCHAUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
-else()
-  set(TORCHAUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
-endif()
-
-  ################################################################################
-# _paddleaudio.so
-################################################################################
-if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
-if (WIN32)
-  find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
-  set(ADDITIONAL_ITEMS Python3::Python)
-endif()
-function(define_extension name sources include_dirs libraries definitions)  # Build and install the shared Python extension module ${name}.
-  add_library(${name} SHARED ${sources})
-  target_compile_definitions(${name} PRIVATE "${definitions}")
-  target_include_directories(
-    ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})  # NOTE(review): Python_INCLUDE_DIR and pybind11_INCLUDE_DIR are not set in this file; confirm a parent scope defines them
-  target_link_libraries(
-    ${name}
-    ${libraries}
-    ${TORCH_PYTHON_LIBRARY}  # NOTE(review): never set in this file; presumably expands to nothing - confirm
-    ${ADDITIONAL_ITEMS}  # Python3::Python on WIN32, otherwise not set in this file
-    )
-  set_target_properties(${name} PROPERTIES PREFIX "")  # empty prefix: the module file name is exactly ${name} plus the suffix
-  if (MSVC)
-    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")  # MSVC builds get a .pyd suffix
-  endif(MSVC)
-  if (APPLE)
-    # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
-    # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
-    set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")  # let unresolved symbols be looked up at load time (see links above)
-  endif()
-  install(
-    TARGETS ${name}
-    LIBRARY DESTINATION .
-    RUNTIME DESTINATION .  # For Windows
-    )
-endfunction()
-
-set(
-  EXTENSION_SOURCES
-  pybind/pybind.cpp
-  )
-#----------------------------------------------------------------------------#
-# START OF CUSTOMIZATION LOGICS
-#----------------------------------------------------------------------------#
-if(BUILD_SOX)
-  list(
-    APPEND
-    EXTENSION_SOURCES
-    pybind/sox/effects.cpp
-    pybind/sox/effects_chain.cpp
-    pybind/sox/io.cpp
-    pybind/sox/types.cpp
-    pybind/sox/utils.cpp
-    )
-endif()
-
-if(BUILD_KALDI)
-  list(
-    APPEND
-    EXTENSION_SOURCES
-    pybind/kaldi/kaldi_feature_wrapper.cc
-    pybind/kaldi/kaldi_feature.cc
-    )
-endif()
-#----------------------------------------------------------------------------#
-# END OF CUSTOMIZATION LOGICS
-#----------------------------------------------------------------------------#
-define_extension(
-  _paddleaudio
-  "${EXTENSION_SOURCES}"
-  ""
-  libpaddleaudio
-  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
-  )
-# if(BUILD_CTC_DECODER)
-#   set(
-#     DECODER_EXTENSION_SOURCES
-#     decoder/bindings/pybind.cpp
-#     )
-#   define_extension(
-#     _paddleaudio_decoder
-#     "${DECODER_EXTENSION_SOURCES}"
-#     ""
-#     "libpaddleaudio_decoder"
-#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
-#     )
-# endif()
-# if(USE_FFMPEG)
-#   set(
-#     FFMPEG_EXTENSION_SOURCES
-#     ffmpeg/pybind/typedefs.cpp
-#     ffmpeg/pybind/pybind.cpp
-#     ffmpeg/pybind/stream_reader.cpp
-#     )
-#   define_extension(
-#     _paddleaudio_ffmpeg
-#     "${FFMPEG_EXTENSION_SOURCES}"
-#     "${FFMPEG_INCLUDE_DIRS}"
-#     "libpaddleaudio_ffmpeg"
-#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
-#     )
-# endif()
-endif()
--- a/paddlespeech/audio/src/optional/COPYING
+++ b/paddlespeech/audio/src/optional/COPYING
@ -1,121 +0,0 @@
-Creative Commons Legal Code
-
-CC0 1.0 Universal
-
-    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
-    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
-    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
-    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
-    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
-    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
-    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
-    HEREUNDER.
-
-Statement of Purpose
-
-The laws of most jurisdictions throughout the world automatically confer
-exclusive Copyright and Related Rights (defined below) upon the creator
-and subsequent owner(s) (each and all, an "owner") of an original work of
-authorship and/or a database (each, a "Work").
-
-Certain owners wish to permanently relinquish those rights to a Work for
-the purpose of contributing to a commons of creative, cultural and
-scientific works ("Commons") that the public can reliably and without fear
-of later claims of infringement build upon, modify, incorporate in other
-works, reuse and redistribute as freely as possible in any form whatsoever
-and for any purposes, including without limitation commercial purposes.
-These owners may contribute to the Commons to promote the ideal of a free
-culture and the further production of creative, cultural and scientific
-works, or to gain reputation or greater distribution for their Work in
-part through the use and efforts of others.
-
-For these and/or other purposes and motivations, and without any
-expectation of additional consideration or compensation, the person
-associating CC0 with a Work (the "Affirmer"), to the extent that he or she
-is an owner of Copyright and Related Rights in the Work, voluntarily
-elects to apply CC0 to the Work and publicly distribute the Work under its
-terms, with knowledge of his or her Copyright and Related Rights in the
-Work and the meaning and intended legal effect of CC0 on those rights.
-
-1. Copyright and Related Rights. A Work made available under CC0 may be
-protected by copyright and related or neighboring rights ("Copyright and
-Related Rights"). Copyright and Related Rights include, but are not
-limited to, the following:
-
-  i. the right to reproduce, adapt, distribute, perform, display,
-     communicate, and translate a Work;
- ii. moral rights retained by the original author(s) and/or performer(s);
-iii. publicity and privacy rights pertaining to a person's image or
-     likeness depicted in a Work;
- iv. rights protecting against unfair competition in regards to a Work,
-     subject to the limitations in paragraph 4(a), below;
-  v. rights protecting the extraction, dissemination, use and reuse of data
-     in a Work;
- vi. database rights (such as those arising under Directive 96/9/EC of the
-     European Parliament and of the Council of 11 March 1996 on the legal
-     protection of databases, and under any national implementation
-     thereof, including any amended or successor version of such
-     directive); and
-vii. other similar, equivalent or corresponding rights throughout the
-     world based on applicable law or treaty, and any national
-     implementations thereof.
-
-2. Waiver. To the greatest extent permitted by, but not in contravention
-of, applicable law, Affirmer hereby overtly, fully, permanently,
-irrevocably and unconditionally waives, abandons, and surrenders all of
-Affirmer's Copyright and Related Rights and associated claims and causes
-of action, whether now known or unknown (including existing as well as
-future claims and causes of action), in the Work (i) in all territories
-worldwide, (ii) for the maximum duration provided by applicable law or
-treaty (including future time extensions), (iii) in any current or future
-medium and for any number of copies, and (iv) for any purpose whatsoever,
-including without limitation commercial, advertising or promotional
-purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
-member of the public at large and to the detriment of Affirmer's heirs and
-successors, fully intending that such Waiver shall not be subject to
-revocation, rescission, cancellation, termination, or any other legal or
-equitable action to disrupt the quiet enjoyment of the Work by the public
-as contemplated by Affirmer's express Statement of Purpose.
-
-3. Public License Fallback. Should any part of the Waiver for any reason
-be judged legally invalid or ineffective under applicable law, then the
-Waiver shall be preserved to the maximum extent permitted taking into
-account Affirmer's express Statement of Purpose. In addition, to the
-extent the Waiver is so judged Affirmer hereby grants to each affected
-person a royalty-free, non transferable, non sublicensable, non exclusive,
-irrevocable and unconditional license to exercise Affirmer's Copyright and
-Related Rights in the Work (i) in all territories worldwide, (ii) for the
-maximum duration provided by applicable law or treaty (including future
-time extensions), (iii) in any current or future medium and for any number
-of copies, and (iv) for any purpose whatsoever, including without
-limitation commercial, advertising or promotional purposes (the
-"License"). The License shall be deemed effective as of the date CC0 was
-applied by Affirmer to the Work. Should any part of the License for any
-reason be judged legally invalid or ineffective under applicable law, such
-partial invalidity or ineffectiveness shall not invalidate the remainder
-of the License, and in such case Affirmer hereby affirms that he or she
-will not (i) exercise any of his or her remaining Copyright and Related
-Rights in the Work or (ii) assert any associated claims and causes of
-action with respect to the Work, in either case contrary to Affirmer's
-express Statement of Purpose.
-
-4. Limitations and Disclaimers.
-
- a. No trademark or patent rights held by Affirmer are waived, abandoned,
-    surrendered, licensed or otherwise affected by this document.
- b. Affirmer offers the Work as-is and makes no representations or
-    warranties of any kind concerning the Work, express, implied,
-    statutory or otherwise, including without limitation warranties of
-    title, merchantability, fitness for a particular purpose, non
-    infringement, or the absence of latent or other defects, accuracy, or
-    the present or absence of errors, whether or not discoverable, all to
-    the greatest extent permissible under applicable law.
- c. Affirmer disclaims responsibility for clearing rights of other persons
-    that may apply to the Work or any use thereof, including without
-    limitation any person's Copyright and Related Rights in the Work.
-    Further, Affirmer disclaims responsibility for obtaining any necessary
-    consents, permissions or other rights required for any use of the
-    Work.
- d. Affirmer understands and acknowledges that Creative Commons is not a
-    party to this document and has no duty or obligation with respect to
-    this CC0 or use of the Work.
--- a/paddlespeech/audio/src/optional/optional.hpp
+++ b/paddlespeech/audio/src/optional/optional.hpp
--- a/paddlespeech/audio/src/pybind/kaldi/feature_common.h
+++ b/paddlespeech/audio/src/pybind/kaldi/feature_common.h
@ -1,49 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pybind11/pybind11.h"
-#include "pybind11/numpy.h"
-#include "feat/feature-window.h"
-
-namespace paddleaudio {
-namespace kaldi {
-
-namespace py = pybind11;
-
-template <class F>  // F: feature computer type exposing Options, Dim(), GetFrameOptions(), NeedRawLogEnergy() and Compute()
-class StreamingFeatureTpl {  // Chunk-wise feature extractor that keeps unconsumed samples between calls
-  public:
-    typedef typename F::Options Options;
-    StreamingFeatureTpl(const Options& opts);
-    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);  // prepends the cached samples to wav, then computes features into feats
-    void Reset() { remained_wav_.Resize(0); }  // drop the samples cached by previous ComputeFeature() calls
-
-    int Dim() { return computer_.Dim(); }  // size of one feature vector
-
-  private:
-    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
-    Options opts_;
-    ::kaldi::FeatureWindowFunction window_function_;  // declared before computer_, so it is constructed first
-    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;  // samples left over from the previous ComputeFeature() call
-    F computer_;
-};
-
-}  // namespace kaldi
-}  // namespace paddleaudio
-
-#include "feature_common_inl.h"
--- a/paddlespeech/audio/src/pybind/kaldi/feature_common_inl.h
+++ b/paddlespeech/audio/src/pybind/kaldi/feature_common_inl.h
@ -1,93 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "base/kaldi-common.h"
-
-namespace paddleaudio {
-namespace kaldi {
-
-// Builds the streaming extractor from the computer options `opts`.
-template <class F>
-StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
-    : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {
-    // The initializers are listed in member declaration order (opts_,
-    // window_function_, computer_), which is the order they really run in.
-    // window_function_ is built from opts.frame_opts rather than from
-    // computer_.GetFrameOptions() because computer_ is not constructed yet
-    // at that point and its options would read as zero.
-}
-
-// Prepends the samples cached by the previous call to `wav`, computes the
-// features of the combined signal into `feats` and caches the samples that
-// follow the last frame shift for the next call.
-// Returns false, leaving `feats` untouched, when `wav` is empty or when the
-// combined signal is still shorter than one analysis window.
-template <class F>
-bool StreamingFeatureTpl<F>::ComputeFeature(
-    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
-    // append remained waves
-    ::kaldi::int32 wav_len = wav.Dim();
-    if (wav_len == 0) return false;
-    ::kaldi::int32 left_len = remained_wav_.Dim();
-    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
-
-    // cache remained waves
-    ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
-    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
-    // NOTE(review): assumes frame_shift * num_frames <= waves.Dim(); confirm
-    // this also holds when frame_opts.snip_edges is false.
-    ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-
-    // compute speech feature; report failure instead of always claiming
-    // success when no frame could be computed
-    return Compute(waves, feats);
-}
-
-// Computes one feature vector per frame of `waves` and stores them back to
-// back in `feats` (num_frames * Dim() values).
-// Returns false, leaving `feats` untouched, when `waves` holds fewer samples
-// than one analysis window.
-template <class F>
-bool StreamingFeatureTpl<F>::Compute(
-    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
-    // fixed VTLN warp factor handed to the feature computer
-    const ::kaldi::BaseFloat vtln_warp = 1.0;
-    const ::kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    ::kaldi::int32 num_samples = waves.Dim();
-    ::kaldi::int32 frame_length = frame_opts.WindowSize();
-    if (num_samples < frame_length) {
-        return false;
-    }
-
-    const ::kaldi::int32 feat_dim = Dim();
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * feat_dim);
-
-    ::kaldi::Vector<::kaldi::BaseFloat> window;
-    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
-        ::kaldi::BaseFloat raw_log_energy = 0.0;
-        ::kaldi::ExtractWindow(0,
-                               waves,
-                               frame,
-                               frame_opts,
-                               window_function_,
-                               &window,
-                               need_raw_log_energy ? &raw_log_energy : nullptr);
-
-        // Let the computer write straight into this frame's slice of `feats`
-        // instead of filling a temporary vector and copying it over.
-        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
-            feats->Data() + frame * feat_dim, feat_dim);
-        computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row);
-    }
-    return true;
-}
-
-}  // namespace kaldi
-}  // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc
+++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc
@ -1,75 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
-#include "feat/pitch-functions.h"
-
-namespace paddleaudio {
-namespace kaldi {
-
-// Assembles a ::kaldi::FbankOptions from the three option groups and hands
-// it to the shared KaldiFeatureWrapper, which creates a new fbank extractor.
-// Always returns true.
-bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
-    FbankOptions fbank_opts) {
-    ::kaldi::FbankOptions kaldi_opts;
-    kaldi_opts.frame_opts = frame_opts;
-    kaldi_opts.mel_opts = mel_opts;
-    // The remaining switches are copied field by field from the binding-side
-    // FbankOptions struct.
-    kaldi_opts.use_energy = fbank_opts.use_energy;
-    kaldi_opts.energy_floor = fbank_opts.energy_floor;
-    kaldi_opts.raw_energy = fbank_opts.raw_energy;
-    kaldi_opts.htk_compat = fbank_opts.htk_compat;
-    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
-    kaldi_opts.use_power = fbank_opts.use_power;
-
-    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
-    wrapper->InitFbank(kaldi_opts);
-    return true;
-}
-
-// Passes `wav` to the shared fbank extractor and returns its output.
-py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
-    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
-    return wrapper->ComputeFbank(wav);
-}
-
-// One-shot fbank computation: (re)creates the shared extractor from the
-// given options, runs it on `wav`, then clears the extractor's cached
-// samples before returning the features.
-py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
-    FbankOptions fbank_opts,
-    const py::array_t<float>& wav) {
-    InitFbank(frame_opts, mel_opts, fbank_opts);
-    py::array_t<float> feats = ComputeFbankStreaming(wav);
-    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
-    wrapper->ResetFbank();
-    return feats;
-}
-
-// Clears the samples cached by the shared fbank extractor.
-void ResetFbank() {
-    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
-    wrapper->ResetFbank();
-}
-
-py::array_t<float> ComputeKaldiPitch(  // Runs ::kaldi::ComputeKaldiPitch on wav and returns the feature matrix as a 2-D float array.
-  const ::kaldi::PitchExtractionOptions& opts,
-  const py::array_t<float>& wav) {
-    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);  // NOTE(review): reads info.size floats straight from info.ptr; assumes wav is contiguous - confirm against callers
-   
-    ::kaldi::Matrix<::kaldi::BaseFloat> features;
-    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
-    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
-    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {  // copy row by row into the numpy buffer
-        std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
-                    sizeof(float)*features.NumCols());
-    }
-   return result;
-}
-
-}  // namespace kaldi
-}  // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h
+++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h
@ -1,64 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <string>
-
-#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-#include "feat/pitch-functions.h"
-
-namespace py = pybind11;
-
-namespace paddleaudio {
-namespace kaldi {
-
-struct FbankOptions{  // Binding-side subset of fbank switches; InitFbank() copies each field to the same-named ::kaldi::FbankOptions field.
-  bool use_energy;  // append an extra dimension with energy to the filter banks
-  float energy_floor;  // forwarded to ::kaldi::FbankOptions::energy_floor
-  bool raw_energy;  // If true, compute energy before preemphasis and windowing
-  bool htk_compat;  // If true, put energy last (if using energy)
-  bool use_log_fbank;  // if true (default), produce log-filterbank, else linear
-  bool use_power;  // forwarded to ::kaldi::FbankOptions::use_power
-  FbankOptions(): use_energy(false),
-                 energy_floor(0.0),
-                 raw_energy(true),
-                 htk_compat(false),
-                 use_log_fbank(true),
-                 use_power(true) {}
-};
-
-bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
-    FbankOptions fbank_opts);
-
-py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
-    FbankOptions fbank_opts,
-    const py::array_t<float>& wav);
-
-py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
-
-void ResetFbank();
-
-py::array_t<float> ComputeKaldiPitch(
-    const ::kaldi::PitchExtractionOptions& opts,
-    const py::array_t<float>& wav);
-
-}  // namespace kaldi
-}  // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@ -1,51 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-
-namespace paddleaudio {
-namespace kaldi {
-
-KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {  // Returns the address of one function-local static wrapper, created on first use.
-    static KaldiFeatureWrapper instance;
-    return &instance;
-}
-
-bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {  // Replaces any existing fbank extractor with a new one built from opts; always returns true.
-    fbank_.reset(new Fbank(opts));
-    return true;
-}
-
-// Runs the streaming fbank extractor on `wav` and returns the features as a
-// 2-D array with Dim() columns. An empty array is returned when the
-// extractor reports failure or produces no values.
-// NOTE(review): fbank_ is dereferenced unchecked, so InitFbank() has to be
-// called before this function.
-py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
-    const py::array_t<float> wav) {
-    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
-        static_cast<float*>(info.ptr), info.size);
-
-    ::kaldi::Vector<::kaldi::BaseFloat> feats;
-    bool flag = fbank_->ComputeFeature(input_wav, &feats);
-    if (flag == false || feats.Dim() == 0) return py::array_t<float>();
-    auto result = py::array_t<float>(feats.Dim());
-    py::buffer_info xs = result.request();
-    // Copy the flat feature vector into the numpy buffer. The stray
-    // std::cout write that used to sit here was removed: it printed an empty
-    // line to stdout on every call.
-    float* res_ptr = static_cast<float*>(xs.ptr);
-    for (int idx = 0; idx < feats.Dim(); ++idx) {
-        *res_ptr = feats(idx);
-        res_ptr++;
-    }
-
-    // feats holds the frames back to back, Dim() values per frame
-    return result.reshape({feats.Dim() / Dim(), Dim()});
-}
-
-}  // namespace kaldi
-}  // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h
@ -1,40 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "base/kaldi-common.h"
-#include "feat/feature-fbank.h"
-
-#include "paddlespeech/audio/src/pybind/kaldi/feature_common.h"
-
-namespace paddleaudio {
-namespace kaldi {
-
-typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
-
-class KaldiFeatureWrapper {  // Owner of the fbank extractor used by the Python bindings; obtained through GetInstance().
-  public:
-    static KaldiFeatureWrapper* GetInstance();
-    bool InitFbank(::kaldi::FbankOptions opts);
-    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
-    int Dim() { return fbank_->Dim(); }  // NOTE(review): dereferences fbank_ unchecked; presumably InitFbank() must have been called first
-    void ResetFbank() { fbank_->Reset(); }  // same unchecked dereference as Dim()
-
-  private:
-    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;  // empty until InitFbank() installs an extractor
-};
-
-}  // namespace kaldi
-}  // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/pybind.cpp
+++ b/paddlespeech/audio/src/pybind/pybind.cpp
@ -1,144 +0,0 @@
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala), All rights reserved.
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
-#include "paddlespeech/audio/src/pybind/sox/io.h"
-#include "paddlespeech/audio/src/pybind/sox/effects.h"
-#include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
-
-#include <pybind11/stl.h>
-#include <pybind11/pybind11.h>
-
-// Register `tl::optional<T>` with pybind11 by reusing its generic optional_caster.
-namespace pybind11 { namespace detail {
-   template <typename T>
-   struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
-}}
-
-PYBIND11_MODULE(_paddleaudio, m) {  // Python module _paddleaudio; each binding group below is compiled only when its INCLUDE_* macro is defined
-#ifdef INCLUDE_SOX
-    m.def("get_info_file",
-          &paddleaudio::sox_io::get_info_file,
-          "Get metadata of audio file.");
-    // support obj later
-    m.def("get_info_fileobj",
-          &paddleaudio::sox_io::get_info_fileobj,
-          "Get metadata of audio in file object.");
-    m.def("load_audio_fileobj",
-          &paddleaudio::sox_io::load_audio_fileobj,
-          "Load audio from file object.");
-    m.def("save_audio_fileobj",
-          &paddleaudio::sox_io::save_audio_fileobj,
-          "Save audio to file obj.");
-          
-    // sox io
-     m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);  // second name for the function also bound as "get_info_file"
-     m.def(
-         "sox_io_load_audio_file",
-         &paddleaudio::sox_io::load_audio_file);
-     m.def(
-         "sox_io_save_audio_file",
-         &paddleaudio::sox_io::save_audio_file);
-    
-     // sox utils
-     m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
-     m.def(
-         "sox_utils_set_verbosity",
-         &paddleaudio::sox_utils::set_verbosity);
-     m.def(
-         "sox_utils_set_use_threads",
-         &paddleaudio::sox_utils::set_use_threads);
-     m.def(
-         "sox_utils_set_buffer_size",
-         &paddleaudio::sox_utils::set_buffer_size);
-     m.def(
-         "sox_utils_list_effects",
-         &paddleaudio::sox_utils::list_effects);
-     m.def(
-         "sox_utils_list_read_formats",
-         &paddleaudio::sox_utils::list_read_formats);
-     m.def(
-         "sox_utils_list_write_formats",
-         &paddleaudio::sox_utils::list_write_formats);
-     m.def(
-         "sox_utils_get_buffer_size",
-         &paddleaudio::sox_utils::get_buffer_size);
-
-     // sox effects
-     m.def("apply_effects_fileobj",
-           &paddleaudio::sox_effects::apply_effects_fileobj,
-           "Decode audio data from file-like obj and apply effects.");
-     m.def("sox_effects_initialize_sox_effects",
-       &paddleaudio::sox_effects::initialize_sox_effects);
-     m.def(
-         "sox_effects_shutdown_sox_effects",
-         &paddleaudio::sox_effects::shutdown_sox_effects);
-     m.def(
-         "sox_effects_apply_effects_tensor",
-         &paddleaudio::sox_effects::apply_effects_tensor);
-     m.def(
-         "sox_effects_apply_effects_file",
-         &paddleaudio::sox_effects::apply_effects_file);
-#endif
-
-#ifdef INCLUDE_KALDI
-    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
-    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")  // option struct exposed with read/write attributes
-        .def(py::init<>())
-        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
-        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
-        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
-        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
-        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
-        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
-        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
-        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
-        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
-        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
-        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
-        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
-        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
-        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
-        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
-        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
-        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
-        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
-        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
-    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
-    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")  // option struct exposed with read/write attributes
-        .def(py::init<>())            
-        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)            
-        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
-        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)            
-        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)            
-        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)            
-        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
-        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)           
-        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)          
-        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
-        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
-        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
-        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
-    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")  // option struct exposed with read/write attributes
-        .def(py::init<>())
-        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
-        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
-        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
-        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
-        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
-        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
-        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
-
-    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")  // binding-side fbank switches (paddleaudio struct, not ::kaldi::FbankOptions)
-        .def(py::init<>())
-        .def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
-        .def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
-        .def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
-        .def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
-        .def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
-        .def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
-#endif
-
-}
--- a/paddlespeech/audio/src/pybind/sox/effects.cpp
+++ b/paddlespeech/audio/src/pybind/sox/effects.cpp
@ -1,257 +0,0 @@
-#include <mutex>
-#include <sox.h>
-
-#include "paddlespeech/audio/src/pybind/sox/effects.h"
-#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio::sox_effects {
-
-// Streaming decoding over file-like object is tricky because libsox operates on
-// FILE pointer. The following is what the `sox` and `play` commands do
-//  - file input -> FILE pointer
-//  - URL input -> call wget in a subprocess and pipe the data -> FILE pointer
-//  - stdin -> FILE pointer
-//
-// We want to, instead, fetch byte strings chunk by chunk, consume them, and
-// discard.
-//
-// Here is the approach
-// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
-// chunk of byte string
-//    This will perform header-based format detection, if necessary, then fill
-//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
-//    which returns FILE* which points the buffer of the provided byte string.
-// 2. Each time sox reads a chunk from the FILE*, we update the underlying
-// buffer in a way that it
-//    starts with unseen data, and append the new data read from the given
-//    fileobj. This will trick libsox as if it keeps reading from the FILE*
-//    continuously.
-// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
-// Decode audio from a Python file-like object, run it through `effects`, and
-// return the processed samples together with the output sample rate.
-//
-// Returns an empty optional when libsox cannot open the data or reports an
-// unknown encoding. `normalize` and `channels_first` default to true when not
-// provided; `format`, when set, is passed to libsox as the file type.
-auto apply_effects_fileobj(
-    py::object fileobj,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    tl::optional<std::string> format)
-    -> tl::optional<std::tuple<py::array, int64_t>> {
-  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
-  //
-  // For certain format (such as FLAC), libsox keeps reading the content at
-  // the initialization unless it reaches EOF even when the header is properly
-  // parsed. (Making buffer size 8192, which is way bigger than the header,
-  // resulted in libsox consuming all the buffer content at the time it opens
-  // the file.) Therefore buffer has to always contain valid data, except after
-  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
-  // first check if there is enough data to fill the buffer. `read_fileobj`
-  // repeatedly calls `read`  method until it receives the requested length of
-  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
-  // the whole audio data are fetched.
-  //
-  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
-  const auto capacity = [&]() {
-    // NOTE:
-    // Use the abstraction provided by `libpaddleaudio` to access the global
-    // config defined by libsox. Directly using `sox_get_globals` function will
-    // end up retrieving the static variable defined in `_paddleaudio`, which is
-    // not correct.
-    const auto bufsiz = get_buffer_size();
-    const int64_t kDefaultCapacityInBytes = 256;
-    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
-                                              : kDefaultCapacityInBytes;
-  }();
-  // `buffer` owns the bytes that `in_buf` points to; it must outlive the chain
-  // because the input effect keeps reading from and rewriting this memory.
-  std::string buffer(capacity, '\0');
-  auto* in_buf = const_cast<char*>(buffer.data());
-  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
-  // If the file is shorter than 256, then libsox cannot read the header.
-  auto in_buffer_size = (num_read > 256) ? num_read : 256;
-
-  // Open file (this starts reading the header)
-  // When opening a file there are two functions that can touch FILE*.
-  // * `auto_detect_format`
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
-  // * `startread` handler of detected format.
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
-  // To see the handler of a particular format, go to
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
-  // For example, vorbis can be found
-  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
-  SoxFormat sf(sox_open_mem_read(
-      in_buf,
-      in_buffer_size,
-      /*signal=*/nullptr,
-      /*encoding=*/nullptr,
-      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-  // In case of streamed data, length can be 0
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
-
-  // Prepare output buffer
-  std::vector<sox_sample_t> out_buffer;
-  out_buffer.reserve(sf->signal.length);
-
-  // Create and run SoxEffectsChain
-  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
-  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
-      /*input_encoding=*/sf->encoding,
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
-  for (const auto& effect : effects) {
-    chain.addEffect(effect);
-  }
-  chain.addOutputBuffer(&out_buffer);
-  chain.run();
-
-  // Create tensor from buffer
-  bool channels_first_ = channels_first.value_or(true);
-  auto tensor = convert_to_tensor(
-      /*buffer=*/out_buffer.data(),
-      /*num_samples=*/out_buffer.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      normalize.value_or(true),
-      channels_first_);
-
-  return std::forward_as_tuple(
-      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
-}
-
-namespace {
-
-// Lifecycle of the libsox effects state as tracked by
-// initialize_sox_effects() and shutdown_sox_effects().
-enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
-// Current lifecycle state; read and written while holding
-// SOX_RESOUCE_STATE_MUTEX.
-SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
-// Serializes access to SOX_RESOURCE_STATE.
-std::mutex SOX_RESOUCE_STATE_MUTEX;
-
-} // namespace
-
-// Initialize libsox effects.
-//
-// Calling this again after a successful initialization does nothing. Throws
-// std::runtime_error when sox_init() fails or when the effects have already
-// been shut down.
-void initialize_sox_effects() {
-  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);
-
-  if (SOX_RESOURCE_STATE == ShutDown) {
-    throw std::runtime_error(
-        "SoX Effects has been shut down. Cannot initialize again.");
-  }
-  if (SOX_RESOURCE_STATE == NotInitialized) {
-    if (sox_init() != SOX_SUCCESS) {
-      throw std::runtime_error("Failed to initialize sox effects.");
-    }
-    SOX_RESOURCE_STATE = Initialized;
-  }
-  // Already Initialized: nothing to do.
-}
-
-// Shut down libsox effects.
-//
-// Calling this again after a successful shutdown does nothing. Throws
-// std::runtime_error when the effects were never initialized or when
-// sox_quit() fails.
-void shutdown_sox_effects() {
-  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
-
-  switch (SOX_RESOURCE_STATE) {
-    case NotInitialized:
-      throw std::runtime_error(
-          "SoX Effects is not initialized. Cannot shutdown.");
-    case Initialized:
-      if (sox_quit() != SOX_SUCCESS) {
-        // This is the shutdown path, so report a shutdown failure rather than
-        // an initialization failure.
-        throw std::runtime_error("Failed to shut down sox effects.");
-      }
-      SOX_RESOURCE_STATE = ShutDown;
-      break;
-    case ShutDown:
-      break;
-  }
-}
-
-// Run `effects` over an in-memory array and return the processed samples
-// together with the sample rate reported by the effects chain.
-//
-// The output uses the same dtype as `waveform` and is not normalized.
-auto apply_effects_tensor(
-    py::array waveform,
-    int64_t sample_rate,
-    const std::vector<std::vector<std::string>>& effects,
-    bool channels_first) -> std::tuple<py::array, int64_t> {
-  validate_input_tensor(waveform);
-
-  // Input and output encodings are both derived from the array's dtype.
-  const auto dtype = waveform.dtype();
-  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-      /*input_encoding=*/get_tensor_encodinginfo(dtype),
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
-  // Samples produced by the chain are collected here.
-  std::vector<sox_sample_t> samples;
-  samples.reserve(waveform.size());
-
-  // Assemble source -> effects -> sink, then execute.
-  chain.addInputTensor(&waveform, sample_rate, channels_first);
-  for (auto it = effects.begin(); it != effects.end(); ++it) {
-    chain.addEffect(*it);
-  }
-  chain.addOutputBuffer(&samples);
-  chain.run();
-
-  // Convert the collected samples back into an array.
-  auto result = convert_to_tensor(
-      /*buffer=*/samples.data(),
-      /*num_samples=*/samples.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      /*normalize=*/false,
-      channels_first);
-
-  return std::tuple<py::array, int64_t>(
-      result, chain.getOutputSampleRate());
-}
-
-// Open the audio file at `path`, run it through `effects`, and return the
-// processed samples together with the output sample rate.
-//
-// Returns an empty optional when libsox cannot open the file or reports an
-// unknown encoding. `normalize` and `channels_first` default to true when not
-// provided; `format`, when set, is passed to libsox as the file type.
-auto apply_effects_file(
-    const std::string& path,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format)
-    -> tl::optional<std::tuple<py::array, int64_t>> {
-  // Open input file
-  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
-  SoxFormat sf(sox_open_read(
-      path.c_str(),
-      /*signal=*/nullptr,
-      /*encoding=*/nullptr,
-      /*filetype=*/filetype));
-
-  const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
-  if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
-
-  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
-
-  // Samples produced by the chain are collected here.
-  std::vector<sox_sample_t> samples;
-  samples.reserve(sf->signal.length);
-
-  // Assemble source -> effects -> sink, then execute.
-  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-      /*input_encoding=*/sf->encoding,
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
-  chain.addInputFile(sf);
-  for (auto it = effects.begin(); it != effects.end(); ++it) {
-    chain.addEffect(*it);
-  }
-  chain.addOutputBuffer(&samples);
-  chain.run();
-
-  // Convert the collected samples back into an array.
-  const bool want_channels_first = channels_first.value_or(true);
-  auto result = convert_to_tensor(
-      /*buffer=*/samples.data(),
-      /*num_samples=*/samples.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      normalize.value_or(true),
-      want_channels_first);
-
-  return std::tuple<py::array, int64_t>(
-      result, chain.getOutputSampleRate());
-}
-
-} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/pybind/sox/effects.h
+++ b/paddlespeech/audio/src/pybind/sox/effects.h
@ -1,36 +0,0 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-namespace py = pybind11;
-
-namespace paddleaudio::sox_effects {
-
-// Apply `effects` to audio read from a Python file-like object.
-// Returns (samples, sample rate), or an empty optional when the data cannot
-// be opened or its encoding is unknown.
-auto apply_effects_fileobj(
-    py::object fileobj,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    tl::optional<std::string> format)
-    -> tl::optional<std::tuple<py::array, int64_t>>;
-
-// Initialize libsox effects; throws std::runtime_error on failure or when the
-// effects have already been shut down.
-void initialize_sox_effects();
-
-// Shut down libsox effects; throws std::runtime_error on failure or when the
-// effects were never initialized.
-void shutdown_sox_effects();
-
-// Apply `effects` to an in-memory array sampled at `sample_rate`.
-// Returns (samples, sample rate).
-auto apply_effects_tensor(
-    py::array waveform,
-    int64_t sample_rate,
-    const std::vector<std::vector<std::string>>& effects,
-    bool channels_first) -> std::tuple<py::array, int64_t>;
-
-// Apply `effects` to the audio file at `path`.
-// Returns (samples, sample rate), or an empty optional when the file cannot
-// be opened or its encoding is unknown.
-auto apply_effects_file(
-    const std::string& path,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format)
-    -> tl::optional<std::tuple<py::array, int64_t>>;
-
-} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp
+++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp
@ -1,595 +0,0 @@
-#include <sox.h>
-#include <iostream>
-#include <vector>
-#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio::sox_effects_chain {
-
-namespace {
-
-/// Helper structs for passing the location of input tensor and output buffer.
-///
-/// drain/flow callback functions require a plain C style function signature
-/// and the way to pass extra data is to attach it to the sox_effect_t::priv
-/// pointer. The following structs will be assigned to sox_effect_t::priv,
-/// which gives sox_effect_t access to the input Tensor and output buffer
-/// object.
-struct TensorInputPriv {
-  // Number of samples already handed to the chain.
-  size_t index;
-  // Input array; not owned.
-  py::array* waveform;
-  int64_t sample_rate;
-  // True when the array is indexed as (channel, frame), false for
-  // (frame, channel).
-  bool channels_first;
-};
-
-struct TensorOutputPriv {
-  // Destination the output effect appends samples to; not owned.
-  std::vector<sox_sample_t>* buffer;
-};
-struct FileOutputPriv {
-  // Output file handle the output effect writes to; not owned.
-  sox_format_t* sf;
-};
-
-/// Callback function to feed Tensor data to SoxEffectChain.
-///
-/// Converts up to `*osamp` samples of the input array, starting at the
-/// position stored in the effect's private data, to sox_sample_t and writes
-/// them interleaved into `obuf`. On return `*osamp` holds the number of
-/// samples written. Returns SOX_EOF once the whole array has been consumed
-/// and SOX_SUCCESS otherwise. Throws std::runtime_error for an unsupported
-/// dtype.
-int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
-  // Retrieve the input Tensor and current index
-  auto priv = static_cast<TensorInputPriv*>(effp->priv);
-  const size_t index = priv->index;
-  const py::array& tensor = *(priv->waveform);
-  const size_t num_channels = effp->out_signal.channels;
-  const bool channels_first = priv->channels_first;
-
-  // Adjust the number of samples to read
-  const size_t num_samples = tensor.size();
-  if (index + *osamp > num_samples) {
-    *osamp = num_samples - index;
-  }
-
-  // Ensure that it's a multiple of the number of channels
-  *osamp -= *osamp % num_channels;
-  const size_t count = *osamp;
-
-  // Address of the i-th sample of this chunk. Samples are emitted interleaved
-  // (frame-major), so the flat position maps to (frame, channel).
-  auto element_at = [&](size_t i) -> const void* {
-    const size_t pos = index + i;
-    const size_t frame = pos / num_channels;
-    const size_t channel = pos % num_channels;
-    return channels_first ? tensor.data(channel, frame)
-                          : tensor.data(frame, channel);
-  };
-
-  // Convert to sox_sample_t (int32_t), writing straight into obuf.
-  // The case labels are numpy type numbers.
-  switch (tensor.dtype().num()) {
-    // 32-bit float
-    case 11: {
-      for (size_t i = 0; i < count; ++i) {
-        // Need to convert to 64-bit precision so that
-        // values around INT32_MIN/MAX are handled correctly.
-        double elem = *static_cast<const float*>(element_at(i));
-        elem = elem * 2147483648.;
-        if (elem > INT32_MAX) {
-          obuf[i] = INT32_MAX;
-        } else if (elem < INT32_MIN) {
-          obuf[i] = INT32_MIN;
-        } else {
-          obuf[i] = static_cast<sox_sample_t>(elem);
-        }
-      }
-      break;
-    }
-    // 32-bit int
-    case 5: {
-      for (size_t i = 0; i < count; ++i) {
-        obuf[i] = *static_cast<const int*>(element_at(i));
-      }
-      break;
-    }
-    // 16-bit int
-    case 3: {
-      for (size_t i = 0; i < count; ++i) {
-        const int16_t elem = *static_cast<const int16_t*>(element_at(i));
-        obuf[i] = elem * 65536;
-      }
-      break;
-    }
-    // 8-bit
-    case 1: {
-      // The `- 128` offset is the unsigned (offset-binary) 8-bit PCM
-      // conversion. Reading the stored byte as uint8_t keeps the product
-      // inside the int32 range; reading it as int8_t made `(elem - 128)`
-      // fall in [-256, -1] and the multiplication overflowed signed int.
-      // The result is bit-identical to what the wrapped arithmetic produced.
-      // NOTE(review): numpy type number 1 is the signed 8-bit type — confirm
-      // that unsigned PCM is what callers pass here.
-      for (size_t i = 0; i < count; ++i) {
-        const uint8_t elem = *static_cast<const uint8_t*>(element_at(i));
-        obuf[i] = (static_cast<int32_t>(elem) - 128) * 16777216;
-      }
-      break;
-    }
-    default:
-      throw std::runtime_error("Unexpected dtype.");
-  }
-
-  priv->index += count;
-  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
-}
-
-/// Callback function to fetch data from SoxEffectChain.
-///
-/// Appends the `*isamp` incoming samples to the vector stored in the effect's
-/// private data and reports zero output samples.
-int tensor_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) {
-  // Nothing is written to obuf.
-  *osamp = 0;
-  auto& sink = *(static_cast<TensorOutputPriv*>(effp->priv)->buffer);
-  const sox_sample_t* const last = ibuf + *isamp;
-  sink.insert(sink.end(), ibuf, last);
-  return SOX_SUCCESS;
-}
-
-/// Callback function that writes samples from SoxEffectChain to a file.
-///
-/// Returns SOX_SUCCESS when all `*isamp` samples were written (or there were
-/// none), SOX_EOF on a short write without a sox error code, and throws
-/// std::runtime_error when libsox reports an error.
-int file_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) {
-  // Nothing is written to obuf.
-  *osamp = 0;
-  if (!*isamp) {
-    return SOX_SUCCESS;
-  }
-
-  sox_format_t* out = static_cast<FileOutputPriv*>(effp->priv)->sf;
-  if (sox_write(out, ibuf, *isamp) == *isamp) {
-    return SOX_SUCCESS;
-  }
-
-  // Short write: surface the libsox error if there is one.
-  if (out->sox_errno) {
-    std::ostringstream message;
-    message << out->sox_errstr << " " << sox_strerror(out->sox_errno) << " "
-            << out->filename;
-    throw std::runtime_error(message.str());
-  }
-  return SOX_EOF;
-}
-
-/// Returns the handler of the "input_tensor" effect, which only implements
-/// `drain` (tensor_input_drain).
-sox_effect_handler_t* get_tensor_input_handler() {
-  static sox_effect_handler_t input_handler{
-      /*name=*/"input_tensor",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/nullptr,
-      /*drain=*/tensor_input_drain,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(TensorInputPriv)};
-  return &input_handler;
-}
-
-/// Returns the handler of the "output_tensor" effect, which only implements
-/// `flow` (tensor_output_flow).
-sox_effect_handler_t* get_tensor_output_handler() {
-  static sox_effect_handler_t output_handler{
-      /*name=*/"output_tensor",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/tensor_output_flow,
-      /*drain=*/nullptr,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(TensorOutputPriv)};
-  return &output_handler;
-}
-
-/// Returns the handler of the "output_file" effect, which only implements
-/// `flow` (file_output_flow).
-sox_effect_handler_t* get_file_output_handler() {
-  static sox_effect_handler_t output_handler{
-      /*name=*/"output_file",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/file_output_flow,
-      /*drain=*/nullptr,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(FileOutputPriv)};
-  return &output_handler;
-}
-
-} // namespace
-
-// Takes ownership of `se`; the pointer is released with free() on destruction.
-SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
-
-SoxEffect::~SoxEffect() {
-  // free() accepts a null pointer, so no explicit check is required.
-  free(se_);
-}
-
-// Implicit access to the wrapped pointer for libsox calls.
-SoxEffect::operator sox_effect_t*() const {
-  return this->se_;
-}
-
-// Member access on the wrapped effect.
-auto SoxEffect::operator->() noexcept -> sox_effect_t* {
-  return this->se_;
-}
-
-// Create an empty effects chain for the given input/output encodings.
-// Throws std::runtime_error when libsox fails to create the chain.
-SoxEffectsChain::SoxEffectsChain(
-    sox_encodinginfo_t input_encoding,
-    sox_encodinginfo_t output_encoding)
-    : in_enc_(input_encoding),
-      out_enc_(output_encoding),
-      in_sig_(),
-      interm_sig_(),
-      out_sig_(),
-      // The chain is handed pointers to the stored copies of the encodings.
-      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
-  if (sec_ == nullptr) {
-    throw std::runtime_error("Failed to create effect chain.");
-  }
-}
-
-// Release the underlying libsox chain, if one was created.
-SoxEffectsChain::~SoxEffectsChain() {
-  if (sec_ == nullptr) {
-    return;
-  }
-  sox_delete_effects_chain(sec_);
-}
-
-// Run the whole chain without a progress callback. The status returned by
-// sox_flow_effects is not inspected.
-void SoxEffectsChain::run() {
-  sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
-}
-
-// Add the "input_tensor" effect that feeds `waveform` into the chain.
-// Throws std::runtime_error when the effect cannot be added.
-void SoxEffectsChain::addInputTensor(
-    py::array* waveform,
-    int64_t sample_rate,
-    bool channels_first) {
-  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
-  interm_sig_ = in_sig_;
-
-  SoxEffect source(sox_create_effect(get_tensor_input_handler()));
-  // Hand the callback everything it needs through the effect's private data.
-  auto* state = static_cast<TensorInputPriv*>(source->priv);
-  state->index = 0;
-  state->waveform = waveform;
-  state->sample_rate = sample_rate;
-  state->channels_first = channels_first;
-
-  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
-  if (status != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: input_tensor");
-  }
-}
-
-// Add the "output_tensor" effect that appends processed samples to
-// `output_buffer`. Throws std::runtime_error when the effect cannot be added.
-void SoxEffectsChain::addOutputBuffer(
-    std::vector<sox_sample_t>* output_buffer) {
-  SoxEffect sink(sox_create_effect(get_tensor_output_handler()));
-  auto* state = static_cast<TensorOutputPriv*>(sink->priv);
-  state->buffer = output_buffer;
-
-  const int status = sox_add_effect(sec_, sink, &interm_sig_, &in_sig_);
-  if (status != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: output_tensor");
-  }
-}
-
-// Add libsox's built-in "input" effect reading from the opened file `sf`.
-// Throws std::runtime_error when the effect cannot be configured or added.
-void SoxEffectsChain::addInputFile(sox_format_t* sf) {
-  in_sig_ = sf->signal;
-  interm_sig_ = in_sig_;
-  SoxEffect e(sox_create_effect(sox_find_effect("input")));
-  // The "input" effect takes the sox_format_t* itself as its only option.
-  char* opts[] = {(char*)sf};
-  // Do not continue with an unconfigured input effect: report the failure
-  // instead of ignoring the status.
-  if (sox_effect_options(e, 1, opts) != SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Internal Error: Failed to configure effect: input "
-           << sf->filename;
-    throw std::runtime_error(stream.str());
-  }
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Internal Error: Failed to add effect: input " << sf->filename;
-    throw std::runtime_error(stream.str());
-  }
-}
-
-// Add the "output_file" effect that writes processed samples to `sf`.
-// Throws std::runtime_error when the effect cannot be added.
-void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
-  out_sig_ = sf->signal;
-
-  SoxEffect sink(sox_create_effect(get_file_output_handler()));
-  auto* state = static_cast<FileOutputPriv*>(sink->priv);
-  state->sf = sf;
-
-  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
-  if (status != SOX_SUCCESS) {
-    std::ostringstream message;
-    message << "Internal Error: Failed to add effect: output " << sf->filename;
-    throw std::runtime_error(message.str());
-  }
-}
-
-// Append one effect to the chain.
-//
-// `effect[0]` is the effect name and the remaining entries are its options.
-// Throws std::runtime_error when `effect` is empty, the name is listed in
-// UNSUPPORTED_EFFECTS or unknown to libsox, the options are rejected, or the
-// effect cannot be added.
-void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
-  if (effect.empty()) {
-    throw std::runtime_error("Invalid argument: empty effect.");
-  }
-  const std::string name = effect.front();
-
-  // A blocked name is never looked up; both failure modes share one message.
-  const bool blocked =
-      UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end();
-  const auto handler = blocked ? nullptr : sox_find_effect(name.c_str());
-  if (!handler) {
-    std::ostringstream message;
-    message << "Unsupported effect: " << name;
-    throw std::runtime_error(message.str());
-  }
-  SoxEffect e(sox_create_effect(handler));
-
-  // Everything after the name is passed to libsox as an option string.
-  const auto num_options = effect.size() - 1;
-  std::vector<char*> opts;
-  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
-    opts.push_back((char*)it->c_str());
-  }
-  char** opt_array = opts.empty() ? nullptr : opts.data();
-  if (sox_effect_options(e, num_options, opt_array) != SOX_SUCCESS) {
-    std::ostringstream message;
-    message << "Invalid effect option:";
-    for (const auto& token : effect) {
-      message << " " << token;
-    }
-    throw std::runtime_error(message.str());
-  }
-
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    std::ostringstream message;
-    message << "Internal Error: Failed to add effect: \"" << name;
-    for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
-      message << " " << *it;
-    }
-    message << "\"";
-    throw std::runtime_error(message.str());
-  }
-}
-
-// Channel count of the signal after the effects added so far.
-int64_t SoxEffectsChain::getOutputNumChannels() {
-  const auto channels = interm_sig_.channels;
-  return static_cast<int64_t>(channels);
-}
-
-// Sample rate of the signal after the effects added so far, converted to an
-// integer.
-int64_t SoxEffectsChain::getOutputSampleRate() {
-  const auto rate = interm_sig_.rate;
-  return static_cast<int64_t>(rate);
-}
-
-namespace {
-
-/// Helper structs for passing a file-like object to SoxEffectChain.
-/// They are stored in sox_effect_t::priv of the fileobj input/output effects.
-struct FileObjInputPriv {
-  // libsox handle that decodes from `buffer`.
-  sox_format_t* sf;
-  // Python file-like object the encoded bytes are read from; not owned.
-  py::object* fileobj;
-  // Set once a read from `fileobj` returns fewer bytes than requested.
-  bool eof_reached;
-  // Memory region backing `sf`; refilled on every drain call. Not owned.
-  char* buffer;
-  // Size of `buffer` in bytes.
-  uint64_t buffer_size;
-};
-
-struct FileObjOutputPriv {
-  // libsox handle that encodes the samples.
-  sox_format_t* sf;
-  // Python file-like object the encoded bytes are written to; not owned.
-  py::object* fileobj;
-  // Points to the pointer of the memory that receives the encoded bytes.
-  char** buffer;
-  // NOTE(review): not read by fileobj_output_flow; presumably the size of
-  // `*buffer` — confirm against the code that opens `sf`.
-  size_t* buffer_size;
-};
-
-/// Callback function to feed byte string
-/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
-///
-/// Refills the in-memory buffer behind `sf` with fresh bytes from the Python
-/// file-like object, then decodes up to `*osamp` samples into `obuf`. On
-/// return `*osamp` holds the number of samples decoded. Returns SOX_EOF once
-/// the file object is exhausted and no more samples can be decoded, and
-/// SOX_SUCCESS otherwise. Throws std::runtime_error when the file position
-/// cannot be determined or lies beyond the buffer.
-auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
-    -> int {
-  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
-  auto sf = priv->sf;
-  auto buffer = priv->buffer;
-
-  // 1. Refresh the buffer
-  //
-  // NOTE:
-  //   Since the underlying FILE* was opened with `fmemopen`, the only way
-  //   libsox detect EOF is reaching the end of the buffer. (null byte won't
-  //   help) Therefore we need to align the content at the end of buffer,
-  //   otherwise, libsox will keep reading the content beyond intended length.
-  //
-  // Before:
-  //
-  //     |<-------consumed------>|<---remaining--->|
-  //     |***********************|-----------------|
-  //                             ^ ftell
-  //
-  // After:
-  //
-  //     |<-offset->|<---remaining--->|<-new data->|
-  //     |**********|-----------------|++++++++++++|
-  //                ^ ftell
-
-  // NOTE:
-  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
-  //   supposed to be in sync, but there are cases (Vorbis) they are not
-  //   in sync and `tell_off` has seemingly uninitialized value, which
-  //   leads num_remain to be negative and cause segmentation fault
-  //   in `memmove`.
-  const auto tell = ftell((FILE*)sf->fp);
-  if (tell < 0) {
-    throw std::runtime_error("Internal Error: ftell failed.");
-  }
-  const auto num_consumed = static_cast<size_t>(tell);
-  if (num_consumed > priv->buffer_size) {
-    throw std::runtime_error("Internal Error: buffer overrun.");
-  }
-
-  const auto num_remain = priv->buffer_size - num_consumed;
-
-  // 1.1. Fetch the data to see if there is data to fill the buffer
-  size_t num_refill = 0;
-  std::string chunk(num_consumed, '\0');
-  if (num_consumed && !priv->eof_reached) {
-    num_refill = read_fileobj(
-        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
-    if (num_refill < num_consumed) {
-      priv->eof_reached = true;
-    }
-  }
-  const auto offset = num_consumed - num_refill;
-
-  // 1.2. Move the unconsumed data towards the beginning of buffer.
-  if (num_remain) {
-    auto src = static_cast<void*>(buffer + num_consumed);
-    auto dst = static_cast<void*>(buffer + offset);
-    memmove(dst, src, num_remain);
-  }
-
-  // 1.3. Refill the remaining buffer.
-  if (num_refill) {
-    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
-    auto dst = buffer + offset + num_remain;
-    memcpy(dst, src, num_refill);
-  }
-
-  // 1.4. Set the file pointer to the new offset
-  sf->tell_off = offset;
-  fseek((FILE*)sf->fp, offset, SEEK_SET);
-
-  // 2. Perform decoding operation
-  // The following part is practically same as "input" effect
-  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
-
-  // At this point, osamp represents the buffer size in bytes,
-  // but sox_read expects the maximum number of samples ready to read.
-  // Normally, this is fine, but in case when the samples are not 4-byte
-  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
-  // https://github.com/pytorch/audio/issues/2083
-  //
-  // Encodings narrower than one byte per sample make the integer division
-  // `bits_per_sample / 8` zero; skip the rescaling in that case instead of
-  // dividing by zero.
-  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
-  if (bytes_per_sample > 0) {
-    *osamp /= bytes_per_sample;
-  }
-
-  // Ensure that it's a multiple of the number of channels
-  *osamp -= *osamp % effp->out_signal.channels;
-
-  // Read up to *osamp samples into obuf;
-  // store the actual number read back to *osamp
-  *osamp = sox_read(sf, obuf, *osamp);
-
-  // Decoding is finished when fileobject is exhausted and sox can no longer
-  // decode a sample.
-  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
-}
-
-/// Callback function that encodes samples and forwards the encoded bytes to a
-/// Python file-like object.
-///
-/// Each call encodes `*isamp` samples with sox_write, passes the bytes
-/// produced so far to `fileobj.write`, and rewinds the FILE* for the next
-/// chunk. Returns SOX_EOF on a short write without a sox error code, and
-/// throws std::runtime_error when libsox reports an error or the file
-/// position cannot be determined.
-auto fileobj_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) -> int {
-  *osamp = 0;
-  if (*isamp) {
-    auto priv = static_cast<FileObjOutputPriv*>(effp->priv);
-    auto sf = priv->sf;
-    auto fp = static_cast<FILE*>(sf->fp);
-    auto fileobj = priv->fileobj;
-    auto buffer = priv->buffer;
-
-    // Encode chunk
-    auto num_samples_written = sox_write(sf, ibuf, *isamp);
-    fflush(fp);
-
-    // The current file position is the number of encoded bytes. ftell
-    // returns a negative value on failure, which must not be used as a
-    // length (same check as in fileobj_input_drain).
-    const auto num_encoded = ftell(fp);
-    if (num_encoded < 0) {
-      throw std::runtime_error("Internal Error: ftell failed.");
-    }
-
-    // Copy the encoded chunk to python object.
-    fileobj->attr("write")(
-        py::bytes(*buffer, static_cast<size_t>(num_encoded)));
-
-    // Reset FILE*
-    sf->tell_off = 0;
-    fseek(fp, 0, SEEK_SET);
-
-    if (num_samples_written != *isamp) {
-      if (sf->sox_errno) {
-        std::ostringstream stream;
-        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
-               << sf->filename;
-        throw std::runtime_error(stream.str());
-      }
-      return SOX_EOF;
-    }
-  }
-  return SOX_SUCCESS;
-}
-
-/// Returns the handler of the "input_fileobj_object" effect, which only
-/// implements `drain` (fileobj_input_drain).
-sox_effect_handler_t* get_fileobj_input_handler() {
-  static sox_effect_handler_t input_handler{
-      /*name=*/"input_fileobj_object",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/nullptr,
-      /*drain=*/fileobj_input_drain,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(FileObjInputPriv)};
-  return &input_handler;
-}
-
-/// Returns the handler of the "output_fileobj_object" effect, which only
-/// implements `flow` (fileobj_output_flow).
-sox_effect_handler_t* get_fileobj_output_handler() {
-  static sox_effect_handler_t output_handler{
-      /*name=*/"output_fileobj_object",
-      /*usage=*/nullptr,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/nullptr,
-      /*start=*/nullptr,
-      /*flow=*/fileobj_output_flow,
-      /*drain=*/nullptr,
-      /*stop=*/nullptr,
-      /*kill=*/nullptr,
-      /*priv_size=*/sizeof(FileObjOutputPriv)};
-  return &output_handler;
-}
-
-} // namespace
-
-// Add the "input_fileobj_object" effect, which decodes from `sf` while
-// refilling `buffer` (of `buffer_size` bytes) from the Python `fileobj`.
-// Throws std::runtime_error when the effect cannot be added.
-void SoxEffectsChainPyBind::addInputFileObj(
-    sox_format_t* sf,
-    char* buffer,
-    uint64_t buffer_size,
-    py::object* fileobj) {
-  in_sig_ = sf->signal;
-  interm_sig_ = in_sig_;
-
-  SoxEffect source(sox_create_effect(get_fileobj_input_handler()));
-  // Hand the callback everything it needs through the effect's private data.
-  auto* state = static_cast<FileObjInputPriv*>(source->priv);
-  state->sf = sf;
-  state->fileobj = fileobj;
-  state->eof_reached = false;
-  state->buffer = buffer;
-  state->buffer_size = buffer_size;
-
-  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
-  if (status != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: input fileobj");
-  }
-}
-
-// Add the "output_fileobj_object" effect, which encodes through `sf` and
-// forwards the encoded bytes found at `*buffer` to the Python `fileobj`.
-// Throws std::runtime_error when the effect cannot be added.
-void SoxEffectsChainPyBind::addOutputFileObj(
-    sox_format_t* sf,
-    char** buffer,
-    size_t* buffer_size,
-    py::object* fileobj) {
-  out_sig_ = sf->signal;
-
-  SoxEffect sink(sox_create_effect(get_fileobj_output_handler()));
-  // Hand the callback everything it needs through the effect's private data.
-  auto* state = static_cast<FileObjOutputPriv*>(sink->priv);
-  state->sf = sf;
-  state->fileobj = fileobj;
-  state->buffer = buffer;
-  state->buffer_size = buffer_size;
-
-  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
-  if (status != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: output fileobj");
-  }
-}
-
-} // namespace paddleaudio::sox_effects_chain
--- a/paddlespeech/audio/src/pybind/sox/effects_chain.h
+++ b/paddlespeech/audio/src/pybind/sox/effects_chain.h
@ -1,76 +0,0 @@
-#pragma once
-
-#include <sox.h>
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-
-namespace paddleaudio::sox_effects_chain {
-
-// Helper struct to safely close sox_effect_t* pointer returned by
-// sox_create_effect
-//
-// Owns the pointer it is constructed with; copying and moving are disabled so
-// there is exactly one owner.
-struct SoxEffect {
-  // Takes ownership of `se`.
-  explicit SoxEffect(sox_effect_t* se) noexcept;
-  SoxEffect(const SoxEffect& other) = delete;
-  SoxEffect(const SoxEffect&& other) = delete;
-  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
-  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
-  ~SoxEffect();
-  // Implicit conversion so the wrapper can be passed to libsox functions.
-  operator sox_effect_t*() const;
-  // Member access on the wrapped effect.
-  auto operator->() noexcept -> sox_effect_t*;
-
- private:
-  sox_effect_t* se_;
-};
-
-// Helper struct to safely close sox_effects_chain_t with handy methods
-//
-// Typical use: add one input, any number of effects, one output, then run().
-class SoxEffectsChain {
-  // Copies of the encodings; the libsox chain is created with pointers to
-  // these members.
-  const sox_encodinginfo_t in_enc_;
-  const sox_encodinginfo_t out_enc_;
-
- protected:
-  // Signal description of the chain input.
-  sox_signalinfo_t in_sig_;
-  // Signal description passed to sox_add_effect as the running input signal
-  // for every effect that is added.
-  sox_signalinfo_t interm_sig_;
-  // Signal description of the output file, when one is used.
-  sox_signalinfo_t out_sig_;
-  // Underlying libsox chain; released in the destructor.
-  sox_effects_chain_t* sec_;
-
- public:
-  // Throws std::runtime_error when the libsox chain cannot be created.
-  explicit SoxEffectsChain(
-      sox_encodinginfo_t input_encoding,
-      sox_encodinginfo_t output_encoding);
-  SoxEffectsChain(const SoxEffectsChain& other) = delete;
-  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
-  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
-  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
-  ~SoxEffectsChain();
-  // Run all effects added so far.
-  void run();
-  // Use an in-memory array as the chain input.
-  void addInputTensor(
-      py::array* waveform,
-      int64_t sample_rate,
-      bool channels_first);
-  // Use an opened libsox file as the chain input.
-  void addInputFile(sox_format_t* sf);
-  // Collect the processed samples in `output_buffer`.
-  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
-  // Write the processed samples to an opened libsox file.
-  void addOutputFile(sox_format_t* sf);
-  // Append an effect given as {name, option...}.
-  void addEffect(const std::vector<std::string> effect);
-  // Channel count / sample rate of the signal after the added effects.
-  int64_t getOutputNumChannels();
-  int64_t getOutputSampleRate();
-};
-
-// SoxEffectsChain extended with input/output effects backed by Python
-// file-like objects.
-class SoxEffectsChainPyBind : public SoxEffectsChain {
-  using SoxEffectsChain::SoxEffectsChain;
-
- public:
-  // Use `fileobj` as the chain input. `buffer` (of `buffer_size` bytes) is the
-  // memory region `sf` decodes from.
-  void addInputFileObj(
-      sox_format_t* sf,
-      char* buffer,
-      uint64_t buffer_size,
-      py::object* fileobj);
-
-  // Use `fileobj` as the chain output. `buffer` points to the pointer of the
-  // memory that receives the bytes encoded through `sf`.
-  void addOutputFileObj(
-      sox_format_t* sf,
-      char** buffer,
-      size_t* buffer_size,
-      py::object* fileobj);
-};
-
-} // namespace paddleaudio::sox_effects_chain
-
--- a/paddlespeech/audio/src/pybind/sox/io.cpp
+++ b/paddlespeech/audio/src/pybind/sox/io.cpp
@ -1,280 +0,0 @@
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-// All rights reserved.
-
-#include "paddlespeech/audio/src/pybind/sox/io.h"
-#include "paddlespeech/audio/src/pybind/sox/effects.h"
-#include "paddlespeech/audio/src/pybind/sox/types.h"
-#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio {
-namespace sox_io {
-
-auto get_info_file(const std::string &path, 
-                   const tl::optional<std::string> &format)
-    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
-    SoxFormat sf(
-        sox_open_read(path.data(),
-                      /*signal=*/nullptr,
-                      /*encoding=*/nullptr,
-                      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-
-    validate_input_file(sf, path);
-
-    return std::make_tuple(
-        static_cast<int64_t>(sf->signal.rate),
-        static_cast<int64_t>(sf->signal.length / sf->signal.channels),
-        static_cast<int64_t>(sf->signal.channels),
-        static_cast<int64_t>(sf->encoding.bits_per_sample),
-        get_encoding(sf->encoding.encoding));
-}
-
-std::vector<std::vector<std::string>> get_effects(
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames) {
-  const auto offset = frame_offset.value_or(0);
-  if (offset < 0) {
-    throw std::runtime_error(
-        "Invalid argument: frame_offset must be non-negative.");
-  }
-  const auto frames = num_frames.value_or(-1);
-  if (frames == 0 || frames < -1) {
-    throw std::runtime_error(
-        "Invalid argument: num_frames must be -1 or greater than 0.");
-  }
-
-  std::vector<std::vector<std::string>> effects;
-  if (frames != -1) {
-    std::ostringstream os_offset, os_frames;
-    os_offset << offset << "s";
-    os_frames << "+" << frames << "s";
-    effects.emplace_back(
-        std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
-  } else if (offset != 0) {
-    std::ostringstream os_offset;
-    os_offset << offset << "s";
-    effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
-  }
-  return effects;
-}
-
-auto get_info_fileobj(py::object fileobj, 
-                      const tl::optional<std::string> &format)
-    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
-    const auto capacity = [&]() {
-        const auto bufsiz = get_buffer_size();
-        const int64_t kDefaultCapacityInBytes = 4096;
-        return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
-                                                  : kDefaultCapacityInBytes;
-    }();
-    std::string buffer(capacity, '\0');
-    auto *buf = const_cast<char *>(buffer.data());
-    auto num_read = read_fileobj(&fileobj, capacity, buf);
-    // If the file is shorter than 256, then libsox cannot read the header.
-    auto buf_size = (num_read > 256) ? num_read : 256;
-
-    SoxFormat sf(sox_open_mem_read(
-        buf,
-        buf_size,
-        /*signal=*/nullptr,
-        /*encoding=*/nullptr,
-        /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-    // In case of streamed data, length can be 0
-    validate_input_memfile(sf);
-
-    return std::make_tuple(
-        static_cast<int64_t>(sf->signal.rate),
-        static_cast<int64_t>(sf->signal.length / sf->signal.channels),
-        static_cast<int64_t>(sf->signal.channels),
-        static_cast<int64_t>(sf->encoding.bits_per_sample),
-        get_encoding(sf->encoding.encoding));
-}
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
-    py::object fileobj,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format) {
-  auto effects = get_effects(frame_offset, num_frames);
-  return paddleaudio::sox_effects::apply_effects_fileobj(
-      std::move(fileobj), effects, normalize, channels_first, std::move(format));
-}
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
-    const std::string& path,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format) {
-    auto effects = get_effects(frame_offset, num_frames);
-    return paddleaudio::sox_effects::apply_effects_file(
-        path, effects, normalize, channels_first, format);
-}
-
-void save_audio_file(const std::string& path,
-                     py::array tensor,
-                     int64_t sample_rate,
-                     bool channels_first,
-                     tl::optional<double> compression,
-                     tl::optional<std::string> format,
-                     tl::optional<std::string> encoding,
-                     tl::optional<int64_t> bits_per_sample) {
-    validate_input_tensor(tensor);
-
-    const auto filetype = [&]() {
-        if (format.has_value()) return format.value();
-        return get_filetype(path);
-    }();
-
-    if (filetype == "amr-nb") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-        //TORCH_CHECK(num_channels == 1,
-        //            "amr-nb format only supports single channel audio.");
-        assert(num_channels == 1);
-    } else if (filetype == "htk") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-       // TORCH_CHECK(num_channels == 1,
-        //            "htk format only supports single channel audio.");
-        assert(num_channels == 1);
-    } else if (filetype == "gsm") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-        assert(num_channels == 1);
-        assert(sample_rate == 8000);
-        //TORCH_CHECK(num_channels == 1,
-        //            "gsm format only supports single channel audio.");
-        //TORCH_CHECK(sample_rate == 8000,
-        //            "gsm format only supports a sampling rate of 8kHz.");
-    }
-    const auto signal_info =
-        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-    const auto encoding_info = get_encodinginfo_for_save(
-        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
-
-    SoxFormat sf(sox_open_write(path.c_str(),
-                                &signal_info,
-                                &encoding_info,
-                                /*filetype=*/filetype.c_str(),
-                                /*oob=*/nullptr,
-                                /*overwrite_permitted=*/nullptr));
-
-    if (static_cast<sox_format_t*>(sf) == nullptr) {
-        throw std::runtime_error(
-            "Error saving audio file: failed to open file " + path);
-    }
-
-    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
-        /*output_encoding=*/sf->encoding);
-    chain.addInputTensor(&tensor, sample_rate, channels_first);
-    chain.addOutputFile(sf);
-    chain.run();
-}
-
-namespace {
-// helper class to automatically release buffer, to be used by
-// save_audio_fileobj
-struct AutoReleaseBuffer {
-  char* ptr;
-  size_t size;
-
-  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
-  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
-  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
-  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
-  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
-  ~AutoReleaseBuffer() {
-    if (ptr) {
-      free(ptr);
-    }
-  }
-};
-
-} // namespace
-
-void save_audio_fileobj(
-    py::object fileobj,
-    py::array tensor,
-    int64_t sample_rate,
-    bool channels_first,
-    tl::optional<double> compression,
-    tl::optional<std::string> format,
-    tl::optional<std::string> encoding,
-    tl::optional<int64_t> bits_per_sample) {
-
-  if (!format.has_value()) {
-    throw std::runtime_error(
-        "`format` is required when saving to file object.");
-  }
-  const auto filetype = format.value();
-
-  if (filetype == "amr-nb") {
-    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "amr-nb format only supports single channel audio.");
-    }
-  } else if (filetype == "htk") {
-    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "htk format only supports single channel audio.");
-    }
-  } else if (filetype == "gsm") {
-    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-    if (num_channels != 1) {
-      throw std::runtime_error(
-          "gsm format only supports single channel audio.");
-    }
-    if (sample_rate != 8000) {
-      throw std::runtime_error(
-          "gsm format only supports a sampling rate of 8kHz.");
-    }
-  }
-
-  const auto signal_info =
-      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-  const auto encoding_info = get_encodinginfo_for_save(
-      filetype,
-      tensor.dtype(),
-      compression,
-      std::move(encoding),
-      bits_per_sample);
-
-  AutoReleaseBuffer buffer;
-
-  SoxFormat sf(sox_open_memstream_write(
-      &buffer.ptr,
-      &buffer.size,
-      &signal_info,
-      &encoding_info,
-      filetype.c_str(),
-      /*oob=*/nullptr));
-
-  if (static_cast<sox_format_t*>(sf) == nullptr) {
-    throw std::runtime_error(
-        "Error saving audio file: failed to open memory stream.");
-  }
-
-  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
-      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
-      /*output_encoding=*/sf->encoding);
-  chain.addInputTensor(&tensor, sample_rate, channels_first);
-  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
-  chain.run();
-
-  // Closing the sox_format_t is necessary for flushing the last chunk to the
-  // buffer
-  sf.close();
-  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
-}
-
-}  // namespace paddleaudio
-}  // namespace sox_io
--- a/paddlespeech/audio/src/pybind/sox/io.h
+++ b/paddlespeech/audio/src/pybind/sox/io.h
@ -1,63 +0,0 @@
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-// All rights reserved.
-
-#pragma once
-
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-
-namespace py = pybind11;
-
-namespace paddleaudio {
-namespace sox_io {
-
-auto get_info_file(const std::string &path, 
-                   const tl::optional<std::string> &format)
-    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-auto get_info_fileobj(py::object fileobj,
-                   const tl::optional<std::string> &format)
-    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
-    py::object fileobj,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format);
-
-void save_audio_fileobj(
-    py::object fileobj,
-    py::array tensor,
-    int64_t sample_rate,
-    bool channels_first,
-    tl::optional<double> compression,
-    tl::optional<std::string> format,
-    tl::optional<std::string> encoding,
-    tl::optional<int64_t> bits_per_sample);
-
-auto get_effects(const tl::optional<int64_t>& frame_offset,
-                 const tl::optional<int64_t>& num_frames)
-    -> std::vector<std::vector<std::string>>;
-
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
-    const std::string& path,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format);
-
-void save_audio_file(const std::string& path,
-                     py::array tensor,
-                     int64_t sample_rate,
-                     bool channels_first,
-                     tl::optional<double> compression,
-                     tl::optional<std::string> format,
-                     tl::optional<std::string> encoding,
-                     tl::optional<int64_t> bits_per_sample);    
-
-
-}  // namespace paddleaudio
-}  // namespace sox_io
--- a/paddlespeech/audio/src/pybind/sox/types.cpp
+++ b/paddlespeech/audio/src/pybind/sox/types.cpp
@ -1,143 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
-
-#include "paddlespeech/audio/src/pybind/sox/types.h"
-#include <ostream>
-#include <sstream>
-
-namespace paddleaudio {
-namespace sox_utils {
-
-Format get_format_from_string(const std::string& format) {
-  if (format == "wav")
-    return Format::WAV;
-  if (format == "mp3")
-    return Format::MP3;
-  if (format == "flac")
-    return Format::FLAC;
-  if (format == "ogg" || format == "vorbis")
-    return Format::VORBIS;
-  if (format == "amr-nb")
-    return Format::AMR_NB;
-  if (format == "amr-wb")
-    return Format::AMR_WB;
-  if (format == "amb")
-    return Format::AMB;
-  if (format == "sph")
-    return Format::SPHERE;
-  if (format == "htk")
-    return Format::HTK;
-  if (format == "gsm")
-    return Format::GSM;
-  std::ostringstream stream;
-  stream << "Internal Error: unexpected format value: " << format;
-  throw std::runtime_error(stream.str());
-}
-
-std::string to_string(Encoding v) {
-  switch (v) {
-    case Encoding::UNKNOWN:
-      return "UNKNOWN";
-    case Encoding::PCM_SIGNED:
-      return "PCM_S";
-    case Encoding::PCM_UNSIGNED:
-      return "PCM_U";
-    case Encoding::PCM_FLOAT:
-      return "PCM_F";
-    case Encoding::FLAC:
-      return "FLAC";
-    case Encoding::ULAW:
-      return "ULAW";
-    case Encoding::ALAW:
-      return "ALAW";
-    case Encoding::MP3:
-      return "MP3";
-    case Encoding::VORBIS:
-      return "VORBIS";
-    case Encoding::AMR_WB:
-      return "AMR_WB";
-    case Encoding::AMR_NB:
-      return "AMR_NB";
-    case Encoding::OPUS:
-      return "OPUS";
-    default:
-      throw std::runtime_error("Internal Error: unexpected encoding.");
-  }
-}
-
-Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
-  if (!encoding.has_value())
-    return Encoding::NOT_PROVIDED;
-  std::string v = encoding.value();
-  if (v == "PCM_S")
-    return Encoding::PCM_SIGNED;
-  if (v == "PCM_U")
-    return Encoding::PCM_UNSIGNED;
-  if (v == "PCM_F")
-    return Encoding::PCM_FLOAT;
-  if (v == "ULAW")
-    return Encoding::ULAW;
-  if (v == "ALAW")
-    return Encoding::ALAW;
-  std::ostringstream stream;
-  stream << "Internal Error: unexpected encoding value: " << v;
-  throw std::runtime_error(stream.str());
-}
-
-BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
-  if (!bit_depth.has_value())
-    return BitDepth::NOT_PROVIDED;
-  int64_t v = bit_depth.value();
-  switch (v) {
-    case 8:
-      return BitDepth::B8;
-    case 16:
-      return BitDepth::B16;
-    case 24:
-      return BitDepth::B24;
-    case 32:
-      return BitDepth::B32;
-    case 64:
-      return BitDepth::B64;
-    default: {
-      std::ostringstream s;
-      s << "Internal Error: unexpected bit depth value: " << v;
-      throw std::runtime_error(s.str());
-    }
-  }
-}
-
-std::string get_encoding(sox_encoding_t encoding) {
-  switch (encoding) {
-    case SOX_ENCODING_UNKNOWN:
-      return "UNKNOWN";
-    case SOX_ENCODING_SIGN2:
-      return "PCM_S";
-    case SOX_ENCODING_UNSIGNED:
-      return "PCM_U";
-    case SOX_ENCODING_FLOAT:
-      return "PCM_F";
-    case SOX_ENCODING_FLAC:
-      return "FLAC";
-    case SOX_ENCODING_ULAW:
-      return "ULAW";
-    case SOX_ENCODING_ALAW:
-      return "ALAW";
-    case SOX_ENCODING_MP3:
-      return "MP3";
-    case SOX_ENCODING_VORBIS:
-      return "VORBIS";
-    case SOX_ENCODING_AMR_WB:
-      return "AMR_WB";
-    case SOX_ENCODING_AMR_NB:
-      return "AMR_NB";
-    case SOX_ENCODING_OPUS:
-      return "OPUS";
-    case SOX_ENCODING_GSM:
-      return "GSM";
-    default:
-      return "UNKNOWN";
-  }
-}
-
-} // namespace sox_utils
-} // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/sox/types.h
+++ b/paddlespeech/audio/src/pybind/sox/types.h
@ -1,58 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
-#pragma once
-
-#include <sox.h>
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-namespace paddleaudio {
-namespace sox_utils {
-
-enum class Format {
-  WAV,
-  MP3,
-  FLAC,
-  VORBIS,
-  AMR_NB,
-  AMR_WB,
-  AMB,
-  SPHERE,
-  GSM,
-  HTK,
-};
-
-Format get_format_from_string(const std::string& format);
-
-enum class Encoding {
-  NOT_PROVIDED,
-  UNKNOWN,
-  PCM_SIGNED,
-  PCM_UNSIGNED,
-  PCM_FLOAT,
-  FLAC,
-  ULAW,
-  ALAW,
-  MP3,
-  VORBIS,
-  AMR_WB,
-  AMR_NB,
-  OPUS,
-};
-
-std::string to_string(Encoding v);
-Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
-
-enum class BitDepth : unsigned {
-  NOT_PROVIDED = 0,
-  B8 = 8,
-  B16 = 16,
-  B24 = 24,
-  B32 = 32,
-  B64 = 64,
-};
-
-BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
-
-std::string get_encoding(sox_encoding_t encoding);
-
-} // namespace sox_utils
-} // namespace paddleaudio
--- a/paddlespeech/audio/src/pybind/sox/utils.cpp
+++ b/paddlespeech/audio/src/pybind/sox/utils.cpp
@ -1,642 +0,0 @@
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-// All rights reserved.
-#include <sox.h>
-
-#include "paddlespeech/audio/src/pybind/sox/utils.h"
-#include "paddlespeech/audio/src/pybind/sox/types.h"
-
-#include <sstream>
-
-namespace paddleaudio {
-namespace sox_utils {
-
-auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
-    -> uint64_t {
-    uint64_t num_read = 0;
-    while (num_read < size) {
-        auto request = size - num_read;
-        auto chunk = static_cast<std::string>(
-            static_cast<py::bytes>(fileobj->attr("read")(request)));
-        auto chunk_len = chunk.length();
-        if (chunk_len == 0) {
-            break;
-        }
-        if (chunk_len > request) {
-            std::ostringstream message;
-            message
-                << "Requested up to " << request << " bytes but, "
-                << "received " << chunk_len << " bytes. "
-                << "The given object does not confirm to read protocol of file "
-                   "object.";
-            throw std::runtime_error(message.str());
-        }
-        memcpy(buffer, chunk.data(), chunk_len);
-        buffer += chunk_len;
-        num_read += chunk_len;
-    }
-    return num_read;
-}
-
-
-void set_seed(const int64_t seed) {
-  sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
-}
-
-void set_verbosity(const int64_t verbosity) {
-  sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
-}
-
-void set_use_threads(const bool use_threads) {
-  sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
-}
-
-void set_buffer_size(const int64_t buffer_size) {
-  sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
-}
-
-int64_t get_buffer_size() {
-  return sox_get_globals()->bufsiz;
-}
-
-std::vector<std::vector<std::string>> list_effects() {
-  std::vector<std::vector<std::string>> effects;
-  for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
-    const sox_effect_handler_t* handler = (*fns)();
-    if (handler && handler->name) {
-      if (UNSUPPORTED_EFFECTS.find(handler->name) ==
-          UNSUPPORTED_EFFECTS.end()) {
-        effects.emplace_back(std::vector<std::string>{
-            handler->name,
-            handler->usage ? std::string(handler->usage) : std::string("")});
-      }
-    }
-  }
-  return effects;
-}
-
-std::vector<std::string> list_write_formats() {
-  std::vector<std::string> formats;
-  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
-    const sox_format_handler_t* handler = fns->fn();
-    for (const char* const* names = handler->names; *names; ++names) {
-      if (!strchr(*names, '/') && handler->write)
-        formats.emplace_back(*names);
-    }
-  }
-  return formats;
-}
-
-std::vector<std::string> list_read_formats() {
-  std::vector<std::string> formats;
-  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
-    const sox_format_handler_t* handler = fns->fn();
-    for (const char* const* names = handler->names; *names; ++names) {
-      if (!strchr(*names, '/') && handler->read)
-        formats.emplace_back(*names);
-    }
-  }
-  return formats;
-}
-
-SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
-SoxFormat::~SoxFormat() {
-  close();
-}
-
-sox_format_t* SoxFormat::operator->() const noexcept {
-  return fd_;
-}
-SoxFormat::operator sox_format_t*() const noexcept {
-  return fd_;
-}
-
-void SoxFormat::close() {
-  if (fd_ != nullptr) {
-    sox_close(fd_);
-    fd_ = nullptr;
-  }
-}
-
-void validate_input_file(const SoxFormat& sf, const std::string& path) {
-  if (static_cast<sox_format_t*>(sf) == nullptr) {
-    throw std::runtime_error(
-        "Error loading audio file: failed to open file " + path);
-  }
-  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    throw std::runtime_error("Error loading audio file: unknown encoding.");
-  }
-}
-
-void validate_input_memfile(const SoxFormat &sf) {
-    return validate_input_file(sf, "<in memory buffer>");
-}
-
-void validate_input_tensor(const py::array tensor) {
-  if (tensor.ndim() != 2) {
-    throw std::runtime_error("Input tensor has to be 2D.");
-  }
-
-  char dtype = tensor.dtype().char_();
-  bool flag = (dtype == 'f') || (dtype == 'd') || (dtype == 'l') || (dtype == 'i');
-  if (flag == false) {
-      throw std::runtime_error(
-          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
-  }
-}
-
-py::dtype get_dtype(
-    const sox_encoding_t encoding,
-    const unsigned precision) {
-    switch (encoding) {
-      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
-        return py::dtype('u1');
-      case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
-        switch (precision) {
-          case 16:
-            return py::dtype("i2");
-          case 24: // Cast 24-bit to 32-bit.
-          case 32:
-            return py::dtype('i');
-          default:
-            throw std::runtime_error(
-                "Only 16, 24, and 32 bits are supported for signed PCM.");
-        }
-      default:
-        // default to float32 for the other formats, including
-        // 32-bit flaoting-point WAV,
-        // MP3,
-        // FLAC,
-        // VORBIS etc...
-        return py::dtype("f");
-    }
-}
-
-py::array convert_to_tensor(
-    sox_sample_t* buffer,
-    const int32_t num_samples,
-    const int32_t num_channels,
-    const py::dtype dtype,
-    const bool normalize,
-    const bool channels_first) {
-  // todo refector later(SGoat)
-  py::array t;
-  uint64_t dummy = 0;
-  SOX_SAMPLE_LOCALS;
-  int32_t num_rows = num_samples / num_channels;
-  if (normalize || dtype.char_() == 'f') {
-    t = py::array(dtype, {num_rows, num_channels});
-    auto ptr = (float*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
-    }
-    if (channels_first) {
-    py::array t2 = py::array(dtype, {num_channels, num_rows});
-    for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) {
-      for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx)
-       *(float*)t2.mutable_data(row_idx, col_idx) = *(float*)t.data(col_idx, row_idx);
-    }
-    return t2;
-  }
-  } else if (dtype.char_() == 'i') {
-    t = py::array(dtype, {num_rows, num_channels});
-    auto ptr = (int*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = buffer[i];
-    }
-    if (channels_first) {
-      py::array t2 = py::array(dtype, {num_channels, num_rows});
-      for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) {
-        for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx)
-          *(int*)t2.mutable_data(row_idx, col_idx) = *(int*)t.data(col_idx, row_idx);
-      }
-      return t2;
-    }
-  } else if (dtype.char_() == 'h') { // int16
-    t = py::array(dtype, {num_rows, num_channels});
-    auto ptr = (int16_t*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
-    }
-    if (channels_first) {
-      py::array t2 = py::array(dtype, {num_channels, num_rows});
-      for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) {
-        for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx)
-          *(int16_t*)t2.mutable_data(row_idx, col_idx) = *(int16_t*)t.data(col_idx, row_idx);
-      }
-      return t2;
-    }
-  } else if (dtype.char_() == 'b') {
-    //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8);
-    t = py::array(dtype, {num_rows, num_channels});
-    auto ptr = (uint8_t*)t.mutable_data(0,0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
-    }
-    if (channels_first) {
-      py::array t2 = py::array(dtype, {num_channels, num_rows});
-      for (int32_t row_idx = 0; row_idx < num_channels; ++row_idx) {
-        for (int32_t col_idx = 0; col_idx < num_rows; ++col_idx)
-        *(uint8_t*)t2.mutable_data(row_idx, col_idx) = *(uint8_t*)t.data(col_idx, row_idx);
-      }
-      return t2;
-    }
-  } else {
-    throw std::runtime_error("Unsupported dtype.");
-  }
-  return t;
-}
-
-const std::string get_filetype(const std::string path) {
-  std::string ext = path.substr(path.find_last_of(".") + 1);
-  std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
-  return ext;
-}
-
-namespace {
-
-std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
-    const std::string format,
-    py::dtype dtype,
-    const Encoding& encoding,
-    const BitDepth& bits_per_sample) {
-  switch (encoding) {
-    case Encoding::NOT_PROVIDED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-          switch (dtype.num()) {
-            case 11: // float32 numpy dtype num 
-              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
-            case 5: // int numpy dtype num
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-            case 3: // int16 numpy
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
-            case 1: // byte numpy
-              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-            default:
-              throw std::runtime_error("Internal Error: Unexpected dtype.");
-          }
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
-      }
-    case Encoding::PCM_SIGNED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-        case BitDepth::B8:
-          throw std::runtime_error(
-              format + " does not support 8-bit signed PCM encoding.");
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
-      }
-    case Encoding::PCM_UNSIGNED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for unsigned PCM encoding.");
-      }
-    case Encoding::PCM_FLOAT:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B32:
-          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
-        case BitDepth::B64:
-          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
-        default:
-          throw std::runtime_error(
-              format +
-              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
-      }
-    case Encoding::ULAW:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for mu-law encoding.");
-      }
-    case Encoding::ALAW:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for a-law encoding.");
-      }
-    default:
-      throw std::runtime_error(
-          format + " does not support encoding: " + to_string(encoding));
-  }
-}
-
-std::tuple<sox_encoding_t, unsigned> get_save_encoding(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample) {
-  const Format fmt = get_format_from_string(format);
-  const Encoding enc = get_encoding_from_option(encoding);
-  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
-
-  switch (fmt) {
-    case Format::WAV:
-    case Format::AMB:
-      return get_save_encoding_for_wav(format, dtype, enc, bps);
-    case Format::MP3:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("mp3 does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "mp3 does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_MP3, 16);
-    case Format::HTK:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("htk does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "htk does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
-    case Format::VORBIS:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("vorbis does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "vorbis does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
-    case Format::AMR_NB:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("amr-nb does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "amr-nb does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
-    case Format::FLAC:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("flac does not support `encoding` option.");
-      switch (bps) {
-        case BitDepth::B32:
-        case BitDepth::B64:
-          throw std::runtime_error(
-              "flac does not support `bits_per_sample` larger than 24.");
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
-      }
-    case Format::SPHERE:
-      switch (enc) {
-        case Encoding::NOT_PROVIDED:
-        case Encoding::PCM_SIGNED:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-            default:
-              return std::make_tuple<>(
-                  SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
-          }
-        case Encoding::PCM_UNSIGNED:
-          throw std::runtime_error(
-              "sph does not support unsigned integer PCM.");
-        case Encoding::PCM_FLOAT:
-          throw std::runtime_error("sph does not support floating point PCM.");
-        case Encoding::ULAW:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-            case BitDepth::B8:
-              return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
-            default:
-              throw std::runtime_error(
-                  "sph only supports 8-bit for mu-law encoding.");
-          }
-        case Encoding::ALAW:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-            case BitDepth::B8:
-              return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
-            default:
-              return std::make_tuple<>(
-                  SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
-          }
-        default:
-          throw std::runtime_error(
-              "sph does not support encoding: " + encoding.value());
-      }
-    case Format::GSM:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("gsm does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "gsm does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_GSM, 16);
-
-    default:
-      throw std::runtime_error("Unsupported format: " + format);
-  }
-}
-
-unsigned get_precision(const std::string filetype, py::dtype dtype) {
-  if (filetype == "mp3")
-    return SOX_UNSPEC;
-  if (filetype == "flac")
-    return 24;
-  if (filetype == "ogg" || filetype == "vorbis")
-    return SOX_UNSPEC;
-  if (filetype == "wav" || filetype == "amb") {
-    switch (dtype.num()) {
-      case 1: // byte in numpy dype num
-        return 8;
-      case 3: // short, in numpy dtype num
-        return 16;
-      case 5: // int, numpy dtype 
-        return 32;
-      case 11: // float, numpy dtype
-        return 32;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }
-  if (filetype == "sph")
-    return 32;
-  if (filetype == "amr-nb") {
-    return 16;
-  }
-  if (filetype == "gsm") {
-    return 16;
-  }
-  if (filetype == "htk") {
-    return 16;
-  }
-  throw std::runtime_error("Unsupported file type: " + filetype);
-}
-
-} // namespace
-
-sox_signalinfo_t get_signalinfo(
-    const py::array* waveform,
-    const int64_t sample_rate,
-    const std::string filetype,
-    const bool channels_first) {
-  return sox_signalinfo_t{
-      /*rate=*/static_cast<sox_rate_t>(sample_rate),
-      /*channels=*/
-      static_cast<unsigned>(waveform->shape(channels_first ? 0 : 1)),
-      /*precision=*/get_precision(filetype, waveform->dtype()),
-      /*length=*/static_cast<uint64_t>(waveform->size())};
-}
-
-sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
-  sox_encoding_t encoding = [&]() {
-    switch (dtype.num()) {
-      case 1: // byte
-        return SOX_ENCODING_UNSIGNED;
-      case 3: // short
-        return SOX_ENCODING_SIGN2;
-      case 5: // int32
-        return SOX_ENCODING_SIGN2;
-      case 11: // float
-        return SOX_ENCODING_FLOAT;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }();
-  unsigned bits_per_sample = [&]() {
-    switch (dtype.num()) {
-      case 1: // byte
-        return 8;
-      case 3: //short
-        return 16;
-      case 5: // int32
-        return 32;
-      case 11: // float
-        return 32;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }();
-  return sox_encodinginfo_t{
-      /*encoding=*/encoding,
-      /*bits_per_sample=*/bits_per_sample,
-      /*compression=*/HUGE_VAL,
-      /*reverse_bytes=*/sox_option_default,
-      /*reverse_nibbles=*/sox_option_default,
-      /*reverse_bits=*/sox_option_default,
-      /*opposite_endian=*/sox_false};
-}
-
-sox_encodinginfo_t get_encodinginfo_for_save(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<double> compression,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample) {
-  auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample);
-  return sox_encodinginfo_t{
-      /*encoding=*/std::get<0>(enc),
-      /*bits_per_sample=*/std::get<1>(enc),
-      /*compression=*/compression.value_or(HUGE_VAL),
-      /*reverse_bytes=*/sox_option_default,
-      /*reverse_nibbles=*/sox_option_default,
-      /*reverse_bits=*/sox_option_default,
-      /*opposite_endian=*/sox_false};
-}
-
-
-/*
-SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
-SoxFormat::~SoxFormat() { close(); }
-
-sox_format_t *SoxFormat::operator->() const noexcept { return fd_; }
-SoxFormat::operator sox_format_t *() const noexcept { return fd_; }
-
-void SoxFormat::close() {
-    if (fd_ != nullptr) {
-        sox_close(fd_);
-        fd_ = nullptr;
-    }
-}
-
-auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
-    -> uint64_t {
-    uint64_t num_read = 0;
-    while (num_read < size) {
-        auto request = size - num_read;
-        auto chunk = static_cast<std::string>(
-            static_cast<py::bytes>(fileobj->attr("read")(request)));
-        auto chunk_len = chunk.length();
-        if (chunk_len == 0) {
-            break;
-        }
-        if (chunk_len > request) {
-            std::ostringstream message;
-            message
-                << "Requested up to " << request << " bytes but, "
-                << "received " << chunk_len << " bytes. "
-                << "The given object does not confirm to read protocol of file "
-                   "object.";
-            throw std::runtime_error(message.str());
-        }
-        memcpy(buffer, chunk.data(), chunk_len);
-        buffer += chunk_len;
-        num_read += chunk_len;
-    }
-    return num_read;
-}
-
-int64_t get_buffer_size() { return sox_get_globals()->bufsiz; }
-
-void validate_input_file(const SoxFormat &sf, const std::string &path) {
-    if (static_cast<sox_format_t *>(sf) == nullptr) {
-        throw std::runtime_error(
-            "Error loading audio file: failed to open file " + path);
-    }
-    if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-        throw std::runtime_error("Error loading audio file: unknown encoding.");
-    }
-}
-
-void validate_input_memfile(const SoxFormat &sf) {
-    return validate_input_file(sf, "<in memory buffer>");
-}
-
-std::string get_encoding(sox_encoding_t encoding) {
-    switch (encoding) {
-        case SOX_ENCODING_UNKNOWN:
-            return "UNKNOWN";
-        case SOX_ENCODING_SIGN2:
-            return "PCM_S";
-        case SOX_ENCODING_UNSIGNED:
-            return "PCM_U";
-        case SOX_ENCODING_FLOAT:
-            return "PCM_F";
-        case SOX_ENCODING_FLAC:
-            return "FLAC";
-        case SOX_ENCODING_ULAW:
-            return "ULAW";
-        case SOX_ENCODING_ALAW:
-            return "ALAW";
-        case SOX_ENCODING_MP3:
-            return "MP3";
-        case SOX_ENCODING_VORBIS:
-            return "VORBIS";
-        case SOX_ENCODING_AMR_WB:
-            return "AMR_WB";
-        case SOX_ENCODING_AMR_NB:
-            return "AMR_NB";
-        case SOX_ENCODING_OPUS:
-            return "OPUS";
-        case SOX_ENCODING_GSM:
-            return "GSM";
-        default:
-            return "UNKNOWN";
-    }
-}
-*/
-}  // namespace paddleaudio
-}  // namespace sox_utils
--- a/paddlespeech/audio/src/pybind/sox/utils.h
+++ b/paddlespeech/audio/src/pybind/sox/utils.h
@ -1,116 +0,0 @@
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-// All rights reserved.
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <sox.h>
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-namespace py = pybind11;
-
-namespace paddleaudio {
-namespace sox_utils {
-
-auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
-
-void set_seed(const int64_t seed);
-
-void set_verbosity(const int64_t verbosity);
-
-void set_use_threads(const bool use_threads);
-
-void set_buffer_size(const int64_t buffer_size);
-
-int64_t get_buffer_size();
-
-std::vector<std::vector<std::string>> list_effects();
-
-std::vector<std::string> list_read_formats();
-
-std::vector<std::string> list_write_formats();
-
-////////////////////////////////////////////////////////////////////////////////
-// Utilities for sox_io / sox_effects implementations
-////////////////////////////////////////////////////////////////////////////////
-
-const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
-    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
-
-/// helper class to automatically close sox_format_t*
-struct SoxFormat {
-  explicit SoxFormat(sox_format_t* fd) noexcept;
-  SoxFormat(const SoxFormat& other) = delete;
-  SoxFormat(SoxFormat&& other) = delete;
-  SoxFormat& operator=(const SoxFormat& other) = delete;
-  SoxFormat& operator=(SoxFormat&& other) = delete;
-  ~SoxFormat();
-  sox_format_t* operator->() const noexcept;
-  operator sox_format_t*() const noexcept;
-
-  void close();
-
- private:
-  sox_format_t* fd_;
-};
-
-///
-/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32
-void validate_input_tensor(const py::array);
-
-void validate_input_file(const SoxFormat& sf, const std::string& path);
-
-void validate_input_memfile(const SoxFormat &sf);
-///
-/// Get target dtype for the given encoding and precision.
-py::dtype get_dtype(
-    const sox_encoding_t encoding,
-    const unsigned precision);
-
-///
-/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
-/// NOTE: This function might modify the values in the input buffer to
-/// reduce the number of memory copy.
-/// @param buffer Pointer to buffer that contains audio data.
-/// @param num_samples The number of samples to read.
-/// @param num_channels The number of channels. Used to reshape the resulting
-/// Tensor.
-/// @param dtype Target dtype. Determines the output dtype and value range in
-/// conjunction with normalization.
-/// @param noramlize Perform normalization. Only effective when dtype is not
-/// kFloat32. When effective, the output tensor is kFloat32 type and value range
-/// is [-1.0, 1.0]
-/// @param channels_first When True, output Tensor has shape of [num_channels,
-/// num_frames].
-py::array convert_to_tensor(
-    sox_sample_t* buffer,
-    const int32_t num_samples,
-    const int32_t num_channels,
-    const py::dtype dtype,
-    const bool normalize,
-    const bool channels_first);
-
-/// Extract extension from file path
-const std::string get_filetype(const std::string path);
-
-/// Get sox_signalinfo_t for passing a py::array object.
-sox_signalinfo_t get_signalinfo(
-    const py::array* waveform,
-    const int64_t sample_rate,
-    const std::string filetype,
-    const bool channels_first);
-
-/// Get sox_encodinginfo_t for Tensor I/O
-sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
-
-/// Get sox_encodinginfo_t for saving to file/file object
-sox_encodinginfo_t get_encodinginfo_for_save(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<double> compression,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample);
-
-}  // namespace paddleaudio
-}  // namespace sox_utils
--- a/paddlespeech/audio/src/utils.cpp
+++ b/paddlespeech/audio/src/utils.cpp
@ -1,33 +0,0 @@
-namespace paddleaudio {
-
-namespace {
-
-bool is_sox_available() {
-#ifdef INCLUDE_SOX
-    return true;
-#else
-    return false;
-#endif
-}
-
-bool is_kaldi_available() {
-#ifdef INCLUDE_KALDI
-    return true;
-#else
-    return false;
-#endif
-}
-
-// It tells whether paddleaudio was compiled with ffmpeg
-// not the runtime availability.
-bool is_ffmpeg_available() {
-#ifdef USE_FFMPEG
-    return true;
-#else
-    return false;
-#endif
-}
-
-}  // namespace
-
-}  // namespace paddleaudio
--- a/paddlespeech/audio/streamdata/autodecode.py
+++ b/paddlespeech/audio/streamdata/autodecode.py
@ -295,7 +295,7 @@ def torch_video(key, data):


 def paddle_audio(key, data):
-    """Decode audio using the paddlespeech.audio library.
+    """Decode audio using the paddleaudio library.

    :param key: file name extension
    :param data: data to be decoded
@ -304,13 +304,13 @@ def paddle_audio(key, data):
    if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]:
        return None

-    import paddlespeech.audio
+    import paddleaudio

    with tempfile.TemporaryDirectory() as dirname:
        fname = os.path.join(dirname, f"file.{extension}")
        with open(fname, "wb") as stream:
            stream.write(data)
-        return paddlespeech.audio.load(fname)
+        return paddleaudio.backends.soundfile_load(fname)


 ################################################################
--- a/paddlespeech/audio/streamdata/filters.py
+++ b/paddlespeech/audio/streamdata/filters.py
@ -25,8 +25,10 @@ import paddle

 from . import autodecode
 from . import utils
-from .. import backends
-from ..compliance import kaldi
+
+from paddleaudio import backends
+from paddleaudio.compliance import kaldi
+
 from ..transform.cmvn import GlobalCMVN
 from ..transform.spec_augment import freq_mask
 from ..transform.spec_augment import time_mask
--- a/paddlespeech/audio/streamdata/tariterators.py
+++ b/paddlespeech/audio/streamdata/tariterators.py
@ -20,7 +20,7 @@ trace = False
 meta_prefix = "__"
 meta_suffix = "__"

-import paddlespeech
+import paddleaudio
 import paddle
 import numpy as np

@ -111,7 +111,7 @@ def tar_file_iterator(fileobj,
            assert pos > 0
            prefix, postfix = name[:pos], name[pos + 1:]
            if postfix == 'wav':
-                waveform, sample_rate = paddlespeech.audio.load(
+                waveform, sample_rate = paddleaudio.backends.soundfile_load(
                    stream.extractfile(tarinfo), normal=False)
                result = dict(
                    fname=prefix, wav=waveform, sample_rate=sample_rate)
@ -163,7 +163,7 @@ def tar_file_and_group_iterator(fileobj,
                if postfix == 'txt':
                    example['txt'] = file_obj.read().decode('utf8').strip()
                elif postfix in AUDIO_FORMAT_SETS:
-                    waveform, sample_rate = paddlespeech.audio.load(
+                    waveform, sample_rate = paddleaudio.backends.soundfile_load(
                        file_obj, normal=False)
                    waveform = paddle.to_tensor(
                        np.expand_dims(np.array(waveform), 0),
--- a/paddlespeech/audio/third_party/.gitignore
+++ b/paddlespeech/audio/third_party/.gitignore
@ -1,2 +0,0 @@
-archives/
-install/
--- a/paddlespeech/audio/third_party/CMakeLists.txt
+++ b/paddlespeech/audio/third_party/CMakeLists.txt
@ -1,15 +0,0 @@
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
-
-################################################################################
-# sox
-################################################################################
-if (BUILD_SOX)
-  add_subdirectory(sox)
-endif()
-
-################################################################################
-# kaldi
-################################################################################
-if (BUILD_KALDI)
-  add_subdirectory(kaldi)
-endif()
--- a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt
+++ b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt
@ -1,117 +0,0 @@
-# checkout the thirdparty/kaldi/base/kaldi-types.h
-# compile kaldi without openfst
-add_definitions("-DCOMPILE_WITHOUT_OPENFST")
-
-# function (define_library name source include_dirs link_libraries compile_defs)
-#   add_library(${name} INTERFACE ${source})
-#   target_include_directories(${name} INTERFACE ${include_dirs})
-#   target_link_libraries(${name} INTERFACE ${link_libraries})
-#   target_compile_definitions(${name} INTERFACE ${compile_defs})
-#   set_target_properties(${name} PROPERTIES PREFIX "")
-#   if (MSVC)
-#     set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
-#   endif(MSVC)
-#   install(
-#     TARGETS ${name}
-#     LIBRARY DESTINATION lib
-#     RUNTIME DESTINATION lib  # For Windows
-#     )
-# endfunction()
-
-# kaldi-base
-add_library(kaldi-base STATIC
-  base/io-funcs.cc
-  base/kaldi-error.cc
-  base/kaldi-math.cc
-  base/kaldi-utils.cc
-  base/timer.cc
-)
-target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-
-# kaldi-matrix
-add_library(kaldi-matrix STATIC
-  matrix/compressed-matrix.cc
-  matrix/matrix-functions.cc
-  matrix/kaldi-matrix.cc
-  matrix/kaldi-vector.cc
-  matrix/optimization.cc
-  matrix/packed-matrix.cc
-  matrix/qr.cc
-  matrix/sparse-matrix.cc
-  matrix/sp-matrix.cc
-  matrix/srfft.cc
-  matrix/tp-matrix.cc
-)
-target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-matrix PUBLIC gfortran kaldi-base libopenblas)
-
-
-# kaldi-util
-add_library(kaldi-util STATIC
-  util/kaldi-holder.cc
-  util/kaldi-io.cc
-  util/kaldi-semaphore.cc
-  util/kaldi-table.cc
-  util/kaldi-thread.cc
-  util/parse-options.cc
-  util/simple-io-funcs.cc
-  util/simple-options.cc
-  util/text-utils.cc
-)
-target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
-
-
-# kaldi-feat-common
-add_library(kaldi-feat-common STATIC
-  feat/cmvn.cc
-  feat/feature-functions.cc
-  feat/feature-window.cc
-  feat/mel-computations.cc
-  feat/pitch-functions.cc
-  feat/resample.cc
-  feat/signal.cc
-  feat/wave-reader.cc
-)
-target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
-
-
-# kaldi-mfcc
-add_library(kaldi-mfcc STATIC
-  feat/feature-mfcc.cc
-)
-target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
-
-
-# kaldi-fbank
-add_library(kaldi-fbank STATIC
-  feat/feature-fbank.cc
-)
-target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
-
-
-set(KALDI_LIBRARIES
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
-)
-
-add_library(libkaldi INTERFACE)
-add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
-target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(libkaldi INTERFACE   
-  # --whole-archive  for undefined symbol when link static lib into shared lib
-  -Wl,--start-group -Wl,--whole-archive 
-  ${KALDI_LIBRARIES}
-  libopenblas
-  gfortran
-  -Wl,--no-whole-archive -Wl,--end-group
-)
-target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
--- a/paddlespeech/audio/third_party/kaldi/base
+++ b/paddlespeech/audio/third_party/kaldi/base
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/base
--- a/paddlespeech/audio/third_party/kaldi/feat
+++ b/paddlespeech/audio/third_party/kaldi/feat
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/feat
--- a/paddlespeech/audio/third_party/kaldi/matrix
+++ b/paddlespeech/audio/third_party/kaldi/matrix
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/matrix
--- a/paddlespeech/audio/third_party/kaldi/util
+++ b/paddlespeech/audio/third_party/kaldi/util
@ -1 +0,0 @@
-../../../../speechx/speechx/kaldi/util
--- a/paddlespeech/audio/third_party/patches/config.guess
+++ b/paddlespeech/audio/third_party/patches/config.guess
--- a/paddlespeech/audio/third_party/patches/config.sub
+++ b/paddlespeech/audio/third_party/patches/config.sub
--- a/paddlespeech/audio/third_party/patches/libmad.patch
+++ b/paddlespeech/audio/third_party/patches/libmad.patch
@ -1,86 +0,0 @@
-See the followings for the origin of this patch
-http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
-http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
--- src/libmad/configure	2004-02-05 09:34:07.000000000 +0000
-+++ src/libmad/configure.new	2020-06-30 21:10:28.528018931 +0000
-@@ -19083,71 +19083,7 @@
- 
- if test "$GCC" = yes
- then
-    if test -z "$arch"
-    then
-	case "$host" in
-	    i386-*)           ;;
-	    i?86-*)           arch="-march=i486" ;;
-	    arm*-empeg-*)     arch="-march=armv4 -mtune=strongarm1100" ;;
-	    armv4*-*)         arch="-march=armv4 -mtune=strongarm" ;;
-	    powerpc-*)        ;;
-	    mips*-agenda-*)   arch="-mcpu=vr4100" ;;
-	    mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
-	esac
-    fi
-
-    case "$optimize" in
-	-O|"-O "*)
-	    optimize="-O"
-	    optimize="$optimize -fforce-mem"
-	    optimize="$optimize -fforce-addr"
-	    : #x optimize="$optimize -finline-functions"
-	    : #- optimize="$optimize -fstrength-reduce"
-	    optimize="$optimize -fthread-jumps"
-	    optimize="$optimize -fcse-follow-jumps"
-	    optimize="$optimize -fcse-skip-blocks"
-	    : #x optimize="$optimize -frerun-cse-after-loop"
-	    : #x optimize="$optimize -frerun-loop-opt"
-	    : #x optimize="$optimize -fgcse"
-	    optimize="$optimize -fexpensive-optimizations"
-	    optimize="$optimize -fregmove"
-	    : #* optimize="$optimize -fdelayed-branch"
-	    : #x optimize="$optimize -fschedule-insns"
-	    optimize="$optimize -fschedule-insns2"
-	    : #? optimize="$optimize -ffunction-sections"
-	    : #? optimize="$optimize -fcaller-saves"
-	    : #> optimize="$optimize -funroll-loops"
-	    : #> optimize="$optimize -funroll-all-loops"
-	    : #x optimize="$optimize -fmove-all-movables"
-	    : #x optimize="$optimize -freduce-all-givs"
-	    : #? optimize="$optimize -fstrict-aliasing"
-	    : #* optimize="$optimize -fstructure-noalias"
-
-	    case "$host" in
-		arm*-*)
-		    optimize="$optimize -fstrength-reduce"
-		    ;;
-		mips*-*)
-		    optimize="$optimize -fstrength-reduce"
-		    optimize="$optimize -finline-functions"
-		    ;;
-		i?86-*)
-		    optimize="$optimize -fstrength-reduce"
-		    ;;
-		powerpc-apple-*)
-		    # this triggers an internal compiler error with gcc2
-		    : #optimize="$optimize -fstrength-reduce"
-
-		    # this is really only beneficial with gcc3
-		    : #optimize="$optimize -finline-functions"
-		    ;;
-		*)
-		    # this sometimes provokes bugs in gcc 2.95.2
-		    : #optimize="$optimize -fstrength-reduce"
-		    ;;
-	    esac
-	    ;;
-    esac
-+    optimize="-O2"
- fi
- 
- case "$host" in
-@@ -21497,6 +21433,7 @@
- then
-     case "$host" in
- 	i?86-*)     FPM="INTEL"  ;;
-+	x86_64*)    FPM="64BIT"  ;;
- 	arm*-*)     FPM="ARM"    ;;
- 	mips*-*)    FPM="MIPS"   ;;
- 	sparc*-*)   FPM="SPARC"  ;;
--- a/paddlespeech/audio/third_party/patches/sox.patch
+++ b/paddlespeech/audio/third_party/patches/sox.patch
@ -1,16 +0,0 @@
-See https://github.com/pytorch/audio/pull/1297
-diff -ru sox/src/formats.c sox/src/formats.c
--- sox/src/formats.c	2014-10-26 19:55:50.000000000 -0700
-+++ sox/src/formats.c	2021-02-22 16:01:02.833144070 -0800
-@@ -333,6 +333,10 @@
-   assert(ft);
-   if (!ft->fp)
-     return sox_false;
-  fstat(fileno((FILE*)ft->fp), &st);
-+  int fd = fileno((FILE*)ft->fp);
-+  if (fd < 0)
-+    return sox_false;
-+  if (fstat(fd, &st) < 0)
-+    return sox_false;
-   return ((st.st_mode & S_IFMT) == S_IFREG);
- }
--- a/paddlespeech/audio/third_party/sox/CMakeLists.txt
+++ b/paddlespeech/audio/third_party/sox/CMakeLists.txt
@ -1,254 +0,0 @@
-find_package(PkgConfig REQUIRED)
-
-include(ExternalProject)
-
-set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
-set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
-set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)
-set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
-
-# To pass custom environment variables to ExternalProject_Add command,
-# we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMANAD>`.
-# https://stackoverflow.com/a/62437353
-# We constrcut the custom environment variables here
-set(envs
-  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
-  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
-  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
-)
-
-if (BUILD_MAD)
-  ExternalProject_Add(mad
-    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-    DOWNLOAD_DIR ${ARCHIVE_DIR}
-    URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
-    URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
-    PATCH_COMMAND patch < ${patch_dir}/libmad.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
-    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
-    DOWNLOAD_NO_PROGRESS ON
-    LOG_DOWNLOAD ON
-    LOG_UPDATE ON
-    LOG_CONFIGURE ON
-    LOG_BUILD ON
-    LOG_INSTALL ON
-    LOG_MERGED_STDOUTERR ON
-    LOG_OUTPUT_ON_FAILURE ON
-  )
-endif (BUILD_MAD)
-
-ExternalProject_Add(amr
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
-  URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(lame
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
-  URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(ogg
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
-  URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(flac
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ogg
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
-  URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(vorbis
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ogg
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
-  URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(opus
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ogg
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
-  URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-ExternalProject_Add(opusfile
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS opus
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
-  URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
-  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-# OpenMP is by default compiled against GNU OpenMP, which conflicts with the version of OpenMP that PyTorch uses.
-# See https://github.com/pytorch/audio/pull/1026
-# TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
-set(SOX_OPTIONS
-  --disable-openmp
-  --with-amrnb
-  --with-amrwb
-  --with-flac
-  --with-lame
-  --with-oggvorbis
-  --with-opus
-  --without-alsa
-  --without-ao
-  --without-coreaudio
-  --without-oss
-  --without-id3tag
-  --without-ladspa
-  --without-magic
-  --without-png
-  --without-pulseaudio
-  --without-sndfile
-  --without-sndio
-  --without-sunaudio
-  --without-waveaudio
-  --without-wavpack
-  --without-twolame
-  )
-
-set(SOX_LIBRARIES
-  ${INSTALL_DIR}/lib/libsox.a
-  ${INSTALL_DIR}/lib/libopencore-amrnb.a
-  ${INSTALL_DIR}/lib/libopencore-amrwb.a
-  ${INSTALL_DIR}/lib/libmp3lame.a
-  ${INSTALL_DIR}/lib/libFLAC.a
-  ${INSTALL_DIR}/lib/libopusfile.a
-  ${INSTALL_DIR}/lib/libopus.a
-  ${INSTALL_DIR}/lib/libvorbisenc.a
-  ${INSTALL_DIR}/lib/libvorbisfile.a
-  ${INSTALL_DIR}/lib/libvorbis.a
-  ${INSTALL_DIR}/lib/libogg.a
-  )
-
-set(sox_depends
-  ogg flac vorbis opusfile lame amr
-  )
-
-if (BUILD_MAD)
-  list(
-    APPEND
-    SOX_OPTIONS
-    --with-mad
-    )
-  list(
-    APPEND
-    SOX_LIBRARIES
-    ${INSTALL_DIR}/lib/libmad.a
-    )
-  list(
-    APPEND
-    sox_depends
-    mad
-    )
-else ()
-  list(
-    APPEND
-    SOX_OPTIONS
-    --without-mad
-    )  
-endif (BUILD_MAD)
-
-ExternalProject_Add(sox
-  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ${sox_depends}
-  DOWNLOAD_DIR ${ARCHIVE_DIR}
-  URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
-  URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
-  PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
-  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
-  BUILD_BYPRODUCTS ${SOX_LIBRARIES}
-  DOWNLOAD_NO_PROGRESS ON
-  LOG_DOWNLOAD ON
-  LOG_UPDATE ON
-  LOG_CONFIGURE ON
-  LOG_BUILD ON
-  LOG_INSTALL ON
-  LOG_MERGED_STDOUTERR ON
-  LOG_OUTPUT_ON_FAILURE ON
-)
-
-add_library(libsox INTERFACE)
-add_dependencies(libsox sox)
-target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
-target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})
--- a/paddlespeech/audio/transform/spectrogram.py
+++ b/paddlespeech/audio/transform/spectrogram.py
@ -17,7 +17,7 @@ import numpy as np
 import paddle
 from python_speech_features import logfbank

-from ..compliance import kaldi
+from paddleaudio.compliance import kaldi


 def stft(x,
--- a/paddlespeech/audio/utils/sox_utils.py
+++ b/paddlespeech/audio/utils/sox_utils.py
@ -1,101 +0,0 @@
-from typing import Dict, List
-
-from paddlespeech.audio._internal import module_utils as _mod_utils
-from paddlespeech.audio import _paddleaudio
-
-@_mod_utils.requires_sox()
-def set_seed(seed: int):
-    """Set libsox's PRNG
-
-    Args:
-        seed (int): seed value. valid range is int32.
-
-    See Also:
-        http://sox.sourceforge.net/sox.html
-    """
-    _paddleaudio.sox_utils_set_seed(seed)
-
-
-@_mod_utils.requires_sox()
-def set_verbosity(verbosity: int):
-    """Set libsox's verbosity
-
-    Args:
-        verbosity (int): Set verbosity level of libsox.
-
-            * ``1`` failure messages
-            * ``2`` warnings
-            * ``3`` details of processing
-            * ``4``-``6`` increasing levels of debug messages
-
-    See Also:
-        http://sox.sourceforge.net/sox.html
-    """
-    _paddleaudio.sox_utils_set_verbosity(verbosity)
-
-
-@_mod_utils.requires_sox()
-def set_buffer_size(buffer_size: int):
-    """Set buffer size for sox effect chain
-
-    Args:
-        buffer_size (int): Set the size in bytes of the buffers used for processing audio.
-
-    See Also:
-        http://sox.sourceforge.net/sox.html
-    """
-    _paddleaudio.sox_utils_set_buffer_size(buffer_size)
-
-
-@_mod_utils.requires_sox()
-def set_use_threads(use_threads: bool):
-    """Set multithread option for sox effect chain
-
-    Args:
-        use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
-            To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support.
-
-    See Also:
-        http://sox.sourceforge.net/sox.html
-    """
-    _paddleaudio.sox_utils_set_use_threads(use_threads)
-
-
-@_mod_utils.requires_sox()
-def list_effects() -> Dict[str, str]:
-    """List the available sox effect names
-
-    Returns:
-        Dict[str, str]: Mapping from ``effect name`` to ``usage``
-    """
-    return dict(_paddleaudio.sox_utils_list_effects())
-
-
-@_mod_utils.requires_sox()
-def list_read_formats() -> List[str]:
-    """List the supported audio formats for read
-
-    Returns:
-        List[str]: List of supported audio formats
-    """
-    return _paddleaudio.sox_utils_list_read_formats()
-
-
-@_mod_utils.requires_sox()
-def list_write_formats() -> List[str]:
-    """List the supported audio formats for write
-
-    Returns:
-        List[str]: List of supported audio formats
-    """
-    return _paddleaudio.sox_utils_list_write_formats()
-
-
-@_mod_utils.requires_sox()
-def get_buffer_size() -> int:
-    """Get buffer size for sox effect chain
-
-    Returns:
-        int: size in bytes of buffers used for processing audio.
-    """
-    return _paddleaudio.sox_utils_get_buffer_size()