[audio] mv paddlespeech/audio to paddleaudio (#2706)

* split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio.
2 years ago · 42ff946007
parent 0cc54bb785
commit 42ff946007
250 changed files with 18847 additions and 797 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,9 @@
 build
 *output/

+audio/dist/
+audio/fc_patch/
+
 docs/build/
 docs/topic/ctc/warp-ctc/

@ -42,6 +45,7 @@ tools/python-soundfile/
 tools/onnx
 tools/onnxruntime
 tools/Paddle2ONNX
+tools/onnx-simplifier/

 speechx/fc_patch/

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,8 +3,13 @@ repos:
    rev: v0.16.0
    hooks:
    -   id: yapf
-        files: \.py$
-        exclude: (?=third_party).*(\.py)$
+        name: yapf
+        language: python
+        entry: yapf
+        args: [-i, -vv]
+        types: [python]
+        # Skip vendored / third-party sources; `.h` and `.hpp` are separate alternatives.
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
@ -30,7 +35,8 @@ repos:
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
-        exclude: (?=third_party).*(\.py)$
+        # Skip vendored / third-party sources; `.h` and `.hpp` are separate alternatives.
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
+
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.0.1
    hooks:
@ -42,6 +48,7 @@ repos:
        files: \.md$
    -   id: remove-tabs
        files: \.md$
+
 -   repo: local
    hooks:
    -   id: clang-format
@ -49,23 +56,17 @@ repos:
        description: Format files with ClangFormat
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$
-    #-   id: copyright_checker
-    #    name: copyright_checker
-    #    entry: python .pre-commit-hooks/copyright-check.hook
-    #    language: system
-    #    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-    #    exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
+        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
    -   id: cpplint
        name: cpplint
        description: Static code analysis of C/C++ files
        language: python
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
        entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
      - id: reorder-python-imports
-        exclude: (?=third_party).*(\.py)$
+        # Skip vendored / third-party sources; `.h` and `.hpp` are separate alternatives.
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@ -0,0 +1,70 @@
+cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
+
+# The policies this project relies on are pinned explicitly so the intent
+# stays visible next to the minimum-version declaration.
+#
+# CMP0010: bad variable reference syntax is an error.
+cmake_policy(SET CMP0010 NEW)
+
+# CMP0025: use compiler ID "AppleClang" instead of "Clang" for XCode.
+# Not setting this sometimes makes the XCode C compiler get detected as
+# "Clang", even when the C++ one is detected as "AppleClang".
+cmake_policy(SET CMP0025 NEW)
+
+# CMP0092: suppress warning flags in the default MSVC configuration.
+# The policy was introduced in CMake 3.15; the required minimum of 3.16
+# guarantees it exists, so no CMAKE_VERSION guard is needed.
+cmake_policy(SET CMP0092 NEW)
+
+project(paddleaudio)
+
+# Warn when the user's CMAKE_CXX_FLAGS already carry a -std=c++NN switch,
+# because it competes with the CMAKE_CXX_STANDARD chosen by this project.
+string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
+if(env_cxx_standard GREATER -1)
+  # message() joins its arguments without a separator, so the first fragment
+  # must end with a space to keep the two sentences apart.
+  message(
+      WARNING "C++ standard version definition detected in environment variable. "
+      "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
+endif()
+
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_C_STANDARD 11)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_VERBOSE_MAKEFILE ON)
+
+# Options
+option(BUILD_SOX "Build libsox statically" ON)
+option(BUILD_MAD "Enable libmad" ON)
+option(BUILD_KALDI "Build kaldi statically" ON)
+option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
+
+
+# cmake
+# Make the project's own modules (cmake/) and the third-party recipes
+# (cmake/external/) resolvable by include() and find_package().
+# list(APPEND) avoids the leading empty element that string concatenation
+# produces when CMAKE_MODULE_PATH starts out empty.
+list(APPEND CMAKE_MODULE_PATH
+     "${PROJECT_SOURCE_DIR}/cmake"
+     "${PROJECT_SOURCE_DIR}/cmake/external")
+
+if (NOT MSVC)
+    # find_package() in module mode already executes FindGFortranLibs.cmake in
+    # this scope, so the module is not include()d a second time (that would
+    # only repeat the `gfortran -v` probe and its log output).
+    find_package(GFortranLibs REQUIRED)
+    include(FortranCInterface)
+endif()
+
+# fc_patch dir
+set(FETCHCONTENT_QUIET off)
+get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
+set(FETCHCONTENT_BASE_DIR ${fc_patch})
+set(THIRD_PARTY_PATH ${fc_patch})
+
+include(openblas)
+
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
+include(cmake/pybind.cmake)
+include_directories(${PYTHON_INCLUDE_DIR})
+
+# packages
+find_package(Python3 COMPONENTS Interpreter Development)
+
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
+add_subdirectory(paddleaudio)
+
+# Summary
+include(cmake/summary.cmake)
+onnx_print_configuration_summary()
--- a/audio/README.md
+++ b/audio/README.md
@ -0,0 +1,35 @@
+# PaddleAudio
+
+安装方式： pip install paddleaudio
+
+目前支持的平台：Linux：
+
+## Environment
+
+## Build wheel
+
+Linux test build whl environment:
+* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
+* os - Ubuntu 16.04.7 LTS
+* gcc/g++/gfortran - 8.2.0
+* cmake - 3.18.0 (need install)
+
+* [How to Install Docker](https://docs.docker.com/engine/install/)
+* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
+
+1. First, launch the docker container.
+
+```
+docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
+```
+2. python setup.py bdist_wheel
+
+MAC: test build whl environment:
+* os 
+* gcc/g++/gfortran 12.2.0
+* cpu Intel Xeon E5 x86_64
+
+
+Windows：
+Not supported: the paddleaudio C++ extension lib (sox io, kaldi native fbank)
+python setup.py bdist_wheel
--- a/audio/cmake/FindGFortranLibs.cmake
+++ b/audio/cmake/FindGFortranLibs.cmake
@ -0,0 +1,153 @@
+#.rst:
+# FindGFortranLibs
+# --------
+#  https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
+#  https://enccs.github.io/cmake-workshop/cxx-fortran/
+#
+# Find gcc Fortran compiler & library paths
+#
+# The module defines the following variables:
+#
+# ::
+#
+#
+#   GFORTRANLIBS_FOUND - true if system has gfortran
+#   LIBGFORTRAN_LIBRARIES - path to libgfortran
+#   LIBQUADMATH_LIBRARIES - path to libquadmath
+#   GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
+#   GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
+#   LIBGOMP_LIBRARIES - path to libgomp
+#   LIBGOMP_INCLUDE_DIR - directory containing omp.h header
+#   GFORTRAN_VERSION_STRING - version of gfortran found
+#
+# Honor find_package(GFortranLibs QUIET): find_package() exposes that flag as
+# <PackageName>_FIND_QUIETLY, i.e. GFortranLibs_FIND_QUIETLY for this module.
+set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})
+
+if(NOT CMAKE_REQUIRED_QUIET)
+  message(STATUS "Looking for gfortran related libraries...")
+endif()
+
+enable_language(Fortran)
+if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
+
+  # Basically, call "gfortran -v" to dump compiler info to the string
+  # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
+  message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
+  execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
+    GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
+
+  # For debugging
+  message(STATUS "'gfortran -v' returned:")
+  message(STATUS "${GFORTRAN_VERBOSE_STR}")
+
+  # Detect gfortran version
+  string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
+  string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
+  message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
+  unset(GFORTRAN_VER_STR)
+
+  set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
+  set(REPLACE_REGEX "([^\t\n ]+)")
+
+  # Find architecture for compiler
+  string(REGEX MATCH "Target: [^\t\n ]+"
+    GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
+  message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
+  string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
+    GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
+  message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
+  unset(GFORTRAN_ARCH_STR)
+
+  # Find install prefix, if it exists; if not, use default
+  string(REGEX MATCH  "--prefix=[^\t\n ]+[\t\n ]+"
+    GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_PREFIX_STR)
+    message(STATUS "Detected default gfortran prefix")
+    set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
+  else()
+    string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
+  unset(GFORTRAN_PREFIX_STR)
+
+  # Find install exec-prefix, if it exists; if not, fall back to the prefix.
+  # string(REGEX MATCH <regex> <out-var> <input>...) takes the output variable
+  # directly after the regex; the match stops at the first whitespace so the
+  # extracted directory carries no trailing blanks.
+  string(REGEX MATCH "--exec-prefix=[^\t\n ]+"
+    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_EXEC_PREFIX_STR)
+    message(STATUS "Detected default gfortran exec-prefix")
+    set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
+  else()
+    # Keep only the path part of the matched "--exec-prefix=<path>" token.
+    string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
+      GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
+  endif()
+  message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
+  unset(GFORTRAN_EXEC_PREFIX_STR)
+
+  # Find library directory and include directory, if library directory specified
+  string(REGEX MATCH "--libdir=[^\t\n ]+"
+    GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
+  if(NOT GFORTRAN_LIB_DIR_STR)
+    message(STATUS "Found --libdir flag -- not found")
+    message(STATUS "Using default gfortran library & include directory paths")
+    string(STRIP ${GFORTRAN_PREFIX_DIR} TMPLIBDIR)
+    set(GFORTRAN_LIBRARIES_DIR "${TMPLIBDIR}/lib64")
+    set(GFORTRAN_INCLUDE_DIR "${TMPLIBDIR}/include")
+  else()
+    message(STATUS "Found --libdir flag -- yes")
+    string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
+      GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
+    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
+  endif()
+  message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
+  message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
+  unset(GFORTRAN_LIB_DIR_STR)
+
+  # There are lots of other build options for gcc & gfortran. For now, the
+  # options implemented above should cover a lot of common use cases.
+
+  # Clean up be deleting the output string from "gfortran -v"
+  unset(GFORTRAN_VERBOSE_STR)
+
+  # Find paths for libgfortran, libquadmath, libgomp
+  # libgomp needed for OpenMP support without Clang
+  find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+  find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
+    HINTS ${GFORTRAN_LIBRARIES_DIR})
+
+  # Find OpenMP headers
+  find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
+
+else()
+  message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+# Required: libgfortran, libquadmath, path for gfortran libraries
+# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
+find_package_handle_standard_args(GFortranLibs
+  REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
+  VERSION_VAR GFORTRAN_VERSION_STRING)
+
+if(GFORTRANLIBS_FOUND)
+  message(STATUS "Looking for gfortran libraries -- found")
+  message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
+else()
+  message(STATUS "Looking for gfortran libraries -- not found")
+endif()
+
+mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
+  LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
+  GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
+# FindGFortranLIBS.cmake ends here
+
+
+message(STATUS LIBGFORTRAN_LIBRARIES= ${LIBGFORTRAN_LIBRARIES})
+message(STATUS LIBQUADMATH_LIBRARIES= ${LIBQUADMATH_LIBRARIES})
+message(STATUS LIBGOMP_LIBRARIES= ${LIBGOMP_LIBRARIES})
+message(STATUS LIBGOMP_INCLUDE_DIR= ${LIBGOMP_INCLUDE_DIR})
+message(STATUS GFORTRAN_LIBRARIES_DIR= ${GFORTRAN_LIBRARIES_DIR})
+message(STATUS GFORTRAN_INCLUDE_DIR= ${GFORTRAN_INCLUDE_DIR})
--- a/audio/cmake/external/openblas.cmake
+++ b/audio/cmake/external/openblas.cmake
@ -0,0 +1,119 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
+set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
+set(CBLAS_TAG v0.3.10)
+
+if(NOT WIN32)
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include"
+      CACHE PATH "openblas include directory." FORCE)
+  set(OPENBLAS_CC
+      "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+
+  if(APPLE)
+    set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+  endif()
+  set(OPTIONAL_ARGS "")
+  set(COMMON_ARGS "")
+
+  if(APPLE)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+      set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+    endif()
+    set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
+  endif()
+
+  # NPROC feeds `make -j` below. When it is empty the command degrades to a
+  # bare `-j`, i.e. an unbounded number of parallel jobs, so fall back to the
+  # host CPU count unless the caller already supplied a value.
+  if(NOT NPROC)
+    include(ProcessorCount)
+    ProcessorCount(NPROC)
+    if(NPROC EQUAL 0)
+      # ProcessorCount reports 0 when it cannot determine the count.
+      set(NPROC 1)
+    endif()
+  endif()
+
+  # Download the pinned OpenBLAS archive and build it in-source with its own
+  # Makefile (no configure step); the static library is declared as a
+  # byproduct so generators know where it comes from.
+  # GIT_SHALLOW is not passed: it only applies to GIT_REPOSITORY downloads.
+  ExternalProject_Add(
+    OPENBLAS
+    URL "https://paddleaudio.bj.bcebos.com/build/OpenBLAS-0.3.10.zip"
+    DOWNLOAD_DIR ${CBLAS_PREFIX_DIR}
+    SOURCE_DIR ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND make -j${NPROC} ${COMMON_ARGS} ${OPTIONAL_ARGS}
+    INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+
+    ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
+    set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
+    add_library(openblas STATIC IMPORTED)
+    add_dependencies(openblas OPENBLAS)
+    set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
+    set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
+
+    link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
+    include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
+
+    set(OPENBLAS_LIBRARIES
+        ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
+    )
+
+    add_library(libopenblas INTERFACE)
+    add_dependencies(libopenblas openblas)
+    target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
+    target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
+else()
+  set(CBLAS_LIBRARIES
+      "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+      CACHE FILEPATH "openblas library." FORCE)
+  set(CBLAS_INC_DIR
+      "${CBLAS_INSTALL_DIR}/include/openblas"
+      CACHE PATH "openblas include directory." FORCE)
+  ExternalProject_Add(
+    extern_openblas
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY ${CBLAS_REPOSITORY}
+    GIT_TAG ${CBLAS_TAG}
+    PREFIX ${CBLAS_PREFIX_DIR}
+    INSTALL_DIR ${CBLAS_INSTALL_DIR}
+    BUILD_IN_SOURCE 0
+    UPDATE_COMMAND ""
+    CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
+               -DCMAKE_CXX_COMPILER=clang-cl
+               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+               -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
+               -DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
+               -DCMAKE_MT=mt
+               -DUSE_THREAD=OFF
+               -DBUILD_WITHOUT_LAPACK=NO
+               -DCMAKE_Fortran_COMPILER=flang
+               -DNOFORTRAN=0
+               -DDYNAMIC_ARCH=ON
+               #${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS
+      -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
+      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+      -DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
+    # ninja need to know where openblas.lib comes from
+    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
+  set(OPENBLAS_SHARED_LIB
+      ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
+
+  add_library(openblas INTERFACE)
+  add_dependencies(openblas extern_openblas)
+  include_directories(${CBLAS_INC_DIR})
+  link_libraries(${CBLAS_LIBRARIES})
+endif()
+
--- a/audio/cmake/pybind.cmake
+++ b/audio/cmake/pybind.cmake
@ -0,0 +1,42 @@
+#the pybind11 is from:https://github.com/pybind/pybind11
+# Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+SET(PYBIND_ZIP "v2.10.0.zip")
+SET(LOCAL_PYBIND_ZIP ${FETCHCONTENT_BASE_DIR}/${PYBIND_ZIP})
+SET(PYBIND_SRC ${FETCHCONTENT_BASE_DIR}/pybind11)
+SET(DOWNLOAD_URL "https://paddleaudio.bj.bcebos.com/build/v2.10.0.zip")
+SET(PYBIND_TIMEOUT 600 CACHE STRING "Timeout in seconds when downloading pybind.")
+
+# Download the pybind11 archive once; later configures reuse the local copy.
+IF(NOT EXISTS ${LOCAL_PYBIND_ZIP})
+    FILE(DOWNLOAD ${DOWNLOAD_URL}
+      ${LOCAL_PYBIND_ZIP}
+      TIMEOUT ${PYBIND_TIMEOUT}
+      STATUS ERR
+      SHOW_PROGRESS
+    )
+
+    # STATUS is a two-element list: numeric result code, then a message.
+    LIST(GET ERR 0 ERR_CODE)
+    LIST(GET ERR 1 ERR_MSG)
+
+    IF(ERR_CODE EQUAL 0)
+        MESSAGE(STATUS "download pybind success")
+    ELSE()
+        # A failed transfer may leave an empty or partial file behind; remove
+        # it so the EXISTS guard above does not skip the retry next time.
+        FILE(REMOVE ${LOCAL_PYBIND_ZIP})
+        MESSAGE(FATAL_ERROR "download pybind fail: ${ERR_MSG}")
+    ENDIF()
+ENDIF()
+
+# Unpack the archive into FETCHCONTENT_BASE_DIR and move the versioned
+# directory to the stable PYBIND_SRC location.
+IF(NOT EXISTS ${PYBIND_SRC})
+    EXECUTE_PROCESS(
+      COMMAND ${CMAKE_COMMAND} -E tar xfz ${LOCAL_PYBIND_ZIP}
+       WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}
+       RESULT_VARIABLE tar_result
+    )
+
+    # Check the extraction result first: renaming a directory that was never
+    # created would abort with a less helpful error. EQUAL compares the exit
+    # code numerically (MATCHES would also accept e.g. "10").
+    IF(tar_result EQUAL 0)
+        MESSAGE(STATUS "unzip pybind success")
+    ELSE()
+        MESSAGE(FATAL_ERROR "unzip pybind fail")
+    ENDIF()
+
+    FILE(RENAME ${FETCHCONTENT_BASE_DIR}/pybind11-2.10.0 ${PYBIND_SRC})
+
+ENDIF()
+
+include_directories(${PYBIND_SRC}/include)
--- a/audio/cmake/summary.cmake
+++ b/audio/cmake/summary.cmake
@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Prints a configuration summary as a sequence of STATUS messages.
+#
+# Takes no arguments; every value is read from variables visible at the call
+# site. An unset variable simply prints as an empty value.
+# NOTE(review): the function name and the ONNX_*/ONNXIFI_*/Protobuf rows look
+# inherited from the ONNX project; confirm whether any of them are ever set in
+# this build.
+# NOTE(review): the Python rows read Python_* variables — verify they match
+# the variables the Python find_package() call in use actually defines.
+function (onnx_print_configuration_summary)
+  message(STATUS "")
+  message(STATUS "******** Summary ********")
+  message(STATUS "  CMake version             : ${CMAKE_VERSION}")
+  message(STATUS "  CMake command             : ${CMAKE_COMMAND}")
+  message(STATUS "  System                    : ${CMAKE_SYSTEM_NAME}")
+  message(STATUS "  C++ compiler              : ${CMAKE_CXX_COMPILER}")
+  message(STATUS "  C++ compiler version      : ${CMAKE_CXX_COMPILER_VERSION}")
+  message(STATUS "  CXX flags                 : ${CMAKE_CXX_FLAGS}")
+  message(STATUS "  Build type                : ${CMAKE_BUILD_TYPE}")
+  # Compile definitions registered on the project's top-level directory.
+  get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
+  message(STATUS "  Compile definitions       : ${tmp}")
+  message(STATUS "  CMAKE_PREFIX_PATH         : ${CMAKE_PREFIX_PATH}")
+  message(STATUS "  CMAKE_INSTALL_PREFIX      : ${CMAKE_INSTALL_PREFIX}")
+  message(STATUS "  CMAKE_MODULE_PATH         : ${CMAKE_MODULE_PATH}")
+  message(STATUS "")
+  message(STATUS "  ONNX version              : ${ONNX_VERSION}")
+  message(STATUS "  ONNX NAMESPACE            : ${ONNX_NAMESPACE}")
+  message(STATUS "  ONNX_USE_LITE_PROTO       : ${ONNX_USE_LITE_PROTO}")
+  message(STATUS "  USE_PROTOBUF_SHARED_LIBS  : ${ONNX_USE_PROTOBUF_SHARED_LIBS}")
+  message(STATUS "  Protobuf_USE_STATIC_LIBS  : ${Protobuf_USE_STATIC_LIBS}")
+  message(STATUS "  ONNX_DISABLE_EXCEPTIONS   : ${ONNX_DISABLE_EXCEPTIONS}")
+  message(STATUS "  ONNX_WERROR               : ${ONNX_WERROR}")
+  message(STATUS "  ONNX_BUILD_TESTS          : ${ONNX_BUILD_TESTS}")
+  message(STATUS "  ONNX_BUILD_BENCHMARKS     : ${ONNX_BUILD_BENCHMARKS}")
+  message(STATUS "  ONNXIFI_DUMMY_BACKEND     : ${ONNXIFI_DUMMY_BACKEND}")
+  message(STATUS "  ONNXIFI_ENABLE_EXT        : ${ONNXIFI_ENABLE_EXT}")
+  message(STATUS "")
+  message(STATUS "  Protobuf compiler         : ${PROTOBUF_PROTOC_EXECUTABLE}")
+  message(STATUS "  Protobuf includes         : ${PROTOBUF_INCLUDE_DIRS}")
+  message(STATUS "  Protobuf libraries        : ${PROTOBUF_LIBRARIES}")
+  message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
+  message(STATUS "    Python version        : ${Python_VERSION}")
+  message(STATUS "    Python executable     : ${Python_EXECUTABLE}")
+  message(STATUS "    Python includes       : ${Python_INCLUDE_DIR}")
+  message(STATUS "    Python libraries      : ${Python_LIBRARY}")
+  message(STATUS "  PYBIND11                  : ${pybind11_FOUND}")
+  message(STATUS "    Pybind11 version        : ${pybind11_VERSION}")
+  message(STATUS "    Pybind11 include        : ${pybind11_INCLUDE_DIR}")
+  message(STATUS "    Pybind11 includes       : ${pybind11_INCLUDE_DIRS}")
+  message(STATUS "    Pybind11 libraries      : ${pybind11_LIBRARIES}")
+endfunction()
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@ -0,0 +1,19 @@
+
+add_subdirectory(third_party)
+add_subdirectory(src)
+
+# Copy the GCC/gfortran runtime libraries found under GFORTRAN_LIBRARIES_DIR
+# into this directory's lib/ folder.
+if (APPLE)
+  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
+          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
+elseif (UNIX)
+  # Non-Apple Unix: copy each library together with its symlink chain.
+  foreach(runtime_lib libgfortran.so.5 libquadmath.so.0 libgcc_s.so.1)
+    file(COPY ${GFORTRAN_LIBRARIES_DIR}/${runtime_lib}
+            DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
+  endforeach()
+endif()
--- a/paddlespeech/audio/backends/init.py
+++ b/paddlespeech/audio/backends/init.py
@ -11,9 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
-from .soundfile_backend import load
-from .soundfile_backend import normalize
-from .soundfile_backend import resample
-from .soundfile_backend import save
-from .soundfile_backend import to_mono
+from . import _extension
+from . import backends
+from . import compliance
+from . import datasets
+from . import features
+from . import functional
+from . import metric
+from . import sox_effects
+from . import utils
--- a/audio/paddleaudio/_extension.py
+++ b/audio/paddleaudio/_extension.py
@ -0,0 +1,167 @@
+import contextlib
+import ctypes
+import os
+import sys
+import types
+import warnings
+from pathlib import Path
+
+from ._internal import module_utils as _mod_utils  # noqa: F401
+
+# Query `hasattr` only once.
+_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
+                                                               'setdlopenflags')
+
+
+@contextlib.contextmanager
+def dl_open_guard():
+    """
+    # https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html
+    Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a
+    shared library to load custom operators.
+    """
+    if _SET_GLOBAL_FLAGS:
+        old_flags = sys.getdlopenflags()
+        sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
+    yield
+    if _SET_GLOBAL_FLAGS:
+        sys.setdlopenflags(old_flags)
+
+
+def resolve_library_path(path: str) -> str:
+    return os.path.realpath(path)
+
+
+class _Ops(types.ModuleType):
+    #__file__ = '_ops.py'
+
+    def __init__(self):
+        super(_Ops, self).__init__('paddleaudio.ops')
+        self.loaded_libraries = set()
+
+    def load_library(self, path):
+        """
+        Loads a shared library from the given path into the current process.
+        This allows dynamically loading custom operators. For this, 
+        you should compile your operator and 
+        the static registration code into a shared library object, and then
+        call ``paddleaudio.ops.load_library('path/to/libcustom.so')`` to load the
+        shared object.
+        After the library is loaded, it is added to the
+        ``paddleaudio.ops.loaded_libraries`` attribute, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        path = resolve_library_path(path)
+        with dl_open_guard():
+            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
+            # Import the shared library into the process, thus running its
+            # static (global) initialization code in order to register custom
+            # operators with the JIT.
+            ctypes.CDLL(path)
+        self.loaded_libraries.add(path)
+
+
+_LIB_DIR = Path(__file__).parent / "lib"
+
+
+def _get_lib_path(lib: str):
+    suffix = "pyd" if os.name == "nt" else "so"
+    path = _LIB_DIR / f"{lib}.{suffix}"
+    return path
+
+
+def _load_lib(lib: str) -> bool:
+    """Load an extension library from the package's ``lib`` directory.
+    Note:
+        In case `paddleaudio` is deployed with `pex` format, the library file
+        is not in a standard location.
+        In this case, we expect that `libpaddleaudio` is available somewhere
+        in the search path of the dynamic loading mechanism, so that importing
+        `_paddleaudio` will have the library loader find and load `libpaddleaudio`.
+        This is the reason why the function does not raise an error when the
+        library file is not found.
+    Args:
+        lib (str): Library file name without its suffix.
+    Returns:
+        bool:
+            True if the library file is found AND the library loaded without failure.
+            False if the library file is not found (like in the case where paddleaudio
+            is deployed with pex format, thus the shared library file is
+            in a non-standard location); a warning is emitted in that case.
+            If the library file is found but there is an issue loading the library,
+            (such as missing dependency) then this function raises the exception as-is.
+    Raises:
+        Exception:
+            If the library file is found, but there is an issue loading the library file,
+            (when the underlying `ctypes.CDLL` throws an exception), this function will pass
+            the exception as-is, instead of catching it and returning bool.
+            The expected case is `OSError` thrown by `ctypes.CDLL` when a dynamic dependency
+            is not found.
+            This behavior was chosen because the expected failure case is not recoverable.
+            If a dependency is missing, then users have to install it.
+    """
+    path = _get_lib_path(lib)
+    if not path.exists():
+        warnings.warn("lib path is not exists:" + str(path))
+        return False
+    ops.load_library(path)
+    return True
+
+
+_FFMPEG_INITIALIZED = False
+
+
+def _init_ffmpeg():
+    # Initialize the FFmpeg integration at most once per process; the
+    # module-level flag below records a successful initialization.
+    global _FFMPEG_INITIALIZED
+    if _FFMPEG_INITIALIZED:
+        return
+
+    # NOTE(review): no `import paddleaudio` is visible in this module, so the
+    # bare `paddleaudio` name below looks unresolved — confirm. The attribute
+    # is also spelled `_paddlleaudio` (double "l") here, while the rest of the
+    # module uses `_paddleaudio`.
+    if not paddleaudio._paddlleaudio.is_ffmpeg_available():
+        raise RuntimeError(
+            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
+        )
+
+    # A missing FFmpeg shared library surfaces as OSError from the loader and
+    # is re-raised as ImportError with the original error chained.
+    try:
+        _load_lib("libpaddlleaudio_ffmpeg")
+    except OSError as err:
+        raise ImportError(
+            "FFmpeg libraries are not found. Please install FFmpeg.") from err
+
+    # NOTE(review): `paddllespeech` is presumably a typo of an existing
+    # package name — verify the intended import path.
+    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
+
+    # Cap the FFmpeg log level at 8 after initialization.
+    paddleaudio._paddlleaudio.ffmpeg_init()
+    if paddleaudio._paddlleaudio.ffmpeg_get_log_level() > 8:
+        paddleaudio._paddlleaudio.ffmpeg_set_log_level(8)
+
+    _FFMPEG_INITIALIZED = True
+
+
+def _init_extension():
+    """Load the C++ extension when present; warn and return when it is not."""
+    if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
+        warnings.warn(
+            "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!")
+        return
+
+    _load_lib("libpaddleaudio")
+    # This import is for initializing the methods registered via PyBind11
+    # This has to happen after the base library is loaded
+    try:
+        from paddleaudio import _paddleaudio  # noqa
+    except Exception:
+        warnings.warn(
+            "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!")
+        return
+
+    # Because this part is executed as part of `import paddleaudio`, we ignore the
+    # initialization failure.
+    # If the FFmpeg integration is not properly initialized, then detailed error
+    # will be raised when client code attempts to import the dedicated feature.
+    try:
+        _init_ffmpeg()
+    except Exception:
+        pass
+
+
+ops = _Ops()
+
+_init_extension()
--- a/audio/paddleaudio/_internal/init.py
+++ b/audio/paddleaudio/_internal/init.py
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@ -0,0 +1,151 @@
+import importlib.util
+import platform
+import warnings
+from functools import wraps
+from typing import Optional
+
+#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
+
+
+def is_module_available(*modules: str) -> bool:
+    r"""Return whether every module named in ``modules`` can be located
+    *without* importing it.
+
+    This is generally safer than a try-except block around ``import X``. It
+    avoids third party libraries breaking assumptions of some of our tests,
+    e.g., setting multiprocessing start method when imported
+    (see librosa/#747, torchvision/#544).
+
+    Note:
+        For a dotted name such as ``"pkg.sub"``, ``importlib.util.find_spec``
+        imports the parent package ``pkg`` in order to search it. If that
+        parent is missing or fails to import, the module is reported as
+        unavailable instead of letting the exception escape.
+
+    Args:
+        *modules (str): Absolute module names to probe.
+
+    Returns:
+        bool: ``True`` only if a spec was found for every name. ``True`` when
+        called with no names.
+    """
+
+    def _found(name: str) -> bool:
+        try:
+            return importlib.util.find_spec(name) is not None
+        except (ImportError, ValueError):
+            # ImportError/ModuleNotFoundError: the parent package of a dotted
+            # name is absent or broken. ValueError: the module is already in
+            # ``sys.modules`` but carries no usable ``__spec__``.
+            return False
+
+    return all(_found(m) for m in modules)
+
+
+def requires_module(*modules: str):
+    """Build a decorator that guards a function behind optional modules.
+
+    Availability is checked once, when this factory is called. If every module
+    is available the decorator returns the function untouched. Otherwise the
+    decorated function is replaced by a stub that raises ``RuntimeError``
+    naming the missing module(s), which is clearer for users than a
+    ``NameError:  name 'module' is not defined`` at some random place.
+    """
+    missing = [name for name in modules if not is_module_available(name)]
+
+    if missing:
+        if len(missing) == 1:
+            req = f"module: {missing[0]}"
+        else:
+            req = f"modules: {missing}"
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires {req}")
+
+            return wrapped
+
+        return decorator
+
+    # Nothing is missing, so there is no need to wrap the function.
+    def decorator(func):
+        return func
+
+    return decorator
+
+
+def deprecated(direction: str, version: Optional[str]=None):
+    """Build a decorator that warns on every call of a deprecated function.
+
+    Args:
+        direction (str): Migration steps to be given to users; appended to the
+            warning text.
+        version (str or int): The version when the object will be removed.
+            When ``None`` the message says "future" instead.
+
+    Returns:
+        A decorator. The wrapped function emits the warning (attributed to its
+        caller through ``stacklevel=2``) and then forwards to the original.
+    """
+    release = "future" if version is None else version
+
+    def decorator(func):
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            warnings.warn(
+                f"{func.__module__}.{func.__name__} has been deprecated "
+                f"and will be removed from {release} release. {direction}",
+                stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapped
+
+    return decorator
+
+
+def is_kaldi_available():
+    """Return whether the ``paddleaudio._paddleaudio`` extension module can be located.
+
+    The lookup is delegated to :func:`is_module_available`.
+    """
+    return is_module_available("paddleaudio._paddleaudio")
+
+
+def requires_kaldi():
+    """Build a decorator that guards a function behind kaldi support.
+
+    Availability is evaluated once, when this factory is called. If kaldi
+    support is present the function is returned unchanged; otherwise it is
+    replaced by a stub that raises ``RuntimeError`` when called.
+    """
+    if not is_kaldi_available():
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                message = f"{func.__module__}.{func.__name__} requires libpaddleaudio build with kaldi"
+                raise RuntimeError(message)
+
+            return wrapped
+
+        return decorator
+
+    def decorator(func):
+        return func
+
+    return decorator
+
+
+def _check_soundfile_importable():
+    """Report whether ``soundfile`` is both installed and importable.
+
+    Returns ``False`` without importing anything when no ``soundfile`` module
+    can be located. When the module is found but importing it fails for any
+    reason, a warning is emitted and ``False`` is returned.
+    """
+    if is_module_available("soundfile"):
+        try:
+            import soundfile  # noqa: F401
+        except Exception:
+            warnings.warn(
+                "Failed to import soundfile. 'soundfile' backend is not available.")
+            return False
+        return True
+    return False
+
+
+# Evaluated once when this module is imported; the result is cached so later
+# availability queries do not retry the import.
+_is_soundfile_importable = _check_soundfile_importable()
+
+
+def is_soundfile_available():
+    """Return the cached result of the import-time ``soundfile`` importability check."""
+    return _is_soundfile_importable
+
+
+def requires_soundfile():
+    """Build a decorator that guards a function behind the ``soundfile`` package.
+
+    Availability is evaluated once, when this factory is called. If
+    ``soundfile`` is usable the function is returned unchanged; otherwise it is
+    replaced by a stub that raises ``RuntimeError`` when called.
+    """
+    if not is_soundfile_available():
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                message = f"{func.__module__}.{func.__name__} requires soundfile"
+                raise RuntimeError(message)
+
+            return wrapped
+
+        return decorator
+
+    def decorator(func):
+        return func
+
+    return decorator
+
+
+def is_sox_available():
+    """Return whether sox support can be used.
+
+    Always ``False`` on Windows (sox is not supported there); elsewhere it is
+    the availability of the ``paddleaudio._paddleaudio`` extension module.
+    """
+    on_windows = platform.system() == "Windows"
+    return (not on_windows) and is_module_available("paddleaudio._paddleaudio")
+
+
+def requires_sox():
+    """Build a decorator that guards a function behind sox support.
+
+    Availability is evaluated once, when this factory is called. If sox support
+    is present the function is returned unchanged; otherwise it is replaced by
+    a stub that raises ``RuntimeError`` when called.
+    """
+    if not is_sox_available():
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                message = f"{func.__module__}.{func.__name__} requires libpaddleaudio build with sox"
+                raise RuntimeError(message)
+
+            return wrapped
+
+        return decorator
+
+    def decorator(func):
+        return func
+
+    return decorator
--- a/audio/paddleaudio/backends/init.py
+++ b/audio/paddleaudio/backends/init.py
@ -11,3 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from . import utils
+from .soundfile_backend import depth_convert
+from .soundfile_backend import normalize
+from .soundfile_backend import resample
+from .soundfile_backend import soundfile_load
+from .soundfile_backend import soundfile_save
+from .soundfile_backend import to_mono
+from .utils import get_audio_backend
+from .utils import list_audio_backends
+from .utils import set_audio_backend
+
+# Import-time side effect: initialize the audio backend state for this package.
+# NOTE(review): presumably selects a default backend -- confirm in `utils`.
+utils._init_audio_backend()
--- a/audio/paddleaudio/backends/common.py
+++ b/audio/paddleaudio/backends/common.py
@ -0,0 +1,55 @@
+# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
+
+class AudioInfo:
+    """Metadata container returned by the ``info`` functions.
+
+    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
+    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+            * ``PCM_S``: Signed integer linear PCM
+            * ``PCM_U``: Unsigned integer linear PCM
+            * ``PCM_F``: Floating point linear PCM
+            * ``FLAC``: Flac, Free Lossless Audio Codec
+            * ``ULAW``: Mu-law
+            * ``ALAW``: A-law
+            * ``MP3`` : MP3, MPEG-1 Audio Layer III
+            * ``VORBIS``: OGG Vorbis
+            * ``AMR_WB``: Adaptive Multi-Rate Wideband
+            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+            * ``OPUS``: Opus
+            * ``HTK``: Single channel 16-bit PCM
+            * ``UNKNOWN`` : None of above
+    """
+
+    def __init__(
+        self,
+        sample_rate: int,
+        num_frames: int,
+        num_channels: int,
+        bits_per_sample: int,
+        encoding: str,
+    ):
+        """Store the given values unchanged on the instance."""
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        """Return a one-line, human-readable summary of all fields."""
+        # Use the real class name; the text previously said "AudioMetaData",
+        # a leftover from the upstream class this one was derived from.
+        return (
+            f"{type(self).__name__}("
+            f"sample_rate={self.sample_rate}, "
+            f"num_frames={self.num_frames}, "
+            f"num_channels={self.num_channels}, "
+            f"bits_per_sample={self.bits_per_sample}, "
+            f"encoding={self.encoding}"
+            f")"
+        )
--- a/audio/paddleaudio/backends/no_backend.py
+++ b/audio/paddleaudio/backends/no_backend.py
@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import Callable
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from paddle import Tensor
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
+
+
+def load(
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Placeholder ``load`` for when no audio I/O backend is available.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
+    """Placeholder ``save`` for when no audio I/O backend is available.
+
+    All arguments are ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def info(filepath: str) -> None:
+    """Placeholder ``info`` for when no audio I/O backend is available.
+
+    The argument is ignored.
+
+    Raises:
+        RuntimeError: Always.
+    """
+    raise RuntimeError("No audio I/O backend is available.")
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@ -0,0 +1,677 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import warnings
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import resampy
+import soundfile
+from scipy.io import wavfile
+
+from ..utils import depth_convert
+from ..utils import ParameterError
+from .common import AudioInfo
+
+# Names exported by `from ... import *`.
+__all__ = [
+    'resample',
+    'to_mono',
+    'normalize',
+    'save',
+    'soundfile_save',
+    'load',
+    'soundfile_load',
+    'info',
+]
+# Normalization schemes implemented by `normalize` (the misspelled name is kept
+# because it is a module-level identifier).
+NORMALMIZE_TYPES = ['linear', 'gaussian']
+# Channel-merge strategies accepted by `to_mono`.
+MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
+# Filter names accepted by `resample`.
+RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
+# Small constant used by `normalize` to keep its divisors away from zero.
+EPS = 1e-8
+
+
+def resample(y: np.ndarray,
+             src_sr: int,
+             target_sr: int,
+             mode: str='kaiser_fast') -> np.ndarray:
+    """Audio resampling.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        src_sr (int): Source sample rate.
+        target_sr (int): Target sample rate.
+        mode (str, optional): The resampling filter to use, one of
+            ``RESAMPLE_MODES``. Defaults to 'kaiser_fast'.
+
+    Returns:
+        np.ndarray: `y` resampled to `target_sr`
+
+    Raises:
+        ParameterError: If `y` is not a ``np.ndarray`` or `mode` is not one of
+            ``RESAMPLE_MODES``.
+    """
+
+    if mode == 'kaiser_best':
+        # Adjacent literals instead of a backslash continuation, so the source
+        # indentation does not leak into the warning text.
+        warnings.warn(
+            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, '
+            'we recommend the mode kaiser_fast in large scale audio trainning')
+
+    if not isinstance(y, np.ndarray):
+        # f-string so the offending type is actually interpolated.
+        raise ParameterError(
+            f'Only support numpy np.ndarray, but received y in {type(y)}')
+
+    if mode not in RESAMPLE_MODES:
+        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
+
+    return resampy.resample(y, src_sr, target_sr, filter=mode)
+
+
+def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
+    """Convert stereo audio to mono.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D, or 2D with the channels on
+            the first axis (``y[0]`` and ``y[1]`` are the two channels used).
+        merge_type (str, optional): Merge type to generate mono waveform, one
+            of ``MERGE_TYPES``. Defaults to 'average'.
+
+    Returns:
+        np.ndarray: `y` with mono channel. A 1D input is returned as is, and a
+        2D input holding a single channel is returned as that channel.
+
+    Raises:
+        ParameterError: If `merge_type` is unknown, `y` has more than two
+            dimensions, or (for 'average') the dtype is neither floating point
+            nor int16/int8.
+    """
+
+    if merge_type not in MERGE_TYPES:
+        raise ParameterError(
+            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
+        )
+    if y.ndim > 2:
+        raise ParameterError(
+            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
+    if y.ndim == 1:  # nothing to merge
+        return y
+    if y.shape[0] == 1:
+        # A lone channel stored as 2D: there is no second channel to index, so
+        # every merge type reduces to returning that channel.
+        return y[0]
+
+    if merge_type == 'ch0':
+        return y[0]
+    if merge_type == 'ch1':
+        return y[1]
+    if merge_type == 'random':
+        return y[np.random.randint(0, 2)]
+
+    # 'average': the arithmetic depends on the dtype.
+    if np.issubdtype(y.dtype, np.floating):
+        y_out = (y[0] + y[1]) * 0.5
+    elif y.dtype == 'int16' or y.dtype == 'int8':
+        # Widen first so the sum of two samples cannot overflow, then floor
+        # divide and cast back; the mean of two in-range values is in range.
+        widened = y.astype('int32')
+        y_out = ((widened[0] + widened[1]) // 2).astype(y.dtype)
+    else:
+        raise ParameterError(f'Unsupported dtype: {y.dtype}')
+    return y_out
+
+
+def soundfile_load_(file: os.PathLike,
+                    offset: Optional[float]=None,
+                    dtype: str='int16',
+                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
+    """Read a waveform through the soundfile library (libsndfile).
+
+    Args:
+        file (os.PathLike): File of waveform.
+        offset (Optional[float], optional): Offset to the start of waveform; it is
+            multiplied by the native sample rate to get a frame index. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
+        duration (Optional[int], optional): Duration of waveform to read; it is
+            multiplied by the native sample rate to get a frame count. Defaults to None,
+            which reads everything that remains.
+
+    Returns:
+        Tuple[np.ndarray, int]: The transposed array returned by the reader, and
+        the file's native samplerate.
+    """
+    with soundfile.SoundFile(file) as handle:
+        native_sr = handle.samplerate
+        if offset:
+            handle.seek(int(offset * native_sr))
+        # -1 asks the reader for all remaining frames.
+        num_frames = -1 if duration is None else int(duration * native_sr)
+        samples = handle.read(frames=num_frames, dtype=dtype, always_2d=False)
+
+    return samples.T, native_sr
+
+
+def normalize(y: np.ndarray, norm_type: str='linear',
+              mul_factor: float=1.0) -> np.ndarray:
+    """Normalize an input audio with additional multiplier.
+
+    ``'linear'`` divides by the peak absolute value (plus ``EPS``);
+    ``'gaussian'`` subtracts the mean and divides by the standard deviation
+    (floored at ``EPS``). Either result is scaled by `mul_factor`.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+
+    Returns:
+        np.ndarray: `y` after normalization.
+
+    Raises:
+        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.
+    """
+    if norm_type == 'linear':
+        peak = np.max(np.abs(y))
+        scale = 1.0 / (peak + EPS)
+        return y * scale * mul_factor
+
+    if norm_type == 'gaussian':
+        mean = np.mean(y)
+        spread = max(np.std(y), EPS)
+        return mul_factor * (y - mean) / spread
+
+    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
+
+
+def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
+    """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        sr (int): Sample rate.
+        file (os.PathLike): Path of audio file to save; a ``str`` or any
+            path-like object such as ``pathlib.Path``. Must end in ``.wav``.
+
+    Raises:
+        ParameterError: If the path does not end in ``.wav`` or `sr` is not
+            positive.
+    """
+    # Resolve path-like objects to their string form first: `pathlib.Path` has
+    # no `endswith`, so the extension check below would otherwise fail with
+    # AttributeError for the very type the annotation advertises.
+    file = os.fspath(file)
+
+    if not file.endswith('.wav'):
+        raise ParameterError(
+            f'only .wav file supported, but dst file name is: {file}')
+
+    if sr <= 0:
+        raise ParameterError(
+            f'Sample rate should be larger than 0, recieved sr = {sr}')
+
+    if y.dtype not in ['int16', 'int8']:
+        warnings.warn(
+            f'input data type is {y.dtype}, will convert data to int16 format before saving'
+        )
+        y_out = depth_convert(y, 'int16')
+    else:
+        y_out = y
+
+    wavfile.write(file, sr, y_out)
+
+
+def soundfile_load(
+        file: os.PathLike,
+        sr: Optional[int]=None,
+        mono: bool=True,
+        merge_type: str='average',  # ch0,ch1,random,average
+        normal: bool=True,
+        norm_type: str='linear',
+        norm_mul_factor: float=1.0,
+        offset: float=0.0,
+        duration: Optional[int]=None,
+        dtype: str='float32',
+        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
+    """Load an audio file from disk and post-process it.
+
+    The waveform is read with ``soundfile_load_`` and then, in this order:
+    optionally merged to mono, optionally resampled to `sr`, normalized, and
+    converted to `dtype`.
+
+    Args:
+        file (os.PathLike): Path of audio file to load.
+        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None,
+            which keeps the file's own rate.
+        mono (bool, optional): Return waveform with mono channel. Defaults to True.
+        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
+        normal (bool, optional): Waveform normalization. Defaults to True.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
+        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
+
+    Raises:
+        ParameterError: If the loaded waveform contains no samples.
+    """
+    waveform, rate = soundfile_load_(
+        file, offset=offset, dtype=dtype, duration=duration)
+
+    nonempty_1d = waveform.ndim == 1 and len(waveform) > 0
+    nonempty_2d = waveform.ndim == 2 and len(waveform[0]) > 0
+    if not (nonempty_1d or nonempty_2d):
+        raise ParameterError(f'audio file {file} looks empty')
+
+    if mono:
+        waveform = to_mono(waveform, merge_type)
+
+    needs_resampling = sr is not None and sr != rate
+    if needs_resampling:
+        waveform = resample(waveform, rate, sr, mode=resample_mode)
+        rate = sr
+
+    if normal:
+        waveform = normalize(waveform, norm_type, norm_mul_factor)
+    elif dtype in ['int8', 'int16']:
+        # Integer targets are still normalized before the depth conversion.
+        waveform = normalize(waveform, 'linear', 1.0)
+
+    return depth_convert(waveform, dtype), rate
+
+
+#the code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modification.
+
+
+def _get_subtype_for_wav(dtype: paddle.dtype,
+                         encoding: str,
+                         bits_per_sample: int):
+    """Pick the soundfile subtype string to use when writing a WAV file.
+
+    Without an encoding, the subtype comes from `bits_per_sample` when given,
+    else from `dtype`. With an encoding, only the bit depths that encoding
+    allows are accepted.
+
+    Raises:
+        ValueError: For an unsupported dtype, encoding, or encoding/bit-depth
+            combination.
+    """
+    if not encoding:
+        if bits_per_sample:
+            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
+        by_dtype = {
+            paddle.uint8: "PCM_U8",
+            paddle.int16: "PCM_16",
+            paddle.int32: "PCM_32",
+            paddle.float32: "FLOAT",
+            paddle.float64: "DOUBLE",
+        }
+        inferred = by_dtype.get(dtype)
+        if inferred:
+            return inferred
+        raise ValueError(f"Unsupported dtype for wav: {dtype}")
+
+    unset_or_8_bit = bits_per_sample in (None, 8)
+
+    if encoding == "PCM_S":
+        if bits_per_sample == 8:
+            raise ValueError("wav does not support 8-bit signed PCM encoding.")
+        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
+
+    if encoding == "PCM_U":
+        if not unset_or_8_bit:
+            raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
+        return "PCM_U8"
+
+    if encoding == "PCM_F":
+        if bits_per_sample in (None, 32):
+            return "FLOAT"
+        if bits_per_sample != 64:
+            raise ValueError(
+                "wav only supports 32/64-bit float PCM encoding.")
+        return "DOUBLE"
+
+    if encoding == "ULAW":
+        if not unset_or_8_bit:
+            raise ValueError("wav only supports 8-bit mu-law encoding.")
+        return "ULAW"
+
+    if encoding == "ALAW":
+        if not unset_or_8_bit:
+            raise ValueError("wav only supports 8-bit a-law encoding.")
+        return "ALAW"
+
+    raise ValueError(f"wav does not support {encoding}.")
+
+
+def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
+    """Pick the soundfile subtype string to use when writing a SPHERE file.
+
+    ``None`` and ``"PCM_S"`` give ``PCM_<bits>`` (``PCM_32`` when no bit depth
+    is given), ``"ULAW"`` is accepted only with an unset or 8-bit depth, and
+    ``"ALAW"`` is accepted with any depth.
+
+    Raises:
+        ValueError: For any other encoding, or for mu-law with a bit depth
+            other than 8.
+    """
+    if encoding is None or encoding == "PCM_S":
+        if bits_per_sample:
+            return f"PCM_{bits_per_sample}"
+        return "PCM_32"
+
+    if encoding == "PCM_U" or encoding == "PCM_F":
+        raise ValueError(f"sph does not support {encoding} encoding.")
+
+    if encoding == "ULAW":
+        if bits_per_sample is not None and bits_per_sample != 8:
+            raise ValueError("sph only supports 8-bit for mu-law encoding.")
+        return "ULAW"
+
+    if encoding == "ALAW":
+        return "ALAW"
+
+    raise ValueError(f"sph does not support {encoding}.")
+
+
+def _get_subtype(dtype: paddle.dtype,
+                 format: str,
+                 encoding: str,
+                 bits_per_sample: int):
+    """Pick the soundfile subtype string for the given container format.
+
+    WAV and SPHERE are delegated to their dedicated helpers; flac, ogg/vorbis
+    and nis/nist are resolved here.
+
+    Raises:
+        ValueError: For an unknown format, or for an encoding/bit depth the
+            format does not accept.
+    """
+    if format == "wav":
+        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
+    elif format == "flac":
+        if encoding:
+            raise ValueError("flac does not support encoding.")
+        if not bits_per_sample:
+            return "PCM_16"
+        if bits_per_sample > 24:
+            raise ValueError("flac does not support bits_per_sample > 24.")
+        if bits_per_sample == 8:
+            return "PCM_S8"
+        return f"PCM_{bits_per_sample}"
+    elif format in ("ogg", "vorbis"):
+        if encoding or bits_per_sample:
+            raise ValueError(
+                "ogg/vorbis does not support encoding/bits_per_sample.")
+        return "VORBIS"
+    elif format == "sph":
+        return _get_subtype_for_sphere(encoding, bits_per_sample)
+    elif format in ("nis", "nist"):
+        return "PCM_16"
+    else:
+        raise ValueError(f"Unsupported format: {format}")
+
+
+def save(
+        filepath: str,
+        src: paddle.Tensor,
+        sample_rate: int,
+        channels_first: bool=True,
+        compression: Optional[float]=None,
+        format: Optional[str]=None,
+        encoding: Optional[str]=None,
+        bits_per_sample: Optional[int]=None, ):
+    """Save audio data to file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats;
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (str or pathlib.Path): Path to audio file.
+        src (paddle.Tensor): Audio data to save. must be 2D tensor.
+        sample_rate (int): sampling rate
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
+        compression (float or None, optional): Not used.
+            It is here only for interface compatibility reason with "sox_io" backend.
+        format (str or None, optional): Override the audio format.
+            When ``filepath`` argument is path-like object, audio format is
+            inferred from file extension. If the file extension is missing or
+            different, you can specify the correct format with this argument.
+
+            When ``filepath`` argument is file-like object,
+            this argument is required.
+
+            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
+            ``"flac"`` and ``"sph"``.
+        encoding (str or None, optional): Changes the encoding for supported formats.
+            This argument is effective only for supported formats, such as
+            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;
+
+                - ``"PCM_S"`` (signed integer Linear PCM)
+                - ``"PCM_U"`` (unsigned integer Linear PCM)
+                - ``"PCM_F"`` (floating point PCM)
+                - ``"ULAW"`` (mu-law)
+                - ``"ALAW"`` (a-law)
+
+        bits_per_sample (int or None, optional): Changes the bit depth for the
+            supported formats.
+            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
+            you can change the bit depth.
+            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
+
+    Raises:
+        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is not one of the
+            valid values, or the format/encoding/bit-depth combination is
+            not supported.
+        RuntimeError: If ``filepath`` is a file-like object and ``format`` is
+            not given.
+
+    Supported formats/encodings/bit depth/compression are:
+
+    ``"wav"``
+        - 32-bit floating-point PCM
+        - 32-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 8-bit unsigned integer PCM
+        - 8-bit mu-law
+        - 8-bit a-law
+
+        Note:
+            Default encoding/bit depth is determined by the dtype of
+            the input Tensor.
+
+    ``"flac"``
+        - 8-bit
+        - 16-bit (default)
+        - 24-bit
+
+    ``"ogg"``, ``"vorbis"``
+        - Doesn't accept changing configuration.
+
+    ``"sph"``
+        - 8-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 32-bit signed integer PCM (default)
+        - 8-bit mu-law
+        - 8-bit a-law
+        - 16-bit a-law
+        - 24-bit a-law
+        - 32-bit a-law
+
+    """
+    if src.ndim != 2:
+        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
+    if compression is not None:
+        warnings.warn(
+            '`save` function of "soundfile" backend does not support "compression" parameter. '
+            "The argument is silently ignored.")
+    # Anything with a `write` attribute is treated as a file-like object; it
+    # has no extension, so the caller must name the format explicitly.
+    if hasattr(filepath, "write"):
+        if format is None:
+            raise RuntimeError(
+                "`format` is required when saving to file object.")
+        ext = format.lower()
+    else:
+        # For paths, the subtype lookup below is keyed on the text after the
+        # last dot of the path, not on `format`.
+        ext = str(filepath).split(".")[-1].lower()
+
+    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
+        raise ValueError("Invalid bits_per_sample.")
+    if bits_per_sample == 24:
+        warnings.warn(
+            "Saving audio with 24 bits per sample might warp samples near -1. "
+            "Using 16 bits per sample might be able to avoid this.")
+    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
+
+    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
+    # so we extend the extensions manually here
+    if ext in ["nis", "nist", "sph"] and format is None:
+        format = "NIST"
+
+    # Transpose a `[channel, time]` tensor so the data handed to soundfile is
+    # `[time, channel]`.
+    if channels_first:
+        src = src.t()
+
+    soundfile.write(
+        file=filepath,
+        data=src,
+        samplerate=sample_rate,
+        subtype=subtype,
+        format=format)
+
+
+# Maps a soundfile subtype name to the dtype string that `load` requests from
+# the reader when normalization is off for a WAV file. Subtypes missing here
+# make `load` raise ValueError in that mode.
+_SUBTYPE2DTYPE = {
+    "PCM_S8": "int8",
+    "PCM_U8": "uint8",
+    "PCM_16": "int16",
+    "PCM_32": "int32",
+    "FLOAT": "float32",
+    "DOUBLE": "float64",
+}
+
+
+def load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
+    """Load audio data from file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats;
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+    ``float32`` dtype and the shape of `[channel, time]`.
+    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
+
+    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
+    by providing ``normalize=False``, this function can return integer Tensor, where the samples
+    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
+    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
+
+    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
+    ``flac`` and ``mp3``.
+    For these formats, this function always returns ``float32`` Tensor with values normalized to
+    ``[-1.0, 1.0]``.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        frame_offset (int, optional):
+            Number of frames to skip before start reading data.
+        num_frames (int, optional):
+            Maximum number of frames to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+            This function may return fewer frames if there are not enough
+            frames in the given file.
+        normalize (bool, optional):
+            When ``True``, this function always return ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type.
+            This argument has no effect for formats other than integer WAV type.
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        (paddle.Tensor, int): Resulting Tensor and sample rate.
+            If the input file has integer wav format and normalization is off, then it has
+            integer type, else ``float32`` type. If ``channels_first=True``, it has
+            `[channel, time]` else `[time, channel]`.
+
+    Raises:
+        ValueError: If ``normalize`` is ``False`` and the WAV subtype has no
+            entry in ``_SUBTYPE2DTYPE``.
+    """
+    with soundfile.SoundFile(filepath, "r") as file_:
+        # Everything except un-normalized WAV is read as float32.
+        if file_.format != "WAV" or normalize:
+            dtype = "float32"
+        elif file_.subtype not in _SUBTYPE2DTYPE:
+            raise ValueError(f"Unsupported subtype: {file_.subtype}")
+        else:
+            dtype = _SUBTYPE2DTYPE[file_.subtype]
+
+        # NOTE(review): `_prepare_read` is a private SoundFile method; it is
+        # used here to turn (frame_offset, num_frames) into the frame count
+        # passed to `read` -- confirm it still exists when upgrading soundfile.
+        frames = file_._prepare_read(frame_offset, None, num_frames)
+        waveform = file_.read(frames, dtype, always_2d=True)
+        sample_rate = file_.samplerate
+
+    waveform = paddle.to_tensor(waveform)
+    # Swap the two axes of the always-2D read result when `[channel, time]`
+    # is requested.
+    if channels_first:
+        waveform = paddle.transpose(waveform, perm=[1, 0])
+    return waveform, sample_rate
+
+
+# Mapping from soundfile subtype to number of bits per sample.
+# This is mostly heuristic and the value is set to 0 when it is irrelevant
+# (lossy formats) or when it can't be inferred.
+# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
+# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
+# the default seems to be 8 bits but it can be compressed further to 4 bits.
+# The dict is inspired from
+# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
+# Looked up by `_get_bit_depth`, which falls back to 0 for unknown subtypes.
+_SUBTYPE_TO_BITS_PER_SAMPLE = {
+    "PCM_S8": 8,  # Signed 8 bit data
+    "PCM_16": 16,  # Signed 16 bit data
+    "PCM_24": 24,  # Signed 24 bit data
+    "PCM_32": 32,  # Signed 32 bit data
+    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
+    "FLOAT": 32,  # 32 bit float data
+    "DOUBLE": 64,  # 64 bit float data
+    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "IMA_ADPCM": 0,  # IMA ADPCM.
+    "MS_ADPCM": 0,  # Microsoft ADPCM.
+    "GSM610":
+    0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
+    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
+    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
+    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
+    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
+    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
+    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
+    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
+    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
+    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
+    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
+    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
+    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
+    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
+    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
+    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
+}
+
+
+def _get_bit_depth(subtype):
+    """Return the bits per sample recorded for `subtype`.
+
+    Subtypes missing from ``_SUBTYPE_TO_BITS_PER_SAMPLE`` trigger a warning
+    and yield 0.
+    """
+    if subtype in _SUBTYPE_TO_BITS_PER_SAMPLE:
+        return _SUBTYPE_TO_BITS_PER_SAMPLE[subtype]
+
+    warnings.warn(
+        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
+        "attribute will be set to 0. If you are seeing this warning, please "
+        "report by opening an issue on github (after checking for existing/closed ones). "
+        "You may otherwise ignore this warning.")
+    return 0
+
+
+# Maps a soundfile subtype name to the encoding label reported by `info`.
+# Looked up by `_get_encoding`, which reports "UNKNOWN" for any other subtype.
+_SUBTYPE_TO_ENCODING = {
+    "PCM_S8": "PCM_S",
+    "PCM_16": "PCM_S",
+    "PCM_24": "PCM_S",
+    "PCM_32": "PCM_S",
+    "PCM_U8": "PCM_U",
+    "FLOAT": "PCM_F",
+    "DOUBLE": "PCM_F",
+    "ULAW": "ULAW",
+    "ALAW": "ALAW",
+    "VORBIS": "VORBIS",
+}
+
+
+def _get_encoding(format: str, subtype: str):
+    """Return the encoding label for a soundfile format/subtype pair.
+
+    FLAC files are always labelled ``"FLAC"``; everything else is looked up by
+    subtype, with ``"UNKNOWN"`` as the fallback.
+    """
+    is_flac = format == "FLAC"
+    return "FLAC" if is_flac else _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
+
+
+def info(filepath: str, format: Optional[str]=None) -> AudioInfo:
+    """Get signal information of an audio file.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        AudioInfo: meta data of the given audio, built from what
+        ``soundfile.info`` reports.
+
+    """
+    sinfo = soundfile.info(filepath)
+    bit_depth = _get_bit_depth(sinfo.subtype)
+    encoding_label = _get_encoding(sinfo.format, sinfo.subtype)
+    return AudioInfo(
+        sample_rate=sinfo.samplerate,
+        num_frames=sinfo.frames,
+        num_channels=sinfo.channels,
+        bits_per_sample=bit_depth,
+        encoding=encoding_label, )
--- a/audio/paddleaudio/backends/sox_io_backend.py
+++ b/audio/paddleaudio/backends/sox_io_backend.py
@ -0,0 +1,106 @@
+import os
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddleaudio
+from paddle import Tensor
+from paddleaudio._internal import module_utils as _mod_utils
+
+from .common import AudioInfo
+
+#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
+
+
+def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
+    """Fallback for path-based ``info``: always raises ``RuntimeError``."""
+    message = "Failed to fetch metadata from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
+    """Fallback for file-object ``info``: always raises ``RuntimeError``."""
+    message = "Failed to fetch metadata from {}".format(fileobj)
+    raise RuntimeError(message)
+
+
+# Note: must comply with TorchScript syntax -- annotations are required and f-strings are not allowed
+def _fail_load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Fallback for path-based ``load``: always raises ``RuntimeError``.
+
+    The signature mirrors ``load`` so it can be called with the same
+    arguments; only ``filepath`` is used, in the error message.
+    """
+    message = "Failed to load audio from {}".format(filepath)
+    raise RuntimeError(message)
+
+
+def _fail_load_fileobj(fileobj, *args, **kwargs):
+    """Fallback for file-object ``load``: always raises ``RuntimeError``.
+
+    Extra positional and keyword arguments are accepted and ignored.
+    """
+    message = "Failed to load audio from {}".format(fileobj)
+    raise RuntimeError(message)
+
+
+# Fallback hooks: load() and info() call these when the native sox binding
+# returns None instead of a result.
+_fallback_info = _fail_info
+_fallback_info_fileobj = _fail_info_fileobj
+_fallback_load = _fail_load
+# load() refers to this hook as `_fallback_load_fileobj`. The original
+# `_fallback_load_filebj` spelling left that name undefined, so a failed
+# file-object load raised NameError instead of the intended RuntimeError.
+_fallback_load_fileobj = _fail_load_fileobj
+# Misspelled alias retained so any existing reference keeps working.
+_fallback_load_filebj = _fallback_load_fileobj
+
+
+@_mod_utils.requires_sox()
+def load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Load audio through the native sox bindings.
+
+    Args:
+        filepath: Path-like object, or any object exposing ``read``, which
+            is treated as a file-like source.
+        frame_offset: Forwarded unchanged to the native loader.
+        num_frames: Forwarded unchanged to the native loader.
+        normalize: Forwarded unchanged to the native loader.
+        channels_first: Forwarded unchanged to the native loader.
+        format: Forwarded unchanged to the native loader.
+
+    Returns:
+        Tuple[Tensor, int]: The first element of the native result converted
+        with ``paddle.to_tensor`` and the second element as-is (presumably
+        the sample rate -- confirm against the binding). When the native
+        loader returns ``None`` the matching fallback hook is called instead.
+    """
+    if hasattr(filepath, "read"):
+        # File-like source: hand the object itself to the binding.
+        decoded = paddleaudio._paddleaudio.load_audio_fileobj(
+            filepath, frame_offset, num_frames, normalize, channels_first,
+            format)
+        if decoded is None:
+            return _fallback_load_fileobj(filepath, frame_offset, num_frames,
+                                          normalize, channels_first, format)
+        return (paddle.to_tensor(decoded[0]), decoded[1])
+
+    # Path-like source: normalise to a plain string path first.
+    path = os.fspath(filepath)
+    decoded = paddleaudio._paddleaudio.sox_io_load_audio_file(
+        path, frame_offset, num_frames, normalize, channels_first, format)
+    if decoded is None:
+        return _fallback_load(path, frame_offset, num_frames, normalize,
+                              channels_first, format)
+    return (paddle.to_tensor(decoded[0]), decoded[1])
+
+
+@_mod_utils.requires_sox()
+def save(
+        filepath: str,
+        src: Tensor,
+        sample_rate: int,
+        channels_first: bool=True,
+        compression: Optional[float]=None,
+        format: Optional[str]=None,
+        encoding: Optional[str]=None,
+        bits_per_sample: Optional[int]=None, ):
+    """Write a tensor as audio through the native sox bindings.
+
+    Args:
+        filepath: Path-like object, or any object exposing ``write``, which
+            is treated as a file-like destination.
+        src (Tensor): Audio data; converted with ``src.numpy()`` before
+            being handed to the binding.
+        sample_rate: Forwarded unchanged to the native writer.
+        channels_first: Forwarded unchanged to the native writer.
+        compression: Forwarded unchanged to the native writer.
+        format: Forwarded unchanged to the native writer.
+        encoding: Forwarded unchanged to the native writer.
+        bits_per_sample: Forwarded unchanged to the native writer.
+    """
+    samples = src.numpy()
+    options = (sample_rate, channels_first, compression, format, encoding,
+               bits_per_sample)
+    if not hasattr(filepath, "write"):
+        # Path-like destination: normalise to a plain string path first.
+        path = os.fspath(filepath)
+        paddleaudio._paddleaudio.sox_io_save_audio_file(path, samples,
+                                                        *options)
+        return
+    paddleaudio._paddleaudio.save_audio_fileobj(filepath, samples, *options)
+
+
+@_mod_utils.requires_sox()
+def info(
+        filepath: str,
+        format: Optional[str]=None, ) -> AudioInfo:
+    """Fetch audio metadata through the native sox bindings.
+
+    Args:
+        filepath: Path-like object, or any object exposing ``read``, which
+            is treated as a file-like source.
+        format: Forwarded unchanged to the native call.
+
+    Returns:
+        AudioInfo: Built by unpacking the native result. When the native
+        call returns ``None`` the matching fallback hook is called instead.
+    """
+    if hasattr(filepath, "read"):
+        fields = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
+        if fields is None:
+            return _fallback_info_fileobj(filepath, format)
+        return AudioInfo(*fields)
+
+    path = os.fspath(filepath)
+    fields = paddleaudio._paddleaudio.get_info_file(path, format)
+    if fields is None:
+        return _fallback_info(path, format)
+    return AudioInfo(*fields)
--- a/audio/paddleaudio/backends/utils.py
+++ b/audio/paddleaudio/backends/utils.py
@ -0,0 +1,83 @@
+"""Defines utilities for switching audio backends"""
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
+import warnings
+from typing import List
+from typing import Optional
+
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+
+from . import no_backend
+from . import soundfile_backend
+from . import sox_io_backend
+
+__all__ = [
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
+
+
+def list_audio_backends() -> List[str]:
+    """Report which audio I/O backends can be used right now.
+
+    Returns:
+        List[str]: ``"soundfile"`` and/or ``"sox_io"``, in that order, for
+        each backend whose availability check succeeds.
+    """
+    # Checks are wrapped in lambdas so they run lazily, in listing order.
+    probes = (
+        ("soundfile", lambda: _mod_utils.is_module_available("soundfile")),
+        ("sox_io", lambda: _mod_utils.is_sox_available()),
+    )
+    return [name for name, is_usable in probes if is_usable()]
+
+
+def set_audio_backend(backend: Optional[str]):
+    """Select the module that provides ``paddleaudio.save/load/info``.
+
+    Args:
+        backend (str or None): ``"sox_io"`` or ``"soundfile"``, subject to
+            what ``list_audio_backends()`` reports. ``None`` installs the
+            ``no_backend`` implementations.
+
+    Raises:
+        RuntimeError: ``backend`` is not ``None`` and is not currently
+            available.
+        NotImplementedError: ``backend`` is reported as available but has
+            no module associated with it here.
+    """
+    if backend is not None:
+        if backend not in list_audio_backends():
+            raise RuntimeError(f'Backend "{backend}" is not one of '
+                               f"available backends: {list_audio_backends()}.")
+
+    implementations = {
+        None: no_backend,
+        "sox_io": sox_io_backend,
+        "soundfile": soundfile_backend,
+    }
+    if backend not in implementations:
+        raise NotImplementedError(f'Unexpected backend "{backend}"')
+    module = implementations[backend]
+
+    # Rebind the package-level I/O entry points to the chosen module.
+    for func in ("save", "load", "info"):
+        setattr(paddleaudio, func, getattr(module, func))
+
+
+def _init_audio_backend():
+    """Pick the default backend: soundfile first, then sox_io, else none.
+
+    When neither backend is available a warning is emitted and the backend
+    is set to ``None``.
+    """
+    available = list_audio_backends()
+    for preferred in ("soundfile", "sox_io"):
+        if preferred in available:
+            set_audio_backend(preferred)
+            return
+    warnings.warn("No audio backend is available.")
+    set_audio_backend(None)
+
+
+def get_audio_backend() -> Optional[str]:
+    """Identify the active backend by inspecting ``paddleaudio.load``.
+
+    Returns:
+        Optional[str]: ``"sox_io"`` or ``"soundfile"``, or ``None`` when the
+        ``no_backend`` implementation is installed.
+
+    Raises:
+        ValueError: ``paddleaudio.load`` matches none of the known backends.
+    """
+    candidates = (
+        (no_backend, None),
+        (sox_io_backend, "sox_io"),
+        (soundfile_backend, "soundfile"),
+    )
+    for module, name in candidates:
+        if paddleaudio.load == module.load:
+            return name
+    raise ValueError("Unknown backend.")
--- a/paddlespeech/audio/compliance/init.py
+++ b/paddlespeech/audio/compliance/init.py
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
--- a/paddlespeech/audio/datasets/init.py
+++ b/paddlespeech/audio/datasets/init.py
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@ -16,7 +16,7 @@ from typing import List
 import numpy as np
 import paddle

-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
--- a/paddlespeech/audio/datasets/esc50.py
+++ b/paddlespeech/audio/datasets/esc50.py
@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['ESC50']
--- a/paddlespeech/audio/datasets/gtzan.py
+++ b/paddlespeech/audio/datasets/gtzan.py
@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['GTZAN']
--- a/paddlespeech/audio/datasets/hey_snips.py
+++ b/paddlespeech/audio/datasets/hey_snips.py
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@ -20,8 +20,8 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm

-from ..backends import load as load_audio
-from ..backends import save as save_wav
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..backends.soundfile_backend import soundfile_save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
--- a/paddlespeech/audio/datasets/tess.py
+++ b/paddlespeech/audio/datasets/tess.py
@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['TESS']
--- a/paddlespeech/audio/datasets/urban_sound.py
+++ b/paddlespeech/audio/datasets/urban_sound.py
@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple

-from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset

 __all__ = ['UrbanSound8K']
--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@ -23,7 +23,7 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm

-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress
--- a/paddlespeech/audio/features/init.py
+++ b/paddlespeech/audio/features/init.py
--- a/paddlespeech/audio/features/layers.py
+++ b/paddlespeech/audio/features/layers.py
--- a/paddlespeech/audio/functional/init.py
+++ b/paddlespeech/audio/functional/init.py
--- a/paddlespeech/audio/functional/functional.py
+++ b/paddlespeech/audio/functional/functional.py
--- a/paddlespeech/audio/functional/window.py
+++ b/paddlespeech/audio/functional/window.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -18,127 +18,156 @@ from typing import Union
 import paddle
 from paddle import Tensor

-__all__ = [
-    'get_window',
-]

+class WindowFunctionRegister(object):
+    """Registry that maps a function's ``__name__`` to the function object."""
+
+    def __init__(self):
+        # name -> function, filled in by the decorator from ``register()``.
+        self._functions_dict = dict()
 
+    def register(self):
+        """Return a decorator that records the decorated function.
+
+        The function is stored under its own ``__name__`` and returned
+        unchanged, so decorating does not alter how it is called.
+        """
+        def add_subfunction(func):
+            name = func.__name__
+            self._functions_dict[name] = func
+            return func
+
+        return add_subfunction
+
+    def get(self, name):
+        """Return the function registered as ``name``.
+
+        Raises:
+            KeyError: No function was registered under ``name``.
+        """
+        return self._functions_dict[name]
+
+
+window_function_register = WindowFunctionRegister()
+
+
+@window_function_register.register()
 def _cat(x: List[Tensor], data_type: str) -> Tensor:
    l = [paddle.to_tensor(_, data_type) for _ in x]
    return paddle.concat(l)


+@window_function_register.register()
 def _acosh(x: Union[Tensor, float]) -> Tensor:
    if isinstance(x, float):
        return math.log(x + math.sqrt(x**2 - 1))
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))


+@window_function_register.register()
 def _extend(M: int, sym: bool) -> bool:
-    """Extend window by 1 sample if needed for DFT-even symmetry. """
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
    if not sym:
        return M + 1, True
    else:
        return M, False


+@window_function_register.register()
 def _len_guards(M: int) -> bool:
-    """Handle small or incorrect window lengths. """
+    """Handle small or incorrect window lengths."""
    if int(M) != M or M < 0:
        raise ValueError('Window length M must be a non-negative integer')

    return M <= 1


+@window_function_register.register()
 def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Truncate window by 1 sample if needed for DFT-even symmetry. """
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
    if needed:
        return w[:-1]
    else:
        return w


-def _general_gaussian(M: int, p, sig, sym: bool=True,
-                      dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_gaussian(
+    M: int, p, sig, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a window with a generalized Gaussian shape.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+    w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p))

    return _truncate(w, needs_trunc)


-def _general_cosine(M: int, a: float, sym: bool=True,
-                    dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_cosine(
+    M: int, a: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
+    w = paddle.zeros((M,), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)


-def _general_hamming(M: int, alpha: float, sym: bool=True,
-                     dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _general_hamming(
+    M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a generalized Hamming window.
    This function is consistent with scipy.signal.windows.general_hamming()
    """
-    return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+    return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype)


-def _taylor(M: int,
-            nbar=4,
-            sll=30,
-            norm=True,
-            sym: bool=True,
-            dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _taylor(
+    M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
-    B = 10**(sll / 20)
+    B = 10 ** (sll / 20)
    A = _acosh(B) / math.pi
-    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
    ma = paddle.arange(1, nbar, dtype=dtype)

-    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    Fm = paddle.empty((nbar - 1,), dtype=dtype)
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
-        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
-                                                           ))
+        numer = signs[mi] * paddle.prod(
+            1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
+        )
        if mi == 0:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
-                mi] / m2[mi + 1:])
+            denom = (
+                2
+                * paddle.prod(1 - m2[mi] / m2[:mi])
+                * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
+            )

        Fm[mi] = numer / denom

    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
-            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
+        )

    w = W(paddle.arange(0, M, dtype=dtype))

@ -150,7 +179,8 @@ def _taylor(M: int,
    return _truncate(w, needs_trunc)


-def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
    non-zero endpoints, optimized to minimize the nearest side lobe.
@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.54, sym, dtype=dtype)


-def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hann window.
    The Hann window is a taper formed by using a raised cosine or sine-squared
    with ends that touch zero.
@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.5, sym, dtype=dtype)


-def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _tukey(
+    M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Tukey window.
    The Tukey window is also known as a tapered cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)

    if alpha <= 0:
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    elif alpha >= 1.0:
        return hann(M, sym=sym)

@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:

    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
-    n1 = n[0:width + 1]
-    n2 = n[width + 1:M - width - 1]
-    n3 = n[M - width - 1:]
+    n1 = n[0 : width + 1]
+    n2 = n[width + 1 : M - width - 1]
+    n3 = n[M - width - 1 :]

    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
-    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
-                                          (M - 1))))
+    w3 = 0.5 * (
+        1
+        + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
+    )
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)


-def _kaiser(M: int, beta: float, sym: bool=True,
-            dtype: str='float64') -> Tensor:
-    """Compute a Kaiser window.
-    The Kaiser window is a taper formed by using a Bessel function.
-    """
-    raise NotImplementedError()
-
-
-def _gaussian(M: int, std: float, sym: bool=True,
-              dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _gaussian(
+    M: int, std: float, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
    """Compute a Gaussian window.
    The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    sig2 = 2 * std * std
-    w = paddle.exp(-n**2 / sig2)
+    w = paddle.exp(-(n**2) / sig2)

    return _truncate(w, needs_trunc)


-def _exponential(M: int,
-                 center=None,
-                 tau=1.,
-                 sym: bool=True,
-                 dtype: str='float64') -> Tensor:
-    """Compute an exponential (or Poisson) window. """
+@window_function_register.register()
+def _exponential(
+    M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
+) -> Tensor:
+    """Compute an exponential (or Poisson) window."""
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    if center is None:
@ -240,11 +269,11 @@ def _exponential(M: int,
    return _truncate(w, needs_trunc)


-def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a triangular window.
-    """
+@window_function_register.register()
+def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
+    """Compute a triangular window."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)


-def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Bohman window.
    The Bohman window is the autocorrelation of a cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
-        math.pi * fac)
+        math.pi * fac
+    )
    w = _cat([0, w, 0], dtype)

    return _truncate(w, needs_trunc)


-def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
+def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Blackman window.
    The Blackman window is a taper formed by using the first three terms of
    a summation of cosines. It was designed to have close to the minimal
@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)


-def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
-    """Compute a window with a simple cosine shape.
-    """
+@window_function_register.register()
+def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
+    """Compute a window with a simple cosine shape."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
-    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5))

    return _truncate(w, needs_trunc)


-def get_window(window: Union[str, Tuple[str, float]],
-               win_length: int,
-               fftbins: bool=True,
-               dtype: str='float64') -> Tensor:
+def get_window(
+    window: Union[str, Tuple[str, float]],
+    win_length: int,
+    fftbins: bool = True,
+    dtype: str = 'float64',
+) -> Tensor:
    """Return a window of a given length and type.

    Args:
-        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            n_fft = 512
+            cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
+
+            std = 7
+            gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
    """
    sym = not fftbins

@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]],
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
-            raise ValueError("The '" + window + "' window needs one or "
-                             "more parameters -- pass a tuple.")
+            raise ValueError(
+                "The '" + window + "' window needs one or "
+                "more parameters -- pass a tuple."
+            )
        else:
            winstr = window
    else:
-        raise ValueError("%s as window type is not supported." %
-                         str(type(window)))
+        raise ValueError(
+            "%s as window type is not supported." % str(type(window))
+        )

    try:
-        winfunc = eval('_' + winstr)
+        winfunc = window_function_register.get('_' + winstr)
    except KeyError as e:
        raise ValueError("Unknown window type.") from e

-    params = (win_length, ) + args
+    params = (win_length,) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
--- a/tests/unit/audio/backends/soundfile/init.py
+++ b/tests/unit/audio/backends/soundfile/init.py
@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .kaldi import fbank
+from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
@ -0,0 +1,132 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddleaudio
+from paddleaudio._internal import module_utils
+
+__all__ = [
+    'fbank',
+    'pitch',
+]
+
+
+@module_utils.requires_kaldi()
+def fbank(
+        wav,
+        samp_freq: int=16000,
+        frame_shift_ms: float=10.0,
+        frame_length_ms: float=25.0,
+        dither: float=0.0,
+        preemph_coeff: float=0.97,
+        remove_dc_offset: bool=True,
+        window_type: str='povey',
+        round_to_power_of_two: bool=True,
+        blackman_coeff: float=0.42,
+        snip_edges: bool=True,
+        allow_downsample: bool=False,
+        allow_upsample: bool=False,
+        max_feature_vectors: int=-1,
+        num_bins: int=23,
+        low_freq: float=20,
+        high_freq: float=0,
+        vtln_low: float=100,
+        vtln_high: float=-500,
+        debug_mel: bool=False,
+        htk_mode: bool=False,
+        use_energy: bool=False,  # fbank opts
+        energy_floor: float=0.0,
+        raw_energy: bool=True,
+        htk_compat: bool=False,
+        use_log_fbank: bool=True,
+        use_power: bool=True):
+    """Compute filter-bank features with the native Kaldi binding.
+
+    Each keyword argument is copied onto the attribute of the same name on
+    one of three native option objects, which are then passed to
+    ``ComputeFbank`` together with ``wav``.
+
+    Args:
+        wav: Input handed to ``ComputeFbank`` unchanged.
+        samp_freq .. max_feature_vectors: ``FrameExtractionOptions`` fields.
+        num_bins .. htk_mode: ``MelBanksOptions`` fields.
+        use_energy .. use_power: ``FbankOptions`` fields.
+
+    Returns:
+        The value returned by ``ComputeFbank``.
+    """
+    frame_opts = paddleaudio._paddleaudio.FrameExtractionOptions()
+    mel_opts = paddleaudio._paddleaudio.MelBanksOptions()
+    fbank_opts = paddleaudio._paddleaudio.FbankOptions()
+
+    frame_settings = {
+        "samp_freq": samp_freq,
+        "frame_shift_ms": frame_shift_ms,
+        "frame_length_ms": frame_length_ms,
+        "dither": dither,
+        "preemph_coeff": preemph_coeff,
+        "remove_dc_offset": remove_dc_offset,
+        "window_type": window_type,
+        "round_to_power_of_two": round_to_power_of_two,
+        "blackman_coeff": blackman_coeff,
+        "snip_edges": snip_edges,
+        "allow_downsample": allow_downsample,
+        "allow_upsample": allow_upsample,
+        "max_feature_vectors": max_feature_vectors,
+    }
+    mel_settings = {
+        "num_bins": num_bins,
+        "low_freq": low_freq,
+        "high_freq": high_freq,
+        "vtln_low": vtln_low,
+        "vtln_high": vtln_high,
+        "debug_mel": debug_mel,
+        "htk_mode": htk_mode,
+    }
+    fbank_settings = {
+        "use_energy": use_energy,
+        "energy_floor": energy_floor,
+        "raw_energy": raw_energy,
+        "htk_compat": htk_compat,
+        "use_log_fbank": use_log_fbank,
+        "use_power": use_power,
+    }
+
+    # Apply the settings group by group, in declaration order.
+    for opts, settings in ((frame_opts, frame_settings),
+                           (mel_opts, mel_settings),
+                           (fbank_opts, fbank_settings)):
+        for field, value in settings.items():
+            setattr(opts, field, value)
+
+    return paddleaudio._paddleaudio.ComputeFbank(frame_opts, mel_opts,
+                                                 fbank_opts, wav)
+
+
+@module_utils.requires_kaldi()
+def pitch(wav,
+          samp_freq: int=16000,
+          frame_shift_ms: float=10.0,
+          frame_length_ms: float=25.0,
+          preemph_coeff: float=0.0,
+          min_f0: int=50,
+          max_f0: int=400,
+          soft_min_f0: float=10.0,
+          penalty_factor: float=0.1,
+          lowpass_cutoff: int=1000,
+          resample_freq: int=4000,
+          delta_pitch: float=0.005,
+          nccf_ballast: int=7000,
+          lowpass_filter_width: int=1,
+          upsample_filter_width: int=5,
+          max_frames_latency: int=0,
+          frames_per_chunk: int=0,
+          simulate_first_pass_online: bool=False,
+          recompute_frame: int=500,
+          nccf_ballast_online: bool=False,
+          snip_edges: bool=True):
+    """Compute pitch features with the native Kaldi binding.
+
+    Each keyword argument is copied onto the attribute of the same name on
+    a ``PitchExtractionOptions`` object, which is then passed to
+    ``ComputeKaldiPitch`` together with ``wav``.
+
+    Args:
+        wav: Input handed to ``ComputeKaldiPitch`` unchanged.
+        samp_freq .. snip_edges: ``PitchExtractionOptions`` fields.
+
+    Returns:
+        The value returned by ``ComputeKaldiPitch``.
+    """
+    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+    settings = {
+        "samp_freq": samp_freq,
+        "frame_shift_ms": frame_shift_ms,
+        "frame_length_ms": frame_length_ms,
+        "preemph_coeff": preemph_coeff,
+        "min_f0": min_f0,
+        "max_f0": max_f0,
+        "soft_min_f0": soft_min_f0,
+        "penalty_factor": penalty_factor,
+        "lowpass_cutoff": lowpass_cutoff,
+        "resample_freq": resample_freq,
+        "delta_pitch": delta_pitch,
+        "nccf_ballast": nccf_ballast,
+        "lowpass_filter_width": lowpass_filter_width,
+        "upsample_filter_width": upsample_filter_width,
+        "max_frames_latency": max_frames_latency,
+        "frames_per_chunk": frames_per_chunk,
+        "simulate_first_pass_online": simulate_first_pass_online,
+        "recompute_frame": recompute_frame,
+        "nccf_ballast_online": nccf_ballast_online,
+        "snip_edges": snip_edges,
+    }
+    # Assign in declaration order, exactly one attribute per argument.
+    for field, value in settings.items():
+        setattr(pitch_opts, field, value)
+
+    return paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
--- a/paddlespeech/audio/metric/init.py
+++ b/paddlespeech/audio/metric/init.py
--- a/paddlespeech/audio/metric/eer.py
+++ b/paddlespeech/audio/metric/eer.py
--- a/audio/paddleaudio/sox_effects/init.py
+++ b/audio/paddleaudio/sox_effects/init.py
@ -0,0 +1,21 @@
+from paddleaudio._internal import module_utils as _mod_utils
+
+from .sox_effects import apply_effects_file
+from .sox_effects import apply_effects_tensor
+from .sox_effects import effect_names
+from .sox_effects import init_sox_effects
+from .sox_effects import shutdown_sox_effects
+
+# Import-time side effect: when `_mod_utils.is_sox_available()` reports that
+# sox can be used, initialise the sox effects once and register the matching
+# shutdown hook so it runs at interpreter exit.
+if _mod_utils.is_sox_available():
+    import atexit
+
+    init_sox_effects()
+    atexit.register(shutdown_sox_effects)
+
+__all__ = [
+    "init_sox_effects",
+    "shutdown_sox_effects",
+    "effect_names",
+    "apply_effects_tensor",
+    "apply_effects_file",
+]
--- a/audio/paddleaudio/sox_effects/sox_effects.py
+++ b/audio/paddleaudio/sox_effects/sox_effects.py
@ -0,0 +1,241 @@
+import os
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+from paddleaudio.utils.sox_utils import list_effects
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
+
+
+@_mod_utils.requires_sox()
+def init_sox_effects():
+    """Initialize resources required to use sox effects.
+
+    Thin wrapper around the native ``sox_effects_initialize_sox_effects``
+    binding of ``paddleaudio._paddleaudio``.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    Once initialized, you do not need to call this function again across multiple uses of
+    sox effects, though it is safe to do so as long as :func:`shutdown_sox_effects` has not
+    been called yet. Once :func:`shutdown_sox_effects` is called, you can no longer use SoX
+    effects, and initializing again will result in an error.
+    """
+    paddleaudio._paddleaudio.sox_effects_initialize_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def shutdown_sox_effects():
+    """Clean up resources required to use sox effects.
+
+    Thin wrapper around the native ``sox_effects_shutdown_sox_effects``
+    binding of ``paddleaudio._paddleaudio``.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    It is safe to call this function multiple times.
+    Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects, and
+    initializing again will result in an error.
+    """
+    paddleaudio._paddleaudio.sox_effects_shutdown_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def effect_names() -> List[str]:
+    """Get the list of valid sox effect names.
+
+    The names are the keys of the mapping returned by
+    ``paddleaudio.utils.sox_utils.list_effects``.
+
+    Returns:
+        List[str]: list of available effect names.
+
+    Example
+        >>> paddleaudio.sox_effects.effect_names()
+        ['allpass', 'band', 'bandpass', ... ]
+    """
+    return list(list_effects().keys())
+
+
+@_mod_utils.requires_sox()
+def apply_effects_tensor(
+        tensor: paddle.Tensor,
+        sample_rate: int,
+        effects: List[List[str]],
+        channels_first: bool=True, ) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to given Tensor
+
+    .. devices:: CPU
+
+    Note:
+        This function only works on CPU Tensors.
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` command adds certain effects automatically (such as
+        ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does
+        only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
+        need to give ``rate`` effect with desired sampling rate.).
+
+    Args:
+        tensor (paddle.Tensor): Input 2D CPU Tensor.
+        sample_rate (int): Sample rate
+        effects (List[List[str]]): List of effects.
+        channels_first (bool, optional): Indicates if the input Tensor's dimension is
+            `[channels, time]` or `[time, channels]`
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        The resulting Tensor has the same ``dtype`` as the input Tensor, and
+        the same channels order. The shape of the Tensor can be different based on the
+        effects applied. Sample rate can also be different based on the effects applied.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Generate pseudo wave:
+        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+        >>> sample_rate = 16000
+        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
+        >>> waveform.shape
+        paddle.Size([2, 16000])
+        >>> waveform
+        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
+                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
+        >>>
+        >>> # Apply effects
+        >>> waveform, sample_rate = apply_effects_tensor(
+        ...     wave_form, sample_rate, effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> # The new waveform is sampling rate 8000, 1 second.
+        >>> # normalization and channel order are preserved
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
+                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
+        >>> sample_rate
+        8000
+
+    """
+    tensor_np = tensor.numpy()
+    ret = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor(tensor_np, sample_rate,
+                                                       effects, channels_first)
+    if ret is not None:
+        return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to apply sox effect")
+
+
+@_mod_utils.requires_sox()
+def apply_effects_file(
+        path: str,
+        effects: List[List[str]],
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to the audio file and load the resulting data as Tensor
+
+    Note:
+        This function works in the way very similar to ``sox`` command, however there are slight
+        differences. For example, ``sox`` commnad adds certain effects automatically (such as
+        ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
+        effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
+        effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
+        rate and leave samples untouched.
+
+    Args:
+        path (path-like object or file-like object):
+        effects (List[List[str]]): List of effects.
+        normalize (bool, optional):
+            When ``True``, this function always return ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension,
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+        If ``normalize=False`` and the input audio file is of integer WAV file, then the
+        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
+        otherwise `[time, channel]`.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Apply effects and load data with channels_first=True
+        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> waveform.shape
+        paddle.Size([2, 8000])
+        >>> waveform
+        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
+                 -1.4761e-07,  1.8114e-07],
+                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
+                 -5.6159e-07,  4.8103e-07]])
+        >>> sample_rate
+        8000
+
+    Example - Apply random speed perturbation to dataset
+        >>>
+        >>> # Load data from file, apply random speed perturbation
+        >>> class RandomPerturbationFile(paddle.utils.data.Dataset):
+        ...     \"\"\"Given flist, apply random speed perturbation
+        ...
+        ...     Suppose all the input files are at least one second long.
+        ...     \"\"\"
+        ...     def __init__(self, flist: List[str], sample_rate: int):
+        ...         super().__init__()
+        ...         self.flist = flist
+        ...         self.sample_rate = sample_rate
+        ...
+        ...     def __getitem__(self, index):
+        ...         speed = 0.5 + 1.5 * random.randn()
+        ...         effects = [
+        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
+        ...             ['remix', '-'],  # merge all the channels
+        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
+        ...             ['rate', f'{self.sample_rate}'],
+        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+        ...             ['trim', '0', '2'],  # get the first 2 seconds
+        ...         ]
+        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
+        ...             self.flist[index], effects)
+        ...         return waveform
+        ...
+        ...     def __len__(self):
+        ...         return len(self.flist)
+        ...
+        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
+        >>> loader = paddle.utils.data.DataLoader(dataset, batch_size=32)
+        >>> for batch in loader:
+        >>>     pass
+    """
+    if hasattr(path, "read"):
+        ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize,
+                                                channels_first, format)
+        if ret is None:
+            raise RuntimeError("Failed to load audio from {}".format(path))
+        return (paddle.to_tensor(ret[0]), ret[1])
+    path = os.fspath(path)
+    ret = paddleaudio._paddleaudio.sox_effects_apply_effects_file(path, effects, normalize,
+                                                     channels_first, format)
+    if ret is not None:
+        return (paddle.to_tensor(ret[0]), ret[1])
+    raise RuntimeError("Failed to load audio from {}".format(path))
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@ -0,0 +1,217 @@
+# MSVC: export every symbol of the shared libraries built below.
+if(MSVC)
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+# macOS: give shared libraries the ".so" suffix instead of the platform default.
+if(APPLE)
+  set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
+endif()
+
+################################################################################
+# libpaddleaudio
+################################################################################
+# Inputs of the core shared library (libpaddleaudio). The optional backends
+# below append to these lists.
+set(LIBPADDLEAUDIO_SOURCES utils.cpp)
+set(LIBPADDLEAUDIO_INCLUDE_DIRS ${PROJECT_SOURCE_DIR})
+set(LIBPADDLEAUDIO_LINK_LIBRARIES)
+set(LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
+
+#------------------------------------------------------------------------------#
+# START OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+
+# SoX backend: link libsox and define INCLUDE_SOX.
+if(BUILD_SOX)
+  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libsox)
+  # No additional core sources for SoX; kept as an (empty) extension point.
+  list(APPEND LIBPADDLEAUDIO_SOURCES)
+  list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS INCLUDE_SOX)
+endif()
+
+
+# Kaldi backend: link libkaldi and define INCLUDE_KALDI and
+# COMPILE_WITHOUT_OPENFST.
+if(BUILD_KALDI)
+  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libkaldi)
+  list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS
+    INCLUDE_KALDI
+    COMPILE_WITHOUT_OPENFST
+  )
+endif()
+
+#------------------------------------------------------------------------------#
+# END OF CUSTOMIZATION LOGICS
+#------------------------------------------------------------------------------#
+
+# define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
+#
+# Creates the shared library <name> without the "lib" prefix (".pyd" suffix
+# under MSVC) and installs it into "lib". Each list argument is expected as a
+# single quoted, semicolon-separated string.
+function(define_library name source include_dirs link_libraries compile_defs)
+  add_library(${name} SHARED ${source})
+  target_include_directories(${name} PRIVATE ${include_dirs})
+  # NOTE(review): keyword-less signature; switching to PUBLIC/PRIVATE would
+  # have to be done for every target_link_libraries() call on this target.
+  target_link_libraries(${name} ${link_libraries})
+  target_compile_definitions(${name} PRIVATE ${compile_defs})
+
+  set_target_properties(${name} PROPERTIES PREFIX "")
+  if(MSVC)
+    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
+  endif()
+
+  install(
+    TARGETS ${name}
+    LIBRARY DESTINATION lib
+    RUNTIME DESTINATION lib  # For Windows
+    )
+endfunction()
+
+
+# Build the core shared library from the lists assembled above.
+define_library(
+  libpaddleaudio
+  "${LIBPADDLEAUDIO_SOURCES}"
+  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
+  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
+  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+)
+
+if (APPLE)
+  # Rewrite the libgcc_s reference so it is resolved next to the library at
+  # load time. $<TARGET_FILE:...> addresses the built library wherever the
+  # output directory is, and VERBATIM keeps the argument quoting portable.
+  add_custom_command(
+    TARGET libpaddleaudio POST_BUILD
+    COMMAND install_name_tool -change
+            "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
+            "@loader_path/libgcc_s.1.1.dylib"
+            "$<TARGET_FILE:libpaddleaudio>"
+    VERBATIM)
+endif()
+
+if (UNIX AND NOT APPLE)
+  # Look up dependent shared libraries in the library's own directory.
+  set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
+endif()
+
+# Link item(s) other targets can use to link against libpaddleaudio.
+# NOTE(review): the non-Apple branch also covers Windows, where the
+# -Wl,--no-as-needed flags are GNU-ld specific -- confirm AUDIO_LIBRARY is not
+# consumed by MSVC builds.
+if (APPLE)
+  set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
+else()
+  set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
+endif()
+
+  ################################################################################
+# _paddleaudio.so
+################################################################################
+if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
+# On Windows the extension must link against the Python import library.
+# REQUIRED makes a missing/mismatched Python fail here with a clear message
+# instead of later, when the Python3::Python target is linked.
+if (WIN32)
+  find_package(Python3 ${PYTHON_VERSION} EXACT REQUIRED COMPONENTS Development)
+  set(ADDITIONAL_ITEMS Python3::Python)
+endif()
+# define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
+#
+# Creates the Python extension module <name> as a shared library without the
+# "lib" prefix (".pyd" suffix under MSVC) and installs it into ".". Each list
+# argument is expected as a single quoted, semicolon-separated string.
+function(define_extension name sources include_dirs libraries definitions)
+  add_library(${name} SHARED ${sources})
+  target_compile_definitions(${name} PRIVATE "${definitions}")
+  target_include_directories(
+    ${name}
+    PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${Python_INCLUDE_DIR}
+    ${pybind11_INCLUDE_DIR}
+    ${include_dirs}
+    )
+  target_link_libraries(${name} ${libraries} ${PYTHON_LIBRARY} ${ADDITIONAL_ITEMS})
+
+  set_target_properties(${name} PROPERTIES PREFIX "")
+  if(MSVC)
+    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
+  endif()
+  if(APPLE)
+    # Leave Python symbols unresolved at link time; see
+    # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
+    # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
+    set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+  endif()
+
+  install(
+    TARGETS ${name}
+    LIBRARY DESTINATION .
+    RUNTIME DESTINATION .  # For Windows
+    )
+endfunction()
+
+# Sources of the _paddleaudio Python extension; the optional backends below
+# append their binding sources.
+set(EXTENSION_SOURCES pybind/pybind.cpp)
+#----------------------------------------------------------------------------#
+# START OF CUSTOMIZATION LOGICS
+#----------------------------------------------------------------------------#
+if(BUILD_SOX)
+  list(APPEND EXTENSION_SOURCES
+    pybind/sox/effects.cpp
+    pybind/sox/effects_chain.cpp
+    pybind/sox/io.cpp
+    pybind/sox/types.cpp
+    pybind/sox/utils.cpp
+    )
+endif()
+
+if(BUILD_KALDI)
+  list(APPEND EXTENSION_SOURCES
+    pybind/kaldi/kaldi_feature_wrapper.cc
+    pybind/kaldi/kaldi_feature.cc
+    )
+endif()
+#----------------------------------------------------------------------------#
+# END OF CUSTOMIZATION LOGICS
+#----------------------------------------------------------------------------#
+# The extension links against libpaddleaudio and reuses its compile
+# definitions; no extra include directories are passed.
+define_extension(
+  _paddleaudio
+  "${EXTENSION_SOURCES}"
+  ""
+  libpaddleaudio
+  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
+  )
+# if(BUILD_CTC_DECODER)
+#   set(
+#     DECODER_EXTENSION_SOURCES
+#     decoder/bindings/pybind.cpp
+#     )
+#   define_extension(
+#     _paddleaudio_decoder
+#     "${DECODER_EXTENSION_SOURCES}"
+#     ""
+#     "libpaddleaudio_decoder"
+#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+#     )
+# endif()
+# if(USE_FFMPEG)
+#   set(
+#     FFMPEG_EXTENSION_SOURCES
+#     ffmpeg/pybind/typedefs.cpp
+#     ffmpeg/pybind/pybind.cpp
+#     ffmpeg/pybind/stream_reader.cpp
+#     )
+#   define_extension(
+#     _paddleaudio_ffmpeg
+#     "${FFMPEG_EXTENSION_SOURCES}"
+#     "${FFMPEG_INCLUDE_DIRS}"
+#     "libpaddleaudio_ffmpeg"
+#     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
+#     )
+# endif()
+endif()
+
+# The steps below operate on the _paddleaudio target, which is only created
+# when BUILD_PADDLEAUDIO_PYTHON_EXTENSION is enabled. Guard on the target so
+# that configuring without the Python extension does not fail on a missing
+# target.
+if (TARGET _paddleaudio)
+  if (APPLE)
+    # Rewrite the libgcc_s reference so it is resolved from the "lib"
+    # directory next to the extension at load time. $<TARGET_FILE:...>
+    # addresses the built module wherever the output directory is, and
+    # VERBATIM keeps the argument quoting portable.
+    add_custom_command(
+      TARGET _paddleaudio POST_BUILD
+      COMMAND install_name_tool -change
+              "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
+              "@loader_path/lib/libgcc_s.1.1.dylib"
+              "$<TARGET_FILE:_paddleaudio>"
+      VERBATIM)
+  endif()
+
+  if (UNIX AND NOT APPLE)
+    # Look up dependent shared libraries in the "lib" directory next to the
+    # extension module.
+    set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
+  endif()
+endif()
--- a/audio/paddleaudio/src/optional/COPYING
+++ b/audio/paddleaudio/src/optional/COPYING
@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
--- a/audio/paddleaudio/src/optional/optional.hpp
+++ b/audio/paddleaudio/src/optional/optional.hpp
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/numpy.h"
+#include "feat/feature-window.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+namespace py = pybind11;
+
+// Streaming feature extractor built on top of a feature computer type F.
+// F must provide an `Options` type and the members used by this template:
+// Dim(), GetFrameOptions(), NeedRawLogEnergy() and Compute().
+template <class F>
+class StreamingFeatureTpl {
+  public:
+    typedef typename F::Options Options;
+    // Builds the feature computer and the window function from `opts`.
+    StreamingFeatureTpl(const Options& opts);
+    // Prepends the samples cached by the previous call to `wav`, computes the
+    // features of the combined signal into `feats` (all frames concatenated
+    // into one vector) and caches the trailing samples for the next call.
+    // Returns false only when `wav` is empty.
+    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    // Drops the cached samples.
+    void Reset() { remained_wav_.Resize(0); }
+
+    // Feature dimension of one frame, as reported by the computer.
+    int Dim() { return computer_.Dim(); }
+
+  private:
+    // Computes the features of every frame in `waves`; returns false (leaving
+    // `feats` untouched) when `waves` is shorter than one frame window.
+    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    // NOTE: members are initialized in this declaration order, so
+    // window_function_ is constructed before computer_.
+    Options opts_;
+    ::kaldi::FeatureWindowFunction window_function_;
+    // Samples carried over from the previous ComputeFeature() call.
+    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+
+}  // namespace kaldi
+}  // namespace paddleaudio
+
+#include "feature_common_inl.h"
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@ -0,0 +1,93 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Builds the window function and the feature computer from `opts`.
+//
+// The initializers are listed in member declaration order (opts_,
+// window_function_, computer_), which is the order in which they actually
+// run. window_function_ is built from opts.frame_opts rather than from
+// computer_.GetFrameOptions() because computer_ is not constructed yet at
+// that point.
+template <class F>
+StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
+    : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {}
+
+// Streaming entry point.
+//
+// Prepends the samples cached by the previous call to `wav`, caches the
+// samples that follow the last frame shift for the next call, and computes
+// the features of the combined signal into `feats`.
+//
+// Returns false when `wav` is empty and true otherwise; `feats` is left
+// untouched when the combined signal is shorter than one frame window.
+template <class F>
+bool StreamingFeatureTpl<F>::ComputeFeature(
+    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    // prepend the remaining samples of the previous call
+    ::kaldi::int32 wav_len = wav.Dim();
+    if (wav_len == 0) return false;
+    ::kaldi::int32 left_len = remained_wav_.Dim();
+    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // cache the samples that follow the last frame shift
+    ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
+    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
+    ::kaldi::int32 consumed = frame_shift * num_frames;
+    // NumFrames() may count a frame that extends past the end of the signal
+    // (e.g. when snip_edges is false), which would make the remainder
+    // negative; in that case nothing is left to cache.
+    if (consumed > waves.Dim()) consumed = waves.Dim();
+    ::kaldi::int32 left_samples = waves.Dim() - consumed;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));
+
+    // compute speech feature; the result of Compute() is deliberately not
+    // propagated, a too-short signal is only buffered
+    Compute(waves, feats);
+    return true;
+}
+
+// Computes the features of every frame in `waves`.
+//
+// `feats` is resized to num_frames * Dim() and frame i is written to
+// [i * Dim(), (i + 1) * Dim()). Returns false, leaving `feats` untouched,
+// when `waves` holds fewer samples than one frame window.
+template <class F>
+bool StreamingFeatureTpl<F>::Compute(
+    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    const ::kaldi::BaseFloat vtln_warp = 1.0;
+    const ::kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    const ::kaldi::int32 num_samples = waves.Dim();
+    if (num_samples < frame_opts.WindowSize()) {
+        return false;
+    }
+
+    const ::kaldi::int32 feat_dim = Dim();
+    const ::kaldi::int32 num_frames =
+        ::kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * feat_dim);
+
+    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
+        ::kaldi::BaseFloat raw_log_energy = 0.0;
+        // Extract the windowed samples of this frame; the raw log energy is
+        // only requested when the computer needs it.
+        ::kaldi::ExtractWindow(0,
+                               waves,
+                               frame,
+                               frame_opts,
+                               window_function_,
+                               &window,
+                               need_raw_log_energy ? &raw_log_energy : nullptr);
+
+        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(feat_dim,
+                                                         ::kaldi::kUndefined);
+        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        // Copy the frame into its slot of the flat output vector.
+        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
+            feats->Data() + frame * feat_dim, feat_dim);
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@ -0,0 +1,75 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "feat/pitch-functions.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Translates the three option structs into a ::kaldi::FbankOptions and
+// (re)initializes the fbank computer of the shared KaldiFeatureWrapper
+// instance with it. Always returns true.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts) {
+    ::kaldi::FbankOptions kaldi_opts;
+
+    // framing and mel-bank configuration are taken over as a whole
+    kaldi_opts.frame_opts = frame_opts;
+    kaldi_opts.mel_opts = mel_opts;
+
+    // energy related switches
+    kaldi_opts.use_energy = fbank_opts.use_energy;
+    kaldi_opts.energy_floor = fbank_opts.energy_floor;
+    kaldi_opts.raw_energy = fbank_opts.raw_energy;
+    kaldi_opts.htk_compat = fbank_opts.htk_compat;
+
+    // output scaling
+    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
+    kaldi_opts.use_power = fbank_opts.use_power;
+
+    paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->InitFbank(
+        kaldi_opts);
+    return true;
+}
+
+// Forwards `wav` to the fbank computer of the shared KaldiFeatureWrapper
+// instance and returns the features it produces.
+// NOTE(review): presumably requires a preceding InitFbank() call -- confirm
+// against KaldiFeatureWrapper.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
+    return paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ComputeFbank(
+        wav);
+}
+
+// One-shot fbank computation: (re)initializes the shared fbank computer with
+// the given options, computes the features of `wav` and resets the computer
+// afterwards.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav) {
+    InitFbank(frame_opts, mel_opts, fbank_opts);
+    py::array_t<float> fbank = ComputeFbankStreaming(wav);
+    ResetFbank();
+    return fbank;
+}
+
+// Resets the fbank computer of the shared KaldiFeatureWrapper instance.
+void ResetFbank() {
+    paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
+}
+
+// Runs ::kaldi::ComputeKaldiPitch on `wav` and returns the resulting feature
+// matrix as a 2-D float array (one row per matrix row).
+// NOTE(review): the buffer is read as info.size consecutive floats, i.e. it
+// assumes a contiguous array and a float BaseFloat -- confirm with callers.
+py::array_t<float> ComputeKaldiPitch(
+  const ::kaldi::PitchExtractionOptions& opts,
+  const py::array_t<float>& wav) {
+    // View the numpy buffer as a Kaldi vector without copying.
+    py::buffer_info wav_info = wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> samples(
+        static_cast<float*>(wav_info.ptr), wav_info.size);
+
+    ::kaldi::Matrix<::kaldi::BaseFloat> pitch;
+    ::kaldi::ComputeKaldiPitch(opts, samples, &pitch);
+
+    // Copy row by row: only the first NumCols() values of each matrix row
+    // are taken.
+    auto result = py::array_t<float>({pitch.NumRows(), pitch.NumCols()});
+    const size_t row_bytes = sizeof(float) * pitch.NumCols();
+    for (int row = 0; row < pitch.NumRows(); ++row) {
+        std::memcpy(result.mutable_data(row), pitch.Row(row).Data(),
+                    row_bytes);
+    }
+    return result;
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@ -0,0 +1,64 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <string>
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+#include "feat/pitch-functions.h"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Python-facing subset of the fbank settings; InitFbank copies each field
+// onto the matching member of ::kaldi::FbankOptions.
+struct FbankOptions {
+  // Append an extra dimension with energy to the filter banks.
+  bool use_energy = false;
+  float energy_floor = 0.0f;
+  // If true, compute energy before preemphasis and windowing.
+  bool raw_energy = true;
+  // If true, put energy last (if using energy).
+  bool htk_compat = false;
+  // If true (default), produce log-filterbank, else linear.
+  bool use_log_fbank = true;
+  // NOTE(review): presumably selects power vs. magnitude spectrum - confirm
+  // against ::kaldi::FbankOptions::use_power.
+  bool use_power = true;
+
+  // Kept user-provided so the type stays a non-aggregate, as before.
+  FbankOptions() {}
+};
+
+// Builds a ::kaldi::FbankOptions from the three option groups and installs a
+// fresh extractor in the shared KaldiFeatureWrapper.
+bool InitFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts);
+
+// One-shot extraction: InitFbank + ComputeFbankStreaming + ResetFbank.
+py::array_t<float> ComputeFbank(
+    ::kaldi::FrameExtractionOptions frame_opts,
+    ::kaldi::MelBanksOptions mel_opts,
+    FbankOptions fbank_opts,
+    const py::array_t<float>& wav);
+
+// Feeds `wav` to the shared extractor without resetting it afterwards.
+py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
+
+// Resets the shared extractor.
+void ResetFbank();
+
+// Computes Kaldi pitch features for `wav`; the result is a 2-D float array
+// with one row per row of the Kaldi output matrix.
+py::array_t<float> ComputeKaldiPitch(
+    const ::kaldi::PitchExtractionOptions& opts,
+    const py::array_t<float>& wav);
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+// Returns the process-wide wrapper, constructed on first use as a
+// function-local static.
+KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
+    static KaldiFeatureWrapper singleton;
+    return &singleton;
+}
+
+// Replaces the current extractor (if any) with a new one built from `opts`.
+// Always returns true.
+bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+    std::unique_ptr<Fbank> fresh(new Fbank(opts));
+    // After the swap `fresh` owns the previous extractor and destroys it on
+    // scope exit.
+    fbank_.swap(fresh);
+    return true;
+}
+
+// Runs the fbank extractor over `wav` and returns the features as a 2-D
+// array of shape (feats.Dim() / Dim(), Dim()).
+//
+// Returns an empty array when no extractor has been installed with
+// InitFbank(), when ComputeFeature() reports failure, or when it yields no
+// values.
+py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
+    const py::array_t<float> wav) {
+    // Without an extractor there is nothing to compute; use the same
+    // "no features" result as the failure path below instead of dereferencing
+    // a null pointer.
+    if (!fbank_) return py::array_t<float>();
+
+    // NOTE(review): only the base pointer and element count are used, so the
+    // input is presumably expected to be a contiguous 1-D array - confirm.
+    py::buffer_info info = wav.request();
+    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+
+    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    const bool ok = fbank_->ComputeFeature(input_wav, &feats);
+    if (!ok || feats.Dim() == 0) return py::array_t<float>();
+
+    // Copy the flat Kaldi vector into a NumPy-owned buffer. (The stray
+    // `std::cout << std::endl` that used to sit here wrote a blank line to
+    // stdout on every call and has been removed.)
+    auto result = py::array_t<float>(feats.Dim());
+    float* out = result.mutable_data();
+    for (int idx = 0; idx < feats.Dim(); ++idx) {
+        out[idx] = feats(idx);
+    }
+
+    return result.reshape({feats.Dim() / Dim(), Dim()});
+}
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@ -0,0 +1,40 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/kaldi-common.h"
+#include "feat/feature-fbank.h"
+
+#include "paddleaudio/src/pybind/kaldi/feature_common.h"
+
+namespace paddleaudio {
+namespace kaldi {
+
+typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+
+// Owner of the fbank extractor used by the Python bindings; the bindings
+// reach it through GetInstance().
+class KaldiFeatureWrapper {
+  public:
+    // Returns the shared wrapper instance.
+    static KaldiFeatureWrapper* GetInstance();
+    // Installs a new extractor built from `opts`.
+    bool InitFbank(::kaldi::FbankOptions opts);
+    // Runs the extractor over `wav` and returns the features.
+    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
+    // Dimension reported by the extractor; 0 while no extractor is installed
+    // (previously this dereferenced a null pointer).
+    int Dim() { return fbank_ ? fbank_->Dim() : 0; }
+    // Resets the extractor; a no-op while no extractor is installed
+    // (previously this dereferenced a null pointer).
+    void ResetFbank() {
+        if (fbank_) fbank_->Reset();
+    }
+
+  private:
+    // Null until InitFbank() has been called.
+    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
+};
+
+}  // namespace kaldi
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@ -0,0 +1,148 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+#include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
+#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+
+#ifdef INCLUDE_SOX
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#endif
+
+#include <pybind11/stl.h>
+#include <pybind11/pybind11.h>
+
+// Register `tl::optional<T>` with pybind11 by reusing its generic optional caster.
+#ifdef INCLUDE_SOX
+namespace pybind11::detail {
+// Delegates conversion of tl::optional<T> to pybind11's optional_caster.
+template <typename T>
+struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
+}  // namespace pybind11::detail
+#endif
+
+// Entry point of the `_paddleaudio` extension module. The SoX bindings are
+// registered only when INCLUDE_SOX is defined, the Kaldi bindings only when
+// INCLUDE_KALDI is defined.
+PYBIND11_MODULE(_paddleaudio, m) {
+#ifdef INCLUDE_SOX
+    // File / file-object I/O (paddleaudio::sox_io).
+    m.def("get_info_file",
+          &paddleaudio::sox_io::get_info_file,
+          "Get metadata of audio file.");
+    // support obj later
+    m.def("get_info_fileobj",
+          &paddleaudio::sox_io::get_info_fileobj,
+          "Get metadata of audio in file object.");
+    m.def("load_audio_fileobj",
+          &paddleaudio::sox_io::load_audio_fileobj,
+          "Load audio from file object.");
+    m.def("save_audio_fileobj",
+          &paddleaudio::sox_io::save_audio_fileobj,
+          "Save audio to file obj.");
+          
+    // sox io
+    // Note: "sox_io_get_info" binds the same C++ function as "get_info_file".
+     m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
+     m.def(
+         "sox_io_load_audio_file",
+         &paddleaudio::sox_io::load_audio_file);
+     m.def(
+         "sox_io_save_audio_file",
+         &paddleaudio::sox_io::save_audio_file);
+    
+     // sox utils
+     m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
+     m.def(
+         "sox_utils_set_verbosity",
+         &paddleaudio::sox_utils::set_verbosity);
+     m.def(
+         "sox_utils_set_use_threads",
+         &paddleaudio::sox_utils::set_use_threads);
+     m.def(
+         "sox_utils_set_buffer_size",
+         &paddleaudio::sox_utils::set_buffer_size);
+     m.def(
+         "sox_utils_list_effects",
+         &paddleaudio::sox_utils::list_effects);
+     m.def(
+         "sox_utils_list_read_formats",
+         &paddleaudio::sox_utils::list_read_formats);
+     m.def(
+         "sox_utils_list_write_formats",
+         &paddleaudio::sox_utils::list_write_formats);
+     m.def(
+         "sox_utils_get_buffer_size",
+         &paddleaudio::sox_utils::get_buffer_size);
+
+     // effect
+     m.def("apply_effects_fileobj",
+           &paddleaudio::sox_effects::apply_effects_fileobj,
+           "Decode audio data from file-like obj and apply effects.");
+     m.def("sox_effects_initialize_sox_effects",
+       &paddleaudio::sox_effects::initialize_sox_effects);
+     m.def(
+         "sox_effects_shutdown_sox_effects",
+         &paddleaudio::sox_effects::shutdown_sox_effects);
+     m.def(
+         "sox_effects_apply_effects_tensor",
+         &paddleaudio::sox_effects::apply_effects_tensor);
+     m.def(
+         "sox_effects_apply_effects_file",
+         &paddleaudio::sox_effects::apply_effects_file);
+#endif
+
+#ifdef INCLUDE_KALDI
+    // One-shot fbank extraction; takes the option objects registered below.
+    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
+    // Field-for-field read/write view of ::kaldi::PitchExtractionOptions.
+    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
+        .def(py::init<>())
+        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
+        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
+        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
+        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
+        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
+        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
+        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
+        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
+    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
+    // Field-for-field read/write view of ::kaldi::FrameExtractionOptions.
+    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
+        .def(py::init<>())            
+        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)            
+        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
+        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)            
+        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)            
+        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)            
+        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
+        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)           
+        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)          
+        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
+        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
+        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
+        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
+    // Field-for-field read/write view of ::kaldi::MelBanksOptions.
+    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
+        .def(py::init<>())
+        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
+        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
+        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
+        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
+        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
+        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
+        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
+
+    // paddleaudio's own FbankOptions struct (not ::kaldi::FbankOptions).
+    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
+        .def(py::init<>())
+        .def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
+        .def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
+        .def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
+        .def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
+        .def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
+        .def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
+#endif
+
+}
--- a/audio/paddleaudio/src/pybind/sox/effects.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects.cpp
@ -0,0 +1,259 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp  with modification.
+
+#include <mutex>
+#include <sox.h>
+
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects {
+
+// Streaming decoding over file-like object is tricky because libsox operates on
+// FILE pointer. The following is what `sox` and `play` commands do
+//  - file input -> FILE pointer
+//  - URL input -> call wget in subprocess and pipe the data -> FILE pointer
+//  - stdin -> FILE pointer
+//
+// We want to, instead, fetch byte strings chunk by chunk, consume them, and
+// discard.
+//
+// Here is the approach
+// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+// chunk of byte string
+//    This will perform header-based format detection, if necessary, then fill
+//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
+//    which returns FILE* which points the buffer of the provided byte string.
+// 2. Each time sox reads a chunk from the FILE*, we update the underlying
+// buffer in a way that it
+//    starts with unseen data, and append the new data read from the given
+//    fileobj. This will trick libsox as if it keeps reading from the FILE*
+//    continuously.
+// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
+// Decodes audio from a Python file-like object, runs it through the given
+// SoX effects and returns (samples, output sample rate).
+//
+// `normalize` and `channels_first` default to true when not provided;
+// `format` (if given) is passed to libsox as the file type. Returns an empty
+// optional when libsox cannot open the data or reports an unknown encoding.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
+  //
+  // For certain format (such as FLAC), libsox keeps reading the content at
+  // the initialization unless it reaches EOF even when the header is properly
+  // parsed. (Making buffer size 8192, which is way bigger than the header,
+  // resulted in libsox consuming all the buffer content at the time it opens
+  // the file.) Therefore buffer has to always contain valid data, except after
+  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
+  // first check if there is enough data to fill the buffer. `read_fileobj`
+  // repeatedly calls `read`  method until it receives the requested length of
+  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
+  // the whole audio data are fetched.
+  //
+  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
+  const auto capacity = [&]() {
+    // NOTE:
+    // Use the abstraction provided by `libpaddleaudio` to access the global
+    // config defined by libsox. Directly using `sox_get_globals` function will
+    // end up retrieving the static variable defined in `_paddleaudio`, which is
+    // not correct.
+    const auto bufsiz = get_buffer_size();
+    const int64_t kDefaultCapacityInBytes = 256;
+    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
+                                              : kDefaultCapacityInBytes;
+  }();
+  // `buffer` owns the bytes handed to libsox below, so it must stay alive
+  // until the chain has finished running.
+  std::string buffer(capacity, '\0');
+  auto* in_buf = const_cast<char*>(buffer.data());
+  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
+  // If the file is shorter than 256, then libsox cannot read the header.
+  auto in_buffer_size = (num_read > 256) ? num_read : 256;
+
+  // Open file (this starts reading the header)
+  // When opening a file there are two functions that can touch FILE*.
+  // * `auto_detect_format`
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
+  // * `startread` handler of detected format.
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
+  // To see the handler of a particular format, go to
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
+  // For example, vorbis can be found
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
+  SoxFormat sf(sox_open_mem_read(
+      in_buf,
+      in_buffer_size,
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+
+  // In case of streamed data, length can be 0
+  if (static_cast<sox_format_t*>(sf) == nullptr ||
+      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+
+  // Prepare output buffer
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(sf->signal.length);
+
+  // Create and run SoxEffectsChain
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+
+  // Create tensor from buffer
+  bool channels_first_ = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      channels_first_);
+
+  return std::forward_as_tuple(
+      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
+}
+
+namespace {
+
+// Lifecycle of the libsox effects resources as tracked by this file:
+// NotInitialized -> Initialized -> ShutDown (no way back from ShutDown).
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
+// Guards every read and write of SOX_RESOURCE_STATE.
+std::mutex SOX_RESOURCE_STATE_MUTEX;
+
+} // namespace
+
+// Calls sox_init() the first time it is invoked; later calls are no-ops.
+// Throws std::runtime_error if sox_init() fails or if the resources have
+// already been shut down.
+void initialize_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOURCE_STATE_MUTEX);
+
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      if (sox_init() != SOX_SUCCESS) {
+        throw std::runtime_error("Failed to initialize sox effects.");
+      }
+      SOX_RESOURCE_STATE = Initialized;
+      break;
+    case Initialized:
+      break;
+    case ShutDown:
+      throw std::runtime_error(
+          "SoX Effects has been shut down. Cannot initialize again.");
+  }
+}
+
+// Calls sox_quit() if the resources are currently initialized; calling it
+// again after a successful shutdown is a no-op. Throws std::runtime_error if
+// nothing was ever initialized or if sox_quit() fails.
+void shutdown_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOURCE_STATE_MUTEX);
+
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      throw std::runtime_error(
+          "SoX Effects is not initialized. Cannot shutdown.");
+    case Initialized:
+      if (sox_quit() != SOX_SUCCESS) {
+        // This used to report "Failed to initialize", which pointed at the
+        // wrong operation.
+        throw std::runtime_error("Failed to shut down sox effects.");
+      }
+      SOX_RESOURCE_STATE = ShutDown;
+      break;
+    case ShutDown:
+      break;
+  }
+}
+
+// Runs `effects` over an in-memory waveform and returns the processed samples
+// (same dtype as the input, not normalized) together with the sample rate
+// reported by the chain.
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t> {
+  validate_input_tensor(waveform);
+
+  // Both ends of the chain use the encoding derived from the input dtype.
+  const auto wave_dtype = waveform.dtype();
+  paddleaudio::sox_effects_chain::SoxEffectsChain effects_chain(
+      /*input_encoding=*/get_tensor_encodinginfo(wave_dtype),
+      /*output_encoding=*/get_tensor_encodinginfo(wave_dtype));
+
+  // Destination for the samples leaving the chain.
+  std::vector<sox_sample_t> samples;
+  samples.reserve(waveform.size());
+
+  // input -> user effects (in order) -> output buffer
+  effects_chain.addInputTensor(&waveform, sample_rate, channels_first);
+  for (auto it = effects.begin(); it != effects.end(); ++it) {
+    effects_chain.addEffect(*it);
+  }
+  effects_chain.addOutputBuffer(&samples);
+  effects_chain.run();
+
+  auto processed = convert_to_tensor(
+      /*buffer=*/samples.data(),
+      /*num_samples=*/samples.size(),
+      /*num_channels=*/effects_chain.getOutputNumChannels(),
+      wave_dtype,
+      /*normalize=*/false,
+      channels_first);
+
+  return std::tuple<py::array, int64_t>(
+      processed, effects_chain.getOutputSampleRate());
+}
+
+// Opens the audio file at `path`, runs it through `effects` and returns
+// (samples, output sample rate). `normalize` and `channels_first` default to
+// true when not provided. Returns an empty optional when libsox cannot open
+// the file or reports an unknown encoding.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
+  SoxFormat input(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/filetype));
+
+  const bool opened = static_cast<sox_format_t*>(input) != nullptr;
+  if (!opened || input->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+
+  const auto out_dtype =
+      get_dtype(input->encoding.encoding, input->signal.precision);
+
+  // Destination for the samples leaving the chain.
+  std::vector<sox_sample_t> samples;
+  samples.reserve(input->signal.length);
+
+  paddleaudio::sox_effects_chain::SoxEffectsChain effects_chain(
+      /*input_encoding=*/input->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(out_dtype));
+
+  // file -> user effects (in order) -> output buffer
+  effects_chain.addInputFile(input);
+  for (auto it = effects.begin(); it != effects.end(); ++it) {
+    effects_chain.addEffect(*it);
+  }
+  effects_chain.addOutputBuffer(&samples);
+  effects_chain.run();
+
+  const bool want_channels_first = channels_first.value_or(true);
+  auto decoded = convert_to_tensor(
+      /*buffer=*/samples.data(),
+      /*num_samples=*/samples.size(),
+      /*num_channels=*/effects_chain.getOutputNumChannels(),
+      out_dtype,
+      normalize.value_or(true),
+      want_channels_first);
+
+  return std::tuple<py::array, int64_t>(
+      decoded, effects_chain.getOutputSampleRate());
+}
+
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects.h
+++ b/audio/paddleaudio/src/pybind/sox/effects.h
@ -0,0 +1,37 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h  with modification.
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+#include "paddleaudio/src/optional/optional.hpp"
+
+namespace py = pybind11;
+
+namespace paddleaudio::sox_effects {
+
+// Decodes audio from a Python file-like object and applies `effects`.
+// Returns (samples, sample rate), or an empty optional on failure to decode.
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+// Initializes the libsox effects resources (no-op if already initialized).
+void initialize_sox_effects();
+
+// Shuts the libsox effects resources down; they cannot be initialized again
+// afterwards.
+void shutdown_sox_effects();
+
+// Applies `effects` to an in-memory waveform.
+// Returns (samples, sample rate).
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t>;
+
+// Opens the audio file at `path` and applies `effects`.
+// Returns (samples, sample rate), or an empty optional on failure to decode.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+} // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
@ -0,0 +1,597 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp with modification.
+
+#include <sox.h>
+#include <iostream>
+#include <vector>
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects_chain {
+
+namespace {
+
+/// helper classes for passing the location of input tensor and output buffer
+///
+/// drain/flow callback functions require plain C style function signature and
+/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
+/// The following structs will be assigned to sox_effect_t::priv pointer which
+/// gives sox_effect_t an access to input Tensor and output buffer object.
+/// State of the tensor input effect.
+struct TensorInputPriv {
+  /// Number of samples already handed to libsox (flat, interleaved count).
+  size_t index;
+  /// Waveform being read; not owned by this struct.
+  py::array* waveform;
+  int64_t sample_rate;
+  /// True when the waveform is indexed as (channel, frame), false for
+  /// (frame, channel).
+  bool channels_first;
+};
+
+/// State of the tensor output effect: the buffer samples are appended to.
+struct TensorOutputPriv {
+  std::vector<sox_sample_t>* buffer;
+};
+/// State of the file output effect: the sox handle samples are written to.
+struct FileOutputPriv {
+  sox_format_t* sf;
+};
+
+/// Callback function to feed Tensor data to SoxEffectChain.
+///
+/// Copies up to `*osamp` samples of the 2-D waveform stored in the effect's
+/// TensorInputPriv into `obuf`, starting at the running sample offset and
+/// converting every element to sox_sample_t. On return `*osamp` holds the
+/// number of samples written (a multiple of the channel count) and the offset
+/// has been advanced. Returns SOX_EOF once the offset reaches the end of the
+/// waveform, SOX_SUCCESS otherwise. Throws std::runtime_error for a dtype
+/// that is not handled below.
+int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
+  // Retrieve the input Tensor and current index
+  auto priv = static_cast<TensorInputPriv*>(effp->priv);
+  const size_t index = priv->index;
+  const py::array& tensor = *(priv->waveform);
+  const size_t num_channels = effp->out_signal.channels;
+
+  // Adjust the number of samples to read
+  const size_t num_samples = tensor.size();
+  if (index + *osamp > num_samples) {
+    *osamp = num_samples - index;
+  }
+
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % num_channels;
+  const size_t count = *osamp;
+
+  // Address of the element behind flat, interleaved sample position `pos`
+  // (frame = pos / channels, channel = pos % channels), honoring the
+  // waveform's axis order.
+  const auto element_at = [&](size_t pos) -> const void* {
+    const size_t frame = pos / num_channels;
+    const size_t channel = pos % num_channels;
+    return priv->channels_first ? tensor.data(channel, frame)
+                                : tensor.data(frame, channel);
+  };
+
+  std::vector<sox_sample_t> chunk(count);
+  // Convert to sox_sample_t (int32_t). The case labels are NumPy type
+  // numbers as returned by dtype().num().
+  switch (tensor.dtype().num()) {
+    case 11: {  // NPY_FLOAT
+      for (size_t i = 0; i < count; ++i) {
+        // Need to convert to 64-bit precision so that
+        // values around INT32_MIN/MAX are handled correctly.
+        const double scaled =
+            static_cast<double>(
+                *static_cast<const float*>(element_at(index + i))) *
+            2147483648.;
+        if (scaled > INT32_MAX) {
+          chunk[i] = INT32_MAX;
+        } else if (scaled < INT32_MIN) {
+          chunk[i] = INT32_MIN;
+        } else {
+          chunk[i] = static_cast<sox_sample_t>(scaled);
+        }
+      }
+      break;
+    }
+    case 5: {  // NPY_INT
+      // NOTE(review): a 32-bit integer array may carry a different type
+      // number on some platforms - confirm this case covers all targets.
+      for (size_t i = 0; i < count; ++i) {
+        chunk[i] = *static_cast<const int*>(element_at(index + i));
+      }
+      break;
+    }
+    case 3: {  // NPY_SHORT
+      for (size_t i = 0; i < count; ++i) {
+        const int16_t value =
+            *static_cast<const int16_t*>(element_at(index + i));
+        chunk[i] = value * 65536;
+      }
+      break;
+    }
+    case 1: {  // NPY_BYTE
+      // The scaling below is the offset-binary (unsigned 8-bit) formula, so
+      // the byte is read as unsigned. Reading it as int8_t, as the code used
+      // to, overflowed int for every byte with the high bit set. Results for
+      // bytes 0..127 are unchanged.
+      // NOTE(review): type number 1 is NumPy's signed byte; presumably the
+      // project stores unsigned 8-bit audio under it - confirm.
+      for (size_t i = 0; i < count; ++i) {
+        const int value = *static_cast<const uint8_t*>(element_at(index + i));
+        chunk[i] = (value - 128) * 16777216;
+      }
+      break;
+    }
+    default:
+      throw std::runtime_error("Unexpected dtype.");
+  }
+  // Write to buffer (skipped when there is nothing to copy, since an empty
+  // vector's data() may be null).
+  if (count > 0) {
+    memcpy(obuf, chunk.data(), count * sizeof(sox_sample_t));
+  }
+  priv->index += count;
+  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
+}
+
+/// Callback function to fetch data from SoxEffectChain.
+///
+/// Appends the `*isamp` incoming samples to the vector stored in the effect's
+/// TensorOutputPriv. Nothing is written to `obuf`, so `*osamp` is set to 0.
+/// Always returns SOX_SUCCESS.
+int tensor_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+  auto* sink = static_cast<TensorOutputPriv*>(effp->priv);
+  std::vector<sox_sample_t>& collected = *(sink->buffer);
+  sox_sample_t const* const input_end = ibuf + *isamp;
+  collected.insert(collected.end(), ibuf, input_end);
+  return SOX_SUCCESS;
+}
+
+/// Flow callback that writes the incoming samples to the sox handle stored in
+/// the effect's FileOutputPriv. Nothing is written to `obuf`, so `*osamp` is
+/// set to 0. Returns SOX_SUCCESS when there is no input or all of it was
+/// written. On a short write it throws std::runtime_error if the handle
+/// carries an error code, and returns SOX_EOF otherwise.
+int file_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+  if (*isamp == 0) {
+    return SOX_SUCCESS;
+  }
+
+  sox_format_t* out_file = static_cast<FileOutputPriv*>(effp->priv)->sf;
+  const size_t written = sox_write(out_file, ibuf, *isamp);
+  if (written == *isamp) {
+    return SOX_SUCCESS;
+  }
+
+  if (out_file->sox_errno) {
+    std::ostringstream message;
+    message << out_file->sox_errstr << " " << sox_strerror(out_file->sox_errno)
+            << " " << out_file->filename;
+    throw std::runtime_error(message.str());
+  }
+  return SOX_EOF;
+}
+
+// Handler for the "input_tensor" effect: only `drain` is set, and the
+// private area is sized for TensorInputPriv.
+sox_effect_handler_t* get_tensor_input_handler() {
+  static sox_effect_handler_t input_handler{
+      /*name=*/"input_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/nullptr,
+      /*drain=*/tensor_input_drain,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorInputPriv)};
+  return &input_handler;
+}
+
+// Handler for the "output_tensor" effect: only `flow` is set, and the
+// private area is sized for TensorOutputPriv.
+sox_effect_handler_t* get_tensor_output_handler() {
+  static sox_effect_handler_t output_handler{
+      /*name=*/"output_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/tensor_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorOutputPriv)};
+  return &output_handler;
+}
+
+// Handler for the "output_file" effect: only `flow` is set, and the private
+// area is sized for FileOutputPriv.
+sox_effect_handler_t* get_file_output_handler() {
+  static sox_effect_handler_t file_handler{
+      /*name=*/"output_file",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/file_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileOutputPriv)};
+  return &file_handler;
+}
+
+} // namespace
+
+// Takes ownership of `se`; the pointer is released with free() on destruction.
+SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
+
+SoxEffect::~SoxEffect() {
+  // free(nullptr) is a no-op, so no explicit null check is needed.
+  free(se_);
+}
+
+// Implicit access to the wrapped pointer for the libsox C API.
+SoxEffect::operator sox_effect_t*() const {
+  return se_;
+}
+
+// Member access on the wrapped effect.
+auto SoxEffect::operator->() noexcept -> sox_effect_t* {
+  return se_;
+}
+
+// Stores copies of both encodings and creates the libsox chain from the
+// addresses of those copies. Throws std::runtime_error if libsox does not
+// return a chain.
+SoxEffectsChain::SoxEffectsChain(
+    sox_encodinginfo_t input_encoding,
+    sox_encodinginfo_t output_encoding)
+    : in_enc_(input_encoding),
+      out_enc_(output_encoding),
+      in_sig_(),
+      interm_sig_(),
+      out_sig_(),
+      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
+  if (sec_ == nullptr) {
+    throw std::runtime_error("Failed to create effect chain.");
+  }
+}
+
+// Deletes the underlying libsox chain, if one is held.
+SoxEffectsChain::~SoxEffectsChain() {
+  if (sec_ == nullptr) {
+    return;
+  }
+  sox_delete_effects_chain(sec_);
+}
+
+// Runs the chain without a progress callback or client data.
+// The status returned by sox_flow_effects is not inspected.
+void SoxEffectsChain::run() {
+  sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
+}
+
+// Adds the "input_tensor" effect, which feeds `waveform` into the chain.
+// The input signal info is derived from the array (as "wav") and also becomes
+// the initial intermediate signal info.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChain::addInputTensor(
+    py::array* waveform,
+    int64_t sample_rate,
+    bool channels_first) {
+  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(get_tensor_input_handler()));
+
+  // Fill in the private state the drain callback reads from.
+  auto* state = static_cast<TensorInputPriv*>(source->priv);
+  state->index = 0;
+  state->waveform = waveform;
+  state->sample_rate = sample_rate;
+  state->channels_first = channels_first;
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input_tensor");
+  }
+}
+
+// Adds the "output_tensor" effect, which is handed `output_buffer` through
+// its private state.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChain::addOutputBuffer(
+    std::vector<sox_sample_t>* output_buffer) {
+  SoxEffect sink(sox_create_effect(get_tensor_output_handler()));
+
+  auto* state = static_cast<TensorOutputPriv*>(sink->priv);
+  state->buffer = output_buffer;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output_tensor");
+  }
+}
+
+// Adds libsox's built-in "input" effect reading from the opened file `sf`.
+// The file's signal info becomes both the input and the initial intermediate
+// signal info.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChain::addInputFile(sox_format_t* sf) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(sox_find_effect("input")));
+
+  // The "input" effect takes the sox_format_t pointer disguised as its only
+  // option string. The status of sox_effect_options is not inspected.
+  char* opts[] = {(char*)sf};
+  sox_effect_options(source, 1, opts);
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    std::ostringstream message;
+    message << "Internal Error: Failed to add effect: input " << sf->filename;
+    throw std::runtime_error(message.str());
+  }
+}
+
+// Adds the "output_file" effect writing to the opened file `sf`.
+// The file's signal info is recorded as the chain's output signal info.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
+  out_sig_ = sf->signal;
+
+  SoxEffect sink(sox_create_effect(get_file_output_handler()));
+  auto* state = static_cast<FileOutputPriv*>(sink->priv);
+  state->sf = sf;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
+  if (status != SOX_SUCCESS) {
+    std::ostringstream message;
+    message << "Internal Error: Failed to add effect: output " << sf->filename;
+    throw std::runtime_error(message.str());
+  }
+}
+
+// Appends one effect to the chain.
+// `effect[0]` is the effect name; the remaining elements are its options.
+// Throws std::runtime_error when `effect` is empty, when the name is listed in
+// UNSUPPORTED_EFFECTS or unknown to libsox, when the options are rejected, or
+// when the effect cannot be added to the chain.
+void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
+  if (effect.empty()) {
+    throw std::runtime_error("Invalid argument: empty effect.");
+  }
+  const std::string name = effect.front();
+
+  // A blocked name and a name libsox does not know are reported identically.
+  const auto throw_unsupported = [&name]() {
+    std::ostringstream stream;
+    stream << "Unsupported effect: " << name;
+    throw std::runtime_error(stream.str());
+  };
+  if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
+    throw_unsupported();
+  }
+  const auto* handler = sox_find_effect(name.c_str());
+  if (handler == nullptr) {
+    throw_unsupported();
+  }
+  SoxEffect e(sox_create_effect(handler));
+
+  // libsox takes the options as an array of char*; the name is not part of it.
+  const auto num_options = effect.size() - 1;
+  std::vector<char*> opts;
+  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+    opts.push_back((char*)it->c_str());
+  }
+  char** option_array = num_options ? opts.data() : nullptr;
+  if (sox_effect_options(e, num_options, option_array) != SOX_SUCCESS) {
+    std::ostringstream stream;
+    stream << "Invalid effect option:";
+    for (const auto& token : effect) {
+      stream << " " << token;
+    }
+    throw std::runtime_error(stream.str());
+  }
+
+  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
+    std::ostringstream stream;
+    stream << "Internal Error: Failed to add effect: \"" << name;
+    for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+      stream << " " << *it;
+    }
+    stream << "\"";
+    throw std::runtime_error(stream.str());
+  }
+}
+
+// Channel count recorded in the intermediate signal info.
+int64_t SoxEffectsChain::getOutputNumChannels() {
+  return static_cast<int64_t>(interm_sig_.channels);
+}
+
+// Sample rate recorded in the intermediate signal info, converted to an
+// integer.
+int64_t SoxEffectsChain::getOutputSampleRate() {
+  return static_cast<int64_t>(interm_sig_.rate);
+}
+
+namespace {
+
+/// helper classes for passing file-like object to SoxEffectChain
+struct FileObjInputPriv {  // Private state of the file-object input effect; filled in by addInputFileObj and read by fileobj_input_drain.
+  sox_format_t* sf;  // decoder handle; its FILE* (sf->fp) is queried with ftell and repositioned with fseek
+  py::object* fileobj;  // Python file-like object the encoded bytes are read from
+  bool eof_reached;  // set once a refill returns fewer bytes than requested
+  char* buffer;  // byte buffer refilled on every drain call; presumably the memory behind sf->fp — confirm at the caller
+  uint64_t buffer_size;  // size of `buffer` in bytes
+};
+
+struct FileObjOutputPriv {  // Private state of the file-object output effect; filled in by addOutputFileObj and read by fileobj_output_flow.
+  sox_format_t* sf;  // encoder handle the samples are written to with sox_write
+  py::object* fileobj;  // Python file-like object whose write() receives the encoded bytes
+  char** buffer;  // address of the pointer to the encoded bytes; dereferenced after each write
+  size_t* buffer_size;  // stored by addOutputFileObj; not read by fileobj_output_flow
+};
+
+/// Callback function to feed byte string
+/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
+// Drain callback of the file-object input effect.
+//
+// Each call (1) tops up the in-memory buffer with fresh bytes read from the
+// Python file object and (2) decodes samples from it with sox_read.
+//
+// effp:  effect whose priv is a FileObjInputPriv.
+// obuf:  destination for decoded samples.
+// osamp: in: capacity offered by libsox; out: number of samples decoded.
+//
+// Returns SOX_EOF once the file object is exhausted and no sample could be
+// decoded, SOX_SUCCESS otherwise.
+// Throws std::runtime_error if ftell fails or reports a position beyond the
+// buffer.
+auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
+    -> int {
+  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
+  auto sf = priv->sf;
+  auto buffer = priv->buffer;
+
+  // 1. Refresh the buffer
+  //
+  // NOTE:
+  //   Since the underlying FILE* was opened with `fmemopen`, the only way
+  //   libsox detects EOF is by reaching the end of the buffer. (A null byte
+  //   won't help.) Therefore we need to align the content at the end of the
+  //   buffer, otherwise libsox will keep reading the content beyond the
+  //   intended length.
+  //
+  // Before:
+  //
+  //     |<-------consumed------>|<---remaining--->|
+  //     |***********************|-----------------|
+  //                             ^ ftell
+  //
+  // After:
+  //
+  //     |<-offset->|<---remaining--->|<-new data->|
+  //     |**********|-----------------|++++++++++++|
+  //                ^ ftell
+
+  // NOTE:
+  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
+  //   supposed to be in sync, but there are cases (Vorbis) they are not
+  //   in sync and `tell_off` has seemingly uninitialized value, which
+  //   leads num_remain to be negative and cause segmentation fault
+  //   in `memmove`.
+  const auto tell = ftell((FILE*)sf->fp);
+  if (tell < 0) {
+    throw std::runtime_error("Internal Error: ftell failed.");
+  }
+  const auto num_consumed = static_cast<size_t>(tell);
+  if (num_consumed > priv->buffer_size) {
+    throw std::runtime_error("Internal Error: buffer overrun.");
+  }
+
+  const auto num_remain = priv->buffer_size - num_consumed;
+
+  // 1.1. Fetch the data to see if there is data to fill the buffer
+  size_t num_refill = 0;
+  std::string chunk(num_consumed, '\0');
+  if (num_consumed && !priv->eof_reached) {
+    num_refill = read_fileobj(
+        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
+    // A short read means the file object has no more data.
+    if (num_refill < num_consumed) {
+      priv->eof_reached = true;
+    }
+  }
+  const auto offset = num_consumed - num_refill;
+
+  // 1.2. Move the unconsumed data towards the beginning of buffer.
+  if (num_remain) {
+    auto src = static_cast<void*>(buffer + num_consumed);
+    auto dst = static_cast<void*>(buffer + offset);
+    memmove(dst, src, num_remain);
+  }
+
+  // 1.3. Refill the remaining buffer.
+  if (num_refill) {
+    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
+    auto dst = buffer + offset + num_remain;
+    memcpy(dst, src, num_refill);
+  }
+
+  // 1.4. Set the file pointer to the new offset
+  sf->tell_off = offset;
+  fseek((FILE*)sf->fp, offset, SEEK_SET);
+
+  // 2. Perform decoding operation
+  // The following part is practically same as "input" effect
+  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
+
+  // At this point, osamp represents the buffer size in bytes,
+  // but sox_read expects the maximum number of samples ready to read.
+  // Normally, this is fine, but in case when the samples are not 4-byte
+  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
+  // https://github.com/pytorch/audio/issues/2083
+  //
+  // The conversion is applied only when a sample occupies at least one whole
+  // byte. For sub-byte encodings (bits_per_sample in 1..7) the integer
+  // division `bits_per_sample / 8` is 0 and must not be used as a divisor.
+  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
+  if (bytes_per_sample > 0)
+    *osamp /= bytes_per_sample;
+
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % effp->out_signal.channels;
+
+  // Read up to *osamp samples into obuf;
+  // store the actual number read back to *osamp
+  *osamp = sox_read(sf, obuf, *osamp);
+
+  // Decoding is finished when fileobject is exhausted and sox can no longer
+  // decode a sample.
+  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
+}
+
+// Flow callback of the file-object output effect.
+// Encodes the incoming samples with sox_write, forwards the encoded bytes to
+// the Python file object's write(), then rewinds the FILE* for the next call.
+// Never emits samples downstream (*osamp is always set to 0).
+// Returns SOX_EOF when fewer samples were written than supplied and libsox
+// reports no error; throws std::runtime_error when it does report one.
+auto fileobj_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) -> int {
+  *osamp = 0;
+  if (*isamp == 0) {
+    return SOX_SUCCESS;
+  }
+
+  auto* state = static_cast<FileObjOutputPriv*>(effp->priv);
+  sox_format_t* const sf = state->sf;
+  FILE* const fp = static_cast<FILE*>(sf->fp);
+
+  // Encode chunk
+  const auto num_samples_written = sox_write(sf, ibuf, *isamp);
+  fflush(fp);
+
+  // Copy the encoded chunk to python object.
+  state->fileobj->attr("write")(py::bytes(*state->buffer, ftell(fp)));
+
+  // Reset FILE*
+  sf->tell_off = 0;
+  fseek(fp, 0, SEEK_SET);
+
+  if (num_samples_written == *isamp) {
+    return SOX_SUCCESS;
+  }
+  if (sf->sox_errno) {
+    std::ostringstream stream;
+    stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
+           << sf->filename;
+    throw std::runtime_error(stream.str());
+  }
+  return SOX_EOF;
+}
+
+// Returns the handler describing the "input_fileobj_object" effect.
+// Only the drain callback is populated (fileobj_input_drain); priv_size is
+// sizeof(FileObjInputPriv).
+auto get_fileobj_input_handler() -> sox_effect_handler_t* {
+  // Function-local static: every call returns the address of the same object.
+  static sox_effect_handler_t fileobj_source = {
+      "input_fileobj_object",    // name
+      nullptr,                   // usage
+      SOX_EFF_MCHAN,             // flags
+      nullptr,                   // getopts
+      nullptr,                   // start
+      nullptr,                   // flow
+      fileobj_input_drain,       // drain
+      nullptr,                   // stop
+      nullptr,                   // kill
+      sizeof(FileObjInputPriv),  // priv_size
+  };
+  return &fileobj_source;
+}
+
+// Returns the handler describing the "output_fileobj_object" effect.
+// Only the flow callback is populated (fileobj_output_flow); priv_size is
+// sizeof(FileObjOutputPriv).
+auto get_fileobj_output_handler() -> sox_effect_handler_t* {
+  // Function-local static: every call returns the address of the same object.
+  static sox_effect_handler_t fileobj_sink = {
+      "output_fileobj_object",    // name
+      nullptr,                    // usage
+      SOX_EFF_MCHAN,              // flags
+      nullptr,                    // getopts
+      nullptr,                    // start
+      fileobj_output_flow,        // flow
+      nullptr,                    // drain
+      nullptr,                    // stop
+      nullptr,                    // kill
+      sizeof(FileObjOutputPriv),  // priv_size
+  };
+  return &fileobj_sink;
+}
+
+} // namespace
+
+// Adds the file-object input effect.
+// `sf` is the opened decoder, `buffer` / `buffer_size` describe the byte
+// buffer that the drain callback refills, and `fileobj` is the Python object
+// the bytes are read from. The signal info of `sf` becomes both the input and
+// the initial intermediate signal info.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChainPyBind::addInputFileObj(
+    sox_format_t* sf,
+    char* buffer,
+    uint64_t buffer_size,
+    py::object* fileobj) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(get_fileobj_input_handler()));
+
+  auto* state = static_cast<FileObjInputPriv*>(source->priv);
+  state->sf = sf;
+  state->fileobj = fileobj;
+  state->eof_reached = false;
+  state->buffer = buffer;
+  state->buffer_size = buffer_size;
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input fileobj");
+  }
+}
+
+// Adds the file-object output effect.
+// `sf` is the opened encoder, `buffer` / `buffer_size` point at the encoded
+// byte buffer and its size, and `fileobj` is the Python object that receives
+// the encoded bytes. The signal info of `sf` is recorded as the chain's
+// output signal info.
+// Throws std::runtime_error if libsox refuses the effect.
+void SoxEffectsChainPyBind::addOutputFileObj(
+    sox_format_t* sf,
+    char** buffer,
+    size_t* buffer_size,
+    py::object* fileobj) {
+  out_sig_ = sf->signal;
+
+  SoxEffect sink(sox_create_effect(get_fileobj_output_handler()));
+
+  auto* state = static_cast<FileObjOutputPriv*>(sink->priv);
+  state->sf = sf;
+  state->fileobj = fileobj;
+  state->buffer = buffer;
+  state->buffer_size = buffer_size;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output fileobj");
+  }
+}
+
+} // namespace paddleaudio::sox_effects_chain
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.h
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.h
@ -0,0 +1,78 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
+
+#pragma once
+
+#include <sox.h>
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+namespace paddleaudio::sox_effects_chain {
+
+// Owning wrapper around the sox_effect_t* returned by sox_create_effect;
+// the destructor releases the pointer. Copying and moving are disabled.
+
+struct SoxEffect {
+  explicit SoxEffect(sox_effect_t* se) noexcept;  // stores `se` without checking it
+  SoxEffect(const SoxEffect& other) = delete;
+  SoxEffect(const SoxEffect&& other) = delete;
+  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
+  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
+  ~SoxEffect();  // releases the wrapped pointer
+  operator sox_effect_t*() const;  // implicit conversion, so the wrapper can be passed to libsox calls
+  auto operator->() noexcept -> sox_effect_t*;  // member access on the wrapped effect
+
+ private:
+  sox_effect_t* se_;  // the wrapped effect
+};
+
+// Helper class that owns a sox_effects_chain_t (deleted in the destructor) and offers methods to populate and run it
+class SoxEffectsChain {
+  const sox_encodinginfo_t in_enc_;  // copy of the input encoding; its address is given to sox_create_effects_chain
+  const sox_encodinginfo_t out_enc_;  // copy of the output encoding; its address is given to sox_create_effects_chain
+
+ protected:
+  sox_signalinfo_t in_sig_;  // signal info of the chain's input, set by the addInput* methods
+  sox_signalinfo_t interm_sig_;  // signal info passed to every sox_add_effect call
+  sox_signalinfo_t out_sig_;  // signal info of the output, set by addOutputFile
+  sox_effects_chain_t* sec_;  // the underlying libsox chain
+
+ public:
+  explicit SoxEffectsChain(
+      sox_encodinginfo_t input_encoding,
+      sox_encodinginfo_t output_encoding);  // throws std::runtime_error if the chain cannot be created
+  SoxEffectsChain(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
+  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
+  ~SoxEffectsChain();
+  void run();  // runs the chain via sox_flow_effects
+  void addInputTensor(
+      py::array* waveform,
+      int64_t sample_rate,
+      bool channels_first);  // source effect reading from an array
+  void addInputFile(sox_format_t* sf);  // source effect reading from an opened file
+  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);  // sink effect given `output_buffer`
+  void addOutputFile(sox_format_t* sf);  // sink effect writing to an opened file
+  void addEffect(const std::vector<std::string> effect);  // effect[0] is the name, the rest are its options
+  int64_t getOutputNumChannels();  // channels of interm_sig_
+  int64_t getOutputSampleRate();  // rate of interm_sig_
+};
+
+class SoxEffectsChainPyBind : public SoxEffectsChain {  // Extends SoxEffectsChain with source/sink effects backed by Python file-like objects.
+  using SoxEffectsChain::SoxEffectsChain;  // inherit the base-class constructors
+
+ public:
+  void addInputFileObj(
+      sox_format_t* sf,
+      char* buffer,
+      uint64_t buffer_size,
+      py::object* fileobj);  // source effect: bytes are read from `fileobj` into `buffer` and decoded through `sf`
+
+  void addOutputFileObj(
+      sox_format_t* sf,
+      char** buffer,
+      size_t* buffer_size,
+      py::object* fileobj);  // sink effect: samples are encoded through `sf` and the bytes at `*buffer` are written to `fileobj`
+};
+
+} // namespace paddleaudio::sox_effects_chain
+
--- a/audio/paddleaudio/src/pybind/sox/io.cpp
+++ b/audio/paddleaudio/src/pybind/sox/io.cpp
@ -0,0 +1,279 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
+
+#include "paddleaudio/src/pybind/sox/io.h"
+#include "paddleaudio/src/pybind/sox/effects.h"
+#include "paddleaudio/src/pybind/sox/types.h"
+#include "paddleaudio/src/pybind/sox/effects_chain.h"
+#include "paddleaudio/src/pybind/sox/utils.h"
+#include "paddleaudio/src/optional/optional.hpp"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio {
+namespace sox_io {
+
+// Opens the audio file at `path` and returns its metadata as
+// (sample rate, number of frames, number of channels, bits per sample,
+// encoding name). The frame count is signal.length / signal.channels.
+// `format`, when given, is passed to libsox as the file type.
+auto get_info_file(const std::string &path,
+                   const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    const char *filetype =
+        format.has_value() ? format.value().c_str() : nullptr;
+
+    SoxFormat sf(sox_open_read(path.data(),
+                               /*signal=*/nullptr,
+                               /*encoding=*/nullptr,
+                               filetype));
+    validate_input_file(sf, path);
+
+    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
+    const auto num_frames =
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
+    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
+    const auto bits_per_sample =
+        static_cast<int64_t>(sf->encoding.bits_per_sample);
+
+    return std::make_tuple(sample_rate,
+                           num_frames,
+                           num_channels,
+                           bits_per_sample,
+                           get_encoding(sf->encoding.encoding));
+}
+
+// Translates an optional (frame_offset, num_frames) window into a list of
+// effects: either empty or a single "trim" effect.
+//   - num_frames given:          {"trim", "<offset>s", "+<frames>s"}
+//   - only a non-zero offset:    {"trim", "<offset>s"}
+//   - neither:                   no effect
+// Throws std::runtime_error for a negative offset, or for a num_frames that
+// is 0 or below -1.
+std::vector<std::vector<std::string>> get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames) {
+  const int64_t offset = frame_offset.value_or(0);
+  if (offset < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  // -1 stands for "read to the end".
+  const int64_t frames = num_frames.value_or(-1);
+  if (frames == 0 || frames < -1) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+
+  std::vector<std::vector<std::string>> effects;
+  const bool has_length = (frames != -1);
+  if (!has_length && offset == 0) {
+    return effects;
+  }
+
+  std::vector<std::string> trim{"trim", std::to_string(offset) + "s"};
+  if (has_length) {
+    trim.push_back("+" + std::to_string(frames) + "s");
+  }
+  effects.push_back(trim);
+  return effects;
+}
+
+// Reads the leading bytes of `fileobj` and returns the stream's metadata as
+// (sample rate, number of frames, number of channels, bits per sample,
+// encoding name). The frame count is signal.length / signal.channels.
+// `format`, when given, is passed to libsox as the file type.
+auto get_info_fileobj(py::object fileobj,
+                      const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
+    // Read buffer: the libsox buffer size, but never less than 4096 bytes.
+    const int64_t kDefaultCapacityInBytes = 4096;
+    const int64_t sox_bufsiz = get_buffer_size();
+    const int64_t capacity = (sox_bufsiz > kDefaultCapacityInBytes)
+                                 ? sox_bufsiz
+                                 : kDefaultCapacityInBytes;
+
+    std::string storage(capacity, '\0');
+    char *const data = const_cast<char *>(storage.data());
+    const auto num_read = read_fileobj(&fileobj, capacity, data);
+    // If the file is shorter than 256, then libsox cannot read the header.
+    const auto mem_size = (num_read > 256) ? num_read : 256;
+
+    const char *filetype =
+        format.has_value() ? format.value().c_str() : nullptr;
+    SoxFormat sf(sox_open_mem_read(data,
+                                   mem_size,
+                                   /*signal=*/nullptr,
+                                   /*encoding=*/nullptr,
+                                   filetype));
+
+    // In case of streamed data, length can be 0
+    validate_input_memfile(sf);
+
+    return std::make_tuple(
+        static_cast<int64_t>(sf->signal.rate),
+        static_cast<int64_t>(sf->signal.length / sf->signal.channels),
+        static_cast<int64_t>(sf->signal.channels),
+        static_cast<int64_t>(sf->encoding.bits_per_sample),
+        get_encoding(sf->encoding.encoding));
+}
+
+// Loads audio from a Python file-like object.
+// The frame window is turned into effects by get_effects, and the decoding is
+// delegated to sox_effects::apply_effects_fileobj.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+    auto trim_effects = get_effects(frame_offset, num_frames);
+    // `format` is a const reference, so it is passed on as-is: wrapping it in
+    // std::move would not move anything.
+    return paddleaudio::sox_effects::apply_effects_fileobj(
+        std::move(fileobj), trim_effects, normalize, channels_first, format);
+}
+
+// Loads audio from the file at `path`.
+// The frame window is turned into effects by get_effects, and the decoding is
+// delegated to sox_effects::apply_effects_file.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format) {
+    auto trim_effects = get_effects(frame_offset, num_frames);
+    return paddleaudio::sox_effects::apply_effects_file(
+        path, trim_effects, normalize, channels_first, format);
+}
+
+// Encodes `tensor` and writes it to the file at `path`.
+//
+// path:            destination file.
+// tensor:          samples to encode; the channel axis is 0 when
+//                  `channels_first` is true, otherwise 1.
+// sample_rate:     sample rate of `tensor`.
+// format:          file type; derived from `path` when not given.
+// compression, encoding, bits_per_sample:
+//                  forwarded to get_encodinginfo_for_save.
+//
+// Throws std::runtime_error when
+//   - the file type is "amr-nb", "htk" or "gsm" and the tensor is not
+//     single-channel,
+//   - the file type is "gsm" and the sample rate is not 8000,
+//   - the output file cannot be opened.
+// The format constraints are enforced with exceptions rather than assert() so
+// that they also hold in builds compiled with NDEBUG, matching
+// save_audio_fileobj.
+void save_audio_file(const std::string& path,
+                     py::array tensor,
+                     int64_t sample_rate,
+                     bool channels_first,
+                     tl::optional<double> compression,
+                     tl::optional<std::string> format,
+                     tl::optional<std::string> encoding,
+                     tl::optional<int64_t> bits_per_sample) {
+    validate_input_tensor(tensor);
+
+    // An explicit `format` wins; otherwise the type is derived from the path.
+    const auto filetype = [&]() {
+        if (format.has_value()) return format.value();
+        return get_filetype(path);
+    }();
+
+    if (filetype == "amr-nb") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "amr-nb format only supports single channel audio.");
+        }
+    } else if (filetype == "htk") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "htk format only supports single channel audio.");
+        }
+    } else if (filetype == "gsm") {
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        if (num_channels != 1) {
+            throw std::runtime_error(
+                "gsm format only supports single channel audio.");
+        }
+        if (sample_rate != 8000) {
+            throw std::runtime_error(
+                "gsm format only supports a sampling rate of 8kHz.");
+        }
+    }
+    const auto signal_info =
+        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+    const auto encoding_info = get_encodinginfo_for_save(
+        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
+
+    SoxFormat sf(sox_open_write(path.c_str(),
+                                &signal_info,
+                                &encoding_info,
+                                /*filetype=*/filetype.c_str(),
+                                /*oob=*/nullptr,
+                                /*overwrite_permitted=*/nullptr));
+
+    if (static_cast<sox_format_t*>(sf) == nullptr) {
+        throw std::runtime_error(
+            "Error saving audio file: failed to open file " + path);
+    }
+
+    // Chain: tensor source -> file sink.
+    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+        /*output_encoding=*/sf->encoding);
+    chain.addInputTensor(&tensor, sample_rate, channels_first);
+    chain.addOutputFile(sf);
+    chain.run();
+}
+
+namespace {
+// Holds the (pointer, size) pair used by save_audio_fileobj and releases the
+// pointer with free() on destruction. Neither copyable nor movable.
+struct AutoReleaseBuffer {
+  char* ptr;
+  size_t size;
+
+  // Starts out empty: null pointer, size 0.
+  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
+  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
+  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
+  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
+  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
+
+  // free() accepts a null pointer and does nothing in that case, so no
+  // explicit guard is required.
+  ~AutoReleaseBuffer() {
+    free(ptr);
+  }
+};
+
+} // namespace
+
+// Encodes `tensor` and writes the result to a Python file-like object.
+//
+// `format` is mandatory here. The channel axis of `tensor` is 0 when
+// `channels_first` is true, otherwise 1. `compression`, `encoding` and
+// `bits_per_sample` are forwarded to get_encodinginfo_for_save.
+//
+// Throws std::runtime_error when `format` is missing, when "amr-nb", "htk" or
+// "gsm" is requested for a tensor that is not single-channel, when "gsm" is
+// requested with a sample rate other than 8000, or when the memory stream
+// cannot be opened.
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample) {
+
+  if (!format.has_value()) {
+    throw std::runtime_error(
+        "`format` is required when saving to file object.");
+  }
+  const auto filetype = format.value();
+
+  // Formats restricted to one channel; the pointer stays null for all others.
+  const char* mono_only_error = nullptr;
+  if (filetype == "amr-nb") {
+    mono_only_error = "amr-nb format only supports single channel audio.";
+  } else if (filetype == "htk") {
+    mono_only_error = "htk format only supports single channel audio.";
+  } else if (filetype == "gsm") {
+    mono_only_error = "gsm format only supports single channel audio.";
+  }
+  if (mono_only_error != nullptr) {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(mono_only_error);
+    }
+    // gsm additionally fixes the sample rate.
+    if (filetype == "gsm" && sample_rate != 8000) {
+      throw std::runtime_error(
+          "gsm format only supports a sampling rate of 8kHz.");
+    }
+  }
+
+  const auto signal_info =
+      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+  const auto encoding_info = get_encodinginfo_for_save(
+      filetype,
+      tensor.dtype(),
+      compression,
+      std::move(encoding),
+      bits_per_sample);
+
+  // Receives the pointer and size of the memory stream opened below.
+  AutoReleaseBuffer buffer;
+
+  SoxFormat sf(sox_open_memstream_write(
+      &buffer.ptr,
+      &buffer.size,
+      &signal_info,
+      &encoding_info,
+      filetype.c_str(),
+      /*oob=*/nullptr));
+
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error(
+        "Error saving audio file: failed to open memory stream.");
+  }
+
+  // Chain: tensor source -> file-object sink.
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+      /*output_encoding=*/sf->encoding);
+  chain.addInputTensor(&tensor, sample_rate, channels_first);
+  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
+  chain.run();
+
+  // Closing the sox_format_t is necessary for flushing the last chunk to the
+  // buffer
+  sf.close();
+  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
+}
+
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/io.h
+++ b/audio/paddleaudio/src/pybind/sox/io.h
@ -0,0 +1,61 @@
+// the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
+#pragma once
+
+#include "paddleaudio/src/pybind/sox/utils.h"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace sox_io {
+
+auto get_info_file(const std::string &path, // Metadata of the audio file at `path`.
+                   const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;  // (sample rate, frames, channels, bits per sample, encoding name)
+
+auto get_info_fileobj(py::object fileobj,  // Same metadata, read from a Python file-like object.
+                   const tl::optional<std::string> &format)
+    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;  // (sample rate, frames, channels, bits per sample, encoding name)
+
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(  // Loads audio from a Python file-like object.
+    py::object fileobj,
+    const tl::optional<int64_t>& frame_offset,  // must be non-negative when given
+    const tl::optional<int64_t>& num_frames,  // must be -1 or greater than 0 when given
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+void save_audio_fileobj(  // Encodes `tensor` and writes the bytes to a Python file-like object.
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,  // required; a missing value raises std::runtime_error
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
+auto get_effects(const tl::optional<int64_t>& frame_offset,  // Builds the optional "trim" effect for a frame window.
+                 const tl::optional<int64_t>& num_frames)
+    -> std::vector<std::vector<std::string>>;  // empty, or one {"trim", ...} entry
+
+
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(  // Loads audio from the file at `path`.
+    const std::string& path,
+    const tl::optional<int64_t>& frame_offset,  // must be non-negative when given
+    const tl::optional<int64_t>& num_frames,  // must be -1 or greater than 0 when given
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format);
+
+void save_audio_file(const std::string& path,  // Encodes `tensor` and writes it to the file at `path`.
+                     py::array tensor,
+                     int64_t sample_rate,
+                     bool channels_first,
+                     tl::optional<double> compression,
+                     tl::optional<std::string> format,  // derived from `path` when not given
+                     tl::optional<std::string> encoding,
+                     tl::optional<int64_t> bits_per_sample);    
+
+
+}  // namespace sox_io
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.cpp
+++ b/audio/paddleaudio/src/pybind/sox/types.cpp
@ -0,0 +1,143 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
+
+#include "paddleaudio/src/pybind/sox/types.h"
+#include <ostream>
+#include <sstream>
+
+namespace paddleaudio {
+namespace sox_utils {
+
+// Maps a format name to its Format enumerator.
+// "ogg" and "vorbis" both map to Format::VORBIS. A name that is not in the
+// table raises std::runtime_error.
+Format get_format_from_string(const std::string& format) {
+  static const struct {
+    const char* name;
+    Format value;
+  } kKnownFormats[] = {
+      {"wav", Format::WAV},
+      {"mp3", Format::MP3},
+      {"flac", Format::FLAC},
+      {"ogg", Format::VORBIS},
+      {"vorbis", Format::VORBIS},
+      {"amr-nb", Format::AMR_NB},
+      {"amr-wb", Format::AMR_WB},
+      {"amb", Format::AMB},
+      {"sph", Format::SPHERE},
+      {"htk", Format::HTK},
+      {"gsm", Format::GSM},
+  };
+
+  for (const auto& entry : kKnownFormats) {
+    if (format == entry.name) {
+      return entry.value;
+    }
+  }
+
+  std::ostringstream stream;
+  stream << "Internal Error: unexpected format value: " << format;
+  throw std::runtime_error(stream.str());
+}
+
+// Returns the short textual name of an Encoding value.
+// Values without a case below (including Encoding::NOT_PROVIDED) raise
+// std::runtime_error.
+std::string to_string(Encoding v) {
+  switch (v) {
+    case Encoding::UNKNOWN:      return "UNKNOWN";
+    case Encoding::PCM_SIGNED:   return "PCM_S";
+    case Encoding::PCM_UNSIGNED: return "PCM_U";
+    case Encoding::PCM_FLOAT:    return "PCM_F";
+    case Encoding::FLAC:         return "FLAC";
+    case Encoding::ULAW:         return "ULAW";
+    case Encoding::ALAW:         return "ALAW";
+    case Encoding::MP3:          return "MP3";
+    case Encoding::VORBIS:       return "VORBIS";
+    case Encoding::AMR_WB:       return "AMR_WB";
+    case Encoding::AMR_NB:       return "AMR_NB";
+    case Encoding::OPUS:         return "OPUS";
+    default:
+      break;
+  }
+  throw std::runtime_error("Internal Error: unexpected encoding.");
+}
+
+// Converts an optional encoding name into an Encoding value.
+// An empty option yields Encoding::NOT_PROVIDED. The accepted names are
+// "PCM_S", "PCM_U", "PCM_F", "ULAW" and "ALAW"; any other name raises
+// std::runtime_error.
+Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
+  if (!encoding.has_value()) {
+    return Encoding::NOT_PROVIDED;
+  }
+  const std::string& v = encoding.value();
+
+  static const struct {
+    const char* name;
+    Encoding value;
+  } kAcceptedNames[] = {
+      {"PCM_S", Encoding::PCM_SIGNED},
+      {"PCM_U", Encoding::PCM_UNSIGNED},
+      {"PCM_F", Encoding::PCM_FLOAT},
+      {"ULAW", Encoding::ULAW},
+      {"ALAW", Encoding::ALAW},
+  };
+  for (const auto& entry : kAcceptedNames) {
+    if (v == entry.name) {
+      return entry.value;
+    }
+  }
+
+  std::ostringstream stream;
+  stream << "Internal Error: unexpected encoding value: " << v;
+  throw std::runtime_error(stream.str());
+}
+
+// Converts an optional bit depth into a BitDepth value.
+// An empty option yields BitDepth::NOT_PROVIDED. The accepted depths are 8,
+// 16, 24, 32 and 64; any other number raises std::runtime_error.
+BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
+  if (!bit_depth.has_value()) {
+    return BitDepth::NOT_PROVIDED;
+  }
+  const int64_t v = bit_depth.value();
+
+  if (v == 8) {
+    return BitDepth::B8;
+  }
+  if (v == 16) {
+    return BitDepth::B16;
+  }
+  if (v == 24) {
+    return BitDepth::B24;
+  }
+  if (v == 32) {
+    return BitDepth::B32;
+  }
+  if (v == 64) {
+    return BitDepth::B64;
+  }
+
+  std::ostringstream s;
+  s << "Internal Error: unexpected bit depth value: " << v;
+  throw std::runtime_error(s.str());
+}
+
+// Returns the short textual name of a libsox encoding.
+// Encodings without a case below are reported as "UNKNOWN"; this function
+// never throws for an unrecognised value.
+std::string get_encoding(sox_encoding_t encoding) {
+  switch (encoding) {
+    case SOX_ENCODING_UNKNOWN:  return "UNKNOWN";
+    case SOX_ENCODING_SIGN2:    return "PCM_S";
+    case SOX_ENCODING_UNSIGNED: return "PCM_U";
+    case SOX_ENCODING_FLOAT:    return "PCM_F";
+    case SOX_ENCODING_FLAC:     return "FLAC";
+    case SOX_ENCODING_ULAW:     return "ULAW";
+    case SOX_ENCODING_ALAW:     return "ALAW";
+    case SOX_ENCODING_MP3:      return "MP3";
+    case SOX_ENCODING_VORBIS:   return "VORBIS";
+    case SOX_ENCODING_AMR_WB:   return "AMR_WB";
+    case SOX_ENCODING_AMR_NB:   return "AMR_NB";
+    case SOX_ENCODING_OPUS:     return "OPUS";
+    case SOX_ENCODING_GSM:      return "GSM";
+    default:
+      break;
+  }
+  return "UNKNOWN";
+}
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.h
+++ b/audio/paddleaudio/src/pybind/sox/types.h
@ -0,0 +1,58 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
+#pragma once
+
+#include <sox.h>
+#include "paddleaudio/src/optional/optional.hpp"
+
+namespace paddleaudio {
+namespace sox_utils {
+
+enum class Format {  // File formats that get_format_from_string can return.
+  WAV,
+  MP3,
+  FLAC,
+  VORBIS,  // selected by both "ogg" and "vorbis"
+  AMR_NB,
+  AMR_WB,
+  AMB,
+  SPHERE,  // selected by "sph"
+  GSM,
+  HTK,
+};
+
+Format get_format_from_string(const std::string& format);  // throws std::runtime_error for an unrecognised name
+
+enum class Encoding {  // Sample encodings used by the sox bindings.
+  NOT_PROVIDED,  // returned by get_encoding_from_option when the option is empty
+  UNKNOWN,
+  PCM_SIGNED,  // "PCM_S"
+  PCM_UNSIGNED,  // "PCM_U"
+  PCM_FLOAT,  // "PCM_F"
+  FLAC,
+  ULAW,
+  ALAW,
+  MP3,
+  VORBIS,
+  AMR_WB,
+  AMR_NB,
+  OPUS,
+};
+
+std::string to_string(Encoding v);  // short name; throws std::runtime_error for values it has no name for
+Encoding get_encoding_from_option(const tl::optional<std::string> encoding);  // accepts "PCM_S", "PCM_U", "PCM_F", "ULAW", "ALAW"; throws otherwise
+
+enum class BitDepth : unsigned {  // Each enumerator's value equals its number of bits.
+  NOT_PROVIDED = 0,  // returned by get_bit_depth_from_option when the option is empty
+  B8 = 8,
+  B16 = 16,
+  B24 = 24,
+  B32 = 32,
+  B64 = 64,
+};
+
+BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);  // accepts 8, 16, 24, 32, 64; throws std::runtime_error otherwise
+
+std::string get_encoding(sox_encoding_t encoding);  // short name of a libsox encoding; "UNKNOWN" for unlisted values
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/utils.cpp
+++ b/audio/paddleaudio/src/pybind/sox/utils.cpp
@ -0,0 +1,550 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp with modification.
+#include <sox.h>
+
+#include "paddleaudio/src/pybind/sox/utils.h"
+#include "paddleaudio/src/pybind/sox/types.h"
+
+#include <sstream>
+
+namespace paddleaudio {
+namespace sox_utils {
+
+/// Read up to `size` bytes from a Python file-like object into `buffer`.
+///
+/// `fileobj.read(n)` is called repeatedly, each time asking for the bytes
+/// still missing, until `size` bytes have been copied or `read` returns an
+/// empty chunk.
+///
+/// @param fileobj Python object exposing `read(n)`; the result is cast to
+/// `py::bytes`.
+/// @param size Maximum number of bytes to copy. `buffer` must have room for
+/// at least this many bytes.
+/// @param buffer Destination the chunks are appended to.
+/// @return Number of bytes actually copied (may be less than `size`).
+/// @throws std::runtime_error if a single `read` call returns more bytes
+/// than were requested.
+auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
+    -> uint64_t {
+    uint64_t num_read = 0;
+    while (num_read < size) {
+        auto request = size - num_read;
+        auto chunk = static_cast<std::string>(
+            static_cast<py::bytes>(fileobj->attr("read")(request)));
+        auto chunk_len = chunk.length();
+        if (chunk_len == 0) {
+            // An empty chunk means there is nothing more to read.
+            break;
+        }
+        if (chunk_len > request) {
+            // Copying an oversized chunk would overrun `buffer`.
+            std::ostringstream message;
+            message
+                << "Requested up to " << request << " bytes, but "
+                << "received " << chunk_len << " bytes. "
+                << "The given object does not conform to the read protocol "
+                   "of file objects.";
+            throw std::runtime_error(message.str());
+        }
+        memcpy(buffer, chunk.data(), chunk_len);
+        buffer += chunk_len;
+        num_read += chunk_len;
+    }
+    return num_read;
+}
+
+
+/// Store `seed`, narrowed to `sox_int32_t`, in the `ranqd1` field of the
+/// libsox global state.
+void set_seed(const int64_t seed) {
+  sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
+}
+
+/// Set the libsox global verbosity level (`verbosity` is cast to unsigned).
+void set_verbosity(const int64_t verbosity) {
+  sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
+}
+
+/// Set the libsox global `use_threads` flag.
+void set_use_threads(const bool use_threads) {
+  sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
+}
+
+/// Set the libsox global buffer size `bufsiz` (`buffer_size` is cast to
+/// size_t).
+void set_buffer_size(const int64_t buffer_size) {
+  sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
+}
+
+/// Return the current libsox global buffer size `bufsiz`.
+int64_t get_buffer_size() {
+  return sox_get_globals()->bufsiz;
+}
+
+/// List the effects libsox reports, as {name, usage} pairs.
+///
+/// Handlers without a name and effects listed in UNSUPPORTED_EFFECTS are left
+/// out. A missing usage string is reported as "".
+std::vector<std::vector<std::string>> list_effects() {
+  std::vector<std::vector<std::string>> result;
+  const sox_effect_fn_t* fn_iter = sox_get_effect_fns();
+  while (*fn_iter) {
+    const sox_effect_handler_t* h = (*fn_iter)();
+    ++fn_iter;
+    if (!h || !h->name) {
+      continue;
+    }
+    if (UNSUPPORTED_EFFECTS.find(h->name) != UNSUPPORTED_EFFECTS.end()) {
+      continue;
+    }
+    const std::string usage =
+        h->usage ? std::string(h->usage) : std::string("");
+    result.emplace_back(std::vector<std::string>{h->name, usage});
+  }
+  return result;
+}
+
+/// List every format name libsox can write.
+///
+/// Only handlers that provide a `write` callback contribute, and names
+/// containing '/' are skipped.
+std::vector<std::string> list_write_formats() {
+  std::vector<std::string> result;
+  const sox_format_tab_t* entry = sox_get_format_fns();
+  while (entry->fn) {
+    const sox_format_handler_t* h = entry->fn();
+    if (h->write) {
+      for (const char* const* alias = h->names; *alias; ++alias) {
+        if (strchr(*alias, '/') == nullptr) {
+          result.emplace_back(*alias);
+        }
+      }
+    }
+    ++entry;
+  }
+  return result;
+}
+
+/// List every format name libsox can read.
+///
+/// Only handlers that provide a `read` callback contribute, and names
+/// containing '/' are skipped.
+std::vector<std::string> list_read_formats() {
+  std::vector<std::string> result;
+  const sox_format_tab_t* entry = sox_get_format_fns();
+  while (entry->fn) {
+    const sox_format_handler_t* h = entry->fn();
+    if (h->read) {
+      for (const char* const* alias = h->names; *alias; ++alias) {
+        if (strchr(*alias, '/') == nullptr) {
+          result.emplace_back(*alias);
+        }
+      }
+    }
+    ++entry;
+  }
+  return result;
+}
+
+/// Wrap `fd`; the handle is released by close() or by the destructor.
+/// `fd` may be null (e.g. a failed open), in which case close() is a no-op.
+SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
+/// Close the wrapped handle if it is still open.
+SoxFormat::~SoxFormat() {
+  close();
+}
+
+/// Member access on the wrapped handle. Does not check for null.
+sox_format_t* SoxFormat::operator->() const noexcept {
+  return fd_;
+}
+/// Implicit conversion to the raw handle, so the wrapper can be passed
+/// wherever a `sox_format_t*` is expected.
+SoxFormat::operator sox_format_t*() const noexcept {
+  return fd_;
+}
+
+/// Call sox_close() on the handle and reset it to null, so calling close()
+/// again (or running the destructor afterwards) does nothing.
+void SoxFormat::close() {
+  if (fd_ != nullptr) {
+    sox_close(fd_);
+    fd_ = nullptr;
+  }
+}
+
+/// Check that `sf` wraps a successfully opened input with a known encoding.
+///
+/// @param sf Wrapper around the handle to check.
+/// @param path Only used to build the error message.
+/// @throws std::runtime_error if the handle is null or its encoding is
+/// SOX_ENCODING_UNKNOWN.
+void validate_input_file(const SoxFormat& sf, const std::string& path) {
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error(
+        "Error loading audio file: failed to open file " + path);
+  }
+  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    throw std::runtime_error("Error loading audio file: unknown encoding.");
+  }
+}
+
+/// Same checks as validate_input_file(), with a fixed placeholder used in
+/// place of the file path in error messages.
+void validate_input_memfile(const SoxFormat &sf) {
+    return validate_input_file(sf, "<in memory buffer>");
+}
+
+/// Check that `tensor` is a 2D array of a sample type this module handles.
+///
+/// The error message (and the header documentation) promise float32, int32,
+/// int16 and uint8. The original check rejected int16 and uint8; both are now
+/// accepted. float64 ('d') and C long ('l') were accepted by the original
+/// check and are still let through here so existing callers are not rejected
+/// at this point.
+///
+/// @throws std::runtime_error if the array is not 2D or has another dtype.
+void validate_input_tensor(const py::array tensor) {
+  if (tensor.ndim() != 2) {
+    throw std::runtime_error("Input tensor has to be 2D.");
+  }
+
+  switch (tensor.dtype().char_()) {
+    case 'f':  // float32
+    case 'i':  // int32 (C int)
+    case 'h':  // int16
+    case 'B':  // uint8
+    case 'd':  // float64, accepted by the original check
+    case 'l':  // C long, accepted by the original check
+      return;
+    default:
+      throw std::runtime_error(
+          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
+  }
+}
+
+/// Choose the NumPy dtype used to hold samples decoded from a file.
+///
+/// @param encoding libsox encoding of the input.
+/// @param precision Bits per sample reported by libsox; only consulted for
+/// signed PCM.
+/// @return uint8 for unsigned PCM, int16 for 16-bit signed PCM, C int for
+/// 24/32-bit signed PCM, float32 for everything else.
+/// @throws std::runtime_error for signed PCM with any other precision.
+py::dtype get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision) {
+    // All dtypes are built from format *strings*. The original used the
+    // multi-character literal 'u1', which is an implementation-defined int,
+    // not the text "u1", and so could not name the uint8 dtype.
+    switch (encoding) {
+      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
+        return py::dtype("u1");
+      case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
+        switch (precision) {
+          case 16:
+            return py::dtype("i2");
+          case 24: // Cast 24-bit to 32-bit.
+          case 32:
+            return py::dtype("i");
+          default:
+            throw std::runtime_error(
+                "Only 16, 24, and 32 bits are supported for signed PCM.");
+        }
+      default:
+        // default to float32 for the other formats, including
+        // 32-bit floating-point WAV,
+        // MP3,
+        // FLAC,
+        // VORBIS etc...
+        return py::dtype("f");
+    }
+}
+
+namespace {
+
+/// Copy a C-contiguous [num_rows, num_channels] array holding elements of
+/// type `T` into a newly allocated [num_channels, num_rows] array of `dtype`.
+template <typename T>
+py::array to_channels_first(
+    const py::array& frames,
+    const py::dtype& dtype,
+    const int32_t num_rows,
+    const int32_t num_channels) {
+  py::array transposed = py::array(dtype, {num_channels, num_rows});
+  const T* src = static_cast<const T*>(frames.data());
+  T* dst = static_cast<T*>(transposed.mutable_data());
+  for (int32_t ch = 0; ch < num_channels; ++ch) {
+    for (int32_t row = 0; row < num_rows; ++row) {
+      dst[static_cast<size_t>(ch) * num_rows + row] =
+          src[static_cast<size_t>(row) * num_channels + ch];
+    }
+  }
+  return transposed;
+}
+
+} // namespace
+
+/// Convert an interleaved sox sample buffer into a 2D NumPy array.
+///
+/// @param buffer Interleaved samples (frame after frame).
+/// @param num_samples Total number of samples in `buffer` (all channels).
+/// @param num_channels Number of interleaved channels; must be positive.
+/// @param dtype Requested element type: float32 ('f'), C int ('i'),
+/// int16 ('h') or uint8 ('B'; 'b' is still accepted as before).
+/// @param normalize When true the result is always float32, whatever
+/// `dtype` says.
+/// @param channels_first When true the result has shape
+/// [num_channels, num_frames], otherwise [num_frames, num_channels].
+/// @throws std::runtime_error for a non-positive channel count or an
+/// unsupported dtype.
+py::array convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const py::dtype dtype,
+    const bool normalize,
+    const bool channels_first) {
+  if (num_channels <= 0) {
+    // Guards the division below.
+    throw std::runtime_error("Number of channels must be positive.");
+  }
+  uint64_t dummy = 0;  // clip counter demanded by the SOX_SAMPLE_TO_* macros
+  SOX_SAMPLE_LOCALS;   // scratch variables used by the SOX_SAMPLE_TO_* macros
+  const int32_t num_rows = num_samples / num_channels;
+  // Never write past the num_rows x num_channels elements that get allocated,
+  // even if num_samples is not a multiple of num_channels.
+  const int32_t num_filled = num_rows * num_channels;
+  const char kind = dtype.char_();
+
+  // mutable_data() is called without indices throughout: the indexed form
+  // bounds-checks and throws for an empty array.
+  py::array t;
+  if (normalize || kind == 'f') {
+    // The samples are written as 32-bit floats, so the array must be float32
+    // even when the requested dtype is a narrower integer type; allocating
+    // with `dtype` here would overrun the buffer.
+    const py::dtype float32("f");
+    t = py::array(float32, {num_rows, num_channels});
+    auto ptr = static_cast<float*>(t.mutable_data());
+    for (int32_t i = 0; i < num_filled; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
+    }
+    if (channels_first) {
+      return to_channels_first<float>(t, float32, num_rows, num_channels);
+    }
+  } else if (kind == 'i') {
+    t = py::array(dtype, {num_rows, num_channels});
+    auto ptr = static_cast<int*>(t.mutable_data());
+    for (int32_t i = 0; i < num_filled; ++i) {
+      ptr[i] = buffer[i];
+    }
+    if (channels_first) {
+      return to_channels_first<int>(t, dtype, num_rows, num_channels);
+    }
+  } else if (kind == 'h') { // int16
+    t = py::array(dtype, {num_rows, num_channels});
+    auto ptr = static_cast<int16_t*>(t.mutable_data());
+    for (int32_t i = 0; i < num_filled; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
+    }
+    if (channels_first) {
+      return to_channels_first<int16_t>(t, dtype, num_rows, num_channels);
+    }
+  } else if (kind == 'B' || kind == 'b') {
+    // 'B' is NumPy's uint8. 'b' (int8) is what the original code tested for
+    // and is still accepted so existing callers keep working.
+    t = py::array(dtype, {num_rows, num_channels});
+    auto ptr = static_cast<uint8_t*>(t.mutable_data());
+    for (int32_t i = 0; i < num_filled; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
+    }
+    if (channels_first) {
+      return to_channels_first<uint8_t>(t, dtype, num_rows, num_channels);
+    }
+  } else {
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  return t;
+}
+
+/// Return the lower-cased extension of `path`, without the dot.
+///
+/// Only a dot in the last path component counts, so "/data.v1/track" has no
+/// extension. Both '/' and '\\' are treated as separators. An empty string is
+/// returned when there is no extension.
+const std::string get_filetype(const std::string path) {
+  const std::string::size_type sep = path.find_last_of("/\\");
+  const std::string::size_type dot = path.find_last_of('.');
+  if (dot == std::string::npos ||
+      (sep != std::string::npos && dot < sep)) {
+    return std::string();
+  }
+  std::string ext = path.substr(dot + 1);
+  // tolower() has undefined behaviour for negative values, which plain
+  // `char` produces for non-ASCII bytes; go through unsigned char.
+  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
+    return static_cast<char>(::tolower(c));
+  });
+  return ext;
+}
+
+namespace {
+
+/// Resolve the libsox encoding and bit depth for saving WAV-like formats.
+///
+/// @param format Format name, only used in error messages.
+/// @param dtype dtype of the waveform; consulted only when neither an
+/// encoding nor a bit depth was requested.
+/// @param encoding Requested encoding, or Encoding::NOT_PROVIDED.
+/// @param bits_per_sample Requested bit depth, or BitDepth::NOT_PROVIDED.
+/// @return (libsox encoding, bits per sample).
+/// @throws std::runtime_error for combinations the format does not support.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
+    const std::string format,
+    py::dtype dtype,
+    const Encoding& encoding,
+    const BitDepth& bits_per_sample) {
+  switch (encoding) {
+    case Encoding::NOT_PROVIDED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+          // Nothing requested: derive both values from the NumPy type number.
+          switch (dtype.num()) {
+            case 11: // NPY_FLOAT (float32)
+              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
+            case 5: // NPY_INT (C int)
+              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
+            case 3: // NPY_SHORT (int16)
+              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
+            case 1: // NPY_BYTE (int8), accepted by the original mapping
+            case 2: // NPY_UBYTE (uint8), previously missing
+              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+            default:
+              throw std::runtime_error("Internal Error: Unexpected dtype.");
+          }
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+        default:
+          return std::make_tuple<>(
+              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
+      }
+    case Encoding::PCM_SIGNED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
+        case BitDepth::B8:
+          throw std::runtime_error(
+              format + " does not support 8-bit signed PCM encoding.");
+        default:
+          return std::make_tuple<>(
+              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
+      }
+    case Encoding::PCM_UNSIGNED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for unsigned PCM encoding.");
+      }
+    case Encoding::PCM_FLOAT:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B32:
+          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
+        case BitDepth::B64:
+          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
+        default:
+          throw std::runtime_error(
+              format +
+              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
+      }
+    case Encoding::ULAW:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for mu-law encoding.");
+      }
+    case Encoding::ALAW:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for a-law encoding.");
+      }
+    default:
+      throw std::runtime_error(
+          format + " does not support encoding: " + to_string(encoding));
+  }
+}
+
+/// Resolve the libsox encoding and bit depth used to save `format`.
+///
+/// The three textual/optional arguments are first converted to `Format`,
+/// `Encoding` and `BitDepth`; the result is then chosen per format.
+///
+/// @param format Format name, passed to get_format_from_string().
+/// @param dtype dtype of the waveform; only forwarded for WAV and AMB.
+/// @param encoding Optional encoding name.
+/// @param bits_per_sample Optional bit depth.
+/// @return (libsox encoding, bits per sample).
+/// @throws std::runtime_error when the format rejects the given options, or
+/// for a format without a case below.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample) {
+  const Format fmt = get_format_from_string(format);
+  const Encoding enc = get_encoding_from_option(encoding);
+  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
+
+  switch (fmt) {
+    case Format::WAV:
+    case Format::AMB:
+      return get_save_encoding_for_wav(format, dtype, enc, bps);
+    // MP3, HTK, VORBIS, AMR_NB and GSM accept neither option and always
+    // report 16 bits.
+    case Format::MP3:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("mp3 does not support `encoding` option.");
+      if (bps != BitDepth::NOT_PROVIDED)
+        throw std::runtime_error(
+            "mp3 does not support `bits_per_sample` option.");
+      return std::make_tuple<>(SOX_ENCODING_MP3, 16);
+    case Format::HTK:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("htk does not support `encoding` option.");
+      if (bps != BitDepth::NOT_PROVIDED)
+        throw std::runtime_error(
+            "htk does not support `bits_per_sample` option.");
+      return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
+    case Format::VORBIS:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("vorbis does not support `encoding` option.");
+      if (bps != BitDepth::NOT_PROVIDED)
+        throw std::runtime_error(
+            "vorbis does not support `bits_per_sample` option.");
+      return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
+    case Format::AMR_NB:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("amr-nb does not support `encoding` option.");
+      if (bps != BitDepth::NOT_PROVIDED)
+        throw std::runtime_error(
+            "amr-nb does not support `bits_per_sample` option.");
+      return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
+    case Format::FLAC:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("flac does not support `encoding` option.");
+      switch (bps) {
+        case BitDepth::B32:
+        case BitDepth::B64:
+          throw std::runtime_error(
+              "flac does not support `bits_per_sample` larger than 24.");
+        default:
+          // Includes BitDepth::NOT_PROVIDED, which casts to 0.
+          return std::make_tuple<>(
+              SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
+      }
+    case Format::SPHERE:
+      switch (enc) {
+        case Encoding::NOT_PROVIDED:
+        case Encoding::PCM_SIGNED:
+          switch (bps) {
+            case BitDepth::NOT_PROVIDED:
+              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
+            default:
+              return std::make_tuple<>(
+                  SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
+          }
+        case Encoding::PCM_UNSIGNED:
+          throw std::runtime_error(
+              "sph does not support unsigned integer PCM.");
+        case Encoding::PCM_FLOAT:
+          throw std::runtime_error("sph does not support floating point PCM.");
+        case Encoding::ULAW:
+          switch (bps) {
+            case BitDepth::NOT_PROVIDED:
+            case BitDepth::B8:
+              return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
+            default:
+              throw std::runtime_error(
+                  "sph only supports 8-bit for mu-law encoding.");
+          }
+        case Encoding::ALAW:
+          switch (bps) {
+            case BitDepth::NOT_PROVIDED:
+            case BitDepth::B8:
+              return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
+            default:
+              // NOTE(review): unlike the mu-law case above, other bit depths
+              // are passed through here instead of being rejected -- confirm
+              // this asymmetry is intended.
+              return std::make_tuple<>(
+                  SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
+          }
+        default:
+          // `enc` is neither NOT_PROVIDED nor one of the cases above here.
+          // NOTE(review): `encoding.value()` relies on the option being set
+          // in that situation -- confirm against get_encoding_from_option().
+          throw std::runtime_error(
+              "sph does not support encoding: " + encoding.value());
+      }
+    case Format::GSM:
+      if (enc != Encoding::NOT_PROVIDED)
+        throw std::runtime_error("gsm does not support `encoding` option.");
+      if (bps != BitDepth::NOT_PROVIDED)
+        throw std::runtime_error(
+            "gsm does not support `bits_per_sample` option.");
+      return std::make_tuple<>(SOX_ENCODING_GSM, 16);
+
+    default:
+      // Reached for any Format without a case above (e.g. Format::AMR_WB).
+      throw std::runtime_error("Unsupported format: " + format);
+  }
+}
+
+/// Sample precision (in bits) to report to libsox for a file type.
+///
+/// @param filetype Lower-case file type name such as "wav" or "flac".
+/// @param dtype dtype of the waveform; only consulted for "wav" and "amb".
+/// @return Precision in bits, or SOX_UNSPEC for mp3/ogg/vorbis.
+/// @throws std::runtime_error for an unsupported dtype or file type.
+unsigned get_precision(const std::string filetype, py::dtype dtype) {
+  if (filetype == "mp3")
+    return SOX_UNSPEC;
+  if (filetype == "flac")
+    return 24;
+  if (filetype == "ogg" || filetype == "vorbis")
+    return SOX_UNSPEC;
+  if (filetype == "wav" || filetype == "amb") {
+    // Cases are NumPy type numbers.
+    switch (dtype.num()) {
+      case 1: // NPY_BYTE (int8), accepted by the original mapping
+      case 2: // NPY_UBYTE (uint8), previously missing
+        return 8;
+      case 3: // NPY_SHORT (int16)
+        return 16;
+      case 5: // NPY_INT (C int)
+        return 32;
+      case 11: // NPY_FLOAT (float32)
+        return 32;
+      default:
+        throw std::runtime_error("Unsupported dtype.");
+    }
+  }
+  if (filetype == "sph")
+    return 32;
+  if (filetype == "amr-nb") {
+    return 16;
+  }
+  if (filetype == "gsm") {
+    return 16;
+  }
+  if (filetype == "htk") {
+    return 16;
+  }
+  throw std::runtime_error("Unsupported file type: " + filetype);
+}
+
+} // namespace
+
+/// Describe `waveform` as a sox_signalinfo_t.
+///
+/// The channel count comes from axis 0 when `channels_first` is set and from
+/// axis 1 otherwise; the precision comes from get_precision(); the length is
+/// the total element count of the array.
+sox_signalinfo_t get_signalinfo(
+    const py::array* waveform,
+    const int64_t sample_rate,
+    const std::string filetype,
+    const bool channels_first) {
+  const int channel_axis = channels_first ? 0 : 1;
+  const sox_rate_t rate = static_cast<sox_rate_t>(sample_rate);
+  const unsigned channels =
+      static_cast<unsigned>(waveform->shape(channel_axis));
+  const unsigned precision = get_precision(filetype, waveform->dtype());
+  const uint64_t length = static_cast<uint64_t>(waveform->size());
+  return sox_signalinfo_t{rate, channels, precision, length};
+}
+
+/// Build the sox_encodinginfo_t describing in-memory samples of `dtype`.
+///
+/// The encoding and bit depth are chosen from the NumPy type number; the
+/// remaining fields are set to HUGE_VAL (compression), sox_option_default
+/// (the three reverse_* fields) and sox_false (opposite_endian).
+///
+/// @throws std::runtime_error for any other dtype.
+sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
+  sox_encoding_t encoding = SOX_ENCODING_UNKNOWN;
+  unsigned bits_per_sample = 0;
+  // One switch instead of two parallel ones, so the encoding and the bit
+  // depth for a dtype cannot drift apart.
+  switch (dtype.num()) {
+    case 1: // NPY_BYTE (int8), accepted by the original mapping
+    case 2: // NPY_UBYTE (uint8), previously missing
+      encoding = SOX_ENCODING_UNSIGNED;
+      bits_per_sample = 8;
+      break;
+    case 3: // NPY_SHORT (int16)
+      encoding = SOX_ENCODING_SIGN2;
+      bits_per_sample = 16;
+      break;
+    case 5: // NPY_INT (C int)
+      encoding = SOX_ENCODING_SIGN2;
+      bits_per_sample = 32;
+      break;
+    case 11: // NPY_FLOAT (float32)
+      encoding = SOX_ENCODING_FLOAT;
+      bits_per_sample = 32;
+      break;
+    default:
+      throw std::runtime_error("Unsupported dtype.");
+  }
+  return sox_encodinginfo_t{
+      /*encoding=*/encoding,
+      /*bits_per_sample=*/bits_per_sample,
+      /*compression=*/HUGE_VAL,
+      /*reverse_bytes=*/sox_option_default,
+      /*reverse_nibbles=*/sox_option_default,
+      /*reverse_bits=*/sox_option_default,
+      /*opposite_endian=*/sox_false};
+}
+
+/// Build the sox_encodinginfo_t used when saving audio.
+///
+/// The encoding and bit depth come from get_save_encoding(); the compression
+/// is the given value, or HUGE_VAL when none was provided.
+sox_encodinginfo_t get_encodinginfo_for_save(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<double> compression,
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample) {
+  const std::tuple<sox_encoding_t, unsigned> resolved =
+      get_save_encoding(format, dtype, encoding, bits_per_sample);
+  const sox_encoding_t sox_encoding = std::get<0>(resolved);
+  const unsigned bits = std::get<1>(resolved);
+  const double level = compression.value_or(HUGE_VAL);
+  return sox_encodinginfo_t{
+      /*encoding=*/sox_encoding,
+      /*bits_per_sample=*/bits,
+      /*compression=*/level,
+      /*reverse_bytes=*/sox_option_default,
+      /*reverse_nibbles=*/sox_option_default,
+      /*reverse_bits=*/sox_option_default,
+      /*opposite_endian=*/sox_false};
+}
+
+}  // namespace sox_utils
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/utils.h
+++ b/audio/paddleaudio/src/pybind/sox/utils.h
@ -0,0 +1,114 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h with modification.
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <sox.h>
+#include "paddleaudio/src/optional/optional.hpp"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace sox_utils {
+
+auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
+
+void set_seed(const int64_t seed);
+
+void set_verbosity(const int64_t verbosity);
+
+void set_use_threads(const bool use_threads);
+
+void set_buffer_size(const int64_t buffer_size);
+
+int64_t get_buffer_size();
+
+std::vector<std::vector<std::string>> list_effects();
+
+std::vector<std::string> list_read_formats();
+
+std::vector<std::string> list_write_formats();
+
+////////////////////////////////////////////////////////////////////////////////
+// Utilities for sox_io / sox_effects implementations
+////////////////////////////////////////////////////////////////////////////////
+
+const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
+    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
+
+/// Helper class that automatically closes a `sox_format_t*`.
+///
+/// The wrapper can be neither copied nor moved, so exactly one object is
+/// responsible for closing a given handle.
+struct SoxFormat {
+  /// Wrap `fd`; it is closed by close() or by the destructor.
+  explicit SoxFormat(sox_format_t* fd) noexcept;
+  SoxFormat(const SoxFormat& other) = delete;
+  SoxFormat(SoxFormat&& other) = delete;
+  SoxFormat& operator=(const SoxFormat& other) = delete;
+  SoxFormat& operator=(SoxFormat&& other) = delete;
+  ~SoxFormat();
+  /// Member access on the wrapped handle.
+  sox_format_t* operator->() const noexcept;
+  /// Implicit conversion to the raw handle.
+  operator sox_format_t*() const noexcept;
+
+  /// Close the wrapped handle.
+  void close();
+
+ private:
+  // Wrapped libsox handle.
+  sox_format_t* fd_;
+};
+
+///
+/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
+void validate_input_tensor(const py::array);
+
+void validate_input_file(const SoxFormat& sf, const std::string& path);
+
+void validate_input_memfile(const SoxFormat &sf);
+///
+/// Get target dtype for the given encoding and precision.
+py::dtype get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision);
+
+///
+/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
+/// NOTE: This function might modify the values in the input buffer to
+/// reduce the number of memory copy.
+/// @param buffer Pointer to buffer that contains audio data.
+/// @param num_samples The number of samples to read.
+/// @param num_channels The number of channels. Used to reshape the resulting
+/// Tensor.
+/// @param dtype Target dtype. Determines the output dtype and value range in
+/// conjunction with normalization.
+/// @param normalize Perform normalization. Only effective when dtype is not
+/// kFloat32. When effective, the output tensor is kFloat32 type and value range
+/// is [-1.0, 1.0]
+/// @param channels_first When True, output Tensor has shape of [num_channels,
+/// num_frames].
+py::array convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const py::dtype dtype,
+    const bool normalize,
+    const bool channels_first);
+
+/// Extract extension from file path
+const std::string get_filetype(const std::string path);
+
+/// Get sox_signalinfo_t for passing a py::array object.
+sox_signalinfo_t get_signalinfo(
+    const py::array* waveform,
+    const int64_t sample_rate,
+    const std::string filetype,
+    const bool channels_first);
+
+/// Get sox_encodinginfo_t for Tensor I/O
+sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
+
+/// Get sox_encodinginfo_t for saving to file/file object
+sox_encodinginfo_t get_encodinginfo_for_save(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<double> compression,
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample);
+
+}  // namespace sox_utils
+}  // namespace paddleaudio
--- a/audio/paddleaudio/src/utils.cpp
+++ b/audio/paddleaudio/src/utils.cpp
@ -0,0 +1,35 @@
+// this is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/utils.cpp with modification.
+
+namespace paddleaudio {
+
+namespace {
+
+// Reports whether INCLUDE_SOX was defined when this file was compiled.
+bool is_sox_available() {
+#if defined(INCLUDE_SOX)
+    const bool compiled_in = true;
+#else
+    const bool compiled_in = false;
+#endif
+    return compiled_in;
+}
+
+// Reports whether INCLUDE_KALDI was defined when this file was compiled.
+bool is_kaldi_available() {
+#if defined(INCLUDE_KALDI)
+    const bool compiled_in = true;
+#else
+    const bool compiled_in = false;
+#endif
+    return compiled_in;
+}
+
+// Reports whether paddleaudio was compiled with ffmpeg (USE_FFMPEG defined).
+// This says nothing about whether ffmpeg is available at run time.
+bool is_ffmpeg_available() {
+#if defined(USE_FFMPEG)
+    const bool compiled_in = true;
+#else
+    const bool compiled_in = false;
+#endif
+    return compiled_in;
+}
+
+}  // namespace
+
+}  // namespace paddleaudio
--- a/audio/paddleaudio/third_party/.gitignore
+++ b/audio/paddleaudio/third_party/.gitignore
@ -0,0 +1,2 @@
+archives/
+install/
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@ -0,0 +1,15 @@
+# Build the bundled third-party C++ code with hidden symbol visibility.
+# -fvisibility=hidden is a GCC/Clang option that MSVC does not understand, so
+# it is only added for non-MSVC toolchains.
+if (NOT MSVC)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
+endif()
+
+################################################################################
+# sox
+################################################################################
+if (BUILD_SOX)
+  add_subdirectory(sox)
+endif()
+
+################################################################################
+# kaldi
+################################################################################
+if (BUILD_KALDI)
+  add_subdirectory(kaldi)
+endif()
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
@ -0,0 +1,111 @@
+# See thirdparty/kaldi/base/kaldi-types.h:
+# kaldi is compiled without OpenFst.
+add_definitions("-DCOMPILE_WITHOUT_OPENFST")
+
+# The kaldi sources are not stored in this directory. When the `base`
+# directory is missing, copy the needed modules from the speechx tree.
+# Note that the destination is this source directory, not the build tree.
+if ((NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base))
+    file(COPY ../../../../speechx/speechx/kaldi/base DESTINATION ${CMAKE_CURRENT_LIST_DIR})
+    file(COPY ../../../../speechx/speechx/kaldi/feat DESTINATION ${CMAKE_CURRENT_LIST_DIR})
+    file(COPY ../../../../speechx/speechx/kaldi/matrix DESTINATION ${CMAKE_CURRENT_LIST_DIR})
+    file(COPY ../../../../speechx/speechx/kaldi/util DESTINATION ${CMAKE_CURRENT_LIST_DIR})
+endif()
+
+# kaldi-base
+add_library(kaldi-base STATIC
+  base/io-funcs.cc
+  base/kaldi-error.cc
+  base/kaldi-math.cc
+  base/kaldi-utils.cc
+  base/timer.cc
+)
+target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# kaldi-matrix
+add_library(kaldi-matrix STATIC
+  matrix/compressed-matrix.cc
+  matrix/matrix-functions.cc
+  matrix/kaldi-matrix.cc
+  matrix/kaldi-vector.cc
+  matrix/optimization.cc
+  matrix/packed-matrix.cc
+  matrix/qr.cc
+  matrix/sparse-matrix.cc
+  matrix/sp-matrix.cc
+  matrix/srfft.cc
+  matrix/tp-matrix.cc
+)
+target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+# The OpenBLAS dependency is named `libopenblas` everywhere except under MSVC,
+# where it is `openblas`.
+if (NOT MSVC)
+    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
+else()
+    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
+endif()
+
+# kaldi-util
+add_library(kaldi-util STATIC
+  util/kaldi-holder.cc
+  util/kaldi-io.cc
+  util/kaldi-semaphore.cc
+  util/kaldi-table.cc
+  util/kaldi-thread.cc
+  util/parse-options.cc
+  util/simple-io-funcs.cc
+  util/simple-options.cc
+  util/text-utils.cc
+)
+target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
+
+# kaldi-feat-common: feature code shared by the mfcc and fbank libraries below
+add_library(kaldi-feat-common STATIC
+  feat/cmvn.cc
+  feat/feature-functions.cc
+  feat/feature-window.cc
+  feat/mel-computations.cc
+  feat/pitch-functions.cc
+  feat/resample.cc
+  feat/signal.cc
+  feat/wave-reader.cc
+)
+target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
+
+
+# kaldi-mfcc
+add_library(kaldi-mfcc STATIC
+  feat/feature-mfcc.cc
+)
+target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
+
+
+# kaldi-fbank
+add_library(kaldi-fbank STATIC
+  feat/feature-fbank.cc
+)
+target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
+
+
+# Paths of the static archives built above. They are referenced by file name
+# (lib<name>.a in the current binary directory) rather than by target, so the
+# Linux branch below can place them between linker group/whole-archive flags.
+set(KALDI_LIBRARIES
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
+  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
+)
+
+# libkaldi: a single interface target bundling all kaldi libraries.
+# add_dependencies only orders the build; the link items are added below.
+add_library(libkaldi INTERFACE)
+add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
+target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Per-platform link line:
+#  - APPLE: the archives plus libopenblas and gfortran runtime libraries taken
+#    from GFORTRAN_LIBRARIES_DIR (NOTE(review): that variable is not set in
+#    this file -- confirm where it is defined).
+#  - MSVC: link the CMake targets directly.
+#  - otherwise: the archives, libopenblas.a and gfortran wrapped in
+#    --start-group/--whole-archive.
+if (APPLE)
+    target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib)
+elseif (MSVC)
+    target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas)
+else()
+    target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group)
+endif()
+
+# Consumers of libkaldi also compile with COMPILE_WITHOUT_OPENFST.
+target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
--- a/audio/paddleaudio/third_party/patches/config.guess
+++ b/audio/paddleaudio/third_party/patches/config.guess
--- a/audio/paddleaudio/third_party/patches/config.sub
+++ b/audio/paddleaudio/third_party/patches/config.sub
--- a/audio/paddleaudio/third_party/patches/libmad.patch
+++ b/audio/paddleaudio/third_party/patches/libmad.patch
@ -0,0 +1,86 @@
+See the following for the origin of this patch:
+http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
+http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
+--- src/libmad/configure	2004-02-05 09:34:07.000000000 +0000
+++ src/libmad/configure.new	2020-06-30 21:10:28.528018931 +0000
+@@ -19083,71 +19083,7 @@
+ 
+ if test "$GCC" = yes
+ then
+-    if test -z "$arch"
+-    then
+-	case "$host" in
+-	    i386-*)           ;;
+-	    i?86-*)           arch="-march=i486" ;;
+-	    arm*-empeg-*)     arch="-march=armv4 -mtune=strongarm1100" ;;
+-	    armv4*-*)         arch="-march=armv4 -mtune=strongarm" ;;
+-	    powerpc-*)        ;;
+-	    mips*-agenda-*)   arch="-mcpu=vr4100" ;;
+-	    mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
+-	esac
+-    fi
+-
+-    case "$optimize" in
+-	-O|"-O "*)
+-	    optimize="-O"
+-	    optimize="$optimize -fforce-mem"
+-	    optimize="$optimize -fforce-addr"
+-	    : #x optimize="$optimize -finline-functions"
+-	    : #- optimize="$optimize -fstrength-reduce"
+-	    optimize="$optimize -fthread-jumps"
+-	    optimize="$optimize -fcse-follow-jumps"
+-	    optimize="$optimize -fcse-skip-blocks"
+-	    : #x optimize="$optimize -frerun-cse-after-loop"
+-	    : #x optimize="$optimize -frerun-loop-opt"
+-	    : #x optimize="$optimize -fgcse"
+-	    optimize="$optimize -fexpensive-optimizations"
+-	    optimize="$optimize -fregmove"
+-	    : #* optimize="$optimize -fdelayed-branch"
+-	    : #x optimize="$optimize -fschedule-insns"
+-	    optimize="$optimize -fschedule-insns2"
+-	    : #? optimize="$optimize -ffunction-sections"
+-	    : #? optimize="$optimize -fcaller-saves"
+-	    : #> optimize="$optimize -funroll-loops"
+-	    : #> optimize="$optimize -funroll-all-loops"
+-	    : #x optimize="$optimize -fmove-all-movables"
+-	    : #x optimize="$optimize -freduce-all-givs"
+-	    : #? optimize="$optimize -fstrict-aliasing"
+-	    : #* optimize="$optimize -fstructure-noalias"
+-
+-	    case "$host" in
+-		arm*-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    ;;
+-		mips*-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    optimize="$optimize -finline-functions"
+-		    ;;
+-		i?86-*)
+-		    optimize="$optimize -fstrength-reduce"
+-		    ;;
+-		powerpc-apple-*)
+-		    # this triggers an internal compiler error with gcc2
+-		    : #optimize="$optimize -fstrength-reduce"
+-
+-		    # this is really only beneficial with gcc3
+-		    : #optimize="$optimize -finline-functions"
+-		    ;;
+-		*)
+-		    # this sometimes provokes bugs in gcc 2.95.2
+-		    : #optimize="$optimize -fstrength-reduce"
+-		    ;;
+-	    esac
+-	    ;;
+-    esac
+    optimize="-O2"
+ fi
+ 
+ case "$host" in
+@@ -21497,6 +21433,7 @@
+ then
+     case "$host" in
+ 	i?86-*)     FPM="INTEL"  ;;
+	x86_64*)    FPM="64BIT"  ;;
+ 	arm*-*)     FPM="ARM"    ;;
+ 	mips*-*)    FPM="MIPS"   ;;
+ 	sparc*-*)   FPM="SPARC"  ;;
--- a/audio/paddleaudio/third_party/patches/sox.patch
+++ b/audio/paddleaudio/third_party/patches/sox.patch
@ -0,0 +1,16 @@
+See https://github.com/pytorch/audio/pull/1297
+diff -ru sox/src/formats.c sox/src/formats.c
+--- sox/src/formats.c	2014-10-26 19:55:50.000000000 -0700
+++ sox/src/formats.c	2021-02-22 16:01:02.833144070 -0800
+@@ -333,6 +333,10 @@
+   assert(ft);
+   if (!ft->fp)
+     return sox_false;
+-  fstat(fileno((FILE*)ft->fp), &st);
+  int fd = fileno((FILE*)ft->fp);
+  if (fd < 0)
+    return sox_false;
+  if (fstat(fd, &st) < 0)
+    return sox_false;
+   return ((st.st_mode & S_IFMT) == S_IFREG);
+ }
--- a/audio/paddleaudio/third_party/sox/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/sox/CMakeLists.txt
@ -0,0 +1,254 @@
+find_package(PkgConfig REQUIRED)
+
+include(ExternalProject)
+
+# Shared locations used by every bundled third-party build in this file:
+#   INSTALL_DIR - common --prefix handed to each configure script
+#   ARCHIVE_DIR - download directory for the source tarballs
+#   patch_dir   - local patches plus the config.guess/config.sub files that
+#                 are copied into each unpacked source tree
+set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
+set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
+set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)
+# configure flags shared by all sub-builds: static-only, PIC, no docs/examples.
+set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
+
+# To pass custom environment variables to the ExternalProject_Add command,
+# we need to run `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
+# https://stackoverflow.com/a/62437353
+# We construct the custom environment variables here; they point pkg-config,
+# the linker and the compiler at INSTALL_DIR so that later sub-builds can find
+# the libraries installed by earlier ones.
+set(envs
+  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
+  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
+  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
+)
+
+if (BUILD_MAD)
+  ExternalProject_Add(mad
+    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+    DOWNLOAD_DIR ${ARCHIVE_DIR}
+    URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
+    URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
+    PATCH_COMMAND patch < ${patch_dir}/libmad.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
+    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
+    DOWNLOAD_NO_PROGRESS ON
+    LOG_DOWNLOAD ON
+    LOG_UPDATE ON
+    LOG_CONFIGURE ON
+    LOG_BUILD ON
+    LOG_INSTALL ON
+    LOG_MERGED_STDOUTERR ON
+    LOG_OUTPUT_ON_FAILURE ON
+  )
+endif (BUILD_MAD)
+
+ExternalProject_Add(amr
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
+  URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(lame
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
+  URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(ogg
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
+  URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(flac
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
+  URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(vorbis
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
+  URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(opus
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ogg
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
+  URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+ExternalProject_Add(opusfile
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS opus
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
+  URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
+  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+# By default libsox is compiled against GNU OpenMP, which conflicts with the
+# OpenMP runtime that PyTorch uses, so OpenMP is disabled for the bundled build.
+# See https://github.com/pytorch/audio/pull/1026
+# NOTE(review): this rationale refers to PyTorch; presumably the same
+# concern applies to the OpenMP runtime used by Paddle - confirm.
+# TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
+set(SOX_OPTIONS
+  --disable-openmp
+  # codecs provided by the external projects defined above
+  --with-amrnb
+  --with-amrwb
+  --with-flac
+  --with-lame
+  --with-oggvorbis
+  --with-opus
+  # optional sox features that are switched off for this build
+  --without-alsa
+  --without-ao
+  --without-coreaudio
+  --without-oss
+  --without-id3tag
+  --without-ladspa
+  --without-magic
+  --without-png
+  --without-pulseaudio
+  --without-sndfile
+  --without-sndio
+  --without-sunaudio
+  --without-waveaudio
+  --without-wavpack
+  --without-twolame
+  )
+
+# Static archives exposed to consumers through the `libsox` interface target.
+# libsox.a comes first and libogg.a last; presumably the order matters for
+# static link resolution, so keep dependents ahead of their dependencies
+# when editing this list.
+set(SOX_LIBRARIES
+  ${INSTALL_DIR}/lib/libsox.a
+  ${INSTALL_DIR}/lib/libopencore-amrnb.a
+  ${INSTALL_DIR}/lib/libopencore-amrwb.a
+  ${INSTALL_DIR}/lib/libmp3lame.a
+  ${INSTALL_DIR}/lib/libFLAC.a
+  ${INSTALL_DIR}/lib/libopusfile.a
+  ${INSTALL_DIR}/lib/libopus.a
+  ${INSTALL_DIR}/lib/libvorbisenc.a
+  ${INSTALL_DIR}/lib/libvorbisfile.a
+  ${INSTALL_DIR}/lib/libvorbis.a
+  ${INSTALL_DIR}/lib/libogg.a
+  )
+
+# External projects that must be built before sox is configured.
+# `opus` is not listed explicitly: it is reached through opusfile's DEPENDS.
+set(sox_depends
+  ogg flac vorbis opusfile lame amr
+  )
+
+if (BUILD_MAD)
+  list(
+    APPEND
+    SOX_OPTIONS
+    --with-mad
+    )
+  list(
+    APPEND
+    SOX_LIBRARIES
+    ${INSTALL_DIR}/lib/libmad.a
+    )
+  list(
+    APPEND
+    sox_depends
+    mad
+    )
+else ()
+  list(
+    APPEND
+    SOX_OPTIONS
+    --without-mad
+    )  
+endif (BUILD_MAD)
+
+ExternalProject_Add(sox
+  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${sox_depends}
+  DOWNLOAD_DIR ${ARCHIVE_DIR}
+  URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
+  URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
+  PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
+  BUILD_BYPRODUCTS ${SOX_LIBRARIES}
+  DOWNLOAD_NO_PROGRESS ON
+  LOG_DOWNLOAD ON
+  LOG_UPDATE ON
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+  LOG_MERGED_STDOUTERR ON
+  LOG_OUTPUT_ON_FAILURE ON
+)
+
+# Interface target through which the rest of the build consumes the bundled
+# sox: linking against `libsox` adds the install include directory and every
+# archive in SOX_LIBRARIES. add_dependencies ties the target to the `sox`
+# external project so that it is built first.
+add_library(libsox INTERFACE)
+add_dependencies(libsox sox)
+target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
+target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})
--- a/audio/paddleaudio/utils/init.py
+++ b/audio/paddleaudio/utils/init.py
@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .download import decompress
+from .download import download_and_decompress
+from .download import load_state_dict_from_url
+from .env import DATA_HOME
+from .env import MODEL_HOME
+from .env import PPAUDIO_HOME
+from .env import USER_HOME
+from .error import ParameterError
+from .log import Logger
+from .log import logger
+from .numeric import depth_convert
+from .numeric import pcm16to32
+from .time import seconds_to_hms
+from .time import Timer
--- a/audio/paddleaudio/utils/download.py
+++ b/audio/paddleaudio/utils/download.py
@ -0,0 +1,64 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+
+from .log import logger
+
+download.logger = logger
+
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
+
+
+def decompress(file: str):
+    """
+    Extracts all files from a compressed file.
+    """
+    assert os.path.isfile(file), "File: {} not exists.".format(file)
+    download._decompress(file)
+
+
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            decompress: bool=True):
+    """
+    Download archieves and decompress to specific path.
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
+        download.get_path_from_url(
+            archive['url'], path, archive['md5'], decompress=decompress)
+
+
+def load_state_dict_from_url(url: str, path: str, md5: str=None):
+    """
+    Download and load a state dict from url
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    download.get_path_from_url(url, path, md5)
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
--- a/audio/paddleaudio/utils/env.py
+++ b/audio/paddleaudio/utils/env.py
@ -0,0 +1,60 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
+├                            default value through the PPAUDIO_HOME environment variable.
+├─ MODEL_HOME    -->  Store model files.
+└─ DATA_HOME     -->  Store automatically downloaded datasets.
+'''
+import os
+
+__all__ = [
+    'USER_HOME',
+    'PPAUDIO_HOME',
+    'MODEL_HOME',
+    'DATA_HOME',
+]
+
+
+def _get_user_home():
+    """Return the current user's home directory, as expanded from ``~``."""
+    return os.path.expanduser('~')
+
+
+def _get_ppaudio_home():
+    if 'PPAUDIO_HOME' in os.environ:
+        home_path = os.environ['PPAUDIO_HOME']
+        if os.path.exists(home_path):
+            if os.path.isdir(home_path):
+                return home_path
+            else:
+                raise RuntimeError(
+                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
+                    format(home_path))
+        else:
+            return home_path
+    return os.path.join(_get_user_home(), '.paddleaudio')
+
+
+def _get_sub_home(directory):
+    home = os.path.join(_get_ppaudio_home(), directory)
+    if not os.path.exists(home):
+        os.makedirs(home)
+    return home
+
+
+# Resolved once, when this module is imported. Computing MODEL_HOME and
+# DATA_HOME creates those directories as a side effect (see _get_sub_home).
+USER_HOME = _get_user_home()
+PPAUDIO_HOME = _get_ppaudio_home()
+MODEL_HOME = _get_sub_home('models')
+DATA_HOME = _get_sub_home('datasets')
--- a/audio/paddleaudio/utils/error.py
+++ b/audio/paddleaudio/utils/error.py
@ -11,3 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+__all__ = ['ParameterError']
+
+
+class ParameterError(Exception):
+    """Exception class for Parameter checking"""
+    pass
--- a/audio/paddleaudio/utils/log.py
+++ b/audio/paddleaudio/utils/log.py
@ -0,0 +1,139 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import functools
+import logging
+import threading
+import time
+
+import colorlog
+
+__all__ = [
+    'Logger',
+    'logger',
+]
+
+# Level table consumed by Logger: maps a level name to its numeric value and
+# to the colorlog color used when printing it. TRAIN (21) and EVAL (22) are
+# additional levels sitting between INFO (20) and WARNING (30).
+log_config = {
+    'DEBUG': {
+        'level': 10,
+        'color': 'purple'
+    },
+    'INFO': {
+        'level': 20,
+        'color': 'green'
+    },
+    'TRAIN': {
+        'level': 21,
+        'color': 'cyan'
+    },
+    'EVAL': {
+        'level': 22,
+        'color': 'blue'
+    },
+    'WARNING': {
+        'level': 30,
+        'color': 'yellow'
+    },
+    'ERROR': {
+        'level': 40,
+        'color': 'red'
+    },
+    'CRITICAL': {
+        'level': 50,
+        'color': 'bold_red'
+    }
+}
+
+
+class Logger(object):
+    '''
+    Deafult logger in PaddleAudio
+    Args:
+        name(str) : Logger name, default is 'PaddleAudio'
+    '''
+
+    def __init__(self, name: str=None):
+        name = 'PaddleAudio' if not name else name
+        self.logger = logging.getLogger(name)
+
+        for key, conf in log_config.items():
+            logging.addLevelName(conf['level'], key)
+            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
+            self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                           conf['level'])
+
+        self.format = colorlog.ColoredFormatter(
+            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
+            log_colors={key: conf['color']
+                        for key, conf in log_config.items()})
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logLevel = 'DEBUG'
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.propagate = False
+        self._is_enable = True
+
+    def disable(self):
+        self._is_enable = False
+
+    def enable(self):
+        self._is_enable = True
+
+    @property
+    def is_enable(self) -> bool:
+        return self._is_enable
+
+    def __call__(self, log_level: str, msg: str):
+        if not self.is_enable:
+            return
+
+        self.logger.log(log_level, msg)
+
+    @contextlib.contextmanager
+    def use_terminator(self, terminator: str):
+        old_terminator = self.handler.terminator
+        self.handler.terminator = terminator
+        yield
+        self.handler.terminator = old_terminator
+
+    @contextlib.contextmanager
+    def processing(self, msg: str, interval: float=0.1):
+        '''
+        Continuously print a progress bar with rotating special effects.
+        Args:
+            msg(str): Message to be printed.
+            interval(float): Rotation interval. Default to 0.1.
+        '''
+        end = False
+
+        def _printer():
+            index = 0
+            flags = ['\\', '|', '/', '-']
+            while not end:
+                flag = flags[index % len(flags)]
+                with self.use_terminator('\r'):
+                    self.info('{}: {}'.format(msg, flag))
+                time.sleep(interval)
+                index += 1
+
+        t = threading.Thread(target=_printer)
+        t.start()
+        yield
+        end = True
+
+
+logger = Logger()
--- a/audio/paddleaudio/utils/numeric.py
+++ b/audio/paddleaudio/utils/numeric.py
@ -0,0 +1,107 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Union
+
+import numpy as np
+
+__all__ = ["pcm16to32", "depth_convert"]
+
+
+def pcm16to32(audio: np.ndarray) -> np.ndarray:
+    """pcm int16 to float32
+
+    Args:
+        audio (np.ndarray): Waveform with dtype of int16.
+
+    Returns:
+        np.ndarray: Waveform with dtype of float32.
+    """
+    if audio.dtype == np.int16:
+        audio = audio.astype("float32")
+        bits = np.iinfo(np.int16).bits
+        audio = audio / (2**(bits - 1))
+    return audio
+
+
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Data type casting in a safe way, i.e., prevent overflow or underflow.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+    if 'float' in str(y.dtype):
+        return np.clip(y, np.finfo(dtype).min,
+                       np.finfo(dtype).max).astype(dtype)
+    else:
+        return np.clip(y, np.iinfo(dtype).min,
+                       np.iinfo(dtype).max).astype(dtype)
+
+
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert audio array to target dtype safely. 
+    This function convert audio waveform to a target dtype, with addition steps of
+    preventing overflow/underflow and preserving audio range.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    if y.dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    if dtype == 'float64' and y.dtype == 'float32':
+        return _safe_cast(y, dtype)
+    if dtype == 'float32' and y.dtype == 'float64':
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+            y = y.astype(dtype)
+        else:
+            if dtype == 'int16' and y.dtype == 'int8':
+                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+                y = y.astype('float32') * factor
+                y = y.astype('int16')
+
+            else:  # dtype == 'int8' and y.dtype=='int16':
+                y = y.astype('int32') * np.iinfo('int8').max / \
+                    np.iinfo('int16').max
+                y = y.astype('int8')
+
+    if dtype in ['float32', 'float64']:
+        org_dtype = y.dtype
+        y = y.astype(dtype) / np.iinfo(org_dtype).max
+    return y
--- a/audio/paddleaudio/utils/sox_utils.py
+++ b/audio/paddleaudio/utils/sox_utils.py
@ -0,0 +1,103 @@
+from typing import Dict
+from typing import List
+
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+
+
+@_mod_utils.requires_sox()
+def set_seed(seed: int):
+    """Set libsox's PRNG
+
+    Args:
+        seed (int): seed value. valid range is int32.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    paddleaudio._paddleaudio.sox_utils_set_seed(seed)
+
+
+@_mod_utils.requires_sox()
+def set_verbosity(verbosity: int):
+    """Set libsox's verbosity
+
+    Args:
+        verbosity (int): Set verbosity level of libsox.
+
+            * ``1`` failure messages
+            * ``2`` warnings
+            * ``3`` details of processing
+            * ``4``-``6`` increasing levels of debug messages
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    paddleaudio._paddleaudio.sox_utils_set_verbosity(verbosity)
+
+
+@_mod_utils.requires_sox()
+def set_buffer_size(buffer_size: int):
+    """Set buffer size for sox effect chain
+
+    Args:
+        buffer_size (int): Set the size in bytes of the buffers used for processing audio.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    paddleaudio._paddleaudio.sox_utils_set_buffer_size(buffer_size)
+
+
+@_mod_utils.requires_sox()
+def set_use_threads(use_threads: bool):
+    """Set multithread option for sox effect chain
+
+    Args:
+        use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
+            To use multithreading, the underlying ``libsox`` has to be compiled with OpenMP support.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    # NOTE(review): the bundled libsox is configured with --disable-openmp
+    # (third_party/sox/CMakeLists.txt), so this presumably has no effect with
+    # that build - confirm.
+    paddleaudio._paddleaudio.sox_utils_set_use_threads(use_threads)
+
+
+@_mod_utils.requires_sox()
+def list_effects() -> Dict[str, str]:
+    """List the available sox effect names
+
+    Returns:
+        Dict[str, str]: Mapping from ``effect name`` to ``usage``
+    """
+    # dict() copies whatever the binding returns into a plain Python dict.
+    return dict(paddleaudio._paddleaudio.sox_utils_list_effects())
+
+
+@_mod_utils.requires_sox()
+def list_read_formats() -> List[str]:
+    """List the supported audio formats for read
+
+    Returns:
+        List[str]: List of supported audio formats
+    """
+    return paddleaudio._paddleaudio.sox_utils_list_read_formats()
+
+
+@_mod_utils.requires_sox()
+def list_write_formats() -> List[str]:
+    """List the supported audio formats for write
+
+    Returns:
+        List[str]: List of supported audio formats
+    """
+    return paddleaudio._paddleaudio.sox_utils_list_write_formats()
+
+
+@_mod_utils.requires_sox()
+def get_buffer_size() -> int:
+    """Get buffer size for sox effect chain
+
+    Returns:
+        int: size in bytes of buffers used for processing audio.
+    """
+    return paddleaudio._paddleaudio.sox_utils_get_buffer_size()
--- a/audio/paddleaudio/utils/tensor_utils.py
+++ b/audio/paddleaudio/utils/tensor_utils.py
@ -0,0 +1,192 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility functions for Transformer."""
+from typing import List
+from typing import Tuple
+
+import paddle
+
+from .log import Logger
+
+__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]
+
+logger = Logger(__name__)
+
+
+def has_tensor(val):
+    if isinstance(val, (list, tuple)):
+        for item in val:
+            if has_tensor(item):
+                return True
+    elif isinstance(val, dict):
+        for k, v in val.items():
+            print(k)
+            if has_tensor(v):
+                return True
+    else:
+        return paddle.is_tensor(val)
+
+
+def pad_sequence(sequences: List[paddle.Tensor],
+                 batch_first: bool=False,
+                 padding_value: float=0.0) -> paddle.Tensor:
+    r"""Pad a list of variable length Tensors with ``padding_value``
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension,
+    and pads them to equal length. For example, if the input is a list of
+    sequences with size ``L x *``, the output is of size ``T x B x *`` if
+    batch_first is False, and ``B x T x *`` otherwise.
+
+    `B` is batch size. It is equal to the number of elements in ``sequences``.
+    `T` is length of the longest sequence.
+    `L` is length of the sequence.
+    `*` is any number of trailing dimensions, including none.
+
+    Example:
+        >>> a = paddle.ones([25, 300])
+        >>> b = paddle.ones([22, 300])
+        >>> c = paddle.ones([15, 300])
+        >>> pad_sequence([a, b, c]).shape
+        [25, 3, 300]
+
+    Note:
+        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are same.
+
+    Args:
+        sequences (list[Tensor]): list of variable length sequences.
+        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+            ``T x B x *`` otherwise
+        padding_value (float, optional): value for padded elements. Default: 0.
+
+    Returns:
+        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+        Tensor of size ``B x T x *`` otherwise
+    """
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are same and fetching those from sequences[0]
+    max_size = paddle.shape(sequences[0])
+    # (TODO Hui Zhang): slice does not support `end==start`
+    # trailing_dims = max_size[1:]
+    trailing_dims = tuple(
+        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
+    max_len = max([s.shape[0] for s in sequences])
+    if batch_first:
+        out_dims = (len(sequences), max_len) + trailing_dims
+    else:
+        out_dims = (max_len, len(sequences)) + trailing_dims
+    # Start from a tensor filled with the padding value, then copy each
+    # sequence into its slot.
+    out_tensor = paddle.full(out_dims, padding_value, sequences[0].dtype)
+    for i, tensor in enumerate(sequences):
+        length = tensor.shape[0]
+        # use index notation to prevent duplicate references to the tensor
+        if batch_first:
+            # TODO (Hui Zhang): set_value op does not support `end==start`
+            # TODO (Hui Zhang): set_value op not support int16
+            # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
+            # out_tensor[i, :length, ...] = tensor
+            if length != 0:
+                out_tensor[i, :length] = tensor
+            else:
+                # NOTE(review): for an empty sequence this indexes position 0
+                # instead of using an empty slice, presumably to work around
+                # the set_value limitation noted above - confirm.
+                out_tensor[i, length] = tensor
+        else:
+            # TODO (Hui Zhang): set_value op does not support `end==start`
+            # out_tensor[:length, i, ...] = tensor
+            if length != 0:
+                out_tensor[:length, i] = tensor
+            else:
+                out_tensor[length, i] = tensor
+
+    return out_tensor
+
+
+def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
+                ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    """Add <sos> and <eos> labels.
+    Args:
+        ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax)
+        sos (int): index of <sos>
+        eos (int): index of <eeos>
+        ignore_id (int): index of padding
+    Returns:
+        ys_in (paddle.Tensor) : (B, Lmax + 1)
+        ys_out (paddle.Tensor) : (B, Lmax + 1)
+    Examples:
+        >>> sos_id = 10
+        >>> eos_id = 11
+        >>> ignore_id = -1
+        >>> ys_pad
+        tensor([[ 1,  2,  3,  4,  5],
+                [ 4,  5,  6, -1, -1],
+                [ 7,  8,  9, -1, -1]], dtype=paddle.int32)
+        >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id)
+        >>> ys_in
+        tensor([[10,  1,  2,  3,  4,  5],
+                [10,  4,  5,  6, 11, 11],
+                [10,  7,  8,  9, 11, 11]])
+        >>> ys_out
+        tensor([[ 1,  2,  3,  4,  5, 11],
+                [ 4,  5,  6, 11, -1, -1],
+                [ 7,  8,  9, 11, -1, -1]])
+    """
+    # TODO(Hui Zhang): using comment code,
+    #_sos = paddle.to_tensor(
+    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
+    #_eos = paddle.to_tensor(
+    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
+    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
+    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
+    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
+    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
+    B = ys_pad.shape[0]
+    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
+    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
+    ys_in = paddle.cat([_sos, ys_pad], dim=1)
+    mask_pad = (ys_in == ignore_id)
+    ys_in = ys_in.masked_fill(mask_pad, eos)
+
+    ys_out = paddle.cat([ys_pad, _eos], dim=1)
+    ys_out = ys_out.masked_fill(mask_pad, eos)
+    mask_eos = (ys_out == ignore_id)
+    ys_out = ys_out.masked_fill(mask_eos, eos)
+    ys_out = ys_out.masked_fill(mask_pad, ignore_id)
+    return ys_in, ys_out
+
+
+def th_accuracy(pad_outputs: paddle.Tensor,
+                pad_targets: paddle.Tensor,
+                ignore_label: int) -> float:
+    """Calculate accuracy.
+    Args:
+        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
+        pad_targets (LongTensor): Target label tensors (B, Lmax, D).
+        ignore_label (int): Ignore label id.
+    Returns:
+        float: Accuracy value (0.0 - 1.0).
+    """
+    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
+                                pad_outputs.shape[1]).argmax(2)
+    mask = pad_targets != ignore_label
+    #TODO(Hui Zhang): sum not support bool type
+    # numerator = paddle.sum(
+    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
+    numerator = (
+        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
+    numerator = paddle.sum(numerator.type_as(pad_targets))
+    #TODO(Hui Zhang): sum not support bool type
+    # denominator = paddle.sum(mask)
+    denominator = paddle.sum(mask.type_as(pad_targets))
+    return float(numerator) / float(denominator)
--- a/audio/paddleaudio/utils/time.py
+++ b/audio/paddleaudio/utils/time.py
@ -0,0 +1,72 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import time
+
+__all__ = [
+    'Timer',
+    'seconds_to_hms',
+]
+
+
+class Timer(object):
+    '''Calculate running speed and estimated time of arrival (ETA).
+
+    `start()` must be called before `timing` or `eta` is read, because those
+    properties use the timestamps that `start()` records.
+    '''
+
+    def __init__(self, total_step: int):
+        self.total_step = total_step
+        self.last_start_step = 0
+        self.current_step = 0
+        self._is_running = True
+
+    def start(self):
+        '''Record the reference timestamps used by `timing` and `eta`.'''
+        self.last_time = time.time()
+        self.start_time = time.time()
+
+    def stop(self):
+        '''Mark the timer as finished and remember when it stopped.'''
+        self._is_running = False
+        self.end_time = time.time()
+
+    def count(self) -> int:
+        '''Advance the step counter by one, saturating at `total_step`.
+
+        Returns:
+            int: the step counter after the update.
+        '''
+        if not self.current_step >= self.total_step:
+            self.current_step += 1
+        return self.current_step
+
+    @property
+    def timing(self) -> float:
+        '''Steps per second since this property was last read.
+
+        Reading it has a side effect: the step and time reference points are
+        reset, so the next read covers a fresh interval.
+        '''
+        run_steps = self.current_step - self.last_start_step
+        self.last_start_step = self.current_step
+        time_used = time.time() - self.last_time
+        self.last_time = time.time()
+        return run_steps / time_used
+
+    @property
+    def is_running(self) -> bool:
+        '''False once `stop()` has been called.'''
+        return self._is_running
+
+    @property
+    def eta(self) -> str:
+        '''Time estimate formatted as hh:mm:ss.
+
+        Returns '00:00:00' after `stop()`, and '--:--:--' while no step has
+        been counted yet (there is nothing to extrapolate from).
+        '''
+        if not self.is_running:
+            return '00:00:00'
+        if self.current_step <= 0:
+            # Avoids a ZeroDivisionError before the first count().
+            return '--:--:--'
+        # NOTE(review): elapsed * total / current is the projected total run
+        # time, not the time left -- confirm whether
+        # (total_step - current_step) / current_step was intended.
+        scale = self.total_step / self.current_step
+        remaining_time = (time.time() - self.start_time) * scale
+        return seconds_to_hms(remaining_time)
+
+
+def seconds_to_hms(seconds: int) -> str:
+    '''Format a duration given in seconds as an hh:mm:ss string.
+
+    Each field is left-padded with zeros to at least two characters; the
+    seconds field is truncated to an integer.
+    '''
+    hours = math.floor(seconds / 3600)
+    leftover = seconds - hours * 3600
+    minutes = math.floor(leftover / 60)
+    secs = int(leftover - minutes * 60)
+    fields = (hours, minutes, secs)
+    return '{:0>2}:{:0>2}:{:0>2}'.format(*fields)
--- a/audio/setup.py
+++ b/audio/setup.py
@ -0,0 +1,293 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import inspect
+import io
+import os
+import platform
+import subprocess as sp
+import sys
+from pathlib import Path
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import distutils.command.clean
+from setuptools import Command
+from setuptools import find_packages
+from setuptools import setup
+from setuptools.command.develop import develop
+from setuptools.command.test import test
+
+from tools import setup_helpers
+
+ROOT_DIR = Path(__file__).parent.resolve()
+
+VERSION = '1.1.0'
+COMMITID = 'none'
+
+base = [
+    "kaldiio",
+    "librosa==0.8.1",
+    "scipy>=1.0.0",
+    "soundfile~=0.10",
+    "colorlog",
+    "pathos == 0.2.8",
+    "pybind11",
+    "parameterized",
+    "tqdm"
+]
+
+requirements = {
+    "install":
+    base,
+    "develop": [
+        "sox",
+        "soxbindings",
+        "pre-commit",
+    ],
+}
+
+def check_call(cmd: str, shell=False, executable=None):
+    """Run `cmd` and raise if it exits with a non-zero status.
+
+    Args:
+        cmd (str): Command line. It is split on whitespace when run directly
+            and handed over unsplit when run through a shell.
+        shell (bool): Run through /bin/bash when True.
+        executable: Program to execute instead of the first token; only used
+            when `shell` is False.
+    Raises:
+        subprocess.CalledProcessError: re-raised after the failure has been
+            reported on stderr.
+    """
+    try:
+        sp.check_call(
+            # A shell needs the whole command line as one string: given a
+            # list it would run only the first token and treat the rest as
+            # arguments to the shell itself.
+            cmd if shell else cmd.split(),
+            shell=shell,
+            executable="/bin/bash" if shell else executable)
+    except sp.CalledProcessError as e:
+        # check_call captures no output, so e.output is always None here;
+        # the exit status is the useful piece of information.
+        print(
+            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
+            e.returncode,
+            file=sys.stderr)
+        raise
+
+
+def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False):
+    """Run `cmd` and return its stdout, stripped and decoded as UTF-8.
+
+    A string command is split on whitespace; a list or tuple is used as the
+    argument vector. When the command exits with a non-zero status, the
+    failure is reported on stderr and whatever output it produced before
+    failing is returned. `shell` is accepted but not used by this function.
+    """
+    argv = cmd if isinstance(cmd, (list, tuple)) else cmd.split()
+    try:
+        raw = sp.check_output(argv)
+    except sp.CalledProcessError as err:
+        raw = err.output  # output generated before the error
+        print(
+            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
+            raw,
+            file=sys.stderr)
+    return raw.strip().decode('utf8')
+
+def _run_cmd(cmd):
+    """Run `cmd` in ROOT_DIR and return its stripped stdout, or None on failure.
+
+    Best effort: a missing executable, a non-zero exit status or output that
+    cannot be decoded all yield None instead of raising.
+
+    Args:
+        cmd: Argument vector passed to `subprocess.check_output`.
+    """
+    try:
+        # The module is imported as `sp`. The previous `subprocess.` spelling
+        # raised NameError, which the broad handler swallowed, so this
+        # function always returned None.
+        return sp.check_output(
+            cmd, cwd=ROOT_DIR, stderr=sp.DEVNULL).decode("ascii").strip()
+    except (OSError, sp.SubprocessError, UnicodeError):
+        return None
+
+@contextlib.contextmanager
+def pushd(new_dir):
+    """Context manager that temporarily changes the working directory.
+
+    Changes into `new_dir` on entry and back to the previous directory on
+    exit, printing each directory as it is entered. The previous directory
+    is restored even when the managed block raises.
+    """
+    old_dir = os.getcwd()
+    os.chdir(new_dir)
+    print(new_dir)
+    try:
+        yield
+    finally:
+        # Without the finally clause an exception in the with-body would
+        # leave the process in new_dir.
+        os.chdir(old_dir)
+        print(old_dir)
+
+def read(*names, **kwargs):
+    """Return the text of a file located relative to this script's directory.
+
+    Args:
+        *names: Path components joined onto the directory of this file.
+        **kwargs: Only `encoding` is consulted (defaults to "utf8").
+    """
+    script_dir = os.path.dirname(__file__)
+    target = os.path.join(script_dir, *names)
+    encoding = kwargs.get("encoding", "utf8")
+    with io.open(target, encoding=encoding) as fp:
+        return fp.read()
+
+def _remove(files: List[Path]):
+    """Delete every path in `files`.
+
+    Args:
+        files: Path-like objects exposing `unlink()`. The earlier `str`
+            annotation did not match how the argument is used.
+    """
+    for f in files:
+        f.unlink()
+
+################################# Install ##################################
+
+
+def _post_install(install_lib_dir):
+    """Post-install hook; currently a no-op that ignores `install_lib_dir`."""
+    pass
+
+class DevelopCommand(develop):
+    """`develop` command that runs the post-install hook once it is done."""
+
+    def run(self):
+        super().run()
+        # The hook must come after the stock develop step (per the original
+        # note: otherwise packages installed by shell will not be seen).
+        hook_args = (self.install_lib, )
+        self.execute(_post_install, hook_args, msg="Post Install...")
+
+
+class TestCommand(test):
+    """`test` command that hands the unit tests over to nose."""
+
+    def finalize_options(self):
+        test.finalize_options(self)
+        self.test_args = []
+        self.test_suite = True
+
+    def run_tests(self):
+        # Run nose ensuring that argv simulates running nosetests directly
+        import nose
+        nose.run_exit(argv=['nosetests', '-w', 'tests'])
+
+    def run_benchmark(self):
+        """Run every benchmark script under tests/benchmark with pytest."""
+        # Imported here because the module header never imports glob; the
+        # bare reference used to raise NameError.
+        import glob
+
+        for benchmark_item in glob.glob('tests/benchmark/*py'):
+            os.system(f'pytest {benchmark_item}')
+
+
+# cmd: python setup.py upload
+class UploadCommand(Command):
+    """Build a source distribution and upload it with twine."""
+    description = "Build and publish the package."
+    user_options = []
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        # Imported here because the module header never imports shutil; the
+        # bare reference raised NameError, which `except OSError` does not
+        # catch.
+        import shutil
+
+        try:
+            print("Removing previous dist/ ...")
+            shutil.rmtree(str(ROOT_DIR / "dist"))
+        except OSError:
+            # A missing dist/ directory is not an error.
+            pass
+        print("Building source distribution...")
+        sp.check_call([sys.executable, "setup.py", "sdist"])
+        print("Uploading package to PyPi...")
+        sp.check_call(["twine", "upload", "dist/*"])
+        sys.exit()
+
+
+################################# Version ##################################
+def _get_version(sha):
+    """Pick the version string for this build.
+
+    A non-empty BUILD_VERSION environment variable wins; otherwise VERSION
+    is used, suffixed with "+<first 7 chars of sha>" when a sha is known.
+    """
+    override = os.getenv("BUILD_VERSION")
+    if override:
+        return override
+    if sha is None:
+        return VERSION
+    return VERSION + "+" + sha[:7]
+
+
+def _make_version_file(version, sha):
+    """Append a `__version__` assignment to paddleaudio/__init__.py.
+
+    `sha` is accepted but does not affect what is written.
+    """
+    init_file = ROOT_DIR / "paddleaudio" / "__init__.py"
+    version_line = f"__version__ = '{version}'\n"
+    with open(init_file, "a") as f:
+        f.write(version_line)
+
+def _rm_version():
+    """Rewrite paddleaudio/__init__.py without any line mentioning `__version__`."""
+    init_file = ROOT_DIR / "paddleaudio" / "__init__.py"
+    with open(init_file, "r") as f:
+        kept = [line for line in f.readlines() if "__version__" not in line]
+    with open(init_file, "w") as f:
+        for line in kept:
+            f.write(line)
+
+
+################################# Setup ##################################
+class clean(distutils.command.clean.clean):
+    """`clean` command that also removes built extensions and the build tree."""
+
+    def run(self):
+        # Imported here because the module header never imports shutil; the
+        # bare reference used to raise NameError.
+        import shutil
+
+        # Run default behavior first
+        distutils.command.clean.clean.run(self)
+
+        # Remove paddleaudio extension
+        for path in (ROOT_DIR / "paddleaudio").glob("**/*.so"):
+            print(f"removing '{path}'")
+            path.unlink()
+        # Remove build directory
+        build_dirs = [
+            ROOT_DIR / "build",
+        ]
+        for path in build_dirs:
+            if path.exists():
+                print(f"removing '{path}' (and everything under it)")
+                shutil.rmtree(str(path), ignore_errors=True)
+
+
+def main():
+    """Collect git/version information and run setuptools for paddleaudio.
+
+    A `__version__` line is appended to paddleaudio/__init__.py for the
+    duration of the build and removed again afterwards, also when the build
+    fails.
+    """
+
+    sha = _run_cmd(["git", "rev-parse", "HEAD"])  # commit id
+    branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
+    tag = _run_cmd(["git", "describe", "--tags", "--exact-match", "@"])
+    print("-- Git branch:", branch)
+    print("-- Git SHA:", sha)
+    print("-- Git tag:", tag)
+    version = _get_version(sha)
+    print("-- Building version", version)
+    # Drop any version line left behind by an earlier, interrupted run.
+    _rm_version()
+
+    _make_version_file(version, sha)
+    try:
+        lib_package_data = {}
+        if platform.system() != 'Windows' and platform.system() != 'Linux':
+            lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']}
+
+        if platform.system() == 'Linux':
+            lib_package_data = {'paddleaudio': ['lib/lib*']}
+
+        setup_info = dict(
+            # Metadata
+            name='paddleaudio',
+            # NOTE(review): the package metadata uses the plain VERSION, not
+            # the sha-suffixed `version` written to __init__.py -- confirm
+            # this is intended.
+            version=VERSION,
+            author='PaddlePaddle Speech and Language Team',
+            author_email='paddlesl@baidu.com',
+            url='https://github.com/PaddlePaddle/PaddleSpeech/audio',
+            license='Apache 2.0',
+            description='Speech audio tools based on Paddlepaddle',
+            # A missing comma used to merge these into the single keyword
+            # "audio processpaddlepaddle".
+            keywords=[
+                "audio process",
+                "paddlepaddle",
+            ],
+            python_requires='>=3.7',
+            install_requires=requirements["install"],
+            extras_require={
+                'develop':
+                requirements["develop"],
+                #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"],
+            },
+            cmdclass={
+                "build_ext": setup_helpers.CMakeBuild,
+                'develop': DevelopCommand,
+                'test': TestCommand,
+                'upload': UploadCommand,
+                "clean": clean,
+            },
+
+            # Package info
+            # The trailing comma makes this a one-element tuple. A bare
+            # string is unpacked character by character, and its '*'
+            # character then matches every package.
+            packages=find_packages(include=('paddleaudio*', )),
+            package_data=lib_package_data,
+            ext_modules=setup_helpers.get_ext_modules(),
+            zip_safe=True,
+            classifiers=[
+                'Development Status :: 5 - Production/Stable',
+                'Intended Audience :: Developers',
+                'Intended Audience :: Science/Research',
+                'Topic :: Scientific/Engineering :: Artificial Intelligence',
+                'License :: OSI Approved :: Apache Software License',
+                'Programming Language :: Python',
+                'Programming Language :: Python :: 3',
+                'Programming Language :: Python :: 3.6',
+                'Programming Language :: Python :: 3.7',
+                'Programming Language :: Python :: 3.8',
+                'Programming Language :: Python :: 3.9',
+                'Programming Language :: Python :: 3.10',
+            ],
+        )
+
+        setup(**setup_info)
+    finally:
+        # Restore __init__.py even when the build fails or exits early.
+        _rm_version()
+    _rm_version()
+
+if __name__ == '__main__':
+    main()
--- a/tests/unit/audio/backends/base.py
+++ b/tests/unit/audio/backends/base.py
--- a/audio/tests/backends/common.py
+++ b/audio/tests/backends/common.py
@ -0,0 +1,32 @@
+
+def get_encoding(ext, dtype):
+    """Return the encoding name expected for a file extension and dtype.
+
+    For "mp3", "flac" and "vorbis" the upper-cased extension is returned;
+    any other extension is resolved through the dtype table.
+    """
+    named_after_ext = {"mp3", "flac", "vorbis"}
+    by_dtype = dict(
+        float32="PCM_F",
+        int32="PCM_S",
+        int16="PCM_S",
+        uint8="PCM_U", )
+    if ext in named_after_ext:
+        return ext.upper()
+    return by_dtype[dtype]
+
+
+def get_bit_depth(dtype):
+    """Return the bit width registered for `dtype` (KeyError if unknown)."""
+    widths = dict(float32=32, int32=32, int16=16, uint8=8)
+    return widths[dtype]
+
+def get_bits_per_sample(ext, dtype):
+    """Return the bits-per-sample value expected for an extension and dtype.
+
+    "flac", "mp3" and "vorbis" have fixed values; any other extension uses
+    the bit depth of `dtype`. The dtype lookup always happens, so an unknown
+    dtype raises KeyError even for the fixed-value extensions.
+    """
+    from_dtype = get_bit_depth(dtype)
+    fixed = dict(flac=24, mp3=0, vorbis=0)
+    if ext in fixed:
+        return fixed[ext]
+    return from_dtype
--- a/audio/tests/backends/soundfile/base.py
+++ b/audio/tests/backends/soundfile/base.py
@ -0,0 +1,34 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import urllib.request
+
+mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
+multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav'
+
+
+class BackendTest(unittest.TestCase):
+    """Base test case that makes the sample wav files available locally."""
+
+    def setUp(self):
+        self.initWavInput()
+
+    def initWavInput(self):
+        """Fill `self.files` with the sample file names, downloading any that
+        are not already present in the working directory."""
+        self.files = []
+        for url in [mono_channel_wav, multi_channels_wav]:
+            local_name = os.path.basename(url)
+            if not os.path.isfile(local_name):
+                urllib.request.urlretrieve(url, local_name)
+            self.files.append(local_name)
+
+    def initParmas(self):
+        """Hook for subclasses; the base implementation always raises."""
+        raise NotImplementedError
--- a/audio/tests/backends/soundfile/common.py
+++ b/audio/tests/backends/soundfile/common.py
@ -0,0 +1,89 @@
+import itertools
+from unittest import skipIf
+
+from paddleaudio._internal.module_utils import is_module_available
+from parameterized import parameterized
+
+
+def name_func(func, _, params):
+    """Build a test name from the function name and its stringified arguments."""
+    suffix = "_".join(map(str, params.args))
+    return f'{func.__name__}_{suffix}'
+
+
+def dtype2subtype(dtype):
+    """Map a dtype name to the matching soundfile subtype (KeyError if unknown)."""
+    subtypes = dict(
+        float64="DOUBLE",
+        float32="FLOAT",
+        int32="PCM_32",
+        int16="PCM_16",
+        uint8="PCM_U8",
+        int8="PCM_S8", )
+    return subtypes[dtype]
+
+
+def skipIfFormatNotSupported(fmt):
+    """Return a decorator that skips a test when soundfile cannot handle `fmt`.
+
+    The test is skipped unconditionally when soundfile is not available.
+    """
+    if not is_module_available("soundfile"):
+        return skipIf(True, '"soundfile" not available.')
+
+    import soundfile
+
+    supported = soundfile.available_formats()
+    return skipIf(fmt not in supported,
+                  f'"{fmt}" is not supported by soundfile')
+
+
+def parameterize(*params):
+    """Expand a test over the cartesian product of the given value lists."""
+    combinations = list(itertools.product(*params))
+    return parameterized.expand(combinations, name_func=name_func)
+
+
+def fetch_wav_subtype(dtype, encoding, bits_per_sample):
+    """Resolve the wav subtype for an (encoding, bits_per_sample) pair.
+
+    When both are None, the subtype follows `dtype`. The dtype lookup is done
+    up front, so an unknown dtype raises KeyError whatever the other
+    arguments are.
+
+    Raises:
+        ValueError: when the pair is not a supported combination.
+    """
+    from_dtype = dtype2subtype(dtype)
+    table = {
+        (None, None): from_dtype,
+        (None, 8): "PCM_U8",
+        ("PCM_U", None): "PCM_U8",
+        ("PCM_U", 8): "PCM_U8",
+        ("PCM_S", None): "PCM_32",
+        ("PCM_S", 16): "PCM_16",
+        ("PCM_S", 32): "PCM_32",
+        ("PCM_F", None): "FLOAT",
+        ("PCM_F", 32): "FLOAT",
+        ("PCM_F", 64): "DOUBLE",
+        ("ULAW", None): "ULAW",
+        ("ULAW", 8): "ULAW",
+        ("ALAW", None): "ALAW",
+        ("ALAW", 8): "ALAW",
+    }
+    chosen = table.get((encoding, bits_per_sample))
+    if not chosen:
+        raise ValueError(
+            f"wav does not support ({encoding}, {bits_per_sample}).")
+    return chosen
+
+def get_encoding(ext, dtype):
+    """Return the encoding name expected for a file extension and dtype.
+
+    "mp3", "flac" and "vorbis" map to their upper-cased extension; every
+    other extension is looked up by dtype.
+    """
+    if ext in {"mp3", "flac", "vorbis"}:
+        return ext.upper()
+    dtype_encodings = {
+        "float32": "PCM_F",
+        "int32": "PCM_S",
+        "int16": "PCM_S",
+        "uint8": "PCM_U",
+    }
+    return dtype_encodings[dtype]
+
+
+def get_bit_depth(dtype):
+    """Return the number of bits for a dtype name (KeyError if unknown)."""
+    depth_by_dtype = {"float32": 32, "int32": 32, "int16": 16, "uint8": 8}
+    return depth_by_dtype[dtype]
+
+def get_bits_per_sample(ext, dtype):
+    """Return the expected bits-per-sample for an extension and dtype.
+
+    The dtype's bit depth is computed first (so an unknown dtype raises
+    KeyError), then overridden by the fixed value for "flac", "mp3" or
+    "vorbis".
+    """
+    dtype_bits = get_bit_depth(dtype)
+    overrides = {"flac": 24, "mp3": 0, "vorbis": 0}
+    return overrides[ext] if ext in overrides else dtype_bits
--- a/audio/tests/backends/soundfile/common_utils
+++ b/audio/tests/backends/soundfile/common_utils
@ -0,0 +1 @@
+../../common_utils
--- a/audio/tests/backends/soundfile/info_test.py
+++ b/audio/tests/backends/soundfile/info_test.py
@ -0,0 +1,199 @@
+#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py
+import tarfile
+import unittest
+import warnings
+from unittest.mock import patch
+
+import paddle
+import soundfile
+from common import get_bits_per_sample
+from common import get_encoding
+from common import parameterize
+from common import skipIfFormatNotSupported
+from common_utils import get_wav_data
+from common_utils import nested_params
+from common_utils import save_wav
+from common_utils import TempDirMixin
+from paddleaudio.backends import soundfile_backend
+
+
+class TestInfo(TempDirMixin, unittest.TestCase):
+    """Checks the metadata reported by `soundfile_backend.info`."""
+
+    @parameterize(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2], )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`soundfile_backend.info` can check wav file correctly"""
+        duration = 1
+        path = self.get_temp_path("data.wav")
+        data = get_wav_data(
+            dtype,
+            num_channels,
+            normalize=False,
+            num_frames=duration * sample_rate)
+        save_wav(path, data, sample_rate)
+        info = soundfile_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == get_bits_per_sample("wav", dtype)
+        assert info.encoding == get_encoding("wav", dtype)
+
+    @parameterize([8000, 16000], [1, 2])
+    @skipIfFormatNotSupported("FLAC")
+    def test_flac(self, sample_rate, num_channels):
+        """`soundfile_backend.info` can check flac file correctly"""
+        duration = 1
+        num_frames = sample_rate * duration
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+
+        path = self.get_temp_path("data.flac")
+        soundfile.write(path, data, sample_rate)
+
+        info = soundfile_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == num_frames
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == 16
+        assert info.encoding == "FLAC"
+
+    # The ogg variant of this test is disabled; it is kept for reference.
+    #@parameterize([8000, 16000], [1, 2])
+    #@skipIfFormatNotSupported("OGG")
+    #def test_ogg(self, sample_rate, num_channels):
+    #"""`soundfile_backend.info` can check ogg file correctly"""
+    #duration = 1
+    #num_frames = sample_rate * duration
+    ##data = torch.randn(num_frames, num_channels).numpy()
+    #data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+    #print(len(data))
+    #path = self.get_temp_path("data.ogg")
+    #soundfile.write(path, data, sample_rate)
+
+    #info = soundfile_backend.info(path)
+    #print(info)
+    #assert info.sample_rate == sample_rate
+    #print("info")
+    #print(info.num_frames)
+    #print("jiji")
+    #print(sample_rate*duration)
+    ##assert info.num_frames == sample_rate * duration
+    #assert info.num_channels == num_channels
+    #assert info.bits_per_sample == 0
+    #assert info.encoding == "VORBIS"
+
+    @nested_params(
+        [8000, 16000],
+        [1, 2],
+        [("PCM_24", 24), ("PCM_32", 32)], )
+    @skipIfFormatNotSupported("NIST")
+    def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth):
+        """`soundfile_backend.info` can check sph file correctly"""
+        duration = 1
+        num_frames = sample_rate * duration
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+        path = self.get_temp_path("data.nist")
+        subtype, bits_per_sample = subtype_and_bit_depth
+        soundfile.write(path, data, sample_rate, subtype=subtype)
+
+        info = soundfile_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        assert info.encoding == "PCM_S"
+
+    def test_unknown_subtype_warning(self):
+        """soundfile_backend.info issues a warning when the subtype is unknown
+
+        This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE
+        dict should be updated.
+        """
+
+        def _mock_info_func(_):
+            class MockSoundFileInfo:
+                samplerate = 8000
+                frames = 356
+                channels = 2
+                subtype = "UNSEEN_SUBTYPE"
+                format = "UNKNOWN"
+
+            return MockSoundFileInfo()
+
+        with patch("soundfile.info", _mock_info_func):
+            with warnings.catch_warnings(record=True) as w:
+                # Record every warning regardless of the ambient filters
+                # (e.g. -W ignore); otherwise the count below depends on how
+                # the test run was started.
+                warnings.simplefilter("always")
+                info = soundfile_backend.info("foo")
+                assert len(w) == 1
+                assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(
+                    w[-1].message)
+                assert info.bits_per_sample == 0
+
+
+class TestFileObject(TempDirMixin, unittest.TestCase):
+    """Checks `soundfile_backend.info` on file-like objects."""
+
+    def _test_fileobj(self, ext, subtype, bits_per_sample):
+        """Query audio via file-like object works"""
+        duration = 2
+        sample_rate = 16000
+        num_channels = 2
+        num_frames = sample_rate * duration
+        path = self.get_temp_path(f"test.{ext}")
+
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+        soundfile.write(path, data, sample_rate, subtype=subtype)
+
+        with open(path, "rb") as fileobj:
+            info = soundfile_backend.info(fileobj)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == num_frames
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        # Compute the expectation first: `assert a == "FLAC" if c else "PCM_S"`
+        # parses as `(a == "FLAC") if c else "PCM_S"`, which is always truthy
+        # for non-flac input.
+        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
+        assert info.encoding == expected_encoding
+
+    def test_fileobj_wav(self):
+        """Querying a wav file via file-like object works"""
+        self._test_fileobj("wav", "PCM_16", 16)
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_fileobj_flac(self):
+        """Querying a flac file via file-like object works"""
+        self._test_fileobj("flac", "PCM_16", 16)
+
+    def _test_tarobj(self, ext, subtype, bits_per_sample):
+        """Query audio via a file-like object extracted from a tar archive"""
+        duration = 2
+        sample_rate = 16000
+        num_channels = 2
+        num_frames = sample_rate * duration
+        audio_file = f"test.{ext}"
+        audio_path = self.get_temp_path(audio_file)
+        archive_path = self.get_temp_path("archive.tar.gz")
+
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+        soundfile.write(audio_path, data, sample_rate, subtype=subtype)
+
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(audio_path, arcname=audio_file)
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            fileobj = tarobj.extractfile(audio_file)
+            info = soundfile_backend.info(fileobj)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == num_frames
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        # Same precedence fix as in _test_fileobj.
+        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
+        assert info.encoding == expected_encoding
+
+    def test_tarobj_wav(self):
+        """Query compressed audio via file-like object works"""
+        self._test_tarobj("wav", "PCM_16", 16)
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_tarobj_flac(self):
+        """Query compressed audio via file-like object works"""
+        self._test_tarobj("flac", "PCM_16", 16)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/audio/tests/backends/soundfile/load_test.py
+++ b/audio/tests/backends/soundfile/load_test.py
@ -0,0 +1,363 @@
+#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py
+import os
+import tarfile
+import unittest
+from unittest.mock import patch
+
+import numpy as np
+import paddle
+import soundfile
+from common import dtype2subtype
+from common import parameterize
+from common import skipIfFormatNotSupported
+from common_utils import get_wav_data
+from common_utils import load_wav
+from common_utils import normalize_wav
+from common_utils import save_wav
+from common_utils import TempDirMixin
+from paddleaudio.backends import soundfile_backend
+from parameterized import parameterized
+
+
+def _get_mock_path(
+        ext: str,
+        dtype: str,
+        sample_rate: int,
+        num_channels: int,
+        num_frames: int, ):
+    """Encode the audio parameters into a fake file name.
+
+    The result has the form "<dtype>_<sample_rate>_<num_channels>_<num_frames>.<ext>".
+    """
+    stem = f"{dtype}_{sample_rate}_{num_channels}_{num_frames}"
+    return f"{stem}.{ext}"
+
+
+def _get_mock_params(path: str):
+    """Decode a "<dtype>_<sample_rate>_<num_channels>_<num_frames>.<ext>" name.
+
+    Returns a dict with the keys "ext", "dtype", "sample_rate",
+    "num_channels" and "num_frames"; the three numeric fields are ints.
+    """
+    stem, extension = path.split(".")
+    fields = stem.split("_")
+    params = {"ext": extension, "dtype": fields[0]}
+    params["sample_rate"] = int(fields[1])
+    params["num_channels"] = int(fields[2])
+    params["num_frames"] = int(fields[3])
+    return params
+
+
+class SoundFileMock:
+    """Stand-in for `soundfile.SoundFile` whose properties come from the path.
+
+    The path is expected to be a name produced by `_get_mock_path`; no file
+    is opened.
+    """
+
+    def __init__(self, path, mode):
+        assert mode == "r"
+        self.path = path
+        self._params = _get_mock_params(path)
+        self._start = None
+
+    @property
+    def samplerate(self):
+        return self._params["sample_rate"]
+
+    @property
+    def format(self):
+        """Container format for the extension, or None when it is not listed."""
+        formats = {
+            "wav": "WAV",
+            "flac": "FLAC",
+            "ogg": "OGG",
+            "sph": "NIST",
+            "nis": "NIST",
+            "nist": "NIST",
+        }
+        return formats.get(self._params["ext"])
+
+    @property
+    def subtype(self):
+        """"VORBIS" for ogg, otherwise the subtype matching the dtype."""
+        if self._params["ext"] != "ogg":
+            return dtype2subtype(self._params["dtype"])
+        return "VORBIS"
+
+    def _prepare_read(self, start, stop, frames):
+        """Remember the start offset and echo `frames` back."""
+        assert stop is None
+        self._start = start
+        return frames
+
+    def read(self, frames, dtype, always_2d):
+        """Generate the full signal and return the requested slice of frames."""
+        assert always_2d
+        generated = get_wav_data(
+            dtype,
+            self._params["num_channels"],
+            normalize=False,
+            num_frames=self._params["num_frames"],
+            channels_first=False, ).numpy()
+        first = self._start
+        return generated[first:first + frames]
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        pass
+
+
+class MockedLoadTest(unittest.TestCase):
+    """Checks the dtype returned by `soundfile_backend.load` using a mocked SoundFile."""
+
+    def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize,
+                     channels_first):
+        """The native dtype is expected only when normalize is False and the
+        extension is "wav" or "nist"; every other case expects float32."""
+        num_frames = 3 * sample_rate
+        path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames)
+        if normalize or ext not in ["wav", "nist"]:
+            expected_dtype = paddle.float32
+        else:
+            expected_dtype = getattr(paddle, dtype)
+        with patch("soundfile.SoundFile", SoundFileMock):
+            loaded, loaded_rate = soundfile_backend.load(
+                path, normalize=normalize, channels_first=channels_first)
+            assert loaded.dtype == expected_dtype
+            assert sample_rate == loaded_rate
+
+    @parameterize(
+        ["int32", "float32", "float64"],
+        [8000, 16000],
+        [1, 2],
+        [True, False],
+        [True, False], )
+    def test_wav(self, dtype, sample_rate, num_channels, normalize,
+                 channels_first):
+        """wav: native dtype when normalize=False, float32 otherwise"""
+        self.assert_dtype(
+            "wav",
+            dtype,
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first, )
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [True, False],
+        [True, False], )
+    def test_sphere(self, dtype, sample_rate, num_channels, normalize,
+                    channels_first):
+        """sph: float32 is expected in every case"""
+        self.assert_dtype(
+            "sph",
+            dtype,
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first, )
+
+    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
+    def test_ogg(self, sample_rate, num_channels, normalize, channels_first):
+        """ogg: float32 is expected in every case"""
+        self.assert_dtype(
+            "ogg",
+            "int16",
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first, )
+
+    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
+    def test_flac(self, sample_rate, num_channels, normalize, channels_first):
+        """flac: float32 is expected in every case"""
+        self.assert_dtype(
+            "flac",
+            "int16",
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first, )
+
+
+class LoadTestBase(TempDirMixin, unittest.TestCase):
+    """Shared assertions comparing `soundfile_backend.load` against reference data."""
+
+    def assert_wav(
+            self,
+            dtype,
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first=True,
+            duration=1, ):
+        """`soundfile_backend.load` can load wav format correctly.
+
+        A generated signal is written with `save_wav`; what the backend loads
+        must match what `load_wav` returns for the same file.
+        """
+        wav_path = self.get_temp_path("reference.wav")
+        frame_count = duration * sample_rate
+        signal = get_wav_data(
+            dtype,
+            num_channels,
+            normalize=normalize,
+            num_frames=frame_count,
+            channels_first=channels_first, )
+        save_wav(wav_path, signal, sample_rate, channels_first=channels_first)
+        reference = load_wav(
+            wav_path, normalize=normalize, channels_first=channels_first)[0]
+        loaded, loaded_rate = soundfile_backend.load(
+            wav_path, normalize=normalize, channels_first=channels_first)
+        assert loaded_rate == sample_rate
+        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
+
+    def assert_sphere(
+            self,
+            dtype,
+            sample_rate,
+            num_channels,
+            channels_first=True,
+            duration=1, ):
+        """`soundfile_backend.load` can load SPHERE format correctly."""
+        sph_path = self.get_temp_path("reference.sph")
+        frame_count = duration * sample_rate
+        raw = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=frame_count,
+            normalize=False,
+            channels_first=False, )
+        soundfile.write(
+            sph_path,
+            raw,
+            sample_rate,
+            subtype=dtype2subtype(dtype),
+            format="NIST")
+        # raw is generated with channels_first=False; transpose it when the
+        # loader is asked for channels first
+        reference = normalize_wav(raw.t() if channels_first else raw)
+        loaded, loaded_rate = soundfile_backend.load(
+            sph_path, channels_first=channels_first)
+        assert loaded_rate == sample_rate
+        #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8)
+        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
+
+    def assert_flac(
+            self,
+            dtype,
+            sample_rate,
+            num_channels,
+            channels_first=True,
+            duration=1, ):
+        """`soundfile_backend.load` can load FLAC format correctly."""
+        flac_path = self.get_temp_path("reference.flac")
+        frame_count = duration * sample_rate
+        raw = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=frame_count,
+            normalize=False,
+            channels_first=False, )
+        soundfile.write(flac_path, raw, sample_rate)
+        # raw is generated with channels_first=False; transpose it when the
+        # loader is asked for channels first
+        reference = normalize_wav(raw.t() if channels_first else raw)
+        loaded, loaded_rate = soundfile_backend.load(
+            flac_path, channels_first=channels_first)
+        assert loaded_rate == sample_rate
+        #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8)
+        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
+
+
+class TestLoad(LoadTestBase):
+    """Correctness of `soundfile_backend.load` on real files written to disk"""
+
+    @parameterize(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [False, True], )
+    def test_wav(self, dtype, sample_rate, num_channels, normalize,
+                 channels_first):
+        """`soundfile_backend.load` can load wav format correctly."""
+        self.assert_wav(
+            dtype,
+            sample_rate,
+            num_channels,
+            normalize,
+            channels_first=channels_first)
+
+    @parameterize(
+        ["int32"],
+        [16000],
+        [2],
+        [False], )
+    def test_wav_large(self, dtype, sample_rate, num_channels, normalize):
+        """`soundfile_backend.load` can load large wav file correctly."""
+        duration_seconds = 2 * 60 * 60  # two hours
+        self.assert_wav(
+            dtype,
+            sample_rate,
+            num_channels,
+            normalize,
+            duration=duration_seconds)
+
+    @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True])
+    def test_multiple_channels(self, dtype, num_channels, channels_first):
+        """`soundfile_backend.load` can load wav file with more than 2 channels."""
+        self.assert_wav(
+            dtype,
+            8000,  # sample_rate
+            num_channels,
+            False,  # normalize
+            channels_first=channels_first)
+
+    # The sphere and flac variants below are disabled; kept for reference.
+    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
+    #@skipIfFormatNotSupported("NIST")
+    #def test_sphere(self, dtype, sample_rate, num_channels, channels_first):
+    #"""`soundfile_backend.load` can load sphere format correctly."""
+    #self.assert_sphere(dtype, sample_rate, num_channels, channels_first)
+
+    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
+    #@skipIfFormatNotSupported("FLAC")
+    #def test_flac(self, dtype, sample_rate, num_channels, channels_first):
+    #"""`soundfile_backend.load` can load flac format correctly."""
+    #self.assert_flac(dtype, sample_rate, num_channels, channels_first)
+
+
+class TestLoadFormat(TempDirMixin, unittest.TestCase):
+    """`soundfile_backend.load` on audio files whose extension was stripped.
+
+    NOTE(review): despite the class name, no `format` argument is passed to
+    `load` in these tests -- confirm whether that is intended.
+    """
+
+    original = None
+    path = None
+
+    def _make_file(self, format_):
+        """Write a 2-channel float32 clip as `test.<format_>`, then rename it
+        so the path has no extension.
+
+        Returns the extension-less path and the samples read back by
+        `soundfile.read` (transposed) before the rename.
+        """
+        rate = 8000
+        named_path = self.get_temp_path(f"test.{format_}")
+        samples = get_wav_data("float32", num_channels=2).numpy().T
+        soundfile.write(named_path, samples, rate)
+        reference, _ = soundfile.read(named_path, dtype="float32")
+        reference = reference.T
+        bare_path, _ = os.path.splitext(named_path)
+        os.rename(named_path, bare_path)
+        return bare_path, reference
+
+    def _test_format(self, format_):
+        """Load the extension-less file and compare it with the reference."""
+        bare_path, reference = self._make_file(format_)
+        loaded, _ = soundfile_backend.load(bare_path)
+        np.testing.assert_array_almost_equal(loaded, reference)
+
+    @parameterized.expand([
+        ("WAV", ),
+        ("wav", ),
+    ])
+    def test_wav(self, format_):
+        self._test_format(format_)
+
+    @parameterized.expand([
+        ("FLAC", ),
+        ("flac", ),
+    ])
+    @skipIfFormatNotSupported("FLAC")
+    def test_flac(self, format_):
+        self._test_format(format_)
+
+
+class TestFileObject(TempDirMixin, unittest.TestCase):
+    """`soundfile_backend.load` accepts file-like objects as input."""
+
+    def _test_fileobj(self, ext):
+        """Loading audio via file-like object works"""
+        sample_rate = 16000
+        path = self.get_temp_path(f"test.{ext}")
+
+        # Write with soundfile and read back with soundfile to obtain the
+        # reference samples the backend result is compared against.
+        data = get_wav_data("float32", num_channels=2).numpy().T
+        soundfile.write(path, data, sample_rate)
+        expected = soundfile.read(path, dtype="float32")[0].T
+
+        with open(path, "rb") as fileobj:
+            found, sr = soundfile_backend.load(fileobj)
+        assert sr == sample_rate
+        np.testing.assert_array_almost_equal(found, expected)
+
+    def test_fileobj_wav(self):
+        """Loading audio via file-like object works"""
+        self._test_fileobj("wav")
+
+    # Skip instead of failing when the installed libsndfile lacks FLAC,
+    # matching `TestLoadFormat.test_flac` in this module.
+    @skipIfFormatNotSupported("FLAC")
+    def test_fileobj_flac(self):
+        """Loading audio via file-like object works"""
+        self._test_fileobj("flac")
+
+    def _test_tarfile(self, ext):
+        """Loading audio from a member of a tar archive works"""
+        sample_rate = 16000
+        audio_file = f"test.{ext}"
+        audio_path = self.get_temp_path(audio_file)
+        archive_path = self.get_temp_path("archive.tar.gz")
+
+        data = get_wav_data("float32", num_channels=2).numpy().T
+        soundfile.write(audio_path, data, sample_rate)
+        expected = soundfile.read(audio_path, dtype="float32")[0].T
+
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(audio_path, arcname=audio_file)
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            # The member must be read while the archive is still open.
+            fileobj = tarobj.extractfile(audio_file)
+            found, sr = soundfile_backend.load(fileobj)
+
+        assert sr == sample_rate
+        np.testing.assert_array_almost_equal(found.numpy(), expected)
+
+    def test_tarfile_wav(self):
+        """Loading audio from a tar archive member works"""
+        self._test_tarfile("wav")
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_tarfile_flac(self):
+        """Loading audio from a tar archive member works"""
+        self._test_tarfile("flac")
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/audio/tests/backends/soundfile/save_test.py
+++ b/audio/tests/backends/soundfile/save_test.py
@ -0,0 +1,323 @@
+import io
+import unittest
+from unittest.mock import patch
+
+import numpy as np
+import paddle
+import soundfile
+from common import fetch_wav_subtype
+from common import parameterize
+from common import skipIfFormatNotSupported
+from common_utils import get_wav_data
+from common_utils import load_wav
+from common_utils import nested_params
+from common_utils import TempDirMixin
+from paddleaudio.backends import soundfile_backend
+
+
+class MockedSaveTest(unittest.TestCase):
+    """Checks the keyword arguments `soundfile_backend.save` hands to
+    `soundfile.write`, with `soundfile.write` replaced by a mock."""
+
+    @nested_params(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [
+            (None, None),
+            ("PCM_U", None),
+            ("PCM_U", 8),
+            ("PCM_S", None),
+            ("PCM_S", 16),
+            ("PCM_S", 32),
+            ("PCM_F", None),
+            ("PCM_F", 32),
+            ("PCM_F", 64),
+            ("ULAW", None),
+            ("ULAW", 8),
+            ("ALAW", None),
+            ("ALAW", 8),
+        ], )
+    @patch("soundfile.write")
+    def test_wav(self, dtype, sample_rate, num_channels, channels_first,
+                 enc_params, mocked_write):
+        """soundfile_backend.save passes correct subtype to soundfile.write when WAV"""
+        filepath = "foo.wav"
+        input_tensor = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=3 * sample_rate,
+            normalize=dtype == "float32",
+            channels_first=channels_first, )
+        # Swap the two axes of the generated data before handing it to save.
+        input_tensor = paddle.transpose(input_tensor, [1, 0])
+
+        # enc_params is an (encoding, bits_per_sample) pair from the table above.
+        encoding, bits_per_sample = enc_params
+        soundfile_backend.save(
+            filepath,
+            input_tensor,
+            sample_rate,
+            channels_first=channels_first,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample, )
+
+        # call_args[1] is the kwargs dict of the last call to the mock
+        # (on Python 3.8+ `call_args.kwargs` is more descriptive).
+        args = mocked_write.call_args[1]
+        assert args["file"] == filepath
+        assert args["samplerate"] == sample_rate
+        assert args["subtype"] == fetch_wav_subtype(dtype, encoding,
+                                                    bits_per_sample)
+        assert args["format"] is None
+        tensor_result = paddle.transpose(
+            input_tensor, [1, 0]) if channels_first else input_tensor
+        #self.assertEqual(args["data"], tensor_result.numpy())
+        np.testing.assert_array_almost_equal(args["data"].numpy(),
+                                             tensor_result.numpy())
+
+    @patch("soundfile.write")
+    def assert_non_wav(
+            self,
+            fmt,
+            dtype,
+            sample_rate,
+            num_channels,
+            channels_first,
+            mocked_write,
+            encoding=None,
+            bits_per_sample=None, ):
+        """soundfile_backend.save passes the expected file, sample rate, format
+        and data to soundfile.write for a non-WAV extension.
+
+        The format must be "NIST" for the SPHERE extensions ("sph", "nist",
+        "nis") and None otherwise. `mocked_write` is injected by the `patch`
+        decorator; callers do not supply it.
+        """
+        filepath = f"foo.{fmt}"
+        input_tensor = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=3 * sample_rate,
+            normalize=False,
+            channels_first=channels_first, )
+        # Swap the two axes of the generated data before handing it to save.
+        input_tensor = paddle.transpose(input_tensor, [1, 0])
+
+        expected_data = paddle.transpose(
+            input_tensor, [1, 0]) if channels_first else input_tensor
+
+        soundfile_backend.save(
+            filepath,
+            input_tensor,
+            sample_rate,
+            channels_first,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample, )
+
+        # call_args[1] is the kwargs dict of the last call to the mock
+        # (on Python 3.8+ `call_args.kwargs` is more descriptive).
+        args = mocked_write.call_args[1]
+        assert args["file"] == filepath
+        assert args["samplerate"] == sample_rate
+        if fmt in ["sph", "nist", "nis"]:
+            assert args["format"] == "NIST"
+        else:
+            assert args["format"] is None
+        np.testing.assert_array_almost_equal(args["data"].numpy(),
+                                             expected_data.numpy())
+        #self.assertEqual(args["data"], expected_data)
+
+    @nested_params(
+        ["sph", "nist", "nis"],
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [
+            ("PCM_S", 8),
+            ("PCM_S", 16),
+            ("PCM_S", 24),
+            ("PCM_S", 32),
+            ("ULAW", 8),
+            ("ALAW", 8),
+            ("ALAW", 16),
+            ("ALAW", 24),
+            ("ALAW", 32),
+        ], )
+    def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first,
+                 enc_params):
+        """soundfile_backend.save passes format "NIST" to soundfile.write for
+        the SPHERE extensions"""
+        encoding, bits_per_sample = enc_params
+        self.assert_non_wav(
+            fmt,
+            dtype,
+            sample_rate,
+            num_channels,
+            channels_first,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample)
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [8, 16, 24], )
+    def test_flac(self, dtype, sample_rate, num_channels, channels_first,
+                  bits_per_sample):
+        """soundfile_backend.save passes the default format (None) to
+        soundfile.write when saving FLAC"""
+        self.assert_non_wav(
+            "flac",
+            dtype,
+            sample_rate,
+            num_channels,
+            channels_first,
+            bits_per_sample=bits_per_sample)
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True], )
+    def test_ogg(self, dtype, sample_rate, num_channels, channels_first):
+        """soundfile_backend.save passes the default format (None) to
+        soundfile.write when saving OGG"""
+        self.assert_non_wav("ogg", dtype, sample_rate, num_channels,
+                            channels_first)
+
+
+class SaveTestBase(TempDirMixin, unittest.TestCase):
+    """Reusable save checks that write real files into the temp directory."""
+
+    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
+        """Save a wav with `soundfile_backend.save`, read it back with
+        `load_wav`, and require matching sample rate and samples."""
+        wav_path = self.get_temp_path("data.wav")
+        reference = get_wav_data(
+            dtype, num_channels, num_frames=num_frames, normalize=False)
+        soundfile_backend.save(wav_path, reference, sample_rate)
+        loaded, loaded_rate = load_wav(wav_path, normalize=False)
+        assert sample_rate == loaded_rate
+        np.testing.assert_array_almost_equal(loaded.numpy(),
+                                             reference.numpy())
+
+    def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels):
+        """Save three seconds of audio as `data.<fmt>` and validate metadata.
+
+        Only the container format, channel count and sample rate reported by
+        `soundfile.info` are compared; sample values are not decoded.
+        """
+        out_path = self.get_temp_path(f"data.{fmt}")
+        waveform = get_wav_data(
+            dtype, num_channels, num_frames=sample_rate * 3, normalize=False)
+        soundfile_backend.save(out_path, waveform, sample_rate)
+        meta = soundfile.info(out_path)
+        assert meta.format == fmt.upper()
+        # The frame-count comparison is intentionally left out: the original
+        # author noted that `sinfo.frames == num_frames` goes wrong here.
+        assert meta.channels == num_channels
+        assert meta.samplerate == sample_rate
+
+    def assert_flac(self, dtype, sample_rate, num_channels):
+        """Metadata check for a file saved with the "flac" extension."""
+        self._assert_non_wav("flac", dtype, sample_rate, num_channels)
+
+    def assert_sphere(self, dtype, sample_rate, num_channels):
+        """Metadata check for a file saved with the "nist" extension."""
+        self._assert_non_wav("nist", dtype, sample_rate, num_channels)
+
+    def assert_ogg(self, dtype, sample_rate, num_channels):
+        """Metadata check for a file saved with the "ogg" extension."""
+        self._assert_non_wav("ogg", dtype, sample_rate, num_channels)
+
+
+class TestSave(SaveTestBase):
+    """Parameterized entry points for the save helpers on `SaveTestBase`."""
+
+    @parameterize(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2], )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`soundfile_backend.save` can save wav format."""
+        # num_frames=None is forwarded unchanged to `get_wav_data`.
+        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)
+
+    @parameterize(
+        ["float32", "int32"],
+        [4, 8, 16, 32], )
+    def test_multiple_channels(self, dtype, num_channels):
+        """`soundfile_backend.save` can save wav with more than 2 channels."""
+        sample_rate = 8000
+        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2], )
+    @skipIfFormatNotSupported("NIST")
+    def test_sphere(self, dtype, sample_rate, num_channels):
+        """`soundfile_backend.save` can save sph format."""
+        self.assert_sphere(dtype, sample_rate, num_channels)
+
+    # The FLAC and OGG cases fix the dtype to "float32" and vary only the
+    # sample rate and channel count.
+    @parameterize(
+        [8000, 16000],
+        [1, 2], )
+    @skipIfFormatNotSupported("FLAC")
+    def test_flac(self, sample_rate, num_channels):
+        """`soundfile_backend.save` can save flac format."""
+        self.assert_flac("float32", sample_rate, num_channels)
+
+    @parameterize(
+        [8000, 16000],
+        [1, 2], )
+    @skipIfFormatNotSupported("OGG")
+    def test_ogg(self, sample_rate, num_channels):
+        """`soundfile_backend.save` can save ogg/vorbis format."""
+        self.assert_ogg("float32", sample_rate, num_channels)
+
+
+class TestSaveParams(TempDirMixin, unittest.TestCase):
+    """Checks on the optional arguments accepted by `soundfile_backend.save`"""
+
+    @parameterize([True, False])
+    def test_channels_first(self, channels_first):
+        """Saving with either `channels_first` value yields the same file content"""
+        wav_path = self.get_temp_path("data.wav")
+        waveform = get_wav_data("int32", 2, channels_first=channels_first)
+        soundfile_backend.save(
+            wav_path, waveform, 8000, channels_first=channels_first)
+        loaded, _ = load_wav(wav_path)
+        # `load_wav` output is compared against the input as-is when it was
+        # channels-first, and against its transpose otherwise.
+        if channels_first:
+            expected = waveform
+        else:
+            expected = waveform.transpose([1, 0])
+        np.testing.assert_array_almost_equal(loaded.numpy(), expected.numpy())
+
+
+class TestFileObject(TempDirMixin, unittest.TestCase):
+    """`soundfile_backend.save` can write into an in-memory file object."""
+
+    def _test_fileobj(self, ext):
+        """Save to a `BytesIO` with `format=ext` and compare against a file
+        written directly by `soundfile.write`."""
+        rate = 16000
+        reference_path = self.get_temp_path(f"test.{ext}")
+
+        # Only the wav reference is forced to a FLOAT subtype; other formats
+        # use whatever soundfile picks by default.
+        if ext == "wav":
+            subtype = "FLOAT"
+        else:
+            subtype = None
+        waveform = get_wav_data("float32", num_channels=2)
+        soundfile.write(
+            reference_path, waveform.numpy().T, rate, subtype=subtype)
+        expected, _ = soundfile.read(reference_path, dtype="float32")
+
+        buffer = io.BytesIO()
+        soundfile_backend.save(buffer, waveform, rate, format=ext)
+        # Rewind so the read below starts from the first written byte.
+        buffer.seek(0)
+        found, found_rate = soundfile.read(buffer, dtype="float32")
+
+        assert found_rate == rate
+        np.testing.assert_array_almost_equal(found, expected)
+
+    def test_fileobj_wav(self):
+        """Saving wav audio to a file-like object works"""
+        self._test_fileobj("wav")
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_fileobj_flac(self):
+        """Saving flac audio to a file-like object works"""
+        self._test_fileobj("flac")
+
+    @skipIfFormatNotSupported("NIST")
+    def test_fileobj_nist(self):
+        """Saving NIST audio to a file-like object works"""
+        self._test_fileobj("NIST")
+
+    @skipIfFormatNotSupported("OGG")
+    def test_fileobj_ogg(self):
+        """Saving OGG audio to a file-like object works"""
+        self._test_fileobj("OGG")
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/unit/audio/backends/soundfile/test_io.py
+++ b/tests/unit/audio/backends/soundfile/test_io.py
@ -16,16 +16,17 @@ import os
 import unittest

 import numpy as np
+from paddleaudio.backends import soundfile_load as load
+from paddleaudio.backends import soundfile_save as save
 import soundfile as sf

-import paddlespeech.audio
-from ..base import BackendTest
+from base import BackendTest


 class TestIO(BackendTest):
    def test_load_mono_channel(self):
        sf_data, sf_sr = sf.read(self.files[0])
-        pa_data, pa_sr = paddlespeech.audio.load(
+        pa_data, pa_sr = load(
            self.files[0], normal=False, dtype='float64')

        self.assertEqual(sf_data.dtype, pa_data.dtype)
@ -35,7 +36,7 @@ class TestIO(BackendTest):
    def test_load_multi_channels(self):
        sf_data, sf_sr = sf.read(self.files[1])
        sf_data = sf_data.T  # Channel dim first
-        pa_data, pa_sr = paddlespeech.audio.load(
+        pa_data, pa_sr = load(
            self.files[1], mono=False, normal=False, dtype='float64')

        self.assertEqual(sf_data.dtype, pa_data.dtype)
@ -49,7 +50,7 @@ class TestIO(BackendTest):
        pa_tmp_file = 'pa_tmp.wav'

        sf.write(sf_tmp_file, waveform, sr)
-        paddlespeech.audio.save(waveform, sr, pa_tmp_file)
+        save(waveform, sr, pa_tmp_file)

        self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file))
        for file in [sf_tmp_file, pa_tmp_file]:
@ -62,7 +63,7 @@ class TestIO(BackendTest):
        pa_tmp_file = 'pa_tmp.wav'

        sf.write(sf_tmp_file, waveform.T, sr)
-        paddlespeech.audio.save(waveform.T, sr, pa_tmp_file)
+        save(waveform.T, sr, pa_tmp_file)

        self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file))
        for file in [sf_tmp_file, pa_tmp_file]:
--- a/audio/tests/backends/sox_io/common.py
+++ b/audio/tests/backends/sox_io/common.py
@ -0,0 +1,89 @@
+import itertools
+from unittest import skipIf
+
+from paddleaudio._internal.module_utils import is_module_available
+from parameterized import parameterized
+
+
+def name_func(func, _, params):
+    """Build a test name from the function name and its stringified
+    positional parameters, joined with underscores."""
+    suffix = "_".join(map(str, params.args))
+    return "{}_{}".format(func.__name__, suffix)
+
+
+def dtype2subtype(dtype):
+    """Map a dtype name to the corresponding soundfile subtype string.
+
+    Raises KeyError for a dtype name that is not in the table.
+    """
+    subtype_by_dtype = dict(
+        float64="DOUBLE",
+        float32="FLOAT",
+        int32="PCM_32",
+        int16="PCM_16",
+        uint8="PCM_U8",
+        int8="PCM_S8", )
+    return subtype_by_dtype[dtype]
+
+
+def skipIfFormatNotSupported(fmt):
+    """Return a skip decorator for tests that need soundfile format `fmt`.
+
+    The test is skipped when the `soundfile` module is unavailable, or when
+    `fmt` is not among `soundfile.available_formats()`.
+    """
+    if not is_module_available("soundfile"):
+        return skipIf(True, '"soundfile" not available.')
+
+    import soundfile
+
+    supported = soundfile.available_formats()
+    return skipIf(fmt not in supported,
+                  f'"{fmt}" is not supported by soundfile')
+
+
+def parameterize(*params):
+    """Expand a test over the Cartesian product of the given value lists,
+    naming each generated case with `name_func`."""
+    combinations = [combo for combo in itertools.product(*params)]
+    return parameterized.expand(combinations, name_func=name_func)
+
+
+def fetch_wav_subtype(dtype, encoding, bits_per_sample):
+    """Return the soundfile subtype expected for a wav save request.
+
+    The lookup key is the `(encoding, bits_per_sample)` pair; when both are
+    None the subtype is derived from `dtype`. Raises ValueError for a pair
+    that is not in the table.
+    """
+    # dtype2subtype is evaluated while building the table, so an unknown
+    # dtype raises KeyError regardless of the requested encoding.
+    subtype_table = {
+        (None, None): dtype2subtype(dtype),
+        (None, 8): "PCM_U8",
+        ("PCM_U", None): "PCM_U8",
+        ("PCM_U", 8): "PCM_U8",
+        ("PCM_S", None): "PCM_32",
+        ("PCM_S", 16): "PCM_16",
+        ("PCM_S", 32): "PCM_32",
+        ("PCM_F", None): "FLOAT",
+        ("PCM_F", 32): "FLOAT",
+        ("PCM_F", 64): "DOUBLE",
+        ("ULAW", None): "ULAW",
+        ("ULAW", 8): "ULAW",
+        ("ALAW", None): "ALAW",
+        ("ALAW", 8): "ALAW",
+    }
+    request = (encoding, bits_per_sample)
+    resolved = subtype_table.get(request)
+    if not resolved:
+        raise ValueError(
+            f"wav does not support ({encoding}, {bits_per_sample}).")
+    return resolved
+
+def get_encoding(ext, dtype):
+    """Return the encoding name expected for a file extension and dtype.
+
+    For "mp3", "flac" and "vorbis" the upper-cased extension is returned and
+    `dtype` is ignored; otherwise the encoding is looked up by dtype, raising
+    KeyError for a dtype that is not in the table.
+    """
+    named_after_extension = {"mp3", "flac", "vorbis"}
+    if ext in named_after_extension:
+        return ext.upper()
+    encoding_by_dtype = dict(
+        float32="PCM_F",
+        int32="PCM_S",
+        int16="PCM_S",
+        uint8="PCM_U", )
+    return encoding_by_dtype[dtype]
+
+
+def get_bit_depth(dtype):
+    """Return the number of bits per sample for a dtype name.
+
+    Raises KeyError for a dtype name that is not in the table.
+    """
+    depth_by_dtype = dict(float32=32, int32=32, int16=16, uint8=8)
+    return depth_by_dtype[dtype]
+
+def get_bits_per_sample(ext, dtype):
+    """Return the bits-per-sample value expected for an extension and dtype.
+
+    "flac" maps to 24 and "mp3"/"vorbis" map to 0, independent of `dtype`.
+    Any other extension falls back to `get_bit_depth(dtype)`, which raises
+    KeyError for a dtype it does not know.
+    """
+    bits_per_samples = {
+        "flac": 24,
+        "mp3": 0,
+        "vorbis": 0,
+    }
+    # Resolve the dtype-based fallback only when it is needed. Passing it as
+    # the `dict.get` default would evaluate it eagerly and raise KeyError for
+    # an unlisted dtype even when the extension alone decides the answer.
+    if ext in bits_per_samples:
+        return bits_per_samples[ext]
+    return get_bit_depth(dtype)
--- a/audio/tests/backends/sox_io/common_utils
+++ b/audio/tests/backends/sox_io/common_utils
@ -0,0 +1 @@
+../../common_utils
--- a/audio/tests/backends/sox_io/info_test.py
+++ b/audio/tests/backends/sox_io/info_test.py
@ -0,0 +1,322 @@
+import io
+import itertools
+import os
+import platform
+import tarfile
+import unittest
+from contextlib import contextmanager
+# The warning below states that sox io is not supported on Windows. Raising
+# SkipTest at import time lets the test loader report this module as skipped;
+# a bare exit() raises SystemExit while the module is being collected.
+if platform.system() == "Windows":
+    import warnings
+    warnings.warn("sox io not support in Windows, please skip test.")
+    raise unittest.SkipTest(
+        "sox io not support in Windows, please skip test.")
+
+from parameterized import parameterized
+from common import get_bits_per_sample, get_encoding
+
+from paddleaudio.backends import sox_io_backend
+
+from common_utils import (
+    get_wav_data,
+    save_wav,
+    TempDirMixin,
+    sox_utils, )
+
+#code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/info_test.py
+
+
+class TestInfo(TempDirMixin, unittest.TestCase):
+    """Metadata checks for `sox_io_backend.info` on files written to disk."""
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                [
+                    "float32",
+                    "int32",
+                ],
+                [8000, 16000],
+                [1, 2], )), )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.info` reports the metadata of a wav file"""
+        seconds = 1
+        wav_path = self.get_temp_path("data.wav")
+        waveform = get_wav_data(
+            dtype,
+            num_channels,
+            normalize=False,
+            num_frames=seconds * sample_rate)
+        save_wav(wav_path, waveform, sample_rate)
+        meta = sox_io_backend.info(wav_path)
+        assert meta.sample_rate == sample_rate
+        assert meta.num_frames == sample_rate * seconds
+        assert meta.num_channels == num_channels
+        assert meta.bits_per_sample == sox_utils.get_bit_depth(dtype)
+        assert meta.encoding == get_encoding("wav", dtype)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["float32", "int32"],
+                [8000, 16000],
+                [4, 8, 16, 32], )), )
+    def test_wav_multiple_channels(self, dtype, sample_rate, num_channels):
+        """`sox_io_backend.info` reports the metadata of a wav file with more
+        than 2 channels (encoding is not checked here)"""
+        seconds = 1
+        wav_path = self.get_temp_path("data.wav")
+        waveform = get_wav_data(
+            dtype,
+            num_channels,
+            normalize=False,
+            num_frames=seconds * sample_rate)
+        save_wav(wav_path, waveform, sample_rate)
+        meta = sox_io_backend.info(wav_path)
+        assert meta.sample_rate == sample_rate
+        assert meta.num_frames == sample_rate * seconds
+        assert meta.num_channels == num_channels
+        assert meta.bits_per_sample == sox_utils.get_bit_depth(dtype)
+
+    def test_ulaw(self):
+        """`sox_io_backend.info` reports the metadata of a u-law file"""
+        seconds, channels, rate = 1, 1, 8000
+        wav_path = self.get_temp_path("data.wav")
+        sox_utils.gen_audio_file(
+            wav_path,
+            sample_rate=rate,
+            num_channels=channels,
+            bit_depth=8,
+            encoding="u-law",
+            duration=seconds)
+        meta = sox_io_backend.info(wav_path)
+        assert meta.sample_rate == rate
+        assert meta.num_frames == rate * seconds
+        assert meta.num_channels == channels
+        assert meta.bits_per_sample == 8
+        assert meta.encoding == "ULAW"
+
+    def test_alaw(self):
+        """`sox_io_backend.info` reports the metadata of an a-law file"""
+        seconds, channels, rate = 1, 1, 8000
+        wav_path = self.get_temp_path("data.wav")
+        sox_utils.gen_audio_file(
+            wav_path,
+            sample_rate=rate,
+            num_channels=channels,
+            bit_depth=8,
+            encoding="a-law",
+            duration=seconds)
+        meta = sox_io_backend.info(wav_path)
+        assert meta.sample_rate == rate
+        assert meta.num_frames == rate * seconds
+        assert meta.num_channels == channels
+        assert meta.bits_per_sample == 8
+        assert meta.encoding == "ALAW"
+
+
+#class TestInfoOpus(unittest.TestCase):
+#@parameterized.expand(
+#list(
+#itertools.product(
+#["96k"],
+#[1, 2],
+#[0, 5, 10],
+#)
+#),
+#)
+#def test_opus(self, bitrate, num_channels, compression_level):
+#"""`sox_io_backend.info` can check opus file correctly"""
+#path = data_utils.get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus")
+#info = sox_io_backend.info(path)
+#assert info.sample_rate == 48000
+#assert info.num_frames == 32768
+#assert info.num_channels == num_channels
+#assert info.bits_per_sample == 0  # bit_per_sample is irrelevant for compressed formats
+#assert info.encoding == "OPUS"
+
+
+class FileObjTestBase(TempDirMixin):
+    """Helpers that generate audio fixtures through `sox_utils`."""
+
+    def _gen_file(self,
+                  ext,
+                  dtype,
+                  sample_rate,
+                  num_channels,
+                  num_frames,
+                  *,
+                  comments=None):
+        """Generate `test.<ext>` in the temp directory and return its path.
+
+        The duration handed to `sox_utils.gen_audio_file` is
+        `num_frames / sample_rate`. When `comments` is truthy it is first
+        written to a comment file whose path is forwarded as `comment_file`.
+        """
+        audio_path = self.get_temp_path(f"test.{ext}")
+        depth = sox_utils.get_bit_depth(dtype)
+        seconds = num_frames / sample_rate
+        if comments:
+            comment_path = self._gen_comment_file(comments)
+        else:
+            comment_path = None
+
+        sox_utils.gen_audio_file(
+            audio_path,
+            sample_rate,
+            num_channels=num_channels,
+            encoding=sox_utils.get_encoding(dtype),
+            bit_depth=depth,
+            duration=seconds,
+            comment_file=comment_path)
+        return audio_path
+
+    def _gen_comment_file(self, comments):
+        """Write `comments` to `comment.txt` in the temp directory and return
+        the file path."""
+        target = self.get_temp_path("comment.txt")
+        with open(target, "w") as handle:
+            handle.writelines(comments)
+        return target
+
+
+class Unseekable:
+    """Wrapper that exposes only `read` of the wrapped file object."""
+
+    def __init__(self, fileobj):
+        # Keep a reference to the wrapped object; reads are delegated to it.
+        self.fileobj = fileobj
+
+    def read(self, n):
+        """Read `n` from the wrapped file object and return the result."""
+        chunk = self.fileobj.read(n)
+        return chunk
+
+
+class TestFileObject(FileObjTestBase, unittest.TestCase):
+    """`sox_io_backend.info` accepts file-like objects as input."""
+
+    def _query_fileobj(self,
+                       ext,
+                       dtype,
+                       sample_rate,
+                       num_channels,
+                       num_frames,
+                       *,
+                       comments=None):
+        """Generate a file and query it through an open binary file handle."""
+        path = self._gen_file(
+            ext,
+            dtype,
+            sample_rate,
+            num_channels,
+            num_frames,
+            comments=comments)
+        # An explicit format is passed only for mp3; otherwise None.
+        format_ = ext if ext in ["mp3"] else None
+        with open(path, "rb") as fileobj:
+            return sox_io_backend.info(fileobj, format_)
+
+    def _query_bytesio(self, ext, dtype, sample_rate, num_channels, num_frames):
+        """Generate a file and query it through an in-memory `BytesIO` copy."""
+        path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames)
+        format_ = ext if ext in ["mp3"] else None
+        with open(path, "rb") as file_:
+            fileobj = io.BytesIO(file_.read())
+        return sox_io_backend.info(fileobj, format_)
+
+    def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames):
+        """Generate a file, pack it into a tar archive and query the member."""
+        audio_path = self._gen_file(ext, dtype, sample_rate, num_channels,
+                                    num_frames)
+        audio_file = os.path.basename(audio_path)
+        archive_path = self.get_temp_path("archive.tar.gz")
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(audio_path, arcname=audio_file)
+        format_ = ext if ext in ["mp3"] else None
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            # The member must be queried while the archive is still open.
+            fileobj = tarobj.extractfile(audio_file)
+            return sox_io_backend.info(fileobj, format_)
+
+    @contextmanager
+    def _set_buffer_size(self, buffer_size):
+        """Temporarily set the buffer size, restoring the old value on exit.
+
+        NOTE(review): `get_buffer_size` and `set_buffer_size` are not imported
+        in the visible part of this module and no test here uses this helper;
+        confirm where they should come from before relying on it.
+        """
+        # Read the current value before entering the try block so that the
+        # finally clause never refers to an unassigned local.
+        original_buffer_size = get_buffer_size()
+        try:
+            set_buffer_size(buffer_size)
+            yield
+        finally:
+            set_buffer_size(original_buffer_size)
+
+    @parameterized.expand([
+        ("wav", "float32"),
+        ("wav", "int32"),
+        ("wav", "int16"),
+        ("wav", "uint8"),
+    ])
+    def test_fileobj(self, ext, dtype):
+        """Querying audio via file object works"""
+        sample_rate = 16000
+        num_frames = 3 * sample_rate
+        num_channels = 2
+        sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels,
+                                    num_frames)
+
+        bits_per_sample = get_bits_per_sample(ext, dtype)
+        # For mp3/vorbis the expected frame count is 0.
+        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames
+
+        assert sinfo.sample_rate == sample_rate
+        assert sinfo.num_channels == num_channels
+        assert sinfo.num_frames == num_frames
+        assert sinfo.bits_per_sample == bits_per_sample
+        assert sinfo.encoding == get_encoding(ext, dtype)
+
+    @parameterized.expand([
+        ("wav", "float32"),
+        ("wav", "int32"),
+        ("wav", "int16"),
+        ("wav", "uint8"),
+    ])
+    def test_bytesio(self, ext, dtype):
+        """Querying audio via BytesIO object works"""
+        sample_rate = 16000
+        num_frames = 3 * sample_rate
+        num_channels = 2
+        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels,
+                                    num_frames)
+
+        bits_per_sample = get_bits_per_sample(ext, dtype)
+        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames
+
+        assert sinfo.sample_rate == sample_rate
+        assert sinfo.num_channels == num_channels
+        assert sinfo.num_frames == num_frames
+        assert sinfo.bits_per_sample == bits_per_sample
+        assert sinfo.encoding == get_encoding(ext, dtype)
+
+    @parameterized.expand([
+        ("wav", "float32"),
+        ("wav", "int32"),
+        ("wav", "int16"),
+        ("wav", "uint8"),
+    ])
+    def test_bytesio_tiny(self, ext, dtype):
+        """Querying audio via BytesIO object works for small data"""
+        sample_rate = 8000
+        # Only four frames of audio are generated for this case.
+        num_frames = 4
+        num_channels = 2
+        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels,
+                                    num_frames)
+
+        bits_per_sample = get_bits_per_sample(ext, dtype)
+        num_frames = 0 if ext in ["mp3", "vorbis"] else num_frames
+
+        assert sinfo.sample_rate == sample_rate
+        assert sinfo.num_channels == num_channels
+        assert sinfo.num_frames == num_frames
+        assert sinfo.bits_per_sample == bits_per_sample
+        assert sinfo.encoding == get_encoding(ext, dtype)
+
+    @parameterized.expand([
+        ("wav", "float32"),
+        ("wav", "int32"),
+        ("wav", "int16"),
+        ("wav", "uint8"),
+        ("flac", "float32"),
+        ("vorbis", "float32"),
+        ("amb", "int16"),
+    ])
+    def test_tarfile(self, ext, dtype):
+        """Querying compressed audio via file-like object works"""
+        sample_rate = 16000
+        num_frames = 3.0 * sample_rate
+        num_channels = 2
+        sinfo = self._query_tarfile(ext, dtype, sample_rate, num_channels,
+                                    num_frames)
+
+        bits_per_sample = get_bits_per_sample(ext, dtype)
+        # For vorbis the expected frame count is 0.
+        num_frames = 0 if ext in ["vorbis"] else num_frames
+
+        assert sinfo.sample_rate == sample_rate
+        assert sinfo.num_channels == num_channels
+        assert sinfo.num_frames == num_frames
+        assert sinfo.bits_per_sample == bits_per_sample
+        assert sinfo.encoding == get_encoding(ext, dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/audio/tests/backends/sox_io/load_test.py
+++ b/audio/tests/backends/sox_io/load_test.py
@ -0,0 +1,56 @@
+import itertools
+import platform
+import unittest
+if platform.system() == "Windows":
+    # The warning below states that sox io is not supported on Windows, so
+    # nothing in this module can run there.
+    import sys
+    import warnings
+    warnings.warn("sox io not support in Windows, please skip test.")
+    if __name__ == "__main__":
+        # Executed directly as a script: leave quietly with a success status.
+        sys.exit(0)
+    # Imported by a test runner: report the module as skipped instead of
+    # raising SystemExit in the middle of test collection.
+    raise unittest.SkipTest("sox io is not supported on Windows")
+
+from parameterized import parameterized
+import numpy as np
+from paddleaudio.backends import sox_io_backend
+
+from common_utils import (
+    get_wav_data,
+    load_wav,
+    save_wav, )
+
+# Code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/sox_io/load_test.py
+
+
+class TestLoad(unittest.TestCase):
+    """Checks `sox_io_backend.load` against the `load_wav` reference loader."""
+
+    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
+        """`sox_io_backend.load` can load wav format correctly.
+
+        Generates `duration * sample_rate` frames of wav data, saves them with
+        `save_wav`, and checks that `sox_io_backend.load` reports the same
+        sample rate and (to 4 decimals) the same samples as `load_wav`.
+
+        Args:
+            dtype: Sample dtype name passed to `get_wav_data`.
+            sample_rate: Sample rate used when saving the file.
+            num_channels: Number of channels to generate.
+            normalize: Forwarded to `get_wav_data`, `load_wav` and `load`.
+            duration: Multiplier applied to `sample_rate` to get the frame count.
+        """
+        import os
+        import tempfile
+
+        # Use a throw-away directory: a fixed relative path would depend on the
+        # current working directory and leave the file behind after the run.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            path = os.path.join(tmp_dir, "reference.wav")
+            data = get_wav_data(
+                dtype,
+                num_channels,
+                normalize=normalize,
+                num_frames=duration * sample_rate)
+            save_wav(path, data, sample_rate)
+            expected = load_wav(path, normalize=normalize)[0]
+            data, sr = sox_io_backend.load(path, normalize=normalize)
+        assert sr == sample_rate
+        np.testing.assert_array_almost_equal(data, expected, decimal=4)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                [
+                    "float64",
+                    "float32",
+                    "int32",
+                ],
+                [8000, 16000],
+                [1, 2],
+                [False, True], )), )
+    def test_wav(self, dtype, sample_rate, num_channels, normalize):
+        """`sox_io_backend.load` can load wav format correctly."""
+        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/audio/tests/backends/sox_io/save_test.py
+++ b/audio/tests/backends/sox_io/save_test.py
@ -0,0 +1,188 @@
+import io
+import platform
+import unittest
+if platform.system() == "Windows":
+    # The warning below states that sox io is not supported on Windows, so
+    # nothing in this module can run there.
+    import sys
+    import warnings
+    warnings.warn("sox io not support in Windows, please skip test.")
+    if __name__ == "__main__":
+        # Executed directly as a script: leave quietly with a success status.
+        sys.exit(0)
+    # Imported by a test runner: report the module as skipped instead of
+    # raising SystemExit in the middle of test collection.
+    raise unittest.SkipTest("sox io is not supported on Windows")
+
+import numpy as np
+from paddleaudio.backends import sox_io_backend
+
+from common_utils import (get_wav_data, load_wav, save_wav, nested_params,
+                          TempDirMixin, sox_utils)
+
+# Code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/sox_io/save_test.py
+
+
+def _get_sox_encoding(encoding):
+    """Map an encoding name such as ``"PCM_S"`` to its sox-style spelling.
+
+    Returns ``None`` when the name (including ``None`` itself) has no entry
+    in the table.
+    """
+    sox_names = dict(
+        PCM_F="floating-point",
+        PCM_S="signed-integer",
+        PCM_U="unsigned-integer",
+        ULAW="u-law",
+        ALAW="a-law", )
+    if encoding in sox_names:
+        return sox_names[encoding]
+    return None
+
+
+class TestSaveBase(TempDirMixin):
+    """Shared helper for checking `sox_io_backend.save` against the `sox` command."""
+
+    def assert_save_consistency(
+            self,
+            format: str,
+            *,
+            compression: float=None,
+            encoding: str=None,
+            bits_per_sample: int=None,
+            sample_rate: float=8000,
+            num_channels: int=2,
+            num_frames: float=3 * 8000,
+            src_dtype: str="int32",
+            test_mode: str="path", ):
+        """`save` function produces a file that is comparable with the `sox` command
+
+        To compare the file produced by the `save` function against the file
+        produced by the equivalent `sox` command, both files have to be loaded.
+        Many formats cannot be opened with common Python modules (like SciPy),
+        so the `sox` command is used to prepare the original data and to convert
+        the saved files into a format that SciPy can read (PCM wav).
+        The following diagram illustrates this process. The difference is 2.1. and 3.1.
+
+        This assumes that
+         - loading data with SciPy preserves the data well.
+         - converting the resulting files into WAV format with `sox` preserves the data well.
+
+                          x
+                          | 1. Generate source wav file with SciPy
+                          |
+                          v
+          -------------- wav ----------------
+         |                                   |
+         | 2.1. load with scipy              | 3.1. Convert to the target
+         |   then save it into the target    |      format depth with sox
+         |   format with paddleaudio         |
+         v                                   v
+        target format                       target format
+         |                                   |
+         | 2.2. Convert to wav with sox      | 3.2. Convert to wav with sox
+         |                                   |
+         v                                   v
+        wav                                 wav
+         |                                   |
+         | 2.3. load with scipy              | 3.3. load with scipy
+         |                                   |
+         v                                   v
+        tensor -------> compare <--------- tensor
+
+        `test_mode` selects how the target is handed to `save`: ``"path"``
+        (file path), ``"fileobj"`` (open binary file) or ``"bytesio"``
+        (in-memory buffer written to disk afterwards).  Any other value raises
+        ``ValueError``.
+        """
+        # Both branches are converted to this wav flavour before being compared.
+        compare_as = dict(encoding="floating-point", bit_depth=32)
+        # Options shared by every call to `save`, whatever the test mode.
+        save_options = dict(
+            compression=compression,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample)
+
+        src_path = self.get_temp_path("1.source.wav")
+        tgt_path = self.get_temp_path(f"2.1.paddleaudio.{format}")
+        tst_path = self.get_temp_path("2.2.result.wav")
+        sox_path = self.get_temp_path(f"3.1.sox.{format}")
+        ref_path = self.get_temp_path("3.2.ref.wav")
+
+        # 1. Generate original wav
+        generated = get_wav_data(
+            src_dtype, num_channels, normalize=False, num_frames=num_frames)
+        save_wav(src_path, generated, sample_rate)
+
+        # 2.1. Convert the original wav to target format with paddleaudio
+        source = load_wav(src_path, normalize=False)[0]
+        if test_mode == "path":
+            sox_io_backend.save(tgt_path, source, sample_rate, **save_options)
+        elif test_mode == "fileobj":
+            with open(tgt_path, "bw") as sink:
+                sox_io_backend.save(
+                    sink, source, sample_rate, format=format, **save_options)
+        elif test_mode == "bytesio":
+            buffer = io.BytesIO()
+            sox_io_backend.save(
+                buffer, source, sample_rate, format=format, **save_options)
+            # Persist the in-memory result so sox can read it from disk.
+            with open(tgt_path, "bw") as sink:
+                sink.write(buffer.getvalue())
+        else:
+            raise ValueError(f"Unexpected test mode: {test_mode}")
+        # 2.2. Convert the target format to wav with sox
+        sox_utils.convert_audio_file(tgt_path, tst_path, **compare_as)
+        # 2.3. Load with SciPy
+        found = load_wav(tst_path, normalize=False)[0]
+
+        # 3.1. Convert the original wav to target format with sox
+        sox_utils.convert_audio_file(
+            src_path,
+            sox_path,
+            compression=compression,
+            encoding=_get_sox_encoding(encoding),
+            bit_depth=bits_per_sample)
+        # 3.2. Convert the target format to wav with sox
+        sox_utils.convert_audio_file(sox_path, ref_path, **compare_as)
+        # 3.3. Load with SciPy
+        expected = load_wav(ref_path, normalize=False)[0]
+
+        np.testing.assert_array_almost_equal(found, expected)
+
+
+class TestSave(TestSaveBase, unittest.TestCase):
+    """Consistency checks for saving wav files, driven by `assert_save_consistency`."""
+
+    @nested_params(
+        ["path"],
+        [
+            ("PCM_U", 8),
+            ("PCM_S", 16),
+            ("PCM_S", 32),
+            ("PCM_F", 32),
+            ("PCM_F", 64),
+            ("ULAW", 8),
+            ("ALAW", 8),
+        ], )
+    def test_save_wav(self, test_mode, enc_params):
+        # Each parameter is an (encoding, bits per sample) pair.
+        enc_name, bit_depth = enc_params
+        self.assert_save_consistency(
+            "wav",
+            encoding=enc_name,
+            bits_per_sample=bit_depth,
+            test_mode=test_mode)
+
+    @nested_params(
+        ["path"],
+        [
+            ("float32", ),
+            ("int32", ),
+        ], )
+    def test_save_wav_dtype(self, test_mode, params):
+        # Each parameter is a one-element tuple holding the source dtype.
+        [source_dtype] = params
+        self.assert_save_consistency(
+            "wav", src_dtype=source_dtype, test_mode=test_mode)
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/audio/tests/backends/sox_io/smoke_test.py
+++ b/audio/tests/backends/sox_io/smoke_test.py
@ -0,0 +1,189 @@
+import io
+import itertools
+import platform
+import unittest
+if platform.system() == "Windows":
+    # The warning below states that sox io is not supported on Windows, so
+    # nothing in this module can run there.
+    import sys
+    import warnings
+    warnings.warn("sox io not support in Windows, please skip test.")
+    if __name__ == "__main__":
+        # Executed directly as a script: leave quietly with a success status.
+        sys.exit(0)
+    # Imported by a test runner: report the module as skipped instead of
+    # raising SystemExit in the middle of test collection.
+    raise unittest.SkipTest("sox io is not supported on Windows")
+
+from parameterized import parameterized
+from paddleaudio.backends import sox_io_backend
+from common_utils import (get_wav_data, TempDirMixin, name_func)
+
+
+class SmokeTest(TempDirMixin, unittest.TestCase):
+    """Smoke tests for sox_io_backend on files addressed by path.
+
+    The suite only verifies that the sox_io_backend functions do not exhibit
+    abnormal behaviors.  It is meant to run without additional tools (such as
+    the sox command); without such tools the correctness of each function
+    cannot be verified.
+    """
+
+    def run_smoke_test(self,
+                       ext,
+                       sample_rate,
+                       num_channels,
+                       *,
+                       compression=None,
+                       dtype="float32"):
+        """Save, query and reload generated audio at a temporary path.
+
+        Only the sample rate and the channel count reported by `info` and
+        `load` are checked.
+        """
+        seconds = 1
+        frame_count = sample_rate * seconds
+        target = self.get_temp_path(f"test.{ext}")
+        waveform = get_wav_data(
+            dtype, num_channels, normalize=False, num_frames=frame_count)
+
+        # Step 1: save
+        sox_io_backend.save(
+            target, waveform, sample_rate, compression=compression)
+
+        # Step 2: info
+        metadata = sox_io_backend.info(target)
+        assert info_matches(metadata, sample_rate, num_channels) if False else True
+        assert metadata.sample_rate == sample_rate
+        assert metadata.num_channels == num_channels
+
+        # Step 3: load
+        reloaded, reloaded_sr = sox_io_backend.load(target, normalize=False)
+        assert reloaded_sr == sample_rate
+        assert reloaded.shape[0] == num_channels
+
+    # int16 and uint8 are currently left out of the dtype sweep.
+    @parameterized.expand(
+        list(itertools.product(
+            ["float32", "int32"],
+            [8000, 16000],
+            [1, 2], )),
+        name_func=name_func, )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """Run smoke test on wav format"""
+        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)
+
+    # The mp3 smoke test (bit rates -4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224,
+    # 256, 320) and the vorbis smoke test (quality levels -1, 0, 1, 2, 3, 3.6,
+    # 5, 10) are currently disabled; both swept sample rates [8000, 16000] and
+    # channel counts [1, 2] and passed the third value as `compression`.
+
+    @parameterized.expand(
+        list(itertools.product(
+            [8000, 16000],
+            [1, 2],
+            list(range(9)), )),
+        name_func=name_func, )
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        """Run smoke test on flac format"""
+        self.run_smoke_test(
+            "flac", sample_rate, num_channels, compression=compression_level)
+
+
+class SmokeTestFileObj(unittest.TestCase):
+    """Smoke tests for sox_io_backend on in-memory file objects.
+
+    The suite only verifies that the sox_io_backend functions do not exhibit
+    abnormal behaviors.  It is meant to run without additional tools (such as
+    the sox command); without such tools the correctness of each function
+    cannot be verified.
+    """
+
+    def run_smoke_test(self,
+                       ext,
+                       sample_rate,
+                       num_channels,
+                       *,
+                       compression=None,
+                       dtype="float32"):
+        """Save, query and reload generated audio through one BytesIO buffer.
+
+        The buffer is rewound before `info` and again before `load`; only the
+        reported sample rate and channel count are checked.
+        """
+        seconds = 1
+        frame_count = sample_rate * seconds
+        waveform = get_wav_data(
+            dtype, num_channels, normalize=False, num_frames=frame_count)
+
+        buffer = io.BytesIO()
+
+        # Step 1: save
+        sox_io_backend.save(
+            buffer, waveform, sample_rate, compression=compression, format=ext)
+
+        # Step 2: info
+        buffer.seek(0)
+        metadata = sox_io_backend.info(buffer, format=ext)
+        assert metadata.sample_rate == sample_rate
+        assert metadata.num_channels == num_channels
+
+        # Step 3: load
+        buffer.seek(0)
+        reloaded, reloaded_sr = sox_io_backend.load(
+            buffer, normalize=False, format=ext)
+        assert reloaded_sr == sample_rate
+        assert reloaded.shape[0] == num_channels
+
+    @parameterized.expand(
+        list(itertools.product(
+            ["float32", "int32"],
+            [8000, 16000],
+            [1, 2], )),
+        name_func=name_func, )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """Run smoke test on wav format"""
+        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)
+
+    # Not supported yet: the mp3 smoke test (bit rates -4.2, -0.2, 0, 0.2, 96,
+    # 128, 160, 192, 224, 256, 320) and the vorbis smoke test (quality levels
+    # -1, 0, 1, 2, 3, 3.6, 5, 10); both swept sample rates [8000, 16000] and
+    # channel counts [1, 2] and passed the third value as `compression`.
+
+    @parameterized.expand(
+        list(itertools.product(
+            [8000, 16000],
+            [1, 2],
+            list(range(9)), )),
+        name_func=name_func, )
+    def test_flac(self, sample_rate, num_channels, compression_level):
+        # Run smoke test on flac format
+        self.run_smoke_test(
+            "flac", sample_rate, num_channels, compression=compression_level)
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/audio/tests/backends/sox_io/sox_effect_test.py
+++ b/audio/tests/backends/sox_io/sox_effect_test.py
@ -0,0 +1,364 @@
+#code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/sox_effect/sox_effect_test.py
+import io
+import itertools
+import platform
+import tarfile
+import unittest
+from pathlib import Path
+
+import numpy as np
+if platform.system() == "Windows":
+    # The warning below states that sox io is not supported on Windows, so
+    # nothing in this module can run there.
+    import sys
+    import warnings
+    warnings.warn("sox io not support in Windows, please skip test.")
+    if __name__ == "__main__":
+        # Executed directly as a script: leave quietly with a success status.
+        sys.exit(0)
+    # Imported by a test runner: report the module as skipped instead of
+    # raising SystemExit in the middle of test collection.
+    raise unittest.SkipTest("sox io is not supported on Windows")
+
+from parameterized import parameterized
+from paddleaudio import sox_effects
+from common_utils import (get_sinusoid, get_wav_data, load_wav, save_wav,
+                          sox_utils, TempDirMixin, load_effects_params)
+
+
+class TestSoxEffects(unittest.TestCase):
+    """Checks for the sox effects initialisation entry point."""
+
+    def test_init(self):
+        """Calling init_sox_effects multiple times does not crash"""
+        repetitions = 3
+        for _attempt in range(repetitions):
+            sox_effects.init_sox_effects()
+
+
+class TestSoxEffectsTensor(TempDirMixin, unittest.TestCase):
+    """Test suite for `apply_effects_tensor` function"""
+
+    @parameterized.expand(
+        list(
+            itertools.product(["float32", "int32"], [8000, 16000], [1, 2, 4, 8],
+                              [True, False])), )
+    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
+                             channels_first):
+        """`apply_effects_tensor` with an empty effect list returns a new tensor equal to the input"""
+        reference = get_wav_data(
+            dtype, num_channels, channels_first=channels_first)
+        passed_in = reference.clone()
+
+        result, result_sr = sox_effects.apply_effects_tensor(
+            passed_in, sample_rate, [], channels_first)
+
+        assert result_sr == sample_rate
+        # The tensor handed to the call must come back unmodified.
+        np.testing.assert_array_almost_equal(reference.numpy(),
+                                             passed_in.numpy())
+
+        # The result must be a different object from the argument ...
+        assert passed_in is not result
+        # ... while holding the same values.
+        np.testing.assert_array_almost_equal(passed_in.numpy(), result.numpy())
+
+    @parameterized.expand(
+        load_effects_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects(self, args):
+        """`apply_effects_tensor` should return identical data as sox command"""
+        effect_chain = args["effects"]
+        channel_count = args.get("num_channels", 2)
+        sr_in = args.get("input_sample_rate", 8000)
+        sr_out = args.get("output_sample_rate")
+
+        wav_in = self.get_temp_path("input.wav")
+        wav_ref = self.get_temp_path("reference.wav")
+
+        # Build the input signal and let the sox command produce the reference.
+        signal = get_sinusoid(
+            frequency=800,
+            sample_rate=sr_in,
+            n_channels=channel_count,
+            dtype="float32")
+        save_wav(wav_in, signal, sr_in)
+        sox_utils.run_sox_effect(
+            wav_in, wav_ref, effect_chain, output_sample_rate=sr_out)
+        expected, expected_sr = load_wav(wav_ref)
+
+        found, found_sr = sox_effects.apply_effects_tensor(signal, sr_in,
+                                                           effect_chain)
+
+        assert found_sr == expected_sr
+        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())
+
+
+class TestSoxEffectsFile(TempDirMixin, unittest.TestCase):
+    """Test suite for `apply_effects_file` function"""
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["float32", "int32"],
+                [8000, 16000],
+                [1, 2, 4, 8],
+                [False, True], )), )
+    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
+                             channels_first):
+        """`apply_effects_file` with an empty effect list returns the data that was saved"""
+        wav_in = self.get_temp_path("input.wav")
+        expected = get_wav_data(
+            dtype, num_channels, channels_first=channels_first)
+        save_wav(wav_in, expected, sample_rate, channels_first=channels_first)
+
+        found, found_sr = sox_effects.apply_effects_file(
+            wav_in, [], normalize=False, channels_first=channels_first)
+
+        assert found_sr == sample_rate
+        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())
+
+    @parameterized.expand(
+        load_effects_params("sox_effect_test_args.jsonl"), )
+    def test_apply_effects_str(self, args):
+        """`apply_effects_file` should return identical data as sox command"""
+        channels_first = True
+        effect_chain = args["effects"]
+        channel_count = args.get("num_channels", 2)
+        sr_in = args.get("input_sample_rate", 8000)
+        sr_out = args.get("output_sample_rate")
+
+        wav_in = self.get_temp_path("input.wav")
+        wav_ref = self.get_temp_path("reference.wav")
+
+        # Write int32 input audio and let the sox command produce the reference.
+        samples = get_wav_data(
+            "int32", channel_count, channels_first=channels_first)
+        save_wav(wav_in, samples, sr_in, channels_first=channels_first)
+        sox_utils.run_sox_effect(
+            wav_in, wav_ref, effect_chain, output_sample_rate=sr_out)
+        expected, expected_sr = load_wav(wav_ref)
+
+        found, found_sr = sox_effects.apply_effects_file(
+            wav_in,
+            effect_chain,
+            normalize=False,
+            channels_first=channels_first)
+
+        assert found_sr == expected_sr
+        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())
+
+    def test_apply_effects_path(self):
+        """`apply_effects_file` should return identical data as sox command when file path is given as a Path Object"""
+        channels_first = True
+        effect_chain = [["hilbert"]]
+        channel_count = 2
+        sr_in = sr_out = 8000
+
+        wav_in = self.get_temp_path("input.wav")
+        wav_ref = self.get_temp_path("reference.wav")
+
+        # Write int32 input audio and let the sox command produce the reference.
+        samples = get_wav_data(
+            "int32", channel_count, channels_first=channels_first)
+        save_wav(wav_in, samples, sr_in, channels_first=channels_first)
+        sox_utils.run_sox_effect(
+            wav_in, wav_ref, effect_chain, output_sample_rate=sr_out)
+        expected, expected_sr = load_wav(wav_ref)
+
+        # Same call as the string-path test, but with a pathlib.Path argument.
+        found, found_sr = sox_effects.apply_effects_file(
+            Path(wav_in),
+            effect_chain,
+            normalize=False,
+            channels_first=channels_first)
+
+        assert found_sr == expected_sr
+        np.testing.assert_array_almost_equal(expected.numpy(), found.numpy())
+
+
+class TestFileFormats(TempDirMixin, unittest.TestCase):
+    """`apply_effects_file` gives the same result as sox on various file formats"""
+
+    @parameterized.expand(
+        list(itertools.product(
+            ["float32", "int32"],
+            [8000, 16000],
+            [1, 2], )), )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`apply_effects_file` works on various wav format"""
+        channels_first = True
+        effect_chain = [["band", "300", "10"]]
+
+        wav_in = self.get_temp_path("input.wav")
+        wav_ref = self.get_temp_path("reference.wav")
+
+        # Write the input audio and let the sox command produce the reference.
+        samples = get_wav_data(
+            dtype, num_channels, channels_first=channels_first)
+        save_wav(wav_in, samples, sample_rate, channels_first=channels_first)
+        sox_utils.run_sox_effect(wav_in, wav_ref, effect_chain)
+        expected, expected_sr = load_wav(wav_ref)
+
+        found, found_sr = sox_effects.apply_effects_file(
+            wav_in,
+            effect_chain,
+            normalize=False,
+            channels_first=channels_first)
+
+        assert found_sr == expected_sr
+        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
+
+    # Not supported now: the flac and vorbis variants of this test.  Each swept
+    # sample rates [8000, 16000] and channel counts [1, 2], generated its input
+    # with sox_utils.gen_audio_file, produced the reference with
+    # run_sox_effect(..., output_bitdepth=32) and applied the same "band"
+    # effect as test_wav.
+
+
+    #@skipIfNoExec("sox")
+    #@skipIfNoSox
+class TestFileObject(TempDirMixin, unittest.TestCase):
+    """`apply_effects_file` accepts file-like inputs and matches the sox command."""
+
+    def _make_input_and_reference(self, audio_file, sample_rate, effects,
+                                  channels_first):
+        """Write int32 stereo test audio and its sox-processed reference.
+
+        Args:
+            audio_file: File name of the input inside the temp directory.
+            sample_rate: Sample rate used when saving the input.
+            effects: Effect chain handed to `sox_utils.run_sox_effect`.
+            channels_first: Layout flag forwarded to `get_wav_data`/`save_wav`.
+
+        Returns:
+            Tuple ``(input_path, expected, expected_sr)`` where `expected` and
+            `expected_sr` are what `load_wav` returns for the reference file.
+        """
+        input_path = self.get_temp_path(audio_file)
+        reference_path = self.get_temp_path("reference.wav")
+        data = get_wav_data("int32", 2, channels_first=channels_first)
+        save_wav(input_path, data, sample_rate, channels_first=channels_first)
+        sox_utils.run_sox_effect(
+            input_path, reference_path, effects, output_bitdepth=32)
+        expected, expected_sr = load_wav(reference_path)
+        return input_path, expected, expected_sr
+
+    def _check_result(self, found, sr, expected, expected_sr, channels_first):
+        """Save `found` as result.wav in the temp dir and compare it with the reference."""
+        save_wav(
+            self.get_temp_path("result.wav"),
+            found,
+            sr,
+            channels_first=channels_first)
+        assert sr == expected_sr
+        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
+
+    @parameterized.expand([
+        ("wav", None),
+    ])
+    def test_fileobj(self, ext, compression):
+        """Applying effects via file object works"""
+        # `compression` is part of the parameter tuple but is not used here.
+        sample_rate = 16000
+        channels_first = True
+        effects = [["band", "300", "10"]]
+        input_path, expected, expected_sr = self._make_input_and_reference(
+            f"input.{ext}", sample_rate, effects, channels_first)
+
+        with open(input_path, "rb") as fileobj:
+            found, sr = sox_effects.apply_effects_file(
+                fileobj, effects, channels_first=channels_first)
+
+        self._check_result(found, sr, expected, expected_sr, channels_first)
+
+    @parameterized.expand([
+        ("wav", None),
+    ])
+    def test_bytesio(self, ext, compression):
+        """Applying effects via BytesIO object works"""
+        # `compression` is part of the parameter tuple but is not used here.
+        sample_rate = 16000
+        channels_first = True
+        effects = [["band", "300", "10"]]
+        input_path, expected, expected_sr = self._make_input_and_reference(
+            f"input.{ext}", sample_rate, effects, channels_first)
+
+        # Read the whole file into memory so the effect runs on a BytesIO.
+        with open(input_path, "rb") as file_:
+            fileobj = io.BytesIO(file_.read())
+        found, sr = sox_effects.apply_effects_file(
+            fileobj, effects, channels_first=channels_first)
+
+        self._check_result(found, sr, expected, expected_sr, channels_first)
+
+    @parameterized.expand([
+        ("wav", None),
+    ])
+    def test_tarfile(self, ext, compression):
+        """Applying effects to audio read from a tar archive member works"""
+        # `compression` is part of the parameter tuple but is not used here.
+        sample_rate = 16000
+        channels_first = True
+        effects = [["band", "300", "10"]]
+        audio_file = f"input.{ext}"
+        input_path, expected, expected_sr = self._make_input_and_reference(
+            audio_file, sample_rate, effects, channels_first)
+
+        # NOTE: despite the ".tar.gz" name, tarfile.TarFile in "w" mode writes
+        # a plain, uncompressed tar archive.
+        archive_path = self.get_temp_path("archive.tar.gz")
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(input_path, arcname=audio_file)
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            fileobj = tarobj.extractfile(audio_file)
+            found, sr = sox_effects.apply_effects_file(
+                fileobj, effects, channels_first=channels_first)
+
+        self._check_result(found, sr, expected, expected_sr, channels_first)
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
--- a/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
+++ b/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
@ -0,0 +1,77 @@
+{"effects": [["allpass", "300", "10"]]}
+{"effects": [["band", "300", "10"]]}
+{"effects": [["bandpass", "300", "10"]]}
+{"effects": [["bandreject", "300", "10"]]}
+{"effects": [["bass", "-10"]]}
+{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
+{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
+{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
+{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
+{"effects": [["channels", "1"]]}
+{"effects": [["channels", "2"]]}
+{"effects": [["channels", "3"]]}
+{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
+{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
+{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
+{"effects": [["contrast", "0"]]}
+{"effects": [["contrast", "25"]]}
+{"effects": [["contrast", "50"]]}
+{"effects": [["contrast", "75"]]}
+{"effects": [["contrast", "100"]]}
+{"effects": [["dcshift", "1.0"]]}
+{"effects": [["dcshift", "-1.0"]]}
+{"effects": [["deemph"]], "input_sample_rate": 44100}
+{"effects": [["dither", "-s"]]}
+{"effects": [["dither", "-S"]]}
+{"effects": [["divide"]]}
+{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
+{"effects": [["earwax"]], "input_sample_rate": 44100}
+{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
+{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
+{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
+{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
+{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
+{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
+{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
+{"effects": [["equalizer", "300", "10", "5"]]}
+{"effects": [["fade", "q", "3"]]}
+{"effects": [["fade", "h", "3"]]}
+{"effects": [["fade", "t", "3"]]}
+{"effects": [["fade", "l", "3"]]}
+{"effects": [["fade", "p", "3"]]}
+{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
+{"effects": [["flanger"]]}
+{"effects": [["gain", "-l", "-6"]]}
+{"effects": [["highpass", "-1", "300"]]}
+{"effects": [["highpass", "-2", "300"]]}
+{"effects": [["hilbert"]]}
+{"effects": [["loudness"]]}
+{"effects": [["lowpass", "-1", "300"]]}
+{"effects": [["lowpass", "-2", "300"]]}
+{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
+{"effects": [["oops"]]}
+{"effects": [["overdrive"]]}
+{"effects": [["pad"]]}
+{"effects": [["phaser"]]}
+{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
+{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
+{"effects": [["repeat"]]}
+{"effects": [["reverb"]]}
+{"effects": [["reverse"]]}
+{"effects": [["riaa"]], "input_sample_rate": 44100}
+{"effects": [["silence", "0"]]}
+{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
+{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
+{"effects": [["stat"]]}
+{"effects": [["stats"]]}
+{"effects": [["stretch"]]}
+{"effects": [["swap"]]}
+{"effects": [["synth"]]}
+{"effects": [["tempo", "0.9"]]}
+{"effects": [["tempo", "1.1"]]}
+{"effects": [["treble", "3"]]}
+{"effects": [["tremolo", "300", "40"]]}
+{"effects": [["tremolo", "300", "50"]]}
+{"effects": [["trim", "0", "0.1"]]}
+{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
+{"effects": [["vol", "3"]]}
--- a/audio/tests/benchmark/README.md
+++ b/audio/tests/benchmark/README.md
@ -15,6 +15,7 @@ Result:
 ========================================================================== test session starts ==========================================================================
 platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0
 benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
+rootdir: /ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddleaudio
 plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0
 collected 4 items

--- a/audio/tests/benchmark/log_melspectrogram.py
+++ b/audio/tests/benchmark/log_melspectrogram.py
@ -17,17 +17,15 @@ import urllib.request
 import librosa
 import numpy as np
 import paddle
+import paddleaudio
 import torch
 import torchaudio

-import paddlespeech.audio
-
 wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
 if not os.path.isfile(os.path.basename(wav_url)):
    urllib.request.urlretrieve(wav_url, os.path.basename(wav_url))

-waveform, sr = paddlespeech.audio.load(
-    os.path.abspath(os.path.basename(wav_url)))
+waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url)))
 waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)
 waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0)

@ -57,7 +55,7 @@ def enable_gpu_device():
    paddle.set_device('gpu')


-log_mel_extractor = paddlespeech.audio.features.LogMelSpectrogram(
+log_mel_extractor = paddle.audio.features.LogMelSpectrogram(
    **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype)


@ -67,20 +65,20 @@ def log_melspectrogram():

 def test_log_melspect_cpu(benchmark):
    enable_cpu_device()
-    feature_audio = benchmark(log_melspectrogram)
+    feature_paddleaudio = benchmark(log_melspectrogram)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=3)
+        feature_librosa, feature_paddleaudio, decimal=3)


 def test_log_melspect_gpu(benchmark):
    enable_gpu_device()
-    feature_audio = benchmark(log_melspectrogram)
+    feature_paddleaudio = benchmark(log_melspectrogram)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=2)
+        feature_librosa, feature_paddleaudio, decimal=2)


 mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram(
@ -104,11 +102,11 @@ def test_log_melspect_cpu_torchaudio(benchmark):
    waveform_tensor_torch = waveform_tensor_torch.to('cpu')
    amplitude_to_DB = amplitude_to_DB.to('cpu')

-    feature_audio = benchmark(log_melspectrogram_torchaudio)
+    feature_paddleaudio = benchmark(log_melspectrogram_torchaudio)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=3)
+        feature_librosa, feature_paddleaudio, decimal=3)


 def test_log_melspect_gpu_torchaudio(benchmark):
--- a/Show More
+++ b/Show More