[audio] mv paddlespeech/audio to paddleaudio (#2706)

* split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio.
2 years ago · 42ff946007
parent 0cc54bb785
commit 42ff946007
250 changed files with 18847 additions and 797 deletions
--- a/.gitignore
+++ b/.gitignore
@ -16,6 +16,9 @@
 build
 *output/
 audio/dist/
 audio/fc_patch/
 docs/build/
 docs/topic/ctc/warp-ctc/
@ -42,6 +45,7 @@ tools/python-soundfile/
 tools/onnx
 tools/onnxruntime
 tools/Paddle2ONNX
 tools/onnx-simplifier/
 speechx/fc_patch/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,8 +3,13 @@ repos:
    rev: v0.16.0
    hooks:
    -   id: yapf
-        files: \.py$
+        name: yapf
-        exclude: (?=third_party).*(\.py)$
+        language: python
        entry: yapf
        args: [-i, -vv]
        types: [python]
        # `\.h` and `\.hpp` are separate alternatives (was `\.h\.hpp`, which only matched `*.h.hpp`).
        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
@ -30,7 +35,8 @@ repos:
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
-        exclude: (?=third_party).*(\.py)$
+        # `\.h` and `\.hpp` are separate alternatives (was `\.h\.hpp`, which only matched `*.h.hpp`).
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.0.1
    hooks:
@ -42,6 +48,7 @@ repos:
        files: \.md$
    -   id: remove-tabs
        files: \.md$
 -   repo: local
    hooks:
    -   id: clang-format
@ -49,23 +56,17 @@ repos:
        description: Format files with ClangFormat
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
+        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
    #-   id: copyright_checker
    #    name: copyright_checker
    #    entry: python .pre-commit-hooks/copyright-check.hook
    #    language: system
    #    files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
    #    exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
    -   id: cpplint
        name: cpplint
        description: Static code analysis of C/C++ files
        language: python
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
        entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
      - id: reorder-python-imports
-        exclude: (?=third_party).*(\.py)$
+        # `\.h` and `\.hpp` are separate alternatives (was `\.h\.hpp`, which only matched `*.h.hpp`).
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@ -0,0 +1,70 @@
 # Top-level build script for the paddleaudio C++ extension.
 cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 # Use compiler ID "AppleClang" instead of "Clang" for XCode.
 # Not setting this sometimes makes XCode C compiler gets detected as "Clang",
 # even when the C++ one is detected as "AppleClang".
 cmake_policy(SET CMP0010 NEW)
 cmake_policy(SET CMP0025 NEW)
 # Suppress warning flags in default MSVC configuration.  It's not
 # mandatory that we do this (and we don't if cmake is old), but it's
 # nice when it's possible, and it's possible on our Windows configs.
 if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
   cmake_policy(SET CMP0092 NEW)
 endif()
 project(paddleaudio)
 # check and set CMAKE_CXX_STANDARD
 string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
 if(env_cxx_standard GREATER -1)
   # message() concatenates its arguments without a separator, so the first
   # sentence carries its own trailing space.
   message(
       WARNING "C++ standard version definition detected in environment variable. "
       "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
 endif()
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_VERBOSE_MAKEFILE ON)
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
 option(BUILD_MAD "Enable libmad" ON)
 option(BUILD_KALDI "Build kaldi statically" ON)
 option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
 # cmake
 # Make the project's own modules (Find*.cmake, external/*.cmake) discoverable.
 list(APPEND CMAKE_MODULE_PATH
      "${PROJECT_SOURCE_DIR}/cmake"
      "${PROJECT_SOURCE_DIR}/cmake/external")
 if (NOT MSVC)
   # find_package() already loads cmake/FindGFortranLibs.cmake through
   # CMAKE_MODULE_PATH; including the module a second time only repeated the
   # compiler probing, so it is not done here.
   find_package(GFortranLibs REQUIRED)
   include(FortranCInterface)
 endif()
 # fc_patch dir
 set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
 set(THIRD_PARTY_PATH ${fc_patch})
 include(openblas)
 # NOTE(review): PY_VERSION and PYTHON_INCLUDE_DIR are not set in this file;
 # presumably they are passed on the cmake command line -- confirm.
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 include(cmake/pybind.cmake)
 include_directories(${PYTHON_INCLUDE_DIR})
 # packages
 find_package(Python3 COMPONENTS Interpreter Development)
 # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
 add_subdirectory(paddleaudio)
 # Summary
 include(cmake/summary.cmake)
 onnx_print_configuration_summary()
--- a/audio/README.md
+++ b/audio/README.md
@ -0,0 +1,35 @@
 # PaddleAudio
 安装方式： pip install paddleaudio
 目前支持的平台：Linux：
 ## Environment
 ## Build wheel
 Linux test build whl environment:
 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
 * os - Ubuntu 16.04.7 LTS
 * gcc/g++/gfortran - 8.2.0
 * cmake - 3.18.0 (need install)
 * [How to Install Docker](https://docs.docker.com/engine/install/)
 * [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
 1. First, launch the docker container.
 ```
 docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
 ```
 2. python setup.py bdist_wheel
 MAC：test build whl environment：
 * os 
 * gcc/g++/gfortran 12.2.0
 * cpu Intel Xeon E5 x86_64
 Windows：
 Not supported: paddleaudio C++ extension lib (sox io, kaldi native fbank)
 python setup.py bdist_wheel
--- a/audio/cmake/FindGFortranLibs.cmake
+++ b/audio/cmake/FindGFortranLibs.cmake
@ -0,0 +1,153 @@
 #.rst:
 # FindGFortranLibs
 # --------
 #  https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
 #  https://enccs.github.io/cmake-workshop/cxx-fortran/
 #
 # Find gcc Fortran compiler & library paths
 #
 # The module defines the following variables:
 #
 # ::
 #
 #
 #   GFORTRANLIBS_FOUND - true if system has gfortran
 #   LIBGFORTRAN_LIBRARIES - path to libgfortran
 #   LIBQUADMATH_LIBRARIES - path to libquadmath
 #   GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
 #   GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
 #   LIBGOMP_LIBRARIES - path to libgomp
 #   LIBGOMP_INCLUDE_DIR - directory containing omp.h header
 #   GFORTRAN_VERSION_STRING - version of gfortran found
 #
 # Honour the QUIET option of find_package(GFortranLibs ...): find_package()
 # exposes it to this module as GFortranLibs_FIND_QUIETLY.
 set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})
 if(NOT CMAKE_REQUIRED_QUIET)
   message(STATUS "Looking for gfortran related libraries...")
 endif()
 # The Fortran language must be enabled before CMAKE_Fortran_COMPILER* is usable.
 enable_language(Fortran)
 if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
   # Basically, call "gfortran -v" to dump compiler info to the string
   # GFORTRAN_VERBOSE_STR (captured through ERROR_VARIABLE), which will be
   # used to get necessary paths
   message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
   execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
     GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
   # For debugging
   message(STATUS "'gfortran -v' returned:")
   message(STATUS "${GFORTRAN_VERBOSE_STR}")
   # Detect gfortran version
   string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
   string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
   message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
   unset(GFORTRAN_VER_STR)
   # Not referenced below; retained in case including code reads them --
   # TODO confirm and remove.
   set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
   set(REPLACE_REGEX "([^\t\n ]+)")
   # Find architecture for compiler
   string(REGEX MATCH "Target: [^\t\n ]+"
     GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
   message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
   string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
     GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
   message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
   unset(GFORTRAN_ARCH_STR)
   # Find install prefix, if it exists; if not, use default
   string(REGEX MATCH  "--prefix=[^\t\n ]+[\t\n ]+"
     GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
   if(NOT GFORTRAN_PREFIX_STR)
     message(STATUS "Detected default gfortran prefix")
     set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
   else()
     string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
       GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
     # The match above includes the whitespace following the flag; drop it.
     string(STRIP "${GFORTRAN_PREFIX_DIR}" GFORTRAN_PREFIX_DIR)
   endif()
   message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
   unset(GFORTRAN_PREFIX_STR)
   # Find install exec-prefix, if it exists; if not, use default.
   # string(REGEX MATCH <regex> <out-var> <input>): the output variable comes
   # directly after the regex (a stray "\\1" argument used to sit there, so
   # GFORTRAN_EXEC_PREFIX_STR was never populated).
   string(REGEX MATCH "--exec-prefix=[^\t\n ]+[\t\n ]+"
     GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
   if(NOT GFORTRAN_EXEC_PREFIX_STR)
     message(STATUS "Detected default gfortran exec-prefix")
     set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
   else()
     string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
       GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
     string(STRIP "${GFORTRAN_EXEC_PREFIX_DIR}" GFORTRAN_EXEC_PREFIX_DIR)
   endif()
   message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
   unset(GFORTRAN_EXEC_PREFIX_STR)
   # Find library directory and include directory, if library directory specified
   string(REGEX MATCH "--libdir=[^\t\n ]+"
     GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
   if(NOT GFORTRAN_LIB_DIR_STR)
     message(STATUS "Found --libdir flag -- not found")
     message(STATUS "Using default gfortran library & include directory paths")
     # Quoted so that an empty prefix cannot turn into a missing argument.
     string(STRIP "${GFORTRAN_PREFIX_DIR}" TMPLIBDIR)
     set(GFORTRAN_LIBRARIES_DIR "${TMPLIBDIR}/lib64")
     set(GFORTRAN_INCLUDE_DIR "${TMPLIBDIR}/include")
   else()
     message(STATUS "Found --libdir flag -- yes")
     string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
       GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
     string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
   endif()
   message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
   message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
   unset(GFORTRAN_LIB_DIR_STR)
   # There are lots of other build options for gcc & gfortran. For now, the
   # options implemented above should cover a lot of common use cases.
   # Clean up by deleting the output string from "gfortran -v"
   unset(GFORTRAN_VERBOSE_STR)
   # Find paths for libgfortran, libquadmath, libgomp
   # libgomp needed for OpenMP support without Clang
   find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
     HINTS ${GFORTRAN_LIBRARIES_DIR})
   find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
     HINTS ${GFORTRAN_LIBRARIES_DIR})
   find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
     HINTS ${GFORTRAN_LIBRARIES_DIR})
   # Find OpenMP headers
   find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
 else()
   message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
 endif()
 # Turn the discovered variables into the standard find_package() result.
 include(FindPackageHandleStandardArgs)
 # Mandatory: libgfortran, libquadmath and the gfortran library directory.
 # libgomp, the OpenMP header path and the gcc/gfortran header path are optional.
 find_package_handle_standard_args(GFortranLibs
   REQUIRED_VARS
     LIBGFORTRAN_LIBRARIES
     LIBQUADMATH_LIBRARIES
     GFORTRAN_LIBRARIES_DIR
   VERSION_VAR GFORTRAN_VERSION_STRING)
 if(NOT GFORTRANLIBS_FOUND)
   message(STATUS "Looking for gfortran libraries -- not found")
 else()
   message(STATUS "Looking for gfortran libraries -- found")
   message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
 endif()
 mark_as_advanced(
   LIBGFORTRAN_LIBRARIES
   LIBQUADMATH_LIBRARIES
   LIBGOMP_LIBRARIES
   LIBGOMP_INCLUDE_DIR
   GFORTRAN_LIBRARIES_DIR
   GFORTRAN_INCLUDE_DIR)
 # FindGFortranLIBS.cmake ends here
 # Echo every result variable as "<NAME>=<value>".
 foreach(gfortran_result_var
     LIBGFORTRAN_LIBRARIES
     LIBQUADMATH_LIBRARIES
     LIBGOMP_LIBRARIES
     LIBGOMP_INCLUDE_DIR
     GFORTRAN_LIBRARIES_DIR
     GFORTRAN_INCLUDE_DIR)
   message(STATUS ${gfortran_result_var}= ${${gfortran_result_var}})
 endforeach()
--- a/audio/cmake/external/openblas.cmake
+++ b/audio/cmake/external/openblas.cmake
@ -0,0 +1,119 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Builds OpenBLAS as an external project and exposes it to the rest of the
 # build: `make` based on non-Windows hosts, CMake/clang-cl based on Windows.
 # Reads THIRD_PARTY_PATH (download/install root) and, optionally, NPROC.
 include(ExternalProject)
 include(ProcessorCount)
 set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
 set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
 set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
 set(CBLAS_TAG v0.3.10)
 if(NOT WIN32)
   set(CBLAS_LIBRARIES
       "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
       CACHE FILEPATH "openblas library." FORCE)
   set(CBLAS_INC_DIR
       "${CBLAS_INSTALL_DIR}/include"
       CACHE PATH "openblas include directory." FORCE)
   set(OPENBLAS_CC
       "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
   if(APPLE)
     set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
   endif()
   set(OPTIONAL_ARGS "")
   set(COMMON_ARGS "")
   if(APPLE)
     if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
       set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
     endif()
     set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
   endif()
   # Pick the parallelism for the OpenBLAS build.  An empty NPROC would expand
   # "-j${NPROC}" to a bare "-j", i.e. an unlimited number of make jobs, so
   # fall back to the detected processor count (at least 1).
   if(NPROC)
     set(OPENBLAS_BUILD_JOBS ${NPROC})
   else()
     ProcessorCount(OPENBLAS_BUILD_JOBS)
     if(OPENBLAS_BUILD_JOBS EQUAL 0)
       set(OPENBLAS_BUILD_JOBS 1)
     endif()
   endif()
   # The sources come from a zip archive (URL download), so no git options apply.
   ExternalProject_Add(
     OPENBLAS
     URL "https://paddleaudio.bj.bcebos.com/build/OpenBLAS-0.3.10.zip"
     DOWNLOAD_DIR ${CBLAS_PREFIX_DIR}
     SOURCE_DIR ${CBLAS_PREFIX_DIR}
     INSTALL_DIR ${CBLAS_INSTALL_DIR}
     BUILD_IN_SOURCE 1
     BUILD_COMMAND make -j${OPENBLAS_BUILD_JOBS} ${COMMON_ARGS} ${OPTIONAL_ARGS}
     INSTALL_COMMAND make install PREFIX=<INSTALL_DIR>
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
     BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
   ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
   set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})
   # Imported static library that points at the archive produced above.
   add_library(openblas STATIC IMPORTED)
   add_dependencies(openblas OPENBLAS)
   set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
   set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)
   link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
   include_directories(${OpenBLAS_INSTALL_PREFIX}/include)
   set(OPENBLAS_LIBRARIES
       ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
   )
   # Interface target carrying the include path and link line for consumers.
   add_library(libopenblas INTERFACE)
   add_dependencies(libopenblas openblas)
   target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
   target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
 else()
   set(CBLAS_LIBRARIES
       "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
       CACHE FILEPATH "openblas library." FORCE)
   set(CBLAS_INC_DIR
       "${CBLAS_INSTALL_DIR}/include/openblas"
       CACHE PATH "openblas include directory." FORCE)
   ExternalProject_Add(
     extern_openblas
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY ${CBLAS_REPOSITORY}
     GIT_TAG ${CBLAS_TAG}
     PREFIX ${CBLAS_PREFIX_DIR}
     INSTALL_DIR ${CBLAS_INSTALL_DIR}
     BUILD_IN_SOURCE 0
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
                -DCMAKE_CXX_COMPILER=clang-cl
                -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
                -DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
                -DCMAKE_MT=mt
                -DUSE_THREAD=OFF
                -DBUILD_WITHOUT_LAPACK=NO
                -DCMAKE_Fortran_COMPILER=flang
                -DNOFORTRAN=0
                -DDYNAMIC_ARCH=ON
                #${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS
       -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
       -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
       -DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
     # ninja need to know where openblas.lib comes from
     BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})
   set(OPENBLAS_SHARED_LIB
       ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})
   add_library(openblas INTERFACE)
   add_dependencies(openblas extern_openblas)
   include_directories(${CBLAS_INC_DIR})
   link_libraries(${CBLAS_LIBRARIES})
 endif()
--- a/audio/cmake/pybind.cmake
+++ b/audio/cmake/pybind.cmake
@ -0,0 +1,42 @@
 #the pybind11 is from:https://github.com/pybind/pybind11
 # Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
 # Downloads the pybind11 v2.10.0 archive into FETCHCONTENT_BASE_DIR (once),
 # unpacks it to ${FETCHCONTENT_BASE_DIR}/pybind11 and adds its headers to the
 # include path.
 set(PYBIND_ZIP "v2.10.0.zip")
 set(LOCAL_PYBIND_ZIP ${FETCHCONTENT_BASE_DIR}/${PYBIND_ZIP})
 set(PYBIND_SRC ${FETCHCONTENT_BASE_DIR}/pybind11)
 set(DOWNLOAD_URL "https://paddleaudio.bj.bcebos.com/build/v2.10.0.zip")
 set(PYBIND_TIMEOUT 600 CACHE STRING "Timeout in seconds when downloading pybind.")
 if(NOT EXISTS "${LOCAL_PYBIND_ZIP}")
   file(DOWNLOAD ${DOWNLOAD_URL}
     ${LOCAL_PYBIND_ZIP}
     TIMEOUT ${PYBIND_TIMEOUT}
     STATUS ERR
     SHOW_PROGRESS
   )
   # STATUS is a two-element list: a numeric code followed by a message.
   list(GET ERR 0 pybind_download_code)
   list(GET ERR 1 pybind_download_message)
   if(pybind_download_code EQUAL 0)
     message(STATUS "download pybind success")
   else()
     # Remove whatever was written so the next configure retries the download
     # instead of trying to unpack a partial archive.
     file(REMOVE "${LOCAL_PYBIND_ZIP}")
     message(FATAL_ERROR "download pybind fail: ${pybind_download_message}")
   endif()
 endif()
 if(NOT EXISTS "${PYBIND_SRC}")
   execute_process(
     COMMAND ${CMAKE_COMMAND} -E tar xfz ${LOCAL_PYBIND_ZIP}
     WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}
     RESULT_VARIABLE tar_result
   )
   # Check the extraction result before touching the extracted directory;
   # EQUAL (not MATCHES) so that codes such as 10 are not taken for success.
   if(tar_result EQUAL 0)
     message(STATUS "unzip pybind success")
   else()
     message(FATAL_ERROR "unzip pybind fail")
   endif()
   file(RENAME ${FETCHCONTENT_BASE_DIR}/pybind11-2.10.0 ${PYBIND_SRC})
 endif()
 include_directories(${PYBIND_SRC}/include)
--- a/audio/cmake/summary.cmake
+++ b/audio/cmake/summary.cmake
@ -0,0 +1,45 @@
 # SPDX-License-Identifier: Apache-2.0
 # Prints a configuration summary as STATUS messages.
 #
 # Takes no arguments.  Every entry is the plain expansion of a variable, so
 # an entry whose variable is not defined by the including project (for
 # example the ONNX_* / Protobuf ones) is printed with an empty value.
 # The Python entries read the Python3_* result variables of
 # find_package(Python3 ...).
 function (onnx_print_configuration_summary)
   message(STATUS "")
   message(STATUS "******** Summary ********")
   message(STATUS "  CMake version             : ${CMAKE_VERSION}")
   message(STATUS "  CMake command             : ${CMAKE_COMMAND}")
   message(STATUS "  System                    : ${CMAKE_SYSTEM_NAME}")
   message(STATUS "  C++ compiler              : ${CMAKE_CXX_COMPILER}")
   message(STATUS "  C++ compiler version      : ${CMAKE_CXX_COMPILER_VERSION}")
   message(STATUS "  CXX flags                 : ${CMAKE_CXX_FLAGS}")
   message(STATUS "  Build type                : ${CMAKE_BUILD_TYPE}")
   get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
   message(STATUS "  Compile definitions       : ${tmp}")
   message(STATUS "  CMAKE_PREFIX_PATH         : ${CMAKE_PREFIX_PATH}")
   message(STATUS "  CMAKE_INSTALL_PREFIX      : ${CMAKE_INSTALL_PREFIX}")
   message(STATUS "  CMAKE_MODULE_PATH         : ${CMAKE_MODULE_PATH}")
   message(STATUS "")
   message(STATUS "  ONNX version              : ${ONNX_VERSION}")
   message(STATUS "  ONNX NAMESPACE            : ${ONNX_NAMESPACE}")
   message(STATUS "  ONNX_USE_LITE_PROTO       : ${ONNX_USE_LITE_PROTO}")
   message(STATUS "  USE_PROTOBUF_SHARED_LIBS  : ${ONNX_USE_PROTOBUF_SHARED_LIBS}")
   message(STATUS "  Protobuf_USE_STATIC_LIBS  : ${Protobuf_USE_STATIC_LIBS}")
   message(STATUS "  ONNX_DISABLE_EXCEPTIONS   : ${ONNX_DISABLE_EXCEPTIONS}")
   message(STATUS "  ONNX_WERROR               : ${ONNX_WERROR}")
   message(STATUS "  ONNX_BUILD_TESTS          : ${ONNX_BUILD_TESTS}")
   message(STATUS "  ONNX_BUILD_BENCHMARKS     : ${ONNX_BUILD_BENCHMARKS}")
   message(STATUS "  ONNXIFI_DUMMY_BACKEND     : ${ONNXIFI_DUMMY_BACKEND}")
   message(STATUS "  ONNXIFI_ENABLE_EXT        : ${ONNXIFI_ENABLE_EXT}")
   message(STATUS "")
   message(STATUS "  Protobuf compiler         : ${PROTOBUF_PROTOC_EXECUTABLE}")
   message(STATUS "  Protobuf includes         : ${PROTOBUF_INCLUDE_DIRS}")
   message(STATUS "  Protobuf libraries        : ${PROTOBUF_LIBRARIES}")
   message(STATUS "  BUILD_ONNX_PYTHON         : ${BUILD_ONNX_PYTHON}")
   message(STATUS "    Python version        : ${Python3_VERSION}")
   message(STATUS "    Python executable     : ${Python3_EXECUTABLE}")
   message(STATUS "    Python includes       : ${Python3_INCLUDE_DIRS}")
   message(STATUS "    Python libraries      : ${Python3_LIBRARIES}")
   message(STATUS "  PYBIND11                  : ${pybind11_FOUND}")
   message(STATUS "    Pybind11 version        : ${pybind11_VERSION}")
   message(STATUS "    Pybind11 include        : ${pybind11_INCLUDE_DIR}")
   message(STATUS "    Pybind11 includes       : ${pybind11_INCLUDE_DIRS}")
   message(STATUS "    Pybind11 libraries      : ${pybind11_LIBRARIES}")
 endfunction()
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@ -0,0 +1,19 @@
 # Process the third-party directory before the paddleaudio sources.
 add_subdirectory(third_party)
 add_subdirectory(src)
 # Copy compiler runtime libraries from GFORTRAN_LIBRARIES_DIR into the lib/
 # directory next to this file: one dylib on macOS, three shared objects
 # (following their symlink chains) on other UNIX systems.
 if(APPLE)
   file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
 elseif(UNIX)
   foreach(runtime_lib libgfortran.so.5 libquadmath.so.0 libgcc_s.so.1)
     file(COPY ${GFORTRAN_LIBRARIES_DIR}/${runtime_lib}
          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
   endforeach()
 endif()
--- a/paddlespeech/audio/backends/init.py
+++ b/paddlespeech/audio/backends/init.py
@ -11,9 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .soundfile_backend import depth_convert
+from . import _extension
-from .soundfile_backend import load
+from . import backends
-from .soundfile_backend import normalize
+from . import compliance
-from .soundfile_backend import resample
+from . import datasets
-from .soundfile_backend import save
+from . import features
-from .soundfile_backend import to_mono
+from . import functional
 from . import metric
 from . import sox_effects
 from . import utils
--- a/audio/paddleaudio/_extension.py
+++ b/audio/paddleaudio/_extension.py
@ -0,0 +1,167 @@
 import contextlib
 import ctypes
 import os
 import sys
 import types
 import warnings
 from pathlib import Path
 from ._internal import module_utils as _mod_utils  # noqa: F401
 # Query `hasattr` only once.
 _SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
                                                               'setdlopenflags')
@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    While the ``with`` body runs, ``sys.getdlopenflags()`` includes
    ``ctypes.RTLD_GLOBAL``. The previous flags are restored when the body
    finishes, including when it raises. When the interpreter offers no
    ``getdlopenflags``/``setdlopenflags`` (``_SET_GLOBAL_FLAGS`` is false) the
    context manager does nothing.
    """
    if not _SET_GLOBAL_FLAGS:
        yield
        return
    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Restore even if loading the library failed, so a failed load does
        # not leave RTLD_GLOBAL switched on for every later import.
        sys.setdlopenflags(old_flags)
 def resolve_library_path(path: str) -> str:
     """Return ``path`` in canonical form, as given by ``os.path.realpath``."""
     canonical = os.path.realpath(path)
     return canonical
 class _Ops(types.ModuleType):
     # Module object named 'paddleaudio.ops' that records the shared
     # libraries loaded through it.
     def __init__(self):
         super().__init__('paddleaudio.ops')
         # Resolved paths of every library passed to load_library().
         self.loaded_libraries = set()

     def load_library(self, path):
         """Load the shared library at ``path`` into the current process.

         Loading runs the library's static (global) initialization code,
         which is how custom operators get registered. Compile the operator
         and its static registration code into a shared object, then call
         ``paddleaudio.ops.load_library('path/to/libcustom.so')``.

         The resolved path is added to ``paddleaudio.ops.loaded_libraries``
         once the library has been loaded.

         Args:
             path (str): A path to a shared library to load.
         """
         resolved = resolve_library_path(path)
         # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
         with dl_open_guard():
             ctypes.CDLL(resolved)
         self.loaded_libraries.add(resolved)
 _LIB_DIR = Path(__file__).parent / "lib"
 def _get_lib_path(lib: str):
     """Return the path of extension library ``lib`` inside ``_LIB_DIR``.

     The file name is ``<lib>.pyd`` when ``os.name`` is ``"nt"`` and
     ``<lib>.so`` otherwise.
     """
     if os.name == "nt":
         extension = "pyd"
     else:
         extension = "so"
     return _LIB_DIR / f"{lib}.{extension}"
 def _load_lib(lib: str) -> bool:
     """Load the extension library named ``lib`` if its file exists.

     Note:
         When `paddleaudio` is deployed in `pex` format the library file is
         not in the standard location. In that case the library is expected
         to be reachable through the dynamic loader's search path, so a
         missing file only produces a warning here instead of an error.

     Returns:
         bool:
             True if the library file was found and loaded without failure.
             False if the library file was not found.

     Raises:
         Exception:
             If the file exists but loading it fails (for example an
             `OSError` caused by a missing dynamic dependency), the exception
             is passed through unchanged, because that failure is not
             recoverable: the user has to install the dependency.
     """
     lib_path = _get_lib_path(lib)
     if lib_path.exists():
         ops.load_library(lib_path)
         return True
     warnings.warn("lib path is not exists:" + str(lib_path))
     return False
 # Set to True once _init_ffmpeg() has completed, so later calls return early.
 _FFMPEG_INITIALIZED = False
 def _init_ffmpeg():
    """Initialize the FFmpeg integration once.

    Returns immediately when ``_FFMPEG_INITIALIZED`` is already true.
    Otherwise checks that FFmpeg support is available, loads the FFmpeg
    extension library, runs its init routine, lowers the FFmpeg log level
    to 8 when it is higher, and finally sets ``_FFMPEG_INITIALIZED``.

    Raises:
        RuntimeError: If ``is_ffmpeg_available()`` reports false.
        ImportError: If loading the FFmpeg library raises ``OSError``.

    NOTE(review): ``paddleaudio`` is not among the imports visible at the
    top of this module, and ``_paddlleaudio`` / ``paddllespeech`` look like
    misspellings of ``_paddleaudio`` / ``paddlespeech`` -- confirm whether
    this function can currently succeed.
    """
    global _FFMPEG_INITIALIZED
    if _FFMPEG_INITIALIZED:
        return
    if not paddleaudio._paddlleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
        )
    try:
        _load_lib("libpaddlleaudio_ffmpeg")
    except OSError as err:
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err
    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
    paddleaudio._paddlleaudio.ffmpeg_init()
    # Cap the FFmpeg log level at 8.
    if paddleaudio._paddlleaudio.ffmpeg_get_log_level() > 8:
        paddleaudio._paddlleaudio.ffmpeg_set_log_level(8)
    _FFMPEG_INITIALIZED = True
 def _init_extension():
     """Load the paddleaudio C++ extension, warning instead of failing.

     A warning is emitted and the function returns when the
     ``paddleaudio._paddleaudio`` module cannot be found or cannot be
     imported. Failures of the FFmpeg initialization are ignored.
     """
     unavailable_msg = (
         "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!")
     if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
         warnings.warn(unavailable_msg)
         return
     _load_lib("libpaddleaudio")
     # Importing the module initializes the methods registered via PyBind11;
     # it has to happen after the base library has been loaded.
     try:
         from paddleaudio import _paddleaudio  # noqa
     except Exception:
         warnings.warn(unavailable_msg)
         return
     # This runs as part of `import paddleaudio`, so an FFmpeg initialization
     # failure is ignored here; a detailed error is raised later, when client
     # code tries to use the dedicated feature.
     with contextlib.suppress(Exception):
         _init_ffmpeg()
 ops = _Ops()
 _init_extension()
--- a/audio/paddleaudio/_internal/init.py
+++ b/audio/paddleaudio/_internal/init.py
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@ -0,0 +1,151 @@
 import importlib.util
 import platform
 import warnings
 from functools import wraps
 from typing import Optional
 #code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
 def is_module_available(*modules: str) -> bool:
     r"""Return whether every module named in ``modules`` can be found.

     The modules themselves are not imported. This is generally safer than a
     try-catch block around ``import X``: it avoids third party libraries
     breaking assumptions of some of our tests, e.g., setting multiprocessing
     start method when imported (see librosa/#747, torchvision/#544).

     Args:
         *modules (str): Module names, dotted names included.

     Returns:
         bool: True when a spec is found for every name (also when no name is
         given), False as soon as one name cannot be found.
     """
     for name in modules:
         try:
             if importlib.util.find_spec(name) is None:
                 return False
         except ModuleNotFoundError:
             # Raised for a dotted name whose parent package is missing;
             # that means "not available" rather than an error.
             return False
     return True
 def requires_module(*modules: str):
     """Build a decorator that reports missing optional modules on call.

     When every module in ``modules`` is available the decorator returns the
     function untouched. Otherwise the decorated function is replaced by one
     that raises ``RuntimeError`` naming the missing module(s), which is
     clearer for users than a ``NameError:  name 'module' is not defined``
     at a random place.
     """
     unavailable = [name for name in modules if not is_module_available(name)]
     if unavailable:
         if len(unavailable) == 1:
             req = f"module: {unavailable[0]}"
         else:
             req = f"modules: {unavailable}"

         def decorator(func):
             @wraps(func)
             def wrapped(*args, **kwargs):
                 raise RuntimeError(
                     f"{func.__module__}.{func.__name__} requires {req}")

             return wrapped

         return decorator

     # Nothing is missing: no need to wrap anything.
     def decorator(func):
         return func

     return decorator
 def deprecated(direction: str, version: Optional[str]=None):
     """Build a decorator that warns about deprecation on every call.

     Args:
         direction (str): Migration steps to be given to users.
         version (str or int): The version when the object will be removed;
             the message says "future" when it is None.
     """
     release = "future" if version is None else version

     def decorator(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
             message = (
                 f"{func.__module__}.{func.__name__} has been deprecated "
                 f'and will be removed from {release} release. '
                 f"{direction}")
             # stacklevel=2 attributes the warning to the caller of `wrapped`.
             warnings.warn(message, stacklevel=2)
             return func(*args, **kwargs)

         return wrapped

     return decorator
 def is_kaldi_available():
     """Return whether the ``paddleaudio._paddleaudio`` module can be found."""
     extension_module = "paddleaudio._paddleaudio"
     return is_module_available(extension_module)
 def requires_kaldi():
     """Build a decorator that guards functions needing kaldi support.

     If ``is_kaldi_available()`` is true the decorator leaves the function
     unchanged; otherwise the decorated function raises ``RuntimeError``
     whenever it is called.
     """
     if is_kaldi_available():
         return lambda func: func

     def decorator(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
             raise RuntimeError(
                 f"{func.__module__}.{func.__name__} requires libpaddleaudio build with kaldi")

         return wrapped

     return decorator
 def _check_soundfile_importable():
     """Return True only if ``soundfile`` is found and imports without error.

     A warning is emitted when the module is found but importing it fails.
     """
     if is_module_available("soundfile"):
         try:
             import soundfile  # noqa: F401
         except Exception:
             warnings.warn(
                 "Failed to import soundfile. 'soundfile' backend is not available.")
         else:
             return True
     return False
 # Evaluated once at import time; read by is_soundfile_available().
 _is_soundfile_importable = _check_soundfile_importable()
 def is_soundfile_available():
     """Return the import-time result stored in ``_is_soundfile_importable``."""
     available = _is_soundfile_importable
     return available
 def requires_soundfile():
     """Build a decorator that guards functions needing ``soundfile``.

     If ``is_soundfile_available()`` is true the decorator leaves the
     function unchanged; otherwise the decorated function raises
     ``RuntimeError`` whenever it is called.
     """
     def passthrough(func):
         return func

     def unavailable(func):
         @wraps(func)
         def wrapped(*args, **kwargs):
             raise RuntimeError(
                 f"{func.__module__}.{func.__name__} requires soundfile")

         return wrapped

     return passthrough if is_soundfile_available() else unavailable
 def is_sox_available():
     """Return whether sox support can be used.

     Always False on Windows (sox is not supported there); elsewhere the
     result is whether ``paddleaudio._paddleaudio`` can be found.
     """
     on_windows = platform.system() == "Windows"
     return (not on_windows) and is_module_available("paddleaudio._paddleaudio")
 def requires_sox():
     """Build a decorator that guards functions needing sox support.

     If ``is_sox_available()`` is true the decorator leaves the function
     unchanged; otherwise the decorated function raises ``RuntimeError``
     whenever it is called.
     """
     sox_ready = is_sox_available()

     def decorator(func):
         if sox_ready:
             return func

         @wraps(func)
         def wrapped(*args, **kwargs):
             raise RuntimeError(
                 f"{func.__module__}.{func.__name__} requires libpaddleaudio build with sox")

         return wrapped

     return decorator
--- a/audio/paddleaudio/backends/init.py
+++ b/audio/paddleaudio/backends/init.py
@ -11,3 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from . import utils
 from .soundfile_backend import depth_convert
 from .soundfile_backend import normalize
 from .soundfile_backend import resample
 from .soundfile_backend import soundfile_load
 from .soundfile_backend import soundfile_save
 from .soundfile_backend import to_mono
 from .utils import get_audio_backend
 from .utils import list_audio_backends
 from .utils import set_audio_backend
 # Runs as a side effect of importing this package.
 # NOTE(review): presumably selects the default audio backend; confirm against utils._init_audio_backend.
 utils._init_audio_backend()
--- a/audio/paddleaudio/backends/common.py
+++ b/audio/paddleaudio/backends/common.py
@ -0,0 +1,55 @@
 # Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
 class AudioInfo:
    """Metadata of an audio source, as returned by the backends' ``info`` functions.

    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
        or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:
            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate (narrowband)
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        """Store the given metadata fields unchanged as instance attributes."""
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        """Return a one-line ``ClassName(field=value, ...)`` summary."""
        # Report the real class name (also correct for subclasses) instead of
        # a hard-coded label that does not match this class.
        return (f"{type(self).__name__}("
                f"sample_rate={self.sample_rate}, "
                f"num_frames={self.num_frames}, "
                f"num_channels={self.num_channels}, "
                f"bits_per_sample={self.bits_per_sample}, "
                f"encoding={self.encoding}"
                f")")
--- a/audio/paddleaudio/backends/no_backend.py
+++ b/audio/paddleaudio/backends/no_backend.py
@ -0,0 +1,32 @@
 from pathlib import Path
 from typing import Callable
 from typing import Optional
 from typing import Tuple
 from typing import Union
 from paddle import Tensor
 #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
 def load(filepath: Union[str, Path],
          out: Optional[Tensor]=None,
          normalization: Union[bool, float, Callable]=True,
          channels_first: bool=True,
          num_frames: int=0,
          offset: int=0,
          filetype: Optional[str]=None) -> Tuple[Tensor, int]:
    """Placeholder ``load`` for when no backend exists; always raises ``RuntimeError``.

    All arguments are accepted and ignored.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
 def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        precision: int=16,
        channels_first: bool=True, ) -> None:
    """Placeholder ``save`` for when no backend exists; always raises ``RuntimeError``.

    All arguments are accepted and ignored.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
 def info(filepath: str) -> None:
    """Placeholder ``info`` for when no backend exists; always raises ``RuntimeError``."""
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@ -0,0 +1,677 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import warnings
 from typing import Optional
 from typing import Tuple
 import numpy as np
 import paddle
 import resampy
 import soundfile
 from scipy.io import wavfile
 from ..utils import depth_convert
 from ..utils import ParameterError
 from .common import AudioInfo
 # Names exported by `from ... import *`.
 __all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
 ]
 # Normalization types listed in the error raised by `normalize` for an unknown `norm_type`.
 NORMALMIZE_TYPES = ['linear', 'gaussian']
 # Values accepted by `to_mono` for `merge_type`.
 MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
 # Values accepted by `resample` for `mode`.
 RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
 # Small constant used by `normalize` to keep its divisors away from zero.
 EPS = 1e-8
 def resample(y: np.ndarray,
              src_sr: int,
              target_sr: int,
              mode: str='kaiser_fast') -> np.ndarray:
    """Resample a waveform with ``resampy``.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use; must be one of
            ``RESAMPLE_MODES``. Defaults to 'kaiser_fast'.

    Returns:
        np.ndarray: `y` resampled to `target_sr`

    Raises:
        ParameterError: If `y` is not a ``np.ndarray`` or `mode` is not in
            ``RESAMPLE_MODES``.
    """
    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
        we recommend the mode kaiser_fast in large scale audio trainning')
    if not isinstance(y, np.ndarray):
        # Must be an f-string, otherwise the placeholder is printed literally
        # instead of the offending type.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')
    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
    return resampy.resample(y, src_sr, target_sr, filter=mode)
 def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Collapse a stereo waveform to a single channel.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D. A 1D array is
            returned unchanged; for 2D input only rows 0 and 1 are used.
        merge_type (str, optional): One of ``MERGE_TYPES``: 'ch0' / 'ch1' pick
            that row, 'random' picks row 0 or 1 at random, 'average' averages
            rows 0 and 1. Defaults to 'average'.

    Returns:
        np.ndarray: `y` with mono channel.

    Raises:
        ParameterError: On an unknown `merge_type`, on ``y.ndim > 2``, or when
            averaging a dtype other than float32, int16 or int8.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    # Channel-selecting merge types.
    fixed_rows = (('ch0', 0), ('ch1', 1))
    for label, row in fixed_rows:
        if merge_type == label:
            return y[row]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # Averaging: floats are averaged directly.
    if y.dtype == 'float32':
        return (y[0] + y[1]) * 0.5

    # Integers are widened first so the sum cannot overflow, then clipped
    # back into the range of the original dtype.
    for narrow, wide in (('int16', 'int32'), ('int8', 'int16')):
        if y.dtype == narrow:
            widened = y.astype(wide)
            mean = (widened[0] + widened[1]) // 2
            limits = np.iinfo(y.dtype)
            return np.clip(mean, limits.min, limits.max).astype(y.dtype)

    raise ParameterError(f'Unsupported dtype: {y.dtype}')
 def soundfile_load_(file: os.PathLike,
                     offset: Optional[float]=None,
                     dtype: str='int16',
                     duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Load audio using soundfile library. This function load audio file using libsndfile.

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Offset to the start of waveform, in
            seconds (it is multiplied by the file's sample rate to get a frame
            index). Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Duration of waveform to read, in
            seconds (converted to frames the same way). ``None`` reads to the
            end. Defaults to None.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray (transposed from what
        soundfile returns) and the file's samplerate.
    """
    with soundfile.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        if duration is not None:
            frame_duration = int(duration * sr_native)
        else:
            # -1 asks soundfile to read everything up to the end of the file.
            frame_duration = -1
        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
    # Return the rate captured while the file was open instead of touching
    # the already-closed handle again.
    return y, sr_native
 def normalize(y: np.ndarray, norm_type: str='linear',
               mul_factor: float=1.0) -> np.ndarray:
    """Scale a waveform, then multiply it by an extra factor.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): 'linear' divides by the peak absolute value
            (plus ``EPS``); 'gaussian' subtracts the mean and divides by the
            standard deviation (floored at ``EPS``). Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Returns:
        np.ndarray: `y` after normalization.

    Raises:
        NotImplementedError: For any other `norm_type`.
    """
    if norm_type == 'gaussian':
        center = np.mean(y)
        spread = max(np.std(y), EPS)
        return mul_factor * (y - center) / spread
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor
    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
 def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save; ``str`` and
            ``pathlib.Path`` are both accepted. Must end with '.wav'.

    Raises:
        ParameterError: If the path does not end with '.wav' or `sr` is not positive.
    """
    # Normalize path-like objects to a plain string first: pathlib.Path has no
    # `endswith`, so the extension check would otherwise fail for it.
    path = os.fspath(file)
    if not path.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')
    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')
    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        # Already int16/int8: written without conversion.
        y_out = y
    wavfile.write(path, sr, y_out)
 def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file and post-process it.

    The steps run in this order: read, optional mono merge, optional
    resampling, normalization, depth conversion to `dtype`.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform; when set
            and different from the file's rate the waveform is resampled.
            Defaults to None.
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.

    Raises:
        ParameterError: If the decoded waveform contains no samples.
    """
    y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and
                                                   len(y[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        y = to_mono(y, merge_type)

    needs_resampling = sr is not None and sr != r
    if needs_resampling:
        y = resample(y, r, sr, mode=resample_mode)
        r = sr

    # Pick the normalization to apply, if any.
    if normal:
        norm_args = (norm_type, norm_mul_factor)
    elif dtype in ('int8', 'int16'):
        # still need to do normalization, before depth conversion
        norm_args = ('linear', 1.0)
    else:
        norm_args = None
    if norm_args is not None:
        y = normalize(y, *norm_args)

    return depth_convert(y, dtype), r
 # The code below is taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modification.
 def _get_subtype_for_wav(dtype: paddle.dtype,
                          encoding: str,
                          bits_per_sample: int):
    """Pick the soundfile subtype string for writing a WAV file.

    With no `encoding`, the subtype comes from `bits_per_sample` if given,
    otherwise from the tensor `dtype`. With an `encoding`, the combination of
    encoding and bit depth is validated and mapped.

    Raises:
        ValueError: For an unsupported dtype, encoding, or encoding/bit-depth pair.
    """
    if not encoding:
        if bits_per_sample:
            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
        dtype_to_subtype = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }
        chosen = dtype_to_subtype.get(dtype)
        if not chosen:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return chosen

    # True when the bit depth is left unset (None) or is exactly 8.
    unset_or_8bit = bits_per_sample in (None, 8)

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"
    elif encoding == "PCM_U":
        if not unset_or_8bit:
            raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
        return "PCM_U8"
    elif encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
    elif encoding == "ULAW":
        if not unset_or_8bit:
            raise ValueError("wav only supports 8-bit mu-law encoding.")
        return "ULAW"
    elif encoding == "ALAW":
        if not unset_or_8bit:
            raise ValueError("wav only supports 8-bit a-law encoding.")
        return "ALAW"
    raise ValueError(f"wav does not support {encoding}.")
 def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
    if encoding in (None, "PCM_S"):
        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
    if encoding in ("PCM_U", "PCM_F"):
        raise ValueError(f"sph does not support {encoding} encoding.")
    if encoding == "ULAW":
        if bits_per_sample in (None, 8):
            return "ULAW"
        raise ValueError("sph only supports 8-bit for mu-law encoding.")
    if encoding == "ALAW":
        return "ALAW"
    raise ValueError(f"sph does not support {encoding}.")
 def _get_subtype(dtype: paddle.dtype,
                  format: str,
                  encoding: str,
                  bits_per_sample: int):
    """Pick the soundfile subtype string for the given container `format`.

    ``wav`` and ``sph`` are delegated to their dedicated helpers; ``flac``,
    ``ogg``/``vorbis`` and ``nis``/``nist`` are resolved here.

    Raises:
        ValueError: For an unknown format or a configuration the format rejects.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
    elif format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"
    elif format in ("ogg", "vorbis"):
        has_overrides = encoding or bits_per_sample
        if has_overrides:
            raise ValueError(
                "ogg/vorbis does not support encoding/bits_per_sample.")
        return "VORBIS"
    elif format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)
    elif format in ("nis", "nist"):
        return "PCM_16"
    raise ValueError(f"Unsupported format: {format}")
 def save(
        filepath: str,
        src: paddle.Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Write a 2D audio tensor to a file or file-like object via soundfile.

    Note:
        The formats this function can handle depend on the soundfile
        installation. It is tested on WAV (32-bit float, 32/16-bit signed
        integer, 8-bit unsigned integer), FLAC, OGG/VORBIS and SPHERE.

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        ``pathlib.Path`` objects are accepted as well. This is for
        consistency with the ``"sox_io"`` backend.

    Args:
        filepath (str or pathlib.Path): Path to audio file. An object with a
            ``write`` attribute is treated as a file-like object.
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is
            interpreted as `[channel, time]`, otherwise `[time, channel]`.
        compression (float or None, optional): Not used; a warning is issued
            when it is given. It exists only for interface compatibility with
            the "sox_io" backend.
        format (str or None, optional): Audio format passed on to soundfile.
            Required when ``filepath`` is a file-like object, where it also
            selects the subtype. For a path, the subtype is chosen from the
            file extension. Valid values are ``"wav"``, ``"ogg"``,
            ``"vorbis"``, ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported
            formats (``"wav"``, ``"flac"`` and ``"sph"``). Valid values are:

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)
        bits_per_sample (int or None, optional): Changes the bit depth for
            ``"wav"``, ``"flac"`` or ``"sph"``. Valid values are ``8``,
            ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depths:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.
    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit
    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.
    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is invalid, or
            the format/encoding/bit-depth combination is unsupported.
        RuntimeError: If saving to a file-like object without ``format``.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored.")

    # Work out which extension drives the subtype choice.
    to_fileobj = hasattr(filepath, "write")
    if to_fileobj and format is None:
        raise RuntimeError("`format` is required when saving to file object.")
    ext_source = format if to_fileobj else str(filepath).rpartition(".")[2]
    ext = ext_source.lower()

    allowed_bits = (None, 8, 16, 24, 32, 64)
    if bits_per_sample not in allowed_bits:
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this.")
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    if format is None and ext in ("nis", "nist", "sph"):
        format = "NIST"

    frames = src.t() if channels_first else src
    soundfile.write(
        file=filepath,
        data=frames,
        samplerate=sample_rate,
        subtype=subtype,
        format=format)
 # Maps a soundfile subtype name to the dtype string `load` requests when it
 # reads a WAV file with normalize=False. Subtypes missing here make `load`
 # raise ValueError in that mode.
 _SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
 }
 def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
    """Read audio data from a file into a ``paddle.Tensor`` via soundfile.

    Note:
        The formats this function can handle depend on the soundfile
        installation. It is tested on WAV (32-bit float, 32/16-bit signed
        integer, 8-bit unsigned integer), FLAC, OGG/VORBIS and SPHERE.

    By default (``normalize=True``, ``channels_first=True``) the result is a
    ``float32`` Tensor shaped `[channel, time]`, with samples normalized to
    fit in ``[-1.0, 1.0]``.

    For a WAV input with ``normalize=False`` the data is read in the dtype
    matching the file's subtype (``int32`` for 32-bit signed PCM, ``int16``
    for 16-bit signed PCM, ``uint8`` for 8-bit unsigned PCM, and so on);
    24-bit signed integer is not supported in this mode. For every other
    format ``normalize`` has no effect and ``float32`` is always used.

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        ``pathlib.Path`` objects are accepted as well. This is for
        consistency with the ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining
            samples, starting from ``frame_offset``. Fewer frames may be
            returned if the file does not contain enough.
        normalize (bool, optional):
            When ``True`` the result is always ``float32``. When ``False``
            and the input is WAV, the file's native sample dtype is used.
        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (paddle.Tensor, int): Resulting Tensor and sample rate.

    Raises:
        ValueError: If a WAV file is read with ``normalize=False`` and its
            subtype has no entry in ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        # Only un-normalized WAV reads keep the file's native sample type.
        keep_native_dtype = file_.format == "WAV" and not normalize
        if keep_native_dtype:
            if file_.subtype not in _SUBTYPE2DTYPE:
                raise ValueError(f"Unsupported subtype: {file_.subtype}")
            dtype = _SUBTYPE2DTYPE[file_.subtype]
        else:
            dtype = "float32"

        frames = file_._prepare_read(frame_offset, None, num_frames)
        samples = file_.read(frames, dtype, always_2d=True)
        sample_rate = file_.samplerate

    waveform = paddle.to_tensor(samples)
    if not channels_first:
        return waveform, sample_rate
    return paddle.transpose(waveform, perm=[1, 0]), sample_rate
 # Mapping from soundfile subtype to number of bits per sample, used by
 # `_get_bit_depth`.
 # This is mostly heuristic and the value is set to 0 when it is irrelevant
 # (lossy formats) or when it can't be inferred.
 # For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
 # According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
 # the default seems to be 8 bits but it can be compressed further to 4 bits.
 # The dict is inspired by
 # https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
 _SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610":
    0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
 }
 def _get_bit_depth(subtype):
    """Look up the bits-per-sample value for a soundfile subtype.

    Unknown subtypes produce a warning and yield 0.
    """
    bits = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
    if bits is not None:
        return bits
    warnings.warn(
        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
        "attribute will be set to 0. If you are seeing this warning, please "
        "report by opening an issue on github (after checking for existing/closed ones). "
        "You may otherwise ignore this warning.")
    return 0
 # Maps a soundfile subtype name to the encoding label reported in AudioInfo.
 # `_get_encoding` returns "UNKNOWN" for subtypes missing from this table.
 _SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
 }
 def _get_encoding(format: str, subtype: str):
    if format == "FLAC":
        return "FLAC"
    return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
 def info(filepath: str, format: Optional[str]=None) -> AudioInfo:
    """Collect signal metadata of an audio file through ``soundfile.info``.

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        ``pathlib.Path`` objects are accepted as well. This is for
        consistency with the ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioInfo: meta data of the given audio.
    """
    details = soundfile.info(filepath)
    rate = details.samplerate
    frame_count = details.frames
    channel_count = details.channels
    depth = _get_bit_depth(details.subtype)
    label = _get_encoding(details.format, details.subtype)
    return AudioInfo(
        rate,
        frame_count,
        channel_count,
        bits_per_sample=depth,
        encoding=label, )
--- a/audio/paddleaudio/backends/sox_io_backend.py
+++ b/audio/paddleaudio/backends/sox_io_backend.py
@ -0,0 +1,106 @@
 import os
 from typing import Optional
 from typing import Tuple
 import paddle
 import paddleaudio
 from paddle import Tensor
 from paddleaudio._internal import module_utils as _mod_utils
 from .common import AudioInfo
 #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
 def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
    """Fallback for path-based ``info``: always raises ``RuntimeError``."""
    message = "Failed to fetch metadata from {}".format(filepath)
    raise RuntimeError(message)
 def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
    """Fallback for file-object ``info``: always raises ``RuntimeError``."""
    message = "Failed to fetch metadata from {}".format(fileobj)
    raise RuntimeError(message)
 # Note: need to comply TorchScript syntax -- need annotation and no f-string
 def _fail_load(filepath: str,
                frame_offset: int=0,
                num_frames: int=-1,
                normalize: bool=True,
                channels_first: bool=True,
                format: Optional[str]=None) -> Tuple[Tensor, int]:
    """Fallback for path-based ``load``: always raises ``RuntimeError``.

    Every argument except ``filepath`` is ignored.
    """
    message = "Failed to load audio from {}".format(filepath)
    raise RuntimeError(message)
 def _fail_load_fileobj(fileobj, *args, **kwargs):
    raise RuntimeError(f"Failed to load audio from {fileobj}")
 # Handlers invoked by `load` / `info` when the extension call returns None.
 _fallback_info = _fail_info
 _fallback_info_fileobj = _fail_info_fileobj
 _fallback_load = _fail_load
 # `load` looks up `_fallback_load_fileobj`; with only the misspelled name
 # bound, the file-object fallback path raised NameError instead of the
 # intended RuntimeError.
 _fallback_load_fileobj = _fail_load_fileobj
 # Misspelled original name, kept as an alias for backward compatibility.
 _fallback_load_filebj = _fallback_load_fileobj
@_mod_utils.requires_sox()
def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Load audio through the ``paddleaudio._paddleaudio`` extension.

    An object with a ``read`` attribute is treated as a file-like object;
    anything else is converted with ``os.fspath`` and treated as a path.
    The first element of the extension's result is wrapped in a paddle
    Tensor. If the extension returns ``None``, the matching fallback
    handler is called instead.

    Returns:
        Tuple[Tensor, int]: The tensor and the second element of the
        extension's result.
    """
    options = (frame_offset, num_frames, normalize, channels_first, format)

    if hasattr(filepath, "read"):
        loaded = paddleaudio._paddleaudio.load_audio_fileobj(filepath,
                                                             *options)
        if loaded is None:
            return _fallback_load_fileobj(filepath, *options)
        return (paddle.to_tensor(loaded[0]), loaded[1])

    filepath = os.fspath(filepath)
    loaded = paddleaudio._paddleaudio.sox_io_load_audio_file(filepath,
                                                             *options)
    if loaded is None:
        return _fallback_load(filepath, *options)
    return (paddle.to_tensor(loaded[0]), loaded[1])
@_mod_utils.requires_sox()
def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Save a tensor through the ``paddleaudio._paddleaudio`` extension.

    ``src`` is converted with ``.numpy()`` first. An object with a ``write``
    attribute is treated as a file-like object; anything else is converted
    with ``os.fspath`` and treated as a path. All remaining arguments are
    forwarded unchanged to the extension.
    """
    samples = src.numpy()
    forwarded = (samples, sample_rate, channels_first, compression, format,
                 encoding, bits_per_sample)

    if not hasattr(filepath, "write"):
        paddleaudio._paddleaudio.sox_io_save_audio_file(
            os.fspath(filepath), *forwarded)
        return
    paddleaudio._paddleaudio.save_audio_fileobj(filepath, *forwarded)
@_mod_utils.requires_sox()
 def info(
        filepath: str,
        format: Optional[str]=None, ) -> AudioInfo:
    if hasattr(filepath, "read"):
        sinfo = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
        if sinfo is not None:
            return AudioInfo(*sinfo)
        return _fallback_info_fileobj(filepath, format)
    filepath = os.fspath(filepath)
    sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format)
    if sinfo is not None:
        return AudioInfo(*sinfo)
    return _fallback_info(filepath, format)
--- a/audio/paddleaudio/backends/utils.py
+++ b/audio/paddleaudio/backends/utils.py
@ -0,0 +1,83 @@
 """Defines utilities for switching audio backends"""
 #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
 import warnings
 from typing import List
 from typing import Optional
 import paddleaudio
 from paddleaudio._internal import module_utils as _mod_utils
 from . import no_backend
 from . import soundfile_backend
 from . import sox_io_backend
 __all__ = [
    "list_audio_backends",
    "get_audio_backend",
    "set_audio_backend",
 ]
 def list_audio_backends() -> List[str]:
    """Report which audio backends can be used right now.

    Returns:
        List[str]: ``"soundfile"`` when the ``soundfile`` module is available,
        followed by ``"sox_io"`` when sox is available; empty if neither is.
    """
    probes = (
        ("soundfile", lambda: _mod_utils.is_module_available("soundfile")),
        ("sox_io", lambda: _mod_utils.is_sox_available()), )
    return [name for name, is_usable in probes if is_usable()]
 def set_audio_backend(backend: Optional[str]):
    """Select which backend module provides ``paddleaudio.save/load/info``.

    Args:
        backend (str or None): ``"sox_io"`` or ``"soundfile"``, subject to
            what :func:`list_audio_backends` reports. ``None`` installs the
            ``no_backend`` implementations instead.

    Raises:
        RuntimeError: ``backend`` is not ``None`` and is not currently listed
            as available.
        NotImplementedError: ``backend`` is listed as available but has no
            module mapped to it here.
    """
    if backend is None:
        module = no_backend
    else:
        if backend not in list_audio_backends():
            raise RuntimeError(f'Backend "{backend}" is not one of '
                               f"available backends: {list_audio_backends()}.")
        for name, candidate in (("sox_io", sox_io_backend),
                                ("soundfile", soundfile_backend)):
            if backend == name:
                module = candidate
                break
        else:
            raise NotImplementedError(f'Unexpected backend "{backend}"')
    # Rebind the three public I/O entry points on the package itself.
    for func in ("save", "load", "info"):
        setattr(paddleaudio, func, getattr(module, func))
 def _init_audio_backend():
    """Install a default backend, preferring ``soundfile`` over ``sox_io``.

    When neither is listed as available, a warning is emitted and the
    backend is set to ``None``.
    """
    available = list_audio_backends()
    for preferred in ("soundfile", "sox_io"):
        if preferred in available:
            set_audio_backend(preferred)
            return
    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
 def get_audio_backend() -> Optional[str]:
    """Identify the active backend by inspecting ``paddleaudio.load``.

    Returns:
        Optional[str]: ``None`` when ``paddleaudio.load`` is the ``no_backend``
        loader, otherwise ``"sox_io"`` or ``"soundfile"``.

    Raises:
        ValueError: ``paddleaudio.load`` matches none of the known loaders.
    """
    known = (
        (None, no_backend),
        ("sox_io", sox_io_backend),
        ("soundfile", soundfile_backend), )
    for name, module in known:
        if paddleaudio.load == module.load:
            return name
    raise ValueError("Unknown backend.")
--- a/paddlespeech/audio/compliance/init.py
+++ b/paddlespeech/audio/compliance/init.py
--- a/paddlespeech/audio/compliance/kaldi.py
+++ b/paddlespeech/audio/compliance/kaldi.py
--- a/paddlespeech/audio/compliance/librosa.py
+++ b/paddlespeech/audio/compliance/librosa.py
--- a/paddlespeech/audio/datasets/init.py
+++ b/paddlespeech/audio/datasets/init.py
--- a/paddlespeech/audio/datasets/dataset.py
+++ b/paddlespeech/audio/datasets/dataset.py
@ -16,7 +16,7 @@ from typing import List
 import numpy as np
 import paddle
-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..compliance.kaldi import fbank as kaldi_fbank
 from ..compliance.kaldi import mfcc as kaldi_mfcc
 from ..compliance.librosa import melspectrogram
--- a/paddlespeech/audio/datasets/esc50.py
+++ b/paddlespeech/audio/datasets/esc50.py
@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['ESC50']
--- a/paddlespeech/audio/datasets/gtzan.py
+++ b/paddlespeech/audio/datasets/gtzan.py
@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['GTZAN']
--- a/paddlespeech/audio/datasets/hey_snips.py
+++ b/paddlespeech/audio/datasets/hey_snips.py
--- a/paddlespeech/audio/datasets/rirs_noises.py
+++ b/paddlespeech/audio/datasets/rirs_noises.py
@ -20,8 +20,8 @@ from typing import List
 from paddle.io import Dataset
 from tqdm import tqdm
-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
-from ..backends import save as save_wav
+from ..backends.soundfile_backend import soundfile_save as save_wav
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from .dataset import feat_funcs
--- a/paddlespeech/audio/datasets/tess.py
+++ b/paddlespeech/audio/datasets/tess.py
@ -17,8 +17,8 @@ import random
 from typing import List
 from typing import Tuple
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['TESS']
--- a/paddlespeech/audio/datasets/urban_sound.py
+++ b/paddlespeech/audio/datasets/urban_sound.py
@ -16,8 +16,8 @@ import os
 from typing import List
 from typing import Tuple
 from ..utils import DATA_HOME
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['UrbanSound8K']
--- a/paddlespeech/audio/datasets/voxceleb.py
+++ b/paddlespeech/audio/datasets/voxceleb.py
@ -23,7 +23,7 @@ from paddle.io import Dataset
 from pathos.multiprocessing import Pool
 from tqdm import tqdm
-from ..backends import load as load_audio
+from ..backends.soundfile_backend import soundfile_load as load_audio
 from ..utils import DATA_HOME
 from ..utils import decompress
 from ..utils.download import download_and_decompress
--- a/paddlespeech/audio/features/init.py
+++ b/paddlespeech/audio/features/init.py
--- a/paddlespeech/audio/features/layers.py
+++ b/paddlespeech/audio/features/layers.py
--- a/paddlespeech/audio/functional/init.py
+++ b/paddlespeech/audio/functional/init.py
--- a/paddlespeech/audio/functional/functional.py
+++ b/paddlespeech/audio/functional/functional.py
--- a/paddlespeech/audio/functional/window.py
+++ b/paddlespeech/audio/functional/window.py
@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -18,127 +18,156 @@ from typing import Union
 import paddle
 from paddle import Tensor
 __all__ = [
    'get_window',
 ]
 class WindowFunctionRegister:
    """Registry that stores functions under their ``__name__``."""

    def __init__(self):
        # Maps a function's ``__name__`` to the function object.
        self._functions_dict = {}

    def register(self):
        """Build a decorator that records a function and returns it as-is."""

        def _record(func):
            self._functions_dict[func.__name__] = func
            return func

        return _record

    def get(self, name):
        """Return the function registered as ``name``.

        Raises:
            KeyError: nothing has been registered under ``name``.
        """
        return self._functions_dict[name]


 # Module-wide registry instance used by the decorators below.
 window_function_register = WindowFunctionRegister()
@window_function_register.register()
 def _cat(x: List[Tensor], data_type: str) -> Tensor:
    l = [paddle.to_tensor(_, data_type) for _ in x]
    return paddle.concat(l)
@window_function_register.register()
 def _acosh(x: Union[Tensor, float]) -> Tensor:
    if isinstance(x, float):
        return math.log(x + math.sqrt(x**2 - 1))
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
@window_function_register.register()
 def _extend(M: int, sym: bool) -> bool:
-    """Extend window by 1 sample if needed for DFT-even symmetry. """
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
    if not sym:
        return M + 1, True
    else:
        return M, False
@window_function_register.register()
 def _len_guards(M: int) -> bool:
-    """Handle small or incorrect window lengths. """
+    """Handle small or incorrect window lengths."""
    if int(M) != M or M < 0:
        raise ValueError('Window length M must be a non-negative integer')
    return M <= 1
@window_function_register.register()
 def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Truncate window by 1 sample if needed for DFT-even symmetry. """
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
    if needed:
        return w[:-1]
    else:
        return w
-def _general_gaussian(M: int, p, sig, sym: bool=True,
+@window_function_register.register()
-                      dtype: str='float64') -> Tensor:
+def _general_gaussian(
    M: int, p, sig, sym: bool = True, dtype: str = 'float64'
 ) -> Tensor:
    """Compute a window with a generalized Gaussian shape.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
-    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+    w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p))
    return _truncate(w, needs_trunc)
-def _general_cosine(M: int, a: float, sym: bool=True,
+@window_function_register.register()
-                    dtype: str='float64') -> Tensor:
+def _general_cosine(
    M: int, a: float, sym: bool = True, dtype: str = 'float64'
 ) -> Tensor:
    """Compute a generic weighted sum of cosine terms window.
    This function is consistent with scipy.signal.windows.general_cosine().
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
-    w = paddle.zeros((M, ), dtype=dtype)
+    w = paddle.zeros((M,), dtype=dtype)
    for k in range(len(a)):
        w += a[k] * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
-def _general_hamming(M: int, alpha: float, sym: bool=True,
+@window_function_register.register()
-                     dtype: str='float64') -> Tensor:
+def _general_hamming(
    M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
 ) -> Tensor:
    """Compute a generalized Hamming window.
    This function is consistent with scipy.signal.windows.general_hamming()
    """
-    return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+    return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype)
-def _taylor(M: int,
+@window_function_register.register()
-            nbar=4,
+def _taylor(
-            sll=30,
+    M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
-            norm=True,
+) -> Tensor:
            sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
-    B = 10**(sll / 20)
+    B = 10 ** (sll / 20)
    A = _acosh(B) / math.pi
-    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
    ma = paddle.arange(1, nbar, dtype=dtype)
-    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    Fm = paddle.empty((nbar - 1,), dtype=dtype)
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
-        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
+        numer = signs[mi] * paddle.prod(
-                                                           ))
+            1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
        )
        if mi == 0:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
-            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
+            denom = (
-                mi] / m2[mi + 1:])
+                2
                * paddle.prod(1 - m2[mi] / m2[:mi])
                * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
            )
        Fm[mi] = numer / denom
    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
-            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
        )
    w = W(paddle.arange(0, M, dtype=dtype))
@ -150,7 +179,8 @@ def _taylor(M: int,
    return _truncate(w, needs_trunc)
-def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
 def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hamming window.
    The Hamming window is a taper formed by using a raised cosine with
    non-zero endpoints, optimized to minimize the nearest side lobe.
@ -158,7 +188,8 @@ def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.54, sym, dtype=dtype)
-def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
 def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hann window.
    The Hann window is a taper formed by using a raised cosine or sine-squared
    with ends that touch zero.
@ -166,15 +197,18 @@ def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_hamming(M, 0.5, sym, dtype=dtype)
-def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
 def _tukey(
    M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
 ) -> Tensor:
    """Compute a Tukey window.
    The Tukey window is also known as a tapered cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    if alpha <= 0:
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    elif alpha >= 1.0:
        return hann(M, sym=sym)
@ -182,53 +216,48 @@ def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
-    n1 = n[0:width + 1]
+    n1 = n[0 : width + 1]
-    n2 = n[width + 1:M - width - 1]
+    n2 = n[width + 1 : M - width - 1]
-    n3 = n[M - width - 1:]
+    n3 = n[M - width - 1 :]
    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
-    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
+    w3 = 0.5 * (
-                                          (M - 1))))
+        1
        + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
    )
    w = paddle.concat([w1, w2, w3])
    return _truncate(w, needs_trunc)
-def _kaiser(M: int, beta: float, sym: bool=True,
+@window_function_register.register()
-            dtype: str='float64') -> Tensor:
+def _gaussian(
-    """Compute a Kaiser window.
+    M: int, std: float, sym: bool = True, dtype: str = 'float64'
-    The Kaiser window is a taper formed by using a Bessel function.
+) -> Tensor:
    """
    raise NotImplementedError()
 def _gaussian(M: int, std: float, sym: bool=True,
              dtype: str='float64') -> Tensor:
    """Compute a Gaussian window.
    The Gaussian window has a Gaussian shape defined by the standard deviation (std).
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    sig2 = 2 * std * std
-    w = paddle.exp(-n**2 / sig2)
+    w = paddle.exp(-(n**2) / sig2)
    return _truncate(w, needs_trunc)
-def _exponential(M: int,
+@window_function_register.register()
-                 center=None,
+def _exponential(
-                 tau=1.,
+    M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
-                 sym: bool=True,
+) -> Tensor:
-                 dtype: str='float64') -> Tensor:
+    """Compute an exponential (or Poisson) window."""
    """Compute an exponential (or Poisson) window. """
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    if center is None:
@ -240,11 +269,11 @@ def _exponential(M: int,
    return _truncate(w, needs_trunc)
-def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
-    """Compute a triangular window.
+def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
-    """
+    """Compute a triangular window."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
@ -258,23 +287,26 @@ def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _truncate(w, needs_trunc)
-def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
 def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Bohman window.
    The Bohman window is the autocorrelation of a cosine window.
    """
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
-        math.pi * fac)
+        math.pi * fac
    )
    w = _cat([0, w, 0], dtype)
    return _truncate(w, needs_trunc)
-def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
 def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Blackman window.
    The Blackman window is a taper formed by using the first three terms of
    a summation of cosines. It was designed to have close to the minimal
@ -284,31 +316,44 @@ def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
-def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+@window_function_register.register()
-    """Compute a window with a simple cosine shape.
+def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
-    """
+    """Compute a window with a simple cosine shape."""
    if _len_guards(M):
-        return paddle.ones((M, ), dtype=dtype)
+        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
-    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5))
    return _truncate(w, needs_trunc)
-def get_window(window: Union[str, Tuple[str, float]],
+def get_window(
    window: Union[str, Tuple[str, float]],
    win_length: int,
-               fftbins: bool=True,
+    fftbins: bool = True,
-               dtype: str='float64') -> Tensor:
+    dtype: str = 'float64',
 ) -> Tensor:
    """Return a window of a given length and type.
    Args:
-        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
    Returns:
        Tensor: The window represented as a tensor.
    Examples:
        .. code-block:: python
            import paddle
            n_fft = 512
            cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
            std = 7
            gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
    """
    sym = not fftbins
@ -319,19 +364,22 @@ def get_window(window: Union[str, Tuple[str, float]],
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
-            raise ValueError("The '" + window + "' window needs one or "
+            raise ValueError(
-                             "more parameters -- pass a tuple.")
+                "The '" + window + "' window needs one or "
                "more parameters -- pass a tuple."
            )
        else:
            winstr = window
    else:
-        raise ValueError("%s as window type is not supported." %
+        raise ValueError(
-                         str(type(window)))
+            "%s as window type is not supported." % str(type(window))
        )
    try:
-        winfunc = eval('_' + winstr)
+        winfunc = window_function_register.get('_' + winstr)
    except KeyError as e:
        raise ValueError("Unknown window type.") from e
-    params = (win_length, ) + args
+    params = (win_length,) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
--- a/tests/unit/audio/backends/soundfile/init.py
+++ b/tests/unit/audio/backends/soundfile/init.py
@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .kaldi import fbank
 from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
@ -0,0 +1,132 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddleaudio
 from paddleaudio._internal import module_utils
 __all__ = [
    'fbank',
    'pitch',
 ]
@module_utils.requires_kaldi()
 def fbank(
        wav,
        samp_freq: int=16000,
        frame_shift_ms: float=10.0,
        frame_length_ms: float=25.0,
        dither: float=0.0,
        preemph_coeff: float=0.97,
        remove_dc_offset: bool=True,
        window_type: str='povey',
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        allow_downsample: bool=False,
        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
        high_freq: float=0,
        vtln_low: float=100,
        vtln_high: float=-500,
        debug_mel: bool=False,
        htk_mode: bool=False,
        use_energy: bool=False,  # fbank opts
        energy_floor: float=0.0,
        raw_energy: bool=True,
        htk_compat: bool=False,
        use_log_fbank: bool=True,
        use_power: bool=True):
    frame_opts = paddleaudio._paddleaudio.FrameExtractionOptions()
    mel_opts = paddleaudio._paddleaudio.MelBanksOptions()
    fbank_opts = paddleaudio._paddleaudio.FbankOptions()
    frame_opts.samp_freq = samp_freq
    frame_opts.frame_shift_ms = frame_shift_ms
    frame_opts.frame_length_ms = frame_length_ms
    frame_opts.dither = dither
    frame_opts.preemph_coeff = preemph_coeff
    frame_opts.remove_dc_offset = remove_dc_offset
    frame_opts.window_type = window_type
    frame_opts.round_to_power_of_two = round_to_power_of_two
    frame_opts.blackman_coeff = blackman_coeff
    frame_opts.snip_edges = snip_edges
    frame_opts.allow_downsample = allow_downsample
    frame_opts.allow_upsample = allow_upsample
    frame_opts.max_feature_vectors = max_feature_vectors
    mel_opts.num_bins = num_bins
    mel_opts.low_freq = low_freq
    mel_opts.high_freq = high_freq
    mel_opts.vtln_low = vtln_low
    mel_opts.vtln_high = vtln_high
    mel_opts.debug_mel = debug_mel
    mel_opts.htk_mode = htk_mode
    fbank_opts.use_energy = use_energy
    fbank_opts.energy_floor = energy_floor
    fbank_opts.raw_energy = raw_energy
    fbank_opts.htk_compat = htk_compat
    fbank_opts.use_log_fbank = use_log_fbank
    fbank_opts.use_power = use_power
    feat = paddleaudio._paddleaudio.ComputeFbank(frame_opts, mel_opts,
                                                 fbank_opts, wav)
    return feat
@module_utils.requires_kaldi()
 def pitch(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          preemph_coeff: float=0.0,
          min_f0: int=50,
          max_f0: int=400,
          soft_min_f0: float=10.0,
          penalty_factor: float=0.1,
          lowpass_cutoff: int=1000,
          resample_freq: int=4000,
          delta_pitch: float=0.005,
          nccf_ballast: int=7000,
          lowpass_filter_width: int=1,
          upsample_filter_width: int=5,
          max_frames_latency: int=0,
          frames_per_chunk: int=0,
          simulate_first_pass_online: bool=False,
          recompute_frame: int=500,
          nccf_ballast_online: bool=False,
          snip_edges: bool=True):
    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
    pitch_opts.samp_freq = samp_freq
    pitch_opts.frame_shift_ms = frame_shift_ms
    pitch_opts.frame_length_ms = frame_length_ms
    pitch_opts.preemph_coeff = preemph_coeff
    pitch_opts.min_f0 = min_f0
    pitch_opts.max_f0 = max_f0
    pitch_opts.soft_min_f0 = soft_min_f0
    pitch_opts.penalty_factor = penalty_factor
    pitch_opts.lowpass_cutoff = lowpass_cutoff
    pitch_opts.resample_freq = resample_freq
    pitch_opts.delta_pitch = delta_pitch
    pitch_opts.nccf_ballast = nccf_ballast
    pitch_opts.lowpass_filter_width = lowpass_filter_width
    pitch_opts.upsample_filter_width = upsample_filter_width
    pitch_opts.max_frames_latency = max_frames_latency
    pitch_opts.frames_per_chunk = frames_per_chunk
    pitch_opts.simulate_first_pass_online = simulate_first_pass_online
    pitch_opts.recompute_frame = recompute_frame
    pitch_opts.nccf_ballast_online = nccf_ballast_online
    pitch_opts.snip_edges = snip_edges
    pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
    return pitch
--- a/paddlespeech/audio/metric/init.py
+++ b/paddlespeech/audio/metric/init.py
--- a/paddlespeech/audio/metric/eer.py
+++ b/paddlespeech/audio/metric/eer.py
--- a/audio/paddleaudio/sox_effects/init.py
+++ b/audio/paddleaudio/sox_effects/init.py
@ -0,0 +1,21 @@
 from paddleaudio._internal import module_utils as _mod_utils
 from .sox_effects import apply_effects_file
 from .sox_effects import apply_effects_tensor
 from .sox_effects import effect_names
 from .sox_effects import init_sox_effects
 from .sox_effects import shutdown_sox_effects
 # When sox is available, initialise the sox effects at import time and
 # register the matching shutdown to run at interpreter exit.
 if _mod_utils.is_sox_available():
    import atexit
    # Initialise first; the shutdown hook is registered only after this call
    # returns.
    init_sox_effects()
    atexit.register(shutdown_sox_effects)
 __all__ = [
    "init_sox_effects",
    "shutdown_sox_effects",
    "effect_names",
    "apply_effects_tensor",
    "apply_effects_file",
 ]
--- a/audio/paddleaudio/sox_effects/sox_effects.py
+++ b/audio/paddleaudio/sox_effects/sox_effects.py
@ -0,0 +1,241 @@
 import os
 from typing import List
 from typing import Optional
 from typing import Tuple
 import paddle
 import paddleaudio
 from paddleaudio._internal import module_utils as _mod_utils
 from paddleaudio.utils.sox_utils import list_effects
 #code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
@_mod_utils.requires_sox()
 def init_sox_effects():
    """Initialize resources required to use sox effects.
    Note:
        You do not need to call this function manually. It is called automatically.
    Once initialized, you do not need to call this function again across the multiple uses of
    sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet.
    Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing
    again will result in error.
    """
    paddleaudio._paddleaudio.sox_effects_initialize_sox_effects()
@_mod_utils.requires_sox()
 def shutdown_sox_effects():
    """Clean up resources required to use sox effects.
    Note:
        You do not need to call this function manually. It is called automatically.
    It is safe to call this function multiple times.
    Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
    initializing again will result in error.
    """
    paddleaudio._paddleaudio.sox_effects_shutdown_sox_effects()
@_mod_utils.requires_sox()
 def effect_names() -> List[str]:
    """Gets list of valid sox effect names
    Returns:
        List[str]: list of available effect names.
    Example
        >>> paddleaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    return list(list_effects().keys())
@_mod_utils.requires_sox()
def apply_effects_tensor(
        tensor: paddle.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool=True, ) -> Tuple[paddle.Tensor, int]:
    """Run a chain of SoX effects over an in-memory waveform.

    .. devices:: CPU

    Note:
        This function only works on CPU Tensors.

        It behaves much like the ``sox`` command, with some differences. The
        ``sox`` command inserts certain effects automatically (for example a
        ``rate`` effect after ``speed``, ``pitch`` and others), whereas this
        function applies exactly the effects it is given. To really apply a
        ``speed`` effect you therefore also have to pass a ``rate`` effect
        with the desired sampling rate.

    Args:
        tensor (paddle.Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate of ``tensor``.
        effects (List[List[str]]): Effects to apply; each entry is the effect
            name followed by its arguments, all given as strings.
        channels_first (bool, optional): Whether ``tensor`` is laid out as
            `[channels, time]` (``True``) or `[time, channels]` (``False``).

    Returns:
        (Tensor, int): The processed Tensor and its sample rate.
        The returned Tensor keeps the ``dtype`` and channel order of the
        input. Its shape and the sample rate may differ from the input,
        depending on the effects applied.

    Raises:
        RuntimeError: if the native call returns ``None``.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> sample_rate
        8000
    """
    samples = tensor.numpy()
    apply_fn = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor
    outcome = apply_fn(samples, sample_rate, effects, channels_first)
    if outcome is None:
        raise RuntimeError("Failed to apply sox effect")
    return (paddle.to_tensor(outcome[0]), outcome[1])
@_mod_utils.requires_sox()
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
    """Load an audio file, run SoX effects over it and return the result as a Tensor.

    Note:
        This function behaves much like the ``sox`` command, with some
        differences. The ``sox`` command inserts certain effects automatically
        (for example a ``rate`` effect after ``speed``, ``pitch`` etc.),
        whereas this function applies exactly the effects it is given. To
        really apply a ``speed`` effect you therefore also have to pass a
        ``rate`` effect with the desired sampling rate, because internally
        ``speed`` only alters the sampling rate and leaves samples untouched.

    Args:
        path (path-like object or file-like object):
            Source of the audio data. An object with a ``read`` attribute is
            handled as a file object; anything else is converted with
            ``os.fspath``.
        effects (List[List[str]]): Effects to apply; each entry is the effect
            name followed by its arguments, all given as strings.
        normalize (bool, optional):
            When ``True``, this function always returns ``float32`` and sample
            values are normalized to ``[-1.0, 1.0]``.
            If the input file is integer WAV, ``False`` changes the resulting
            Tensor type to an integer type. The argument has no effect for
            formats other than integer WAV.
        channels_first (bool, optional): When ``True`` the returned Tensor has
            dimension `[channel, time]`, otherwise `[time, channel]`.
        format (str or None, optional):
            Overrides format detection with the given format. This can help
            when libsox cannot infer the format from the header or extension.

    Returns:
        (Tensor, int): The processed Tensor and its sample rate.
        With ``normalize=True`` the Tensor is always ``float32``.
        With ``normalize=False`` and an integer WAV input, the Tensor has the
        corresponding integer type (24 bit integer is not supported).
        With ``channels_first=True`` the Tensor has dimension
        `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        RuntimeError: if the native call returns ``None``.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
        >>> sample_rate
        8000

    Example - Apply random speed perturbation to dataset
        >>>
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(paddle.io.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
        ...         speed = 0.5 + 1.5 * random.random()
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = paddle.io.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        >>>     pass
    """
    if hasattr(path, "read"):
        # File-like objects go through the dedicated file-object binding.
        loaded = paddleaudio._paddleaudio.apply_effects_fileobj(
            path, effects, normalize, channels_first, format)
    else:
        path = os.fspath(path)
        loaded = paddleaudio._paddleaudio.sox_effects_apply_effects_file(
            path, effects, normalize, channels_first, format)
    if loaded is None:
        raise RuntimeError("Failed to load audio from {}".format(path))
    return (paddle.to_tensor(loaded[0]), loaded[1])
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@ -0,0 +1,217 @@
 # With MSVC, export every symbol of the DLLs built here automatically.
 if(MSVC)
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()

 # On macOS, name shared libraries "*.so" as well.
 if(APPLE)
   set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
 endif()
 ################################################################################
 # libpaddleaudio
 ################################################################################
 # Build inputs of libpaddleaudio. The link-library and compile-definition
 # lists start out empty.
 set(LIBPADDLEAUDIO_SOURCES utils.cpp)
 set(LIBPADDLEAUDIO_INCLUDE_DIRS ${PROJECT_SOURCE_DIR})
 set(LIBPADDLEAUDIO_LINK_LIBRARIES)
 set(LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
 #------------------------------------------------------------------------------#
 # START OF CUSTOMIZATION LOGICS
 #------------------------------------------------------------------------------#
 # SoX backend: link libsox and define INCLUDE_SOX.
 # No additional library sources are required for it.
 if(BUILD_SOX)
   list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libsox)
   list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS INCLUDE_SOX)
 endif()

 # Kaldi backend: link libkaldi and define INCLUDE_KALDI and
 # COMPILE_WITHOUT_OPENFST.
 if(BUILD_KALDI)
   list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libkaldi)
   list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS
     INCLUDE_KALDI
     COMPILE_WITHOUT_OPENFST)
 endif()
 #------------------------------------------------------------------------------#
 # END OF CUSTOMIZATION LOGICS
 #------------------------------------------------------------------------------#
 # define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
 #
 # Creates the SHARED library target <name> with an empty file-name prefix
 # (and a ".pyd" suffix under MSVC) and installs it into "lib".
 # Every list argument has to be passed as ONE quoted, semicolon-separated
 # value, e.g. "${MY_SOURCES}".
 #   source         - source files of the library
 #   include_dirs   - PRIVATE include directories
 #   link_libraries - libraries handed to target_link_libraries()
 #   compile_defs   - PRIVATE compile definitions
 function(define_library name source include_dirs link_libraries compile_defs)
   add_library(${name} SHARED ${source})
   target_include_directories(${name} PRIVATE ${include_dirs})
   target_link_libraries(${name} ${link_libraries})
   target_compile_definitions(${name} PRIVATE ${compile_defs})

   if(MSVC)
     set_target_properties(${name} PROPERTIES PREFIX "" SUFFIX ".pyd")
   else()
     set_target_properties(${name} PROPERTIES PREFIX "")
   endif()

   install(TARGETS ${name}
     LIBRARY DESTINATION lib
     RUNTIME DESTINATION lib)  # RUNTIME covers Windows
 endfunction()
 # Core shared library, built from the variables assembled above.
 define_library(
   libpaddleaudio
   "${LIBPADDLEAUDIO_SOURCES}"
   "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
   "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
   "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
 )

 if(APPLE)
   # Rewrite the recorded libgcc_s path so it is looked up next to the
   # library at load time. $<TARGET_FILE:...> names the real artifact, so
   # the step does not depend on the output directory or generator.
   add_custom_command(
     TARGET libpaddleaudio POST_BUILD
     COMMAND install_name_tool -change
             "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
             "@loader_path/libgcc_s.1.1.dylib"
             "$<TARGET_FILE:libpaddleaudio>"
     VERBATIM)
   set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
 else()
   if(UNIX)
     # Installed library resolves its dependencies from its own directory.
     set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
   endif()
   # --no-as-needed keeps the link dependency on libpaddleaudio even if
   # the linker sees no referenced symbols.
   set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
 endif()
  ################################################################################
 # _paddleaudio.so
 ################################################################################
 if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
 # On Windows, locate the Python development files for exactly
 # ${PYTHON_VERSION} and remember the imported target in ADDITIONAL_ITEMS.
 if(WIN32)
   find_package(
     Python3 ${PYTHON_VERSION} EXACT
     COMPONENTS Development)
   set(ADDITIONAL_ITEMS Python3::Python)
 endif()
 # define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
 #
 # Creates the SHARED library target <name> with an empty file-name prefix
 # (and a ".pyd" suffix under MSVC) and installs it into the root of the
 # install prefix. Every list argument has to be passed as ONE quoted,
 # semicolon-separated value.
 #   sources      - source files of the extension
 #   include_dirs - extra PRIVATE include directories, added after the
 #                  project, Python and pybind11 include directories
 #   libraries    - libraries to link, followed by ${PYTHON_LIBRARY} and
 #                  ${ADDITIONAL_ITEMS}
 #   definitions  - PRIVATE compile definitions
 function(define_extension name sources include_dirs libraries definitions)
   add_library(${name} SHARED ${sources})
   target_compile_definitions(${name} PRIVATE "${definitions}")
   target_include_directories(
     ${name}
     PRIVATE
       ${PROJECT_SOURCE_DIR}
       ${Python_INCLUDE_DIR}
       ${pybind11_INCLUDE_DIR}
       ${include_dirs})
   target_link_libraries(${name} ${libraries} ${PYTHON_LIBRARY} ${ADDITIONAL_ITEMS})

   set_target_properties(${name} PROPERTIES PREFIX "")
   if(MSVC)
     set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
   endif()
   if(APPLE)
     # Leave undefined symbols to be resolved when the module is loaded.
     # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
     # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
     set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
   endif()

   install(TARGETS ${name}
     LIBRARY DESTINATION .
     RUNTIME DESTINATION .)  # RUNTIME covers Windows
 endfunction()
 # Sources of the _paddleaudio extension; the pybind entry point is always
 # built, backend bindings are added per enabled option.
 set(EXTENSION_SOURCES pybind/pybind.cpp)

 #----------------------------------------------------------------------------#
 # START OF CUSTOMIZATION LOGICS
 #----------------------------------------------------------------------------#
 if(BUILD_SOX)
   list(APPEND EXTENSION_SOURCES
     pybind/sox/effects.cpp
     pybind/sox/effects_chain.cpp
     pybind/sox/io.cpp
     pybind/sox/types.cpp
     pybind/sox/utils.cpp)
 endif()

 if(BUILD_KALDI)
   list(APPEND EXTENSION_SOURCES
     pybind/kaldi/kaldi_feature_wrapper.cc
     pybind/kaldi/kaldi_feature.cc)
 endif()
 #----------------------------------------------------------------------------#
 # END OF CUSTOMIZATION LOGICS
 #----------------------------------------------------------------------------#

 # The extension links libpaddleaudio, gets no extra include directories and
 # reuses the compile definitions of the library.
 define_extension(
   _paddleaudio
   "${EXTENSION_SOURCES}"
   ""
   libpaddleaudio
   "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}")
 # if(BUILD_CTC_DECODER)
 #   set(
 #     DECODER_EXTENSION_SOURCES
 #     decoder/bindings/pybind.cpp
 #     )
 #   define_extension(
 #     _paddleaudio_decoder
 #     "${DECODER_EXTENSION_SOURCES}"
 #     ""
 #     "libpaddleaudio_decoder"
 #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
 #     )
 # endif()
 # if(USE_FFMPEG)
 #   set(
 #     FFMPEG_EXTENSION_SOURCES
 #     ffmpeg/pybind/typedefs.cpp
 #     ffmpeg/pybind/pybind.cpp
 #     ffmpeg/pybind/stream_reader.cpp
 #     )
 #   define_extension(
 #     _paddleaudio_ffmpeg
 #     "${FFMPEG_EXTENSION_SOURCES}"
 #     "${FFMPEG_INCLUDE_DIRS}"
 #     "libpaddleaudio_ffmpeg"
 #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
 #     )
 # endif()
 endif()
 # The _paddleaudio target only exists when
 # BUILD_PADDLEAUDIO_PYTHON_EXTENSION is enabled. Without this guard the
 # commands below would refer to a missing target and abort configuration.
 if(TARGET _paddleaudio)
   if(APPLE)
     # Rewrite the recorded libgcc_s path so it is looked up in "lib" next
     # to the module at load time. $<TARGET_FILE:...> names the real
     # artifact regardless of output directory or generator.
     add_custom_command(
       TARGET _paddleaudio POST_BUILD
       COMMAND install_name_tool -change
               "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib"
               "@loader_path/lib/libgcc_s.1.1.dylib"
               "$<TARGET_FILE:_paddleaudio>"
       VERBATIM)
   endif()
   if(UNIX AND NOT APPLE)
     # Installed module resolves its dependencies from the sibling "lib"
     # directory.
     set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
   endif()
 endif()
--- a/audio/paddleaudio/src/optional/COPYING
+++ b/audio/paddleaudio/src/optional/COPYING
@ -0,0 +1,121 @@
 Creative Commons Legal Code
 CC0 1.0 Universal
    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
    HEREUNDER.
 Statement of Purpose
 The laws of most jurisdictions throughout the world automatically confer
 exclusive Copyright and Related Rights (defined below) upon the creator
 and subsequent owner(s) (each and all, an "owner") of an original work of
 authorship and/or a database (each, a "Work").
 Certain owners wish to permanently relinquish those rights to a Work for
 the purpose of contributing to a commons of creative, cultural and
 scientific works ("Commons") that the public can reliably and without fear
 of later claims of infringement build upon, modify, incorporate in other
 works, reuse and redistribute as freely as possible in any form whatsoever
 and for any purposes, including without limitation commercial purposes.
 These owners may contribute to the Commons to promote the ideal of a free
 culture and the further production of creative, cultural and scientific
 works, or to gain reputation or greater distribution for their Work in
 part through the use and efforts of others.
 For these and/or other purposes and motivations, and without any
 expectation of additional consideration or compensation, the person
 associating CC0 with a Work (the "Affirmer"), to the extent that he or she
 is an owner of Copyright and Related Rights in the Work, voluntarily
 elects to apply CC0 to the Work and publicly distribute the Work under its
 terms, with knowledge of his or her Copyright and Related Rights in the
 Work and the meaning and intended legal effect of CC0 on those rights.
 1. Copyright and Related Rights. A Work made available under CC0 may be
 protected by copyright and related or neighboring rights ("Copyright and
 Related Rights"). Copyright and Related Rights include, but are not
 limited to, the following:
  i. the right to reproduce, adapt, distribute, perform, display,
     communicate, and translate a Work;
 ii. moral rights retained by the original author(s) and/or performer(s);
 iii. publicity and privacy rights pertaining to a person's image or
     likeness depicted in a Work;
 iv. rights protecting against unfair competition in regards to a Work,
     subject to the limitations in paragraph 4(a), below;
  v. rights protecting the extraction, dissemination, use and reuse of data
     in a Work;
 vi. database rights (such as those arising under Directive 96/9/EC of the
     European Parliament and of the Council of 11 March 1996 on the legal
     protection of databases, and under any national implementation
     thereof, including any amended or successor version of such
     directive); and
 vii. other similar, equivalent or corresponding rights throughout the
     world based on applicable law or treaty, and any national
     implementations thereof.
 2. Waiver. To the greatest extent permitted by, but not in contravention
 of, applicable law, Affirmer hereby overtly, fully, permanently,
 irrevocably and unconditionally waives, abandons, and surrenders all of
 Affirmer's Copyright and Related Rights and associated claims and causes
 of action, whether now known or unknown (including existing as well as
 future claims and causes of action), in the Work (i) in all territories
 worldwide, (ii) for the maximum duration provided by applicable law or
 treaty (including future time extensions), (iii) in any current or future
 medium and for any number of copies, and (iv) for any purpose whatsoever,
 including without limitation commercial, advertising or promotional
 purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
 member of the public at large and to the detriment of Affirmer's heirs and
 successors, fully intending that such Waiver shall not be subject to
 revocation, rescission, cancellation, termination, or any other legal or
 equitable action to disrupt the quiet enjoyment of the Work by the public
 as contemplated by Affirmer's express Statement of Purpose.
 3. Public License Fallback. Should any part of the Waiver for any reason
 be judged legally invalid or ineffective under applicable law, then the
 Waiver shall be preserved to the maximum extent permitted taking into
 account Affirmer's express Statement of Purpose. In addition, to the
 extent the Waiver is so judged Affirmer hereby grants to each affected
 person a royalty-free, non transferable, non sublicensable, non exclusive,
 irrevocable and unconditional license to exercise Affirmer's Copyright and
 Related Rights in the Work (i) in all territories worldwide, (ii) for the
 maximum duration provided by applicable law or treaty (including future
 time extensions), (iii) in any current or future medium and for any number
 of copies, and (iv) for any purpose whatsoever, including without
 limitation commercial, advertising or promotional purposes (the
 "License"). The License shall be deemed effective as of the date CC0 was
 applied by Affirmer to the Work. Should any part of the License for any
 reason be judged legally invalid or ineffective under applicable law, such
 partial invalidity or ineffectiveness shall not invalidate the remainder
 of the License, and in such case Affirmer hereby affirms that he or she
 will not (i) exercise any of his or her remaining Copyright and Related
 Rights in the Work or (ii) assert any associated claims and causes of
 action with respect to the Work, in either case contrary to Affirmer's
 express Statement of Purpose.
 4. Limitations and Disclaimers.
 a. No trademark or patent rights held by Affirmer are waived, abandoned,
    surrendered, licensed or otherwise affected by this document.
 b. Affirmer offers the Work as-is and makes no representations or
    warranties of any kind concerning the Work, express, implied,
    statutory or otherwise, including without limitation warranties of
    title, merchantability, fitness for a particular purpose, non
    infringement, or the absence of latent or other defects, accuracy, or
    the present or absence of errors, whether or not discoverable, all to
    the greatest extent permissible under applicable law.
 c. Affirmer disclaims responsibility for clearing rights of other persons
    that may apply to the Work or any use thereof, including without
    limitation any person's Copyright and Related Rights in the Work.
    Further, Affirmer disclaims responsibility for obtaining any necessary
    consents, permissions or other rights required for any use of the
    Work.
 d. Affirmer understands and acknowledges that Creative Commons is not a
    party to this document and has no duty or obligation with respect to
    this CC0 or use of the Work.
--- a/audio/paddleaudio/src/optional/optional.hpp
+++ b/audio/paddleaudio/src/optional/optional.hpp
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@ -0,0 +1,49 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h"
 #include "feat/feature-window.h"
 namespace paddleaudio {
 namespace kaldi {
 namespace py = pybind11;
 // Chunk-wise feature extractor built on a feature computer `F`.
 //
 // `F` has to provide a nested `Options` type with a `frame_opts` member, be
 // constructible from `Options`, and offer Dim(), GetFrameOptions(),
 // NeedRawLogEnergy() and Compute(); these are the members the
 // implementation in feature_common_inl.h uses.
 template <class F>
 class StreamingFeatureTpl {
   public:
     using Options = typename F::Options;

     StreamingFeatureTpl(const Options& opts);

     // Appends `wav` to the samples kept from earlier calls and writes the
     // features of the combined signal to `feats`, flattened frame by frame.
     // Returns false when `wav` is empty.
     bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
                         ::kaldi::Vector<::kaldi::BaseFloat>* feats);

     // Discards the samples kept from earlier ComputeFeature() calls.
     void Reset() { remained_wav_.Resize(0); }

     // Feature dimension per frame, as reported by the computer.
     int Dim() { return computer_.Dim(); }

   private:
     // Extracts every frame contained in `waves` into `feats`.
     bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
                  ::kaldi::Vector<::kaldi::BaseFloat>* feats);

     // NOTE: members are initialized in exactly this order.
     Options opts_;
     ::kaldi::FeatureWindowFunction window_function_;
     ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;  // carried-over samples
     F computer_;
 };
 }  // namespace kaldi
 }  // namespace paddleaudio
 #include "feature_common_inl.h"
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@ -0,0 +1,93 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "base/kaldi-common.h"
 namespace paddleaudio {
 namespace kaldi {
 // Stores the options and builds the window function and the feature
 // computer from them.
 //
 // The initializer list follows the declaration order of the members
 // (opts_, window_function_, computer_). That is the order in which C++
 // initializes them regardless of how the list is written, so listing them
 // this way avoids -Wreorder warnings without changing behavior.
 template <class F>
 StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
     : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {
     // NOTE(review): the window is built from opts.frame_opts instead of
     // computer_.GetFrameOptions(); the original remark ("the opt set to
     // zero") suggests the latter yielded zeroed options here - confirm.
 }
 // Feeds one chunk of audio into the extractor.
 //
 // `wav` is appended to the samples left over from previous calls. The
 // samples beyond the last frame shift are cached for the next call, then
 // features are computed for the combined signal.
 //
 //   wav   - new samples; an empty chunk is rejected.
 //   feats - receives the features, flattened frame by frame.
 //
 // Returns false when `wav` is empty or when Compute() reports that the
 // combined signal is too short for a frame. In the latter case the samples
 // have still been cached. Returns true otherwise.
 template <class F>
 bool StreamingFeatureTpl<F>::ComputeFeature(
     const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
     ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
     const ::kaldi::int32 wav_len = wav.Dim();
     if (wav_len == 0) return false;

     // Prepend the samples remaining from earlier calls.
     const ::kaldi::int32 left_len = remained_wav_.Dim();
     ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
     waves.Range(0, left_len).CopyFromVec(remained_wav_);
     waves.Range(left_len, wav_len).CopyFromVec(wav);

     // Cache everything after the last complete frame shift.
     // NOTE(review): assumes frame_shift * num_frames <= waves.Dim();
     // confirm this also holds when snip_edges is disabled.
     const ::kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     const ::kaldi::int32 num_frames =
         ::kaldi::NumFrames(waves.Dim(), frame_opts);
     const ::kaldi::int32 consumed = frame_opts.WindowShift() * num_frames;
     const ::kaldi::int32 left_samples = waves.Dim() - consumed;
     remained_wav_.Resize(left_samples);
     remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));

     // Propagate the result instead of reporting success unconditionally.
     return Compute(waves, feats);
 }
 // Computes the features of every frame contained in `waves`.
 //
 //   waves - input samples.
 //   feats - resized to num_frames * Dim() and filled frame by frame.
 //
 // Returns false, leaving `feats` untouched, when `waves` holds fewer
 // samples than one analysis window; true otherwise.
 template <class F>
 bool StreamingFeatureTpl<F>::Compute(
     const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
     ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
     const ::kaldi::BaseFloat vtln_warp = 1.0;
     const ::kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     const ::kaldi::int32 num_samples = waves.Dim();
     if (num_samples < frame_opts.WindowSize()) {
         return false;
     }

     const ::kaldi::int32 num_frames =
         ::kaldi::NumFrames(num_samples, frame_opts);
     const ::kaldi::int32 feat_dim = Dim();
     feats->Resize(num_frames * feat_dim);

     ::kaldi::Vector<::kaldi::BaseFloat> window;
     const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
     for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
         ::kaldi::BaseFloat raw_log_energy = 0.0;
         // The raw log energy is only requested when the computer needs it.
         ::kaldi::ExtractWindow(
             0,
             waves,
             frame,
             frame_opts,
             window_function_,
             &window,
             need_raw_log_energy ? &raw_log_energy : nullptr);

         ::kaldi::Vector<::kaldi::BaseFloat> this_feature(
             feat_dim, ::kaldi::kUndefined);
         computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);

         // Copy the frame into its slot of the flattened output.
         ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
             feats->Data() + frame * feat_dim, feat_dim);
         output_row.CopyFromVec(this_feature);
     }
     return true;
 }
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@ -0,0 +1,75 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
 #include "feat/pitch-functions.h"
 namespace paddleaudio {
 namespace kaldi {
 // Assembles a ::kaldi::FbankOptions from the three option groups and
 // (re)creates the fbank extractor of the shared KaldiFeatureWrapper with it.
 // Always returns true.
 bool InitFbank(
     ::kaldi::FrameExtractionOptions frame_opts,
     ::kaldi::MelBanksOptions mel_opts,
     FbankOptions fbank_opts) {
     ::kaldi::FbankOptions kaldi_opts;
     kaldi_opts.frame_opts = frame_opts;
     kaldi_opts.mel_opts = mel_opts;

     // Scalar switches are copied field by field.
     kaldi_opts.use_energy = fbank_opts.use_energy;
     kaldi_opts.energy_floor = fbank_opts.energy_floor;
     kaldi_opts.raw_energy = fbank_opts.raw_energy;
     kaldi_opts.htk_compat = fbank_opts.htk_compat;
     kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
     kaldi_opts.use_power = fbank_opts.use_power;

     auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
     wrapper->InitFbank(kaldi_opts);
     return true;
 }
 // Passes one chunk of samples to the shared KaldiFeatureWrapper and returns
 // the array it produces.
 py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
     auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
     return wrapper->ComputeFbank(wav);
 }
 // One-shot fbank computation: initializes the shared extractor with the
 // given options, computes the features of `wav`, resets the extractor and
 // returns the features.
 py::array_t<float> ComputeFbank(
     ::kaldi::FrameExtractionOptions frame_opts,
     ::kaldi::MelBanksOptions mel_opts,
     FbankOptions fbank_opts,
     const py::array_t<float>& wav) {
     InitFbank(frame_opts, mel_opts, fbank_opts);

     auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
     py::array_t<float> fbank = wrapper->ComputeFbank(wav);
     wrapper->ResetFbank();
     return fbank;
 }
 // Resets the fbank state of the shared KaldiFeatureWrapper.
 void ResetFbank() {
     auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
     wrapper->ResetFbank();
 }
 // Runs ::kaldi::ComputeKaldiPitch on `wav` and returns the resulting
 // feature matrix as a 2-D float array of shape (rows, cols).
 // NOTE(review): the buffer is read as info.size consecutive floats, which
 // presumes `wav` is a contiguous array - confirm against callers.
 py::array_t<float> ComputeKaldiPitch(
     const ::kaldi::PitchExtractionOptions& opts,
     const py::array_t<float>& wav) {
     py::buffer_info wav_buf = wav.request();
     ::kaldi::SubVector<::kaldi::BaseFloat> samples(
         static_cast<float*>(wav_buf.ptr), wav_buf.size);

     ::kaldi::Matrix<::kaldi::BaseFloat> pitch;
     ::kaldi::ComputeKaldiPitch(opts, samples, &pitch);

     const auto rows = pitch.NumRows();
     const auto cols = pitch.NumCols();
     auto result = py::array_t<float>({rows, cols});

     // Copy row by row out of the Kaldi matrix.
     const auto row_bytes = sizeof(float) * cols;
     for (int r = 0; r < rows; ++r) {
         std::memcpy(result.mutable_data(r), pitch.Row(r).Data(), row_bytes);
     }
     return result;
 }
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@ -0,0 +1,64 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <string>
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
 #include "feat/pitch-functions.h"
 namespace py = pybind11;
 namespace paddleaudio {
 namespace kaldi {
 // Scalar fbank switches exposed by this module; InitFbank() copies them
 // into the equally named fields of ::kaldi::FbankOptions.
 struct FbankOptions {
     // Append an extra dimension with energy to the filter banks.
     bool use_energy;
     // NOTE(review): presumably the lower bound applied to the energy, as in
     // ::kaldi::FbankOptions::energy_floor - confirm.
     float energy_floor;
     // If true, compute energy before preemphasis and windowing.
     bool raw_energy;
     // If true, put energy last (if using energy).
     bool htk_compat;
     // If true (default), produce log-filterbank, else linear.
     bool use_log_fbank;
     // NOTE(review): presumably power vs. magnitude spectrum, as in
     // ::kaldi::FbankOptions::use_power - confirm.
     bool use_power;

     FbankOptions()
         : use_energy(false),
           energy_floor(0.0),
           raw_energy(true),
           htk_compat(false),
           use_log_fbank(true),
           use_power(true) {}
 };
 // (Re)creates the shared fbank extractor from the given option groups.
 bool InitFbank(::kaldi::FrameExtractionOptions frame_opts,
                ::kaldi::MelBanksOptions mel_opts,
                FbankOptions fbank_opts);

 // One-shot variant: initializes the extractor, computes the features of
 // `wav` and resets the extractor again.
 py::array_t<float> ComputeFbank(::kaldi::FrameExtractionOptions frame_opts,
                                 ::kaldi::MelBanksOptions mel_opts,
                                 FbankOptions fbank_opts,
                                 const py::array_t<float>& wav);

 // Computes features for one chunk using the extractor set up by
 // InitFbank().
 py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);

 // Resets the state of the shared fbank extractor.
 void ResetFbank();

 // Computes Kaldi pitch features for `wav`; returns a 2-D float array.
 py::array_t<float> ComputeKaldiPitch(
     const ::kaldi::PitchExtractionOptions& opts,
     const py::array_t<float>& wav);
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@ -0,0 +1,51 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
 namespace paddleaudio {
 namespace kaldi {
 // Returns the address of a function-local static wrapper, so every caller
 // receives the same object.
 KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
     static KaldiFeatureWrapper singleton;
     KaldiFeatureWrapper* const self = &singleton;
     return self;
 }
 // Replaces any existing extractor with a new one built from `opts`.
 // Always returns true.
 bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
     fbank_ = std::unique_ptr<Fbank>(new Fbank(opts));
     return true;
 }
 // Feeds `wav` to the extractor created by InitFbank() and returns the
 // features as a 2-D float array shaped (feats.Dim() / Dim(), Dim()).
 // Returns an empty array when no extractor has been initialised, when the
 // extractor reports failure, or when it produced no output.
 py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
     const py::array_t<float> wav) {
     // fbank_ is null until InitFbank() runs; treat that like a failed
     // computation instead of dereferencing a null pointer.
     if (!fbank_) return py::array_t<float>();
     py::buffer_info info = wav.request();
     ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
         static_cast<float*>(info.ptr), info.size);
     ::kaldi::Vector<::kaldi::BaseFloat> feats;
     const bool ok = fbank_->ComputeFeature(input_wav, &feats);
     if (!ok || feats.Dim() == 0) return py::array_t<float>();
     auto result = py::array_t<float>(feats.Dim());
     py::buffer_info xs = result.request();
     float* res_ptr = static_cast<float*>(xs.ptr);
     // Copy the flat feature vector; it is reshaped into frames below.
     for (int idx = 0; idx < feats.Dim(); ++idx) {
         res_ptr[idx] = feats(idx);
     }
     return result.reshape({feats.Dim() / Dim(), Dim()});
 }
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@ -0,0 +1,40 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "base/kaldi-common.h"
 #include "feat/feature-fbank.h"
 #include "paddleaudio/src/pybind/kaldi/feature_common.h"
 namespace paddleaudio {
 namespace kaldi {
 typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
 class KaldiFeatureWrapper {
  public:
    static KaldiFeatureWrapper* GetInstance();
    bool InitFbank(::kaldi::FbankOptions opts);
    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
    int Dim() { return fbank_->Dim(); }
    void ResetFbank() { fbank_->Reset(); }
  private:
    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
 };
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@ -0,0 +1,148 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
 #include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
 #ifdef INCLUDE_SOX
 #include "paddleaudio/src/pybind/sox/io.h"
 #include "paddleaudio/src/pybind/sox/effects.h"
 #endif
 #include <pybind11/stl.h>
 #include <pybind11/pybind11.h>
 // Let pybind11 convert `tl::optional<T>` arguments and return values.
 #ifdef INCLUDE_SOX
 namespace pybind11 {
 namespace detail {
 // Reuse pybind11's generic optional caster for `tl::optional<ValueT>`.
 template <typename ValueT>
 struct type_caster<tl::optional<ValueT>>
     : optional_caster<tl::optional<ValueT>> {};
 }  // namespace detail
 }  // namespace pybind11
 #endif
 // Python module `_paddleaudio`: registers the sox bindings when INCLUDE_SOX
 // is defined and the kaldi bindings when INCLUDE_KALDI is defined.
 PYBIND11_MODULE(_paddleaudio, m) {
 #ifdef INCLUDE_SOX
     namespace io = paddleaudio::sox_io;
     namespace utils = paddleaudio::sox_utils;
     namespace fx = paddleaudio::sox_effects;
     // File / file-object helpers.
     m.def("get_info_file", &io::get_info_file, "Get metadata of audio file.");
     // support obj later
     m.def("get_info_fileobj",
           &io::get_info_fileobj,
           "Get metadata of audio in file object.");
     m.def("load_audio_fileobj",
           &io::load_audio_fileobj,
           "Load audio from file object.");
     m.def("save_audio_fileobj",
           &io::save_audio_fileobj,
           "Save audio to file obj.");
     // sox io
     m.def("sox_io_get_info", &io::get_info_file);
     m.def("sox_io_load_audio_file", &io::load_audio_file);
     m.def("sox_io_save_audio_file", &io::save_audio_file);
     // sox utils
     m.def("sox_utils_set_seed", &utils::set_seed);
     m.def("sox_utils_set_verbosity", &utils::set_verbosity);
     m.def("sox_utils_set_use_threads", &utils::set_use_threads);
     m.def("sox_utils_set_buffer_size", &utils::set_buffer_size);
     m.def("sox_utils_list_effects", &utils::list_effects);
     m.def("sox_utils_list_read_formats", &utils::list_read_formats);
     m.def("sox_utils_list_write_formats", &utils::list_write_formats);
     m.def("sox_utils_get_buffer_size", &utils::get_buffer_size);
     // effect
     m.def("apply_effects_fileobj",
           &fx::apply_effects_fileobj,
           "Decode audio data from file-like obj and apply effects.");
     m.def("sox_effects_initialize_sox_effects", &fx::initialize_sox_effects);
     m.def("sox_effects_shutdown_sox_effects", &fx::shutdown_sox_effects);
     m.def("sox_effects_apply_effects_tensor", &fx::apply_effects_tensor);
     m.def("sox_effects_apply_effects_file", &fx::apply_effects_file);
 #endif
 #ifdef INCLUDE_KALDI
     using PitchOpts = kaldi::PitchExtractionOptions;
     using FrameOpts = kaldi::FrameExtractionOptions;
     using MelOpts = kaldi::MelBanksOptions;
     using FbankOpts = paddleaudio::kaldi::FbankOptions;
     m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
     // Pitch extraction options (kaldi).
     py::class_<PitchOpts>(m, "PitchExtractionOptions")
         .def(py::init<>())
         .def_readwrite("samp_freq", &PitchOpts::samp_freq)
         .def_readwrite("frame_shift_ms", &PitchOpts::frame_shift_ms)
         .def_readwrite("frame_length_ms", &PitchOpts::frame_length_ms)
         .def_readwrite("preemph_coeff", &PitchOpts::preemph_coeff)
         .def_readwrite("min_f0", &PitchOpts::min_f0)
         .def_readwrite("max_f0", &PitchOpts::max_f0)
         .def_readwrite("soft_min_f0", &PitchOpts::soft_min_f0)
         .def_readwrite("penalty_factor", &PitchOpts::penalty_factor)
         .def_readwrite("lowpass_cutoff", &PitchOpts::lowpass_cutoff)
         .def_readwrite("resample_freq", &PitchOpts::resample_freq)
         .def_readwrite("delta_pitch", &PitchOpts::delta_pitch)
         .def_readwrite("nccf_ballast", &PitchOpts::nccf_ballast)
         .def_readwrite("lowpass_filter_width",
                        &PitchOpts::lowpass_filter_width)
         .def_readwrite("upsample_filter_width",
                        &PitchOpts::upsample_filter_width)
         .def_readwrite("max_frames_latency", &PitchOpts::max_frames_latency)
         .def_readwrite("frames_per_chunk", &PitchOpts::frames_per_chunk)
         .def_readwrite("simulate_first_pass_online",
                        &PitchOpts::simulate_first_pass_online)
         .def_readwrite("recompute_frame", &PitchOpts::recompute_frame)
         .def_readwrite("nccf_ballast_online",
                        &PitchOpts::nccf_ballast_online)
         .def_readwrite("snip_edges", &PitchOpts::snip_edges);
     m.def("ComputeKaldiPitch",
           &paddleaudio::kaldi::ComputeKaldiPitch,
           "compute kaldi pitch");
     // Frame extraction options (kaldi).
     py::class_<FrameOpts>(m, "FrameExtractionOptions")
         .def(py::init<>())
         .def_readwrite("samp_freq", &FrameOpts::samp_freq)
         .def_readwrite("frame_shift_ms", &FrameOpts::frame_shift_ms)
         .def_readwrite("frame_length_ms", &FrameOpts::frame_length_ms)
         .def_readwrite("dither", &FrameOpts::dither)
         .def_readwrite("preemph_coeff", &FrameOpts::preemph_coeff)
         .def_readwrite("remove_dc_offset", &FrameOpts::remove_dc_offset)
         .def_readwrite("window_type", &FrameOpts::window_type)
         .def_readwrite("round_to_power_of_two",
                        &FrameOpts::round_to_power_of_two)
         .def_readwrite("blackman_coeff", &FrameOpts::blackman_coeff)
         .def_readwrite("snip_edges", &FrameOpts::snip_edges)
         .def_readwrite("allow_downsample", &FrameOpts::allow_downsample)
         .def_readwrite("allow_upsample", &FrameOpts::allow_upsample)
         .def_readwrite("max_feature_vectors",
                        &FrameOpts::max_feature_vectors);
     // Mel filter-bank options (kaldi).
     py::class_<MelOpts>(m, "MelBanksOptions")
         .def(py::init<>())
         .def_readwrite("num_bins", &MelOpts::num_bins)
         .def_readwrite("low_freq", &MelOpts::low_freq)
         .def_readwrite("high_freq", &MelOpts::high_freq)
         .def_readwrite("vtln_low", &MelOpts::vtln_low)
         .def_readwrite("vtln_high", &MelOpts::vtln_high)
         .def_readwrite("debug_mel", &MelOpts::debug_mel)
         .def_readwrite("htk_mode", &MelOpts::htk_mode);
     // paddleaudio's own fbank options.
     py::class_<FbankOpts>(m, "FbankOptions")
         .def(py::init<>())
         .def_readwrite("use_energy", &FbankOpts::use_energy)
         .def_readwrite("energy_floor", &FbankOpts::energy_floor)
         .def_readwrite("raw_energy", &FbankOpts::raw_energy)
         .def_readwrite("htk_compat", &FbankOpts::htk_compat)
         .def_readwrite("use_log_fbank", &FbankOpts::use_log_fbank)
         .def_readwrite("use_power", &FbankOpts::use_power);
 #endif
 }
--- a/audio/paddleaudio/src/pybind/sox/effects.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects.cpp
@ -0,0 +1,259 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp  with modification.
 #include <mutex>
 #include <sox.h>
 #include "paddleaudio/src/pybind/sox/effects.h"
 #include "paddleaudio/src/pybind/sox/effects_chain.h"
 #include "paddleaudio/src/pybind/sox/utils.h"
 using namespace paddleaudio::sox_utils;
 namespace paddleaudio::sox_effects {
 // Streaming decoding over file-like object is tricky because libsox operates on
 // FILE pointer. The following is what `sox` and `play` commands do
 //  - file input -> FILE pointer
 //  - URL input -> call wget in subprocess and pipe the data -> FILE pointer
 //  - stdin -> FILE pointer
 //
 // We want to, instead, fetch byte strings chunk by chunk, consume them, and
 // discard.
 //
 // Here is the approach
 // 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
 // chunk of byte string
 //    This will perform header-based format detection, if necessary, then fill
 //    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
 //    which returns FILE* which points the buffer of the provided byte string.
 // 2. Each time sox reads a chunk from the FILE*, we update the underlying
 // buffer in a way that it
 //    starts with unseen data, and append the new data read from the given
 //    fileobj. This will trick libsox as if it keeps reading from the FILE*
 //    continuously.
 // For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
 auto apply_effects_fileobj(
     py::object fileobj,
     const std::vector<std::vector<std::string>>& effects,
     tl::optional<bool> normalize,
     tl::optional<bool> channels_first,
     tl::optional<std::string> format)
     -> tl::optional<std::tuple<py::array, int64_t>> {
   // Decodes audio from a Python file-like object, runs it through `effects`
   // and returns (samples, sample rate). Returns an empty optional when libsox
   // cannot open the data or cannot determine its encoding.
   //
   // The staging buffer allocated here is used for the whole lifetime of the
   // effects chain.
   //
   // For certain formats (such as FLAC), libsox keeps reading the content at
   // initialization unless it reaches EOF, even when the header is properly
   // parsed. (Making the buffer size 8192, which is way bigger than the
   // header, resulted in libsox consuming all the buffer content at the time
   // it opens the file.) Therefore the buffer has to always contain valid
   // data, except after EOF. We default to `sox_get_globals()->bufsiz`* for
   // the buffer size and first check whether there is enough data to fill it.
   // `read_fileobj` repeatedly calls the `read` method until it receives the
   // requested number of bytes or reaches EOF; fewer bytes than requested
   // means the whole audio data has been fetched.
   //
   // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
   //
   // NOTE:
   // Use the abstraction provided by `libpaddleaudio` to access the global
   // config defined by libsox. Directly using `sox_get_globals` would end up
   // retrieving the static variable defined in `_paddleaudio`, which is not
   // correct.
   const int64_t kMinCapacityInBytes = 256;
   const auto sox_bufsiz = get_buffer_size();
   const auto capacity =
       (sox_bufsiz > kMinCapacityInBytes) ? sox_bufsiz : kMinCapacityInBytes;
   std::string staging(capacity, '\0');
   auto* in_buf = const_cast<char*>(staging.data());
   auto num_read = read_fileobj(&fileobj, capacity, in_buf);
   // If the file is shorter than 256, then libsox cannot read the header.
   auto in_buffer_size = (num_read > 256) ? num_read : 256;
   // Open file (this starts reading the header)
   // When opening a file there are two functions that can touch FILE*.
   // * `auto_detect_format`
   //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
   // * `startread` handler of detected format.
   //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
   // To see the handler of a particular format, go to
   //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
   // For example, vorbis can be found
   //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
   const char* filetype =
       format.has_value() ? format.value().c_str() : nullptr;
   SoxFormat sf(sox_open_mem_read(
       in_buf,
       in_buffer_size,
       /*signal=*/nullptr,
       /*encoding=*/nullptr,
       filetype));
   // Bail out when the data could not be opened or its encoding is unknown.
   const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
   if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
     return {};
   }
   // Output samples are collected here. In case of streamed data, the
   // reported length can be 0.
   std::vector<sox_sample_t> decoded;
   decoded.reserve(sf->signal.length);
   // Assemble the chain: file-object input -> effects -> output buffer.
   const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
   paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
       /*input_encoding=*/sf->encoding,
       /*output_encoding=*/get_tensor_encodinginfo(dtype));
   chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
   for (const auto& one_effect : effects) {
     chain.addEffect(one_effect);
   }
   chain.addOutputBuffer(&decoded);
   chain.run();
   // Convert the collected samples into the returned array.
   const bool want_normalize = normalize.value_or(true);
   const bool want_channels_first = channels_first.value_or(true);
   auto tensor = convert_to_tensor(
       /*buffer=*/decoded.data(),
       /*num_samples=*/decoded.size(),
       /*num_channels=*/chain.getOutputNumChannels(),
       dtype,
       want_normalize,
       want_channels_first);
   return std::forward_as_tuple(
       tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
 }
 namespace {
 // Lifecycle of the libsox effects resources as tracked by this file.
 enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
 // Current lifecycle state; read and written by initialize_sox_effects() and
 // shutdown_sox_effects() while holding the mutex below.
 SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
 // Serialises the state transitions above.
 std::mutex SOX_RESOUCE_STATE_MUTEX;
 } // namespace
 // Initializes libsox effects once. Calling it again while initialized is a
 // no-op; calling it after shutdown throws std::runtime_error, as does a
 // failing sox_init().
 void initialize_sox_effects() {
   const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);
   if (SOX_RESOURCE_STATE == Initialized) {
     return;
   }
   if (SOX_RESOURCE_STATE == ShutDown) {
     throw std::runtime_error(
         "SoX Effects has been shut down. Cannot initialize again.");
   }
   if (SOX_RESOURCE_STATE == NotInitialized) {
     const bool started = (sox_init() == SOX_SUCCESS);
     if (!started) {
       throw std::runtime_error("Failed to initialize sox effects.");
     }
     SOX_RESOURCE_STATE = Initialized;
   }
 }
 // Shuts libsox effects down. Throws std::runtime_error when effects were
 // never initialized or when sox_quit() fails; calling it again after a
 // successful shutdown is a no-op.
 void shutdown_sox_effects() {
   const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
   switch (SOX_RESOURCE_STATE) {
     case NotInitialized:
       throw std::runtime_error(
           "SoX Effects is not initialized. Cannot shutdown.");
     case Initialized:
       if (sox_quit() != SOX_SUCCESS) {
         // The message used to say "initialize", which misreported the
         // operation that actually failed.
         throw std::runtime_error("Failed to shut down sox effects.");
       }
       SOX_RESOURCE_STATE = ShutDown;
       break;
     case ShutDown:
       break;
   }
 }
 // Runs `effects` over an in-memory waveform and returns the processed
 // samples (same dtype, not normalized) with the chain's output sample rate.
 auto apply_effects_tensor(
     py::array waveform,
     int64_t sample_rate,
     const std::vector<std::vector<std::string>>& effects,
     bool channels_first) -> std::tuple<py::array, int64_t> {
   validate_input_tensor(waveform);
   // Input and output of the chain share the encoding of the waveform dtype.
   const auto dtype = waveform.dtype();
   paddleaudio::sox_effects_chain::SoxEffectsChain chain(
       /*input_encoding=*/get_tensor_encodinginfo(dtype),
       /*output_encoding=*/get_tensor_encodinginfo(dtype));
   // Samples produced by the chain are collected here.
   std::vector<sox_sample_t> processed;
   processed.reserve(waveform.size());
   // Wire up: tensor input -> effects -> output buffer, then run.
   chain.addInputTensor(&waveform, sample_rate, channels_first);
   for (const auto& one_effect : effects) {
     chain.addEffect(one_effect);
   }
   chain.addOutputBuffer(&processed);
   chain.run();
   // Turn the collected samples back into an array.
   auto out_tensor = convert_to_tensor(
       /*buffer=*/processed.data(),
       /*num_samples=*/processed.size(),
       /*num_channels=*/chain.getOutputNumChannels(),
       dtype,
       /*normalize=*/false,
       channels_first);
   return std::tuple<py::array, int64_t>(
       out_tensor, chain.getOutputSampleRate());
 }
 // Opens the audio file at `path`, runs it through `effects` and returns
 // (samples, sample rate). Returns an empty optional when libsox cannot open
 // the file or cannot determine its encoding.
 auto apply_effects_file(
     const std::string& path,
     const std::vector<std::vector<std::string>>& effects,
     tl::optional<bool> normalize,
     tl::optional<bool> channels_first,
     const tl::optional<std::string>& format)
     -> tl::optional<std::tuple<py::array, int64_t>> {
   // Open input file
   const char* filetype =
       format.has_value() ? format.value().c_str() : nullptr;
   SoxFormat sf(sox_open_read(
       path.c_str(),
       /*signal=*/nullptr,
       /*encoding=*/nullptr,
       filetype));
   const bool opened = static_cast<sox_format_t*>(sf) != nullptr;
   if (!opened || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
     return {};
   }
   const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
   // Samples produced by the chain are collected here.
   std::vector<sox_sample_t> decoded;
   decoded.reserve(sf->signal.length);
   // Wire up: file input -> effects -> output buffer, then run.
   paddleaudio::sox_effects_chain::SoxEffectsChain chain(
       /*input_encoding=*/sf->encoding,
       /*output_encoding=*/get_tensor_encodinginfo(dtype));
   chain.addInputFile(sf);
   for (const auto& one_effect : effects) {
     chain.addEffect(one_effect);
   }
   chain.addOutputBuffer(&decoded);
   chain.run();
   // Turn the collected samples into the returned array.
   const bool want_normalize = normalize.value_or(true);
   const bool want_channels_first = channels_first.value_or(true);
   auto tensor = convert_to_tensor(
       /*buffer=*/decoded.data(),
       /*num_samples=*/decoded.size(),
       /*num_channels=*/chain.getOutputNumChannels(),
       dtype,
       want_normalize,
       want_channels_first);
   return std::tuple<py::array, int64_t>(
       tensor, chain.getOutputSampleRate());
 }
 } // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects.h
+++ b/audio/paddleaudio/src/pybind/sox/effects.h
@ -0,0 +1,37 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h  with modification.
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
 #include "paddleaudio/src/optional/optional.hpp"
 namespace py = pybind11;
 namespace paddleaudio::sox_effects {
 // Decodes audio from a Python file-like object and applies `effects`.
 // Each effect is a list of strings: the effect name followed by its options.
 // Returns (samples, sample rate), or an empty optional on failure.
 auto apply_effects_fileobj(
    py::object fileobj,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    tl::optional<std::string> format)
    -> tl::optional<std::tuple<py::array, int64_t>>;
 // Initializes the libsox effects subsystem.
 void initialize_sox_effects();
 // Shuts the libsox effects subsystem down.
 void shutdown_sox_effects();
 // Applies `effects` to an in-memory waveform sampled at `sample_rate`.
 // Returns the processed samples and the resulting sample rate.
 auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t>;
 // Loads the audio file at `path` and applies `effects`.
 // Returns (samples, sample rate), or an empty optional on failure.
 auto apply_effects_file(
    const std::string& path,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format)
    -> tl::optional<std::tuple<py::array, int64_t>>;
 } // namespace paddleaudio::sox_effects
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.cpp
@ -0,0 +1,597 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp with modification.
 #include <sox.h>
 #include <iostream>
 #include <vector>
 #include "paddleaudio/src/pybind/sox/effects_chain.h"
 #include "paddleaudio/src/pybind/sox/utils.h"
 using namespace paddleaudio::sox_utils;
 namespace paddleaudio::sox_effects_chain {
 namespace {
 /// helper classes for passing the location of input tensor and output buffer
 ///
 /// drain/flow callback functions require plain C-style function signatures and
 /// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
 /// The following structs will be assigned to sox_effect_t::priv pointer which
 /// gives sox_effect_t an access to input Tensor and output buffer object.
 // State for the "input_tensor" effect (stored in sox_effect_t::priv).
 struct TensorInputPriv {
  // Number of samples already handed to sox; advanced by tensor_input_drain.
  size_t index;
  // Input array supplied by the caller of addInputTensor.
  py::array* waveform;
  int64_t sample_rate;
  // If true the array is indexed (channel, frame), otherwise (frame, channel).
  bool channels_first;
 };
 // State for the "output_tensor" effect: where processed samples are appended.
 struct TensorOutputPriv {
  std::vector<sox_sample_t>* buffer;
 };
 // State for the "output_file" effect: the sox handle samples are written to.
 struct FileOutputPriv {
  sox_format_t* sf;
 };
 /// Callback function to feed Tensor data to SoxEffectChain.
 ///
 /// Writes up to `*osamp` samples of the input array into `obuf`, converted to
 /// sox_sample_t (int32), interleaved as frame-major / channel-minor. On
 /// return `*osamp` holds the number of samples actually written (always a
 /// multiple of the channel count). Returns SOX_EOF once the whole array has
 /// been consumed, SOX_SUCCESS otherwise. Throws std::runtime_error for an
 /// unsupported dtype.
 int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // Retrieve the input Tensor and current read position
  auto priv = static_cast<TensorInputPriv*>(effp->priv);
  const size_t index = priv->index;
  const py::array& tensor = *(priv->waveform);
  const size_t num_channels = effp->out_signal.channels;
  const bool channels_first = priv->channels_first;
  // Adjust the number of samples to read
  const size_t num_samples = tensor.size();
  if (index + *osamp > num_samples) {
    *osamp = num_samples - index;
  }
  // Ensure that it's a multiple of the number of channels
  *osamp -= *osamp % num_channels;
  const size_t count = *osamp;
  // Address of the element that becomes output sample `offset` of this call.
  // size_t arithmetic keeps the frame index from truncating on large inputs.
  auto element_at = [&](size_t offset) -> const void* {
    const size_t pos = index + offset;
    const size_t frame = pos / num_channels;
    const size_t channel = pos % num_channels;
    return channels_first ? tensor.data(channel, frame)
                          : tensor.data(frame, channel);
  };
  // Convert to sox_sample_t (int32_t), writing straight into sox's buffer.
  // The case labels are numpy dtype numbers.
  switch (tensor.dtype().num()) {
    // float32
    case 11: {
      // Need to convert to 64-bit precision so that
      // values around INT32_MIN/MAX are handled correctly.
      for (size_t i = 0; i < count; ++i) {
        const double scaled =
            static_cast<double>(*static_cast<const float*>(element_at(i))) *
            2147483648.;
        if (scaled > INT32_MAX) {
          obuf[i] = INT32_MAX;
        } else if (scaled < INT32_MIN) {
          obuf[i] = INT32_MIN;
        } else {
          obuf[i] = static_cast<sox_sample_t>(scaled);
        }
      }
      break;
    }
    // int32
    case 5: {
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = *static_cast<const int*>(element_at(i));
      }
      break;
    }
    // int16: shift into the upper 16 bits
    case 3: {
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = *static_cast<const int16_t*>(element_at(i)) * 65536;
      }
      break;
    }
    // 8-bit
    // NOTE(review): numpy dtype number 1 is the signed int8 type, yet the
    // `- 128` offset is the conversion for unsigned 8-bit audio and overflows
    // int32 for negative inputs. Left as-is because it has to agree with the
    // dtype handling in the sox utils helpers -- confirm and fix together.
    case 1: {
      for (size_t i = 0; i < count; ++i) {
        obuf[i] =
            (*static_cast<const int8_t*>(element_at(i)) - 128) * 16777216;
      }
      break;
    }
    default:
      throw std::runtime_error("Unexpected dtype.");
  }
  priv->index += count;
  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
 }
 /// Callback function to fetch data from SoxEffectChain.
 ///
 /// Appends the `*isamp` incoming samples to the vector registered through
 /// TensorOutputPriv and reports that nothing was written to `obuf`.
 int tensor_output_flow(
     sox_effect_t* effp,
     sox_sample_t const* ibuf,
     sox_sample_t* obuf LSX_UNUSED,
     size_t* isamp,
     size_t* osamp) {
   // This effect is a sink: it never produces output samples.
   *osamp = 0;
   auto* sink_priv = static_cast<TensorOutputPriv*>(effp->priv);
   std::vector<sox_sample_t>* sink = sink_priv->buffer;
   const sox_sample_t* first = ibuf;
   const sox_sample_t* last = ibuf + *isamp;
   sink->insert(sink->end(), first, last);
   return SOX_SUCCESS;
 }
 /// Callback function that writes incoming samples to the sox output file.
 ///
 /// Returns SOX_SUCCESS when all `*isamp` samples were written (or there was
 /// nothing to write). On a short write it throws std::runtime_error if sox
 /// reported an error number, and returns SOX_EOF otherwise.
 int file_output_flow(
     sox_effect_t* effp,
     sox_sample_t const* ibuf,
     sox_sample_t* obuf LSX_UNUSED,
     size_t* isamp,
     size_t* osamp) {
   // This effect is a sink: it never produces output samples.
   *osamp = 0;
   if (*isamp == 0) {
     return SOX_SUCCESS;
   }
   auto* out_file = static_cast<FileOutputPriv*>(effp->priv)->sf;
   const bool complete = (sox_write(out_file, ibuf, *isamp) == *isamp);
   if (complete) {
     return SOX_SUCCESS;
   }
   if (out_file->sox_errno) {
     std::ostringstream message;
     message << out_file->sox_errstr << " "
             << sox_strerror(out_file->sox_errno) << " "
             << out_file->filename;
     throw std::runtime_error(message.str());
   }
   return SOX_EOF;
 }
 // Handler table for the "input_tensor" source effect; only `drain` is set.
 sox_effect_handler_t* get_tensor_input_handler() {
   static sox_effect_handler_t input_handler{
       /*name=*/"input_tensor",
       /*usage=*/nullptr,
       /*flags=*/SOX_EFF_MCHAN,
       /*getopts=*/nullptr,
       /*start=*/nullptr,
       /*flow=*/nullptr,
       /*drain=*/tensor_input_drain,
       /*stop=*/nullptr,
       /*kill=*/nullptr,
       /*priv_size=*/sizeof(TensorInputPriv)};
   return &input_handler;
 }
 // Handler table for the "output_tensor" sink effect; only `flow` is set.
 sox_effect_handler_t* get_tensor_output_handler() {
   static sox_effect_handler_t output_handler{
       /*name=*/"output_tensor",
       /*usage=*/nullptr,
       /*flags=*/SOX_EFF_MCHAN,
       /*getopts=*/nullptr,
       /*start=*/nullptr,
       /*flow=*/tensor_output_flow,
       /*drain=*/nullptr,
       /*stop=*/nullptr,
       /*kill=*/nullptr,
       /*priv_size=*/sizeof(TensorOutputPriv)};
   return &output_handler;
 }
 // Handler table for the "output_file" sink effect; only `flow` is set.
 sox_effect_handler_t* get_file_output_handler() {
   static sox_effect_handler_t file_handler{
       /*name=*/"output_file",
       /*usage=*/nullptr,
       /*flags=*/SOX_EFF_MCHAN,
       /*getopts=*/nullptr,
       /*start=*/nullptr,
       /*flow=*/file_output_flow,
       /*drain=*/nullptr,
       /*stop=*/nullptr,
       /*kill=*/nullptr,
       /*priv_size=*/sizeof(FileOutputPriv)};
   return &file_handler;
 }
 } // namespace
 // Takes over the raw effect pointer; it is released with free() on
 // destruction.
 SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
 SoxEffect::~SoxEffect() {
   // free() accepts a null pointer, so no explicit check is required.
   free(se_);
 }
 // Implicit access to the wrapped pointer for the libsox C API.
 SoxEffect::operator sox_effect_t*() const {
   sox_effect_t* const raw = se_;
   return raw;
 }
 // Member access on the wrapped effect.
 auto SoxEffect::operator->() noexcept -> sox_effect_t* {
   sox_effect_t* const raw = se_;
   return raw;
 }
 // Stores copies of the input/output encodings, value-initialises the signal
 // descriptors and creates the libsox effects chain. The chain is created
 // from the addresses of the member copies (in_enc_, out_enc_), not of the
 // constructor arguments.
 // Throws std::runtime_error when libsox fails to create the chain.
 SoxEffectsChain::SoxEffectsChain(
    sox_encodinginfo_t input_encoding,
    sox_encodinginfo_t output_encoding)
    : in_enc_(input_encoding),
      out_enc_(output_encoding),
      in_sig_(),
      interm_sig_(),
      out_sig_(),
      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
  if (!sec_) {
    throw std::runtime_error("Failed to create effect chain.");
  }
 }
 // Releases the libsox chain if one was created.
 SoxEffectsChain::~SoxEffectsChain() {
   if (sec_ == nullptr) {
     return;
   }
   sox_delete_effects_chain(sec_);
 }
 // Runs the whole chain; no flow callback or client data is supplied and the
 // status returned by libsox is not inspected.
 void SoxEffectsChain::run() {
   sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
 }
 // Registers `waveform` as the source of the chain. Only the pointer is
 // stored, so the array must outlive the chain run.
 void SoxEffectsChain::addInputTensor(
     py::array* waveform,
     int64_t sample_rate,
     bool channels_first) {
   in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
   interm_sig_ = in_sig_;
   SoxEffect source(sox_create_effect(get_tensor_input_handler()));
   // Fill the per-effect state read by the drain callback.
   auto* state = static_cast<TensorInputPriv*>(source->priv);
   state->index = 0;
   state->waveform = waveform;
   state->sample_rate = sample_rate;
   state->channels_first = channels_first;
   const bool added =
       sox_add_effect(sec_, source, &interm_sig_, &in_sig_) == SOX_SUCCESS;
   if (!added) {
     throw std::runtime_error(
         "Internal Error: Failed to add effect: input_tensor");
   }
 }
 void SoxEffectsChain::addOutputBuffer(
    std::vector<sox_sample_t>* output_buffer) {
  SoxEffect e(sox_create_effect(get_tensor_output_handler()));
  static_cast<TensorOutputPriv*>(e->priv)->buffer = output_buffer;
  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output_tensor");
  }
 }
 // Registers an opened sox file as the source of the chain through libsox's
 // built-in "input" effect.
 void SoxEffectsChain::addInputFile(sox_format_t* sf) {
   in_sig_ = sf->signal;
   interm_sig_ = in_sig_;
   SoxEffect source(sox_create_effect(sox_find_effect("input")));
   // The "input" effect receives the file handle as its single option.
   char* opts[] = {(char*)sf};
   sox_effect_options(source, 1, opts);
   const bool added =
       sox_add_effect(sec_, source, &interm_sig_, &in_sig_) == SOX_SUCCESS;
   if (!added) {
     std::ostringstream message;
     message << "Internal Error: Failed to add effect: input " << sf->filename;
     throw std::runtime_error(message.str());
   }
 }
 // Registers an opened sox file as the sink of the chain; the file's signal
 // info becomes the chain's output signal.
 void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
   out_sig_ = sf->signal;
   SoxEffect sink(sox_create_effect(get_file_output_handler()));
   auto* state = static_cast<FileOutputPriv*>(sink->priv);
   state->sf = sf;
   const bool added =
       sox_add_effect(sec_, sink, &interm_sig_, &out_sig_) == SOX_SUCCESS;
   if (!added) {
     std::ostringstream message;
     message << "Internal Error: Failed to add effect: output " << sf->filename;
     throw std::runtime_error(message.str());
   }
 }
 // Appends one effect to the chain. `effect` holds the effect name followed
 // by its options. Throws std::runtime_error when the list is empty, when the
 // effect is unsupported or unknown to libsox, when libsox rejects the
 // options, or when the effect cannot be added to the chain.
 void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
   if (effect.empty()) {
     throw std::runtime_error("Invalid argument: empty effect.");
   }
   const std::string name = effect.front();
   // Shared failure path for deny-listed and unknown effects.
   const auto reject_unsupported = [&name]() {
     std::ostringstream message;
     message << "Unsupported effect: " << name;
     throw std::runtime_error(message.str());
   };
   if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
     reject_unsupported();
   }
   auto handler = sox_find_effect(name.c_str());
   if (!handler) {
     reject_unsupported();
   }
   SoxEffect e(sox_create_effect(handler));
   // Everything after the name is passed to libsox as C strings.
   std::vector<char*> opts;
   for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
     opts.push_back((char*)it->c_str());
   }
   const auto num_options = opts.size();
   char** argv = num_options ? opts.data() : nullptr;
   if (sox_effect_options(e, num_options, argv) != SOX_SUCCESS) {
     std::ostringstream message;
     message << "Invalid effect option:";
     for (const auto& token : effect) {
       message << " " << token;
     }
     throw std::runtime_error(message.str());
   }
   if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
     std::ostringstream message;
     message << "Internal Error: Failed to add effect: \"" << name;
     for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
       message << " " << *it;
     }
     message << "\"";
     throw std::runtime_error(message.str());
   }
 }
 /// Channel count recorded in the intermediate signal info of the chain.
 int64_t SoxEffectsChain::getOutputNumChannels() {
  const int64_t channel_count = interm_sig_.channels;
  return channel_count;
 }
 /// Sample rate recorded in the intermediate signal info, converted to int64_t.
 int64_t SoxEffectsChain::getOutputSampleRate() {
  const int64_t sample_rate = interm_sig_.rate;
  return sample_rate;
 }
 namespace {
 /// helper classes for passing file-like object to SoxEffectChain
 ///
 /// Private state of the file-object input effect. Filled in by
 /// SoxEffectsChainPyBind::addInputFileObj and read by fileobj_input_drain.
 struct FileObjInputPriv {
  // libsox format handle that the drain callback decodes from via sox_read.
  sox_format_t* sf;
  // Python file-like object used to refill `buffer` through read_fileobj.
  py::object* fileobj;
  // Becomes true once a refill returns fewer bytes than were requested.
  bool eof_reached;
  // Buffer that the drain callback compacts and refills before each decode.
  char* buffer;
  // Size of `buffer` in bytes.
  uint64_t buffer_size;
 };
 /// Private state of the file-object output effect. Filled in by
 /// SoxEffectsChainPyBind::addOutputFileObj and read by fileobj_output_flow.
 struct FileObjOutputPriv {
  // libsox format handle that the flow callback encodes into via sox_write.
  sox_format_t* sf;
  // Python file-like object; encoded bytes are passed to its `write` method.
  py::object* fileobj;
  // Address of the pointer to the encoded data; dereferenced on every flush.
  char** buffer;
  // Address of the encoded data size (stored here; not read by the flow
  // callback in this file).
  size_t* buffer_size;
 };
 /// Callback function to feed byte string
 /// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
 ///
 /// Drain callback of the file-object input effect. It first tops up the
 /// in-memory buffer from the Python file object, then decodes up to `*osamp`
 /// samples into `obuf` and stores the number actually decoded in `*osamp`.
 /// Returns SOX_EOF once the file object is exhausted and nothing more could
 /// be decoded, SOX_SUCCESS otherwise. Throws std::runtime_error when the
 /// stream position is invalid.
 auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
    -> int {
  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
  auto sf = priv->sf;
  auto buffer = priv->buffer;
  // 1. Refresh the buffer
  //
  // NOTE:
  //   Since the underlying FILE* was opened with `fmemopen`, the only way
  //   libsox detect EOF is reaching the end of the buffer. (null byte won't
  //   help) Therefore we need to align the content at the end of buffer,
  //   otherwise, libsox will keep reading the content beyond intended length.
  //
  // Before:
  //
  //     |<-------consumed------>|<---remaining--->|
  //     |***********************|-----------------|
  //                             ^ ftell
  //
  // After:
  //
  //     |<-offset->|<---remaining--->|<-new data->|
  //     |**********|-----------------|++++++++++++|
  //                ^ ftell
  // NOTE:
  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
  //   supposed to be in sync, but there are cases (Vorbis) they are not
  //   in sync and `tell_off` has seemingly uninitialized value, which
  //   leads num_remain to be negative and cause segmentation fault
  //   in `memmove`.
  const auto tell = ftell((FILE*)sf->fp);
  if (tell < 0) {
    throw std::runtime_error("Internal Error: ftell failed.");
  }
  const auto num_consumed = static_cast<size_t>(tell);
  if (num_consumed > priv->buffer_size) {
    throw std::runtime_error("Internal Error: buffer overrun.");
  }
  const auto num_remain = priv->buffer_size - num_consumed;
  // 1.1. Fetch the data to see if there is data to fill the buffer
  size_t num_refill = 0;
  std::string chunk(num_consumed, '\0');
  if (num_consumed && !priv->eof_reached) {
    num_refill = read_fileobj(
        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
    if (num_refill < num_consumed) {
      priv->eof_reached = true;
    }
  }
  const auto offset = num_consumed - num_refill;
  // 1.2. Move the unconsumed data towards the beginning of buffer.
  if (num_remain) {
    auto src = static_cast<void*>(buffer + num_consumed);
    auto dst = static_cast<void*>(buffer + offset);
    memmove(dst, src, num_remain);
  }
  // 1.3. Refill the remaining buffer.
  if (num_refill) {
    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
    auto dst = buffer + offset + num_remain;
    memcpy(dst, src, num_refill);
  }
  // 1.4. Set the file pointer to the new offset
  sf->tell_off = offset;
  fseek((FILE*)sf->fp, offset, SEEK_SET);
  // 2. Perform decoding operation
  // The following part is practically same as "input" effect
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
  // At this point, osamp represents the buffer size in bytes,
  // but sox_read expects the maximum number of samples ready to read.
  // Normally, this is fine, but in case when the samples are not 4-byte
  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
  // https://github.com/pytorch/audio/issues/2083
  //
  // Guard on whole bytes rather than on bits: a sub-byte encoding (fewer than
  // 8 bits per sample) would otherwise make the divisor zero.
  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
  if (bytes_per_sample > 0)
    *osamp /= bytes_per_sample;
  // Ensure that it's a multiple of the number of channels
  // (skipped when the channel count is zero to avoid a modulo by zero).
  if (effp->out_signal.channels > 0)
    *osamp -= *osamp % effp->out_signal.channels;
  // Read up to *osamp samples into obuf;
  // store the actual number read back to *osamp
  *osamp = sox_read(sf, obuf, *osamp);
  // Decoding is finished when fileobject is exhausted and sox can no longer
  // decode a sample.
  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
 }
 /// Flow callback of the file-object output effect.
 ///
 /// Encodes `*isamp` samples from `ibuf` through `sox_write`, forwards the
 /// bytes encoded so far to the Python object's `write` method and rewinds the
 /// stream so the next chunk starts at offset zero. Produces no output samples
 /// (`*osamp` is set to 0). Returns SOX_EOF when fewer samples were written
 /// than requested without a libsox error, SOX_SUCCESS otherwise. Throws
 /// std::runtime_error when libsox reports an error or the stream position
 /// cannot be determined.
 auto fileobj_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) -> int {
  *osamp = 0;
  if (*isamp) {
    auto priv = static_cast<FileObjOutputPriv*>(effp->priv);
    auto sf = priv->sf;
    auto fp = static_cast<FILE*>(sf->fp);
    auto fileobj = priv->fileobj;
    auto buffer = priv->buffer;
    // Encode chunk
    auto num_samples_written = sox_write(sf, ibuf, *isamp);
    fflush(fp);
    // ftell reports failure with a negative value; using that as a length
    // would turn it into an enormous unsigned size.
    const auto num_encoded = ftell(fp);
    if (num_encoded < 0) {
      throw std::runtime_error("Internal Error: ftell failed.");
    }
    // Copy the encoded chunk to python object.
    fileobj->attr("write")(
        py::bytes(*buffer, static_cast<size_t>(num_encoded)));
    // Reset FILE*
    sf->tell_off = 0;
    fseek(fp, 0, SEEK_SET);
    if (num_samples_written != *isamp) {
      if (sf->sox_errno) {
        std::ostringstream stream;
        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
               << sf->filename;
        throw std::runtime_error(stream.str());
      }
      return SOX_EOF;
    }
  }
  return SOX_SUCCESS;
 }
 /// Returns the handler describing the file-object input effect.
 ///
 /// The handler is a function-local static, so every call yields the same
 /// address. Only the drain callback is set; the private state is sized for
 /// FileObjInputPriv.
 auto get_fileobj_input_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler{
      /*name=*/"input_fileobj_object",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/nullptr,
      /*drain=*/fileobj_input_drain,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileObjInputPriv)};
  return &handler;
 }
 /// Returns the handler describing the file-object output effect.
 ///
 /// The handler is a function-local static, so every call yields the same
 /// address. Only the flow callback is set; the private state is sized for
 /// FileObjOutputPriv.
 auto get_fileobj_output_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler{
      /*name=*/"output_fileobj_object",
      /*usage=*/nullptr,
      /*flags=*/SOX_EFF_MCHAN,
      /*getopts=*/nullptr,
      /*start=*/nullptr,
      /*flow=*/fileobj_output_flow,
      /*drain=*/nullptr,
      /*stop=*/nullptr,
      /*kill=*/nullptr,
      /*priv_size=*/sizeof(FileObjOutputPriv)};
  return &handler;
 }
 } // namespace
 /// Starts the chain with the file-object input effect.
 ///
 /// `sf->signal` becomes both the input and the intermediate signal info, and
 /// the effect's private state is pointed at `sf`, `buffer` / `buffer_size`
 /// and `fileobj`. Throws std::runtime_error when the effect cannot be added.
 void SoxEffectsChainPyBind::addInputFileObj(
    sox_format_t* sf,
    char* buffer,
    uint64_t buffer_size,
    py::object* fileobj) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect source(sox_create_effect(get_fileobj_input_handler()));
  auto* state = static_cast<FileObjInputPriv*>(source->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->eof_reached = false;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input fileobj");
  }
 }
 /// Terminates the chain with the file-object output effect.
 ///
 /// `sf->signal` becomes the output signal info, and the effect's private
 /// state is pointed at `sf`, `buffer`, `buffer_size` and `fileobj`.
 /// Throws std::runtime_error when the effect cannot be added.
 void SoxEffectsChainPyBind::addOutputFileObj(
    sox_format_t* sf,
    char** buffer,
    size_t* buffer_size,
    py::object* fileobj) {
  out_sig_ = sf->signal;

  SoxEffect sink(sox_create_effect(get_fileobj_output_handler()));
  auto* state = static_cast<FileObjOutputPriv*>(sink->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output fileobj");
  }
 }
 } // namespace paddleaudio::sox_effects_chain
--- a/audio/paddleaudio/src/pybind/sox/effects_chain.h
+++ b/audio/paddleaudio/src/pybind/sox/effects_chain.h
@ -0,0 +1,78 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h with modification.
 #pragma once
 #include <sox.h>
 #include "paddleaudio/src/pybind/sox/utils.h"
 namespace paddleaudio::sox_effects_chain {
 // Helper struct to safely close sox_effect_t* pointer returned by
 // sox_create_effect
 //
 // Owns exactly one effect pointer; copying and moving are both disabled so
 // the pointer cannot end up with two owners.
 struct SoxEffect {
  // Wraps `se`, the pointer obtained from sox_create_effect.
  explicit SoxEffect(sox_effect_t* se) noexcept;
  SoxEffect(const SoxEffect& other) = delete;
  // Deleted move constructor takes a plain rvalue reference; a `const&&`
  // parameter is not the canonical move signature.
  SoxEffect(SoxEffect&& other) = delete;
  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
  ~SoxEffect();
  // Implicit conversion so the wrapper can be passed straight to libsox calls.
  operator sox_effect_t*() const;
  // Member access on the wrapped effect (e.g. `e->priv`).
  auto operator->() noexcept -> sox_effect_t*;
 private:
  sox_effect_t* se_;
 };
 // Helper struct to safely close sox_effects_chain_t with handy methods
 //
 // Typical use: construct, add one input, add zero or more effects, add one
 // output, then call run(). Copying and moving are disabled.
 class SoxEffectsChain {
  // Encodings supplied at construction time.
  const sox_encodinginfo_t in_enc_;
  const sox_encodinginfo_t out_enc_;
 protected:
  // Signal info of the chain input.
  sox_signalinfo_t in_sig_;
  // Signal info handed to sox_add_effect for every effect that is added;
  // getOutputNumChannels / getOutputSampleRate report from it.
  sox_signalinfo_t interm_sig_;
  // Signal info of the chain output.
  sox_signalinfo_t out_sig_;
  // Underlying libsox effects chain.
  sox_effects_chain_t* sec_;
 public:
  explicit SoxEffectsChain(
      sox_encodinginfo_t input_encoding,
      sox_encodinginfo_t output_encoding);
  SoxEffectsChain(const SoxEffectsChain& other) = delete;
  // Deleted move constructor takes a plain rvalue reference; a `const&&`
  // parameter is not the canonical move signature.
  SoxEffectsChain(SoxEffectsChain&& other) = delete;
  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
  ~SoxEffectsChain();
  // Executes the assembled chain.
  void run();
  // Input side: an in-memory waveform or an opened libsox file handle.
  void addInputTensor(
      py::array* waveform,
      int64_t sample_rate,
      bool channels_first);
  void addInputFile(sox_format_t* sf);
  // Output side: a sample buffer or an opened libsox file handle.
  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
  void addOutputFile(sox_format_t* sf);
  // Adds an effect given as {name, option...}; throws std::runtime_error on
  // an empty, unsupported or misconfigured effect.
  void addEffect(const std::vector<std::string> effect);
  int64_t getOutputNumChannels();
  int64_t getOutputSampleRate();
 };
 // Extension of SoxEffectsChain whose input / output can be a Python
 // file-like object instead of a file path or an in-memory array.
 class SoxEffectsChainPyBind : public SoxEffectsChain {
  // Reuse the base class constructors unchanged.
  using SoxEffectsChain::SoxEffectsChain;
 public:
  // Starts the chain with an effect that decodes from `sf`, refilling
  // `buffer` (of `buffer_size` bytes) from `fileobj` as data is consumed.
  void addInputFileObj(
      sox_format_t* sf,
      char* buffer,
      uint64_t buffer_size,
      py::object* fileobj);
  // Ends the chain with an effect that encodes into `sf` and forwards the
  // encoded bytes found at `*buffer` to `fileobj`.
  void addOutputFileObj(
      sox_format_t* sf,
      char** buffer,
      size_t* buffer_size,
      py::object* fileobj);
 };
 } // namespace paddleaudio::sox_effects_chain
--- a/audio/paddleaudio/src/pybind/sox/io.cpp
+++ b/audio/paddleaudio/src/pybind/sox/io.cpp
@ -0,0 +1,279 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp with modification.
 #include "paddleaudio/src/pybind/sox/io.h"
 #include "paddleaudio/src/pybind/sox/effects.h"
 #include "paddleaudio/src/pybind/sox/types.h"
 #include "paddleaudio/src/pybind/sox/effects_chain.h"
 #include "paddleaudio/src/pybind/sox/utils.h"
 #include "paddleaudio/src/optional/optional.hpp"
 using namespace paddleaudio::sox_utils;
 namespace paddleaudio {
 namespace sox_io {
 // Opens `path` for reading and reports its metadata as
 // (sample rate, frames per channel, channels, bits per sample, encoding name).
 // `format` optionally forces the file type instead of letting libsox detect it.
 auto get_info_file(const std::string &path, 
                   const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
    const char *filetype = nullptr;
    if (format.has_value()) {
        filetype = format.value().c_str();
    }
    SoxFormat sf(sox_open_read(path.data(),
                               /*signal=*/nullptr,
                               /*encoding=*/nullptr,
                               /*filetype=*/filetype));
    validate_input_file(sf, path);

    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
    const auto num_frames =
        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
    const auto bits = static_cast<int64_t>(sf->encoding.bits_per_sample);
    return std::make_tuple(sample_rate,
                           num_frames,
                           num_channels,
                           bits,
                           get_encoding(sf->encoding.encoding));
 }
 // Translates an optional (frame_offset, num_frames) window into a list of sox
 // effects: either nothing, or a single "trim" effect expressed in samples.
 // Throws std::runtime_error for a negative offset, or for a frame count that
 // is neither -1 nor positive.
 std::vector<std::vector<std::string>> get_effects(
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames) {
  const int64_t start = frame_offset.value_or(0);
  if (start < 0) {
    throw std::runtime_error(
        "Invalid argument: frame_offset must be non-negative.");
  }
  const int64_t count = num_frames.value_or(-1);
  if (count == 0 || count < -1) {
    throw std::runtime_error(
        "Invalid argument: num_frames must be -1 or greater than 0.");
  }

  std::vector<std::vector<std::string>> effects;
  const bool has_length = (count != -1);
  if (!has_length && start == 0) {
    // Whole file requested: no trimming needed.
    return effects;
  }

  // The "s" suffix makes sox read the numbers as sample counts.
  std::vector<std::string> trim{"trim", std::to_string(start) + "s"};
  if (has_length) {
    trim.push_back("+" + std::to_string(count) + "s");
  }
  effects.push_back(trim);
  return effects;
 }
 // Reads the head of a Python file-like object into memory and reports the
 // audio metadata found there as
 // (sample rate, frames per channel, channels, bits per sample, encoding name).
 // `format` optionally forces the file type instead of letting libsox detect it.
 auto get_info_fileobj(py::object fileobj, 
                       const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
    // Use the configured sox buffer size, but never less than 4096 bytes.
    const int64_t kDefaultCapacityInBytes = 4096;
    int64_t capacity = get_buffer_size();
    if (!(capacity > kDefaultCapacityInBytes)) {
        capacity = kDefaultCapacityInBytes;
    }

    std::string storage(capacity, '\0');
    auto *head = const_cast<char *>(storage.data());
    auto bytes_read = read_fileobj(&fileobj, capacity, head);
    // If the file is shorter than 256, then libsox cannot read the header.
    auto mem_size = (bytes_read > 256) ? bytes_read : 256;

    const char *filetype = nullptr;
    if (format.has_value()) {
        filetype = format.value().c_str();
    }
    SoxFormat sf(sox_open_mem_read(head,
                                   mem_size,
                                   /*signal=*/nullptr,
                                   /*encoding=*/nullptr,
                                   /*filetype=*/filetype));
    // In case of streamed data, length can be 0
    validate_input_memfile(sf);

    const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
    const auto num_frames =
        static_cast<int64_t>(sf->signal.length / sf->signal.channels);
    const auto num_channels = static_cast<int64_t>(sf->signal.channels);
    const auto bits = static_cast<int64_t>(sf->encoding.bits_per_sample);
    return std::make_tuple(sample_rate,
                           num_frames,
                           num_channels,
                           bits,
                           get_encoding(sf->encoding.encoding));
 }
 // Loads audio from a Python file-like object.
 //
 // The optional frame window is translated into sox effects by get_effects
 // (which throws std::runtime_error on invalid values) and decoding is
 // delegated to sox_effects::apply_effects_fileobj, whose result is returned
 // unchanged.
 tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  auto effects = get_effects(frame_offset, num_frames);
  // `format` is a const reference, so std::move on it could never move;
  // pass it as-is.
  return paddleaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj), effects, normalize, channels_first, format);
 }
 // Loads audio from the file at `path`.
 //
 // The optional frame window is turned into sox effects by get_effects and
 // decoding is delegated to sox_effects::apply_effects_file, whose result is
 // returned unchanged.
 tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
    auto window_effects = get_effects(frame_offset, num_frames);
    auto loaded = paddleaudio::sox_effects::apply_effects_file(
        path, window_effects, normalize, channels_first, format);
    return loaded;
 }
 // Encodes `tensor` and writes it to the file at `path`.
 //
 // The file type is `format` when given, otherwise it is derived from `path`.
 // Throws std::runtime_error when the chosen format's constraints are violated
 // (amr-nb / htk / gsm are mono only, gsm additionally requires 8 kHz) or when
 // the output file cannot be opened.
 void save_audio_file(const std::string& path,
                     py::array tensor,
                     int64_t sample_rate,
                     bool channels_first,
                     tl::optional<double> compression,
                     tl::optional<std::string> format,
                     tl::optional<std::string> encoding,
                     tl::optional<int64_t> bits_per_sample) {
    validate_input_tensor(tensor);
    const auto filetype = [&]() {
        if (format.has_value()) return format.value();
        return get_filetype(path);
    }();
    // These checks must be real runtime errors: `assert` is compiled out when
    // NDEBUG is defined, which would let invalid input reach the encoder.
    if (filetype == "amr-nb") {
        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
        if (num_channels != 1) {
            throw std::runtime_error(
                "amr-nb format only supports single channel audio.");
        }
    } else if (filetype == "htk") {
        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
        if (num_channels != 1) {
            throw std::runtime_error(
                "htk format only supports single channel audio.");
        }
    } else if (filetype == "gsm") {
        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
        if (num_channels != 1) {
            throw std::runtime_error(
                "gsm format only supports single channel audio.");
        }
        if (sample_rate != 8000) {
            throw std::runtime_error(
                "gsm format only supports a sampling rate of 8kHz.");
        }
    }
    const auto signal_info =
        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
    const auto encoding_info = get_encodinginfo_for_save(
        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
    SoxFormat sf(sox_open_write(path.c_str(),
                                &signal_info,
                                &encoding_info,
                                /*filetype=*/filetype.c_str(),
                                /*oob=*/nullptr,
                                /*overwrite_permitted=*/nullptr));
    if (static_cast<sox_format_t*>(sf) == nullptr) {
        throw std::runtime_error(
            "Error saving audio file: failed to open file " + path);
    }
    // Pipe the tensor straight into the opened file through an effects chain.
    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
        /*output_encoding=*/sf->encoding);
    chain.addInputTensor(&tensor, sample_rate, channels_first);
    chain.addOutputFile(sf);
    chain.run();
 }
 namespace {
 // Owns a malloc-style buffer whose pointer and size are filled in from
 // outside (save_audio_fileobj passes their addresses to libsox) and frees it
 // on scope exit. Neither copyable nor movable.
 struct AutoReleaseBuffer {
  char* ptr = nullptr;
  size_t size = 0;
  AutoReleaseBuffer() = default;
  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
  // free() accepts a null pointer, so no explicit check is required.
  ~AutoReleaseBuffer() { free(ptr); }
 };
 } // namespace
 // Encodes `tensor` into an in-memory stream and writes the encoded bytes to
 // a Python file-like object.
 //
 // `format` is mandatory here. Throws std::runtime_error when it is missing,
 // when the format's constraints are violated (amr-nb / htk / gsm are mono
 // only, gsm additionally requires 8 kHz), or when the memory stream cannot
 // be opened.
 void save_audio_fileobj(
    py::object fileobj,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample) {
  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  const auto filetype = format.value();

  // Evaluated lazily so the tensor shape is only queried for formats that
  // actually restrict the channel count.
  const auto require_mono = [&](const char* message) {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(message);
    }
  };
  if (filetype == "amr-nb") {
    require_mono("amr-nb format only supports single channel audio.");
  } else if (filetype == "htk") {
    require_mono("htk format only supports single channel audio.");
  } else if (filetype == "gsm") {
    require_mono("gsm format only supports single channel audio.");
    if (sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto sig =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto enc = get_encodinginfo_for_save(
      filetype, tensor.dtype(), compression, std::move(encoding),
      bits_per_sample);

  AutoReleaseBuffer encoded;
  SoxFormat sf(sox_open_memstream_write(
      &encoded.ptr, &encoded.size, &sig, &enc, filetype.c_str(),
      /*oob=*/nullptr));
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &encoded.ptr, &encoded.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();
  fileobj.attr("write")(py::bytes(encoded.ptr, encoded.size));
 }
 }  // namespace sox_io
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/io.h
+++ b/audio/paddleaudio/src/pybind/sox/io.h
@ -0,0 +1,61 @@
 // the code is from https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.h with modification.
 #pragma once
 #include "paddleaudio/src/pybind/sox/utils.h"
 namespace py = pybind11;
 namespace paddleaudio {
 namespace sox_io {
 // Returns (sample rate, frames per channel, channels, bits per sample,
 // encoding name) for the audio file at `path`. `format` optionally forces
 // the file type.
 auto get_info_file(const std::string &path, 
                   const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
 // Same tuple as get_info_file, read from a Python file-like object.
 auto get_info_fileobj(py::object fileobj,
                   const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
 // Loads audio from a Python file-like object; `frame_offset` / `num_frames`
 // select an optional window (see get_effects).
 tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format);
 // Encodes `tensor` and writes it to a Python file-like object. `format` is
 // required; std::runtime_error is thrown when it is missing.
 void save_audio_fileobj(
    py::object fileobj,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample);
 // Builds the sox effect list (at most one "trim" effect) for the given
 // window. Throws std::runtime_error when `frame_offset` is negative or
 // `num_frames` is neither -1 nor positive.
 auto get_effects(const tl::optional<int64_t>& frame_offset,
                 const tl::optional<int64_t>& num_frames)
    -> std::vector<std::vector<std::string>>;
 // Loads audio from the file at `path`; `frame_offset` / `num_frames` select
 // an optional window (see get_effects).
 tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format);
 // Encodes `tensor` and writes it to the file at `path`. When `format` is
 // absent the file type is derived from `path`.
 void save_audio_file(const std::string& path,
                     py::array tensor,
                     int64_t sample_rate,
                     bool channels_first,
                     tl::optional<double> compression,
                     tl::optional<std::string> format,
                     tl::optional<std::string> encoding,
                     tl::optional<int64_t> bits_per_sample);    
 }  // namespace sox_io
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.cpp
+++ b/audio/paddleaudio/src/pybind/sox/types.cpp
@ -0,0 +1,143 @@
 //code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
 #include "paddleaudio/src/pybind/sox/types.h"
 #include <ostream>
 #include <sstream>
 namespace paddleaudio {
 namespace sox_utils {
 // Maps a format name such as "wav" or "flac" to the matching Format value.
 // Both "ogg" and "vorbis" map to Format::VORBIS.
 // Throws std::runtime_error for any name that is not listed.
 Format get_format_from_string(const std::string& format) {
  struct NamedFormat {
    const char* name;
    Format value;
  };
  static const NamedFormat kKnownFormats[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };
  for (const auto& candidate : kKnownFormats) {
    if (format == candidate.name) {
      return candidate.value;
    }
  }
  throw std::runtime_error(
      "Internal Error: unexpected format value: " + format);
 }
 // Returns the short textual label of an Encoding value (e.g. "PCM_S").
 // Throws std::runtime_error for values without a label, which includes
 // Encoding::NOT_PROVIDED.
 std::string to_string(Encoding v) {
  const char* label = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:
      label = "UNKNOWN";
      break;
    case Encoding::PCM_SIGNED:
      label = "PCM_S";
      break;
    case Encoding::PCM_UNSIGNED:
      label = "PCM_U";
      break;
    case Encoding::PCM_FLOAT:
      label = "PCM_F";
      break;
    case Encoding::FLAC:
      label = "FLAC";
      break;
    case Encoding::ULAW:
      label = "ULAW";
      break;
    case Encoding::ALAW:
      label = "ALAW";
      break;
    case Encoding::MP3:
      label = "MP3";
      break;
    case Encoding::VORBIS:
      label = "VORBIS";
      break;
    case Encoding::AMR_WB:
      label = "AMR_WB";
      break;
    case Encoding::AMR_NB:
      label = "AMR_NB";
      break;
    case Encoding::OPUS:
      label = "OPUS";
      break;
    default:
      break;
  }
  if (label == nullptr) {
    throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return label;
 }
 // Parses an optional encoding name ("PCM_S", "PCM_U", "PCM_F", "ULAW",
 // "ALAW"). An absent option yields Encoding::NOT_PROVIDED; any other string
 // throws std::runtime_error.
 Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }
  struct NamedEncoding {
    const char* name;
    Encoding value;
  };
  static const NamedEncoding kAccepted[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };
  const std::string requested = encoding.value();
  for (const auto& candidate : kAccepted) {
    if (requested == candidate.name) {
      return candidate.value;
    }
  }
  throw std::runtime_error(
      "Internal Error: unexpected encoding value: " + requested);
 }
 // Parses an optional bit depth (8, 16, 24, 32 or 64). An absent option
 // yields BitDepth::NOT_PROVIDED; any other number throws std::runtime_error.
 BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }
  struct NumberedDepth {
    int64_t bits;
    BitDepth value;
  };
  static const NumberedDepth kAccepted[] = {
      {8, BitDepth::B8},
      {16, BitDepth::B16},
      {24, BitDepth::B24},
      {32, BitDepth::B32},
      {64, BitDepth::B64},
  };
  const int64_t requested = bit_depth.value();
  for (const auto& candidate : kAccepted) {
    if (requested == candidate.bits) {
      return candidate.value;
    }
  }
  std::ostringstream message;
  message << "Internal Error: unexpected bit depth value: " << requested;
  throw std::runtime_error(message.str());
 }
 // Returns the short textual label of a libsox encoding constant.
 // Encodings without an entry are reported as "UNKNOWN".
 std::string get_encoding(sox_encoding_t encoding) {
  struct NamedSoxEncoding {
    sox_encoding_t id;
    const char* label;
  };
  static const NamedSoxEncoding kLabels[] = {
      {SOX_ENCODING_UNKNOWN, "UNKNOWN"},
      {SOX_ENCODING_SIGN2, "PCM_S"},
      {SOX_ENCODING_UNSIGNED, "PCM_U"},
      {SOX_ENCODING_FLOAT, "PCM_F"},
      {SOX_ENCODING_FLAC, "FLAC"},
      {SOX_ENCODING_ULAW, "ULAW"},
      {SOX_ENCODING_ALAW, "ALAW"},
      {SOX_ENCODING_MP3, "MP3"},
      {SOX_ENCODING_VORBIS, "VORBIS"},
      {SOX_ENCODING_AMR_WB, "AMR_WB"},
      {SOX_ENCODING_AMR_NB, "AMR_NB"},
      {SOX_ENCODING_OPUS, "OPUS"},
      {SOX_ENCODING_GSM, "GSM"},
  };
  for (const auto& candidate : kLabels) {
    if (candidate.id == encoding) {
      return candidate.label;
    }
  }
  return "UNKNOWN";
 }
 } // namespace sox_utils
 } // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/types.h
+++ b/audio/paddleaudio/src/pybind/sox/types.h
@ -0,0 +1,58 @@
 //code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
 #pragma once
 #include <sox.h>
 #include "paddleaudio/src/optional/optional.hpp"
 namespace paddleaudio {
 namespace sox_utils {
 // Audio container / file formats that get_format_from_string can produce.
 enum class Format {
  WAV,
  MP3,
  FLAC,
  VORBIS,
  AMR_NB,
  AMR_WB,
  AMB,
  SPHERE,
  GSM,
  HTK,
 };
 // Converts a format name (e.g. "wav", "flac") to the matching Format value.
 Format get_format_from_string(const std::string& format);
 // Sample encodings. NOT_PROVIDED marks an option the caller left unset and
 // is distinct from UNKNOWN.
 enum class Encoding {
  NOT_PROVIDED,
  UNKNOWN,
  PCM_SIGNED,
  PCM_UNSIGNED,
  PCM_FLOAT,
  FLAC,
  ULAW,
  ALAW,
  MP3,
  VORBIS,
  AMR_WB,
  AMR_NB,
  OPUS,
 };
 // Textual label of an Encoding value (e.g. "PCM_S").
 std::string to_string(Encoding v);
 // Parses an optional encoding name; an absent option maps to NOT_PROVIDED.
 Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
 // Supported bit depths; each enumerator's numeric value is the number of
 // bits, with 0 standing for "not provided".
 enum class BitDepth : unsigned {
  NOT_PROVIDED = 0,
  B8 = 8,
  B16 = 16,
  B24 = 24,
  B32 = 32,
  B64 = 64,
 };
 // Parses an optional bit depth; an absent option maps to NOT_PROVIDED.
 BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
 // Textual label of a libsox encoding constant (e.g. "PCM_S", "FLAC").
 std::string get_encoding(sox_encoding_t encoding);
 } // namespace sox_utils
 } // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/utils.cpp
+++ b/audio/paddleaudio/src/pybind/sox/utils.cpp
@ -0,0 +1,550 @@
 //code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp with modification.
 #include <sox.h>
 #include "paddleaudio/src/pybind/sox/utils.h"
 #include "paddleaudio/src/pybind/sox/types.h"
 #include <sstream>
 namespace paddleaudio {
 namespace sox_utils {
 /// Read up to `size` bytes from a Python file-like object into `buffer`.
 ///
 /// `fileobj.read(n)` is called repeatedly until `size` bytes have been
 /// copied or an empty chunk is returned.
 /// @param fileobj Python object exposing `read(n)`; each result is
 /// converted to bytes.
 /// @param size Maximum number of bytes to copy; `buffer` must have room
 /// for that many bytes.
 /// @param buffer Destination buffer.
 /// @return The number of bytes actually copied into `buffer`.
 /// @throws std::runtime_error if `read` returns more bytes than requested.
 auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
    -> uint64_t {
    uint64_t num_read = 0;
    while (num_read < size) {
        auto request = size - num_read;
        auto chunk = static_cast<std::string>(
            static_cast<py::bytes>(fileobj->attr("read")(request)));
        auto chunk_len = chunk.length();
        // An empty chunk is treated as end of stream.
        if (chunk_len == 0) {
            break;
        }
        // Copying an oversized chunk would overflow `buffer`, so reject it.
        if (chunk_len > request) {
            std::ostringstream message;
            message
                << "Requested up to " << request << " bytes but "
                << "received " << chunk_len << " bytes. "
                << "The given object does not conform to read protocol of file "
                   "object.";
            throw std::runtime_error(message.str());
        }
        memcpy(buffer, chunk.data(), chunk_len);
        buffer += chunk_len;
        num_read += chunk_len;
    }
    return num_read;
 }
 /// Store `seed` (narrowed to sox_int32_t) in libsox's global `ranqd1` field.
 void set_seed(const int64_t seed) {
  auto* globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
 }
 /// Set libsox's global verbosity level.
 void set_verbosity(const int64_t verbosity) {
  auto* globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
 }
 /// Set libsox's global `use_threads` switch.
 void set_use_threads(const bool use_threads) {
  auto* globals = sox_get_globals();
  globals->use_threads = static_cast<sox_bool>(use_threads);
 }
 /// Set libsox's global buffer size (`bufsiz`).
 void set_buffer_size(const int64_t buffer_size) {
  auto* globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
 }
 /// Return libsox's current global buffer size (`bufsiz`).
 int64_t get_buffer_size() {
  const auto* globals = sox_get_globals();
  return globals->bufsiz;
 }
 std::vector<std::vector<std::string>> list_effects() {
  std::vector<std::vector<std::string>> effects;
  for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
    const sox_effect_handler_t* handler = (*fns)();
    if (handler && handler->name) {
      if (UNSUPPORTED_EFFECTS.find(handler->name) ==
          UNSUPPORTED_EFFECTS.end()) {
        effects.emplace_back(std::vector<std::string>{
            handler->name,
            handler->usage ? std::string(handler->usage) : std::string("")});
      }
    }
  }
  return effects;
 }
 std::vector<std::string> list_write_formats() {
  std::vector<std::string> formats;
  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
    const sox_format_handler_t* handler = fns->fn();
    for (const char* const* names = handler->names; *names; ++names) {
      if (!strchr(*names, '/') && handler->write)
        formats.emplace_back(*names);
    }
  }
  return formats;
 }
 std::vector<std::string> list_read_formats() {
  std::vector<std::string> formats;
  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
    const sox_format_handler_t* handler = fns->fn();
    for (const char* const* names = handler->names; *names; ++names) {
      if (!strchr(*names, '/') && handler->read)
        formats.emplace_back(*names);
    }
  }
  return formats;
 }
 /// Wrap `fd`; the wrapper closes it on destruction or on close().
 SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
 /// Close the wrapped handle if it is still open.
 SoxFormat::~SoxFormat() {
  this->close();
 }
 /// Member access on the wrapped handle.
 sox_format_t* SoxFormat::operator->() const noexcept {
  return this->fd_;
 }
 /// Implicit conversion to the raw handle (null once closed).
 SoxFormat::operator sox_format_t*() const noexcept {
  return this->fd_;
 }
 /// Close the handle via sox_close() and reset it to null; calling this
 /// again afterwards does nothing.
 void SoxFormat::close() {
  if (fd_ == nullptr) {
    return;
  }
  sox_close(fd_);
  fd_ = nullptr;
 }
 void validate_input_file(const SoxFormat& sf, const std::string& path) {
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error loading audio file: failed to open file " + path);
  }
  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    throw std::runtime_error("Error loading audio file: unknown encoding.");
  }
 }
 /// Same checks as validate_input_file(), reporting the source as
 /// "<in memory buffer>".
 void validate_input_memfile(const SoxFormat &sf) {
    const std::string source_name = "<in memory buffer>";
    validate_input_file(sf, source_name);
 }
 /// Reject arrays that are not 2-D or whose dtype code is not accepted.
 /// @throws std::runtime_error on either violation.
 // NOTE(review): the accepted NumPy type codes are 'f', 'd', 'l' and 'i',
 // while the error message names float32, int32, int16 and uint8 — confirm
 // which of the two reflects the intended contract.
 void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }
  switch (tensor.dtype().char_()) {
    case 'f':
    case 'd':
    case 'l':
    case 'i':
      return;
    default:
      throw std::runtime_error(
          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
 }
 /// Choose the NumPy dtype used to hand decoded samples back to Python.
 /// @param encoding libsox encoding of the source audio.
 /// @param precision Bits of precision; consulted only for signed PCM.
 /// @return uint8 for unsigned PCM, int16 or C int for signed PCM, float32
 /// for every other encoding.
 /// @throws std::runtime_error for signed PCM with a precision other than
 /// 16, 24 or 32.
 py::dtype get_dtype(
    const sox_encoding_t encoding,
    const unsigned precision) {
    switch (encoding) {
      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
        // Must be a string: 'u1' in single quotes is a multi-character
        // integer constant, not a NumPy format string.
        return py::dtype("u1");
      case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
        switch (precision) {
          case 16:
            return py::dtype("i2");
          case 24: // Cast 24-bit to 32-bit.
          case 32:
            return py::dtype("i");
          default:
            throw std::runtime_error(
                "Only 16, 24, and 32 bits are supported for signed PCM.");
        }
      default:
        // default to float32 for the other formats, including
        // 32-bit floating-point WAV,
        // MP3,
        // FLAC,
        // VORBIS etc...
        return py::dtype("f");
    }
 }
 namespace {
 /// Copy a [num_rows, num_channels] array with element type T into a newly
 /// allocated [num_channels, num_rows] array of dtype `dtype`.
 template <typename T>
 py::array to_channels_first(
    const py::array& frames_by_channel,
    const py::dtype& dtype,
    const int32_t num_rows,
    const int32_t num_channels) {
  py::array transposed = py::array(dtype, {num_channels, num_rows});
  for (int32_t ch = 0; ch < num_channels; ++ch) {
    for (int32_t frame = 0; frame < num_rows; ++frame) {
      *static_cast<T*>(transposed.mutable_data(ch, frame)) =
          *static_cast<const T*>(frames_by_channel.data(frame, ch));
    }
  }
  return transposed;
 }
 } // namespace
 /// Convert an interleaved sox_sample_t buffer into a 2-D NumPy array.
 ///
 /// @param buffer Interleaved samples; at least `num_samples` entries.
 /// @param num_samples Total number of samples in `buffer` (all channels).
 /// @param num_channels Number of interleaved channels; must be positive.
 /// @param dtype Requested output dtype, identified by its NumPy type code:
 /// 'f' (float32), 'i' (C int), 'h' (int16), 'B'/'b' (8-bit).
 /// @param normalize When true the output is always float32, regardless of
 /// `dtype`.
 /// @param channels_first When true the result has shape
 /// [num_channels, num_frames]; otherwise [num_frames, num_channels].
 /// @throws std::runtime_error if `num_channels` is not positive or `dtype`
 /// is not one of the supported type codes.
 py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  if (num_channels <= 0) {
    // Guards the division below.
    throw std::runtime_error("Number of channels must be positive.");
  }
  // `dummy` receives the clip counter the libsox conversion macros update.
  uint64_t dummy = 0;
  SOX_SAMPLE_LOCALS;
  const int32_t num_rows = num_samples / num_channels;
  // Only whole frames fit in the output array; a trailing partial frame is
  // ignored instead of being written past the end of the allocation.
  const int32_t num_elements = num_rows * num_channels;
  const char type_code = dtype.char_();
  py::array t;
  if (normalize || type_code == 'f') {
    // The values written here are floats, so the array must be float32 even
    // when the caller passed an integer dtype together with normalize=true.
    const py::dtype float_dtype("f");
    t = py::array(float_dtype, {num_rows, num_channels});
    auto ptr = static_cast<float*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<float>(t, float_dtype, num_rows, num_channels);
    }
  } else if (type_code == 'i') {
    t = py::array(dtype, {num_rows, num_channels});
    auto ptr = static_cast<int*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = buffer[i];
    }
    if (channels_first) {
      return to_channels_first<int>(t, dtype, num_rows, num_channels);
    }
  } else if (type_code == 'h') { // int16
    t = py::array(dtype, {num_rows, num_channels});
    auto ptr = static_cast<int16_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<int16_t>(t, dtype, num_rows, num_channels);
    }
  } else if (type_code == 'B' || type_code == 'b') {
    // 'B' is NumPy's uint8 code (what a "u1" dtype reports); 'b' is kept
    // for callers that relied on the previous check.
    t = py::array(dtype, {num_rows, num_channels});
    auto ptr = static_cast<uint8_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<uint8_t>(t, dtype, num_rows, num_channels);
    }
  } else {
    throw std::runtime_error("Unsupported dtype.");
  }
  return t;
 }
 /// Return the lower-cased text after the last '.' in `path`.
 /// When `path` contains no '.', the whole path is returned lower-cased.
 const std::string get_filetype(const std::string path) {
  std::string ext = path.substr(path.find_last_of(".") + 1);
  // tolower() is only defined for values representable as unsigned char
  // (or EOF), so convert before calling it to stay safe for bytes >= 0x80.
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
 }
 namespace {
 /// Resolve the libsox encoding and bit depth for WAV-style output (used by
 /// get_save_encoding() for Format::WAV and Format::AMB).
 ///
 /// @param format Format name; only used to build error messages.
 /// @param dtype Array dtype; consulted only when neither `encoding` nor
 /// `bits_per_sample` is provided.
 /// @param encoding Requested encoding, or Encoding::NOT_PROVIDED.
 /// @param bits_per_sample Requested bit depth, or BitDepth::NOT_PROVIDED.
 /// @return Tuple of (libsox encoding, bits per sample).
 /// @throws std::runtime_error for unsupported encoding / bit-depth
 /// combinations and for unexpected dtypes.
 std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
    const std::string format,
    py::dtype dtype,
    const Encoding& encoding,
    const BitDepth& bits_per_sample) {
  switch (encoding) {
    case Encoding::NOT_PROVIDED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
          // Nothing was requested explicitly: derive both values from the
          // NumPy type number of the array.
          switch (dtype.num()) {
            case 11: // NumPy type number of float32 (NPY_FLOAT)
              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
            case 5: // NumPy type number of C int (NPY_INT)
              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
            case 3: // NumPy type number of int16 (NPY_SHORT)
              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
            // NOTE(review): 1 is NPY_BYTE (signed 8-bit); uint8 is type
            // number 2 — confirm which one is intended here.
            case 1: // NumPy type number of byte (NPY_BYTE)
              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
            default:
              throw std::runtime_error("Internal Error: Unexpected dtype.");
          }
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
        default:
          // Any other explicit bit depth is written as signed PCM.
          return std::make_tuple<>(
              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
      }
    case Encoding::PCM_SIGNED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
        case BitDepth::B8:
          throw std::runtime_error(
              format + " does not support 8-bit signed PCM encoding.");
        default:
          return std::make_tuple<>(
              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
      }
    case Encoding::PCM_UNSIGNED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for unsigned PCM encoding.");
      }
    case Encoding::PCM_FLOAT:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B32:
          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
        case BitDepth::B64:
          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
        default:
          throw std::runtime_error(
              format +
              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
      }
    case Encoding::ULAW:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for mu-law encoding.");
      }
    case Encoding::ALAW:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for a-law encoding.");
      }
    default:
      throw std::runtime_error(
          format + " does not support encoding: " + to_string(encoding));
  }
 }
 /// Resolve the libsox encoding and bit depth to use when saving audio.
 ///
 /// @param format Format name, parsed with get_format_from_string().
 /// @param dtype Array dtype; only forwarded to the WAV/AMB resolution.
 /// @param encoding Optional encoding name, parsed with
 /// get_encoding_from_option().
 /// @param bits_per_sample Optional bit depth, parsed with
 /// get_bit_depth_from_option().
 /// @return Tuple of (libsox encoding, bits per sample).
 /// @throws std::runtime_error when the format is unsupported or does not
 /// accept the given `encoding` / `bits_per_sample`.
 std::tuple<sox_encoding_t, unsigned> get_save_encoding(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const Format fmt = get_format_from_string(format);
  const Encoding enc = get_encoding_from_option(encoding);
  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
  switch (fmt) {
    case Format::WAV:
    case Format::AMB:
      return get_save_encoding_for_wav(format, dtype, enc, bps);
    // mp3, htk, vorbis, amr-nb and gsm accept neither option and always
    // report 16 bits.
    case Format::MP3:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("mp3 does not support `encoding` option.");
      if (bps != BitDepth::NOT_PROVIDED)
        throw std::runtime_error(
            "mp3 does not support `bits_per_sample` option.");
      return std::make_tuple<>(SOX_ENCODING_MP3, 16);
    case Format::HTK:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("htk does not support `encoding` option.");
      if (bps != BitDepth::NOT_PROVIDED)
        throw std::runtime_error(
            "htk does not support `bits_per_sample` option.");
      return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
    case Format::VORBIS:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("vorbis does not support `encoding` option.");
      if (bps != BitDepth::NOT_PROVIDED)
        throw std::runtime_error(
            "vorbis does not support `bits_per_sample` option.");
      return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
    case Format::AMR_NB:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("amr-nb does not support `encoding` option.");
      if (bps != BitDepth::NOT_PROVIDED)
        throw std::runtime_error(
            "amr-nb does not support `bits_per_sample` option.");
      return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
    case Format::FLAC:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("flac does not support `encoding` option.");
      switch (bps) {
        case BitDepth::B32:
        case BitDepth::B64:
          throw std::runtime_error(
              "flac does not support `bits_per_sample` larger than 24.");
        default:
          // Includes NOT_PROVIDED, which is passed on as 0 bits.
          return std::make_tuple<>(
              SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
      }
    case Format::SPHERE:
      switch (enc) {
        case Encoding::NOT_PROVIDED:
        case Encoding::PCM_SIGNED:
          switch (bps) {
            case BitDepth::NOT_PROVIDED:
              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
            default:
              return std::make_tuple<>(
                  SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
          }
        case Encoding::PCM_UNSIGNED:
          throw std::runtime_error(
              "sph does not support unsigned integer PCM.");
        case Encoding::PCM_FLOAT:
          throw std::runtime_error("sph does not support floating point PCM.");
        case Encoding::ULAW:
          switch (bps) {
            case BitDepth::NOT_PROVIDED:
            case BitDepth::B8:
              return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
            default:
              throw std::runtime_error(
                  "sph only supports 8-bit for mu-law encoding.");
          }
        case Encoding::ALAW:
          switch (bps) {
            case BitDepth::NOT_PROVIDED:
            case BitDepth::B8:
              return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
            // NOTE(review): unlike the mu-law case, other bit depths are
            // passed through here instead of being rejected — confirm
            // this asymmetry is intended.
            default:
              return std::make_tuple<>(
                  SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
          }
        default:
          throw std::runtime_error(
              "sph does not support encoding: " + encoding.value());
      }
    case Format::GSM:
      if (enc != Encoding::NOT_PROVIDED)
        throw std::runtime_error("gsm does not support `encoding` option.");
      if (bps != BitDepth::NOT_PROVIDED)
        throw std::runtime_error(
            "gsm does not support `bits_per_sample` option.");
      return std::make_tuple<>(SOX_ENCODING_GSM, 16);
    default:
      throw std::runtime_error("Unsupported format: " + format);
  }
 }
 /// Return the sample precision (in bits) reported for a file type.
 /// @param filetype Lower-case file type such as "wav" or "flac".
 /// @param dtype Array dtype; only consulted for "wav" and "amb".
 /// @return SOX_UNSPEC for mp3/ogg/vorbis, a fixed bit count for the other
 /// known types, or a dtype-dependent count for wav/amb.
 /// @throws std::runtime_error for unknown file types or unsupported dtypes.
 unsigned get_precision(const std::string filetype, py::dtype dtype) {
  const bool unspecified =
      filetype == "mp3" || filetype == "ogg" || filetype == "vorbis";
  if (unspecified) {
    return SOX_UNSPEC;
  }
  if (filetype == "flac") {
    return 24;
  }
  if (filetype == "sph") {
    return 32;
  }
  if (filetype == "amr-nb" || filetype == "gsm" || filetype == "htk") {
    return 16;
  }
  if (filetype != "wav" && filetype != "amb") {
    throw std::runtime_error("Unsupported file type: " + filetype);
  }
  // wav / amb: precision follows the NumPy type number of the array.
  switch (dtype.num()) {
    case 1: // NumPy byte
      return 8;
    case 3: // NumPy short
      return 16;
    case 5: // NumPy int
    case 11: // NumPy float
      return 32;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }
 }
 } // namespace
 /// Build the sox_signalinfo_t describing a 2-D waveform array.
 /// @param waveform Array whose channel axis is 0 when `channels_first` is
 /// true and 1 otherwise.
 /// @param sample_rate Sample rate stored in the `rate` field.
 /// @param filetype File type forwarded to get_precision().
 /// @param channels_first Selects which axis holds the channels.
 sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const int channel_axis = channels_first ? 0 : 1;
  const auto rate = static_cast<sox_rate_t>(sample_rate);
  const auto channels = static_cast<unsigned>(waveform->shape(channel_axis));
  const unsigned precision = get_precision(filetype, waveform->dtype());
  // `length` is the total element count of the array.
  const auto length = static_cast<uint64_t>(waveform->size());
  return sox_signalinfo_t{rate, channels, precision, length};
 }
 /// Build the sox_encodinginfo_t that matches an array dtype.
 /// The dtype is identified by its NumPy type number: 1 (byte), 3 (short),
 /// 5 (int) and 11 (float) are supported.
 /// @throws std::runtime_error for any other dtype.
 sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t encoding;
  unsigned bits_per_sample;
  switch (dtype.num()) {
    case 1: // byte
      encoding = SOX_ENCODING_UNSIGNED;
      bits_per_sample = 8;
      break;
    case 3: // short
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 16;
      break;
    case 5: // int32
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 32;
      break;
    case 11: // float
      encoding = SOX_ENCODING_FLOAT;
      bits_per_sample = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }
  return sox_encodinginfo_t{
      /*encoding=*/encoding,
      /*bits_per_sample=*/bits_per_sample,
      /*compression=*/HUGE_VAL,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
 }
 /// Build the sox_encodinginfo_t used when saving audio.
 /// Encoding and bit depth come from get_save_encoding(); `compression`
 /// falls back to HUGE_VAL when not provided.
 sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const std::tuple<sox_encoding_t, unsigned> resolved =
      get_save_encoding(format, dtype, encoding, bits_per_sample);
  const sox_encoding_t sox_encoding = std::get<0>(resolved);
  const unsigned bits = std::get<1>(resolved);
  const double compression_level = compression.value_or(HUGE_VAL);
  return sox_encodinginfo_t{
      /*encoding=*/sox_encoding,
      /*bits_per_sample=*/bits,
      /*compression=*/compression_level,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
 }
 }  // namespace sox_utils
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/sox/utils.h
+++ b/audio/paddleaudio/src/pybind/sox/utils.h
@ -0,0 +1,114 @@
 //code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h with modification.
 #pragma once
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
 #include <sox.h>
 #include "paddleaudio/src/optional/optional.hpp"
 namespace py = pybind11;
 namespace paddleaudio {
 namespace sox_utils {
 auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
 void set_seed(const int64_t seed);
 void set_verbosity(const int64_t verbosity);
 void set_use_threads(const bool use_threads);
 void set_buffer_size(const int64_t buffer_size);
 int64_t get_buffer_size();
 std::vector<std::vector<std::string>> list_effects();
 std::vector<std::string> list_read_formats();
 std::vector<std::string> list_write_formats();
 ////////////////////////////////////////////////////////////////////////////////
 // Utilities for sox_io / sox_effects implementations
 ////////////////////////////////////////////////////////////////////////////////
 /// Effect names that list_effects() leaves out of its result.
 const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
 /// Helper class that owns a sox_format_t* and closes it automatically.
 /// Copying and moving are disabled, so exactly one wrapper refers to a
 /// given handle.
 struct SoxFormat {
  /// Take ownership of `fd`.
  explicit SoxFormat(sox_format_t* fd) noexcept;
  SoxFormat(const SoxFormat& other) = delete;
  SoxFormat(SoxFormat&& other) = delete;
  SoxFormat& operator=(const SoxFormat& other) = delete;
  SoxFormat& operator=(SoxFormat&& other) = delete;
  ~SoxFormat();
  /// Access members of the wrapped handle.
  sox_format_t* operator->() const noexcept;
  /// Implicit conversion to the raw handle.
  operator sox_format_t*() const noexcept;
  /// Close the wrapped handle.
  void close();
 private:
  // Wrapped handle.
  sox_format_t* fd_;
 };
 ///
 /// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
 void validate_input_tensor(const py::array);
 void validate_input_file(const SoxFormat& sf, const std::string& path);
 void validate_input_memfile(const SoxFormat &sf);
 ///
 /// Get target dtype for the given encoding and precision.
 py::dtype get_dtype(
    const sox_encoding_t encoding,
    const unsigned precision);
 ///
 /// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
 /// NOTE: This function might modify the values in the input buffer to
 /// reduce the number of memory copy.
 /// @param buffer Pointer to buffer that contains audio data.
 /// @param num_samples The number of samples to read.
 /// @param num_channels The number of channels. Used to reshape the resulting
 /// Tensor.
 /// @param dtype Target dtype. Determines the output dtype and value range in
 /// conjunction with normalization.
 /// @param normalize Perform normalization. Only effective when dtype is not
 /// kFloat32. When effective, the output tensor is kFloat32 type and value range
 /// is [-1.0, 1.0]
 /// @param channels_first When True, output Tensor has shape of [num_channels,
 /// num_frames].
 py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first);
 /// Extract extension from file path
 const std::string get_filetype(const std::string path);
 /// Get sox_signalinfo_t for passing a py::array object.
 sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first);
 /// Get sox_encodinginfo_t for Tensor I/O
 sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
 /// Get sox_encodinginfo_t for saving to file/file object
 sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample);
 }  // namespace sox_utils
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/utils.cpp
+++ b/audio/paddleaudio/src/utils.cpp
@ -0,0 +1,35 @@
 // this is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/utils.cpp with modification.
 namespace paddleaudio {
 namespace {
 // NOTE(review): these helpers have internal linkage and nothing in the
 // visible part of this file calls them — confirm how they are meant to be
 // exposed.
 // Reports whether this file was compiled with INCLUDE_SOX defined.
 bool is_sox_available() {
 #if defined(INCLUDE_SOX)
    return true;
 #else
    return false;
 #endif
 }
 // Reports whether this file was compiled with INCLUDE_KALDI defined.
 bool is_kaldi_available() {
 #if defined(INCLUDE_KALDI)
    return true;
 #else
    return false;
 #endif
 }
 // Reports whether paddleaudio was compiled with ffmpeg (USE_FFMPEG
 // defined); this says nothing about availability at runtime.
 bool is_ffmpeg_available() {
 #if defined(USE_FFMPEG)
    return true;
 #else
    return false;
 #endif
 }
 }  // namespace
 }  // namespace paddleaudio
--- a/audio/paddleaudio/third_party/.gitignore
+++ b/audio/paddleaudio/third_party/.gitignore
@ -0,0 +1,2 @@
 archives/
 install/
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@ -0,0 +1,15 @@
 # Compile the C++ sources of the subdirectories below with hidden symbol
 # visibility by default.
 string(APPEND CMAKE_CXX_FLAGS " -fvisibility=hidden")
 ################################################################################
 # sox (only when BUILD_SOX is enabled)
 ################################################################################
 if(BUILD_SOX)
  add_subdirectory(sox)
 endif()
 ################################################################################
 # kaldi (only when BUILD_KALDI is enabled)
 ################################################################################
 if(BUILD_KALDI)
  add_subdirectory(kaldi)
 endif()
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
@ -0,0 +1,111 @@
 # Compile kaldi without openfst
 # (see thirdparty/kaldi/base/kaldi-types.h for the effect of this define).
 add_definitions("-DCOMPILE_WITHOUT_OPENFST")
 # The kaldi sources are taken from the speechx tree. Each directory is
 # checked on its own so that an interrupted or partial copy is completed on
 # the next configure instead of being skipped because `base` already exists.
 set(speechx_kaldi_dir "${CMAKE_CURRENT_LIST_DIR}/../../../../speechx/speechx/kaldi")
 foreach(kaldi_component base feat matrix util)
    if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/${kaldi_component}")
        file(COPY "${speechx_kaldi_dir}/${kaldi_component}" DESTINATION "${CMAKE_CURRENT_LIST_DIR}")
    endif()
 endforeach()
 # kaldi-base: static library built from the base/ sources.
 add_library(kaldi-base STATIC
  base/io-funcs.cc
  base/kaldi-error.cc
  base/kaldi-math.cc
  base/kaldi-utils.cc
  base/timer.cc
 )
 target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 # kaldi-matrix: static library built from the matrix/ sources; links
 # kaldi-base and OpenBLAS.
 add_library(kaldi-matrix STATIC
  matrix/compressed-matrix.cc
  matrix/matrix-functions.cc
  matrix/kaldi-matrix.cc
  matrix/kaldi-vector.cc
  matrix/optimization.cc
  matrix/packed-matrix.cc
  matrix/qr.cc
  matrix/sparse-matrix.cc
  matrix/sp-matrix.cc
  matrix/srfft.cc
  matrix/tp-matrix.cc
 )
 target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 # The OpenBLAS name differs between MSVC and other toolchains; both names
 # are presumably defined outside this file — TODO confirm.
 if (NOT MSVC)
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
 else()
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
 endif()
 # kaldi-util: static library built from the util/ sources.
 add_library(kaldi-util STATIC
  util/kaldi-holder.cc
  util/kaldi-io.cc
  util/kaldi-semaphore.cc
  util/kaldi-table.cc
  util/kaldi-thread.cc
  util/parse-options.cc
  util/simple-io-funcs.cc
  util/simple-options.cc
  util/text-utils.cc
 )
 target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
 # kaldi-feat-common: feature-extraction sources shared by the mfcc and
 # fbank libraries below.
 add_library(kaldi-feat-common STATIC
  feat/cmvn.cc
  feat/feature-functions.cc
  feat/feature-window.cc
  feat/mel-computations.cc
  feat/pitch-functions.cc
  feat/resample.cc
  feat/signal.cc
  feat/wave-reader.cc
 )
 target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
 # kaldi-mfcc: static library built from feat/feature-mfcc.cc.
 add_library(kaldi-mfcc STATIC
  feat/feature-mfcc.cc
 )
 target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
 # kaldi-fbank: static library built from feat/feature-fbank.cc.
 add_library(kaldi-fbank STATIC
  feat/feature-fbank.cc
 )
 target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
 # Archive paths of the static libraries above, used by the APPLE and
 # default link lines below.
 # NOTE(review): these paths assume the archives are written directly to
 # CMAKE_CURRENT_BINARY_DIR with the `lib<name>.a` naming — confirm no
 # custom archive output directory is in effect.
 set(KALDI_LIBRARIES
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
 )
 # libkaldi: interface target that bundles all kaldi libraries for consumers.
 add_library(libkaldi INTERFACE)
 # The archives are referenced by path on some platforms, so the build order
 # has to be stated explicitly.
 add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
 target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 # Platform-specific link lines: APPLE links the archives plus gfortran
 # runtime libraries from GFORTRAN_LIBRARIES_DIR, MSVC links the targets by
 # name, and the default branch wraps the archives in --whole-archive inside
 # a linker group.
 if (APPLE)
    target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib)
 elseif (MSVC)
    target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas)
 else()
    target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group)
 endif()
 # Consumers of libkaldi get the same define as the kaldi sources themselves.
 target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
--- a/audio/paddleaudio/third_party/patches/config.guess
+++ b/audio/paddleaudio/third_party/patches/config.guess
--- a/audio/paddleaudio/third_party/patches/config.sub
+++ b/audio/paddleaudio/third_party/patches/config.sub
--- a/audio/paddleaudio/third_party/patches/libmad.patch
+++ b/audio/paddleaudio/third_party/patches/libmad.patch
@ -0,0 +1,86 @@
 See the followings for the origin of this patch
 http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
 http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
 --- src/libmad/configure	2004-02-05 09:34:07.000000000 +0000
 +++ src/libmad/configure.new	2020-06-30 21:10:28.528018931 +0000
@@ -19083,71 +19083,7 @@
 if test "$GCC" = yes
 then
 -    if test -z "$arch"
 -    then
 -	case "$host" in
 -	    i386-*)           ;;
 -	    i?86-*)           arch="-march=i486" ;;
 -	    arm*-empeg-*)     arch="-march=armv4 -mtune=strongarm1100" ;;
 -	    armv4*-*)         arch="-march=armv4 -mtune=strongarm" ;;
 -	    powerpc-*)        ;;
 -	    mips*-agenda-*)   arch="-mcpu=vr4100" ;;
 -	    mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
 -	esac
 -    fi
 -
 -    case "$optimize" in
 -	-O|"-O "*)
 -	    optimize="-O"
 -	    optimize="$optimize -fforce-mem"
 -	    optimize="$optimize -fforce-addr"
 -	    : #x optimize="$optimize -finline-functions"
 -	    : #- optimize="$optimize -fstrength-reduce"
 -	    optimize="$optimize -fthread-jumps"
 -	    optimize="$optimize -fcse-follow-jumps"
 -	    optimize="$optimize -fcse-skip-blocks"
 -	    : #x optimize="$optimize -frerun-cse-after-loop"
 -	    : #x optimize="$optimize -frerun-loop-opt"
 -	    : #x optimize="$optimize -fgcse"
 -	    optimize="$optimize -fexpensive-optimizations"
 -	    optimize="$optimize -fregmove"
 -	    : #* optimize="$optimize -fdelayed-branch"
 -	    : #x optimize="$optimize -fschedule-insns"
 -	    optimize="$optimize -fschedule-insns2"
 -	    : #? optimize="$optimize -ffunction-sections"
 -	    : #? optimize="$optimize -fcaller-saves"
 -	    : #> optimize="$optimize -funroll-loops"
 -	    : #> optimize="$optimize -funroll-all-loops"
 -	    : #x optimize="$optimize -fmove-all-movables"
 -	    : #x optimize="$optimize -freduce-all-givs"
 -	    : #? optimize="$optimize -fstrict-aliasing"
 -	    : #* optimize="$optimize -fstructure-noalias"
 -
 -	    case "$host" in
 -		arm*-*)
 -		    optimize="$optimize -fstrength-reduce"
 -		    ;;
 -		mips*-*)
 -		    optimize="$optimize -fstrength-reduce"
 -		    optimize="$optimize -finline-functions"
 -		    ;;
 -		i?86-*)
 -		    optimize="$optimize -fstrength-reduce"
 -		    ;;
 -		powerpc-apple-*)
 -		    # this triggers an internal compiler error with gcc2
 -		    : #optimize="$optimize -fstrength-reduce"
 -
 -		    # this is really only beneficial with gcc3
 -		    : #optimize="$optimize -finline-functions"
 -		    ;;
 -		*)
 -		    # this sometimes provokes bugs in gcc 2.95.2
 -		    : #optimize="$optimize -fstrength-reduce"
 -		    ;;
 -	    esac
 -	    ;;
 -    esac
 +    optimize="-O2"
 fi
 case "$host" in
@@ -21497,6 +21433,7 @@
 then
     case "$host" in
 	i?86-*)     FPM="INTEL"  ;;
 +	x86_64*)    FPM="64BIT"  ;;
 	arm*-*)     FPM="ARM"    ;;
 	mips*-*)    FPM="MIPS"   ;;
 	sparc*-*)   FPM="SPARC"  ;;
--- a/audio/paddleaudio/third_party/patches/sox.patch
+++ b/audio/paddleaudio/third_party/patches/sox.patch
@ -0,0 +1,16 @@
 See https://github.com/pytorch/audio/pull/1297
 diff -ru sox/src/formats.c sox/src/formats.c
 --- sox/src/formats.c	2014-10-26 19:55:50.000000000 -0700
 +++ sox/src/formats.c	2021-02-22 16:01:02.833144070 -0800
@@ -333,6 +333,10 @@
   assert(ft);
   if (!ft->fp)
     return sox_false;
 -  fstat(fileno((FILE*)ft->fp), &st);
 +  int fd = fileno((FILE*)ft->fp);
 +  if (fd < 0)
 +    return sox_false;
 +  if (fstat(fd, &st) < 0)
 +    return sox_false;
   return ((st.st_mode & S_IFMT) == S_IFREG);
 }
--- a/audio/paddleaudio/third_party/sox/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/sox/CMakeLists.txt
@ -0,0 +1,254 @@
 # Shared settings for every autotools-based ExternalProject in this file.
 find_package(PkgConfig REQUIRED)
 include(ExternalProject)
 # All third-party artefacts live next to this directory:
 #   INSTALL_DIR - common --prefix the projects are configured with
 #   ARCHIVE_DIR - download location of the source tarballs
 #   patch_dir   - local patches plus the config.guess/config.sub copied into each source tree
 set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
 set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
 set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)
 # Arguments passed to every ./configure call: static, position-independent libraries only.
 set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
 # To pass custom environment variables to the ExternalProject_Add command,
 # we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
 # https://stackoverflow.com/a/62437353
 # We construct the custom environment variables here.
 # Each list element is one NAME=VALUE pair; $ENV{...} is expanded when CMake configures.
 set(envs
  "PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
  "LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
  "CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
 )
 # Optional: libmad 0.15.1b, only built when BUILD_MAD is enabled.
 # The patch step applies libmad.patch and copies config.guess/config.sub from
 # patch_dir into the unpacked source tree before configure runs.
 if (BUILD_MAD)
  ExternalProject_Add(mad
    PREFIX ${CMAKE_CURRENT_BINARY_DIR}
    DOWNLOAD_DIR ${ARCHIVE_DIR}
    URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
    URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
    PATCH_COMMAND patch < ${patch_dir}/libmad.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
    CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
    DOWNLOAD_NO_PROGRESS ON
    LOG_DOWNLOAD ON
    LOG_UPDATE ON
    LOG_CONFIGURE ON
    LOG_BUILD ON
    LOG_INSTALL ON
    LOG_MERGED_STDOUTERR ON
    LOG_OUTPUT_ON_FAILURE ON
  )
 endif (BUILD_MAD)
 # opencore-amr 0.1.5, configured with COMMON_ARGS into INSTALL_DIR
 # (SOX_LIBRARIES below expects libopencore-amrnb.a / libopencore-amrwb.a from it).
 # The patch step only copies config.guess/config.sub from patch_dir into the source tree.
 ExternalProject_Add(amr
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
  URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # LAME 3.99.5 (libmp3lame.a in SOX_LIBRARIES). Same recipe as the other codecs,
 # with --enable-nasm added to the common configure arguments.
 ExternalProject_Add(lame
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
  URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # libogg 1.3.3. flac, vorbis and opus below list this project in DEPENDS,
 # so it is built and installed into INSTALL_DIR before them.
 ExternalProject_Add(ogg
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
  URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # FLAC 1.3.2, built after ogg. Configured with Ogg support (--with-ogg) and
 # without the C++ libraries (--disable-cpplibs).
 ExternalProject_Add(flac
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS ogg
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
  URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # libvorbis 1.3.6, built after ogg and configured with --with-ogg
 # (libvorbis.a, libvorbisenc.a and libvorbisfile.a are listed in SOX_LIBRARIES).
 ExternalProject_Add(vorbis
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS ogg
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
  URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # opus 1.3.1, built after ogg and configured with --with-ogg.
 # opusfile below depends on this project.
 ExternalProject_Add(opus
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS ogg
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
  URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # opusfile 0.12, built after opus (which in turn depends on ogg).
 # HTTP support is switched off with --disable-http.
 ExternalProject_Add(opusfile
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS opus
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
  URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
  PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # OpenMP is by default compiled against GNU OpenMP, which conflicts with the version of OpenMP that PyTorch uses.
 # See https://github.com/pytorch/audio/pull/1026
 # TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
 # NOTE(review): the rationale above refers to PyTorch; confirm it also applies to this project.
 #
 # Extra ./configure switches for sox itself: enable the codecs built above,
 # disable OpenMP and every audio-device / optional backend.
 set(SOX_OPTIONS
  --disable-openmp
  --with-amrnb
  --with-amrwb
  --with-flac
  --with-lame
  --with-oggvorbis
  --with-opus
  --without-alsa
  --without-ao
  --without-coreaudio
  --without-oss
  --without-id3tag
  --without-ladspa
  --without-magic
  --without-png
  --without-pulseaudio
  --without-sndfile
  --without-sndio
  --without-sunaudio
  --without-waveaudio
  --without-wavpack
  --without-twolame
  )
 # Static archives exposed through the libsox interface target and declared as
 # build byproducts of the sox project.
 # NOTE(review): the order (libsox first, libogg last) presumably matters for static linking - confirm before reordering.
 set(SOX_LIBRARIES
  ${INSTALL_DIR}/lib/libsox.a
  ${INSTALL_DIR}/lib/libopencore-amrnb.a
  ${INSTALL_DIR}/lib/libopencore-amrwb.a
  ${INSTALL_DIR}/lib/libmp3lame.a
  ${INSTALL_DIR}/lib/libFLAC.a
  ${INSTALL_DIR}/lib/libopusfile.a
  ${INSTALL_DIR}/lib/libopus.a
  ${INSTALL_DIR}/lib/libvorbisenc.a
  ${INSTALL_DIR}/lib/libvorbisfile.a
  ${INSTALL_DIR}/lib/libvorbis.a
  ${INSTALL_DIR}/lib/libogg.a
  )
 # External projects that must be built before sox is configured.
 # opus is not listed directly; it is pulled in through opusfile's DEPENDS.
 set(sox_depends
  ogg flac vorbis opusfile lame amr
  )
 # Toggle libmad support in sox: with BUILD_MAD the configure flag, the static
 # archive and the build dependency are added; otherwise mad is explicitly disabled.
 if (BUILD_MAD)
  list(APPEND SOX_OPTIONS --with-mad)
  list(APPEND SOX_LIBRARIES ${INSTALL_DIR}/lib/libmad.a)
  list(APPEND sox_depends mad)
 else ()
  list(APPEND SOX_OPTIONS --without-mad)
 endif ()
 # sox 14.4.2 itself, built after every codec in sox_depends.
 # sox.patch is applied with -p1, then config.guess/config.sub are copied from patch_dir.
 # BUILD_BYPRODUCTS declares the static archives so the generator knows this step produces them.
 ExternalProject_Add(sox
  PREFIX ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS ${sox_depends}
  DOWNLOAD_DIR ${ARCHIVE_DIR}
  URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
  URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
  PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
  BUILD_BYPRODUCTS ${SOX_LIBRARIES}
  DOWNLOAD_NO_PROGRESS ON
  LOG_DOWNLOAD ON
  LOG_UPDATE ON
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
  LOG_MERGED_STDOUTERR ON
  LOG_OUTPUT_ON_FAILURE ON
 )
 # Interface target consumers link against: it carries the install include
 # directory and the static archives, and add_dependencies orders the sox
 # external project before anything that depends on libsox.
 add_library(libsox INTERFACE)
 add_dependencies(libsox sox)
 target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
 target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})
--- a/audio/paddleaudio/utils/init.py
+++ b/audio/paddleaudio/utils/init.py
@ -0,0 +1,27 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .download import decompress
 from .download import download_and_decompress
 from .download import load_state_dict_from_url
 from .env import DATA_HOME
 from .env import MODEL_HOME
 from .env import PPAUDIO_HOME
 from .env import USER_HOME
 from .error import ParameterError
 from .log import Logger
 from .log import logger
 from .numeric import depth_convert
 from .numeric import pcm16to32
 from .time import seconds_to_hms
 from .time import Timer
--- a/audio/paddleaudio/utils/download.py
+++ b/audio/paddleaudio/utils/download.py
@ -0,0 +1,64 @@
 # Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 from typing import Dict
 from typing import List
 from paddle.framework import load as load_state_dict
 from paddle.utils import download
 from .log import logger
 download.logger = logger
 __all__ = [
    'decompress',
    'download_and_decompress',
    'load_state_dict_from_url',
 ]
 def decompress(file: str):
    """Unpack every member of the compressed file at ``file``.
    Args:
        file (str): Path of the archive; it must be an existing regular file.
    Raises:
        AssertionError: If ``file`` is not an existing file.
    """
    archive_exists = os.path.isfile(file)
    assert archive_exists, "File: {} not exists.".format(file)
    # Extraction itself is delegated to the imported ``download`` module.
    download._decompress(file)
 def download_and_decompress(archives: List[Dict[str, str]],
                             path: str,
                             decompress: bool=True):
    """Download archives and optionally decompress them into ``path``.
    Args:
        archives (List[Dict[str, str]]): One dict per archive; each must provide
            the keys ``'url'`` and ``'md5'``.
        path (str): Target directory, created if it does not exist yet.
        decompress (bool): Forwarded as ``decompress`` to
            ``download.get_path_from_url``. Defaults to True.
    Raises:
        AssertionError: If an archive dict lacks ``'url'`` or ``'md5'``.
    """
    if not os.path.isdir(path):
        os.makedirs(path)
    for archive in archives:
        # f-string so the keys that were actually supplied show up in the message
        # (the message used to be a plain string naming a misspelled variable).
        assert 'url' in archive and 'md5' in archive, \
            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
        download.get_path_from_url(
            archive['url'], path, archive['md5'], decompress=decompress)
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
    """Fetch ``url`` into directory ``path`` and load it as a state dict.
    Args:
        url (str): Location of the file to download.
        path (str): Directory the file is stored in; created if missing.
        md5 (str): Passed to ``download.get_path_from_url`` together with
            ``url`` and ``path``. Defaults to None.
    Returns:
        The object returned by ``load_state_dict`` for
        ``<path>/<basename of url>``.
    """
    target_dir_missing = not os.path.isdir(path)
    if target_dir_missing:
        os.makedirs(path)
    download.get_path_from_url(url, path, md5)
    # The loaded file is looked up under the last path component of the URL.
    local_file = os.path.join(path, os.path.basename(url))
    return load_state_dict(local_file)
--- a/audio/paddleaudio/utils/env.py
+++ b/audio/paddleaudio/utils/env.py
@ -0,0 +1,60 @@
 # Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 '''
 This module is used to store environmental variables in PaddleAudio.
 PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
 ├                            default value through the PPAUDIO_HOME environment variable.
 ├─ MODEL_HOME    -->  Store model files.
 └─ DATA_HOME     -->  Store automatically downloaded datasets.
 '''
 import os
 __all__ = [
    'USER_HOME',
    'PPAUDIO_HOME',
    'MODEL_HOME',
    'DATA_HOME',
 ]
 def _get_user_home():
    return os.path.expanduser('~')
 def _get_ppaudio_home():
    if 'PPAUDIO_HOME' in os.environ:
        home_path = os.environ['PPAUDIO_HOME']
        if os.path.exists(home_path):
            if os.path.isdir(home_path):
                return home_path
            else:
                raise RuntimeError(
                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
                    format(home_path))
        else:
            return home_path
    return os.path.join(_get_user_home(), '.paddleaudio')
 def _get_sub_home(directory):
    """Return ``<PaddleAudio home>/<directory>``, creating it when it does not exist.
    Args:
        directory (str): Name of the sub-directory below the PaddleAudio home.
    Returns:
        str: Path of the sub-directory.
    """
    sub_home = os.path.join(_get_ppaudio_home(), directory)
    if os.path.exists(sub_home):
        return sub_home
    os.makedirs(sub_home)
    return sub_home
 # Resolved once, when this module is imported.
 # MODEL_HOME and DATA_HOME go through _get_sub_home, which creates the
 # directory if it is missing, so importing this module can create
 # <PPAUDIO_HOME>/models and <PPAUDIO_HOME>/datasets on disk.
 USER_HOME = _get_user_home()
 PPAUDIO_HOME = _get_ppaudio_home()
 MODEL_HOME = _get_sub_home('models')
 DATA_HOME = _get_sub_home('datasets')
--- a/audio/paddleaudio/utils/error.py
+++ b/audio/paddleaudio/utils/error.py
@ -11,3 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 __all__ = ['ParameterError']
 class ParameterError(Exception):
    """Raised when a parameter check fails."""
--- a/audio/paddleaudio/utils/log.py
+++ b/audio/paddleaudio/utils/log.py
@ -0,0 +1,139 @@
 # Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
 import functools
 import logging
 import threading
 import time
 import colorlog
 __all__ = [
    'Logger',
    'logger',
 ]
 # Level table used by Logger: maps a level name to its numeric value and to the
 # colour name handed to colorlog. TRAIN (21) and EVAL (22) are extra levels
 # placed between INFO (20) and WARNING (30); the other values match the
 # standard ``logging`` levels.
 log_config = {
    'DEBUG': {
        'level': 10,
        'color': 'purple'
    },
    'INFO': {
        'level': 20,
        'color': 'green'
    },
    'TRAIN': {
        'level': 21,
        'color': 'cyan'
    },
    'EVAL': {
        'level': 22,
        'color': 'blue'
    },
    'WARNING': {
        'level': 30,
        'color': 'yellow'
    },
    'ERROR': {
        'level': 40,
        'color': 'red'
    },
    'CRITICAL': {
        'level': 50,
        'color': 'bold_red'
    }
 }
 class Logger(object):
    '''
    Default logger in PaddleAudio.
    For every entry of ``log_config`` the instance gets two callables, named
    after the level in upper and lower case (e.g. ``logger.INFO(msg)`` and
    ``logger.info(msg)``), that log ``msg`` at that level.
    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''
    def __init__(self, name: str=None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            # Stored in the instance dict so both ``self.INFO`` and ``self.info`` work.
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__,
                                                           conf['level'])
        self.format = colorlog.ColoredFormatter(
            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
            log_colors={key: conf['color']
                        for key, conf in log_config.items()})
        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)
        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Keep records away from the root logger's handlers.
        self.logger.propagate = False
        self._is_enable = True
    def disable(self):
        '''Turn logging through this object off.'''
        self._is_enable = False
    def enable(self):
        '''Turn logging through this object back on.'''
        self._is_enable = True
    @property
    def is_enable(self) -> bool:
        '''Whether calls on this object currently emit records.'''
        return self._is_enable
    def __call__(self, log_level: int, msg: str):
        '''
        Log ``msg`` at the numeric level ``log_level`` unless logging is disabled.
        Args:
            log_level(int): Numeric logging level, e.g. a ``level`` value of ``log_config``.
            msg(str): Message to be logged.
        '''
        if not self.is_enable:
            return
        self.logger.log(log_level, msg)
    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        '''
        Temporarily replace the stream handler's line terminator.
        The previous terminator is restored when the ``with`` block exits, even
        if the block raises.
        Args:
            terminator(str): Terminator to use inside the ``with`` block.
        '''
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        try:
            yield
        finally:
            self.handler.terminator = old_terminator
    @contextlib.contextmanager
    def processing(self, msg: str, interval: float=0.1):
        '''
        Continuously print a progress bar with rotating special effects.
        A background thread logs ``msg`` followed by a rotating flag at INFO
        level until the ``with`` block exits. The thread is stopped and joined
        on exit, including when the block raises.
        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        stop_event = threading.Event()
        def _printer():
            index = 0
            flags = ['\\', '|', '/', '-']
            while not stop_event.is_set():
                flag = flags[index % len(flags)]
                # '\r' keeps the spinner on a single terminal line.
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                # Waiting on the event (instead of sleeping) lets the thread
                # wake up as soon as the caller leaves the ``with`` block.
                stop_event.wait(interval)
                index += 1
        t = threading.Thread(target=_printer)
        t.start()
        try:
            yield
        finally:
            stop_event.set()
            t.join()
 # Module-level default instance; created without a name, so it uses 'PaddleAudio'.
 logger = Logger()
--- a/audio/paddleaudio/utils/numeric.py
+++ b/audio/paddleaudio/utils/numeric.py
@ -0,0 +1,107 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Union
 import numpy as np
 __all__ = ["pcm16to32", "depth_convert"]
 def pcm16to32(audio: np.ndarray) -> np.ndarray:
    """Scale an int16 PCM waveform to float32.
    Args:
        audio (np.ndarray): Waveform; only arrays of dtype int16 are converted.
    Returns:
        np.ndarray: For int16 input, a float32 array with every sample divided
        by 2**15. Any other input is returned unchanged.
    """
    if audio.dtype != np.int16:
        return audio
    # 2 ** (16 - 1): magnitude of the most negative int16 sample.
    full_scale = 2**(np.iinfo(np.int16).bits - 1)
    return audio.astype("float32") / full_scale
 def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Data type casting in a safe way, i.e., prevent overflow or underflow.
    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Data type of waveform.
    Returns:
        np.ndarray: `y` after safe casting.
    """
    if 'float' in str(y.dtype):
        return np.clip(y, np.finfo(dtype).min,
                       np.finfo(dtype).max).astype(dtype)
    else:
        return np.clip(y, np.iinfo(dtype).min,
                       np.iinfo(dtype).max).astype(dtype)
 def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert audio array to target dtype safely.
    This function converts an audio waveform to a target dtype, with additional
    steps for preventing overflow/underflow and preserving the audio range.
    Supported source and target dtypes are int8, int16, float32 and float64.
    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Data type of waveform, given either as a
            name such as ``'int16'`` or as a type object such as ``np.int16``.
    Returns:
        np.ndarray: `y` after safe casting. `y` itself is returned when it
        already has the target dtype.
    Raises:
        ParameterError: If the dtype of `y` or the target dtype is unsupported.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    # Small offset subtracted from the int8 -> int16 gain below. This module
    # never defined the name, so the conversion used to fail with NameError.
    # NOTE(review): 1e-8 is an assumed value - confirm against the intended constant.
    EPS = 1e-8
    # Accept type objects (np.int16, np.dtype('int16'), ...) as the annotation
    # promises; strings are left untouched and validated below.
    if isinstance(dtype, (type, np.dtype)):
        dtype = np.dtype(dtype).name
    if y.dtype not in SUPPORT_DTYPE or dtype not in SUPPORT_DTYPE:
        # ParameterError was never imported at module level; import it where
        # it is needed so the error path raises the intended exception.
        from .error import ParameterError
        if y.dtype not in SUPPORT_DTYPE:
            raise ParameterError(
                'Unsupported audio dtype, '
                f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
    if dtype == y.dtype:
        return y
    float_types = ('float32', 'float64')
    if dtype in float_types and y.dtype in float_types:
        # float <-> float: only the width changes.
        return _safe_cast(y, dtype)
    if dtype in ('int16', 'int8'):
        if y.dtype in float_types:
            # float -> int: scale by the integer maximum, then clip.
            limits = np.iinfo(dtype)
            y = np.clip(y * limits.max, limits.min,
                        limits.max).astype(dtype)
        elif dtype == 'int16':
            # int8 -> int16: stretch the int8 maximum onto the int16 maximum.
            factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
            y = y.astype('float32') * factor
            y = y.astype('int16')
        else:
            # int16 -> int8: widen to int32 first so the product cannot overflow.
            y = y.astype('int32') * np.iinfo('int8').max / \
                np.iinfo('int16').max
            y = y.astype('int8')
        return y
    # Remaining case, int -> float: divide by the source integer maximum.
    org_dtype = y.dtype
    return y.astype(dtype) / np.iinfo(org_dtype).max
--- a/audio/paddleaudio/utils/sox_utils.py
+++ b/audio/paddleaudio/utils/sox_utils.py
@ -0,0 +1,103 @@
 from typing import Dict
 from typing import List
 import paddleaudio
 from paddleaudio._internal import module_utils as _mod_utils
@_mod_utils.requires_sox()
 def set_seed(seed: int):
    """Set libsox's PRNG
    Forwards ``seed`` to ``paddleaudio._paddleaudio.sox_utils_set_seed``.
    Args:
        seed (int): seed value. valid range is int32.
    See Also:
        http://sox.sourceforge.net/sox.html
    """
    paddleaudio._paddleaudio.sox_utils_set_seed(seed)
@_mod_utils.requires_sox()
 def set_verbosity(verbosity: int):
    """Set libsox's verbosity
    Forwards ``verbosity`` to ``paddleaudio._paddleaudio.sox_utils_set_verbosity``.
    Args:
        verbosity (int): Set verbosity level of libsox.
            * ``1`` failure messages
            * ``2`` warnings
            * ``3`` details of processing
            * ``4``-``6`` increasing levels of debug messages
    See Also:
        http://sox.sourceforge.net/sox.html
    """
    paddleaudio._paddleaudio.sox_utils_set_verbosity(verbosity)
@_mod_utils.requires_sox()
 def set_buffer_size(buffer_size: int):
    """Set buffer size for sox effect chain
    Forwards ``buffer_size`` to ``paddleaudio._paddleaudio.sox_utils_set_buffer_size``;
    the current value can be read back with ``get_buffer_size``.
    Args:
        buffer_size (int): Set the size in bytes of the buffers used for processing audio.
    See Also:
        http://sox.sourceforge.net/sox.html
    """
    paddleaudio._paddleaudio.sox_utils_set_buffer_size(buffer_size)
@_mod_utils.requires_sox()
 def set_use_threads(use_threads: bool):
    """Set multithread option for sox effect chain
    Forwards ``use_threads`` to ``paddleaudio._paddleaudio.sox_utils_set_use_threads``.
    Args:
        use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
            To use multithreading, the underlying ``libsox`` has to be compiled with OpenMP support.
    See Also:
        http://sox.sourceforge.net/sox.html
    """
    paddleaudio._paddleaudio.sox_utils_set_use_threads(use_threads)
@_mod_utils.requires_sox()
 def list_effects() -> Dict[str, str]:
    """List the available sox effect names
    The result of ``paddleaudio._paddleaudio.sox_utils_list_effects`` is
    converted to a plain ``dict`` before it is returned.
    Returns:
        Dict[str, str]: Mapping from ``effect name`` to ``usage``
    """
    return dict(paddleaudio._paddleaudio.sox_utils_list_effects())
@_mod_utils.requires_sox()
 def list_read_formats() -> List[str]:
    """List the supported audio formats for read
    Returns the result of ``paddleaudio._paddleaudio.sox_utils_list_read_formats`` unchanged.
    Returns:
        List[str]: List of supported audio formats
    """
    return paddleaudio._paddleaudio.sox_utils_list_read_formats()
@_mod_utils.requires_sox()
 def list_write_formats() -> List[str]:
    """List the supported audio formats for write
    Returns the result of ``paddleaudio._paddleaudio.sox_utils_list_write_formats`` unchanged.
    Returns:
        List[str]: List of supported audio formats
    """
    return paddleaudio._paddleaudio.sox_utils_list_write_formats()
@_mod_utils.requires_sox()
 def get_buffer_size() -> int:
    """Get buffer size for sox effect chain
    Counterpart of ``set_buffer_size``; returns the result of
    ``paddleaudio._paddleaudio.sox_utils_get_buffer_size`` unchanged.
    Returns:
        int: size in bytes of buffers used for processing audio.
    """
    return paddleaudio._paddleaudio.sox_utils_get_buffer_size()
--- a/audio/paddleaudio/utils/tensor_utils.py
+++ b/audio/paddleaudio/utils/tensor_utils.py
@ -0,0 +1,192 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Utility functions for Transformer."""
 from typing import List
 from typing import Tuple
 import paddle
 from .log import Logger
 __all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]
 logger = Logger(__name__)
 def has_tensor(val):
    """Tell whether ``val`` is, or contains, a paddle Tensor.
    Lists, tuples and dict values are searched recursively; any other object
    is checked with ``paddle.is_tensor``.
    Args:
        val: Arbitrary object, possibly a nested list/tuple/dict structure.
    Returns:
        bool: True if a tensor is found. A container without tensors
        (including an empty one) yields False.
    """
    if isinstance(val, (list, tuple)):
        return any(has_tensor(item) for item in val)
    if isinstance(val, dict):
        # Only the values are inspected; keys are ignored.
        return any(has_tensor(item) for item in val.values())
    return paddle.is_tensor(val)
 def pad_sequence(sequences: List[paddle.Tensor],
                  batch_first: bool=False,
                  padding_value: float=0.0) -> paddle.Tensor:
    r"""Pad a list of variable length Tensors with ``padding_value``
    ``pad_sequence`` stacks a list of Tensors along a new dimension,
    and pads them to equal length. For example, if the input is a list of
    sequences with size ``L x *``, the output has size ``T x B x *`` if
    batch_first is False, and ``B x T x *`` otherwise.
    `B` is batch size. It is equal to the number of elements in ``sequences``.
    `T` is length of the longest sequence.
    `L` is length of the sequence.
    `*` is any number of trailing dimensions, including none.
    Example:
        >>> a = paddle.ones([25, 300])
        >>> b = paddle.ones([22, 300])
        >>> c = paddle.ones([15, 300])
        >>> pad_sequence([a, b, c]).shape
        [25, 3, 300]
    Note:
        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.
    Args:
        sequences (list[Tensor]): list of variable length sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
            ``T x B x *`` otherwise
        padding_value (float, optional): value for padded elements. Default: 0.
    Returns:
        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        Tensor of size ``B x T x *`` otherwise
    """
    # assuming trailing dimensions and type of all the Tensors
    # in sequences are same and fetching those from sequences[0]
    max_size = paddle.shape(sequences[0])
    # (TODO Hui Zhang): slice does not support `end==start`
    # trailing_dims = max_size[1:]
    trailing_dims = tuple(
        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
    max_len = max([s.shape[0] for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    # Start from a tensor full of padding_value and copy each sequence into its slot.
    out_tensor = paddle.full(out_dims, padding_value, sequences[0].dtype)
    for i, tensor in enumerate(sequences):
        length = tensor.shape[0]
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            # TODO (Hui Zhang): set_value op does not support `end==start`
            # TODO (Hui Zhang): set_value op does not support int16
            # TODO (Hui Zhang): set_varbase 2 rank does not support [0,0,...]
            # out_tensor[i, :length, ...] = tensor
            if length != 0:
                out_tensor[i, :length] = tensor
            else:
                # NOTE(review): for an empty sequence this assigns at index 0 instead of
                # the empty slice; presumably a workaround for the limitation above - confirm.
                out_tensor[i, length] = tensor
        else:
            # TODO (Hui Zhang): set_value op does not support `end==start`
            # out_tensor[:length, i, ...] = tensor
            if length != 0:
                out_tensor[:length, i] = tensor
            else:
                # NOTE(review): same empty-sequence special case as in the batch_first branch.
                out_tensor[length, i] = tensor
    return out_tensor
 def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                 ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
    """Add <sos> and <eos> labels.
    ``ys_in`` is ``ys_pad`` with ``sos`` prepended and its padding replaced by
    ``eos``; ``ys_out`` is ``ys_pad`` with ``eos`` placed right after the last
    real label of each row and ``ignore_id`` in the remaining positions.
    Args:
        ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax)
        sos (int): index of <sos>
        eos (int): index of <eos>
        ignore_id (int): index of padding
    Returns:
        ys_in (paddle.Tensor) : (B, Lmax + 1)
        ys_out (paddle.Tensor) : (B, Lmax + 1)
    Examples:
        >>> sos_id = 10
        >>> eos_id = 11
        >>> ignore_id = -1
        >>> ys_pad
        tensor([[ 1,  2,  3,  4,  5],
                [ 4,  5,  6, -1, -1],
                [ 7,  8,  9, -1, -1]], dtype=paddle.int32)
        >>> ys_in,ys_out=add_sos_eos(ys_pad, sos_id , eos_id, ignore_id)
        >>> ys_in
        tensor([[10,  1,  2,  3,  4,  5],
                [10,  4,  5,  6, 11, 11],
                [10,  7,  8,  9, 11, 11]])
        >>> ys_out
        tensor([[ 1,  2,  3,  4,  5, 11],
                [ 4,  5,  6, 11, -1, -1],
                [ 7,  8,  9, 11, -1, -1]])
    """
    # TODO(Hui Zhang): using comment code,
    #_sos = paddle.to_tensor(
    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #_eos = paddle.to_tensor(
    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
    # NOTE(review): `paddle.cat` and `Tensor.masked_fill` are used below; confirm they are
    # available in this package (they may rely on helpers registered elsewhere).
    B = ys_pad.shape[0]
    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
    ys_in = paddle.cat([_sos, ys_pad], dim=1)
    # Padding positions of ys_in, i.e. the padding of ys_pad shifted right by one.
    mask_pad = (ys_in == ignore_id)
    ys_in = ys_in.masked_fill(mask_pad, eos)
    ys_out = paddle.cat([ys_pad, _eos], dim=1)
    # Step 1: overwrite the shifted padding positions (this also covers the appended eos column).
    ys_out = ys_out.masked_fill(mask_pad, eos)
    # Step 2: the one padding slot left over is the first one after the labels; it becomes eos.
    mask_eos = (ys_out == ignore_id)
    ys_out = ys_out.masked_fill(mask_eos, eos)
    # Step 3: turn the positions from step 1 back into padding.
    ys_out = ys_out.masked_fill(mask_pad, ignore_id)
    return ys_in, ys_out
 def th_accuracy(pad_outputs: paddle.Tensor,
                 pad_targets: paddle.Tensor,
                 ignore_label: int) -> float:
    """Calculate accuracy.
    The prediction is the argmax over the last axis of ``pad_outputs``;
    positions whose target equals ``ignore_label`` are left out of both the
    numerator and the denominator.
    Args:
        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
        pad_targets (LongTensor): Target label tensors (B, Lmax, D).
            NOTE(review): only the first two dimensions are read and the targets are
            compared with the (B, Lmax) argmax, so this is presumably (B, Lmax) - confirm.
        ignore_label (int): Ignore label id.
    Returns:
        float: Accuracy value (0.0 - 1.0).
    """
    # Reshape to (B, Lmax, D) using the target's batch/length sizes, then take the best class.
    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
                                pad_outputs.shape[1]).argmax(2)
    mask = pad_targets != ignore_label
    #TODO(Hui Zhang): sum does not support bool type
    # numerator = paddle.sum(
    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    numerator = (
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
    # Cast the boolean matches to the target dtype so they can be summed.
    numerator = paddle.sum(numerator.type_as(pad_targets))
    #TODO(Hui Zhang): sum does not support bool type
    # denominator = paddle.sum(mask)
    denominator = paddle.sum(mask.type_as(pad_targets))
    # NOTE(review): if every target equals ignore_label the denominator is 0 and this
    # division fails - confirm that callers never pass such a batch.
    return float(numerator) / float(denominator)
--- a/audio/paddleaudio/utils/time.py
+++ b/audio/paddleaudio/utils/time.py
@ -0,0 +1,72 @@
 # Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import time
 # Names exported by this module through `from ... import *`.
 __all__ = [
    'Timer',
    'seconds_to_hms',
 ]
 class Timer(object):
    '''Calculate running speed and estimated time of arrival (ETA).
    Args:
        total_step (int): Number of steps the timed job consists of.
    '''
    def __init__(self, total_step: int):
        self.total_step = total_step
        self.last_start_step = 0
        self.current_step = 0
        self._is_running = True
        # Seed the clocks so `timing` and `eta` never hit a missing attribute
        # when they are queried before `start()`.
        now = time.time()
        self.last_time = now
        self.start_time = now
    def start(self):
        '''(Re)set both reference timestamps to the current time.'''
        now = time.time()
        self.last_time = now
        self.start_time = now
    def stop(self):
        '''Mark the timer as finished and remember when it stopped.'''
        self._is_running = False
        self.end_time = time.time()
    def count(self) -> int:
        '''Advance one step (never beyond ``total_step``) and return the step.'''
        if self.current_step < self.total_step:
            self.current_step += 1
        return self.current_step
    @property
    def timing(self) -> float:
        '''Steps per second since the previous read; the interval is reset.'''
        now = time.time()
        run_steps = self.current_step - self.last_start_step
        time_used = now - self.last_time
        self.last_start_step = self.current_step
        self.last_time = now
        # A coarse clock can report a zero-length interval; no speed is
        # measurable then, so report 0.0 instead of dividing by zero.
        if time_used <= 0:
            return 0.0
        return run_steps / time_used
    @property
    def is_running(self) -> bool:
        '''True until `stop()` has been called.'''
        return self._is_running
    @property
    def eta(self) -> str:
        '''Estimated remaining time as ``hh:mm:ss``.
        Returns ``'00:00:00'`` once stopped and ``'--:--:--'`` while no step
        has been counted yet (no rate can be estimated).
        '''
        if not self.is_running:
            return '00:00:00'
        if self.current_step <= 0:
            return '--:--:--'
        elapsed = time.time() - self.start_time
        # Remaining time = average time per finished step * steps left.
        steps_left = max(self.total_step - self.current_step, 0)
        remaining_time = elapsed * steps_left / self.current_step
        return seconds_to_hms(remaining_time)
 def seconds_to_hms(seconds: int) -> str:
    '''Format a duration given in seconds as ``hh:mm:ss`` (zero padded to 2).'''
    hours = math.floor(seconds / 3600)
    minutes = math.floor((seconds - hours * 3600) / 60)
    secs = int(seconds - hours * 3600 - minutes * 60)
    return f'{hours:0>2}:{minutes:0>2}:{secs:0>2}'
--- a/audio/setup.py
+++ b/audio/setup.py
@ -0,0 +1,293 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
 import glob
 import inspect
 import io
 import os
 import platform
 import shutil
 import subprocess as sp
 import sys
 from pathlib import Path
 from typing import List
 from typing import Tuple
 from typing import Union
 import distutils.command.clean
 from setuptools import Command
 from setuptools import find_packages
 from setuptools import setup
 from setuptools.command.develop import develop
 from setuptools.command.test import test
 from tools import setup_helpers
 # Directory that contains this setup.py.
 ROOT_DIR = Path(__file__).parent.resolve()
 # Base release version and a placeholder commit id.
 VERSION = '1.1.0'
 COMMITID = 'none'
 # Runtime dependencies, passed to setup() as install_requires.
 base = [
    "kaldiio", "librosa==0.8.1", "scipy>=1.0.0", "soundfile~=0.10",
    "colorlog", "pathos == 0.2.8", "pybind11", "parameterized", "tqdm"
 ]
 # Requirement groups: "install" is the list above, "develop" is the extra.
 requirements = dict(
    install=base,
    develop=["sox", "soxbindings", "pre-commit"], )
 def check_call(cmd: str, shell=False, executable=None):
    """Run ``cmd`` and raise if it exits with a non-zero status.
    Args:
        cmd (str): Command line to execute.
        shell (bool): When True, run the whole string through ``/bin/bash``.
        executable (str, optional): Forwarded to ``subprocess.check_call``
            when ``shell`` is False; ignored otherwise.
    Raises:
        subprocess.CalledProcessError: The command returned non-zero.
    """
    # With shell=True a list makes only its first item the command and turns
    # the rest into shell positional parameters, so keep the string whole.
    args = cmd if shell else cmd.split()
    try:
        sp.check_call(
            args,
            shell=shell,
            executable="/bin/bash" if shell else executable)
    except sp.CalledProcessError as e:
        # check_call captures no output, so report the exit status instead.
        print(
            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
            f"exit status {e.returncode}",
            file=sys.stderr)
        raise
 def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False):
    """Run ``cmd`` and return its stripped, UTF-8 decoded standard output.
    A non-zero exit status is reported on stderr and whatever output was
    produced before the failure is still returned (best effort).
    Args:
        cmd (str | list | tuple): Command line, or an argument sequence.
        shell (bool): When True, run the command through the shell; a string
            command is then passed on unsplit.
    Returns:
        str: The command's standard output.
    """
    if shell or isinstance(cmd, (list, tuple)):
        cmds = cmd
    else:
        cmds = cmd.split()
    try:
        out_bytes = sp.check_output(cmds, shell=shell)
    except sp.CalledProcessError as e:
        out_bytes = e.output  # Output generated before error
        print(
            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
            out_bytes,
            file=sys.stderr)
    return out_bytes.strip().decode('utf8')
 def _run_cmd(cmd):
    try:
        return subprocess.check_output(
            cmd, cwd=ROOT_DIR,
            stderr=subprocess.DEVNULL).decode("ascii").strip()
    except Exception:
        return None
@contextlib.contextmanager
 def pushd(new_dir):
    """Temporarily change the working directory to ``new_dir``.
    Used through the ``contextlib.contextmanager`` decorator applied above.
    Both the entered and the restored directory are printed.
    """
    old_dir = os.getcwd()
    os.chdir(new_dir)
    print(new_dir)
    try:
        yield
    finally:
        # Restore the previous directory even when the with-body raises.
        os.chdir(old_dir)
        print(old_dir)
 def read(*names, **kwargs):
    """Return the text of a file addressed relative to this script's folder.
    Args:
        *names: Path components joined onto the directory of this file.
        **kwargs: Only ``encoding`` is consulted (default ``"utf8"``).
    """
    here = os.path.dirname(__file__)
    target = os.path.join(here, *names)
    charset = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=charset) as handle:
        text = handle.read()
    return text
 def _remove(files: str):
    for f in files:
        f.unlink()
 ################################# Install ##################################
 def _post_install(install_lib_dir):
    pass
 class DevelopCommand(develop):
    """`develop` command that runs the post-install hook afterwards."""
    def run(self):
        develop.run(self)
        # The hook has to run after develop.run(); per the original note,
        # packages installed from the shell would otherwise not be visible.
        hook_args = (self.install_lib, )
        self.execute(_post_install, hook_args, msg="Post Install...")
 class TestCommand(test):
    """`test` command that runs the suite under ``tests`` with nose."""
    def finalize_options(self):
        test.finalize_options(self)
        self.test_args = []
        self.test_suite = True
    def run_tests(self):
        # Run nose ensuring that argv simulates running nosetests directly
        import nose
        nose.run_exit(argv=['nosetests', '-w', 'tests'])
    def run_benchmark(self):
        """Run every benchmark script under tests/benchmark with pytest."""
        # `glob` comes from the module-level imports; sorting makes the
        # execution order deterministic.
        for benchmark_item in sorted(glob.glob('tests/benchmark/*py')):
            os.system(f'pytest {benchmark_item}')
 # cmd: python setup.py upload
 class UploadCommand(Command):
    """Build a source distribution and upload it with twine."""
    description = "Build and publish the package."
    user_options = []
    def initialize_options(self):
        pass
    def finalize_options(self):
        pass
    def run(self):
        try:
            print("Removing previous dist/ ...")
            shutil.rmtree(str(ROOT_DIR / "dist"))
        except OSError:
            # No previous dist/ directory (or it could not be removed).
            pass
        # Build and upload from ROOT_DIR so the relative "setup.py" and
        # "dist/*" refer to the same tree that was cleaned above.
        print("Building source distribution...")
        sp.check_call([sys.executable, "setup.py", "sdist"], cwd=str(ROOT_DIR))
        print("Uploading package to PyPi...")
        sp.check_call(["twine", "upload", "dist/*"], cwd=str(ROOT_DIR))
        sys.exit()
 ################################# Version ##################################
 def _get_version(sha):
    version = VERSION
    if os.getenv("BUILD_VERSION"):
        version = os.getenv("BUILD_VERSION")
    elif sha is not None:
        version += "+" + sha[:7]
    return version
 def _make_version_file(version, sha):
    sha = "Unknown" if sha is None else sha
    version_path = ROOT_DIR / "paddleaudio" / "__init__.py"
    with open(version_path, "a") as f:
        f.write(f"__version__ = '{version}'\n")
 def _rm_version():
    file_ = ROOT_DIR / "paddleaudio" / "__init__.py"
    with open(file_, "r") as f:
        lines = f.readlines()
    with open(file_, "w") as f:
        for line in lines:
            if "__version__" not in line:
                f.write(line)
 ################################# Setup ##################################
 class clean(distutils.command.clean.clean):
    """`clean` command that also removes built extensions and build/."""
    def run(self):
        # Run default behavior first
        distutils.command.clean.clean.run(self)
        # Remove paddleaudio extension
        for path in (ROOT_DIR / "paddleaudio").glob("**/*.so"):
            print(f"removing '{path}'")
            path.unlink()
        # Remove build directory (`shutil` comes from the module imports)
        build_dirs = [
            ROOT_DIR / "build",
        ]
        for path in build_dirs:
            if path.exists():
                print(f"removing '{path}' (and everything under it)")
                shutil.rmtree(str(path), ignore_errors=True)
 def main():
    """Stamp the version into paddleaudio/__init__.py and run ``setup()``.
    Prints the git branch/SHA/tag, derives the build version, writes the
    ``__version__`` line and removes it again once ``setup()`` has finished,
    whether it succeeded, failed or exited.
    """
    sha = _run_cmd(["git", "rev-parse", "HEAD"])  # commit id
    branch = _run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    tag = _run_cmd(["git", "describe", "--tags", "--exact-match", "@"])
    print("-- Git branch:", branch)
    print("-- Git SHA:", sha)
    print("-- Git tag:", tag)
    version = _get_version(sha)
    print("-- Building version", version)
    # Remove a stale stamp first so repeated builds never accumulate lines.
    _rm_version()
    _make_version_file(version, sha)
    try:
        system = platform.system()
        lib_package_data = {}
        if system == 'Linux':
            lib_package_data = {'paddleaudio': ['lib/lib*']}
        elif system != 'Windows':
            lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']}
        setup_info = dict(
            # Metadata
            name='paddleaudio',
            # NOTE(review): the package metadata uses VERSION, not the
            # computed `version` (BUILD_VERSION / +sha) - confirm intended.
            version=VERSION,
            author='PaddlePaddle Speech and Language Team',
            author_email='paddlesl@baidu.com',
            url='https://github.com/PaddlePaddle/PaddleSpeech/audio',
            license='Apache 2.0',
            description='Speech audio tools based on Paddlepaddle',
            # The comma matters: adjacent string literals would merge into
            # the single keyword "audio processpaddlepaddle".
            keywords=[
                "audio process",
                "paddlepaddle",
            ],
            python_requires='>=3.7',
            install_requires=requirements["install"],
            extras_require={
                'develop':
                requirements["develop"],
                #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"],
            },
            cmdclass={
                "build_ext": setup_helpers.CMakeBuild,
                'develop': DevelopCommand,
                'test': TestCommand,
                'upload': UploadCommand,
                "clean": clean,
            },
            # Package info
            # A one-element tuple: a bare string would be unpacked into
            # single-character patterns, one of which is '*'.
            packages=find_packages(include=('paddleaudio*', )),
            package_data=lib_package_data,
            ext_modules=setup_helpers.get_ext_modules(),
            zip_safe=True,
            # The 3.6 classifier is omitted because python_requires is
            # '>=3.7'.
            classifiers=[
                'Development Status :: 5 - Production/Stable',
                'Intended Audience :: Developers',
                'Intended Audience :: Science/Research',
                'Topic :: Scientific/Engineering :: Artificial Intelligence',
                'License :: OSI Approved :: Apache Software License',
                'Programming Language :: Python',
                'Programming Language :: Python :: 3',
                'Programming Language :: Python :: 3.7',
                'Programming Language :: Python :: 3.8',
                'Programming Language :: Python :: 3.9',
                'Programming Language :: Python :: 3.10',
            ], )
        setup(**setup_info)
    finally:
        # Always restore __init__.py, even if setup() raises or exits.
        _rm_version()
 # Run the build only when this file is executed as a script.
 if __name__ == '__main__':
    main()
--- a/tests/unit/audio/backends/base.py
+++ b/tests/unit/audio/backends/base.py
--- a/audio/tests/backends/common.py
+++ b/audio/tests/backends/common.py
@ -0,0 +1,32 @@
 def get_encoding(ext, dtype):
    """Return the encoding name expected for a file extension and dtype.
    For "mp3", "flac" and "vorbis" this is the upper-cased extension;
    otherwise it is the PCM family mapped from ``dtype`` (KeyError if the
    dtype is not listed).
    """
    named_after_ext = {"mp3", "flac", "vorbis"}
    if ext in named_after_ext:
        return ext.upper()
    pcm_by_dtype = dict(
        float32="PCM_F", int32="PCM_S", int16="PCM_S", uint8="PCM_U")
    return pcm_by_dtype[dtype]
 def get_bit_depth(dtype):
    """Return the bit width of ``dtype`` (KeyError if it is not listed)."""
    widths = dict(float32=32, int32=32, int16=16, uint8=8)
    return widths[dtype]
 def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for ``ext`` and ``dtype``.
    "flac" maps to 24, "mp3" and "vorbis" to 0; any other extension uses the
    bit depth of ``dtype``.
    """
    # The dtype lookup runs unconditionally, so an unlisted dtype raises
    # KeyError even for the fixed-value extensions.
    from_dtype = get_bit_depth(dtype)
    fixed = {"flac": 24, "mp3": 0, "vorbis": 0}
    return fixed.get(ext, from_dtype)
--- a/audio/tests/backends/soundfile/base.py
+++ b/audio/tests/backends/soundfile/base.py
@ -0,0 +1,34 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import unittest
 import urllib.request
 # Remote WAV fixtures that BackendTest.initWavInput downloads on demand.
 mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
 multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav'
 class BackendTest(unittest.TestCase):
    """Base test case that makes the sample WAV files available locally."""
    def setUp(self):
        self.initWavInput()
    def initWavInput(self):
        """Download missing fixtures into the cwd and list them in ``files``."""
        self.files = []
        for url in (mono_channel_wav, multi_channels_wav):
            local_name = os.path.basename(url)
            if not os.path.isfile(local_name):
                urllib.request.urlretrieve(url, local_name)
            self.files.append(local_name)
    def initParmas(self):
        """Hook for subclasses; the base implementation is not implemented."""
        raise NotImplementedError
--- a/audio/tests/backends/soundfile/common.py
+++ b/audio/tests/backends/soundfile/common.py
@ -0,0 +1,89 @@
 import itertools
 from unittest import skipIf
 from paddleaudio._internal.module_utils import is_module_available
 from parameterized import parameterized
 def name_func(func, _, params):
    """Build a test name from the function name and its positional params."""
    prefix = func.__name__
    suffix = "_".join(map(str, params.args))
    return f'{prefix}_{suffix}'
 def dtype2subtype(dtype):
    """Map a dtype name to its subtype string (KeyError if not listed)."""
    subtype_by_dtype = dict(
        float64="DOUBLE",
        float32="FLOAT",
        int32="PCM_32",
        int16="PCM_16",
        uint8="PCM_U8",
        int8="PCM_S8", )
    return subtype_by_dtype[dtype]
 def skipIfFormatNotSupported(fmt):
    """Return a skip decorator for tests that need soundfile format ``fmt``.
    The test is skipped when soundfile is not importable or when ``fmt`` is
    not among ``soundfile.available_formats()``.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')
    import soundfile
    supported = soundfile.available_formats()
    return skipIf(fmt not in supported,
                  f'"{fmt}" is not supported by soundfile')
 def parameterize(*params):
    """Expand a test over the Cartesian product of the given value lists."""
    combos = list(itertools.product(*params))
    return parameterized.expand(combos, name_func=name_func)
 def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Return the WAV subtype for an (encoding, bits_per_sample) request.
    With neither given, the subtype follows ``dtype``.
    Raises:
        ValueError: The combination is not in the lookup table.
    """
    # Looked up unconditionally, so an unlisted dtype raises KeyError even
    # when an explicit encoding is requested.
    from_dtype = dtype2subtype(dtype)
    table = {
        (None, None): from_dtype,
        (None, 8): "PCM_U8",
        ("PCM_U", None): "PCM_U8",
        ("PCM_U", 8): "PCM_U8",
        ("PCM_S", None): "PCM_32",
        ("PCM_S", 16): "PCM_16",
        ("PCM_S", 32): "PCM_32",
        ("PCM_F", None): "FLOAT",
        ("PCM_F", 32): "FLOAT",
        ("PCM_F", 64): "DOUBLE",
        ("ULAW", None): "ULAW",
        ("ULAW", 8): "ULAW",
        ("ALAW", None): "ALAW",
        ("ALAW", 8): "ALAW",
    }
    chosen = table.get((encoding, bits_per_sample))
    if not chosen:
        raise ValueError(
            f"wav does not support ({encoding}, {bits_per_sample}).")
    return chosen
 def get_encoding(ext, dtype):
    """Return the encoding name expected for a file extension and dtype.
    For "mp3", "flac" and "vorbis" this is the upper-cased extension;
    otherwise it is the PCM family mapped from ``dtype`` (KeyError if the
    dtype is not listed).
    """
    named_after_ext = {"mp3", "flac", "vorbis"}
    if ext in named_after_ext:
        return ext.upper()
    pcm_by_dtype = dict(
        float32="PCM_F", int32="PCM_S", int16="PCM_S", uint8="PCM_U")
    return pcm_by_dtype[dtype]
 def get_bit_depth(dtype):
    """Return the bit width of ``dtype`` (KeyError if it is not listed)."""
    widths = dict(float32=32, int32=32, int16=16, uint8=8)
    return widths[dtype]
 def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for ``ext`` and ``dtype``.
    "flac" maps to 24, "mp3" and "vorbis" to 0; any other extension uses the
    bit depth of ``dtype``.
    """
    # The dtype lookup runs unconditionally, so an unlisted dtype raises
    # KeyError even for the fixed-value extensions.
    from_dtype = get_bit_depth(dtype)
    fixed = {"flac": 24, "mp3": 0, "vorbis": 0}
    return fixed.get(ext, from_dtype)
--- a/audio/tests/backends/soundfile/common_utils
+++ b/audio/tests/backends/soundfile/common_utils
@ -0,0 +1 @@
 ../../common_utils
--- a/audio/tests/backends/soundfile/info_test.py
+++ b/audio/tests/backends/soundfile/info_test.py
@ -0,0 +1,199 @@
 #this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py
 import tarfile
 import unittest
 import warnings
 from unittest.mock import patch
 import paddle
 import soundfile
 from common import get_bits_per_sample
 from common import get_encoding
 from common import parameterize
 from common import skipIfFormatNotSupported
 from common_utils import get_wav_data
 from common_utils import nested_params
 from common_utils import save_wav
 from common_utils import TempDirMixin
 from paddleaudio.backends import soundfile_backend
 class TestInfo(TempDirMixin, unittest.TestCase):
    """Checks the metadata reported by `soundfile_backend.info`."""
    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2], )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.info` can check wav file correctly"""
        duration = 1
        path = self.get_temp_path("data.wav")
        data = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate)
        save_wav(path, data, sample_rate)
        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == get_bits_per_sample("wav", dtype)
        assert info.encoding == get_encoding("wav", dtype)
    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """`soundfile_backend.info` can check flac file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        path = self.get_temp_path("data.flac")
        soundfile.write(path, data, sample_rate)
        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == 16
        assert info.encoding == "FLAC"
    # Disabled OGG test, kept for reference. Its num_frames assertion was
    # already commented out in the original (presumably it did not hold -
    # TODO confirm before re-enabling).
    #@parameterize([8000, 16000], [1, 2])
    #@skipIfFormatNotSupported("OGG")
    #def test_ogg(self, sample_rate, num_channels):
    #    """`soundfile_backend.info` can check ogg file correctly"""
    #    duration = 1
    #    num_frames = sample_rate * duration
    #    data = paddle.randn(shape=[num_frames, num_channels]).numpy()
    #    path = self.get_temp_path("data.ogg")
    #    soundfile.write(path, data, sample_rate)
    #    info = soundfile_backend.info(path)
    #    assert info.sample_rate == sample_rate
    #    ##assert info.num_frames == sample_rate * duration
    #    assert info.num_channels == num_channels
    #    assert info.bits_per_sample == 0
    #    assert info.encoding == "VORBIS"
    @nested_params(
        [8000, 16000],
        [1, 2],
        [("PCM_24", 24), ("PCM_32", 32)], )
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth):
        """`soundfile_backend.info` can check sph file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        path = self.get_temp_path("data.nist")
        subtype, bits_per_sample = subtype_and_bit_depth
        soundfile.write(path, data, sample_rate, subtype=subtype)
        info = soundfile_backend.info(path)
        assert info.sample_rate == sample_rate
        assert info.num_frames == sample_rate * duration
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        assert info.encoding == "PCM_S"
    def test_unknown_subtype_warning(self):
        """soundfile_backend.info issues a warning when the subtype is unknown
        This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE
        dict should be updated.
        """
        def _mock_info_func(_):
            class MockSoundFileInfo:
                samplerate = 8000
                frames = 356
                channels = 2
                subtype = "UNSEEN_SUBTYPE"
                format = "UNKNOWN"
            return MockSoundFileInfo()
        with patch("soundfile.info", _mock_info_func):
            with warnings.catch_warnings(record=True) as w:
                # Record every warning regardless of the filters in effect,
                # so the count below does not depend on interpreter flags or
                # on warnings raised earlier in the run.
                warnings.simplefilter("always")
                info = soundfile_backend.info("foo")
                assert len(w) == 1
                assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(
                    w[-1].message)
                assert info.bits_per_sample == 0
 class TestFileObject(TempDirMixin, unittest.TestCase):
    """Checks `soundfile_backend.info` on file-like objects."""
    def _test_fileobj(self, ext, subtype, bits_per_sample):
        """Query audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        path = self.get_temp_path(f"test.{ext}")
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(path, data, sample_rate, subtype=subtype)
        with open(path, "rb") as fileobj:
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # Parenthesised on purpose: without the parentheses the non-FLAC
        # case asserts the truthy string "PCM_S" and can never fail.
        assert info.encoding == ("FLAC" if ext == "flac" else "PCM_S")
    def test_fileobj_wav(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("wav", "PCM_16", 16)
    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("flac", "PCM_16", 16)
    def _test_tarobj(self, ext, subtype, bits_per_sample):
        """Query compressed audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")
        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(audio_path, data, sample_rate, subtype=subtype)
        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # Same precedence fix as in _test_fileobj.
        assert info.encoding == ("FLAC" if ext == "flac" else "PCM_S")
    def test_tarobj_wav(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("wav", "PCM_16", 16)
    @skipIfFormatNotSupported("FLAC")
    def test_tarobj_flac(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("flac", "PCM_16", 16)
 # Run the tests in this file when it is executed directly.
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/soundfile/load_test.py
+++ b/audio/tests/backends/soundfile/load_test.py
@ -0,0 +1,363 @@
 #this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py
 import os
 import tarfile
 import unittest
 from unittest.mock import patch
 import numpy as np
 import paddle
 import soundfile
 from common import dtype2subtype
 from common import parameterize
 from common import skipIfFormatNotSupported
 from common_utils import get_wav_data
 from common_utils import load_wav
 from common_utils import normalize_wav
 from common_utils import save_wav
 from common_utils import TempDirMixin
 from paddleaudio.backends import soundfile_backend
 from parameterized import parameterized
 def _get_mock_path(
        ext: str,
        dtype: str,
        sample_rate: int,
        num_channels: int,
        num_frames: int, ):
    return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}"
 def _get_mock_params(path: str):
    filename, ext = path.split(".")
    parts = filename.split("_")
    return {
        "ext": ext,
        "dtype": parts[0],
        "sample_rate": int(parts[1]),
        "num_channels": int(parts[2]),
        "num_frames": int(parts[3]),
    }
 class SoundFileMock:
    """Stand-in for ``soundfile.SoundFile`` driven by the mock file name."""
    def __init__(self, path, mode):
        assert mode == "r"
        self.path = path
        self._params = _get_mock_params(path)
        self._start = None
    @property
    def samplerate(self):
        return self._params["sample_rate"]
    @property
    def format(self):
        # Extensions outside this table yield None.
        format_by_ext = {
            "wav": "WAV",
            "flac": "FLAC",
            "ogg": "OGG",
            "sph": "NIST",
            "nis": "NIST",
            "nist": "NIST",
        }
        return format_by_ext.get(self._params["ext"])
    @property
    def subtype(self):
        is_ogg = self._params["ext"] == "ogg"
        return "VORBIS" if is_ogg else dtype2subtype(self._params["dtype"])
    def _prepare_read(self, start, stop, frames):
        """Remember the start offset; only ``stop=None`` is supported."""
        assert stop is None
        self._start = start
        return frames
    def read(self, frames, dtype, always_2d):
        """Return ``frames`` rows of generated data from the stored offset."""
        assert always_2d
        params = self._params
        full = get_wav_data(
            dtype,
            params["num_channels"],
            normalize=False,
            num_frames=params["num_frames"],
            channels_first=False, ).numpy()
        return full[self._start:self._start + frames]
    def __enter__(self):
        return self
    def __exit__(self, *args, **kwargs):
        pass
 class MockedLoadTest(unittest.TestCase):
    """dtype checks for `soundfile_backend.load` against a mocked SoundFile."""
    def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize,
                     channels_first):
        """Native dtype is expected only for ext "wav"/"nist" with normalize=False; otherwise float32"""
        num_frames = 3 * sample_rate
        path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames)
        keeps_native = (not normalize) and ext in ["wav", "nist"]
        if keeps_native:
            expected_dtype = getattr(paddle, dtype)
        else:
            expected_dtype = paddle.float32
        with patch("soundfile.SoundFile", SoundFileMock):
            found, sr = soundfile_backend.load(
                path, normalize=normalize, channels_first=channels_first)
            assert found.dtype == expected_dtype
            assert sample_rate == sr
    @parameterize(
        ["int32", "float32", "float64"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False], )
    def test_wav(self, dtype, sample_rate, num_channels, normalize,
                 channels_first):
        """Returns native dtype when normalize=False else float32"""
        self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize,
                          channels_first)
    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False], )
    def test_sphere(self, dtype, sample_rate, num_channels, normalize,
                    channels_first):
        """Returns float32 always"""
        self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize,
                          channels_first)
    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_ogg(self, sample_rate, num_channels, normalize, channels_first):
        """Returns float32 always"""
        self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize,
                          channels_first)
    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_flac(self, sample_rate, num_channels, normalize, channels_first):
        """Returns float32 always for the flac extension"""
        self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize,
                          channels_first)
 class LoadTestBase(TempDirMixin, unittest.TestCase):
    """Shared assertions comparing `soundfile_backend.load` to references."""
    def assert_wav(
            self,
            dtype,
            sample_rate,
            num_channels,
            normalize,
            channels_first=True,
            duration=1, ):
        """Write a wav file and check that the backend loads it like `load_wav`."""
        path = self.get_temp_path("reference.wav")
        total_frames = duration * sample_rate
        written = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=total_frames,
            channels_first=channels_first, )
        save_wav(path, written, sample_rate, channels_first=channels_first)
        reference = load_wav(
            path, normalize=normalize, channels_first=channels_first)[0]
        loaded, sr = soundfile_backend.load(
            path, normalize=normalize, channels_first=channels_first)
        assert sr == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
    def assert_sphere(
            self,
            dtype,
            sample_rate,
            num_channels,
            channels_first=True,
            duration=1, ):
        """Write a NIST (SPHERE) file and check the normalized data loaded back."""
        path = self.get_temp_path("reference.sph")
        total_frames = duration * sample_rate
        raw = get_wav_data(
            dtype,
            num_channels,
            num_frames=total_frames,
            normalize=False,
            channels_first=False, )
        soundfile.write(
            path, raw, sample_rate, subtype=dtype2subtype(dtype), format="NIST")
        oriented = raw.t() if channels_first else raw
        reference = normalize_wav(oriented)
        loaded, sr = soundfile_backend.load(path, channels_first=channels_first)
        assert sr == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
    def assert_flac(
            self,
            dtype,
            sample_rate,
            num_channels,
            channels_first=True,
            duration=1, ):
        """Write a FLAC file and check the normalized data loaded back."""
        path = self.get_temp_path("reference.flac")
        total_frames = duration * sample_rate
        raw = get_wav_data(
            dtype,
            num_channels,
            num_frames=total_frames,
            normalize=False,
            channels_first=False, )
        soundfile.write(path, raw, sample_rate)
        oriented = raw.t() if channels_first else raw
        reference = normalize_wav(oriented)
        loaded, sr = soundfile_backend.load(path, channels_first=channels_first)
        assert sr == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())
 class TestLoad(LoadTestBase):
    """Correctness of `soundfile_backend.load` on real wav files."""
    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [False, True], )
    def test_wav(self, dtype, sample_rate, num_channels, normalize,
                 channels_first):
        """`soundfile_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize,
                        channels_first)
    @parameterize(
        ["int32"],
        [16000],
        [2],
        [False], )
    def test_wav_large(self, dtype, sample_rate, num_channels, normalize):
        """`soundfile_backend.load` can load large wav file correctly."""
        seconds_in_two_hours = 2 * 60 * 60
        self.assert_wav(
            dtype,
            sample_rate,
            num_channels,
            normalize,
            duration=seconds_in_two_hours)
    @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True])
    def test_multiple_channels(self, dtype, num_channels, channels_first):
        """`soundfile_backend.load` can load wav file with more than 2 channels."""
        self.assert_wav(dtype, 8000, num_channels, False, channels_first)
    # Disabled SPHERE / FLAC tests, kept for reference.
    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("NIST")
    #def test_sphere(self, dtype, sample_rate, num_channels, channels_first):
    #    """`soundfile_backend.load` can load sphere format correctly."""
    #    self.assert_sphere(dtype, sample_rate, num_channels, channels_first)
    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("FLAC")
    #def test_flac(self, dtype, sample_rate, num_channels, channels_first):
    #    """`soundfile_backend.load` can load flac format correctly."""
    #    self.assert_flac(dtype, sample_rate, num_channels, channels_first)
 class TestLoadFormat(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.load` can read audio files whose extension has been removed."""
    original = None
    path = None
    def _make_file(self, format_):
        """Write `test.<format_>` with soundfile, then rename it without the extension.

        Returns the extension-less path together with the samples that
        `soundfile.read` yields for the file (transposed with `.T`).
        """
        rate = 8000
        named_path = self.get_temp_path(f"test.{format_}")
        samples = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(named_path, samples, rate)
        reference = soundfile.read(named_path, dtype="float32")[0].T
        bare_path = os.path.splitext(named_path)[0]
        os.rename(named_path, bare_path)
        return bare_path, reference
    def _test_format(self, format_):
        """Load the extension-less file and compare it with the soundfile reference."""
        # NOTE(review): `format_` only names the file on disk; it is not passed
        # to `soundfile_backend.load` here - confirm that this is intended.
        bare_path, reference = self._make_file(format_)
        loaded, _rate = soundfile_backend.load(bare_path)
        np.testing.assert_array_almost_equal(loaded, reference)
    @parameterized.expand([("WAV", ), ("wav", )])
    def test_wav(self, format_):
        """Extension-less WAV files load correctly."""
        self._test_format(format_)
    @parameterized.expand([("FLAC", ), ("flac", )])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, format_):
        """Extension-less FLAC files load correctly (skipped without FLAC support)."""
        self._test_format(format_)
 class TestFileObject(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.load` accepts file-like objects, including tar archive members."""
    def _test_fileobj(self, ext):
        """Write `test.<ext>` with soundfile, load it through an open binary handle, and compare.

        The reference is what `soundfile.read` returns for the same file,
        transposed with `.T`.
        """
        sample_rate = 16000
        path = self.get_temp_path(f"test.{ext}")
        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(path, data, sample_rate)
        expected = soundfile.read(path, dtype="float32")[0].T
        with open(path, "rb") as fileobj:
            found, sr = soundfile_backend.load(fileobj)
        assert sr == sample_rate
        # Convert explicitly, as `_test_tarfile` does, rather than relying on
        # implicit array coercion of the loaded value.
        np.testing.assert_array_almost_equal(found.numpy(), expected)
    def test_fileobj_wav(self):
        """Loading WAV audio via a file-like object works."""
        self._test_fileobj("wav")
    # Skip instead of failing when the installed libsndfile has no FLAC
    # support, consistent with `TestLoadFormat.test_flac`.
    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Loading FLAC audio via a file-like object works."""
        self._test_fileobj("flac")
    def _test_tarfile(self, ext):
        """Archive `test.<ext>` into a tar file and load it from the extracted member object."""
        sample_rate = 16000
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")
        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(audio_path, data, sample_rate)
        expected = soundfile.read(audio_path, dtype="float32")[0].T
        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            # The member object is only readable while the archive is open.
            fileobj = tarobj.extractfile(audio_file)
            found, sr = soundfile_backend.load(fileobj)
        assert sr == sample_rate
        np.testing.assert_array_almost_equal(found.numpy(), expected)
    def test_tarfile_wav(self):
        """Loading WAV audio from a tar archive member works."""
        self._test_tarfile("wav")
    @skipIfFormatNotSupported("FLAC")
    def test_tarfile_flac(self):
        """Loading FLAC audio from a tar archive member works."""
        self._test_tarfile("flac")
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/soundfile/save_test.py
+++ b/audio/tests/backends/soundfile/save_test.py
@ -0,0 +1,323 @@
 import io
 import unittest
 from unittest.mock import patch
 import numpy as np
 import paddle
 import soundfile
 from common import fetch_wav_subtype
 from common import parameterize
 from common import skipIfFormatNotSupported
 from common_utils import get_wav_data
 from common_utils import load_wav
 from common_utils import nested_params
 from common_utils import TempDirMixin
 from paddleaudio.backends import soundfile_backend
 class MockedSaveTest(unittest.TestCase):
    @nested_params(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            (None, None),
            ("PCM_U", None),
            ("PCM_U", 8),
            ("PCM_S", None),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", None),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", None),
            ("ULAW", 8),
            ("ALAW", None),
            ("ALAW", 8),
        ])
    @patch("soundfile.write")
    def test_wav(self, dtype, sample_rate, num_channels, channels_first,
                 enc_params, mocked_write):
        """For WAV targets, `soundfile_backend.save` forwards the expected subtype to `soundfile.write`."""
        wav_name = "foo.wav"
        generated = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=dtype == "float32",
            channels_first=channels_first)
        waveform = paddle.transpose(generated, [1, 0])
        encoding, bits_per_sample = enc_params
        soundfile_backend.save(
            wav_name,
            waveform,
            sample_rate,
            channels_first=channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample)
        # call_args[1] is the keyword-argument dict; on Python 3.8+ the more
        # descriptive spelling is call_args.kwargs.
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == wav_name
        assert write_kwargs["samplerate"] == sample_rate
        assert write_kwargs["subtype"] == fetch_wav_subtype(
            dtype, encoding, bits_per_sample)
        assert write_kwargs["format"] is None
        if channels_first:
            expected_data = paddle.transpose(waveform, [1, 0])
        else:
            expected_data = waveform
        np.testing.assert_array_almost_equal(write_kwargs["data"].numpy(),
                                             expected_data.numpy())
    @patch("soundfile.write")
    def assert_non_wav(self,
                       fmt,
                       dtype,
                       sample_rate,
                       num_channels,
                       channels_first,
                       mocked_write,
                       encoding=None,
                       bits_per_sample=None):
        """Check what `soundfile_backend.save` hands to `soundfile.write` for a non-WAV target.

        The `format` keyword must be "NIST" for the sphere extensions
        ("sph", "nist", "nis") and None for every other extension.
        `mocked_write` is injected by the `patch` decorator.
        """
        target_name = f"foo.{fmt}"
        generated = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=False,
            channels_first=channels_first)
        waveform = paddle.transpose(generated, [1, 0])
        if channels_first:
            expected_data = paddle.transpose(waveform, [1, 0])
        else:
            expected_data = waveform
        soundfile_backend.save(
            target_name,
            waveform,
            sample_rate,
            channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample)
        # call_args[1] is the keyword-argument dict; on Python 3.8+ the more
        # descriptive spelling is call_args.kwargs.
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == target_name
        assert write_kwargs["samplerate"] == sample_rate
        if fmt not in ["sph", "nist", "nis"]:
            assert write_kwargs["format"] is None
        else:
            assert write_kwargs["format"] == "NIST"
        np.testing.assert_array_almost_equal(write_kwargs["data"].numpy(),
                                             expected_data.numpy())
    @nested_params(
        ["sph", "nist", "nis"],
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            ("PCM_S", 8),
            ("PCM_S", 16),
            ("PCM_S", 24),
            ("PCM_S", 32),
            ("ULAW", 8),
            ("ALAW", 8),
            ("ALAW", 16),
            ("ALAW", 24),
            ("ALAW", 32),
        ])
    def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first,
                 enc_params):
        """Sphere extensions: run `assert_non_wav` for each (encoding, bits_per_sample) pair."""
        enc_name, sample_bits = enc_params
        self.assert_non_wav(
            fmt,
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            encoding=enc_name,
            bits_per_sample=sample_bits)
    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [8, 16, 24])
    def test_flac(self, dtype, sample_rate, num_channels, channels_first,
                  bits_per_sample):
        """FLAC: run `assert_non_wav` for each bit depth; the expected `format` is None."""
        non_wav_ext = "flac"
        self.assert_non_wav(
            non_wav_ext,
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            bits_per_sample=bits_per_sample)
    @parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    def test_ogg(self, dtype, sample_rate, num_channels, channels_first):
        """OGG: run `assert_non_wav` with default encoding; the expected `format` is None."""
        non_wav_ext = "ogg"
        self.assert_non_wav(
            non_wav_ext,
            dtype,
            sample_rate,
            num_channels,
            channels_first)
 class SaveTestBase(TempDirMixin, unittest.TestCase):
    """Assertion helpers for tests that write real files with `soundfile_backend.save`."""
    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
        """Save generated samples as WAV, reload them with `load_wav`, and compare rate and data."""
        wav_path = self.get_temp_path("data.wav")
        original = get_wav_data(
            dtype, num_channels, num_frames=num_frames, normalize=False)
        soundfile_backend.save(wav_path, original, sample_rate)
        reloaded, reloaded_rate = load_wav(wav_path, normalize=False)
        assert sample_rate == reloaded_rate
        np.testing.assert_array_almost_equal(reloaded.numpy(),
                                             original.numpy())
    def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels):
        """Save as `data.<fmt>` and validate only the metadata reported by `soundfile.info`.

        Sample values are not compared: precision differs between formats,
        and there is no way to decode the result other than soundfile itself.
        """
        num_frames = sample_rate * 3
        out_path = self.get_temp_path(f"data.{fmt}")
        samples = get_wav_data(
            dtype, num_channels, num_frames=num_frames, normalize=False)
        soundfile_backend.save(out_path, samples, sample_rate)
        meta = soundfile.info(out_path)
        assert meta.format == fmt.upper()
        # Frame-count check is disabled; the original note says it "go wrong":
        #assert meta.frames == num_frames
        assert meta.channels == num_channels
        assert meta.samplerate == sample_rate
    def assert_flac(self, dtype, sample_rate, num_channels):
        """Metadata check for a file saved as FLAC."""
        self._assert_non_wav("flac", dtype, sample_rate, num_channels)
    def assert_sphere(self, dtype, sample_rate, num_channels):
        """Metadata check for a file saved with the "nist" extension."""
        self._assert_non_wav("nist", dtype, sample_rate, num_channels)
    def assert_ogg(self, dtype, sample_rate, num_channels):
        """Metadata check for a file saved as OGG.

        OGG is lossy, so sample values cannot be inspected.
        """
        self._assert_non_wav("ogg", dtype, sample_rate, num_channels)
 class TestSave(SaveTestBase):
    """Runs the `SaveTestBase` helpers over a grid of dtypes, sample rates and channel counts."""
    @parameterize(["float32", "int32"], [8000, 16000], [1, 2])
    def test_wav(self, dtype, sample_rate, num_channels):
        """Saving WAV works for mono and stereo input."""
        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)
    @parameterize(["float32", "int32"], [4, 8, 16, 32])
    def test_multiple_channels(self, dtype, num_channels):
        """Saving WAV works with more than two channels (8000 Hz)."""
        fixed_rate = 8000
        self.assert_wav(dtype, fixed_rate, num_channels, num_frames=None)
    @parameterize(["int32"], [8000, 16000], [1, 2])
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, dtype, sample_rate, num_channels):
        """Saving sphere works (skipped without NIST support)."""
        self.assert_sphere(dtype, sample_rate, num_channels)
    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """Saving FLAC works for float32 input (skipped without FLAC support)."""
        self.assert_flac("float32", sample_rate, num_channels)
    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("OGG")
    def test_ogg(self, sample_rate, num_channels):
        """Saving ogg/vorbis works for float32 input (skipped without OGG support)."""
        self.assert_ogg("float32", sample_rate, num_channels)
 class TestSaveParams(TempDirMixin, unittest.TestCase):
    """Checks the optional parameters of `soundfile_backend.save`."""
    @parameterize([True, False])
    def test_channels_first(self, channels_first):
        """Data saved with either `channels_first` value reloads in the layout `load_wav` returns."""
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data("int32", 2, channels_first=channels_first)
        soundfile_backend.save(
            wav_path, samples, 8000, channels_first=channels_first)
        reloaded = load_wav(wav_path)[0]
        if channels_first:
            expected = samples
        else:
            # Swap the two axes so the saved data is compared in the reloaded layout.
            expected = samples.transpose([1, 0])
        np.testing.assert_array_almost_equal(reloaded.numpy(),
                                             expected.numpy())
 class TestFileObject(TempDirMixin, unittest.TestCase):
    """`soundfile_backend.save` can write into an in-memory file-like object."""
    def _test_fileobj(self, ext):
        """Save into a `BytesIO` with `format=ext` and compare against a file written by soundfile.

        The reference file is written with subtype "FLOAT" for "wav" and with
        soundfile's default subtype (None) for every other extension.
        """
        rate = 16000
        reference_path = self.get_temp_path(f"test.{ext}")
        if ext == "wav":
            subtype = "FLOAT"
        else:
            subtype = None
        waveform = get_wav_data("float32", num_channels=2)
        soundfile.write(
            reference_path, waveform.numpy().T, rate, subtype=subtype)
        expected = soundfile.read(reference_path, dtype="float32")[0]
        buffer = io.BytesIO()
        soundfile_backend.save(buffer, waveform, rate, format=ext)
        # Rewind so that soundfile reads from the start of the buffer.
        buffer.seek(0)
        decoded, decoded_rate = soundfile.read(buffer, dtype="float32")
        assert decoded_rate == rate
        np.testing.assert_array_almost_equal(decoded, expected)
    def test_fileobj_wav(self):
        """Saving WAV audio to a file-like object works."""
        self._test_fileobj("wav")
    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Saving FLAC audio to a file-like object works."""
        self._test_fileobj("flac")
    @skipIfFormatNotSupported("NIST")
    def test_fileobj_nist(self):
        """Saving NIST audio to a file-like object works."""
        self._test_fileobj("NIST")
    @skipIfFormatNotSupported("OGG")
    def test_fileobj_ogg(self):
        """Saving OGG audio to a file-like object works."""
        self._test_fileobj("OGG")
 if __name__ == '__main__':
    unittest.main()
--- a/tests/unit/audio/backends/soundfile/test_io.py
+++ b/tests/unit/audio/backends/soundfile/test_io.py
@ -16,16 +16,17 @@ import os
 import unittest
 import numpy as np
 from paddleaudio.backends import soundfile_load as load
 from paddleaudio.backends import soundfile_save as save
 import soundfile as sf
-import paddlespeech.audio
+from base import BackendTest
 from ..base import BackendTest
 class TestIO(BackendTest):
    def test_load_mono_channel(self):
        sf_data, sf_sr = sf.read(self.files[0])
-        pa_data, pa_sr = paddlespeech.audio.load(
+        pa_data, pa_sr = load(
            self.files[0], normal=False, dtype='float64')
        self.assertEqual(sf_data.dtype, pa_data.dtype)
@ -35,7 +36,7 @@ class TestIO(BackendTest):
    def test_load_multi_channels(self):
        sf_data, sf_sr = sf.read(self.files[1])
        sf_data = sf_data.T  # Channel dim first
-        pa_data, pa_sr = paddlespeech.audio.load(
+        pa_data, pa_sr = load(
            self.files[1], mono=False, normal=False, dtype='float64')
        self.assertEqual(sf_data.dtype, pa_data.dtype)
@ -49,7 +50,7 @@ class TestIO(BackendTest):
        pa_tmp_file = 'pa_tmp.wav'
        sf.write(sf_tmp_file, waveform, sr)
-        paddlespeech.audio.save(waveform, sr, pa_tmp_file)
+        save(waveform, sr, pa_tmp_file)
        self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file))
        for file in [sf_tmp_file, pa_tmp_file]:
@ -62,7 +63,7 @@ class TestIO(BackendTest):
        pa_tmp_file = 'pa_tmp.wav'
        sf.write(sf_tmp_file, waveform.T, sr)
-        paddlespeech.audio.save(waveform.T, sr, pa_tmp_file)
+        save(waveform.T, sr, pa_tmp_file)
        self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file))
        for file in [sf_tmp_file, pa_tmp_file]:
--- a/audio/tests/backends/sox_io/common.py
+++ b/audio/tests/backends/sox_io/common.py
@ -0,0 +1,89 @@
 import itertools
 from unittest import skipIf
 from paddleaudio._internal.module_utils import is_module_available
 from parameterized import parameterized
 def name_func(func, _, params):
    """Build a test-case name: the function name followed by each positional arg, joined by underscores."""
    arg_suffix = "_".join(map(str, params.args))
    return f'{func.__name__}_{arg_suffix}'
 def dtype2subtype(dtype):
    """Map a dtype name to its soundfile subtype string; an unknown name raises KeyError."""
    subtype_by_dtype = dict(
        float64="DOUBLE",
        float32="FLOAT",
        int32="PCM_32",
        int16="PCM_16",
        uint8="PCM_U8",
        int8="PCM_S8")
    return subtype_by_dtype[dtype]
 def skipIfFormatNotSupported(fmt):
    """Return a `skipIf` decorator that skips when soundfile is unavailable or does not list `fmt`."""
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')
    # Import only after the availability check has passed.
    import soundfile
    supported = soundfile.available_formats()
    return skipIf(fmt not in supported,
                  f'"{fmt}" is not supported by soundfile')
 def parameterize(*params):
    """Expand a test over the Cartesian product of the given value lists, naming cases with `name_func`."""
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations, name_func=name_func)
 def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Return the WAV subtype expected for an (encoding, bits_per_sample) pair.

    With neither value given, the subtype is derived from `dtype` via
    `dtype2subtype`. A pair that is not in the table raises ValueError.
    """
    request = (encoding, bits_per_sample)
    subtype_table = {
        (None, None): dtype2subtype(dtype),
        (None, 8): "PCM_U8",
        ("PCM_U", None): "PCM_U8",
        ("PCM_U", 8): "PCM_U8",
        ("PCM_S", None): "PCM_32",
        ("PCM_S", 16): "PCM_16",
        ("PCM_S", 32): "PCM_32",
        ("PCM_F", None): "FLOAT",
        ("PCM_F", 32): "FLOAT",
        ("PCM_F", 64): "DOUBLE",
        ("ULAW", None): "ULAW",
        ("ULAW", 8): "ULAW",
        ("ALAW", None): "ALAW",
        ("ALAW", 8): "ALAW",
    }
    resolved = subtype_table.get(request)
    if not resolved:
        raise ValueError(
            f"wav does not support ({encoding}, {bits_per_sample}).")
    return resolved
 def get_encoding(ext, dtype):
    """Return the encoding name expected for a file extension and dtype.

    For "mp3", "flac" and "vorbis" the upper-cased extension is returned and
    `dtype` is ignored. Any other extension is resolved from `dtype`, and an
    unknown dtype raises KeyError.
    """
    if ext in {"mp3", "flac", "vorbis"}:
        return ext.upper()
    encoding_by_dtype = dict(
        float32="PCM_F", int32="PCM_S", int16="PCM_S", uint8="PCM_U")
    return encoding_by_dtype[dtype]
 def get_bit_depth(dtype):
    """Return the number of bits per sample for a dtype name; an unknown name raises KeyError."""
    depth_by_dtype = dict(float32=32, int32=32, int16=16, uint8=8)
    return depth_by_dtype[dtype]
 def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for an extension.

    "flac" maps to 24; "mp3" and "vorbis" map to 0. Every other extension
    uses the bit depth of `dtype`.
    """
    # The dtype depth is looked up first, so an unknown dtype raises KeyError
    # even for the extensions that have a fixed value.
    dtype_depth = get_bit_depth(dtype)
    fixed_depths = dict(flac=24, mp3=0, vorbis=0)
    return fixed_depths.get(ext, dtype_depth)
--- a/audio/tests/backends/sox_io/common_utils
+++ b/audio/tests/backends/sox_io/common_utils
@ -0,0 +1 @@
 ../../common_utils
--- a/audio/tests/backends/sox_io/info_test.py
+++ b/audio/tests/backends/sox_io/info_test.py
@ -0,0 +1,322 @@
 import io
 import itertools
 import os
 import platform
 import tarfile
 import unittest
 from contextlib import contextmanager
 if platform.system() == "Windows":
    import warnings
    warnings.warn("sox io not support in Windows, please skip test.")
    exit()
 from parameterized import parameterized
 from common import get_bits_per_sample, get_encoding
 from paddleaudio.backends import sox_io_backend
 from common_utils import (
    get_wav_data,
    save_wav,
    TempDirMixin,
    sox_utils, )
 #code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/info_test.py
 class TestInfo(TempDirMixin, unittest.TestCase):
    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2])))
    def test_wav(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` reports correct metadata for a one-second WAV file."""
        seconds = 1
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=seconds * sample_rate)
        save_wav(wav_path, samples, sample_rate)
        meta = sox_io_backend.info(wav_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * seconds
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == sox_utils.get_bit_depth(dtype)
        assert meta.encoding == get_encoding("wav", dtype)
    @parameterized.expand(
        list(
            itertools.product(["float32", "int32"], [8000, 16000],
                              [4, 8, 16, 32])))
    def test_wav_multiple_channels(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` reports correct metadata for WAV files with more than two channels."""
        seconds = 1
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=seconds * sample_rate)
        save_wav(wav_path, samples, sample_rate)
        meta = sox_io_backend.info(wav_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * seconds
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == sox_utils.get_bit_depth(dtype)
    def test_ulaw(self):
        """`sox_io_backend.info` reports correct metadata for a u-law encoded file."""
        sample_rate, num_channels, seconds = 8000, 1, 1
        ulaw_path = self.get_temp_path("data.wav")
        sox_utils.gen_audio_file(
            ulaw_path,
            sample_rate=sample_rate,
            num_channels=num_channels,
            bit_depth=8,
            encoding="u-law",
            duration=seconds)
        meta = sox_io_backend.info(ulaw_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * seconds
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == 8
        assert meta.encoding == "ULAW"
    def test_alaw(self):
        """`sox_io_backend.info` reports correct metadata for an a-law encoded file."""
        sample_rate, num_channels, seconds = 8000, 1, 1
        alaw_path = self.get_temp_path("data.wav")
        sox_utils.gen_audio_file(
            alaw_path,
            sample_rate=sample_rate,
            num_channels=num_channels,
            bit_depth=8,
            encoding="a-law",
            duration=seconds)
        meta = sox_io_backend.info(alaw_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * seconds
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == 8
        assert meta.encoding == "ALAW"
 #class TestInfoOpus(unittest.TestCase):
 #@parameterized.expand(
 #list(
 #itertools.product(
 #["96k"],
 #[1, 2],
 #[0, 5, 10],
 #)
 #),
 #)
 #def test_opus(self, bitrate, num_channels, compression_level):
 #"""`sox_io_backend.info` can check opus file correcty"""
 #path = data_utils.get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus")
 #info = sox_io_backend.info(path)
 #assert info.sample_rate == 48000
 #assert info.num_frames == 32768
 #assert info.num_channels == num_channels
 #assert info.bits_per_sample == 0  # bit_per_sample is irrelevant for compressed formats
 #assert info.encoding == "OPUS"
 class FileObjTestBase(TempDirMixin):
    """Helpers that generate audio fixture files through `sox_utils.gen_audio_file`."""
    def _gen_file(self,
                  ext,
                  dtype,
                  sample_rate,
                  num_channels,
                  num_frames,
                  *,
                  comments=None):
        """Generate `test.<ext>` in the temp dir and return its path.

        The duration is `num_frames / sample_rate`. When `comments` is truthy
        it is first written to a comment file, which is passed to the generator.
        """
        audio_path = self.get_temp_path(f"test.{ext}")
        depth = sox_utils.get_bit_depth(dtype)
        seconds = num_frames / sample_rate
        if comments:
            comment_file = self._gen_comment_file(comments)
        else:
            comment_file = None
        sox_encoding = sox_utils.get_encoding(dtype)
        sox_utils.gen_audio_file(
            audio_path,
            sample_rate,
            num_channels=num_channels,
            encoding=sox_encoding,
            bit_depth=depth,
            duration=seconds,
            comment_file=comment_file)
        return audio_path
    def _gen_comment_file(self, comments):
        """Write `comments` to `comment.txt` in the temp dir and return its path."""
        target = self.get_temp_path("comment.txt")
        with open(target, "w") as handle:
            handle.writelines(comments)
        return target
 class Unseekable:
    """Wrapper that exposes only `read(n)` of the wrapped file object."""
    def __init__(self, fileobj):
        # Keep the wrapped object; this class defines no seek/tell.
        self.fileobj = fileobj
    def read(self, n):
        """Read up to `n` units from the wrapped object and return them."""
        chunk = self.fileobj.read(n)
        return chunk
 class TestFileObject(FileObjTestBase, unittest.TestCase):
    def _query_fileobj(self,
                       ext,
                       dtype,
                       sample_rate,
                       num_channels,
                       num_frames,
                       *,
                       comments=None):
        """Generate an audio file and return `sox_io_backend.info` for its open binary handle.

        The format hint is passed only for "mp3"; otherwise it is None.
        """
        audio_path = self._gen_file(
            ext,
            dtype,
            sample_rate,
            num_channels,
            num_frames,
            comments=comments)
        if ext in ["mp3"]:
            format_hint = ext
        else:
            format_hint = None
        with open(audio_path, "rb") as handle:
            return sox_io_backend.info(handle, format_hint)
    def _query_bytesio(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate an audio file, copy it into a `BytesIO`, and return `sox_io_backend.info` for it."""
        audio_path = self._gen_file(ext, dtype, sample_rate, num_channels,
                                    num_frames)
        if ext in ["mp3"]:
            format_hint = ext
        else:
            format_hint = None
        with open(audio_path, "rb") as handle:
            in_memory = io.BytesIO(handle.read())
        return sox_io_backend.info(in_memory, format_hint)
    def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate an audio file, add it to a tar archive, and return `sox_io_backend.info` for the member."""
        source_path = self._gen_file(ext, dtype, sample_rate, num_channels,
                                     num_frames)
        member_name = os.path.basename(source_path)
        archive_path = self.get_temp_path("archive.tar.gz")
        with tarfile.TarFile(archive_path, "w") as archive:
            archive.add(source_path, arcname=member_name)
        if ext in ["mp3"]:
            format_hint = ext
        else:
            format_hint = None
        with tarfile.TarFile(archive_path, "r") as archive:
            # Query while the archive is still open.
            member = archive.extractfile(member_name)
            return sox_io_backend.info(member, format_hint)
    @contextmanager
    def _set_buffer_size(self, buffer_size):
        """Temporarily switch the buffer size to `buffer_size`, restoring the previous value on exit.

        NOTE(review): `get_buffer_size` / `set_buffer_size` are not imported in
        the visible import section of this file - confirm where they come from
        before using this helper.
        """
        # Read the current value before entering `try`. If this call fails there
        # is nothing to restore, and `finally` must not reference an unbound
        # name, which would mask the real error.
        original_buffer_size = get_buffer_size()
        try:
            set_buffer_size(buffer_size)
            yield
        finally:
            set_buffer_size(original_buffer_size)
    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_fileobj(self, ext, dtype):
        """Querying audio metadata through an open file object works."""
        sample_rate, num_channels = 16000, 2
        requested_frames = 3 * sample_rate
        meta = self._query_fileobj(ext, dtype, sample_rate, num_channels,
                                   requested_frames)
        expected_bits = get_bits_per_sample(ext, dtype)
        # For "mp3" and "vorbis" the expected frame count is 0.
        expected_frames = 0 if ext in ["mp3", "vorbis"] else requested_frames
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels
        assert meta.num_frames == expected_frames
        assert meta.bits_per_sample == expected_bits
        assert meta.encoding == get_encoding(ext, dtype)
    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_bytesio(self, ext, dtype):
        """Querying audio metadata through a BytesIO object works (three seconds at 16000 Hz)."""
        sample_rate, num_channels = 16000, 2
        requested_frames = 3 * sample_rate
        meta = self._query_bytesio(ext, dtype, sample_rate, num_channels,
                                   requested_frames)
        expected_bits = get_bits_per_sample(ext, dtype)
        # For "mp3" and "vorbis" the expected frame count is 0.
        expected_frames = 0 if ext in ["mp3", "vorbis"] else requested_frames
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels
        assert meta.num_frames == expected_frames
        assert meta.bits_per_sample == expected_bits
        assert meta.encoding == get_encoding(ext, dtype)
    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
    ])
    def test_bytesio_tiny(self, ext, dtype):
        """Querying audio metadata through a BytesIO object works for a four-frame file."""
        sample_rate, num_channels = 8000, 2
        requested_frames = 4
        meta = self._query_bytesio(ext, dtype, sample_rate, num_channels,
                                   requested_frames)
        expected_bits = get_bits_per_sample(ext, dtype)
        # For "mp3" and "vorbis" the expected frame count is 0.
        expected_frames = 0 if ext in ["mp3", "vorbis"] else requested_frames
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels
        assert meta.num_frames == expected_frames
        assert meta.bits_per_sample == expected_bits
        assert meta.encoding == get_encoding(ext, dtype)
    @parameterized.expand([
        ("wav", "float32"),
        ("wav", "int32"),
        ("wav", "int16"),
        ("wav", "uint8"),
        ("flac", "float32"),
        ("vorbis", "float32"),
        ("amb", "int16"),
    ])
    def test_tarfile(self, ext, dtype):
        """Querying audio metadata through a tar archive member works."""
        sample_rate, num_channels = 16000, 2
        # Kept as a float, exactly as the fixture generator receives it.
        requested_frames = 3.0 * sample_rate
        meta = self._query_tarfile(ext, dtype, sample_rate, num_channels,
                                   requested_frames)
        expected_bits = get_bits_per_sample(ext, dtype)
        # For "vorbis" the expected frame count is 0.
        expected_frames = 0 if ext in ["vorbis"] else requested_frames
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels
        assert meta.num_frames == expected_frames
        assert meta.bits_per_sample == expected_bits
        assert meta.encoding == get_encoding(ext, dtype)
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/sox_io/load_test.py
+++ b/audio/tests/backends/sox_io/load_test.py
@ -0,0 +1,56 @@
 import itertools
 import platform
 import unittest
 # Stop processing this module on Windows, where (per the warning below) sox io
 # is not supported. This runs before the remaining imports are attempted.
 if platform.system() == "Windows":
    import warnings
    warnings.warn("sox io not support in Windows, please skip test.")
    if __name__ == '__main__':
        # Executed directly as a script: end quietly with a success status,
        # as the previous bare `exit()` did.
        raise SystemExit(0)
    # Imported by a test runner: report the module as skipped. Raising
    # SystemExit here would abort the runner itself instead of skipping.
    raise unittest.SkipTest(
        "sox io not support in Windows, please skip test.")
 from parameterized import parameterized
 import numpy as np
 from paddleaudio.backends import sox_io_backend
 from common_utils import (
    get_wav_data,
    load_wav,
    save_wav, )
 #code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/load_test.py
 class TestLoad(unittest.TestCase):
    """Checks that `sox_io_backend.load` reads wav files correctly."""

    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
        """`sox_io_backend.load` can load wav format correctly.

        Generated data is written with `save_wav`, then read back twice: once
        with the `load_wav` helper (the reference) and once with
        `sox_io_backend.load`. Sample rate and samples must agree
        (samples to 4 decimal places).

        Args:
            dtype: sample dtype name handed to `get_wav_data`.
            sample_rate: sample rate used when writing the file.
            num_channels: number of channels to generate.
            normalize: forwarded to `get_wav_data`, `load_wav` and `load`.
            duration: length in seconds; `duration * sample_rate` frames are
                generated.
        """
        # Function-scope imports keep this block self-contained.
        import os
        import tempfile

        data = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=duration * sample_rate)
        # Use a private temporary directory instead of a fixed relative path:
        # the test then works from any working directory, needs no
        # pre-existing `testdata/` folder and cannot collide with other runs.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, 'reference.wav')
            save_wav(path, data, sample_rate)
            expected = load_wav(path, normalize=normalize)[0]
            data, sr = sox_io_backend.load(path, normalize=normalize)
            # Compare while the file still exists, in case a loader reads lazily.
            assert sr == sample_rate
            np.testing.assert_array_almost_equal(data, expected, decimal=4)

    @parameterized.expand(
        list(
            itertools.product(
                [
                    "float64",
                    "float32",
                    "int32",
                ],
                [8000, 16000],
                [1, 2],
                [False, True], )), )
    def test_wav(self, dtype, sample_rate, num_channels, normalize):
        """`sox_io_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/sox_io/save_test.py
+++ b/audio/tests/backends/sox_io/save_test.py
@ -0,0 +1,188 @@
 import io
 import platform
 import unittest
 # Stop processing this module on Windows, where (per the warning below) sox io
 # is not supported. This runs before the remaining imports are attempted.
 if platform.system() == "Windows":
    import warnings
    warnings.warn("sox io not support in Windows, please skip test.")
    if __name__ == '__main__':
        # Executed directly as a script: end quietly with a success status,
        # as the previous bare `exit()` did.
        raise SystemExit(0)
    # Imported by a test runner: report the module as skipped. Raising
    # SystemExit here would abort the runner itself instead of skipping.
    raise unittest.SkipTest(
        "sox io not support in Windows, please skip test.")
 import numpy as np
 from paddleaudio.backends import sox_io_backend
 from common_utils import (get_wav_data, load_wav, save_wav, nested_params,
                          TempDirMixin, sox_utils)
 #code is from:https://github.com/pytorch/audio/blob/main/torchaudio/test/torchaudio_unittest/backend/sox_io/save_test.py
 def _get_sox_encoding(encoding):
    encodings = {
        "PCM_F": "floating-point",
        "PCM_S": "signed-integer",
        "PCM_U": "unsigned-integer",
        "ULAW": "u-law",
        "ALAW": "a-law",
    }
    return encodings.get(encoding)
 class TestSaveBase(TempDirMixin):
    """Shared helper for checking `sox_io_backend.save` against the sox command."""

    def assert_save_consistency(
            self,
            format: str,
            *,
            compression: float=None,
            encoding: str=None,
            bits_per_sample: int=None,
            sample_rate: float=8000,
            num_channels: int=2,
            num_frames: float=3 * 8000,
            src_dtype: str="int32",
            test_mode: str="path", ):
        """Check that `save` writes a file comparable to the one `sox` writes.

        Both the file produced by `save` and the file produced by the
        equivalent `sox` command have to be loaded to be compared, but many
        formats cannot be opened with common Python modules (like SciPy).
        `sox` is therefore used to convert each result into a format SciPy can
        read (PCM wav) before the comparison. The two branches below differ
        only in steps 2.1 and 3.1.

        This assumes that
         - loading data with SciPy preserves the data well.
         - converting the resulting files into WAV format with `sox` preserves
           the data well.

                          x
                          | 1. Generate source wav file with SciPy
                          |
                          v
          -------------- wav ----------------
         |                                   |
         | 2.1. load with scipy              | 3.1. Convert to the target
         |   then save it into the target    |      format depth with sox
         |   format with paddleaudio         |
         v                                   v
        target format                       target format
         |                                   |
         | 2.2. Convert to wav with sox      | 3.2. Convert to wav with sox
         |                                   |
         v                                   v
        wav                                 wav
         |                                   |
         | 2.3. load with scipy              | 3.3. load with scipy
         |                                   |
         v                                   v
        tensor -------> compare <--------- tensor

        `test_mode` selects how the target is handed to `save`: "path" (file
        path), "fileobj" (open binary file) or "bytesio" (in-memory buffer that
        is written to disk afterwards). Any other value raises ValueError.
        """
        # Both results are converted to this wav flavour before comparison.
        wav_kwargs = dict(encoding="floating-point", bit_depth=32)
        # Options forwarded to every `sox_io_backend.save` call.
        save_kwargs = dict(
            compression=compression,
            encoding=encoding,
            bits_per_sample=bits_per_sample)

        source_wav = self.get_temp_path("1.source.wav")
        saved_target = self.get_temp_path(f"2.1.paddleaudio.{format}")
        saved_as_wav = self.get_temp_path("2.2.result.wav")
        sox_target = self.get_temp_path(f"3.1.sox.{format}")
        sox_as_wav = self.get_temp_path("3.2.ref.wav")

        # 1. Generate original wav
        generated = get_wav_data(
            src_dtype, num_channels, normalize=False, num_frames=num_frames)
        save_wav(source_wav, generated, sample_rate)

        # 2.1. Convert the original wav to target format with paddleaudio
        waveform = load_wav(source_wav, normalize=False)[0]
        if test_mode not in ("path", "fileobj", "bytesio"):
            raise ValueError(f"Unexpected test mode: {test_mode}")
        if test_mode == "path":
            sox_io_backend.save(saved_target, waveform, sample_rate,
                                **save_kwargs)
        elif test_mode == "fileobj":
            with open(saved_target, "bw") as sink:
                sox_io_backend.save(
                    sink, waveform, sample_rate, format=format, **save_kwargs)
        else:
            # "bytesio": encode into memory, then dump the buffer to disk.
            buffer = io.BytesIO()
            sox_io_backend.save(
                buffer, waveform, sample_rate, format=format, **save_kwargs)
            with open(saved_target, "bw") as sink:
                sink.write(buffer.getvalue())

        # 2.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(saved_target, saved_as_wav, **wav_kwargs)
        # 2.3. Load with SciPy
        found = load_wav(saved_as_wav, normalize=False)[0]

        # 3.1. Convert the original wav to target format with sox
        sox_utils.convert_audio_file(
            source_wav,
            sox_target,
            compression=compression,
            encoding=_get_sox_encoding(encoding),
            bit_depth=bits_per_sample)
        # 3.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(sox_target, sox_as_wav, **wav_kwargs)
        # 3.3. Load with SciPy
        expected = load_wav(sox_as_wav, normalize=False)[0]

        np.testing.assert_array_almost_equal(found, expected)
 class TestSave(TestSaveBase, unittest.TestCase):
    """Runs the save-consistency check for wav output."""

    @nested_params(
        ["path"],
        [
            ("PCM_U", 8),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", 8),
            ("ALAW", 8),
        ], )
    def test_save_wav(self, test_mode, enc_params):
        """wav output matches sox for each (encoding, bits per sample) pair."""
        enc_name, depth = enc_params
        self.assert_save_consistency(
            "wav",
            test_mode=test_mode,
            encoding=enc_name,
            bits_per_sample=depth)

    @nested_params(
        ["path"],
        [
            ("float32", ),
            ("int32", ),
        ], )
    def test_save_wav_dtype(self, test_mode, params):
        """wav output matches sox for each source dtype."""
        [source_dtype] = params
        self.assert_save_consistency(
            "wav", test_mode=test_mode, src_dtype=source_dtype)
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/sox_io/smoke_test.py
+++ b/audio/tests/backends/sox_io/smoke_test.py
@ -0,0 +1,189 @@
 import io
 import itertools
 import platform
 import unittest
 # Stop processing this module on Windows, where (per the warning below) sox io
 # is not supported. This runs before the remaining imports are attempted.
 if platform.system() == "Windows":
    import warnings
    warnings.warn("sox io not support in Windows, please skip test.")
    if __name__ == '__main__':
        # Executed directly as a script: end quietly with a success status,
        # as the previous bare `exit()` did.
        raise SystemExit(0)
    # Imported by a test runner: report the module as skipped. Raising
    # SystemExit here would abort the runner itself instead of skipping.
    raise unittest.SkipTest(
        "sox io not support in Windows, please skip test.")
 from parameterized import parameterized
 from paddleaudio.backends import sox_io_backend
 from common_utils import (get_wav_data, TempDirMixin, name_func)
 class SmokeTest(TempDirMixin, unittest.TestCase):
    """Smoke tests for `sox_io_backend` using files on disk.

    The suite only verifies that the sox_io_backend functions do not exhibit
    abnormal behaviors. It is meant to run without any additional tools (such
    as the sox command); without such tools the correctness of each function
    cannot be verified.
    """

    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype="float32"):
        """Save one second of generated audio as `test.<ext>`, then query and reload it."""
        seconds = 1
        target = self.get_temp_path(f"test.{ext}")
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=sample_rate * seconds)

        # 1. run save
        sox_io_backend.save(
            target, waveform, sample_rate, compression=compression)

        # 2. run info
        metadata = sox_io_backend.info(target)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # 3. run load
        reloaded, reloaded_sr = sox_io_backend.load(target, normalize=False)
        assert reloaded_sr == sample_rate
        assert reloaded.shape[0] == num_channels

    # Only "float32" and "int32" are exercised; "int16" and "uint8" are not
    # enabled here.
    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],
                [8000, 16000],
                [1, 2])),
        name_func=name_func)
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # Smoke tests for mp3 and vorbis are not enabled here. Their intended grids
    # were sample rates [8000, 16000] x channels [1, 2] x
    #   mp3 bit rate:          [-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320]
    #   vorbis quality level:  [-1, 0, 1, 2, 3, 3.6, 5, 10]
    # with the third value passed to run_smoke_test as `compression`.

    @parameterized.expand(
        list(itertools.product(
            [8000, 16000],
            [1, 2],
            range(9))),
        name_func=name_func)
    def test_flac(self, sample_rate, num_channels, compression_level):
        """Run smoke test on flac format"""
        self.run_smoke_test(
            "flac", sample_rate, num_channels, compression=compression_level)
 class SmokeTestFileObj(unittest.TestCase):
    """Smoke tests for `sox_io_backend` using an in-memory file object.

    The suite only verifies that the sox_io_backend functions do not exhibit
    abnormal behaviors. It is meant to run without any additional tools (such
    as the sox command); without such tools the correctness of each function
    cannot be verified.
    """

    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype="float32"):
        """Save one second of generated audio into a BytesIO, then query and reload it."""
        seconds = 1
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=sample_rate * seconds)
        buffer = io.BytesIO()

        # 1. run save
        sox_io_backend.save(
            buffer, waveform, sample_rate, compression=compression, format=ext)

        # 2. run info (rewind first so reading starts at the beginning)
        buffer.seek(0)
        metadata = sox_io_backend.info(buffer, format=ext)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # 3. run load (rewind again after info consumed the buffer position)
        buffer.seek(0)
        reloaded, reloaded_sr = sox_io_backend.load(
            buffer, normalize=False, format=ext)
        assert reloaded_sr == sample_rate
        assert reloaded.shape[0] == num_channels

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [1, 2])),
        name_func=name_func)
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # not support yet: smoke tests for mp3 and vorbis. Their intended grids
    # were sample rates [8000, 16000] x channels [1, 2] x
    #   mp3 bit rate:          [-4.2, -0.2, 0, 0.2, 96, 128, 160, 192, 224, 256, 320]
    #   vorbis quality level:  [-1, 0, 1, 2, 3, 3.6, 5, 10]
    # with the third value passed to run_smoke_test as `compression`.

    @parameterized.expand(
        list(itertools.product(
            [8000, 16000],
            [1, 2],
            range(9))),
        name_func=name_func)
    def test_flac(self, sample_rate, num_channels, compression_level):
        # Run smoke test on flac format
        self.run_smoke_test(
            "flac", sample_rate, num_channels, compression=compression_level)
 if __name__ == '__main__':
    #test_func()
    unittest.main()
--- a/audio/tests/backends/sox_io/sox_effect_test.py
+++ b/audio/tests/backends/sox_io/sox_effect_test.py
@ -0,0 +1,364 @@
 #code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/sox_effect/sox_effect_test.py
 import io
 import itertools
 import platform
 import tarfile
 import unittest
 from pathlib import Path
 import numpy as np
 # Stop processing this module on Windows, where (per the warning below) sox io
 # is not supported. This runs before the remaining imports are attempted.
 if platform.system() == "Windows":
    import warnings
    warnings.warn("sox io not support in Windows, please skip test.")
    if __name__ == '__main__':
        # Executed directly as a script: end quietly with a success status,
        # as the previous bare `exit()` did.
        raise SystemExit(0)
    # Imported by a test runner: report the module as skipped. Raising
    # SystemExit here would abort the runner itself instead of skipping.
    raise unittest.SkipTest(
        "sox io not support in Windows, please skip test.")
 from parameterized import parameterized
 from paddleaudio import sox_effects
 from common_utils import (get_sinusoid, get_wav_data, load_wav, save_wav,
                          sox_utils, TempDirMixin, load_effects_params)
 class TestSoxEffects(unittest.TestCase):
    """Basic checks for the `sox_effects` module."""

    def test_init(self):
        """Calling init_sox_effects multiple times does not crash"""
        remaining = 3
        while remaining > 0:
            sox_effects.init_sox_effects()
            remaining -= 1
 class TestSoxEffectsTensor(TempDirMixin, unittest.TestCase):
    """Exercises `sox_effects.apply_effects_tensor`."""

    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],
                [8000, 16000],
                [1, 2, 4, 8],
                [True, False])))
    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
                             channels_first):
        """An empty effect list yields a new tensor holding the same data."""
        source = get_wav_data(
            dtype, num_channels, channels_first=channels_first)
        passed_in = source.clone()
        result, result_sr = sox_effects.apply_effects_tensor(
            passed_in, sample_rate, [], channels_first)

        assert result_sr == sample_rate
        # The tensor handed to the call must still match the pristine copy,
        # i.e. the call did not alter its input.
        np.testing.assert_array_almost_equal(source.numpy(),
                                             passed_in.numpy())
        # The result must be a different object from the input ...
        assert result is not passed_in
        # ... while carrying the same values.
        np.testing.assert_array_almost_equal(passed_in.numpy(),
                                             result.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects(self, args):
        """`apply_effects_tensor` output equals what the sox command produces."""
        chain = args["effects"]
        channel_count = args.get("num_channels", 2)
        in_rate = args.get("input_sample_rate", 8000)
        out_rate = args.get("output_sample_rate")

        wav_in = self.get_temp_path("input.wav")
        wav_ref = self.get_temp_path("reference.wav")

        # Source signal: an 800 Hz float32 sinusoid, written out for sox.
        signal = get_sinusoid(
            frequency=800,
            sample_rate=in_rate,
            n_channels=channel_count,
            dtype="float32")
        save_wav(wav_in, signal, in_rate)

        # Reference result produced by the sox command.
        sox_utils.run_sox_effect(
            wav_in, wav_ref, chain, output_sample_rate=out_rate)
        reference, reference_sr = load_wav(wav_ref)

        result, result_sr = sox_effects.apply_effects_tensor(signal, in_rate,
                                                             chain)

        assert result_sr == reference_sr
        np.testing.assert_array_almost_equal(reference.numpy(),
                                             result.numpy())
 class TestSoxEffectsFile(TempDirMixin, unittest.TestCase):
    """Exercises `sox_effects.apply_effects_file` with file paths."""

    @parameterized.expand(
        list(
            itertools.product(
                ["float32", "int32"],
                [8000, 16000],
                [1, 2, 4, 8],
                [False, True])))
    def test_apply_no_effect(self, dtype, sample_rate, num_channels,
                             channels_first):
        """An empty effect list returns the data that was written to the file."""
        wav_in = self.get_temp_path("input.wav")
        written = get_wav_data(
            dtype, num_channels, channels_first=channels_first)
        save_wav(wav_in, written, sample_rate, channels_first=channels_first)

        result, result_sr = sox_effects.apply_effects_file(
            wav_in, [], normalize=False, channels_first=channels_first)

        assert result_sr == sample_rate
        np.testing.assert_array_almost_equal(written.numpy(), result.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"))
    def test_apply_effects_str(self, args):
        """`apply_effects_file` with a str path equals the sox command output."""
        sample_dtype = "int32"
        channels_first = True
        chain = args["effects"]
        channel_count = args.get("num_channels", 2)
        in_rate = args.get("input_sample_rate", 8000)
        out_rate = args.get("output_sample_rate")

        wav_in = self.get_temp_path("input.wav")
        wav_ref = self.get_temp_path("reference.wav")
        samples = get_wav_data(
            sample_dtype, channel_count, channels_first=channels_first)
        save_wav(wav_in, samples, in_rate, channels_first=channels_first)

        # Reference result produced by the sox command.
        sox_utils.run_sox_effect(
            wav_in, wav_ref, chain, output_sample_rate=out_rate)
        reference, reference_sr = load_wav(wav_ref)

        result, result_sr = sox_effects.apply_effects_file(
            wav_in, chain, normalize=False, channels_first=channels_first)

        assert result_sr == reference_sr
        np.testing.assert_array_almost_equal(reference.numpy(),
                                             result.numpy())

    def test_apply_effects_path(self):
        """`apply_effects_file` given a `Path` object equals the sox command output."""
        sample_dtype = "int32"
        channels_first = True
        chain = [["hilbert"]]
        channel_count = 2
        in_rate = 8000
        out_rate = 8000

        wav_in = self.get_temp_path("input.wav")
        wav_ref = self.get_temp_path("reference.wav")
        samples = get_wav_data(
            sample_dtype, channel_count, channels_first=channels_first)
        save_wav(wav_in, samples, in_rate, channels_first=channels_first)

        # Reference result produced by the sox command.
        sox_utils.run_sox_effect(
            wav_in, wav_ref, chain, output_sample_rate=out_rate)
        reference, reference_sr = load_wav(wav_ref)

        # Same call as the str variant, but the path is wrapped in `Path`.
        result, result_sr = sox_effects.apply_effects_file(
            Path(wav_in),
            chain,
            normalize=False,
            channels_first=channels_first)

        assert result_sr == reference_sr
        np.testing.assert_array_almost_equal(reference.numpy(),
                                             result.numpy())
 class TestFileFormats(TempDirMixin, unittest.TestCase):
    """`apply_effects_file` gives the same result as sox on various file formats"""

    @parameterized.expand(
        list(itertools.product(
            ["float32", "int32"],
            [8000, 16000],
            [1, 2])))
    def test_wav(self, dtype, sample_rate, num_channels):
        """`apply_effects_file` matches sox for wav input of each dtype/rate/channel count."""
        channels_first = True
        chain = [["band", "300", "10"]]
        wav_in = self.get_temp_path("input.wav")
        wav_ref = self.get_temp_path("reference.wav")

        samples = get_wav_data(
            dtype, num_channels, channels_first=channels_first)
        save_wav(wav_in, samples, sample_rate, channels_first=channels_first)

        # Reference result produced by the sox command.
        sox_utils.run_sox_effect(wav_in, wav_ref, chain)
        reference, reference_sr = load_wav(wav_ref)

        result, result_sr = sox_effects.apply_effects_file(
            wav_in, chain, normalize=False, channels_first=channels_first)

        assert result_sr == reference_sr
        np.testing.assert_array_almost_equal(result.numpy(),
                                             reference.numpy())
    #not support now
    #@parameterized.expand(
    #list(
    #itertools.product(
    #[8000, 16000],
    #[1, 2],
    #)
    #),
    ##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
    #)
    #def test_flac(self, sample_rate, num_channels):
    #"""`apply_effects_file` works on various flac format"""
    #channels_first = True
    #effects = [["band", "300", "10"]]
    #input_path = self.get_temp_path("input.flac")
    #reference_path = self.get_temp_path("reference.wav")
    #sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
    #sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)
    #expected, expected_sr = load_wav(reference_path)
    #found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first)
    #save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first)
    #assert sr == expected_sr
    ##self.assertEqual(found, expected)
    #np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
    #@parameterized.expand(
    #list(
    #itertools.product(
    #[8000, 16000],
    #[1, 2],
    #)
    #),
    ##name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
    #)
    #def test_vorbis(self, sample_rate, num_channels):
    #"""`apply_effects_file` works on various vorbis format"""
    #channels_first = True
    #effects = [["band", "300", "10"]]
    #input_path = self.get_temp_path("input.vorbis")
    #reference_path = self.get_temp_path("reference.wav")
    #sox_utils.gen_audio_file(input_path, sample_rate, num_channels)
    #sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)
    #expected, expected_sr = load_wav(reference_path)
    #found, sr = sox_effects.apply_effects_file(input_path, effects, channels_first=channels_first)
    #save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first)
    #assert sr == expected_sr
    ##self.assertEqual(found, expected)
    #np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
    #@skipIfNoExec("sox")
    #@skipIfNoSox
 class TestFileObject(TempDirMixin, unittest.TestCase):
    """`apply_effects_file` works when given file-like objects.

    Every test applies the same "band" effect to a generated 2-channel int32
    wav file and compares the outcome with the output of the sox command. The
    `compression` parameter of the tests is currently unused: its only consumer
    was a `sox_utils.gen_audio_file(...)` call that is disabled.
    """

    def _prepare_input_and_reference(self, ext, effects, sample_rate,
                                     channels_first):
        """Write the input file and compute the sox reference for it.

        Returns:
            (input_path, expected, expected_sr): path of the generated input
            file and the data / sample rate loaded from the sox output.
        """
        input_path = self.get_temp_path(f"input.{ext}")
        reference_path = self.get_temp_path("reference.wav")
        data = get_wav_data("int32", 2, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)
        sox_utils.run_sox_effect(
            input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)
        return input_path, expected, expected_sr

    def _check_against_reference(self, found, sr, expected, expected_sr,
                                 channels_first):
        """Store the result as `result.wav` and compare it with the reference."""
        save_wav(
            self.get_temp_path("result.wav"),
            found,
            sr,
            channels_first=channels_first)
        assert sr == expected_sr
        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())

    @parameterized.expand([
        ("wav", None),
    ])
    def test_fileobj(self, ext, compression):
        """Applying effects via file object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path, expected, expected_sr = self._prepare_input_and_reference(
            ext, effects, sample_rate, channels_first)

        with open(input_path, "rb") as fileobj:
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first)

        self._check_against_reference(found, sr, expected, expected_sr,
                                      channels_first)

    @parameterized.expand([
        ("wav", None),
    ])
    def test_bytesio(self, ext, compression):
        """Applying effects via BytesIO object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path, expected, expected_sr = self._prepare_input_and_reference(
            ext, effects, sample_rate, channels_first)

        # Read the whole file into memory and hand over the in-memory copy.
        with open(input_path, "rb") as file_:
            fileobj = io.BytesIO(file_.read())
        found, sr = sox_effects.apply_effects_file(
            fileobj, effects, channels_first=channels_first)

        self._check_against_reference(found, sr, expected, expected_sr,
                                      channels_first)

    @parameterized.expand([
        ("wav", None),
    ])
    def test_tarfile(self, ext, compression):
        """Applying effects to audio read from a tar archive member works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        audio_file = f"input.{ext}"
        archive_path = self.get_temp_path("archive.tar.gz")
        input_path, expected, expected_sr = self._prepare_input_and_reference(
            ext, effects, sample_rate, channels_first)

        # NOTE: `tarfile.TarFile` writes a plain (uncompressed) tar archive,
        # regardless of the ".tar.gz" file name used here.
        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(input_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            found, sr = sox_effects.apply_effects_file(
                fileobj, effects, channels_first=channels_first)

        self._check_against_reference(found, sr, expected, expected_sr,
                                      channels_first)
 if __name__ == '__main__':
    unittest.main()
--- a/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
+++ b/audio/tests/backends/sox_io/sox_effect_test_args.jsonl
@ -0,0 +1,77 @@
 {"effects": [["allpass", "300", "10"]]}
 {"effects": [["band", "300", "10"]]}
 {"effects": [["bandpass", "300", "10"]]}
 {"effects": [["bandreject", "300", "10"]]}
 {"effects": [["bass", "-10"]]}
 {"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
 {"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
 {"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
 {"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
 {"effects": [["channels", "1"]]}
 {"effects": [["channels", "2"]]}
 {"effects": [["channels", "3"]]}
 {"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
 {"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
 {"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
 {"effects": [["contrast", "0"]]}
 {"effects": [["contrast", "25"]]}
 {"effects": [["contrast", "50"]]}
 {"effects": [["contrast", "75"]]}
 {"effects": [["contrast", "100"]]}
 {"effects": [["dcshift", "1.0"]]}
 {"effects": [["dcshift", "-1.0"]]}
 {"effects": [["deemph"]], "input_sample_rate": 44100}
 {"effects": [["dither", "-s"]]}
 {"effects": [["dither", "-S"]]}
 {"effects": [["divide"]]}
 {"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
 {"effects": [["earwax"]], "input_sample_rate": 44100}
 {"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
 {"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
 {"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
 {"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
 {"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
 {"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
 {"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
 {"effects": [["equalizer", "300", "10", "5"]]}
 {"effects": [["fade", "q", "3"]]}
 {"effects": [["fade", "h", "3"]]}
 {"effects": [["fade", "t", "3"]]}
 {"effects": [["fade", "l", "3"]]}
 {"effects": [["fade", "p", "3"]]}
 {"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
 {"effects": [["flanger"]]}
 {"effects": [["gain", "-l", "-6"]]}
 {"effects": [["highpass", "-1", "300"]]}
 {"effects": [["highpass", "-2", "300"]]}
 {"effects": [["hilbert"]]}
 {"effects": [["loudness"]]}
 {"effects": [["lowpass", "-1", "300"]]}
 {"effects": [["lowpass", "-2", "300"]]}
 {"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
 {"effects": [["oops"]]}
 {"effects": [["overdrive"]]}
 {"effects": [["pad"]]}
 {"effects": [["phaser"]]}
 {"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
 {"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
 {"effects": [["repeat"]]}
 {"effects": [["reverb"]]}
 {"effects": [["reverse"]]}
 {"effects": [["riaa"]], "input_sample_rate": 44100}
 {"effects": [["silence", "0"]]}
 {"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
 {"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
 {"effects": [["stat"]]}
 {"effects": [["stats"]]}
 {"effects": [["stretch"]]}
 {"effects": [["swap"]]}
 {"effects": [["synth"]]}
 {"effects": [["tempo", "0.9"]]}
 {"effects": [["tempo", "1.1"]]}
 {"effects": [["treble", "3"]]}
 {"effects": [["tremolo", "300", "40"]]}
 {"effects": [["tremolo", "300", "50"]]}
 {"effects": [["trim", "0", "0.1"]]}
 {"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
 {"effects": [["vol", "3"]]}
--- a/audio/tests/benchmark/README.md
+++ b/audio/tests/benchmark/README.md
@ -15,6 +15,7 @@ Result:
 ========================================================================== test session starts ==========================================================================
 platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0
 benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
 rootdir: /ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddleaudio
 plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0
 collected 4 items
--- a/audio/tests/benchmark/log_melspectrogram.py
+++ b/audio/tests/benchmark/log_melspectrogram.py
@ -17,17 +17,15 @@ import urllib.request
 import librosa
 import numpy as np
 import paddle
 import paddleaudio
 import torch
 import torchaudio
 import paddlespeech.audio
 wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
 if not os.path.isfile(os.path.basename(wav_url)):
    urllib.request.urlretrieve(wav_url, os.path.basename(wav_url))
-waveform, sr = paddlespeech.audio.load(
+waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url)))
    os.path.abspath(os.path.basename(wav_url)))
 waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)
 waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0)
@ -57,7 +55,7 @@ def enable_gpu_device():
    paddle.set_device('gpu')
-log_mel_extractor = paddlespeech.audio.features.LogMelSpectrogram(
+log_mel_extractor = paddle.audio.features.LogMelSpectrogram(
    **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype)
@ -67,20 +65,20 @@ def log_melspectrogram():
 def test_log_melspect_cpu(benchmark):
    enable_cpu_device()
-    feature_audio = benchmark(log_melspectrogram)
+    feature_paddleaudio = benchmark(log_melspectrogram)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=3)
+        feature_librosa, feature_paddleaudio, decimal=3)
 def test_log_melspect_gpu(benchmark):
    enable_gpu_device()
-    feature_audio = benchmark(log_melspectrogram)
+    feature_paddleaudio = benchmark(log_melspectrogram)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=2)
+        feature_librosa, feature_paddleaudio, decimal=2)
 mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram(
@ -104,11 +102,11 @@ def test_log_melspect_cpu_torchaudio(benchmark):
    waveform_tensor_torch = waveform_tensor_torch.to('cpu')
    amplitude_to_DB = amplitude_to_DB.to('cpu')
-    feature_audio = benchmark(log_melspectrogram_torchaudio)
+    feature_paddleaudio = benchmark(log_melspectrogram_torchaudio)
    feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
-        feature_librosa, feature_audio, decimal=3)
+        feature_librosa, feature_paddleaudio, decimal=3)
 def test_log_melspect_gpu_torchaudio(benchmark):
--- a/Show More
+++ b/Show More