add sox load_audio&&effets

3 years ago · 98300b86e5
parent 76b7616f26
commit 98300b86e5
24 changed files with 2039 additions and 73 deletions
--- a/cmake/external/pybind.cmake
+++ b/cmake/external/pybind.cmake
@ -3,8 +3,8 @@ include(ExternalProject)

 FetchContent_Declare(
  pybind
-  URL      https://github.com/pybind/pybind11/archive/refs/tags/v2.9.0.zip 
-  URL_HASH SHA256=1c6e0141f7092867c5bf388bc3acdb2689ed49f59c3977651394c6c87ae88232
+  URL      https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.zip
+  URL_HASH SHA256=225df6e6dea7cea7c5754d4ed954e9ca7c43947b849b3795f87cb56437f1bd19
 )
 FetchContent_MakeAvailable(pybind)
 include_directories(${pybind_SOURCE_DIR}/include)
--- a/paddlespeech/audio/_internal/module_utils.py
+++ b/paddlespeech/audio/_internal/module_utils.py
@ -145,4 +145,4 @@ def requires_sox():

            return wrapped

-    return
+    return decorator
--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
@ -29,7 +29,7 @@ def _fail_load(
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
-) -> Tuple[paddle.Tensor, int]:
+) -> Tuple[Tensor, int]:
    raise RuntimeError("Failed to load audio from {}".format(filepath))


@ -41,6 +41,7 @@ _fallback_info_fileobj = _fail_info_fileobj
 _fallback_load = _fail_load
 _fallback_load_filebj = _fail_load_fileobj

+@_mod_utils.requires_sox()
 def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
@ -51,6 +52,7 @@ def load(
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    raise RuntimeError("No audio I/O backend is available.")

+@_mod_utils.requires_sox()
 def save(filepath: str, 
         src: Tensor, 
         sample_rate: int, 
--- a/paddlespeech/audio/src/CMakeLists.txt
+++ b/paddlespeech/audio/src/CMakeLists.txt
@ -35,6 +35,11 @@ if(BUILD_SOX)
  list(
    APPEND
    LIBPADDLEAUDIO_SOURCES
+    sox/io.cpp
+    sox/utils.cpp
+    sox/effects.cpp
+    sox/effects_chain.cpp
+    sox/types.cpp
    )
  list(
    APPEND
@ -139,8 +144,8 @@ if(BUILD_SOX)
  list(
    APPEND
    EXTENSION_SOURCES
-    # pybind/sox/effects.cpp
-    # pybind/sox/effects_chain.cpp
+    pybind/sox/effects.cpp
+    pybind/sox/effects_chain.cpp
    pybind/sox/io.cpp
    pybind/sox/utils.cpp
    )
@ -192,4 +197,4 @@ define_extension(
 #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
 #     )
 # endif()
-endif()
+endif()
--- a/paddlespeech/audio/src/pybind/pybind.cpp
+++ b/paddlespeech/audio/src/pybind/pybind.cpp
@ -3,6 +3,7 @@

 #include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
 #include "paddlespeech/audio/src/pybind/sox/io.h"
+#include "paddlespeech/audio/src/pybind/sox/effects.h"
 #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"

 PYBIND11_MODULE(_paddleaudio, m) {
@ -13,6 +14,15 @@ PYBIND11_MODULE(_paddleaudio, m) {
    m.def("get_info_fileobj",
          &paddleaudio::sox_io::get_info_fileobj,
          "Get metadata of audio in file object.");
+    m.def("load_audio_fileobj",
+          &paddleaudio::sox_io::load_audio_fileobj,
+          "Load audio from file object.");
+    m.def("save_audio_fileobj",
+          &paddleaudio::sox_io::save_audio_fileobj,
+          "Save audio to file obj.");
+    m.def("apply_effects_fileobj",
+          &paddleaudio::sox_effects::apply_effects_fileobj,
+          "Decode audio data from file-like obj and apply effects.");
 #endif

 #ifdef INCLUDE_KALDI
--- a/paddlespeech/audio/src/pybind/sox/effects.cpp
+++ b/paddlespeech/audio/src/pybind/sox/effects.cpp
@ -0,0 +1,121 @@
+#include "paddlespeech/audio/src/pybind/sox/effects.h"
+#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
+#include "paddlespeech/audio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects {
+
+// Streaming decoding over file-like object is tricky because libsox operates on
+// FILE pointer. The following is what `sox` and `play` commands do
+//  - file input -> FILE pointer
+//  - URL input -> call wget in subprocess and pipe the data -> FILE pointer
+//  - stdin -> FILE pointer
+//
+// We want to, instead, fetch byte strings chunk by chunk, consume them, and
+// discard.
+//
+// Here is the approach
+// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
+// chunk of byte string
+//    This will perform header-based format detection, if necessary, then fill
+//    the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
+//    which returns FILE* which points to the buffer of the provided byte string.
+// 2. Each time sox reads a chunk from the FILE*, we update the underlying
+// buffer in a way that it
+//    starts with unseen data, and append the new data read from the given
+//    fileobj. This will trick libsox as if it keeps reading from the FILE*
+//    continuously.
+// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
+  //
+  // For certain format (such as FLAC), libsox keeps reading the content at
+  // the initialization unless it reaches EOF even when the header is properly
+  // parsed. (Making buffer size 8192, which is way bigger than the header,
+  // resulted in libsox consuming all the buffer content at the time it opens
+  // the file.) Therefore buffer has to always contain valid data, except after
+  // EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
+  // first check if there is enough data to fill the buffer. `read_fileobj`
+  // repeatedly calls `read`  method until it receives the requested length of
+  // bytes or it reaches EOF. If we get bytes shorter than requested, that means
+  // the whole audio data are fetched.
+  //
+  // * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
+  const auto capacity = [&]() {
+    // NOTE:
+    // Use the abstraction provided by `libpaddleaudio` to access the global
+    // config defined by libsox. Directly using `sox_get_globals` function will
+    // end up retrieving the static variable defined in `_paddleaudio`, which is
+    // not correct.
+    const auto bufsiz = get_buffer_size();
+    const int64_t kDefaultCapacityInBytes = 256;
+    return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
+                                              : kDefaultCapacityInBytes;
+  }();
+  std::string buffer(capacity, '\0');
+  auto* in_buf = const_cast<char*>(buffer.data());
+  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
+  // If the file is shorter than 256, then libsox cannot read the header.
+  auto in_buffer_size = (num_read > 256) ? num_read : 256;
+
+  // Open file (this starts reading the header)
+  // When opening a file there are two functions that can touch FILE*.
+  // * `auto_detect_format`
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
+  // * `startread` handler of detected format.
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
+  // To see the handler of a particular format, go to
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
+  // For example, vorbis can be found
+  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
+  SoxFormat sf(sox_open_mem_read(
+      in_buf,
+      in_buffer_size,
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+
+  // In case of streamed data, length can be 0
+  if (static_cast<sox_format_t*>(sf) == nullptr ||
+      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+
+  // Prepare output buffer
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(sf->signal.length);
+
+  // Create and run SoxEffectsChain
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+
+  // Create tensor from buffer
+  bool channels_first_ = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      channels_first_);
+
+  return std::forward_as_tuple(
+      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
+}
+
+} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/pybind/sox/effects.h
+++ b/paddlespeech/audio/src/pybind/sox/effects.h
@ -0,0 +1,18 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+#include "paddlespeech/audio/src/optional/optional.hpp"
+
+namespace py = pybind11;
+
+namespace paddleaudio::sox_effects {
+
+auto apply_effects_fileobj(
+    py::object fileobj,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/pybind/sox/effects_chain.cpp
+++ b/paddlespeech/audio/src/pybind/sox/effects_chain.cpp
@ -0,0 +1,236 @@
+#include <sox.h>
+
+#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
+#include "paddlespeech/audio/src/pybind/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects_chain {
+
+namespace {
+
+/// helper classes for passing file-like object to SoxEffectChain
+struct FileObjInputPriv {
+  sox_format_t* sf;
+  py::object* fileobj;
+  bool eof_reached;
+  char* buffer;
+  uint64_t buffer_size;
+};
+
+struct FileObjOutputPriv {
+  sox_format_t* sf;
+  py::object* fileobj;
+  char** buffer;
+  size_t* buffer_size;
+};
+
+/// Drain callback of the "input_fileobj_object" effect: refills the in-memory
+/// buffer from the Python file-like object, then decodes samples from it.
+///
+/// @param effp  Effect whose `priv` holds a `FileObjInputPriv`.
+/// @param obuf  Output buffer that receives the decoded samples.
+/// @param osamp In: requested size; out: number of samples actually read.
+/// @return `SOX_EOF` once the file object is exhausted and no more samples can
+///         be decoded, `SOX_SUCCESS` otherwise.
+/// @throws std::runtime_error if `ftell` fails or reports a position past the
+///         end of the buffer.
+///
+/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
+auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
+    -> int {
+  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
+  auto sf = priv->sf;
+  auto buffer = priv->buffer;
+
+  // 1. Refresh the buffer
+  //
+  // NOTE:
+  //   Since the underlying FILE* was opened with `fmemopen`, the only way
+  //   libsox detects EOF is reaching the end of the buffer. (null byte won't
+  //   help) Therefore we need to align the content at the end of buffer,
+  //   otherwise, libsox will keep reading the content beyond intended length.
+  //
+  // Before:
+  //
+  //     |<-------consumed------>|<---remaining--->|
+  //     |***********************|-----------------|
+  //                             ^ ftell
+  //
+  // After:
+  //
+  //     |<-offset->|<---remaining--->|<-new data->|
+  //     |**********|-----------------|++++++++++++|
+  //                ^ ftell
+
+  // NOTE:
+  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
+  //   supposed to be in sync, but there are cases (Vorbis) they are not
+  //   in sync and `tell_off` has seemingly uninitialized value, which
+  //   leads num_remain to be negative and cause segmentation fault
+  //   in `memmove`.
+  const auto tell = ftell((FILE*)sf->fp);
+  if (tell < 0) {
+    throw std::runtime_error("Internal Error: ftell failed.");
+  }
+  const auto num_consumed = static_cast<size_t>(tell);
+  if (num_consumed > priv->buffer_size) {
+    throw std::runtime_error("Internal Error: buffer overrun.");
+  }
+
+  const auto num_remain = priv->buffer_size - num_consumed;
+
+  // 1.1. Fetch the data to see if there is data to fill the buffer
+  size_t num_refill = 0;
+  std::string chunk(num_consumed, '\0');
+  if (num_consumed && !priv->eof_reached) {
+    num_refill = read_fileobj(
+        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
+    if (num_refill < num_consumed) {
+      priv->eof_reached = true;
+    }
+  }
+  const auto offset = num_consumed - num_refill;
+
+  // 1.2. Move the unconsumed data towards the beginning of buffer.
+  if (num_remain) {
+    auto src = static_cast<void*>(buffer + num_consumed);
+    auto dst = static_cast<void*>(buffer + offset);
+    memmove(dst, src, num_remain);
+  }
+
+  // 1.3. Refill the remaining buffer.
+  if (num_refill) {
+    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
+    auto dst = buffer + offset + num_remain;
+    memcpy(dst, src, num_refill);
+  }
+
+  // 1.4. Set the file pointer to the new offset
+  sf->tell_off = offset;
+  fseek((FILE*)sf->fp, offset, SEEK_SET);
+
+  // 2. Perform decoding operation
+  // The following part is practically same as "input" effect
+  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
+
+  // At this point, osamp represents the buffer size in bytes,
+  // but sox_read expects the maximum number of samples ready to read.
+  // Normally, this is fine, but in case when the samples are not 4-byte
+  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
+  // https://github.com/pytorch/audio/issues/2083
+  //
+  // Sub-byte encodings (bits_per_sample < 8) would make the divisor zero, so
+  // the request is only rescaled when a sample occupies at least one byte.
+  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
+  if (bytes_per_sample > 0) {
+    *osamp /= bytes_per_sample;
+  }
+
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % effp->out_signal.channels;
+
+  // Read up to *osamp samples into obuf;
+  // store the actual number read back to *osamp
+  *osamp = sox_read(sf, obuf, *osamp);
+
+  // Decoding is finished when fileobject is exhausted and sox can no longer
+  // decode a sample.
+  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
+}
+
+auto fileobj_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) -> int {
+  *osamp = 0;
+  if (*isamp) {
+    auto priv = static_cast<FileObjOutputPriv*>(effp->priv);
+    auto sf = priv->sf;
+    auto fp = static_cast<FILE*>(sf->fp);
+    auto fileobj = priv->fileobj;
+    auto buffer = priv->buffer;
+
+    // Encode chunk
+    auto num_samples_written = sox_write(sf, ibuf, *isamp);
+    fflush(fp);
+
+    // Copy the encoded chunk to python object.
+    fileobj->attr("write")(py::bytes(*buffer, ftell(fp)));
+
+    // Reset FILE*
+    sf->tell_off = 0;
+    fseek(fp, 0, SEEK_SET);
+
+    if (num_samples_written != *isamp) {
+      if (sf->sox_errno) {
+        std::ostringstream stream;
+        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
+               << sf->filename;
+        throw std::runtime_error(stream.str());
+      }
+      return SOX_EOF;
+    }
+  }
+  return SOX_SUCCESS;
+}
+
+auto get_fileobj_input_handler() -> sox_effect_handler_t* {
+  static sox_effect_handler_t handler{
+      /*name=*/"input_fileobj_object",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/nullptr,
+      /*drain=*/fileobj_input_drain,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileObjInputPriv)};
+  return &handler;
+}
+
+auto get_fileobj_output_handler() -> sox_effect_handler_t* {
+  static sox_effect_handler_t handler{
+      /*name=*/"output_fileobj_object",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/fileobj_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileObjOutputPriv)};
+  return &handler;
+}
+
+} // namespace
+
+void SoxEffectsChainPyBind::addInputFileObj(
+    sox_format_t* sf,
+    char* buffer,
+    uint64_t buffer_size,
+    py::object* fileobj) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+
+  SoxEffect e(sox_create_effect(get_fileobj_input_handler()));
+  auto priv = static_cast<FileObjInputPriv*>(e->priv);
+  priv->sf = sf;
+  priv->fileobj = fileobj;
+  priv->eof_reached = false;
+  priv->buffer = buffer;
+  priv->buffer_size = buffer_size;
+  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input fileobj");
+  }
+}
+
+void SoxEffectsChainPyBind::addOutputFileObj(
+    sox_format_t* sf,
+    char** buffer,
+    size_t* buffer_size,
+    py::object* fileobj) {
+  out_sig_ = sf->signal;
+  SoxEffect e(sox_create_effect(get_fileobj_output_handler()));
+  auto priv = static_cast<FileObjOutputPriv*>(e->priv);
+  priv->sf = sf;
+  priv->fileobj = fileobj;
+  priv->buffer = buffer;
+  priv->buffer_size = buffer_size;
+  if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output fileobj");
+  }
+}
+
+} // namespace paddleaudio::sox_effects_chain
--- a/paddlespeech/audio/src/pybind/sox/effects_chain.h
+++ b/paddlespeech/audio/src/pybind/sox/effects_chain.h
@ -0,0 +1,25 @@
+#pragma once
+
+#include "paddlespeech/audio/src/sox/effects_chain.h"
+
+namespace paddleaudio::sox_effects_chain {
+
+class SoxEffectsChainPyBind : public SoxEffectsChain {
+  using SoxEffectsChain::SoxEffectsChain;
+
+ public:
+  void addInputFileObj(
+      sox_format_t* sf,
+      char* buffer,
+      uint64_t buffer_size,
+      py::object* fileobj);
+
+  void addOutputFileObj(
+      sox_format_t* sf,
+      char** buffer,
+      size_t* buffer_size,
+      py::object* fileobj);
+};
+
+} // namespace paddleaudio::sox_effects_chain
+
--- a/paddlespeech/audio/src/pybind/sox/io.cpp
+++ b/paddlespeech/audio/src/pybind/sox/io.cpp
@ -2,7 +2,14 @@
 // All rights reserved.

 #include "paddlespeech/audio/src/pybind/sox/io.h"
+#include "paddlespeech/audio/src/pybind/sox/effects.h"
+#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
 #include "paddlespeech/audio/src/pybind/sox/utils.h"
+#include "paddlespeech/audio/src/optional/optional.hpp"
+
+#include "paddlespeech/audio/src/sox/io.h"
+#include "paddlespeech/audio/src/sox/types.h"
+#include "paddlespeech/audio/src/sox/utils.h"

 using namespace paddleaudio::sox_utils;

@ -28,6 +35,35 @@ auto get_info_file(const std::string &path, const std::string &format)
        get_encoding(sf->encoding.encoding));
 }

+std::vector<std::vector<std::string>> get_effects(
+    const tl::optional<int64_t>& frame_offset,
+    const tl::optional<int64_t>& num_frames) {
+  const auto offset = frame_offset.value_or(0);
+  if (offset < 0) {
+    throw std::runtime_error(
+        "Invalid argument: frame_offset must be non-negative.");
+  }
+  const auto frames = num_frames.value_or(-1);
+  if (frames == 0 || frames < -1) {
+    throw std::runtime_error(
+        "Invalid argument: num_frames must be -1 or greater than 0.");
+  }
+
+  std::vector<std::vector<std::string>> effects;
+  if (frames != -1) {
+    std::ostringstream os_offset, os_frames;
+    os_offset << offset << "s";
+    os_frames << "+" << frames << "s";
+    effects.emplace_back(
+        std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
+  } else if (offset != 0) {
+    std::ostringstream os_offset;
+    os_offset << offset << "s";
+    effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
+  }
+  return effects;
+}
+
 auto get_info_fileobj(py::object fileobj, const std::string &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
    const auto capacity = [&]() {
@ -60,5 +96,115 @@ auto get_info_fileobj(py::object fileobj, const std::string &format)
        get_encoding(sf->encoding.encoding));
 }

+// Load audio from a Python file-like object.
+//
+// `frame_offset` / `num_frames` are translated into a sox `trim` effect by
+// `get_effects`, and decoding is delegated to `apply_effects_fileobj`.
+// Returns whatever `apply_effects_fileobj` returns for the given arguments.
+//
+// NOTE: The parameter types must match the declaration in io.h exactly.
+// Taking the optionals by const reference here would define a separate
+// overload and leave the declared function, which pybind.cpp binds, undefined.
+tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
+    py::object fileobj,
+    tl::optional<int64_t> frame_offset,
+    tl::optional<int64_t> num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format) {
+  const auto effects = get_effects(frame_offset, num_frames);
+  return paddleaudio::sox_effects::apply_effects_fileobj(
+      std::move(fileobj), effects, normalize, channels_first, std::move(format));
+}
+
+namespace {
+// helper class to automatically release buffer, to be used by
+// save_audio_fileobj
+struct AutoReleaseBuffer {
+  char* ptr;
+  size_t size;
+
+  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
+  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
+  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
+  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
+  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;
+  ~AutoReleaseBuffer() {
+    if (ptr) {
+      free(ptr);
+    }
+  }
+};
+
+} // namespace
+
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample) {
+
+  if (!format.has_value()) {
+    throw std::runtime_error(
+        "`format` is required when saving to file object.");
+  }
+  const auto filetype = format.value();
+
+  if (filetype == "amr-nb") {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(
+          "amr-nb format only supports single channel audio.");
+    }
+  } else if (filetype == "htk") {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(
+          "htk format only supports single channel audio.");
+    }
+  } else if (filetype == "gsm") {
+    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+    if (num_channels != 1) {
+      throw std::runtime_error(
+          "gsm format only supports single channel audio.");
+    }
+    if (sample_rate != 8000) {
+      throw std::runtime_error(
+          "gsm format only supports a sampling rate of 8kHz.");
+    }
+  }
+
+  const auto signal_info =
+      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
+  const auto encoding_info = get_encodinginfo_for_save(
+      filetype,
+      tensor.dtype(),
+      compression,
+      std::move(encoding),
+      bits_per_sample);
+
+  AutoReleaseBuffer buffer;
+
+  SoxFormat sf(sox_open_memstream_write(
+      &buffer.ptr,
+      &buffer.size,
+      &signal_info,
+      &encoding_info,
+      filetype.c_str(),
+      /*oob=*/nullptr));
+
+  if (static_cast<sox_format_t*>(sf) == nullptr) {
+    throw std::runtime_error(
+        "Error saving audio file: failed to open memory stream.");
+  }
+
+  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
+      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
+      /*output_encoding=*/sf->encoding);
+  chain.addInputTensor(&tensor, sample_rate, channels_first);
+  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
+  chain.run();
+
+  // Closing the sox_format_t is necessary for flushing the last chunk to the
+  // buffer
+  sf.close();
+  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
+}
+
 }  // namespace paddleaudio
 }  // namespace sox_io
--- a/paddlespeech/audio/src/pybind/sox/io.h
+++ b/paddlespeech/audio/src/pybind/sox/io.h
@ -1,11 +1,12 @@
 // Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
 // All rights reserved.

-#ifndef PADDLEAUDIO_PYBIND_SOX_IO_H
-#define PADDLEAUDIO_PYBIND_SOX_IO_H
+#pragma once

 #include "paddlespeech/audio/src/pybind/sox/utils.h"

+namespace py = pybind11;
+
 namespace paddleaudio {
 namespace sox_io {

@ -15,7 +16,24 @@ auto get_info_file(const std::string &path, const std::string &format)
 auto get_info_fileobj(py::object fileobj, const std::string &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;

+auto load_audio_fileobj(
+    py::object fileobj,
+    tl::optional<int64_t> frame_offset,
+    tl::optional<int64_t> num_frames,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    tl::optional<std::string> format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+void save_audio_fileobj(
+    py::object fileobj,
+    py::array tensor,
+    int64_t sample_rate,
+    bool channels_first,
+    tl::optional<double> compression,
+    tl::optional<std::string> format,
+    tl::optional<std::string> encoding,
+    tl::optional<int64_t> bits_per_sample);
+
 }  // namespace paddleaudio
 }  // namespace sox_io
-
-#endif
--- a/paddlespeech/audio/src/pybind/sox/utils.cpp
+++ b/paddlespeech/audio/src/pybind/sox/utils.cpp
@ -8,6 +8,34 @@
 namespace paddleaudio {
 namespace sox_utils {

+auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
+    -> uint64_t {
+    uint64_t num_read = 0;
+    while (num_read < size) {
+        auto request = size - num_read;
+        auto chunk = static_cast<std::string>(
+            static_cast<py::bytes>(fileobj->attr("read")(request)));
+        auto chunk_len = chunk.length();
+        if (chunk_len == 0) {
+            break;
+        }
+        if (chunk_len > request) {
+            std::ostringstream message;
+            message
+                << "Requested up to " << request << " bytes but, "
+                << "received " << chunk_len << " bytes. "
+                << "The given object does not confirm to read protocol of file "
+                   "object.";
+            throw std::runtime_error(message.str());
+        }
+        memcpy(buffer, chunk.data(), chunk_len);
+        buffer += chunk_len;
+        num_read += chunk_len;
+    }
+    return num_read;
+}
+
+/*
 SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
 SoxFormat::~SoxFormat() { close(); }

@ -96,6 +124,6 @@ std::string get_encoding(sox_encoding_t encoding) {
            return "UNKNOWN";
    }
 }
-
+*/
 }  // namespace paddleaudio
 }  // namespace sox_utils
--- a/paddlespeech/audio/src/pybind/sox/utils.h
+++ b/paddlespeech/audio/src/pybind/sox/utils.h
@ -4,39 +4,18 @@
 #pragma once

 #include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
 #include <sox.h>
+#include "paddlespeech/audio/src/optional/optional.hpp"
+#include "paddlespeech/audio/src/sox/utils.h"
+#include "paddlespeech/audio/src/sox/types.h"

 namespace py = pybind11;

 namespace paddleaudio {
 namespace sox_utils {

-/// helper class to automatically close sox_format_t*
-struct SoxFormat {
-    explicit SoxFormat(sox_format_t *fd) noexcept;
-    SoxFormat(const SoxFormat &other) = delete;
-    SoxFormat(SoxFormat &&other) = delete;
-    SoxFormat &operator=(const SoxFormat &other) = delete;
-    SoxFormat &operator=(SoxFormat &&other) = delete;
-    ~SoxFormat();
-    sox_format_t *operator->() const noexcept;
-    operator sox_format_t *() const noexcept;
-
-    void close();
-
-  private:
-    sox_format_t *fd_;
-};
-
 auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;

-int64_t get_buffer_size();
-
-void validate_input_file(const SoxFormat &sf, const std::string &path);
-
-void validate_input_memfile(const SoxFormat &sf);
-
-std::string get_encoding(sox_encoding_t encoding);
-
 }  // namespace paddleaudio
 }  // namespace sox_utils
--- a/paddlespeech/audio/src/sox/effects.cpp
+++ b/paddlespeech/audio/src/sox/effects.cpp
@ -0,0 +1,147 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp
+
+#include <sox.h>
+#include <mutex>
+
+#include "paddlespeech/audio/src/sox/effects.h"
+#include "paddlespeech/audio/src/sox/effects_chain.h"
+#include "paddlespeech/audio/src/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio::sox_effects {
+
+namespace {
+
+enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
+SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
+std::mutex SOX_RESOUCE_STATE_MUTEX;
+
+} // namespace
+
+void initialize_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
+
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      if (sox_init() != SOX_SUCCESS) {
+        throw std::runtime_error("Failed to initialize sox effects.");
+      };
+      SOX_RESOURCE_STATE = Initialized;
+      break;
+    case Initialized:
+      break;
+    case ShutDown:
+      throw std::runtime_error(
+          "SoX Effects has been shut down. Cannot initialize again.");
+  }
+};
+
+/// Shut down the libsox state set up by `initialize_sox_effects`.
+///
+/// Calling this again after a successful shutdown is a no-op.
+///
+/// @throws std::runtime_error if sox effects were never initialized, or if
+///         `sox_quit` fails.
+void shutdown_sox_effects() {
+  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
+
+  switch (SOX_RESOURCE_STATE) {
+    case NotInitialized:
+      throw std::runtime_error(
+          "SoX Effects is not initialized. Cannot shutdown.");
+    case Initialized:
+      // Report the failure as a shutdown failure, not an initialization one.
+      if (sox_quit() != SOX_SUCCESS) {
+        throw std::runtime_error("Failed to shut down sox effects.");
+      }
+      SOX_RESOURCE_STATE = ShutDown;
+      break;
+    case ShutDown:
+      break;
+  }
+}
+
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t> {
+  validate_input_tensor(waveform);
+
+  // Create SoxEffectsChain
+  const auto dtype = waveform.dtype();
+  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+      /*input_encoding=*/get_tensor_encodinginfo(dtype),
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+
+  // Prepare output buffer
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(waveform.size());
+
+  // Build and run effects chain
+  chain.addInputTensor(&waveform, sample_rate, channels_first);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+
+  // Create tensor from buffer
+  auto out_tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      /*normalize=*/false,
+      channels_first);
+
+  return std::tuple<py::array, int64_t>(
+      out_tensor, chain.getOutputSampleRate());
+}
+
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>> {
+  // Open input file
+  SoxFormat sf(sox_open_read(
+      path.c_str(),
+      /*signal=*/nullptr,
+      /*encoding=*/nullptr,
+      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
+
+  if (static_cast<sox_format_t*>(sf) == nullptr ||
+      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
+    return {};
+  }
+
+  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
+
+  // Prepare output
+  std::vector<sox_sample_t> out_buffer;
+  out_buffer.reserve(sf->signal.length);
+
+  // Create and run SoxEffectsChain
+  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
+      /*input_encoding=*/sf->encoding,
+      /*output_encoding=*/get_tensor_encodinginfo(dtype));
+
+  chain.addInputFile(sf);
+  for (const auto& effect : effects) {
+    chain.addEffect(effect);
+  }
+  chain.addOutputBuffer(&out_buffer);
+  chain.run();
+
+  // Create tensor from buffer
+  bool channels_first_ = channels_first.value_or(true);
+  auto tensor = convert_to_tensor(
+      /*buffer=*/out_buffer.data(),
+      /*num_samples=*/out_buffer.size(),
+      /*num_channels=*/chain.getOutputNumChannels(),
+      dtype,
+      normalize.value_or(true),
+      channels_first_);
+
+  return std::tuple<py::array, int64_t>(
+      tensor, chain.getOutputSampleRate());
+}
+
+} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/sox/effects.h
+++ b/paddlespeech/audio/src/sox/effects.h
@ -0,0 +1,29 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include "paddlespeech/audio/src/sox/utils.h"
+
+namespace py = pybind11;
+
+namespace paddleaudio::sox_effects {
+
+// Sets up the sox effects machinery.
+// NOTE(review): definition lives in effects.cpp and is not visible from this
+// header; confirm exact behavior there.
+void initialize_sox_effects();
+
+// Counterpart of initialize_sox_effects().
+// NOTE(review): definition lives in effects.cpp; confirm exact behavior there.
+void shutdown_sox_effects();
+
+// Applies `effects` (one inner vector per effect: name followed by its
+// options) to an in-memory waveform. Returns the processed array paired
+// with the sample rate reported by the effects chain.
+auto apply_effects_tensor(
+    py::array waveform,
+    int64_t sample_rate,
+    const std::vector<std::vector<std::string>>& effects,
+    bool channels_first) -> std::tuple<py::array, int64_t>;
+
+// Same as above but reads the audio from the file at `path`. `normalize`,
+// `channels_first` and `format` are optional. The result is an empty
+// optional when the file cannot be opened or its encoding is unknown.
+auto apply_effects_file(
+    const std::string& path,
+    const std::vector<std::vector<std::string>>& effects,
+    tl::optional<bool> normalize,
+    tl::optional<bool> channels_first,
+    const tl::optional<std::string>& format)
+    -> tl::optional<std::tuple<py::array, int64_t>>;
+
+} // namespace paddleaudio::sox_effects
--- a/paddlespeech/audio/src/sox/effects_chain.cpp
+++ b/paddlespeech/audio/src/sox/effects_chain.cpp
@ -0,0 +1,342 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp
+
+#include "paddlespeech/audio/src/sox/effects_chain.h"
+#include "paddlespeech/audio/src/sox/utils.h"
+
+using namespace paddleaudio::sox_utils;
+
+namespace paddleaudio {
+namespace sox_effects_chain {
+
+namespace {
+
+/// Helper structs for passing the location of the input tensor and of the
+/// output buffer to the sox callbacks.
+///
+/// drain/flow callback functions require a plain C style function signature,
+/// and the way to pass extra data is to attach it to the sox_effect_t::priv
+/// pointer. The following structs are stored behind sox_effect_t::priv, which
+/// gives a sox_effect_t access to the input array and the output buffer.
+struct TensorInputPriv {
+  // Number of samples already handed to sox (flat offset into the array).
+  size_t index;
+  // Input array; not owned by this struct.
+  py::array* waveform;
+  int64_t sample_rate;
+  bool channels_first;
+};
+
+// State of the in-memory sink: the vector that receives the output samples.
+struct TensorOutputPriv {
+  std::vector<sox_sample_t>* buffer;
+};
+// State of the file sink: the sox handle samples are written to.
+struct FileOutputPriv {
+  sox_format_t* sf;
+};
+
+/// Callback function to feed array data to SoxEffectChain.
+///
+/// Used as the `drain` callback of the "input_tensor" effect. On each call it
+/// converts up to `*osamp` elements of the input array, starting at the
+/// position stored in TensorInputPriv::index, into sox_sample_t (int32) and
+/// writes them to `obuf`.
+///
+/// @param effp   effect whose `priv` points at a TensorInputPriv.
+/// @param obuf   destination buffer supplied by sox.
+/// @param osamp  in: capacity of `obuf` in samples; out: samples written
+///               (always a multiple of the channel count).
+/// @return SOX_EOF once every element has been consumed, else SOX_SUCCESS.
+/// @throws std::runtime_error for an element type other than float32, int32,
+///         int16, int8 or uint8.
+int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
+  // Retrieve the input array and the current read position
+  auto priv = static_cast<TensorInputPriv*>(effp->priv);
+  const size_t index = priv->index;
+  py::array tensor = *(priv->waveform);
+  const auto num_channels = effp->out_signal.channels;
+
+  // Adjust the number of samples to read
+  const size_t num_samples = tensor.size();
+  if (index + *osamp > num_samples) {
+    *osamp = num_samples - index;
+  }
+  // Ensure that it's a multiple of the number of channels
+  *osamp -= *osamp % num_channels;
+  const size_t count = *osamp;
+
+  // NOTE(review): elements are consumed in flat memory order starting at
+  // `index`; priv->channels_first and the array strides are not consulted
+  // here. Confirm that callers pass C-contiguous, sample-interleaved data.
+  py::buffer_info src_info = tensor.request();
+  const char* src = static_cast<const char*>(src_info.ptr) +
+      index * static_cast<size_t>(tensor.itemsize());
+
+  // Convert to sox_sample_t (int32_t), writing straight into obuf.
+  // The case labels are NumPy type numbers.
+  switch (tensor.dtype().num()) {
+    case 11: { // float32
+      const float* in = reinterpret_cast<const float*>(src);
+      for (size_t i = 0; i < count; ++i) {
+        // Need to convert to 64-bit precision so that
+        // values around INT32_MIN/MAX are handled correctly.
+        const double elem = in[i] * 2147483648.;
+        if (elem > INT32_MAX) {
+          obuf[i] = INT32_MAX;
+        } else if (elem < INT32_MIN) {
+          obuf[i] = INT32_MIN;
+        } else {
+          obuf[i] = static_cast<sox_sample_t>(elem);
+        }
+      }
+      break;
+    }
+    case 5: { // int32: already in sox sample format, plain copy
+      std::memcpy(obuf, src, count * sizeof(sox_sample_t));
+      break;
+    }
+    case 3: { // int16: scale by 2^16
+      const int16_t* in = reinterpret_cast<const int16_t*>(src);
+      for (size_t i = 0; i < count; ++i) {
+        obuf[i] = static_cast<sox_sample_t>(in[i]) * 65536;
+      }
+      break;
+    }
+    case 2: { // uint8: offset-binary, re-centre around zero then scale by 2^24
+      const uint8_t* in = reinterpret_cast<const uint8_t*>(src);
+      for (size_t i = 0; i < count; ++i) {
+        obuf[i] = (static_cast<sox_sample_t>(in[i]) - 128) * 16777216;
+      }
+      break;
+    }
+    case 1: { // int8: already signed, scale by 2^24
+      const int8_t* in = reinterpret_cast<const int8_t*>(src);
+      for (size_t i = 0; i < count; ++i) {
+        obuf[i] = static_cast<sox_sample_t>(in[i]) * 16777216;
+      }
+      break;
+    }
+    default:
+      throw std::runtime_error("Unexpected dtype.");
+  }
+
+  priv->index += count;
+  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
+}
+
+/// Flow callback of the "output_tensor" effect: appends every incoming
+/// sample to the std::vector referenced by the effect's TensorOutputPriv.
+/// Nothing is written to `obuf`; `*osamp` is always set to 0.
+int tensor_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+  auto* sink = static_cast<TensorOutputPriv*>(effp->priv);
+  std::vector<sox_sample_t>& collected = *sink->buffer;
+  sox_sample_t const* const input_end = ibuf + *isamp;
+  collected.insert(collected.end(), ibuf, input_end);
+  return SOX_SUCCESS;
+}
+
+/// Flow callback of the "output_file" effect: writes the incoming samples to
+/// the sox_format_t stored in the effect's FileOutputPriv.
+///
+/// Returns SOX_SUCCESS when all samples were written (or none were given),
+/// SOX_EOF on a short write without a sox error code, and throws
+/// std::runtime_error when the short write comes with one.
+int file_output_flow(
+    sox_effect_t* effp,
+    sox_sample_t const* ibuf,
+    sox_sample_t* obuf LSX_UNUSED,
+    size_t* isamp,
+    size_t* osamp) {
+  *osamp = 0;
+  if (!*isamp) {
+    return SOX_SUCCESS;
+  }
+
+  sox_format_t* sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
+  if (sox_write(sf, ibuf, *isamp) == *isamp) {
+    return SOX_SUCCESS;
+  }
+
+  // Short write: report the sox error if there is one.
+  if (!sf->sox_errno) {
+    return SOX_EOF;
+  }
+  std::ostringstream stream;
+  stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
+         << sf->filename;
+  throw std::runtime_error(stream.str());
+}
+
+/// Returns the (function-local static) handler describing the
+/// "input_tensor" effect; its only callback is tensor_input_drain.
+sox_effect_handler_t* get_tensor_input_handler() {
+  // Positional aggregate initialization; the labels name each field.
+  static sox_effect_handler_t descriptor{
+      /*name=*/"input_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/nullptr,
+      /*drain=*/tensor_input_drain,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorInputPriv)};
+  return &descriptor;
+}
+
+/// Returns the (function-local static) handler describing the
+/// "output_tensor" effect; its only callback is tensor_output_flow.
+sox_effect_handler_t* get_tensor_output_handler() {
+  // Positional aggregate initialization; the labels name each field.
+  static sox_effect_handler_t descriptor{
+      /*name=*/"output_tensor",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/tensor_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(TensorOutputPriv)};
+  return &descriptor;
+}
+
+/// Returns the (function-local static) handler describing the
+/// "output_file" effect; its only callback is file_output_flow.
+sox_effect_handler_t* get_file_output_handler() {
+  // Positional aggregate initialization; the labels name each field.
+  static sox_effect_handler_t descriptor{
+      /*name=*/"output_file",
+      /*usage=*/nullptr,
+      /*flags=*/SOX_EFF_MCHAN,
+      /*getopts=*/nullptr,
+      /*start=*/nullptr,
+      /*flow=*/file_output_flow,
+      /*drain=*/nullptr,
+      /*stop=*/nullptr,
+      /*kill=*/nullptr,
+      /*priv_size=*/sizeof(FileOutputPriv)};
+  return &descriptor;
+}
+
+} // namespace
+
+// Wraps the pointer handed in; it is released with free() on destruction.
+SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
+
+// free() accepts a null pointer, so no explicit guard is needed.
+SoxEffect::~SoxEffect() {
+  free(se_);
+}
+
+// Implicit access to the wrapped pointer, e.g. when calling sox functions.
+SoxEffect::operator sox_effect_t*() const {
+  return this->se_;
+}
+
+// Member access on the wrapped sox_effect_t.
+auto SoxEffect::operator->() noexcept -> sox_effect_t* {
+  return this->se_;
+}
+
+// Stores copies of both encodings, value-initializes the three signal infos
+// and creates the sox effects chain.
+// Throws std::runtime_error when sox_create_effects_chain returns null.
+SoxEffectsChain::SoxEffectsChain(
+    sox_encodinginfo_t input_encoding,
+    sox_encodinginfo_t output_encoding)
+    : in_enc_(input_encoding),
+      out_enc_(output_encoding),
+      in_sig_(),
+      interm_sig_(),
+      out_sig_(),
+      // The chain is given the addresses of the member copies, not of the
+      // constructor parameters.
+      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
+  if (!sec_) {
+    throw std::runtime_error("Failed to create effect chain.");
+  }
+}
+
+// Deletes the sox chain if one was created.
+SoxEffectsChain::~SoxEffectsChain() {
+  if (sec_ != nullptr) {
+    sox_delete_effects_chain(sec_);
+  }
+}
+
+// Runs the chain without a callback or client data.
+// NOTE(review): the status returned by sox_flow_effects is discarded.
+void SoxEffectsChain::run() {
+  sox_flow_effects(sec_, NULL, NULL);
+}
+
+// Registers an in-memory array as the source of the chain.
+// The input/intermediate signal infos are derived from the array through
+// get_signalinfo (with "wav" as filetype). Throws std::runtime_error when
+// sox refuses the effect.
+void SoxEffectsChain::addInputTensor(
+    py::array* waveform,
+    int64_t sample_rate,
+    bool channels_first) {
+  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(get_tensor_input_handler()));
+
+  // Fill in the state that tensor_input_drain reads back from priv.
+  TensorInputPriv& state = *static_cast<TensorInputPriv*>(source->priv);
+  state.index = 0;
+  state.waveform = waveform;
+  state.sample_rate = sample_rate;
+  state.channels_first = channels_first;
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: input_tensor");
+  }
+}
+
+// Registers `output_buffer` as the in-memory sink of the chain; the vector
+// is filled by tensor_output_flow. Throws std::runtime_error when sox
+// refuses the effect.
+void SoxEffectsChain::addOutputBuffer(
+    std::vector<sox_sample_t>* output_buffer) {
+  SoxEffect sink(sox_create_effect(get_tensor_output_handler()));
+
+  auto* state = static_cast<TensorOutputPriv*>(sink->priv);
+  state->buffer = output_buffer;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &in_sig_);
+  if (status != SOX_SUCCESS) {
+    throw std::runtime_error(
+        "Internal Error: Failed to add effect: output_tensor");
+  }
+}
+
+// Registers an opened sox file as the source of the chain, using sox's
+// "input" effect. The input/intermediate signal infos are taken from the
+// file. Throws std::runtime_error when sox refuses the effect.
+void SoxEffectsChain::addInputFile(sox_format_t* sf) {
+  in_sig_ = sf->signal;
+  interm_sig_ = in_sig_;
+
+  SoxEffect source(sox_create_effect(sox_find_effect("input")));
+
+  // The file handle itself is passed as the effect's single option.
+  char* options[] = {(char*)sf};
+  sox_effect_options(source, 1, options);
+
+  const int status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
+  if (status == SOX_SUCCESS) {
+    return;
+  }
+  std::ostringstream stream;
+  stream << "Internal Error: Failed to add effect: input " << sf->filename;
+  throw std::runtime_error(stream.str());
+}
+
+// Registers an opened sox file as the sink of the chain; samples reach it
+// through file_output_flow. The output signal info is taken from the file.
+// Throws std::runtime_error when sox refuses the effect.
+void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
+  out_sig_ = sf->signal;
+
+  SoxEffect sink(sox_create_effect(get_file_output_handler()));
+  auto* state = static_cast<FileOutputPriv*>(sink->priv);
+  state->sf = sf;
+
+  const int status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
+  if (status == SOX_SUCCESS) {
+    return;
+  }
+  std::ostringstream stream;
+  stream << "Internal Error: Failed to add effect: output " << sf->filename;
+  throw std::runtime_error(stream.str());
+}
+
+// Appends one sox effect to the chain.
+//
+// `effect` holds the effect name followed by its options. Throws
+// std::runtime_error when the vector is empty, when the name is listed in
+// UNSUPPORTED_EFFECTS or unknown to sox, when sox rejects the options, or
+// when the effect cannot be added to the chain.
+void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
+  if (effect.empty()) {
+    throw std::runtime_error("Invalid argument: empty effect.");
+  }
+  const std::string name = effect.front();
+
+  // Deny-listed names and names sox does not know produce the same error;
+  // sox is only queried when the name is not deny-listed.
+  const bool denied =
+      UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end();
+  const sox_effect_handler_t* handler =
+      denied ? nullptr : sox_find_effect(name.c_str());
+  if (handler == nullptr) {
+    std::ostringstream stream;
+    stream << "Unsupported effect: " << name;
+    throw std::runtime_error(stream.str());
+  }
+
+  SoxEffect e(sox_create_effect(handler));
+
+  // Everything after the name is handed to sox as the effect's options.
+  const auto num_options = effect.size() - 1;
+  std::vector<char*> opts;
+  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+    opts.push_back((char*)it->c_str());
+  }
+  char** option_ptr = num_options ? opts.data() : nullptr;
+  if (sox_effect_options(e, num_options, option_ptr) != SOX_SUCCESS) {
+    std::ostringstream stream;
+    stream << "Invalid effect option:";
+    for (const std::string& token : effect) {
+      stream << " " << token;
+    }
+    throw std::runtime_error(stream.str());
+  }
+
+  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) == SOX_SUCCESS) {
+    return;
+  }
+  std::ostringstream stream;
+  stream << "Internal Error: Failed to add effect: \"" << name;
+  for (auto it = effect.begin() + 1; it != effect.end(); ++it) {
+    stream << " " << *it;
+  }
+  stream << "\"";
+  throw std::runtime_error(stream.str());
+}
+
+// Channel count recorded in the intermediate signal info, i.e. the struct
+// passed to every sox_add_effect call made by this class.
+int64_t SoxEffectsChain::getOutputNumChannels() {
+  return interm_sig_.channels;
+}
+
+// Sample rate recorded in the intermediate signal info; the stored rate is
+// converted to int64_t on return.
+int64_t SoxEffectsChain::getOutputSampleRate() {
+  return interm_sig_.rate;
+}
+
+} // namespace sox_effects_chain
+} // namespace paddleaudio
--- a/paddlespeech/audio/src/sox/effects_chain.h
+++ b/paddlespeech/audio/src/sox/effects_chain.h
@ -0,0 +1,62 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h
+#pragma once
+
+#include <sox.h>
+#include "paddlespeech/audio/src/sox/utils.h"
+
+namespace paddleaudio {
+namespace sox_effects_chain {
+
+// Helper struct to safely release the sox_effect_t* pointer returned by
+// sox_create_effect. Neither copyable nor movable.
+
+struct SoxEffect {
+  explicit SoxEffect(sox_effect_t* se) noexcept;
+  SoxEffect(const SoxEffect& other) = delete;
+  SoxEffect(const SoxEffect&& other) = delete;
+  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
+  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
+  ~SoxEffect();
+  // Implicit conversion so the wrapper can be passed to sox functions.
+  operator sox_effect_t*() const;
+  auto operator->() noexcept -> sox_effect_t*;
+
+ private:
+  sox_effect_t* se_;
+};
+
+// Helper class to safely close sox_effects_chain_t, with handy methods for
+// adding a source, effects and a sink. Neither copyable nor movable.
+class SoxEffectsChain {
+  // Copies of the encodings; their addresses are handed to
+  // sox_create_effects_chain.
+  const sox_encodinginfo_t in_enc_;
+  const sox_encodinginfo_t out_enc_;
+
+ protected:
+  // Signal info set by addInputTensor / addInputFile.
+  sox_signalinfo_t in_sig_;
+  // Signal info passed to every sox_add_effect call; read by the
+  // getOutput* accessors.
+  sox_signalinfo_t interm_sig_;
+  // Signal info set by addOutputFile.
+  sox_signalinfo_t out_sig_;
+  sox_effects_chain_t* sec_;
+
+ public:
+  explicit SoxEffectsChain(
+      sox_encodinginfo_t input_encoding,
+      sox_encodinginfo_t output_encoding);
+  SoxEffectsChain(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
+  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
+  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
+  ~SoxEffectsChain();
+  // Runs the assembled chain.
+  void run();
+  // Source: in-memory array (the pointer is stored, not the data).
+  void addInputTensor(
+      py::array* waveform,
+      int64_t sample_rate,
+      bool channels_first);
+  // Source: opened sox file.
+  void addInputFile(sox_format_t* sf);
+  // Sink: vector that receives the output samples.
+  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
+  // Sink: opened sox file.
+  void addOutputFile(sox_format_t* sf);
+  // `effect` is the effect name followed by its options.
+  void addEffect(const std::vector<std::string> effect);
+  int64_t getOutputNumChannels();
+  int64_t getOutputSampleRate();
+};
+
+} // namespace sox_effects_chain
+} // namespace paddleaudio
+
--- a/paddlespeech/audio/src/sox/io.cpp
+++ b/paddlespeech/audio/src/sox/io.cpp
@ -1,10 +1,10 @@
-// #include "sox/effects.h"
-// #include "sox/effects_chain.h"
-#include "sox/io.h"
-#include "sox/types.h"
-#include "sox/utils.h"
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp
+#include "paddlespeech/audio/src/sox/effects.h"
+#include "paddlespeech/audio/src/sox/effects_chain.h"
+#include "paddlespeech/audio/src/sox/io.h"
+#include "paddlespeech/audio/src/sox/types.h"
+#include "paddlespeech/audio/src/sox/utils.h"

-using namespace torch::indexing;
 using namespace paddleaudio::sox_utils;

 namespace paddleaudio {
@ -60,7 +60,7 @@ std::vector<std::vector<std::string>> get_effects(
    return effects;
 }

-tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
@ -73,7 +73,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
 }

 void save_audio_file(const std::string& path,
-                     torch::Tensor tensor,
+                     py::array tensor,
                     int64_t sample_rate,
                     bool channels_first,
                     tl::optional<double> compression,
@ -88,19 +88,19 @@ void save_audio_file(const std::string& path,
    }();

    if (filetype == "amr-nb") {
-        const auto num_channels = tensor.size(channels_first ? 0 : 1);
-        TORCH_CHECK(num_channels == 1,
-                    "amr-nb format only supports single channel audio.");
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        //TORCH_CHECK(num_channels == 1,
+        //            "amr-nb format only supports single channel audio.");
    } else if (filetype == "htk") {
-        const auto num_channels = tensor.size(channels_first ? 0 : 1);
-        TORCH_CHECK(num_channels == 1,
-                    "htk format only supports single channel audio.");
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+       // TORCH_CHECK(num_channels == 1,
+        //            "htk format only supports single channel audio.");
    } else if (filetype == "gsm") {
-        const auto num_channels = tensor.size(channels_first ? 0 : 1);
-        TORCH_CHECK(num_channels == 1,
-                    "gsm format only supports single channel audio.");
-        TORCH_CHECK(sample_rate == 8000,
-                    "gsm format only supports a sampling rate of 8kHz.");
+        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
+        //TORCH_CHECK(num_channels == 1,
+        //            "gsm format only supports single channel audio.");
+        //TORCH_CHECK(sample_rate == 8000,
+        //            "gsm format only supports a sampling rate of 8kHz.");
    }
    const auto signal_info =
        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
@ -127,13 +127,5 @@ void save_audio_file(const std::string& path,
    chain.run();
 }

-TORCH_LIBRARY_FRAGMENT(paddleaudio, m) {
-    m.def("paddleaudio::sox_io_get_info", &paddleaudio::sox_io::get_info_file);
-    m.def("paddleaudio::sox_io_load_audio_file",
-          &paddleaudio::sox_io::load_audio_file);
-    m.def("paddleaudio::sox_io_save_audio_file",
-          &paddleaudio::sox_io::save_audio_file);
-}
-
 }  // namespace sox_io
 }  // namespace paddleaudio
--- a/paddlespeech/audio/src/sox/io.h
+++ b/paddlespeech/audio/src/sox/io.h
@ -2,11 +2,10 @@
 // Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
 // All rights reserved.

-#ifndef PADDLEAUDIO_SOX_IO_H
-#define PADDLEAUDIO_SOX_IO_H
+#pragma once

-// #include "sox/utils.h"
-#include "optional/optional.hpp"
+#include "paddlespeech/audio/src/optional/optional.hpp"
+#include "paddlespeech/audio/src/sox/utils.h"

 namespace paddleaudio {
 namespace sox_io {
@ -21,7 +20,7 @@ using MetaDataTuple =
 tl::optional<MetaDataTuple> get_info_file(
    const std::string& path, const tl::optional<std::string>& format);

-tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
+tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
@ -30,7 +29,7 @@ tl::optional<std::tuple<torch::Tensor, int64_t>> load_audio_file(
    const tl::optional<std::string>& format);

 void save_audio_file(const std::string& path,
-                     torch::Tensor tensor,
+                     py::array tensor,
                     int64_t sample_rate,
                     bool channels_first,
                     tl::optional<double> compression,
@ -40,5 +39,3 @@ void save_audio_file(const std::string& path,

 }  // namespace sox_io
 }  // namespace paddleaudio
-
-#endif
--- a/paddlespeech/audio/src/sox/types.cpp
+++ b/paddlespeech/audio/src/sox/types.cpp
@ -0,0 +1,143 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
+
+#include "paddlespeech/audio/src/sox/types.h"
+#include <ostream>
+#include <sstream>
+
+namespace paddleaudio {
+namespace sox_utils {
+
+// Maps a format name ("wav", "mp3", ...) to the Format enum. Both "ogg" and
+// "vorbis" select Format::VORBIS. The comparison is exact (case-sensitive).
+// Throws std::runtime_error for any other name.
+Format get_format_from_string(const std::string& format) {
+  struct NamedFormat {
+    const char* name;
+    Format value;
+  };
+  static const NamedFormat kFormats[] = {
+      {"wav", Format::WAV},
+      {"mp3", Format::MP3},
+      {"flac", Format::FLAC},
+      {"ogg", Format::VORBIS},
+      {"vorbis", Format::VORBIS},
+      {"amr-nb", Format::AMR_NB},
+      {"amr-wb", Format::AMR_WB},
+      {"amb", Format::AMB},
+      {"sph", Format::SPHERE},
+      {"htk", Format::HTK},
+      {"gsm", Format::GSM},
+  };
+  for (const NamedFormat& candidate : kFormats) {
+    if (format == candidate.name) {
+      return candidate.value;
+    }
+  }
+  std::ostringstream stream;
+  stream << "Internal Error: unexpected format value: " << format;
+  throw std::runtime_error(stream.str());
+}
+
+// Returns the short textual name of an Encoding value. Values without a
+// name here (including Encoding::NOT_PROVIDED) raise std::runtime_error.
+std::string to_string(Encoding v) {
+  const char* label = nullptr;
+  switch (v) {
+    case Encoding::UNKNOWN:
+      label = "UNKNOWN";
+      break;
+    case Encoding::PCM_SIGNED:
+      label = "PCM_S";
+      break;
+    case Encoding::PCM_UNSIGNED:
+      label = "PCM_U";
+      break;
+    case Encoding::PCM_FLOAT:
+      label = "PCM_F";
+      break;
+    case Encoding::FLAC:
+      label = "FLAC";
+      break;
+    case Encoding::ULAW:
+      label = "ULAW";
+      break;
+    case Encoding::ALAW:
+      label = "ALAW";
+      break;
+    case Encoding::MP3:
+      label = "MP3";
+      break;
+    case Encoding::VORBIS:
+      label = "VORBIS";
+      break;
+    case Encoding::AMR_WB:
+      label = "AMR_WB";
+      break;
+    case Encoding::AMR_NB:
+      label = "AMR_NB";
+      break;
+    case Encoding::OPUS:
+      label = "OPUS";
+      break;
+    default:
+      break;
+  }
+  if (label == nullptr) {
+    throw std::runtime_error("Internal Error: unexpected encoding.");
+  }
+  return label;
+}
+
+// Translates the optional `encoding` string into an Encoding value.
+// An absent option yields Encoding::NOT_PROVIDED; accepted strings are
+// "PCM_S", "PCM_U", "PCM_F", "ULAW" and "ALAW". Anything else raises
+// std::runtime_error.
+Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
+  if (!encoding.has_value()) {
+    return Encoding::NOT_PROVIDED;
+  }
+  struct NamedEncoding {
+    const char* name;
+    Encoding value;
+  };
+  static const NamedEncoding kEncodings[] = {
+      {"PCM_S", Encoding::PCM_SIGNED},
+      {"PCM_U", Encoding::PCM_UNSIGNED},
+      {"PCM_F", Encoding::PCM_FLOAT},
+      {"ULAW", Encoding::ULAW},
+      {"ALAW", Encoding::ALAW},
+  };
+  const std::string requested = encoding.value();
+  for (const NamedEncoding& candidate : kEncodings) {
+    if (requested == candidate.name) {
+      return candidate.value;
+    }
+  }
+  std::ostringstream stream;
+  stream << "Internal Error: unexpected encoding value: " << requested;
+  throw std::runtime_error(stream.str());
+}
+
+// Translates the optional `bit_depth` into a BitDepth value.
+// An absent option yields BitDepth::NOT_PROVIDED; accepted values are 8, 16,
+// 24, 32 and 64. Anything else raises std::runtime_error.
+BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
+  if (!bit_depth.has_value()) {
+    return BitDepth::NOT_PROVIDED;
+  }
+  const int64_t bits = bit_depth.value();
+  if (bits == 8) {
+    return BitDepth::B8;
+  }
+  if (bits == 16) {
+    return BitDepth::B16;
+  }
+  if (bits == 24) {
+    return BitDepth::B24;
+  }
+  if (bits == 32) {
+    return BitDepth::B32;
+  }
+  if (bits == 64) {
+    return BitDepth::B64;
+  }
+  std::ostringstream s;
+  s << "Internal Error: unexpected bit depth value: " << bits;
+  throw std::runtime_error(s.str());
+}
+
+// Returns the short textual name of a sox encoding. Encodings without a
+// dedicated name are reported as "UNKNOWN"; this function never throws for
+// an unrecognised value.
+std::string get_encoding(sox_encoding_t encoding) {
+  const char* label = "UNKNOWN";
+  switch (encoding) {
+    case SOX_ENCODING_SIGN2:
+      label = "PCM_S";
+      break;
+    case SOX_ENCODING_UNSIGNED:
+      label = "PCM_U";
+      break;
+    case SOX_ENCODING_FLOAT:
+      label = "PCM_F";
+      break;
+    case SOX_ENCODING_FLAC:
+      label = "FLAC";
+      break;
+    case SOX_ENCODING_ULAW:
+      label = "ULAW";
+      break;
+    case SOX_ENCODING_ALAW:
+      label = "ALAW";
+      break;
+    case SOX_ENCODING_MP3:
+      label = "MP3";
+      break;
+    case SOX_ENCODING_VORBIS:
+      label = "VORBIS";
+      break;
+    case SOX_ENCODING_AMR_WB:
+      label = "AMR_WB";
+      break;
+    case SOX_ENCODING_AMR_NB:
+      label = "AMR_NB";
+      break;
+    case SOX_ENCODING_OPUS:
+      label = "OPUS";
+      break;
+    case SOX_ENCODING_GSM:
+      label = "GSM";
+      break;
+    case SOX_ENCODING_UNKNOWN:
+    default:
+      break;
+  }
+  return label;
+}
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/paddlespeech/audio/src/sox/types.h
+++ b/paddlespeech/audio/src/sox/types.h
@ -0,0 +1,58 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
+#pragma once
+
+#include <sox.h>
+#include "paddlespeech/audio/src/optional/optional.hpp"
+
+namespace paddleaudio {
+namespace sox_utils {
+
+// Container formats recognised by get_format_from_string.
+enum class Format {
+  WAV,
+  MP3,
+  FLAC,
+  VORBIS,
+  AMR_NB,
+  AMR_WB,
+  AMB,
+  SPHERE,
+  GSM,
+  HTK,
+};
+
+// Maps a format name to Format; throws std::runtime_error for unknown names.
+Format get_format_from_string(const std::string& format);
+
+// Sample encodings. NOT_PROVIDED marks an `encoding` option that was not
+// supplied by the caller.
+enum class Encoding {
+  NOT_PROVIDED,
+  UNKNOWN,
+  PCM_SIGNED,
+  PCM_UNSIGNED,
+  PCM_FLOAT,
+  FLAC,
+  ULAW,
+  ALAW,
+  MP3,
+  VORBIS,
+  AMR_WB,
+  AMR_NB,
+  OPUS,
+};
+
+// Short textual name of an Encoding; throws for values without a name.
+std::string to_string(Encoding v);
+// Parses the optional `encoding` string; absent -> Encoding::NOT_PROVIDED.
+Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
+
+// Bits per sample. Each enumerator's underlying value equals its bit count;
+// NOT_PROVIDED (0) marks an option that was not supplied by the caller.
+enum class BitDepth : unsigned {
+  NOT_PROVIDED = 0,
+  B8 = 8,
+  B16 = 16,
+  B24 = 24,
+  B32 = 32,
+  B64 = 64,
+};
+
+// Parses the optional bit depth; absent -> BitDepth::NOT_PROVIDED.
+BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
+
+// Short textual name of a sox encoding ("UNKNOWN" when there is none).
+std::string get_encoding(sox_encoding_t encoding);
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/paddlespeech/audio/src/sox/utils.cpp
+++ b/paddlespeech/audio/src/sox/utils.cpp
@ -0,0 +1,488 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp
+
+#include <sox.h>
+#include "paddlespeech/audio/src/sox/types.h"
+#include "paddlespeech/audio/src/sox/utils.h"
+
+namespace paddleaudio {
+namespace sox_utils {
+
+// The functions below read or write fields of the struct returned by
+// sox_get_globals(); each value is cast to the type of the target field.
+
+// Stores `seed` (narrowed to sox_int32_t) in the `ranqd1` field.
+void set_seed(const int64_t seed) {
+  sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
+}
+
+// Stores `verbosity` (converted to unsigned) in the `verbosity` field.
+void set_verbosity(const int64_t verbosity) {
+  sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
+}
+
+// Stores `use_threads` in the `use_threads` field.
+void set_use_threads(const bool use_threads) {
+  sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
+}
+
+// Stores `buffer_size` (converted to size_t) in the `bufsiz` field.
+void set_buffer_size(const int64_t buffer_size) {
+  sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
+}
+
+// Returns the current value of the `bufsiz` field.
+int64_t get_buffer_size() {
+  return sox_get_globals()->bufsiz;
+}
+
+// Enumerates the effects known to sox, skipping unnamed handlers and names
+// listed in UNSUPPORTED_EFFECTS. Each entry is {name, usage}; usage is an
+// empty string when the handler provides none.
+std::vector<std::vector<std::string>> list_effects() {
+  std::vector<std::vector<std::string>> effects;
+  const sox_effect_fn_t* fns = sox_get_effect_fns();
+  // The table ends with a null function pointer.
+  while (*fns) {
+    const sox_effect_handler_t* handler = (*fns)();
+    ++fns;
+    if (!handler || !handler->name) {
+      continue;
+    }
+    if (UNSUPPORTED_EFFECTS.find(handler->name) != UNSUPPORTED_EFFECTS.end()) {
+      continue;
+    }
+    std::string usage = handler->usage ? std::string(handler->usage)
+                                       : std::string("");
+    effects.emplace_back(std::vector<std::string>{handler->name, usage});
+  }
+  return effects;
+}
+
+// Collects the names of all sox format handlers that provide a `write`
+// callback. Names containing '/' are left out.
+std::vector<std::string> list_write_formats() {
+  std::vector<std::string> formats;
+  const sox_format_tab_t* entry = sox_get_format_fns();
+  // The table ends with an entry whose `fn` is null.
+  while (entry->fn) {
+    const sox_format_handler_t* handler = entry->fn();
+    const char* const* name = handler->names;
+    while (*name) {
+      const bool has_slash = strchr(*name, '/') != nullptr;
+      if (!has_slash && handler->write) {
+        formats.emplace_back(*name);
+      }
+      ++name;
+    }
+    ++entry;
+  }
+  return formats;
+}
+
+// Collects the names of all sox format handlers that provide a `read`
+// callback. Names containing '/' are left out.
+std::vector<std::string> list_read_formats() {
+  std::vector<std::string> formats;
+  const sox_format_tab_t* entry = sox_get_format_fns();
+  // The table ends with an entry whose `fn` is null.
+  while (entry->fn) {
+    const sox_format_handler_t* handler = entry->fn();
+    const char* const* name = handler->names;
+    while (*name) {
+      const bool has_slash = strchr(*name, '/') != nullptr;
+      if (!has_slash && handler->read) {
+        formats.emplace_back(*name);
+      }
+      ++name;
+    }
+    ++entry;
+  }
+  return formats;
+}
+
+// Wraps a sox_format_t* (possibly null); the handle is closed on destruction.
+SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
+SoxFormat::~SoxFormat() {
+  close();
+}
+
+// Member access on the wrapped handle; no null check is performed here.
+sox_format_t* SoxFormat::operator->() const noexcept {
+  return fd_;
+}
+// Implicit access to the raw handle, e.g. for null checks or sox calls.
+SoxFormat::operator sox_format_t*() const noexcept {
+  return fd_;
+}
+
+// Closes the handle via sox_close if one is held, then resets it to null so
+// that a second call (or the destructor) does nothing.
+void SoxFormat::close() {
+  if (fd_ != nullptr) {
+    sox_close(fd_);
+    fd_ = nullptr;
+  }
+}
+
+// Verifies that `sf` wraps a successfully opened file with a known encoding.
+// Throws std::runtime_error otherwise; `path` is only used in the message.
+void validate_input_file(const SoxFormat& sf, const std::string& path) {
+  sox_format_t* const handle = static_cast<sox_format_t*>(sf);
+  if (!handle) {
+    throw std::runtime_error(
+        "Error loading audio file: failed to open file " + path);
+  }
+  const bool encoding_known = sf->encoding.encoding != SOX_ENCODING_UNKNOWN;
+  if (!encoding_known) {
+    throw std::runtime_error("Error loading audio file: unknown encoding.");
+  }
+}
+
+// Same checks as validate_input_file, with a fixed placeholder standing in
+// for the path in error messages.
+void validate_input_memfile(const SoxFormat &sf) {
+    const std::string placeholder = "<in memory buffer>";
+    validate_input_file(sf, placeholder);
+}
+
+// Checks that `tensor` is a 2-D array with a supported element type.
+//
+// The element type is identified by its NumPy type character. The types
+// named in the error message, float32 ('f'), int32 ('i'), int16 ('h') and
+// uint8 ('B'), are accepted; previously 'h' and 'B' were rejected even though
+// the message advertised them.
+// NOTE(review): 'd' and 'l' were accepted by the earlier check and are kept
+// so existing callers are not broken; confirm whether they are intended.
+//
+// Throws std::runtime_error when a check fails.
+void validate_input_tensor(const py::array tensor) {
+  if (tensor.ndim() != 2) {
+    throw std::runtime_error("Input tensor has to be 2D.");
+  }
+
+  const char dtype = tensor.dtype().char_();
+  switch (dtype) {
+    case 'f': // float32
+    case 'i': // int32
+    case 'h': // int16
+    case 'B': // uint8
+    case 'd': // kept for backward compatibility
+    case 'l': // kept for backward compatibility
+      return;
+    default:
+      throw std::runtime_error(
+          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
+  }
+}
+
+// Chooses the NumPy dtype used to hold samples of the given sox encoding.
+//
+//   SOX_ENCODING_UNSIGNED        -> uint8
+//   SOX_ENCODING_SIGN2, 16 bit   -> int16
+//   SOX_ENCODING_SIGN2, 24/32    -> int32 (24-bit is widened)
+//   everything else              -> float32
+//
+// Throws std::runtime_error for signed PCM with any other precision.
+py::dtype get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision) {
+    switch (encoding) {
+      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
+        // Must be a string: 'u1' in single quotes is a multi-character char
+        // literal (an int), not a dtype specification.
+        return py::dtype("u1");
+      case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
+        switch (precision) {
+          case 16:
+            return py::dtype("i2");
+          case 24: // Cast 24-bit to 32-bit.
+          case 32:
+            return py::dtype("i");
+          default:
+            throw std::runtime_error(
+                "Only 16, 24, and 32 bits are supported for signed PCM.");
+        }
+      default:
+        // default to float32 for the other formats, including
+        // 32-bit floating-point WAV,
+        // MP3,
+        // FLAC,
+        // VORBIS etc...
+        return py::dtype("f");
+    }
+}
+
+// Copies `num_samples` sox samples from `buffer` into a new 2-D array of
+// shape (num_samples / num_channels, num_channels).
+//
+// When `normalize` is true, or `dtype` is float32, the result is a float32
+// array filled through SOX_SAMPLE_TO_FLOAT_32BIT. Otherwise the samples are
+// converted to `dtype`, which must be int32 ('i'), int16 ('h') or uint8
+// ('B'); any other dtype raises std::runtime_error.
+//
+// NOTE(review): `channels_first` is accepted but not used here; the result
+// is always laid out as (frames, channels). Confirm whether callers
+// transpose the result themselves.
+py::array convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const py::dtype dtype,
+    const bool normalize,
+    const bool channels_first) {
+  py::array t;
+  uint64_t dummy = 0; // clip counter required by the SOX_SAMPLE_TO_* macros
+  SOX_SAMPLE_LOCALS;
+  const int32_t num_frames = num_samples / num_channels;
+  // Never write past the allocated array, even if num_samples is not a
+  // multiple of num_channels.
+  const int32_t total = num_frames * num_channels;
+  const char code = dtype.char_();
+  if (normalize || code == 'f') {
+    // The output is float32 no matter what `dtype` is: allocating with an
+    // integer dtype here would make the float writes below overrun the
+    // buffer for element types narrower than four bytes.
+    t = py::array(py::dtype("f"), {num_frames, num_channels});
+    // mutable_data() without indices also works for an empty array.
+    auto ptr = static_cast<float*>(t.mutable_data());
+    for (int32_t i = 0; i < total; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
+    }
+  } else if (code == 'i') { // int32
+    t = py::array(dtype, {num_frames, num_channels});
+    auto ptr = static_cast<int*>(t.mutable_data());
+    for (int32_t i = 0; i < total; ++i) {
+      ptr[i] = buffer[i];
+    }
+  } else if (code == 'h') { // int16
+    t = py::array(dtype, {num_frames, num_channels});
+    auto ptr = static_cast<int16_t*>(t.mutable_data());
+    for (int32_t i = 0; i < total; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
+    }
+  } else if (code == 'B') { // uint8
+    t = py::array(dtype, {num_frames, num_channels});
+    auto ptr = static_cast<uint8_t*>(t.mutable_data());
+    for (int32_t i = 0; i < total; ++i) {
+      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
+    }
+  } else {
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  return t;
+}
+
+// Returns the lower-cased text after the last '.' in `path`. When `path`
+// contains no '.', the whole (lower-cased) path is returned.
+const std::string get_filetype(const std::string path) {
+  const std::string::size_type dot = path.find_last_of(".");
+  // dot + 1 wraps around to 0 when no '.' was found (npos + 1).
+  std::string ext(path, dot + 1);
+  for (char& ch : ext) {
+    ch = ::tolower(ch);
+  }
+  return ext;
+}
+
+namespace {
+
+// Resolves the (sox encoding, bits per sample) pair used when saving in a
+// wav-like format.
+//
+// With no explicit encoding and no explicit bit depth the pair is derived
+// from the array dtype (identified by its NumPy type number). Otherwise the
+// requested encoding/bit-depth combination is validated; unsupported
+// combinations raise std::runtime_error. `format` is only used in messages.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
+    const std::string format,
+    py::dtype dtype,
+    const Encoding& encoding,
+    const BitDepth& bits_per_sample) {
+  switch (encoding) {
+    case Encoding::NOT_PROVIDED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+          switch (dtype.num()) {
+            case 11: // float32 numpy dtype num
+              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
+            case 5: // int numpy dtype num
+              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
+            case 3: // int16 numpy
+              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
+            case 1: // int8 numpy (kept as before)
+            case 2: // uint8 numpy; previously fell through to the error
+              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+            default:
+              throw std::runtime_error("Internal Error: Unexpected dtype.");
+          }
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+        default:
+          return std::make_tuple<>(
+              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
+      }
+    case Encoding::PCM_SIGNED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
+        case BitDepth::B8:
+          throw std::runtime_error(
+              format + " does not support 8-bit signed PCM encoding.");
+        default:
+          return std::make_tuple<>(
+              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
+      }
+    case Encoding::PCM_UNSIGNED:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for unsigned PCM encoding.");
+      }
+    case Encoding::PCM_FLOAT:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B32:
+          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
+        case BitDepth::B64:
+          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
+        default:
+          throw std::runtime_error(
+              format +
+              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
+      }
+    case Encoding::ULAW:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for mu-law encoding.");
+      }
+    case Encoding::ALAW:
+      switch (bits_per_sample) {
+        case BitDepth::NOT_PROVIDED:
+        case BitDepth::B8:
+          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
+        default:
+          throw std::runtime_error(
+              format + " only supports 8-bit for a-law encoding.");
+      }
+    default:
+      throw std::runtime_error(
+          format + " does not support encoding: " + to_string(encoding));
+  }
+}
+
+/// Decide which sox encoding and bit depth a save in `format` should use.
+///
+/// @param format Name of the container format; parsed by
+/// get_format_from_string and echoed in the "Unsupported format" error.
+/// @param dtype Sample dtype of the data; only forwarded to the WAV/AMB path.
+/// @param encoding Optional encoding name, parsed by get_encoding_from_option.
+/// @param bits_per_sample Optional bit depth, parsed by
+/// get_bit_depth_from_option.
+/// @return (sox encoding, bits per sample) for the requested combination.
+/// @throws std::runtime_error when the format is not handled here or does not
+/// accept the requested encoding / bit depth.
+std::tuple<sox_encoding_t, unsigned> get_save_encoding(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample) {
+  const Format fmt = get_format_from_string(format);
+  const Encoding enc = get_encoding_from_option(encoding);
+  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
+
+  const bool enc_given = enc != Encoding::NOT_PROVIDED;
+  const bool bps_given = bps != BitDepth::NOT_PROVIDED;
+  const bool bps_is_8_or_unset = !bps_given || bps == BitDepth::B8;
+
+  // Formats with a single fixed encoding accept neither option. The
+  // `encoding` check runs first so it is the one reported when both options
+  // were supplied.
+  const auto reject_options = [&](const std::string& name) {
+    if (enc_given) {
+      throw std::runtime_error(name + " does not support `encoding` option.");
+    }
+    if (bps_given) {
+      throw std::runtime_error(
+          name + " does not support `bits_per_sample` option.");
+    }
+  };
+
+  if (fmt == Format::WAV || fmt == Format::AMB) {
+    return get_save_encoding_for_wav(format, dtype, enc, bps);
+  }
+  if (fmt == Format::MP3) {
+    reject_options("mp3");
+    return std::make_tuple(SOX_ENCODING_MP3, 16u);
+  }
+  if (fmt == Format::HTK) {
+    reject_options("htk");
+    return std::make_tuple(SOX_ENCODING_SIGN2, 16u);
+  }
+  if (fmt == Format::VORBIS) {
+    reject_options("vorbis");
+    return std::make_tuple(SOX_ENCODING_VORBIS, 16u);
+  }
+  if (fmt == Format::AMR_NB) {
+    reject_options("amr-nb");
+    return std::make_tuple(SOX_ENCODING_AMR_NB, 16u);
+  }
+  if (fmt == Format::FLAC) {
+    if (enc_given) {
+      throw std::runtime_error("flac does not support `encoding` option.");
+    }
+    if (bps == BitDepth::B32 || bps == BitDepth::B64) {
+      throw std::runtime_error(
+          "flac does not support `bits_per_sample` larger than 24.");
+    }
+    // Every other depth, including "not provided", is passed through as its
+    // underlying enum value.
+    return std::make_tuple(SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
+  }
+  if (fmt == Format::SPHERE) {
+    switch (enc) {
+      case Encoding::NOT_PROVIDED:
+      case Encoding::PCM_SIGNED:
+        // Signed PCM falls back to 32 bits when no depth was requested.
+        return std::make_tuple(
+            SOX_ENCODING_SIGN2, bps_given ? static_cast<unsigned>(bps) : 32u);
+      case Encoding::PCM_UNSIGNED:
+        throw std::runtime_error("sph does not support unsigned integer PCM.");
+      case Encoding::PCM_FLOAT:
+        throw std::runtime_error("sph does not support floating point PCM.");
+      case Encoding::ULAW:
+        if (!bps_is_8_or_unset) {
+          throw std::runtime_error(
+              "sph only supports 8-bit for mu-law encoding.");
+        }
+        return std::make_tuple(SOX_ENCODING_ULAW, 8u);
+      case Encoding::ALAW:
+        // Unlike mu-law, an explicit non-8-bit depth is forwarded as-is
+        // rather than rejected.
+        return std::make_tuple(
+            SOX_ENCODING_ALAW,
+            bps_is_8_or_unset ? 8u : static_cast<unsigned>(bps));
+      default:
+        throw std::runtime_error(
+            "sph does not support encoding: " + encoding.value());
+    }
+  }
+  if (fmt == Format::GSM) {
+    reject_options("gsm");
+    return std::make_tuple(SOX_ENCODING_GSM, 16u);
+  }
+
+  throw std::runtime_error("Unsupported format: " + format);
+}
+
+/// Map a file type (and, for wav/amb, the sample dtype) to the precision in
+/// bits that is reported to sox.
+///
+/// @param filetype File type name such as "wav", "flac" or "mp3".
+/// @param dtype Sample dtype; only inspected for "wav" and "amb".
+/// @return Precision in bits, or SOX_UNSPEC for mp3 / ogg / vorbis.
+/// @throws std::runtime_error for an unknown file type, or for a wav/amb
+/// dtype outside the four handled numpy type numbers.
+unsigned get_precision(const std::string filetype, py::dtype dtype) {
+  if (filetype == "mp3" || filetype == "ogg" || filetype == "vorbis") {
+    return SOX_UNSPEC;
+  }
+  if (filetype == "flac") {
+    return 24;
+  }
+  if (filetype == "sph") {
+    return 32;
+  }
+  if (filetype == "amr-nb" || filetype == "gsm" || filetype == "htk") {
+    return 16;
+  }
+  if (filetype == "wav" || filetype == "amb") {
+    // numpy dtype numbers: 1 = byte, 3 = short, 5 = int, 11 = float.
+    const auto type_num = dtype.num();
+    if (type_num == 1) {
+      return 8;
+    }
+    if (type_num == 3) {
+      return 16;
+    }
+    if (type_num == 5 || type_num == 11) {
+      return 32;
+    }
+    throw std::runtime_error("Unsupported dtype.");
+  }
+  throw std::runtime_error("Unsupported file type: " + filetype);
+}
+
+} // namespace
+
+/// Build the sox_signalinfo_t describing `waveform`.
+///
+/// @param waveform Array to describe; dereferenced unconditionally, so it
+/// must not be null.
+/// @param sample_rate Sample rate, converted to sox_rate_t.
+/// @param filetype File type name used to look up the precision.
+/// @param channels_first When true the channel count is read from axis 0,
+/// otherwise from axis 1.
+/// @return Signal info with rate, channels, precision and length filled in;
+/// length is the total number of elements in the array.
+/// @throws std::runtime_error propagated from get_precision.
+sox_signalinfo_t get_signalinfo(
+    const py::array* waveform,
+    const int64_t sample_rate,
+    const std::string filetype,
+    const bool channels_first) {
+  const int channel_axis = channels_first ? 0 : 1;
+
+  const auto rate = static_cast<sox_rate_t>(sample_rate);
+  const auto num_channels =
+      static_cast<unsigned>(waveform->shape(channel_axis));
+  const unsigned precision = get_precision(filetype, waveform->dtype());
+  const auto total_samples = static_cast<uint64_t>(waveform->size());
+
+  return sox_signalinfo_t{rate, num_channels, precision, total_samples};
+}
+
+/// Describe the in-memory sample layout of an array of the given dtype as a
+/// sox_encodinginfo_t.
+///
+/// @param dtype Sample dtype, identified by its numpy type number.
+/// @return Encoding info carrying the matching sox encoding and bit width;
+/// compression is HUGE_VAL, the reverse_* options are sox_option_default and
+/// opposite_endian is sox_false.
+/// @throws std::runtime_error("Unsupported dtype.") for any other dtype.
+sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
+  sox_encoding_t encoding{};
+  unsigned bits_per_sample = 0;
+
+  // numpy dtype numbers: 1 = byte, 3 = short, 5 = int32, 11 = float.
+  switch (dtype.num()) {
+    case 1:
+      encoding = SOX_ENCODING_UNSIGNED;
+      bits_per_sample = 8;
+      break;
+    case 3:
+      encoding = SOX_ENCODING_SIGN2;
+      bits_per_sample = 16;
+      break;
+    case 5:
+      encoding = SOX_ENCODING_SIGN2;
+      bits_per_sample = 32;
+      break;
+    case 11:
+      encoding = SOX_ENCODING_FLOAT;
+      bits_per_sample = 32;
+      break;
+    default:
+      throw std::runtime_error("Unsupported dtype.");
+  }
+
+  return sox_encodinginfo_t{
+      /*encoding=*/encoding,
+      /*bits_per_sample=*/bits_per_sample,
+      /*compression=*/HUGE_VAL,
+      /*reverse_bytes=*/sox_option_default,
+      /*reverse_nibbles=*/sox_option_default,
+      /*reverse_bits=*/sox_option_default,
+      /*opposite_endian=*/sox_false};
+}
+
+/// Assemble the sox_encodinginfo_t used when writing audio in `format`.
+///
+/// @param format Name of the container format.
+/// @param dtype Sample dtype of the data being saved.
+/// @param compression Optional compression setting; HUGE_VAL is used when it
+/// is not provided.
+/// @param encoding Optional encoding name.
+/// @param bits_per_sample Optional bit depth.
+/// @return Encoding info whose encoding and bit depth come from
+/// get_save_encoding; the reverse_* options are sox_option_default and
+/// opposite_endian is sox_false.
+/// @throws std::runtime_error propagated from get_save_encoding.
+sox_encodinginfo_t get_encodinginfo_for_save(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<double> compression,
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample) {
+  sox_encoding_t sox_enc{};
+  unsigned bits = 0;
+  std::tie(sox_enc, bits) =
+      get_save_encoding(format, dtype, encoding, bits_per_sample);
+
+  const double compression_level = compression.value_or(HUGE_VAL);
+
+  return sox_encodinginfo_t{
+      /*encoding=*/sox_enc,
+      /*bits_per_sample=*/bits,
+      /*compression=*/compression_level,
+      /*reverse_bytes=*/sox_option_default,
+      /*reverse_nibbles=*/sox_option_default,
+      /*reverse_bits=*/sox_option_default,
+      /*opposite_endian=*/sox_false};
+}
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/paddlespeech/audio/src/sox/utils.h
+++ b/paddlespeech/audio/src/sox/utils.h
@ -0,0 +1,120 @@
+//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <sox.h>
+
+#include "paddlespeech/audio/src/optional/optional.hpp"
+
+namespace py = pybind11;
+
+namespace paddleaudio {
+namespace sox_utils {
+
+////////////////////////////////////////////////////////////////////////////////
+// APIs for Python interaction
+////////////////////////////////////////////////////////////////////////////////
+
+/// Set sox global options (declarations only; presumably this heading covers all the setters below - confirm against the definitions)
+void set_seed(const int64_t seed);
+
+void set_verbosity(const int64_t verbosity);
+
+void set_use_threads(const bool use_threads);
+
+void set_buffer_size(const int64_t buffer_size);
+
+int64_t get_buffer_size();  // read-side counterpart of set_buffer_size(); units are not stated in this header
+
+std::vector<std::vector<std::string>> list_effects();  // NOTE(review): layout of each inner vector is not documented here - see the definition
+
+std::vector<std::string> list_read_formats();
+
+std::vector<std::string> list_write_formats();
+
+////////////////////////////////////////////////////////////////////////////////
+// Utilities for sox_io / sox_effects implementations
+////////////////////////////////////////////////////////////////////////////////
+
+const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =  // set of effect names; non-inline const in a header, so each including translation unit gets its own copy
+    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};  // NOTE(review): presumably rejected by the effects code - the check is not visible in this header
+
+/// Helper class to automatically close sox_format_t*; it is neither copyable nor movable.
+struct SoxFormat {
+  explicit SoxFormat(sox_format_t* fd) noexcept;  // explicit: a raw pointer never converts to SoxFormat implicitly
+  SoxFormat(const SoxFormat& other) = delete;  // copy construction disabled
+  SoxFormat(SoxFormat&& other) = delete;  // move construction disabled
+  SoxFormat& operator=(const SoxFormat& other) = delete;  // copy assignment disabled
+  SoxFormat& operator=(SoxFormat&& other) = delete;  // move assignment disabled
+  ~SoxFormat();  // per the class comment, expected to close the handle - definition is not in this header
+  sox_format_t* operator->() const noexcept;  // member access on the underlying sox_format_t
+  operator sox_format_t*() const noexcept;  // implicit conversion, so a SoxFormat can be passed where sox_format_t* is expected
+
+  void close();  // explicit close; NOTE(review): confirm interaction with the destructor in the definition
+
+ private:
+  sox_format_t* fd_;  // the raw sox handle held by this object
+};
+
+/// NOTE(review): the wording below ("Tensor", "CPU") is kept from the upstream project; here the argument is a py::array.
+/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
+void validate_input_tensor(const py::array);
+
+void validate_input_file(const SoxFormat& sf, const std::string& path);  // checks performed are defined in the source file, not here
+
+void validate_input_memfile(const SoxFormat &sf);  // in-memory counterpart of validate_input_file, judging by the name - confirm in the definition
+/// Declaration only; the encoding/precision to dtype mapping lives in the definition.
+/// Get target dtype for the given encoding and precision.
+py::dtype get_dtype(
+    const sox_encoding_t encoding,
+    const unsigned precision);
+
+/// NOTE(review): "Tensor" / "kFloat32" below are upstream wording; this declaration returns a py::array.
+/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
+/// NOTE: This function might modify the values in the input buffer to
+/// reduce the number of memory copy.
+/// @param buffer Pointer to buffer that contains audio data.
+/// @param num_samples The number of samples to read.
+/// @param num_channels The number of channels. Used to reshape the resulting
+/// Tensor.
+/// @param dtype Target dtype. Determines the output dtype and value range in
+/// conjunction with normalization.
+/// @param normalize Perform normalization. Only effective when dtype is not
+/// kFloat32. When effective, the output tensor is kFloat32 type and value range
+/// is [-1.0, 1.0]
+/// @param channels_first When True, output Tensor has shape of [num_channels,
+/// num_frames].
+py::array convert_to_tensor(
+    sox_sample_t* buffer,
+    const int32_t num_samples,
+    const int32_t num_channels,
+    const py::dtype dtype,
+    const bool normalize,
+    const bool channels_first);
+
+/// Extract the extension from a file path; the result is returned by value.
+const std::string get_filetype(const std::string path);
+
+/// Get sox_signalinfo_t for passing a py::array object; length is set to the array's total element count.
+sox_signalinfo_t get_signalinfo(
+    const py::array* waveform,  // dereferenced unconditionally by the definition; must not be null
+    const int64_t sample_rate,  // converted to sox_rate_t
+    const std::string filetype,  // selects the precision; an unsupported type throws std::runtime_error
+    const bool channels_first);  // true: channel count is shape(0); false: shape(1)
+
+/// Get sox_encodinginfo_t for Tensor I/O; throws std::runtime_error("Unsupported dtype.") unless the numpy dtype number is 1, 3, 5 or 11.
+sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
+
+/// Get sox_encodinginfo_t for saving to file/file object; throws std::runtime_error for unsupported format/encoding/bit-depth combinations.
+sox_encodinginfo_t get_encodinginfo_for_save(
+    const std::string& format,
+    const py::dtype dtype,
+    const tl::optional<double> compression,  // HUGE_VAL is used when not provided
+    const tl::optional<std::string> encoding,
+    const tl::optional<int64_t> bits_per_sample);
+
+
+} // namespace sox_utils
+} // namespace paddleaudio
--- a/setup.py
+++ b/setup.py
@ -43,7 +43,7 @@ base = [
    "pypinyin", "pypinyin-dict", "python-dateutil", "pyworld", "resampy==0.2.2",
    "sacrebleu", "scipy", "sentencepiece~=0.1.96", "soundfile~=0.10",
    "textgrid", "timer", "tqdm", "typeguard", "visualdl", "webrtcvad",
-    "yacs~=0.1.8", "prettytable", "zhon", "colorlog", "pathos == 0.2.8"
+    "yacs~=0.1.8", "prettytable", "zhon", "colorlog", "pathos == 0.2.8", "Ninja"
 ]

 server = [