diff --git a/paddlespeech/audio/src/sox/effects.cpp b/paddlespeech/audio/src/sox/effects.cpp
deleted file mode 100644
index f2687f93f..000000000
--- a/paddlespeech/audio/src/sox/effects.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.cpp
-
-#include <sox.h>
-#include <mutex>
-
-#include "paddlespeech/audio/src/sox/effects.h"
-#include "paddlespeech/audio/src/sox/effects_chain.h"
-#include "paddlespeech/audio/src/sox/utils.h"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio::sox_effects {
-
-namespace {
-
-enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
-SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
-std::mutex SOX_RESOUCE_STATE_MUTEX;
-
-} // namespace
-
-void initialize_sox_effects() {
-  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
-
-  switch (SOX_RESOURCE_STATE) {
-    case NotInitialized:
-      if (sox_init() != SOX_SUCCESS) {
-        throw std::runtime_error("Failed to initialize sox effects.");
-      };
-      SOX_RESOURCE_STATE = Initialized;
-      break;
-    case Initialized:
-      break;
-    case ShutDown:
-      throw std::runtime_error(
-          "SoX Effects has been shut down. Cannot initialize again.");
-  }
-};
-
-void shutdown_sox_effects() {
-  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
-
-  switch (SOX_RESOURCE_STATE) {
-    case NotInitialized:
-      throw std::runtime_error(
-          "SoX Effects is not initialized. Cannot shutdown.");
-    case Initialized:
-      if (sox_quit() != SOX_SUCCESS) {
-        throw std::runtime_error("Failed to initialize sox effects.");
-      };
-      SOX_RESOURCE_STATE = ShutDown;
-      break;
-    case ShutDown:
-      break;
-  }
-}
-
-auto apply_effects_tensor(
-    py::array waveform,
-    int64_t sample_rate,
-    const std::vector<std::vector<std::string>>& effects,
-    bool channels_first) -> std::tuple<py::array, int64_t> {
-  validate_input_tensor(waveform);
-
-  // Create SoxEffectsChain
-  const auto dtype = waveform.dtype();
-  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-      /*input_encoding=*/get_tensor_encodinginfo(dtype),
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
-  // Prepare output buffer
-  std::vector<sox_sample_t> out_buffer;
-  out_buffer.reserve(waveform.size());
-
-  // Build and run effects chain
-  chain.addInputTensor(&waveform, sample_rate, channels_first);
-  for (const auto& effect : effects) {
-    chain.addEffect(effect);
-  }
-  chain.addOutputBuffer(&out_buffer);
-  chain.run();
-
-  // Create tensor from buffer
-  auto out_tensor = convert_to_tensor(
-      /*buffer=*/out_buffer.data(),
-      /*num_samples=*/out_buffer.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      /*normalize=*/false,
-      channels_first);
-
-  return std::tuple<py::array, int64_t>(
-      out_tensor, chain.getOutputSampleRate());
-}
-
-auto apply_effects_file(
-    const std::string& path,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format)
-    -> tl::optional<std::tuple<py::array, int64_t>> {
-  // Open input file
-  SoxFormat sf(sox_open_read(
-      path.c_str(),
-      /*signal=*/nullptr,
-      /*encoding=*/nullptr,
-      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-  if (static_cast<sox_format_t*>(sf) == nullptr ||
-      sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    return {};
-  }
-
-  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
-
-  // Prepare output
-  std::vector<sox_sample_t> out_buffer;
-  out_buffer.reserve(sf->signal.length);
-
-  // Create and run SoxEffectsChain
-  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-      /*input_encoding=*/sf->encoding,
-      /*output_encoding=*/get_tensor_encodinginfo(dtype));
-
-  chain.addInputFile(sf);
-  for (const auto& effect : effects) {
-    chain.addEffect(effect);
-  }
-  chain.addOutputBuffer(&out_buffer);
-  chain.run();
-
-  // Create tensor from buffer
-  bool channels_first_ = channels_first.value_or(true);
-  auto tensor = convert_to_tensor(
-      /*buffer=*/out_buffer.data(),
-      /*num_samples=*/out_buffer.size(),
-      /*num_channels=*/chain.getOutputNumChannels(),
-      dtype,
-      normalize.value_or(true),
-      channels_first_);
-
-  return std::tuple<py::array, int64_t>(
-      tensor, chain.getOutputSampleRate());
-}
-
-} // namespace paddleaudio::sox_effects
diff --git a/paddlespeech/audio/src/sox/effects.h b/paddlespeech/audio/src/sox/effects.h
deleted file mode 100644
index 81db23b44..000000000
--- a/paddlespeech/audio/src/sox/effects.h
+++ /dev/null
@@ -1,29 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects.h
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include "paddlespeech/audio/src/sox/utils.h"
-
-namespace py = pybind11;
-
-namespace paddleaudio::sox_effects {
-
-void initialize_sox_effects();
-
-void shutdown_sox_effects();
-
-auto apply_effects_tensor(
-    py::array waveform,
-    int64_t sample_rate,
-    const std::vector<std::vector<std::string>>& effects,
-    bool channels_first) -> std::tuple<py::array, int64_t>;
-
-auto apply_effects_file(
-    const std::string& path,
-    const std::vector<std::vector<std::string>>& effects,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format)
-    -> tl::optional<std::tuple<py::array, int64_t>>;
-
-} // namespace torchaudio::sox_effects
diff --git a/paddlespeech/audio/src/sox/effects_chain.cpp b/paddlespeech/audio/src/sox/effects_chain.cpp
deleted file mode 100644
index 1b13fd186..000000000
--- a/paddlespeech/audio/src/sox/effects_chain.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.cpp
-
-#include "paddlespeech/audio/src/sox/effects_chain.h"
-#include "paddlespeech/audio/src/sox/utils.h"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio {
-namespace sox_effects_chain {
-
-namespace {
-
-/// helper classes for passing the location of input tensor and output buffer
-///
-/// drain/flow callback functions require plaing C style function signature and
-/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
-/// The following structs will be assigned to sox_effect_t::priv pointer which
-/// gives sox_effect_t an access to input Tensor and output buffer object.
-struct TensorInputPriv {
-  size_t index;
-  py::array* waveform;
-  int64_t sample_rate;
-  bool channels_first;
-};
-
-struct TensorOutputPriv {
-  std::vector<sox_sample_t>* buffer;
-};
-struct FileOutputPriv {
-  sox_format_t* sf;
-};
-
-/// Callback function to feed Tensor data to SoxEffectChain.
-int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
-  // Retrieve the input Tensor and current index
-  auto priv = static_cast<TensorInputPriv*>(effp->priv);
-  auto index = priv->index;
-  auto tensor = *(priv->waveform);
-  auto num_channels = effp->out_signal.channels;
-
-  // Adjust the number of samples to read
-  const size_t num_samples = tensor.size();
-  if (index + *osamp > num_samples) {
-    *osamp = num_samples - index;
-  }
-  // Ensure that it's a multiple of the number of channels
-  *osamp -= *osamp % num_channels;
-
-  // Slice the input Tensor
-  // refacor this module, chunk
-  auto i_frame = index / num_channels;
-  auto num_frames = *osamp / num_channels;
-  py::array chunk(tensor.dtype(), {num_frames*num_channels});
-  py::buffer_info ori_info = tensor.request();
-  py::buffer_info info = chunk.request();
-  char* ori_start_ptr = (char*)ori_info.ptr + index * chunk.itemsize() / sizeof(char);
-  std::memcpy(info.ptr, ori_start_ptr, chunk.nbytes());
-  
-  py::dtype chunk_type = py::dtype("i"); // dtype int32
-  py::array new_chunk = py::array(chunk_type, chunk.shape());
-  py::buffer_info new_info = new_chunk.request();
-  void* ptr = (void*) info.ptr;
-  int* new_ptr = (int*) new_info.ptr;
-  // Convert to sox_sample_t (int32_t)
-  switch (chunk.dtype().num()) {
-    //case c10::ScalarType::Float: {
-    case 11: {
-      // Need to convert to 64-bit precision so that
-      // values around INT32_MIN/MAX are handled correctly.
-      float* ptr_f = (float*)ptr;
-      for (int idx = 0; idx < chunk.size(); ++idx) {
-        double elem = *ptr_f * 2147483648.;
-        // *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX);
-        if (elem > INT32_MAX) { 
-          *new_ptr = INT32_MAX; 
-        } else if (elem < INT32_MIN) {
-          *new_ptr = INT32_MIN; 
-        } else { *new_ptr = elem; }
-      }
-      break;
-    }
-    //case c10::ScalarType::Int: {
-    case 5: {
-      break;
-    }
-    // case short
-    case 3: {
-      int16_t* ptr_s = (int16_t*) ptr;
-      for (int idx = 0; idx < chunk.size(); ++idx) {
-        *new_ptr = *ptr_s * 65536; 
-      }
-      break;
-    }
-    // case byte
-    case 1: {
-      int8_t* ptr_b = (int8_t*) ptr;
-      for (int idx = 0; idx < chunk.size(); ++idx) {
-        *new_ptr = (*ptr_b - 128) * 16777216; 
-      }
-      break;
-    }
-    default:
-      throw std::runtime_error("Unexpected dtype.");
-  }
-  // Write to buffer
-  memcpy(obuf, (int*)new_info.ptr, *osamp * 4);
-  priv->index += *osamp;
-  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
-}
-
-/// Callback function to fetch data from SoxEffectChain.
-int tensor_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) {
-  *osamp = 0;
-  // Get output buffer
-  auto out_buffer = static_cast<TensorOutputPriv*>(effp->priv)->buffer;
-  // Append at the end
-  out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp);
-  return SOX_SUCCESS;
-}
-
-int file_output_flow(
-    sox_effect_t* effp,
-    sox_sample_t const* ibuf,
-    sox_sample_t* obuf LSX_UNUSED,
-    size_t* isamp,
-    size_t* osamp) {
-  *osamp = 0;
-  if (*isamp) {
-    auto sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
-    if (sox_write(sf, ibuf, *isamp) != *isamp) {
-      if (sf->sox_errno) {
-        std::ostringstream stream;
-        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
-               << sf->filename;
-        throw std::runtime_error(stream.str());
-      }
-      return SOX_EOF;
-    }
-  }
-  return SOX_SUCCESS;
-}
-
-sox_effect_handler_t* get_tensor_input_handler() {
-  static sox_effect_handler_t handler{
-      /*name=*/"input_tensor",
-      /*usage=*/NULL,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/NULL,
-      /*start=*/NULL,
-      /*flow=*/NULL,
-      /*drain=*/tensor_input_drain,
-      /*stop=*/NULL,
-      /*kill=*/NULL,
-      /*priv_size=*/sizeof(TensorInputPriv)};
-  return &handler;
-}
-
-sox_effect_handler_t* get_tensor_output_handler() {
-  static sox_effect_handler_t handler{
-      /*name=*/"output_tensor",
-      /*usage=*/NULL,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/NULL,
-      /*start=*/NULL,
-      /*flow=*/tensor_output_flow,
-      /*drain=*/NULL,
-      /*stop=*/NULL,
-      /*kill=*/NULL,
-      /*priv_size=*/sizeof(TensorOutputPriv)};
-  return &handler;
-}
-
-sox_effect_handler_t* get_file_output_handler() {
-  static sox_effect_handler_t handler{
-      /*name=*/"output_file",
-      /*usage=*/NULL,
-      /*flags=*/SOX_EFF_MCHAN,
-      /*getopts=*/NULL,
-      /*start=*/NULL,
-      /*flow=*/file_output_flow,
-      /*drain=*/NULL,
-      /*stop=*/NULL,
-      /*kill=*/NULL,
-      /*priv_size=*/sizeof(FileOutputPriv)};
-  return &handler;
-}
-
-} // namespace
-
-SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
-
-SoxEffect::~SoxEffect() {
-  if (se_ != nullptr) {
-    free(se_);
-  }
-}
-
-SoxEffect::operator sox_effect_t*() const {
-  return se_;
-}
-
-auto SoxEffect::operator->() noexcept -> sox_effect_t* {
-  return se_;
-}
-
-SoxEffectsChain::SoxEffectsChain(
-    sox_encodinginfo_t input_encoding,
-    sox_encodinginfo_t output_encoding)
-    : in_enc_(input_encoding),
-      out_enc_(output_encoding),
-      in_sig_(),
-      interm_sig_(),
-      out_sig_(),
-      sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
-  if (!sec_) {
-    throw std::runtime_error("Failed to create effect chain.");
-  }
-}
-
-SoxEffectsChain::~SoxEffectsChain() {
-  if (sec_ != nullptr) {
-    sox_delete_effects_chain(sec_);
-  }
-}
-
-void SoxEffectsChain::run() {
-  sox_flow_effects(sec_, NULL, NULL);
-}
-
-void SoxEffectsChain::addInputTensor(
-    py::array* waveform,
-    int64_t sample_rate,
-    bool channels_first) {
-  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
-  interm_sig_ = in_sig_;
-  SoxEffect e(sox_create_effect(get_tensor_input_handler()));
-  auto priv = static_cast<TensorInputPriv*>(e->priv);
-  priv->index = 0;
-  priv->waveform = waveform;
-  priv->sample_rate = sample_rate;
-  priv->channels_first = channels_first;
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: input_tensor");
-  }
-}
-
-void SoxEffectsChain::addOutputBuffer(
-    std::vector<sox_sample_t>* output_buffer) {
-  SoxEffect e(sox_create_effect(get_tensor_output_handler()));
-  static_cast<TensorOutputPriv*>(e->priv)->buffer = output_buffer;
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    throw std::runtime_error(
-        "Internal Error: Failed to add effect: output_tensor");
-  }
-}
-
-void SoxEffectsChain::addInputFile(sox_format_t* sf) {
-  in_sig_ = sf->signal;
-  interm_sig_ = in_sig_;
-  SoxEffect e(sox_create_effect(sox_find_effect("input")));
-  char* opts[] = {(char*)sf};
-  sox_effect_options(e, 1, opts);
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Internal Error: Failed to add effect: input " << sf->filename;
-    throw std::runtime_error(stream.str());
-  }
-}
-
-void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
-  out_sig_ = sf->signal;
-  SoxEffect e(sox_create_effect(get_file_output_handler()));
-  static_cast<FileOutputPriv*>(e->priv)->sf = sf;
-  if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Internal Error: Failed to add effect: output " << sf->filename;
-    throw std::runtime_error(stream.str());
-  }
-}
-
-void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
-  const auto num_args = effect.size();
-  if (num_args == 0) {
-    throw std::runtime_error("Invalid argument: empty effect.");
-  }
-  const auto name = effect[0];
-  if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
-    std::ostringstream stream;
-    stream << "Unsupported effect: " << name;
-    throw std::runtime_error(stream.str());
-  }
-
-  auto returned_effect = sox_find_effect(name.c_str());
-  if (!returned_effect) {
-    std::ostringstream stream;
-    stream << "Unsupported effect: " << name;
-    throw std::runtime_error(stream.str());
-  }
-  SoxEffect e(sox_create_effect(returned_effect));
-  const auto num_options = num_args - 1;
-
-  std::vector<char*> opts;
-  for (size_t i = 1; i < num_args; ++i) {
-    opts.push_back((char*)effect[i].c_str());
-  }
-  if (sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) !=
-      SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Invalid effect option:";
-    for (const auto& v : effect) {
-      stream << " " << v;
-    }
-    throw std::runtime_error(stream.str());
-  }
-
-  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
-    std::ostringstream stream;
-    stream << "Internal Error: Failed to add effect: \"" << name;
-    for (size_t i = 1; i < num_args; ++i) {
-      stream << " " << effect[i];
-    }
-    stream << "\"";
-    throw std::runtime_error(stream.str());
-  }
-}
-
-int64_t SoxEffectsChain::getOutputNumChannels() {
-  return interm_sig_.channels;
-}
-
-int64_t SoxEffectsChain::getOutputSampleRate() {
-  return interm_sig_.rate;
-}
-
-} // namespace sox_effects_chain
-} // namespace paddleaudio
diff --git a/paddlespeech/audio/src/sox/effects_chain.h b/paddlespeech/audio/src/sox/effects_chain.h
deleted file mode 100644
index 87a046975..000000000
--- a/paddlespeech/audio/src/sox/effects_chain.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/effects_chain.h
-#pragma once
-
-#include <sox.h>
-#include "paddlespeech/audio/src/sox/utils.h"
-
-namespace paddleaudio {
-namespace sox_effects_chain {
-
-// Helper struct to safely close sox_effect_t* pointer returned by
-// sox_create_effect
-
-struct SoxEffect {
-  explicit SoxEffect(sox_effect_t* se) noexcept;
-  SoxEffect(const SoxEffect& other) = delete;
-  SoxEffect(const SoxEffect&& other) = delete;
-  auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
-  auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
-  ~SoxEffect();
-  operator sox_effect_t*() const;
-  auto operator->() noexcept -> sox_effect_t*;
-
- private:
-  sox_effect_t* se_;
-};
-
-// Helper struct to safely close sox_effects_chain_t with handy methods
-class SoxEffectsChain {
-  const sox_encodinginfo_t in_enc_;
-  const sox_encodinginfo_t out_enc_;
-
- protected:
-  sox_signalinfo_t in_sig_;
-  sox_signalinfo_t interm_sig_;
-  sox_signalinfo_t out_sig_;
-  sox_effects_chain_t* sec_;
-
- public:
-  explicit SoxEffectsChain(
-      sox_encodinginfo_t input_encoding,
-      sox_encodinginfo_t output_encoding);
-  SoxEffectsChain(const SoxEffectsChain& other) = delete;
-  SoxEffectsChain(const SoxEffectsChain&& other) = delete;
-  SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
-  SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
-  ~SoxEffectsChain();
-  void run();
-  void addInputTensor(
-      py::array* waveform,
-      int64_t sample_rate,
-      bool channels_first);
-  void addInputFile(sox_format_t* sf);
-  void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
-  void addOutputFile(sox_format_t* sf);
-  void addEffect(const std::vector<std::string> effect);
-  int64_t getOutputNumChannels();
-  int64_t getOutputSampleRate();
-};
-
-} // namespace sox_effects_chain
-} // namespace torchaudio
-
diff --git a/paddlespeech/audio/src/sox/io.cpp b/paddlespeech/audio/src/sox/io.cpp
deleted file mode 100644
index 5a75fc987..000000000
--- a/paddlespeech/audio/src/sox/io.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/io.cpp
-#include "paddlespeech/audio/src/sox/effects.h"
-#include "paddlespeech/audio/src/sox/effects_chain.h"
-#include "paddlespeech/audio/src/sox/io.h"
-#include "paddlespeech/audio/src/sox/types.h"
-#include "paddlespeech/audio/src/sox/utils.h"
-
-using namespace paddleaudio::sox_utils;
-
-namespace paddleaudio {
-namespace sox_io {
-
-tl::optional<MetaDataTuple> get_info_file(
-    const std::string& path, const tl::optional<std::string>& format) {
-    SoxFormat sf(sox_open_read(
-        path.c_str(),
-        /*signal=*/nullptr,
-        /*encoding=*/nullptr,
-        /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
-
-    if (static_cast<sox_format_t*>(sf) == nullptr ||
-        sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-        return {};
-    }
-
-    return std::forward_as_tuple(
-        static_cast<int64_t>(sf->signal.rate),
-        static_cast<int64_t>(sf->signal.length / sf->signal.channels),
-        static_cast<int64_t>(sf->signal.channels),
-        static_cast<int64_t>(sf->encoding.bits_per_sample),
-        get_encoding(sf->encoding.encoding));
-}
-
-std::vector<std::vector<std::string>> get_effects(
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames) {
-    const auto offset = frame_offset.value_or(0);
-    if (offset < 0) {
-        throw std::runtime_error(
-            "Invalid argument: frame_offset must be non-negative.");
-    }
-    const auto frames = num_frames.value_or(-1);
-    if (frames == 0 || frames < -1) {
-        throw std::runtime_error(
-            "Invalid argument: num_frames must be -1 or greater than 0.");
-    }
-
-    std::vector<std::vector<std::string>> effects;
-    if (frames != -1) {
-        std::ostringstream os_offset, os_frames;
-        os_offset << offset << "s";
-        os_frames << "+" << frames << "s";
-        effects.emplace_back(
-            std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
-    } else if (offset != 0) {
-        std::ostringstream os_offset;
-        os_offset << offset << "s";
-        effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
-    }
-    return effects;
-}
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
-    const std::string& path,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format) {
-    auto effects = get_effects(frame_offset, num_frames);
-    return paddleaudio::sox_effects::apply_effects_file(
-        path, effects, normalize, channels_first, format);
-}
-
-void save_audio_file(const std::string& path,
-                     py::array tensor,
-                     int64_t sample_rate,
-                     bool channels_first,
-                     tl::optional<double> compression,
-                     tl::optional<std::string> format,
-                     tl::optional<std::string> encoding,
-                     tl::optional<int64_t> bits_per_sample) {
-    validate_input_tensor(tensor);
-
-    const auto filetype = [&]() {
-        if (format.has_value()) return format.value();
-        return get_filetype(path);
-    }();
-
-    if (filetype == "amr-nb") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-        //TORCH_CHECK(num_channels == 1,
-        //            "amr-nb format only supports single channel audio.");
-    } else if (filetype == "htk") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-       // TORCH_CHECK(num_channels == 1,
-        //            "htk format only supports single channel audio.");
-    } else if (filetype == "gsm") {
-        const auto num_channels = tensor.shape(channels_first ? 0 : 1);
-        //TORCH_CHECK(num_channels == 1,
-        //            "gsm format only supports single channel audio.");
-        //TORCH_CHECK(sample_rate == 8000,
-        //            "gsm format only supports a sampling rate of 8kHz.");
-    }
-    const auto signal_info =
-        get_signalinfo(&tensor, sample_rate, filetype, channels_first);
-    const auto encoding_info = get_encodinginfo_for_save(
-        filetype, tensor.dtype(), compression, encoding, bits_per_sample);
-
-    SoxFormat sf(sox_open_write(path.c_str(),
-                                &signal_info,
-                                &encoding_info,
-                                /*filetype=*/filetype.c_str(),
-                                /*oob=*/nullptr,
-                                /*overwrite_permitted=*/nullptr));
-
-    if (static_cast<sox_format_t*>(sf) == nullptr) {
-        throw std::runtime_error(
-            "Error saving audio file: failed to open file " + path);
-    }
-
-    paddleaudio::sox_effects_chain::SoxEffectsChain chain(
-        /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
-        /*output_encoding=*/sf->encoding);
-    chain.addInputTensor(&tensor, sample_rate, channels_first);
-    chain.addOutputFile(sf);
-    chain.run();
-}
-
-}  // namespace sox_io
-}  // namespace paddleaudio
\ No newline at end of file
diff --git a/paddlespeech/audio/src/sox/io.h b/paddlespeech/audio/src/sox/io.h
deleted file mode 100644
index f8001d872..000000000
--- a/paddlespeech/audio/src/sox/io.h
+++ /dev/null
@@ -1,41 +0,0 @@
-
-// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
-// All rights reserved.
-
-#pragma once
-
-#include "paddlespeech/audio/src/optional/optional.hpp"
-#include "paddlespeech/audio/src/sox/utils.h"
-
-namespace paddleaudio {
-namespace sox_io {
-
-auto get_effects(const tl::optional<int64_t>& frame_offset,
-                 const tl::optional<int64_t>& num_frames)
-    -> std::vector<std::vector<std::string>>;
-
-using MetaDataTuple =
-    std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
-
-tl::optional<MetaDataTuple> get_info_file(
-    const std::string& path, const tl::optional<std::string>& format);
-
-tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
-    const std::string& path,
-    const tl::optional<int64_t>& frame_offset,
-    const tl::optional<int64_t>& num_frames,
-    tl::optional<bool> normalize,
-    tl::optional<bool> channels_first,
-    const tl::optional<std::string>& format);
-
-void save_audio_file(const std::string& path,
-                     py::array tensor,
-                     int64_t sample_rate,
-                     bool channels_first,
-                     tl::optional<double> compression,
-                     tl::optional<std::string> format,
-                     tl::optional<std::string> encoding,
-                     tl::optional<int64_t> bits_per_sample);
-
-}  // namespace sox_io
-}  // namespace paddleaudio
diff --git a/paddlespeech/audio/src/sox/types.cpp b/paddlespeech/audio/src/sox/types.cpp
deleted file mode 100644
index ab1808be1..000000000
--- a/paddlespeech/audio/src/sox/types.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
-
-#include "paddlespeech/audio/src/sox/types.h"
-#include <ostream>
-#include <sstream>
-
-namespace paddleaudio {
-namespace sox_utils {
-
-Format get_format_from_string(const std::string& format) {
-  if (format == "wav")
-    return Format::WAV;
-  if (format == "mp3")
-    return Format::MP3;
-  if (format == "flac")
-    return Format::FLAC;
-  if (format == "ogg" || format == "vorbis")
-    return Format::VORBIS;
-  if (format == "amr-nb")
-    return Format::AMR_NB;
-  if (format == "amr-wb")
-    return Format::AMR_WB;
-  if (format == "amb")
-    return Format::AMB;
-  if (format == "sph")
-    return Format::SPHERE;
-  if (format == "htk")
-    return Format::HTK;
-  if (format == "gsm")
-    return Format::GSM;
-  std::ostringstream stream;
-  stream << "Internal Error: unexpected format value: " << format;
-  throw std::runtime_error(stream.str());
-}
-
-std::string to_string(Encoding v) {
-  switch (v) {
-    case Encoding::UNKNOWN:
-      return "UNKNOWN";
-    case Encoding::PCM_SIGNED:
-      return "PCM_S";
-    case Encoding::PCM_UNSIGNED:
-      return "PCM_U";
-    case Encoding::PCM_FLOAT:
-      return "PCM_F";
-    case Encoding::FLAC:
-      return "FLAC";
-    case Encoding::ULAW:
-      return "ULAW";
-    case Encoding::ALAW:
-      return "ALAW";
-    case Encoding::MP3:
-      return "MP3";
-    case Encoding::VORBIS:
-      return "VORBIS";
-    case Encoding::AMR_WB:
-      return "AMR_WB";
-    case Encoding::AMR_NB:
-      return "AMR_NB";
-    case Encoding::OPUS:
-      return "OPUS";
-    default:
-      throw std::runtime_error("Internal Error: unexpected encoding.");
-  }
-}
-
-Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
-  if (!encoding.has_value())
-    return Encoding::NOT_PROVIDED;
-  std::string v = encoding.value();
-  if (v == "PCM_S")
-    return Encoding::PCM_SIGNED;
-  if (v == "PCM_U")
-    return Encoding::PCM_UNSIGNED;
-  if (v == "PCM_F")
-    return Encoding::PCM_FLOAT;
-  if (v == "ULAW")
-    return Encoding::ULAW;
-  if (v == "ALAW")
-    return Encoding::ALAW;
-  std::ostringstream stream;
-  stream << "Internal Error: unexpected encoding value: " << v;
-  throw std::runtime_error(stream.str());
-}
-
-BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
-  if (!bit_depth.has_value())
-    return BitDepth::NOT_PROVIDED;
-  int64_t v = bit_depth.value();
-  switch (v) {
-    case 8:
-      return BitDepth::B8;
-    case 16:
-      return BitDepth::B16;
-    case 24:
-      return BitDepth::B24;
-    case 32:
-      return BitDepth::B32;
-    case 64:
-      return BitDepth::B64;
-    default: {
-      std::ostringstream s;
-      s << "Internal Error: unexpected bit depth value: " << v;
-      throw std::runtime_error(s.str());
-    }
-  }
-}
-
-std::string get_encoding(sox_encoding_t encoding) {
-  switch (encoding) {
-    case SOX_ENCODING_UNKNOWN:
-      return "UNKNOWN";
-    case SOX_ENCODING_SIGN2:
-      return "PCM_S";
-    case SOX_ENCODING_UNSIGNED:
-      return "PCM_U";
-    case SOX_ENCODING_FLOAT:
-      return "PCM_F";
-    case SOX_ENCODING_FLAC:
-      return "FLAC";
-    case SOX_ENCODING_ULAW:
-      return "ULAW";
-    case SOX_ENCODING_ALAW:
-      return "ALAW";
-    case SOX_ENCODING_MP3:
-      return "MP3";
-    case SOX_ENCODING_VORBIS:
-      return "VORBIS";
-    case SOX_ENCODING_AMR_WB:
-      return "AMR_WB";
-    case SOX_ENCODING_AMR_NB:
-      return "AMR_NB";
-    case SOX_ENCODING_OPUS:
-      return "OPUS";
-    case SOX_ENCODING_GSM:
-      return "GSM";
-    default:
-      return "UNKNOWN";
-  }
-}
-
-} // namespace sox_utils
-} // namespace paddleaudio
diff --git a/paddlespeech/audio/src/sox/types.h b/paddlespeech/audio/src/sox/types.h
deleted file mode 100644
index 824c0f632..000000000
--- a/paddlespeech/audio/src/sox/types.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
-#pragma once
-
-#include <sox.h>
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-namespace paddleaudio {
-namespace sox_utils {
-
-enum class Format {
-  WAV,
-  MP3,
-  FLAC,
-  VORBIS,
-  AMR_NB,
-  AMR_WB,
-  AMB,
-  SPHERE,
-  GSM,
-  HTK,
-};
-
-Format get_format_from_string(const std::string& format);
-
-enum class Encoding {
-  NOT_PROVIDED,
-  UNKNOWN,
-  PCM_SIGNED,
-  PCM_UNSIGNED,
-  PCM_FLOAT,
-  FLAC,
-  ULAW,
-  ALAW,
-  MP3,
-  VORBIS,
-  AMR_WB,
-  AMR_NB,
-  OPUS,
-};
-
-std::string to_string(Encoding v);
-Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
-
-enum class BitDepth : unsigned {
-  NOT_PROVIDED = 0,
-  B8 = 8,
-  B16 = 16,
-  B24 = 24,
-  B32 = 32,
-  B64 = 64,
-};
-
-BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
-
-std::string get_encoding(sox_encoding_t encoding);
-
-} // namespace sox_utils
-} // namespace torchaudio
\ No newline at end of file
diff --git a/paddlespeech/audio/src/sox/utils.cpp b/paddlespeech/audio/src/sox/utils.cpp
deleted file mode 100644
index a44031bb4..000000000
--- a/paddlespeech/audio/src/sox/utils.cpp
+++ /dev/null
@@ -1,488 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.cpp
-
-#include <sox.h>
-#include "paddlespeech/audio/src/sox/types.h"
-#include "paddlespeech/audio/src/sox/utils.h"
-
-namespace paddleaudio {
-namespace sox_utils {
-
-void set_seed(const int64_t seed) {
-  sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
-}
-
-void set_verbosity(const int64_t verbosity) {
-  sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
-}
-
-void set_use_threads(const bool use_threads) {
-  sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
-}
-
-void set_buffer_size(const int64_t buffer_size) {
-  sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
-}
-
-int64_t get_buffer_size() {
-  return sox_get_globals()->bufsiz;
-}
-
-std::vector<std::vector<std::string>> list_effects() {
-  std::vector<std::vector<std::string>> effects;
-  for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
-    const sox_effect_handler_t* handler = (*fns)();
-    if (handler && handler->name) {
-      if (UNSUPPORTED_EFFECTS.find(handler->name) ==
-          UNSUPPORTED_EFFECTS.end()) {
-        effects.emplace_back(std::vector<std::string>{
-            handler->name,
-            handler->usage ? std::string(handler->usage) : std::string("")});
-      }
-    }
-  }
-  return effects;
-}
-
-std::vector<std::string> list_write_formats() {
-  std::vector<std::string> formats;
-  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
-    const sox_format_handler_t* handler = fns->fn();
-    for (const char* const* names = handler->names; *names; ++names) {
-      if (!strchr(*names, '/') && handler->write)
-        formats.emplace_back(*names);
-    }
-  }
-  return formats;
-}
-
-std::vector<std::string> list_read_formats() {
-  std::vector<std::string> formats;
-  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
-    const sox_format_handler_t* handler = fns->fn();
-    for (const char* const* names = handler->names; *names; ++names) {
-      if (!strchr(*names, '/') && handler->read)
-        formats.emplace_back(*names);
-    }
-  }
-  return formats;
-}
-
-SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
-SoxFormat::~SoxFormat() {
-  close();
-}
-
-sox_format_t* SoxFormat::operator->() const noexcept {
-  return fd_;
-}
-SoxFormat::operator sox_format_t*() const noexcept {
-  return fd_;
-}
-
-void SoxFormat::close() {
-  if (fd_ != nullptr) {
-    sox_close(fd_);
-    fd_ = nullptr;
-  }
-}
-
-void validate_input_file(const SoxFormat& sf, const std::string& path) {
-  if (static_cast<sox_format_t*>(sf) == nullptr) {
-    throw std::runtime_error(
-        "Error loading audio file: failed to open file " + path);
-  }
-  if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
-    throw std::runtime_error("Error loading audio file: unknown encoding.");
-  }
-}
-
-void validate_input_memfile(const SoxFormat &sf) {
-    return validate_input_file(sf, "<in memory buffer>");
-}
-
-void validate_input_tensor(const py::array tensor) {
-  if (tensor.ndim() != 2) {
-    throw std::runtime_error("Input tensor has to be 2D.");
-  }
-
-  char dtype = tensor.dtype().char_();
-  bool flag = (dtype == 'f') || (dtype == 'd') || (dtype == 'l') || (dtype == 'i');
-  if (flag == false) {
-      throw std::runtime_error(
-          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
-  }
-}
-
-py::dtype get_dtype(
-    const sox_encoding_t encoding,
-    const unsigned precision) {
-    switch (encoding) {
-      case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
-        return py::dtype('u1');
-      case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
-        switch (precision) {
-          case 16:
-            return py::dtype("i2");
-          case 24: // Cast 24-bit to 32-bit.
-          case 32:
-            return py::dtype('i');
-          default:
-            throw std::runtime_error(
-                "Only 16, 24, and 32 bits are supported for signed PCM.");
-        }
-      default:
-        // default to float32 for the other formats, including
-        // 32-bit flaoting-point WAV,
-        // MP3,
-        // FLAC,
-        // VORBIS etc...
-        return py::dtype("f");
-    }
-}
-
-py::array convert_to_tensor(
-    sox_sample_t* buffer,
-    const int32_t num_samples,
-    const int32_t num_channels,
-    const py::dtype dtype,
-    const bool normalize,
-    const bool channels_first) {
-  py::array t;
-  uint64_t dummy = 0;
-  SOX_SAMPLE_LOCALS;
-  if (normalize || dtype.char_() == 'f') {
-    t = py::array(dtype, {num_samples / num_channels, num_channels});
-    auto ptr = (float*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
-    }
-  } else if (dtype.char_() == 'i') {
-    //t = torch::from_blob(
-    //        buffer, {num_samples / num_channels, num_channels}, torch::kInt32)
-    //        .clone();
-    t = py::array(dtype, {num_samples / num_channels, num_channels});
-    auto ptr = (int*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = buffer[i];
-    }
-  } else if (dtype.char_() == 'h') { // int16
-    t = py::array(dtype, {num_samples / num_channels, num_channels});
-    auto ptr = (int16_t*)t.mutable_data(0, 0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
-    }
-  } else if (dtype.char_() == 'b') {
-    //t = torch::empty({num_samples / num_channels, num_channels}, torch::kUInt8);
-    auto ptr = (uint8_t*)t.mutable_data(0,0);
-    for (int32_t i = 0; i < num_samples; ++i) {
-      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
-    }
-  } else {
-    throw std::runtime_error("Unsupported dtype.");
-  }
-  return t;
-}
-
-const std::string get_filetype(const std::string path) {
-  std::string ext = path.substr(path.find_last_of(".") + 1);
-  std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
-  return ext;
-}
-
-namespace {
-
-std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
-    const std::string format,
-    py::dtype dtype,
-    const Encoding& encoding,
-    const BitDepth& bits_per_sample) {
-  switch (encoding) {
-    case Encoding::NOT_PROVIDED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-          switch (dtype.num()) {
-            case 11: // float32 numpy dtype num 
-              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
-            case 5: // int numpy dtype num
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-            case 3: // int16 numpy
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
-            case 1: // byte numpy
-              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-            default:
-              throw std::runtime_error("Internal Error: Unexpected dtype.");
-          }
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
-      }
-    case Encoding::PCM_SIGNED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-        case BitDepth::B8:
-          throw std::runtime_error(
-              format + " does not support 8-bit signed PCM encoding.");
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
-      }
-    case Encoding::PCM_UNSIGNED:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for unsigned PCM encoding.");
-      }
-    case Encoding::PCM_FLOAT:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B32:
-          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
-        case BitDepth::B64:
-          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
-        default:
-          throw std::runtime_error(
-              format +
-              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
-      }
-    case Encoding::ULAW:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for mu-law encoding.");
-      }
-    case Encoding::ALAW:
-      switch (bits_per_sample) {
-        case BitDepth::NOT_PROVIDED:
-        case BitDepth::B8:
-          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
-        default:
-          throw std::runtime_error(
-              format + " only supports 8-bit for a-law encoding.");
-      }
-    default:
-      throw std::runtime_error(
-          format + " does not support encoding: " + to_string(encoding));
-  }
-}
-
-std::tuple<sox_encoding_t, unsigned> get_save_encoding(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample) {
-  const Format fmt = get_format_from_string(format);
-  const Encoding enc = get_encoding_from_option(encoding);
-  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
-
-  switch (fmt) {
-    case Format::WAV:
-    case Format::AMB:
-      return get_save_encoding_for_wav(format, dtype, enc, bps);
-    case Format::MP3:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("mp3 does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "mp3 does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_MP3, 16);
-    case Format::HTK:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("htk does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "htk does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
-    case Format::VORBIS:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("vorbis does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "vorbis does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
-    case Format::AMR_NB:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("amr-nb does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "amr-nb does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
-    case Format::FLAC:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("flac does not support `encoding` option.");
-      switch (bps) {
-        case BitDepth::B32:
-        case BitDepth::B64:
-          throw std::runtime_error(
-              "flac does not support `bits_per_sample` larger than 24.");
-        default:
-          return std::make_tuple<>(
-              SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
-      }
-    case Format::SPHERE:
-      switch (enc) {
-        case Encoding::NOT_PROVIDED:
-        case Encoding::PCM_SIGNED:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
-            default:
-              return std::make_tuple<>(
-                  SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
-          }
-        case Encoding::PCM_UNSIGNED:
-          throw std::runtime_error(
-              "sph does not support unsigned integer PCM.");
-        case Encoding::PCM_FLOAT:
-          throw std::runtime_error("sph does not support floating point PCM.");
-        case Encoding::ULAW:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-            case BitDepth::B8:
-              return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
-            default:
-              throw std::runtime_error(
-                  "sph only supports 8-bit for mu-law encoding.");
-          }
-        case Encoding::ALAW:
-          switch (bps) {
-            case BitDepth::NOT_PROVIDED:
-            case BitDepth::B8:
-              return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
-            default:
-              return std::make_tuple<>(
-                  SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
-          }
-        default:
-          throw std::runtime_error(
-              "sph does not support encoding: " + encoding.value());
-      }
-    case Format::GSM:
-      if (enc != Encoding::NOT_PROVIDED)
-        throw std::runtime_error("gsm does not support `encoding` option.");
-      if (bps != BitDepth::NOT_PROVIDED)
-        throw std::runtime_error(
-            "gsm does not support `bits_per_sample` option.");
-      return std::make_tuple<>(SOX_ENCODING_GSM, 16);
-
-    default:
-      throw std::runtime_error("Unsupported format: " + format);
-  }
-}
-
-unsigned get_precision(const std::string filetype, py::dtype dtype) {
-  if (filetype == "mp3")
-    return SOX_UNSPEC;
-  if (filetype == "flac")
-    return 24;
-  if (filetype == "ogg" || filetype == "vorbis")
-    return SOX_UNSPEC;
-  if (filetype == "wav" || filetype == "amb") {
-    switch (dtype.num()) {
-      case 1: // byte in numpy dype num
-        return 8;
-      case 3: // short, in numpy dtype num
-        return 16;
-      case 5: // int, numpy dtype 
-        return 32;
-      case 11: // float, numpy dtype
-        return 32;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }
-  if (filetype == "sph")
-    return 32;
-  if (filetype == "amr-nb") {
-    return 16;
-  }
-  if (filetype == "gsm") {
-    return 16;
-  }
-  if (filetype == "htk") {
-    return 16;
-  }
-  throw std::runtime_error("Unsupported file type: " + filetype);
-}
-
-} // namespace
-
-sox_signalinfo_t get_signalinfo(
-    const py::array* waveform,
-    const int64_t sample_rate,
-    const std::string filetype,
-    const bool channels_first) {
-  return sox_signalinfo_t{
-      /*rate=*/static_cast<sox_rate_t>(sample_rate),
-      /*channels=*/
-      static_cast<unsigned>(waveform->shape(channels_first ? 0 : 1)),
-      /*precision=*/get_precision(filetype, waveform->dtype()),
-      /*length=*/static_cast<uint64_t>(waveform->size())};
-}
-
-sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
-  sox_encoding_t encoding = [&]() {
-    switch (dtype.num()) {
-      case 1: // byte
-        return SOX_ENCODING_UNSIGNED;
-      case 3: // short
-        return SOX_ENCODING_SIGN2;
-      case 5: // int32
-        return SOX_ENCODING_SIGN2;
-      case 11: // float
-        return SOX_ENCODING_FLOAT;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }();
-  unsigned bits_per_sample = [&]() {
-    switch (dtype.num()) {
-      case 1: // byte
-        return 8;
-      case 3: //short
-        return 16;
-      case 5: // int32
-        return 32;
-      case 11: // float
-        return 32;
-      default:
-        throw std::runtime_error("Unsupported dtype.");
-    }
-  }();
-  return sox_encodinginfo_t{
-      /*encoding=*/encoding,
-      /*bits_per_sample=*/bits_per_sample,
-      /*compression=*/HUGE_VAL,
-      /*reverse_bytes=*/sox_option_default,
-      /*reverse_nibbles=*/sox_option_default,
-      /*reverse_bits=*/sox_option_default,
-      /*opposite_endian=*/sox_false};
-}
-
-sox_encodinginfo_t get_encodinginfo_for_save(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<double> compression,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample) {
-  auto enc = get_save_encoding(format, dtype, encoding, bits_per_sample);
-  return sox_encodinginfo_t{
-      /*encoding=*/std::get<0>(enc),
-      /*bits_per_sample=*/std::get<1>(enc),
-      /*compression=*/compression.value_or(HUGE_VAL),
-      /*reverse_bytes=*/sox_option_default,
-      /*reverse_nibbles=*/sox_option_default,
-      /*reverse_bits=*/sox_option_default,
-      /*opposite_endian=*/sox_false};
-}
-
-} // namespace sox_utils
-} // namespace torchaudio
diff --git a/paddlespeech/audio/src/sox/utils.h b/paddlespeech/audio/src/sox/utils.h
deleted file mode 100644
index 5b015ece0..000000000
--- a/paddlespeech/audio/src/sox/utils.h
+++ /dev/null
@@ -1,120 +0,0 @@
-//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/utils.h
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <sox.h>
-
-#include "paddlespeech/audio/src/optional/optional.hpp"
-
-namespace py = pybind11;
-
-namespace paddleaudio {
-namespace sox_utils {
-
-////////////////////////////////////////////////////////////////////////////////
-// APIs for Python interaction
-////////////////////////////////////////////////////////////////////////////////
-
-/// Set sox global options
-void set_seed(const int64_t seed);
-
-void set_verbosity(const int64_t verbosity);
-
-void set_use_threads(const bool use_threads);
-
-void set_buffer_size(const int64_t buffer_size);
-
-int64_t get_buffer_size();
-
-std::vector<std::vector<std::string>> list_effects();
-
-std::vector<std::string> list_read_formats();
-
-std::vector<std::string> list_write_formats();
-
-////////////////////////////////////////////////////////////////////////////////
-// Utilities for sox_io / sox_effects implementations
-////////////////////////////////////////////////////////////////////////////////
-
-const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
-    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
-
-/// helper class to automatically close sox_format_t*
-struct SoxFormat {
-  explicit SoxFormat(sox_format_t* fd) noexcept;
-  SoxFormat(const SoxFormat& other) = delete;
-  SoxFormat(SoxFormat&& other) = delete;
-  SoxFormat& operator=(const SoxFormat& other) = delete;
-  SoxFormat& operator=(SoxFormat&& other) = delete;
-  ~SoxFormat();
-  sox_format_t* operator->() const noexcept;
-  operator sox_format_t*() const noexcept;
-
-  void close();
-
- private:
-  sox_format_t* fd_;
-};
-
-///
-/// Verify that input Tensor is 2D, CPU and either uin8, int16, int32 or float32
-void validate_input_tensor(const py::array);
-
-void validate_input_file(const SoxFormat& sf, const std::string& path);
-
-void validate_input_memfile(const SoxFormat &sf);
-///
-/// Get target dtype for the given encoding and precision.
-py::dtype get_dtype(
-    const sox_encoding_t encoding,
-    const unsigned precision);
-
-///
-/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
-/// NOTE: This function might modify the values in the input buffer to
-/// reduce the number of memory copy.
-/// @param buffer Pointer to buffer that contains audio data.
-/// @param num_samples The number of samples to read.
-/// @param num_channels The number of channels. Used to reshape the resulting
-/// Tensor.
-/// @param dtype Target dtype. Determines the output dtype and value range in
-/// conjunction with normalization.
-/// @param noramlize Perform normalization. Only effective when dtype is not
-/// kFloat32. When effective, the output tensor is kFloat32 type and value range
-/// is [-1.0, 1.0]
-/// @param channels_first When True, output Tensor has shape of [num_channels,
-/// num_frames].
-py::array convert_to_tensor(
-    sox_sample_t* buffer,
-    const int32_t num_samples,
-    const int32_t num_channels,
-    const py::dtype dtype,
-    const bool normalize,
-    const bool channels_first);
-
-/// Extract extension from file path
-const std::string get_filetype(const std::string path);
-
-/// Get sox_signalinfo_t for passing a py::array object.
-sox_signalinfo_t get_signalinfo(
-    const py::array* waveform,
-    const int64_t sample_rate,
-    const std::string filetype,
-    const bool channels_first);
-
-/// Get sox_encodinginfo_t for Tensor I/O
-sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
-
-/// Get sox_encodinginfo_t for saving to file/file object
-sox_encodinginfo_t get_encodinginfo_for_save(
-    const std::string& format,
-    const py::dtype dtype,
-    const tl::optional<double> compression,
-    const tl::optional<std::string> encoding,
-    const tl::optional<int64_t> bits_per_sample);
-
-
-} // namespace sox_utils
-} // namespace paddleaudio