[audio]replace kaldi fbank with kaldi-native-fbank in paddleaudio (#2799)

* replace kaldi_fbank with kaldi-native-fbank in paddleaudio
* fix mac
3 years ago · d7a6268bcc
parent 964211a81b
commit d7a6268bcc
30 changed files with 5234 additions and 359 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -31,7 +31,7 @@ repos:
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$

 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.0.1
@ -53,13 +53,13 @@ repos:
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
    -   id: cpplint
        name: cpplint
        description: Static code analysis of C/C++ files
        language: python
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
        entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@ -41,24 +41,18 @@ option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
 # cmake
 set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external")

-if (NOT MSVC)
-    find_package(GFortranLibs REQUIRED)
-    include(FortranCInterface)
-    include(FindGFortranLibs REQUIRED)
-endif()
-
 # fc_patch dir
 set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
 set(THIRD_PARTY_PATH ${fc_patch})

-include(openblas)
-
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 include(cmake/pybind.cmake)
 include_directories(${PYTHON_INCLUDE_DIR})

+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/)
+
 # packages
 find_package(Python3 COMPONENTS Interpreter Development)

--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@ -1,19 +1,3 @@

 add_subdirectory(third_party)
 add_subdirectory(src)
-
-if (APPLE) 
-  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
-          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
-  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgfortran.so.5
-          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
-  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libquadmath.so.0
-          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
-  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.so.1
-          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-endif()
--- a/audio/paddleaudio/kaldi/init.py
+++ b/audio/paddleaudio/kaldi/init.py
@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .kaldi import fbank
-from .kaldi import pitch
+#from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
@ -16,7 +16,6 @@ from paddleaudio._internal import module_utils

 __all__ = [
    'fbank',
-    'pitch',
 ]


@ -33,8 +32,6 @@ def fbank(
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
-        allow_downsample: bool=False,
-        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
@ -62,8 +59,6 @@ def fbank(
    frame_opts.round_to_power_of_two = round_to_power_of_two
    frame_opts.blackman_coeff = blackman_coeff
    frame_opts.snip_edges = snip_edges
-    frame_opts.allow_downsample = allow_downsample
-    frame_opts.allow_upsample = allow_upsample
    frame_opts.max_feature_vectors = max_feature_vectors

    mel_opts.num_bins = num_bins
@ -85,48 +80,48 @@ def fbank(
    return feat


-@module_utils.requires_kaldi()
-def pitch(wav,
-          samp_freq: int=16000,
-          frame_shift_ms: float=10.0,
-          frame_length_ms: float=25.0,
-          preemph_coeff: float=0.0,
-          min_f0: int=50,
-          max_f0: int=400,
-          soft_min_f0: float=10.0,
-          penalty_factor: float=0.1,
-          lowpass_cutoff: int=1000,
-          resample_freq: int=4000,
-          delta_pitch: float=0.005,
-          nccf_ballast: int=7000,
-          lowpass_filter_width: int=1,
-          upsample_filter_width: int=5,
-          max_frames_latency: int=0,
-          frames_per_chunk: int=0,
-          simulate_first_pass_online: bool=False,
-          recompute_frame: int=500,
-          nccf_ballast_online: bool=False,
-          snip_edges: bool=True):
-    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
-    pitch_opts.samp_freq = samp_freq
-    pitch_opts.frame_shift_ms = frame_shift_ms
-    pitch_opts.frame_length_ms = frame_length_ms
-    pitch_opts.preemph_coeff = preemph_coeff
-    pitch_opts.min_f0 = min_f0
-    pitch_opts.max_f0 = max_f0
-    pitch_opts.soft_min_f0 = soft_min_f0
-    pitch_opts.penalty_factor = penalty_factor
-    pitch_opts.lowpass_cutoff = lowpass_cutoff
-    pitch_opts.resample_freq = resample_freq
-    pitch_opts.delta_pitch = delta_pitch
-    pitch_opts.nccf_ballast = nccf_ballast
-    pitch_opts.lowpass_filter_width = lowpass_filter_width
-    pitch_opts.upsample_filter_width = upsample_filter_width
-    pitch_opts.max_frames_latency = max_frames_latency
-    pitch_opts.frames_per_chunk = frames_per_chunk
-    pitch_opts.simulate_first_pass_online = simulate_first_pass_online
-    pitch_opts.recompute_frame = recompute_frame
-    pitch_opts.nccf_ballast_online = nccf_ballast_online
-    pitch_opts.snip_edges = snip_edges
-    pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
-    return pitch
+#@module_utils.requires_kaldi()
+#def pitch(wav,
+#samp_freq: int=16000,
+#frame_shift_ms: float=10.0,
+#frame_length_ms: float=25.0,
+#preemph_coeff: float=0.0,
+#min_f0: int=50,
+#max_f0: int=400,
+#soft_min_f0: float=10.0,
+#penalty_factor: float=0.1,
+#lowpass_cutoff: int=1000,
+#resample_freq: int=4000,
+#delta_pitch: float=0.005,
+#nccf_ballast: int=7000,
+#lowpass_filter_width: int=1,
+#upsample_filter_width: int=5,
+#max_frames_latency: int=0,
+#frames_per_chunk: int=0,
+#simulate_first_pass_online: bool=False,
+#recompute_frame: int=500,
+#nccf_ballast_online: bool=False,
+#snip_edges: bool=True):
+#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+#pitch_opts.samp_freq = samp_freq
+#pitch_opts.frame_shift_ms = frame_shift_ms
+#pitch_opts.frame_length_ms = frame_length_ms
+#pitch_opts.preemph_coeff = preemph_coeff
+#pitch_opts.min_f0 = min_f0
+#pitch_opts.max_f0 = max_f0
+#pitch_opts.soft_min_f0 = soft_min_f0
+#pitch_opts.penalty_factor = penalty_factor
+#pitch_opts.lowpass_cutoff = lowpass_cutoff
+#pitch_opts.resample_freq = resample_freq
+#pitch_opts.delta_pitch = delta_pitch
+#pitch_opts.nccf_ballast = nccf_ballast
+#pitch_opts.lowpass_filter_width = lowpass_filter_width
+#pitch_opts.upsample_filter_width = upsample_filter_width
+#pitch_opts.max_frames_latency = max_frames_latency
+#pitch_opts.frames_per_chunk = frames_per_chunk
+#pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+#pitch_opts.recompute_frame = recompute_frame
+#pitch_opts.nccf_ballast_online = nccf_ballast_online
+#pitch_opts.snip_edges = snip_edges
+#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+#return pitch
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@ -52,7 +52,7 @@ if(BUILD_KALDI)
  list(
    APPEND
    LIBPADDLEAUDIO_LINK_LIBRARIES
-    libkaldi
+    kaldi-native-fbank-core
  )
  list(
    APPEND
@ -92,14 +92,6 @@ define_library(
  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
 )

-if (APPLE)
-  add_custom_command(TARGET libpaddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/libgcc_s.1.1.dylib" libpaddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
-  set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
-endif()
-
 if (APPLE)
  set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
 else()
@ -207,11 +199,3 @@ define_extension(
 #     )
 # endif()
 endif()
-
-if (APPLE)
-  add_custom_command(TARGET _paddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/lib/libgcc_s.1.1.dylib" _paddleaudio.so)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
-  set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
-endif()
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@ -16,7 +16,7 @@

 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h"
-#include "feat/feature-window.h"
+#include "kaldi-native-fbank/csrc/feature-window.h"

 namespace paddleaudio {
 namespace kaldi {
@ -28,18 +28,18 @@ class StreamingFeatureTpl {
  public:
    typedef typename F::Options Options;
    StreamingFeatureTpl(const Options& opts);
-    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
-    void Reset() { remained_wav_.Resize(0); }
+    bool ComputeFeature(const std::vector<float>& wav,
+                        std::vector<float>* feats);
+    void Reset() { remained_wav_.resize(0); }

    int Dim() { return computer_.Dim(); }

  private:
-    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+    bool Compute(const std::vector<float>& waves,
+                 std::vector<float>* feats);
    Options opts_;
-    ::kaldi::FeatureWindowFunction window_function_;
-    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    knf::FeatureWindowFunction window_function_;
+    std::vector<float> remained_wav_;
    F computer_;
 };

--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "base/kaldi-common.h"

 namespace paddleaudio {
 namespace kaldi {
@ -25,24 +24,29 @@ StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)

 template <class F>
 bool StreamingFeatureTpl<F>::ComputeFeature(
-    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    const std::vector<float>& wav,
+    std::vector<float>* feats) {
    // append remained waves
-    ::kaldi::int32 wav_len = wav.Dim();
+    int wav_len = wav.size();
    if (wav_len == 0) return false;
-    ::kaldi::int32 left_len = remained_wav_.Dim();
-    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
+    int left_len = remained_wav_.size();
+    std::vector<float> waves(left_len + wav_len);
+    std::memcpy(waves.data(),
+                remained_wav_.data(),
+                left_len * sizeof(float));
+    std::memcpy(waves.data() + left_len,
+                wav.data(),
+                wav_len * sizeof(float));

    // cache remained waves
-    ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
-    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
-    ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
+    knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    int num_frames = knf::NumFrames(waves.size(), frame_opts);
+    int frame_shift = frame_opts.WindowShift();
+    int left_samples = waves.size() - frame_shift * num_frames;
+    remained_wav_.resize(left_samples);
+    std::memcpy(remained_wav_.data(),
+                waves.data() + frame_shift * num_frames,
+                left_samples * sizeof(float));

    // compute speech feature
    Compute(waves, feats);
@ -51,40 +55,39 @@ bool StreamingFeatureTpl<F>::ComputeFeature(

 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(
-    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
-    ::kaldi::BaseFloat vtln_warp = 1.0;
-    const ::kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    ::kaldi::int32 num_samples = waves.Dim();
-    ::kaldi::int32 frame_length = frame_opts.WindowSize();
-    ::kaldi::int32 sample_rate = frame_opts.samp_freq;
+bool StreamingFeatureTpl<F>::Compute(const std::vector<float>& waves,
+                                     std::vector<float>* feats) {
+    const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
+    int num_samples = waves.size();
+    int frame_length = frame_opts.WindowSize();
+    int sample_rate = frame_opts.samp_freq;
    if (num_samples < frame_length) {
-        return false;
+        return true;
    }

-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
+    int num_frames = knf::NumFrames(num_samples, frame_opts);
+    feats->resize(num_frames * Dim());

-    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    std::vector<float> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
-        ::kaldi::BaseFloat raw_log_energy = 0.0;
-        ::kaldi::ExtractWindow(0,
-                               waves,
-                               frame,
-                               frame_opts,
-                               window_function_,
-                               &window,
-                               need_raw_log_energy ? &raw_log_energy : NULL);
+    for (int frame = 0; frame < num_frames; frame++) {
+        std::fill(window.begin(), window.end(), 0);
+        float raw_log_energy = 0.0;
+        float vtln_warp = 1.0;
+        knf::ExtractWindow(0,
+                           waves,
+                           frame,
+                           frame_opts,
+                           window_function_,
+                           &window,
+                           need_raw_log_energy ? &raw_log_energy : NULL);

-        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
-                                                         ::kaldi::kUndefined);
-        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
-        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
-            feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
+        std::vector<float> this_feature(computer_.Dim());
+        computer_.Compute(
+            raw_log_energy, vtln_warp, &window, this_feature.data());
+        std::memcpy(feats->data() + frame * Dim(),
+                    this_feature.data(),
+                    sizeof(float) * Dim());
    }
    return true;
 }
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@ -13,16 +13,16 @@
 // limitations under the License.

 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"

 namespace paddleaudio {
 namespace kaldi {

 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts) {
-    ::kaldi::FbankOptions opts;
+    knf::FbankOptions opts;
    opts.frame_opts = frame_opts;
    opts.mel_opts = mel_opts;
    opts.use_energy = fbank_opts.use_energy;
@ -41,8 +41,8 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
 }

 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav) {
    InitFbank(frame_opts, mel_opts, fbank_opts);
@ -55,21 +55,21 @@ void ResetFbank() {
    paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
 }

-py::array_t<float> ComputeKaldiPitch(
-  const ::kaldi::PitchExtractionOptions& opts,
-  const py::array_t<float>& wav) {
-    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+//py::array_t<float> ComputeKaldiPitch(
+  //const ::kaldi::PitchExtractionOptions& opts,
+  //const py::array_t<float>& wav) {
+    //py::buffer_info info = wav.request();
+    //::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
   
-    ::kaldi::Matrix<::kaldi::BaseFloat> features;
-    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
-    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
-    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
-        std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
-                    sizeof(float)*features.NumCols());
-    }
-   return result;
-}
+    //::kaldi::Matrix<::kaldi::BaseFloat> features;
+    //::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+    //auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
+    //for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
+        //std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
+                    //sizeof(float)*features.NumCols());
+    //}
+   //return result;
+//}

 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@ -19,7 +19,7 @@
 #include <string>

 #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"

 namespace py = pybind11;

@ -42,13 +42,13 @@ struct FbankOptions{
 };

 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts);

 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::FrameExtractionOptions frame_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav);

@ -56,9 +56,9 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);

 void ResetFbank();

-py::array_t<float> ComputeKaldiPitch(
-    const ::kaldi::PitchExtractionOptions& opts,
-    const py::array_t<float>& wav);
+//py::array_t<float> ComputeKaldiPitch(
+    //const ::kaldi::PitchExtractionOptions& opts,
+    //const py::array_t<float>& wav);

 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@ -22,7 +22,7 @@ KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    return &instance;
 }

-bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+bool KaldiFeatureWrapper::InitFbank(knf::FbankOptions opts) {
    fbank_.reset(new Fbank(opts));
    return true;
 }
@ -30,21 +30,18 @@ bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
 py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
    const py::array_t<float> wav) {
    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+    std::vector<float> input_wav((float*)info.ptr, (float*)info.ptr + info.size);

-    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    std::vector<float> feats;
    bool flag = fbank_->ComputeFeature(input_wav, &feats);
-    if (flag == false || feats.Dim() == 0) return py::array_t<float>();
-    auto result = py::array_t<float>(feats.Dim());
+    if (flag == false || feats.size() == 0) return py::array_t<float>();
+    auto result = py::array_t<float>(feats.size());
    py::buffer_info xs = result.request();
-    std::cout << std::endl;
    float* res_ptr = (float*)xs.ptr;
-    for (int idx = 0; idx < feats.Dim(); ++idx) {
-        *res_ptr = feats(idx);
-        res_ptr++;
-    }
-
-    return result.reshape({feats.Dim() / Dim(), Dim()});
+    std::memcpy(res_ptr, feats.data(), sizeof(float)*feats.size());
+    std::vector<int> shape{static_cast<int>(feats.size() / Dim()), 
+                           static_cast<int>(Dim())};
+    return result.reshape(shape);
 }

 }  // namesapce kaldi
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@ -14,20 +14,18 @@

 #pragma once

-#include "base/kaldi-common.h"
-#include "feat/feature-fbank.h"
-
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #include "paddleaudio/src/pybind/kaldi/feature_common.h"

 namespace paddleaudio {
 namespace kaldi {

-typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+typedef StreamingFeatureTpl<knf::FbankComputer> Fbank;

 class KaldiFeatureWrapper {
  public:
    static KaldiFeatureWrapper* GetInstance();
-    bool InitFbank(::kaldi::FbankOptions opts);
+    bool InitFbank(knf::FbankOptions opts);
    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
    int Dim() { return fbank_->Dim(); }
    void ResetFbank() { fbank_->Reset(); }
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@ -2,7 +2,7 @@

 #ifdef INCLUDE_KALDI
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #endif

 #ifdef INCLUDE_SOX
@ -89,53 +89,51 @@ PYBIND11_MODULE(_paddleaudio, m) {

 #ifdef INCLUDE_KALDI
    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
-    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
-        .def(py::init<>())
-        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
-        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
-        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
-        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
-        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
-        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
-        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
-        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
-        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
-        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
-        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
-        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
-        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
-        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
-        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
-        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
-        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
-        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
-        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
-    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
-    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
+    //py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
+        //.def(py::init<>())
+        //.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+        //.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+        //.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+        //.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+        //.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+        //.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+        //.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+        //.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+        //.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+        //.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+        //.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+        //.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+        //.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
+        //.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
+        //.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
+        //.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
+        //.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
+        //.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
+        //.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
+        //.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
+    //m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
+    py::class_<knf::FrameExtractionOptions>(m, "FrameExtractionOptions")
        .def(py::init<>())            
-        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)            
-        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
-        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)            
-        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)            
-        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)            
-        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
-        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)           
-        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)          
-        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
-        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
-        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
-        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
-    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
+        .def_readwrite("samp_freq", &knf::FrameExtractionOptions::samp_freq)
+        .def_readwrite("frame_shift_ms", &knf::FrameExtractionOptions::frame_shift_ms)            
+        .def_readwrite("frame_length_ms", &knf::FrameExtractionOptions::frame_length_ms)
+        .def_readwrite("dither", &knf::FrameExtractionOptions::dither)            
+        .def_readwrite("preemph_coeff", &knf::FrameExtractionOptions::preemph_coeff)            
+        .def_readwrite("remove_dc_offset", &knf::FrameExtractionOptions::remove_dc_offset)            
+        .def_readwrite("window_type", &knf::FrameExtractionOptions::window_type)
+        .def_readwrite("round_to_power_of_two", &knf::FrameExtractionOptions::round_to_power_of_two)           
+        .def_readwrite("blackman_coeff", &knf::FrameExtractionOptions::blackman_coeff)          
+        .def_readwrite("snip_edges", &knf::FrameExtractionOptions::snip_edges)
+        .def_readwrite("max_feature_vectors", &knf::FrameExtractionOptions::max_feature_vectors);
+    py::class_<knf::MelBanksOptions>(m, "MelBanksOptions")
        .def(py::init<>())
-        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
-        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
-        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
-        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
-        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
-        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
-        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
+        .def_readwrite("num_bins", &knf::MelBanksOptions::num_bins)
+        .def_readwrite("low_freq", &knf::MelBanksOptions::low_freq)
+        .def_readwrite("high_freq", &knf::MelBanksOptions::high_freq)
+        .def_readwrite("vtln_low", &knf::MelBanksOptions::vtln_low)
+        .def_readwrite("vtln_high", &knf::MelBanksOptions::vtln_high)
+        .def_readwrite("debug_mel", &knf::MelBanksOptions::debug_mel)
+        .def_readwrite("htk_mode", &knf::MelBanksOptions::htk_mode);

    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
        .def(py::init<>())
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@ -11,5 +11,6 @@ endif()
 # kaldi
 ################################################################################
 if (BUILD_KALDI)
-  add_subdirectory(kaldi)
-endif()
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+  add_subdirectory(kaldi-native-fbank/csrc)
+endif()
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
@ -0,0 +1,22 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../)  # two levels up from this csrc directory, so includes written as "kaldi-native-fbank/csrc/..." resolve
+add_library(kaldi-native-fbank-core  # no STATIC/SHARED keyword, so the library type follows BUILD_SHARED_LIBS
+  feature-fbank.cc
+  feature-functions.cc
+  feature-window.cc
+  fftsg.c  # the only C source in this list; the others are C++
+  log.cc
+  mel-computations.cc
+  rfft.cc
+)
+# We are using std::call_once() in log.h, which requires us to link with -pthread.
+if(NOT WIN32)
+  target_link_libraries(kaldi-native-fbank-core -pthread)  # keyword-less signature with a raw flag; NOTE(review): Threads::Threads would be the imported-target equivalent
+endif()
+
+if(KNF_HAVE_EXECINFO_H)  # NOTE(review): this variable is not set anywhere in the visible sources; confirm which file defines it
+  target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
+endif()
+
+if(KNF_HAVE_CXXABI_H)  # NOTE(review): this variable is not set anywhere in the visible sources; confirm which file defines it
+  target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
+endif()
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
@ -0,0 +1,117 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
+//
+#include "kaldi-native-fbank/csrc/feature-fbank.h"
+
+#include <cmath>
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+namespace knf {
+
+// Replaces each of the first n elements of in_out with its square root.
+static void Sqrt(float *in_out, int32_t n) {
+  float *const last = in_out + n;
+  for (float *cur = in_out; cur != last; ++cur) {
+    *cur = std::sqrt(*cur);
+  }
+}
+
+// Writes the text produced by opts.ToString() to os and returns os.
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
+  return os << opts.ToString();
+}
+
+// Builds a computer for `opts`: constructs rfft_ with the padded window size,
+// precomputes the log of the energy floor and caches the mel filterbank for
+// VTLN warp factor 1.0.
+FbankComputer::FbankComputer(const FbankOptions &opts)
+    : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) {
+  // Always give log_energy_floor_ a defined value. Compute() only reads it
+  // when energy_floor > 0, but leaving the member indeterminate would make
+  // any copy of this object read an uninitialized float.
+  log_energy_floor_ = 0.0f;
+  if (opts.energy_floor > 0.0f) {
+    log_energy_floor_ = std::log(opts.energy_floor);
+  }
+
+  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
+  // [note: this call caches it.]
+  GetMelBanks(1.0f);
+}
+
+// Deletes every MelBanks object cached in mel_banks_.
+FbankComputer::~FbankComputer() {
+  for (auto &entry : mel_banks_) {
+    delete entry.second;
+  }
+}
+
+// Returns the mel filterbank for `vtln_warp`, creating it and storing it in
+// mel_banks_ the first time a given warp factor is requested. The returned
+// pointer stays owned by this object.
+const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
+  auto cached = mel_banks_.find(vtln_warp);
+  if (cached != mel_banks_.end()) {
+    return cached->second;
+  }
+
+  MelBanks *created = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
+  mel_banks_[vtln_warp] = created;
+  return created;
+}
+
+// Computes one frame of filterbank features from *signal_frame and writes
+// them to `feature`. *signal_frame is used as scratch space: it is
+// overwritten by the FFT and then by the power spectrum. See the declaration
+// in feature-fbank.h for the full parameter contract.
+void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
+                            std::vector<float> *signal_frame, float *feature) {
+  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
+
+  KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
+
+  // Compute energy after window function (not the raw one).
+  // This replaces the caller-supplied signal_raw_log_energy.
+  if (opts_.use_energy && !opts_.raw_energy) {
+    signal_raw_log_energy = std::log(
+        std::max<float>(InnerProduct(signal_frame->data(), signal_frame->data(),
+                                     signal_frame->size()),
+                        std::numeric_limits<float>::epsilon()));
+  }
+  rfft_.Compute(signal_frame->data());  // signal_frame is modified in-place
+  ComputePowerSpectrum(signal_frame);
+
+  // Use magnitude instead of power if requested.
+  // Only the first size/2 + 1 elements hold spectrum values at this point.
+  if (!opts_.use_power) {
+    Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1);
+  }
+
+  // When the energy is stored first (use_energy && !htk_compat) the mel
+  // values start at feature[1]; otherwise they start at feature[0].
+  int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+
+  // Its length is opts_.mel_opts.num_bins
+  float *mel_energies = feature + mel_offset;
+
+  // Sum with mel filter banks over the power spectrum
+  mel_banks.Compute(signal_frame->data(), mel_energies);
+
+  if (opts_.use_log_fbank) {
+    // Avoid log of zero (which should be prevented anyway by dithering).
+    for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) {
+      auto t = std::max(mel_energies[i], std::numeric_limits<float>::epsilon());
+      mel_energies[i] = std::log(t);
+    }
+  }
+
+  // Copy energy as first value (or the last, if htk_compat == true).
+  if (opts_.use_energy) {
+    // Clamp the log-energy from below, but only when a positive floor is set.
+    if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
+      signal_raw_log_energy = log_energy_floor_;
+    }
+    int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
+    feature[energy_index] = signal_raw_log_energy;
+  }
+}
+
+}  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
@ -0,0 +1,132 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-fbank.h
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
+
+#include <map>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+#include "kaldi-native-fbank/csrc/mel-computations.h"
+#include "kaldi-native-fbank/csrc/rfft.h"
+
+namespace knf {
+
+// Options controlling filterbank feature computation: frame extraction, mel
+// filterbank layout, and how (or whether) an energy term is added.
+struct FbankOptions {
+  FrameExtractionOptions frame_opts;
+  MelBanksOptions mel_opts;
+  // append an extra dimension with energy to the filter banks
+  bool use_energy = false;
+  float energy_floor = 0.0f;  // active iff use_energy==true
+
+  // If true, compute log_energy before preemphasis and windowing
+  // If false, compute log_energy after preemphasis and windowing
+  bool raw_energy = true;  // active iff use_energy==true
+
+  // If true, put energy last (if using energy)
+  // If false, put energy first
+  bool htk_compat = false;  // active iff use_energy==true
+
+  // if true (default), produce log-filterbank, else linear
+  bool use_log_fbank = true;
+
+  // if true (default), use power in filterbank
+  // analysis, else magnitude.
+  bool use_power = true;
+
+  // Sets mel_opts.num_bins to 23, replacing whatever MelBanksOptions
+  // initialized it to.
+  FbankOptions() { mel_opts.num_bins = 23; }
+
+  // Returns a multi-line, human-readable dump of every option, one
+  // "name: value" pair per line, with frame_opts and mel_opts printed first.
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "frame_opts: \n";
+    os << frame_opts << "\n";
+    os << "\n";
+
+    os << "mel_opts: \n";
+    os << mel_opts << "\n";
+
+    os << "use_energy: " << use_energy << "\n";
+    os << "energy_floor: " << energy_floor << "\n";
+    os << "raw_energy: " << raw_energy << "\n";
+    os << "htk_compat: " << htk_compat << "\n";
+    os << "use_log_fbank: " << use_log_fbank << "\n";
+    os << "use_power: " << use_power << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
+
+// Computes filterbank features one frame at a time according to a fixed
+// FbankOptions.
+//
+// NOTE(review): mel_banks_ holds raw MelBanks pointers that feature-fbank.cc
+// allocates in GetMelBanks() and deletes in the destructor. The implicitly
+// generated copy operations would duplicate those pointers -- confirm that
+// instances are never copied.
+class FbankComputer {
+ public:
+  using Options = FbankOptions;
+
+  explicit FbankComputer(const FbankOptions &opts);
+  ~FbankComputer();
+
+  // Number of values written per frame: one per mel bin, plus one for the
+  // energy when use_energy is set.
+  int32_t Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
+  }
+
+  // if true, compute log_energy_pre_window but after dithering and dc removal
+  bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
+
+  const FrameExtractionOptions &GetFrameOptions() const {
+    return opts_.frame_opts;
+  }
+
+  const FbankOptions &GetOptions() const { return opts_; }
+
+  /**
+     Function that computes one frame of features from
+     one frame of signal.
+
+     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
+         prior to windowing and pre-emphasis, or
+         log(numeric_limits<float>::min()), whichever is greater.  Must be
+         ignored by this function if this class returns false from
+         this->NeedRawLogEnergy().
+     @param [in] vtln_warp  The VTLN warping factor that the user wants
+         to be applied when computing features for this utterance.  Will
+         normally be 1.0, meaning no warping is to be done.  The value will
+         be ignored for feature types that don't support VTLN, such as
+         spectrogram features.
+     @param [in] signal_frame  One frame of the signal,
+       as extracted using the function ExtractWindow() using the options
+       returned by this->GetFrameOptions().  The function will use the
+       vector as a workspace, which is why it's a non-const pointer.
+     @param [out] feature  Pointer to a vector of size this->Dim(), to which
+         the computed feature will be written. It should be pre-allocated.
+  */
+  void Compute(float signal_raw_log_energy, float vtln_warp,
+               std::vector<float> *signal_frame, float *feature);
+
+ private:
+  // Returns the filterbank for the given warp factor.
+  const MelBanks *GetMelBanks(float vtln_warp);
+
+  FbankOptions opts_;
+  float log_energy_floor_;
+  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
+  Rfft rfft_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
@ -0,0 +1,49 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.cc
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+#include <cstdint>
+#include <vector>
+
+namespace knf {
+
+// Converts the packed half-spectrum stored in *complex_fft into a power
+// spectrum, in place.
+//
+// Input layout: [real0, realN/2, real1, im1, real2, im2, ...].
+// On return, elements [0, size/2] hold the squared magnitudes of bins 0
+// through size/2; elements beyond that are not written.
+void ComputePowerSpectrum(std::vector<float> *complex_fft) {
+  float *data = complex_fft->data();
+  const int32_t size = complex_fft->size();
+  const int32_t num_pairs = size / 2;
+
+  // Bins 0 and N/2 are purely real and share the first pair of slots, so
+  // both are read before the loop overwrites data[1].
+  const float dc_power = data[0] * data[0];
+  const float nyquist_power = data[1] * data[1];
+
+  for (int32_t k = 1; k < num_pairs; ++k) {
+    const float re = data[2 * k];
+    const float im = data[2 * k + 1];
+    data[k] = re * re + im * im;
+  }
+
+  data[0] = dc_power;
+  // The last bin will actually never be used, and anyway if the signal has
+  // been bandlimited sensibly this should be zero.
+  data[num_pairs] = nyquist_power;
+}
+
+}  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
@ -0,0 +1,38 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+
+#include <vector>
+namespace knf {
+
+// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
+// functions in csrc/rfft.h), and converts it into
+// a power spectrum.  If the complex FFT is a vector of size n (representing
+// half of the complex FFT of a real signal of size n, as described there),
+// this function computes in the first (n/2) + 1 elements of it, the
+// energies of the fft bins from zero to the Nyquist frequency.  Contents of the
+// remaining (n/2) - 1 elements are undefined at output.
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
@ -0,0 +1,236 @@
+// kaldi-native-fbank/csrc/feature-window.cc
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.cc
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+#include <cmath>
+#include <vector>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+namespace knf {
+
+// Writes the text produced by opts.ToString() to os and returns os.
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
+  return os << opts.ToString();
+}
+
+// Precomputes the opts.WindowSize() window coefficients for the window type
+// named by opts.window_type. An unrecognized window type is reported through
+// KNF_LOG(FATAL).
+//
+// NOTE(review): `a` divides by (frame_length - 1) while the check only
+// requires frame_length > 0, so a one-sample window divides by zero --
+// confirm callers never configure that.
+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
+    : window_(opts.WindowSize()) {
+  int32_t frame_length = opts.WindowSize();
+  KNF_CHECK_GT(frame_length, 0);
+
+  float *window_data = window_.data();
+
+  // Angular step such that a * (frame_length - 1) == 2*pi.
+  double a = M_2PI / (frame_length - 1);
+  for (int32_t i = 0; i < frame_length; i++) {
+    double i_fl = static_cast<double>(i);
+    if (opts.window_type == "hanning") {
+      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
+    } else if (opts.window_type == "sine") {
+      // When comparing this with the definition on Wikipedia, please
+      // note that 0.5 * a = M_PI/(frame_length-1)
+      window_data[i] = sin(0.5 * a * i_fl);
+    } else if (opts.window_type == "hamming") {
+      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
+    } else if (opts.window_type ==
+               "povey") {  // like hamming but goes to zero at edges.
+      window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85);
+    } else if (opts.window_type == "rectangular") {
+      window_data[i] = 1.0;
+    } else if (opts.window_type == "blackman") {
+      window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) +
+                       (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
+    } else {
+      KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
+    }
+  }
+}
+
+// Multiplies wave[k] by the k-th stored window coefficient, in place, for
+// every coefficient in window_. `wave` must hold at least window_.size()
+// samples.
+void FeatureWindowFunction::Apply(float *wave) const {
+  float *sample = wave;
+  for (const float coeff : window_) {
+    *sample *= coeff;
+    ++sample;
+  }
+}
+
+// Returns the index of the first sample of frame `frame`. With snip_edges
+// the frames start at multiples of the window shift; otherwise the window is
+// centred on the midpoint of the frame's shift interval, so the result can
+// be negative.
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
+  const int64_t shift = opts.WindowShift();
+  if (opts.snip_edges) {
+    return frame * shift;
+  }
+
+  const int64_t midpoint = shift * frame + shift / 2;
+  return midpoint - opts.WindowSize() / 2;
+}
+
+// Returns how many frames can be extracted from `num_samples` samples under
+// `opts`; see the declaration in feature-window.h for the meaning of `flush`.
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush /*= true*/) {
+  const int64_t shift = opts.WindowShift();
+  const int64_t length = opts.WindowSize();
+
+  if (opts.snip_edges) {
+    // HTK-like behaviour (the default): every frame has to fit completely
+    // into the waveform and the first frame begins at sample zero.
+    if (num_samples < length) {
+      return 0;
+    }
+    // 'num_samples - length' is the room available for shifting the frame
+    // and 'shift' is the step, so the quotient (integer division rounds
+    // down) is the number of shifts that fit.
+    return (1 + ((num_samples - length) / shift));
+  }
+
+  // Without snip_edges the count is (file-length / frame-shift) rounded to
+  // the nearest integer, which makes it an obvious and predictable function
+  // of the frame shift and signal length. Integer division rounds toward
+  // zero, so half the frame shift is added before dividing.
+  int32_t count = (num_samples + (shift / 2)) / shift;
+  if (flush) {
+    return count;
+  }
+
+  // When more data may still arrive (flush == false) frames that extend past
+  // the end of the signal cannot be output yet, so drop them one at a time.
+  // note: 'end' always means the last plus one, i.e. one past the last.
+  int64_t last_frame_end = FirstSampleOfFrame(count - 1, opts) + length;
+  while (count > 0 && last_frame_end > num_samples) {
+    count--;
+    last_frame_end -= shift;
+  }
+  return count;
+}
+
+// Extracts frame `f` of the waveform into *window and runs ProcessWindow()
+// on it. *window is resized to opts.PaddedWindowSize() if needed; its first
+// opts.WindowSize() elements receive the frame and the remaining padding is
+// set to zero. See the declaration in feature-window.h for the parameters.
+void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  KNF_CHECK(sample_offset >= 0 && wave.size() != 0);
+
+  int32_t frame_length = opts.WindowSize();
+  int32_t frame_length_padded = opts.PaddedWindowSize();
+
+  int64_t num_samples = sample_offset + wave.size();
+  int64_t start_sample = FirstSampleOfFrame(f, opts);
+  int64_t end_sample = start_sample + frame_length;
+
+  if (opts.snip_edges) {
+    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
+  } else {
+    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
+  }
+
+  if (window->size() != frame_length_padded) {
+    window->resize(frame_length_padded);
+  }
+
+  // wave_start and wave_end are start and end indexes into 'wave', for the
+  // piece of wave that we're trying to extract.
+  int32_t wave_start = int32_t(start_sample - sample_offset);
+  int32_t wave_end = wave_start + frame_length;
+
+  if (wave_start >= 0 && wave_end <= wave.size()) {
+    // the normal case-- no edge effects to consider.
+    std::copy(wave.begin() + wave_start,
+              wave.begin() + wave_start + frame_length, window->data());
+  } else {
+    // Deal with any end effects by reflection, if needed.  This code will only
+    // be reached for about two frames per utterance, so we don't concern
+    // ourselves excessively with efficiency.
+    int32_t wave_dim = wave.size();
+    for (int32_t s = 0; s < frame_length; ++s) {
+      int32_t s_in_wave = s + wave_start;
+      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
+        // reflect around the beginning or end of the wave.
+        // e.g. -1 -> 0, -2 -> 1.
+        // dim -> dim - 1, dim + 1 -> dim - 2.
+        // the code supports repeated reflections, although this
+        // would only be needed in pathological cases.
+        if (s_in_wave < 0)
+          s_in_wave = -s_in_wave - 1;
+        else
+          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
+      }
+      (*window)[s] = wave[s_in_wave];
+    }
+  }
+
+  // Zero the padding beyond the frame. The resize above is skipped when the
+  // caller reuses a correctly sized vector, so without this the tail would
+  // keep whatever the previous frame's in-place FFT left there, while the
+  // FFT input is expected to be zero-padded.
+  std::fill(window->begin() + frame_length, window->end(), 0.0f);
+
+  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
+}
+
+// Subtracts the arithmetic mean of d[0..n) from each of those elements, in
+// place.
+static void RemoveDcOffset(float *d, int32_t n) {
+  float *const last = d + n;
+
+  float total = 0;
+  for (const float *p = d; p != last; ++p) {
+    total += *p;
+  }
+
+  const float mean = total / n;
+  for (float *p = d; p != last; ++p) {
+    *p -= mean;
+  }
+}
+
+// Returns the sum of a[i] * b[i] for i in [0, n), accumulated in single
+// precision in index order.
+float InnerProduct(const float *a, const float *b, int32_t n) {
+  const float *const a_last = a + n;
+  float acc = 0;
+  for (; a != a_last; ++a, ++b) {
+    acc += (*a) * (*b);
+  }
+  return acc;
+}
+
+// Applies first-order pre-emphasis in place: every sample has
+// preemph_coeff times its predecessor subtracted, and d[0] uses itself as
+// its predecessor. Does nothing when preemph_coeff is zero; the coefficient
+// is checked to lie in [0, 1].
+static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
+  if (preemph_coeff == 0.0) {
+    return;
+  }
+
+  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
+
+  // Walk backwards so each sample still sees its unmodified predecessor.
+  int32_t idx = n - 1;
+  while (idx > 0) {
+    d[idx] -= preemph_coeff * d[idx - 1];
+    --idx;
+  }
+  d[0] -= preemph_coeff * d[0];
+}
+
+// Runs the per-frame processing on the first opts.WindowSize() samples of
+// `window`, in this order: optional DC-offset removal, optional log-energy
+// measurement, optional pre-emphasis, then multiplication by the window
+// function. Dithering is not implemented, so opts.dither is checked to be 0.
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  const int32_t num_samples = opts.WindowSize();
+
+  // TODO(fangjun): Remove dither
+  KNF_CHECK_EQ(opts.dither, 0);
+
+  if (opts.remove_dc_offset) {
+    RemoveDcOffset(window, num_samples);
+  }
+
+  if (log_energy_pre_window != nullptr) {
+    // Floor the energy so that the logarithm is always finite.
+    const float raw_energy = InnerProduct(window, window, num_samples);
+    const float floored_energy = std::max<float>(
+        raw_energy, std::numeric_limits<float>::epsilon());
+    *log_energy_pre_window = std::log(floored_energy);
+  }
+
+  if (opts.preemph_coeff != 0.0) {
+    Preemphasize(window, num_samples, opts.preemph_coeff);
+  }
+
+  window_function.Apply(window);
+}
+
+}  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
@ -0,0 +1,178 @@
+// kaldi-native-fbank/csrc/feature-window.h
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.h
+
+#ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+#define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+namespace knf {
+
+// Returns the smallest power of two that is >= n; n is checked to be
+// positive.
+inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
+  // copied from kaldi/src/base/kaldi-math.cc
+  KNF_CHECK_GT(n, 0);
+  --n;
+  // Smear the highest set bit into every lower position (shifts 1, 2, 4, 8
+  // and 16), after which adding one yields the next power of two.
+  for (int32_t shift = 1; shift <= 16; shift <<= 1) {
+    n |= n >> shift;
+  }
+  return n + 1;
+}
+
+// Options describing how the waveform is cut into frames and how each frame
+// is preprocessed before the FFT.
+struct FrameExtractionOptions {
+  float samp_freq = 16000;
+  float frame_shift_ms = 10.0f;   // in milliseconds.
+  float frame_length_ms = 25.0f;  // in milliseconds.
+  // NOTE(review): ProcessWindow() in feature-window.cc checks dither == 0,
+  // so this default of 1.0 has to be overridden before extracting frames --
+  // confirm that callers do so.
+  float dither = 1.0f;            // Amount of dithering, 0.0 means no dither.
+  float preemph_coeff = 0.97f;    // Preemphasis coefficient.
+  bool remove_dc_offset = true;   // Subtract mean of wave before FFT.
+  std::string window_type = "povey";  // e.g. Hamming window
+  // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
+  // "povey" is a window I made to be similar to Hamming but to go to zero at
+  // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
+  // Hamming window makes sense as a windowing function.
+  bool round_to_power_of_two = true;
+  float blackman_coeff = 0.42f;
+  bool snip_edges = true;
+  // bool allow_downsample = false;
+  // bool allow_upsample = false;
+
+  // Used for streaming feature extraction. It indicates the number
+  // of feature frames to keep in the recycling vector. -1 means to
+  // keep all feature frames.
+  int32_t max_feature_vectors = -1;
+
+  // Frame shift in samples (the millisecond value scaled by samp_freq and
+  // truncated toward zero).
+  int32_t WindowShift() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
+  }
+  // Frame length in samples (the millisecond value scaled by samp_freq and
+  // truncated toward zero).
+  int32_t WindowSize() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
+  }
+  // WindowSize() rounded up to a power of two when round_to_power_of_two is
+  // set, otherwise WindowSize() itself.
+  int32_t PaddedWindowSize() const {
+    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
+                                  : WindowSize());
+  }
+  // Returns one "name: value" line per option.
+  std::string ToString() const {
+    std::ostringstream os;
+#define KNF_PRINT(x) os << #x << ": " << x << "\n"
+    KNF_PRINT(samp_freq);
+    KNF_PRINT(frame_shift_ms);
+    KNF_PRINT(frame_length_ms);
+    KNF_PRINT(dither);
+    KNF_PRINT(preemph_coeff);
+    KNF_PRINT(remove_dc_offset);
+    KNF_PRINT(window_type);
+    KNF_PRINT(round_to_power_of_two);
+    KNF_PRINT(blackman_coeff);
+    KNF_PRINT(snip_edges);
+    // KNF_PRINT(allow_downsample);
+    // KNF_PRINT(allow_upsample);
+    KNF_PRINT(max_feature_vectors);
+#undef KNF_PRINT
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
+
+// Holds the precomputed window coefficients for a given set of frame
+// extraction options and applies them to a frame.
+class FeatureWindowFunction {
+ public:
+  // Creates an object with an empty coefficient vector.
+  FeatureWindowFunction() = default;
+  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
+  /**
+   * @param wave Pointer to a 1-D array of shape [window_size].
+   *             It is modified in-place: wave[i] = wave[i] * window_[i].
+   */
+  void Apply(float *wave) const;
+
+ private:
+  std::vector<float> window_;  // of size opts.WindowSize()
+};
+
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
+
+/**
+   This function returns the number of frames that we can extract from a wave
+   file with the given number of samples in it (assumed to have the same
+   sampling rate as specified in 'opts').
+
+      @param [in] num_samples  The number of samples in the wave file.
+      @param [in] opts     The frame-extraction options class
+
+      @param [in] flush   True if we are asserting that this number of samples
+   is 'all there is', false if we are expecting more data to possibly come in.
+   This only makes a difference to the answer
+   if opts.snip_edges == false.  For offline feature extraction you always want
+   flush == true.  In an online-decoding context, once you know (or decide) that
+   no more data is coming in, you'd call it with flush == true at the end to
+   flush out any remaining data.
+*/
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush = true);
+
+/*
+  ExtractWindow() extracts a windowed frame of waveform (possibly with a
+  power-of-two, padded size, depending on the config), including all the
+  processing done by ProcessWindow().
+
+  @param [in] sample_offset  If 'wave' is not the entire waveform, but
+                   part of it to the left has been discarded, then the
+                   number of samples prior to 'wave' that we have
+                   already discarded.  Set this to zero if you are
+                   processing the entire waveform in one piece, or
+                   if you get 'no matching function' compilation
+                   errors when updating the code.
+  @param [in] wave  The waveform
+  @param [in] f     The frame index to be extracted, with
+                    0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
+  @param [in] opts  The options class to be used
+  @param [in] window_function  The windowing function, as derived from the
+                    options class.
+  @param [out] window  The windowed, possibly-padded waveform to be
+                     extracted.  Will be resized as needed.
+  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
+                   the signal prior to pre-emphasis and multiplying by
+                   the windowing function will be written to here.
+*/
+void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window = nullptr);
+
+/**
+  This function does all the windowing steps after actually
+  extracting the windowed signal: depending on the
+  configuration, it does dithering, dc offset removal,
+  preemphasis, and multiplication by the windowing function.
+   @param [in] opts  The options class to be used
+   @param [in] window_function  The windowing function-- should have
+                    been initialized using 'opts'.
+   @param [in,out] window  A vector of size opts.WindowSize().  Note:
+      it will typically be a sub-vector of a larger vector of size
+      opts.PaddedWindowSize(), with the remaining samples zero,
+      as the FFT code is more efficient if it operates on data with
+      power-of-two size.
+   @param [out]   log_energy_pre_window If non-NULL, then after dithering and
+      DC offset removal, this function will write to this pointer the log of
+      the total energy (i.e. sum-squared) of the frame.
+ */
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window = nullptr);
+
+// Compute the inner product of two vectors
+float InnerProduct(const float *a, const float *b, int32_t n);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc
@ -0,0 +1,143 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Stack trace related stuff is from kaldi.
+ * Refer to
+ * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
+ */
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+#ifdef KNF_HAVE_EXECINFO_H
+#include <execinfo.h> // To get stack trace in error messages.
+#ifdef KNF_HAVE_CXXABI_H
+#include <cxxabi.h> // For name demangling.
+// Useful to decode the stack trace, but only used if we have execinfo.h
+#endif // KNF_HAVE_CXXABI_H
+#endif // KNF_HAVE_EXECINFO_H
+
+#include <stdlib.h>
+
+#include <ctime>
+#include <iomanip>
+#include <string>
+
+namespace knf {
+
+std::string GetDateTimeStr() {
+  std::ostringstream os;
+  std::time_t t = std::time(nullptr);
+  std::tm tm = *std::localtime(&t);
+  os << std::put_time(&tm, "%F %T"); // yyyy-mm-dd hh:mm:ss
+  return os.str();
+}
+
+// Finds the mangled symbol inside one backtrace line.
+//
+// *begin is set to the index of the first '_' that directly follows a ' ' or
+// '(' (std::string::npos if there is none). When such a position exists,
+// *end is set to the index of the first ' ' or '+' at or after it. Returns
+// true only if both positions were found; *end is left untouched when no
+// start position exists.
+static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin,
+                              std::size_t *end) {
+  *begin = std::string::npos;
+  for (std::size_t pos = 1; pos < trace_name.size(); ++pos) {
+    const char prev = trace_name[pos - 1];
+    if (trace_name[pos] == '_' && (prev == ' ' || prev == '(')) {
+      *begin = pos;
+      break;
+    }
+  }
+
+  if (*begin == std::string::npos) {
+    return false;
+  }
+
+  *end = trace_name.find_first_of(" +", *begin);
+  return *end != std::string::npos;
+}
+
+#ifdef KNF_HAVE_EXECINFO_H
+// Returns `trace_name` with its mangled C++ symbol replaced by the demangled
+// form. The input is returned unchanged when cxxabi.h is unavailable or when
+// no symbol can be located; if demangling itself fails, the mangled symbol
+// is kept.
+static std::string Demangle(const std::string &trace_name) {
+#ifndef KNF_HAVE_CXXABI_H
+  return trace_name;
+#else  // KNF_HAVE_CXXABI_H
+  // Try demangle the symbol. We are trying to support the following formats
+  // produced by different platforms:
+  //
+  // Linux:
+  //   ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
+  //
+  // Mac:
+  //   0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
+  //
+  // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and
+  // demangle it into a readable name like kaldi::UnitTestError.
+  std::size_t begin, end;
+  if (!LocateSymbolRange(trace_name, &begin, &end)) {
+    return trace_name;
+  }
+  std::string symbol = trace_name.substr(begin, end - begin);
+  int status;
+  char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status);
+  if (status == 0 && demangled_name != nullptr) {
+    symbol = demangled_name;
+    // The buffer returned by __cxa_demangle is released with free() here.
+    free(demangled_name);
+  }
+  // Splice the (possibly demangled) symbol back between the untouched
+  // prefix and suffix of the trace line.
+  return trace_name.substr(0, begin) + symbol +
+         trace_name.substr(end, std::string::npos);
+#endif // KNF_HAVE_CXXABI_H
+}
+#endif // KNF_HAVE_EXECINFO_H
+
+// Returns a human-readable stack trace of the calling thread, one frame per
+// line with symbols demangled where possible. Returns an empty string when
+// execinfo.h support is not compiled in or when the symbols cannot be
+// obtained.
+std::string GetStackTrace() {
+  std::string ans;
+#ifdef KNF_HAVE_EXECINFO_H
+  constexpr const std::size_t kMaxTraceSize = 50;
+  constexpr const std::size_t kMaxTracePrint = 50; // Must be even.
+  // Buffer for the trace.
+  void *trace[kMaxTraceSize];
+  // Get the trace.
+  std::size_t size = backtrace(trace, kMaxTraceSize);
+  // Get the trace symbols.
+  char **trace_symbol = backtrace_symbols(trace, size);
+  if (trace_symbol == nullptr)
+    return ans;
+
+  // Compose a human-readable backtrace string.
+  ans += "[ Stack-Trace: ]\n";
+  if (size <= kMaxTracePrint) {
+    for (std::size_t i = 0; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+  } else { // Print out the first and last kMaxTracePrint / 2 frames.
+    // NOTE(review): kMaxTracePrint equals kMaxTraceSize, so this branch is
+    // presumably never taken (backtrace() is documented to return at most
+    // the requested number of entries) -- confirm.
+    for (std::size_t i = 0; i < kMaxTracePrint / 2; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    ans += ".\n.\n.\n";
+    for (std::size_t i = size - kMaxTracePrint / 2; i < size; ++i) {
+      ans += Demangle(trace_symbol[i]) + "\n";
+    }
+    if (size == kMaxTraceSize)
+      ans += ".\n.\n.\n"; // Stack was too long, probably a bug.
+  }
+
+  // We must free the array of pointers allocated by backtrace_symbols(),
+  // but not the strings themselves.
+  free(trace_symbol);
+#endif // KNF_HAVE_EXECINFO_H
+  return ans;
+}
+
+} // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
@ -0,0 +1,347 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The content in this file is copied/modified from
+// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
+#define KALDI_NATIVE_FBANK_CSRC_LOG_H_
+
+#include <stdio.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>  // NOLINT
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace knf {
+
+// True when this code is compiled with NDEBUG defined. The KNF_DCHECK* and
+// KNF_DLOG macros test this constant and evaluate to `(void)0` when it is
+// true.
+#if defined(NDEBUG)
+constexpr bool kDisableDebug = true;
+#else
+constexpr bool kDisableDebug = false;
+#endif
+
+// Severity of a log message, ordered from least (kTrace) to most (kFatal)
+// severe. Logger prints a message only when its level is not below the level
+// returned by GetCurrentLogLevel().
+enum class LogLevel {
+  kTrace = 0,
+  kDebug = 1,
+  kInfo = 2,
+  kWarning = 3,
+  kError = 4,
+  kFatal = 5,  // print message and abort the program
+};
+
+// They are used in KNF_LOG(xxx), so their names
+// do not follow the google c++ code style
+//
+// You can use them in the following way:
+//
+//  KNF_LOG(TRACE) << "some message";
+//  KNF_LOG(DEBUG) << "some message";
+//
+// NOTE(review): with MSVC the names are macros rather than constants,
+// presumably to cope with macros such as ERROR defined by Windows headers --
+// confirm.
+#ifndef _MSC_VER
+constexpr LogLevel TRACE = LogLevel::kTrace;
+constexpr LogLevel DEBUG = LogLevel::kDebug;
+constexpr LogLevel INFO = LogLevel::kInfo;
+constexpr LogLevel WARNING = LogLevel::kWarning;
+constexpr LogLevel ERROR = LogLevel::kError;
+constexpr LogLevel FATAL = LogLevel::kFatal;
+#else
+#define TRACE LogLevel::kTrace
+#define DEBUG LogLevel::kDebug
+#define INFO LogLevel::kInfo
+#define WARNING LogLevel::kWarning
+#define ERROR LogLevel::kError
+#define FATAL LogLevel::kFatal
+#endif
+
+std::string GetStackTrace();
+
+/* Return the current log level.
+
+   The level is read once, on the first call, from the environment variable
+   KNF_LOG_LEVEL and cached afterwards. It defaults to INFO when the variable
+   is unset; an unrecognized value is reported on stderr and also leaves the
+   level at INFO.
+
+   If the current log level is TRACE, then all logged messages are printed out.
+
+   If the current log level is DEBUG, log messages with "TRACE" level are not
+   shown and all other levels are printed out.
+
+   Similarly, if the current log level is INFO, log message with "TRACE" and
+   "DEBUG" are not shown and all other levels are printed out.
+
+   If it is FATAL, then only FATAL messages are shown.
+ */
+inline LogLevel GetCurrentLogLevel() {
+  static LogLevel log_level = INFO;
+  static std::once_flag init_flag;
+  std::call_once(init_flag, []() {
+    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
+    if (env_log_level == nullptr) return;
+
+    // Accepted spellings of KNF_LOG_LEVEL and the level each one selects.
+    static const struct {
+      const char *name;
+      LogLevel level;
+    } kKnownLevels[] = {
+        {"TRACE", TRACE},     {"DEBUG", DEBUG}, {"INFO", INFO},
+        {"WARNING", WARNING}, {"ERROR", ERROR}, {"FATAL", FATAL},
+    };
+
+    const std::string s = env_log_level;
+    for (const auto &known : kKnownLevels) {
+      if (s == known.name) {
+        log_level = known.level;
+        return;
+      }
+    }
+
+    // Terminate the diagnostic with a newline so that whatever is written to
+    // stderr next does not end up on the same line.
+    fprintf(stderr,
+            "Unknown KNF_LOG_LEVEL: %s"
+            "\nSupported values are: "
+            "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL\n",
+            s.c_str());
+  });
+  return log_level;
+}
+
+// Returns true if the environment variable KNF_ABORT is set (to any value).
+// The environment is consulted only on the first call; later calls return
+// the cached answer.
+inline bool EnableAbort() {
+  static const bool enable_abort = (std::getenv("KNF_ABORT") != nullptr);
+  return enable_abort;
+}
+
+// Writes one log message to stderr.
+//
+// A Logger is normally created as a temporary by KNF_LOG / KNF_CHECK*: the
+// constructor prints the "[X] file:line:function " prefix, every operator<<
+// appends to the message, and the destructor finishes the line. Nothing is
+// printed when the message level is below GetCurrentLogLevel().
+//
+// Destroying a FATAL logger additionally prints a stack trace and then
+// either aborts (if EnableAbort() is true, or always when __ANDROID_API__ is
+// defined) or throws std::runtime_error.
+class Logger {
+ public:
+  // @param filename   Source file issuing the message (e.g. __FILE__).
+  // @param func_name  Name of the enclosing function (e.g. KNF_FUNC).
+  // @param line_num   Source line issuing the message (e.g. __LINE__).
+  // @param level      Severity of this message.
+  Logger(const char *filename, const char *func_name, uint32_t line_num,
+         LogLevel level)
+      : filename_(filename),
+        func_name_(func_name),
+        line_num_(line_num),
+        level_(level) {
+    cur_level_ = GetCurrentLogLevel();
+    if (cur_level_ <= level_) {
+      // One-letter tag identifying the severity of the message.
+      const char *tag = "";
+      switch (level_) {
+        case TRACE:
+          tag = "[T] ";
+          break;
+        case DEBUG:
+          tag = "[D] ";
+          break;
+        case INFO:
+          tag = "[I] ";
+          break;
+        case WARNING:
+          tag = "[W] ";
+          break;
+        case ERROR:
+          tag = "[E] ";
+          break;
+        case FATAL:
+          tag = "[F] ";
+          break;
+      }
+      fprintf(stderr, "%s%s:%u:%s ", tag, filename, line_num, func_name);
+    }
+  }
+
+  // noexcept(false): a FATAL message is reported by throwing from here.
+  ~Logger() noexcept(false) {
+    static constexpr const char *kErrMsg = R"(
+    Some bad things happened. Please read the above error messages and stack
+    trace. If you are using Python, the following command may be helpful:
+
+      gdb --args python /path/to/your/code.py
+
+    (You can use `gdb` to debug the code. Please consider compiling
+    a debug version of KNF.).
+
+    If you are unable to fix it, please open an issue at:
+
+      https://github.com/csukuangfj/kaldi-native-fbank/issues/new
+    )";
+    // Terminate the line only if a message was actually printed, so that
+    // suppressed messages leave no blank lines behind.
+    if (cur_level_ <= level_) fprintf(stderr, "\n");
+    if (level_ == FATAL) {
+      std::string stack_trace = GetStackTrace();
+      if (!stack_trace.empty()) {
+        fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
+      }
+
+      fflush(nullptr);
+
+#ifndef __ANDROID_API__
+      if (EnableAbort()) {
+        // NOTE: abort() will terminate the program immediately without
+        // printing the Python stack backtrace.
+        abort();
+      }
+
+      throw std::runtime_error(kErrMsg);
+#else
+      abort();
+#endif
+    }
+  }
+
+  const Logger &operator<<(bool b) const {
+    if (cur_level_ <= level_) {
+      fprintf(stderr, b ? "true" : "false");
+    }
+    return *this;
+  }
+
+  const Logger &operator<<(int8_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
+    return *this;
+  }
+
+  const Logger &operator<<(const char *s) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%s", s);
+    return *this;
+  }
+
+  const Logger &operator<<(int32_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
+    return *this;
+  }
+
+  const Logger &operator<<(uint32_t i) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%u", i);
+    return *this;
+  }
+
+  const Logger &operator<<(uint64_t i) const {
+    if (cur_level_ <= level_)
+      fprintf(stderr, "%llu", (long long unsigned int)i);  // NOLINT
+    return *this;
+  }
+
+  const Logger &operator<<(int64_t i) const {
+    if (cur_level_ <= level_)
+      fprintf(stderr, "%lli", (long long int)i);  // NOLINT
+    return *this;
+  }
+
+  const Logger &operator<<(float f) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%f", f);
+    return *this;
+  }
+
+  const Logger &operator<<(double d) const {
+    if (cur_level_ <= level_) fprintf(stderr, "%f", d);
+    return *this;
+  }
+
+  // Fallback for every other type; requires T to overload operator<< for
+  // std::ostream. The value is formatted only if the message is printed.
+  template <typename T>
+  const Logger &operator<<(const T &t) const {
+    if (cur_level_ <= level_) {
+      std::ostringstream os;
+      os << t;
+      fprintf(stderr, "%s", os.str().c_str());
+    }
+    return *this;
+  }
+
+  // specialization to fix compile error: `stringstream << nullptr` is ambiguous
+  const Logger &operator<<(const std::nullptr_t &null) const {
+    if (cur_level_ <= level_) *this << "(null)";
+    return *this;
+  }
+
+ private:
+  const char *filename_;
+  const char *func_name_;
+  uint32_t line_num_;
+  LogLevel level_;      // severity of this message
+  LogLevel cur_level_;  // threshold returned by GetCurrentLogLevel()
+};
+
+// Helper that turns a Logger expression into `void`. The checking macros
+// write their logging branch as `Voidifier() & Logger(...) << ...`: `&`
+// binds less tightly than `<<`, so the whole message is streamed first and
+// the result is then discarded as void, matching the `(void)0` used for the
+// other branch of the conditional operator.
+class Voidifier {
+ public:
+  void operator&(const Logger &)const {}
+};
+
+}  // namespace knf
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \
+    defined(__PRETTY_FUNCTION__)
+// for clang and GCC
+#define KNF_FUNC __PRETTY_FUNCTION__
+#else
+// for other compilers
+#define KNF_FUNC __func__
+#endif
+
+// Compile-time assertion with an empty message.
+#define KNF_STATIC_ASSERT(x) static_assert(x, "")
+
+// Runtime assertion. When `x` is false, a FATAL message starting with
+// "Check failed: <text of x>" is logged; anything streamed after the macro is
+// appended to that message. When `x` is true, nothing is evaluated beyond `x`.
+#define KNF_CHECK(x)                                                  \
+  (x) ? (void)0                                                       \
+      : ::knf::Voidifier() &                                          \
+            ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
+                << "Check failed: " << #x << " "
+
+// WARNING: x and y may be evaluated multiple times, but this happens only
+// when the check fails. Since the program aborts if it fails, we don't think
+// the extra evaluation of x and y matters.
+//
+// CAUTION: we recommend the following use case:
+//
+//      auto x = Foo();
+//      auto y = Bar();
+//      KNF_CHECK_EQ(x, y) << "Some message";
+//
+//  And please avoid
+//
+//      KNF_CHECK_EQ(Foo(), Bar());
+//
+//  if `Foo()` or `Bar()` causes some side effects, e.g., changing some
+//  local static variables or global variables.
+#define _KNF_CHECK_OP(x, y, op)                                              \
+  ((x)op(y)) ? (void)0                                                       \
+             : ::knf::Voidifier() &                                          \
+                   ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
+                       << "Check failed: " << #x << " " << #op << " " << #y  \
+                       << " (" << (x) << " vs. " << (y) << ") "
+
+#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
+#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
+#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
+#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
+#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
+#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)
+
+// Creates a temporary Logger for a message at level `x`, where `x` is one of
+// TRACE, DEBUG, INFO, WARNING, ERROR or FATAL, e.g.
+//
+//      KNF_LOG(INFO) << "some message";
+#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)
+
+// ------------------------------------------------------------
+//       For debug check
+// ------------------------------------------------------------
+// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank,
+// the following macros are in fact empty and do nothing.
+
+#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)
+
+#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)
+
+#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)
+
+#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)
+
+#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)
+
+#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)
+
+#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)
+
+#define KNF_DLOG(x) \
+  ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_LOG_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
@ -0,0 +1,256 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/mel-computations.cc
+
+#include "kaldi-native-fbank/csrc/mel-computations.h"
+
+#include <algorithm>
+#include <sstream>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+namespace knf {
+
+// Streams the textual form of `opts`, as produced by
+// MelBanksOptions::ToString(), and returns the stream.
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
+  return os << opts.ToString();
+}
+
+/// Computes a VTLN warping function that is not the same as HTK's one, but
+/// has similar inputs (this function has the advantage of never producing
+/// empty bins).
+///
+/// The warp function F(freq) is defined between low_freq and high_freq
+/// inclusive, with the following properties:
+///  F(low_freq) == low_freq
+///  F(high_freq) == high_freq
+/// The function is continuous and piecewise linear with two inflection
+///   points.
+/// The lower inflection point (measured in terms of the unwarped
+///  frequency) is at frequency l, determined as described below.
+/// The higher inflection point is at a frequency h, determined as
+///   described below.
+/// If l <= f <= h, then F(f) = f/vtln_warp_factor.
+/// If the higher inflection point (measured in terms of the unwarped
+///   frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
+///   Since (by the last point) F(h) == h/vtln_warp_factor, then
+///   max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
+///   h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
+///     = vtln_high_cutoff * min(1, vtln_warp_factor).
+/// If the lower inflection point (measured in terms of the unwarped
+///   frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
+///   This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
+///                       = vtln_low_cutoff * max(1, vtln_warp_factor)
+///
+/// Frequencies outside [low_freq, high_freq] are returned unchanged.
+float MelBanks::VtlnWarpFreq(
+    float vtln_low_cutoff,   // lower frequency cutoff for VTLN
+    float vtln_high_cutoff,  // upper frequency cutoff for VTLN
+    float low_freq,          // lower frequency cutoff in mel computation
+    float high_freq,         // upper frequency cutoff in mel computation
+    float vtln_warp_factor, float freq) {
+  // In case this gets called for out-of-range frequencies, just return the
+  // freq.
+  if (freq < low_freq || freq > high_freq) return freq;
+
+  KNF_CHECK_GT(vtln_low_cutoff, low_freq);
+  KNF_CHECK_LT(vtln_high_cutoff, high_freq);
+
+  const float unity = 1.0f;
+  // Inflection points l and h, in terms of the unwarped frequency.
+  const float lo_knee = vtln_low_cutoff * std::max(unity, vtln_warp_factor);
+  const float hi_knee = vtln_high_cutoff * std::min(unity, vtln_warp_factor);
+  KNF_CHECK(lo_knee > low_freq && hi_knee < high_freq);
+
+  // Slope of the center part of the 3-piece linear function.
+  const float center_slope = 1.0f / vtln_warp_factor;
+
+  if (freq < lo_knee) {
+    // Left part: the line from (low_freq, low_freq) to (l, F(l)).
+    const float warped_lo_knee = center_slope * lo_knee;  // F(l)
+    const float left_slope = (warped_lo_knee - low_freq) / (lo_knee - low_freq);
+    return low_freq + left_slope * (freq - low_freq);
+  }
+
+  if (freq < hi_knee) {
+    return center_slope * freq;
+  }
+
+  // Right part (freq >= h): the line from (h, F(h)) to (high_freq, high_freq).
+  const float warped_hi_knee = center_slope * hi_knee;  // F(h)
+  const float right_slope =
+      (high_freq - warped_hi_knee) / (high_freq - hi_knee);
+  return high_freq + right_slope * (freq - high_freq);
+}
+
+// Mel-domain wrapper around VtlnWarpFreq(): `mel_freq` is converted to a
+// linear frequency, warped, and the result is converted back to the mel
+// scale. The remaining arguments are passed to VtlnWarpFreq() unchanged.
+float MelBanks::VtlnWarpMelFreq(
+    float vtln_low_cutoff,   // lower frequency cutoff for VTLN
+    float vtln_high_cutoff,  // upper frequency cutoff for VTLN
+    float low_freq,          // lower frequency cutoff in mel computation
+    float high_freq,         // upper frequency cutoff in mel computation
+    float vtln_warp_factor, float mel_freq) {
+  const float freq = InverseMelScale(mel_freq);
+  const float warped_freq =
+      VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
+                   vtln_warp_factor, freq);
+  return MelScale(warped_freq);
+}
+
+// Builds the mel filter bank: opts.num_bins triangular filters whose edges
+// are evenly spaced on the mel scale between the low and high cutoff
+// frequencies. Each filter is stored in bins_ as the index of its first
+// non-zero FFT bin together with the weights from that bin onwards. When
+// vtln_warp_factor != 1 the filter edges are warped with VtlnWarpMelFreq().
+// Invalid options are reported through KNF_LOG(FATAL) / KNF_CHECK.
+MelBanks::MelBanks(const MelBanksOptions &opts,
+                   const FrameExtractionOptions &frame_opts,
+                   float vtln_warp_factor)
+    : htk_mode_(opts.htk_mode) {
+  int32_t num_bins = opts.num_bins;
+  if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins";
+
+  float sample_freq = frame_opts.samp_freq;
+  int32_t window_length_padded = frame_opts.PaddedWindowSize();
+  KNF_CHECK_EQ(window_length_padded % 2, 0);
+
+  int32_t num_fft_bins = window_length_padded / 2;
+  float nyquist = 0.5f * sample_freq;
+
+  // A positive opts.high_freq is an absolute cutoff; zero or a negative value
+  // is an offset added to the Nyquist frequency.
+  float low_freq = opts.low_freq, high_freq;
+  if (opts.high_freq > 0.0f)
+    high_freq = opts.high_freq;
+  else
+    high_freq = nyquist + opts.high_freq;
+
+  if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
+      high_freq > nyquist || high_freq <= low_freq) {
+    KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
+                   << " and high-freq " << high_freq << " vs. nyquist "
+                   << nyquist;
+  }
+
+  float fft_bin_width = sample_freq / window_length_padded;
+  // fft-bin width [think of it as Nyquist-freq / half-window-length]
+
+  float mel_low_freq = MelScale(low_freq);
+  float mel_high_freq = MelScale(high_freq);
+
+  debug_ = opts.debug_mel;
+
+  // divide by num_bins+1 in next line because of end-effects where the bins
+  // spread out to the sides.
+  float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);
+
+  // A negative vtln_high is an offset added to the Nyquist frequency.
+  float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high;
+  if (vtln_high < 0.0f) {
+    vtln_high += nyquist;
+  }
+
+  // The VTLN cutoffs are validated only when warping is actually requested.
+  if (vtln_warp_factor != 1.0f &&
+      (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
+       vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) {
+    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
+                   << " and vtln-high " << vtln_high << ", versus "
+                   << "low-freq " << low_freq << " and high-freq " << high_freq;
+  }
+
+  bins_.resize(num_bins);
+  center_freqs_.resize(num_bins);
+
+  for (int32_t bin = 0; bin < num_bins; ++bin) {
+    // Left edge, peak and right edge of this triangular filter, in mel.
+    float left_mel = mel_low_freq + bin * mel_freq_delta,
+          center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
+          right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
+
+    if (vtln_warp_factor != 1.0f) {
+      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                 vtln_warp_factor, left_mel);
+      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                   vtln_warp_factor, center_mel);
+      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
+                                  vtln_warp_factor, right_mel);
+    }
+    center_freqs_[bin] = InverseMelScale(center_mel);
+
+    // this_bin will be a vector of coefficients that is only
+    // nonzero where this mel bin is active.
+    std::vector<float> this_bin(num_fft_bins);
+
+    // first_index/last_index delimit the FFT bins that fall strictly inside
+    // (left_mel, right_mel); -1 means "none found yet".
+    int32_t first_index = -1, last_index = -1;
+    for (int32_t i = 0; i < num_fft_bins; ++i) {
+      float freq = (fft_bin_width * i);  // Center frequency of this fft
+                                         // bin.
+      float mel = MelScale(freq);
+      if (mel > left_mel && mel < right_mel) {
+        // Triangular weight: rises linearly from the left edge to the center
+        // and falls linearly from the center to the right edge.
+        float weight;
+        if (mel <= center_mel)
+          weight = (mel - left_mel) / (center_mel - left_mel);
+        else
+          weight = (right_mel - mel) / (right_mel - center_mel);
+        this_bin[i] = weight;
+        if (first_index == -1) first_index = i;
+        last_index = i;
+      }
+    }
+    // The string literal is always true; it is and-ed in only so that it
+    // appears in the "Check failed" message.
+    KNF_CHECK(first_index != -1 && last_index >= first_index &&
+              "You may have set num_mel_bins too large.");
+
+    // Keep only the non-zero span [first_index, last_index] of this_bin.
+    bins_[bin].first = first_index;
+    int32_t size = last_index + 1 - first_index;
+    bins_[bin].second.insert(bins_[bin].second.end(),
+                             this_bin.begin() + first_index,
+                             this_bin.begin() + first_index + size);
+
+    // Replicate a bug in HTK, for testing purposes.
+    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
+      bins_[bin].second[0] = 0.0;
+    }
+  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {
+
+  // Optionally dump the offset and weights of every bin.
+  if (debug_) {
+    std::ostringstream os;
+    for (size_t i = 0; i < bins_.size(); i++) {
+      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
+      for (auto k : bins_[i].second) os << k << ", ";
+      os << "\n";
+    }
+    KNF_LOG(INFO) << os.str();
+  }
+}
+
+// Applies the mel filter bank to one frame.
+//
+// @param power_spectrum    FFT energies (not log) of the frame. For every bin
+//                          i and every weight index k of that bin, element
+//                          bins_[i].first + k is read.
+// @param mel_energies_out  Receives one (non-log) energy per mel bin, i.e.
+//                          bins_.size() values.
+void MelBanks::Compute(const float *power_spectrum,
+                       float *mel_energies_out) const {
+  // Unsigned indices throughout: they are compared against vector sizes.
+  const size_t num_bins = bins_.size();
+
+  for (size_t i = 0; i < num_bins; ++i) {
+    // First FFT bin covered by this filter, and the filter's weights.
+    const float *spectrum = power_spectrum + bins_[i].first;
+    const std::vector<float> &weights = bins_[i].second;
+
+    float energy = 0;
+    for (size_t k = 0; k < weights.size(); ++k) {
+      energy += weights[k] * spectrum[k];
+    }
+
+    // HTK-like flooring- for testing purposes (we prefer dither)
+    if (htk_mode_ && energy < 1.0) {
+      energy = 1.0;
+    }
+
+    mel_energies_out[i] = energy;
+
+    // The following assert was added due to a problem with OpenBlas that
+    // we had at one point (it was a bug in that library).  Just to detect
+    // it early.
+    KNF_CHECK_EQ(energy, energy);  // check that energy is not nan
+  }
+
+  if (debug_) {
+    fprintf(stderr, "MEL BANKS:\n");
+    for (size_t i = 0; i < num_bins; ++i)
+      fprintf(stderr, " %f", mel_energies_out[i]);
+    fprintf(stderr, "\n");
+  }
+}
+
+}  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
@ -0,0 +1,115 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// This file is copied/modified from kaldi/src/feat/mel-computations.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
+
+#include <cmath>
+#include <string>
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+namespace knf {
+
+// Options describing the mel filter bank built by MelBanks.
+struct MelBanksOptions {
+  // Number of triangular bins, e.g. 25.
+  int32_t num_bins = 25;
+
+  // Lower frequency cutoff, e.g. 20.
+  float low_freq = 20;
+
+  // Upper frequency cutoff; 0 -> no cutoff, negative -> added to the Nyquist
+  // frequency to get the cutoff.
+  float high_freq = 0;
+
+  // VTLN lower cutoff of the warping function.
+  float vtln_low = 100;
+
+  // VTLN upper cutoff of the warping function: if negative, added to the
+  // Nyquist frequency to get the cutoff.
+  float vtln_high = -500;
+
+  bool debug_mel = false;
+
+  // htk_mode is a "hidden" config, it does not show up on command line.
+  // Enables more exact compatibility with HTK, for testing purposes.  Affects
+  // mel-energy flooring and reproduces a bug in HTK.
+  bool htk_mode = false;
+
+  // Returns all options as text, one "name: value" pair per line.
+  std::string ToString() const {
+    std::ostringstream os;
+    os << "num_bins: " << num_bins << "\n"
+       << "low_freq: " << low_freq << "\n"
+       << "high_freq: " << high_freq << "\n"
+       << "vtln_low: " << vtln_low << "\n"
+       << "vtln_high: " << vtln_high << "\n"
+       << "debug_mel: " << debug_mel << "\n"
+       << "htk_mode: " << htk_mode << "\n";
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
+
+// A bank of triangular mel filters that maps the FFT energies of a frame to
+// one energy per mel bin.
+class MelBanks {
+ public:
+  // Converts a mel-scale value back to a frequency; inverse of MelScale().
+  static inline float InverseMelScale(float mel_freq) {
+    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
+  }
+
+  // Converts a frequency to the mel scale: 1127 * ln(1 + freq / 700).
+  static inline float MelScale(float freq) {
+    return 1127.0f * logf(1.0f + freq / 700.0f);
+  }
+
+  // Returns the VTLN-warped value of `freq`; see the definition for the
+  // exact shape of the warping function.
+  static float VtlnWarpFreq(
+      float vtln_low_cutoff,
+      float vtln_high_cutoff,  // discontinuities in warp func
+      float low_freq,
+      float high_freq,  // upper+lower frequency cutoffs in
+      // the mel computation
+      float vtln_warp_factor, float freq);
+
+  // Same as VtlnWarpFreq(), but `mel_freq` and the return value are on the
+  // mel scale.
+  static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
+                               float low_freq, float high_freq,
+                               float vtln_warp_factor, float mel_freq);
+
+  // TODO(fangjun): Remove vtln_warp_factor
+  MelBanks(const MelBanksOptions &opts,
+           const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
+
+  /// Compute Mel energies (note: not log energies).
+  /// At input, "fft_energies" contains the FFT energies (not log).
+  ///
+  /// @param fft_energies 1-D array of size num_fft_bins/2+1
+  /// @param mel_energies_out  1-D array of size num_mel_bins
+  ///
+  /// NOTE(review): the constructor only assigns weights to FFT bins with an
+  /// index below PaddedWindowSize() / 2, so "num_fft_bins" above presumably
+  /// means the padded window size -- confirm.
+  void Compute(const float *fft_energies, float *mel_energies_out) const;
+
+  // Number of mel bins in this filter bank.
+  int32_t NumBins() const { return bins_.size(); }
+
+ private:
+  // center frequencies of bins, numbered from 0 ... num_bins-1.
+  // NOTE(review): the original comment said "Needed by GetCenterFreqs()", but
+  // this class declares no such method; the vector is only written by the
+  // constructor.
+  std::vector<float> center_freqs_;
+
+  // the "bins_" vector is a vector, one for each bin, of a pair:
+  // (the first nonzero fft-bin), (the vector of weights).
+  std::vector<std::pair<int32_t, std::vector<float>>> bins_;
+
+  // TODO(fangjun): Remove debug_ and htk_mode_
+  bool debug_;     // print the mel energies computed by Compute()
+  bool htk_mode_;  // floor mel energies at 1.0 in Compute()
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
@ -0,0 +1,66 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kaldi-native-fbank/csrc/rfft.h"
+
+#include <cmath>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+// see fftsg.c
+#ifdef __cplusplus
+extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
+#else
+void rdft(int n, int isgn, double *a, int *ip, double *w);
+#endif
+
+namespace knf {
+// Implementation of Rfft on top of the C routine rdft() (see fftsg.c).
+class Rfft::RfftImpl {
+ public:
+  // @param n Transform size. It must be a power of 2 and at least 2; any
+  //          other value fails a KNF_CHECK before the work buffers are
+  //          sized from it.
+  explicit RfftImpl(int32_t n)
+      : n_(ValidatedSize(n)), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {}
+
+  // In-place forward transform of the n_ floats in `in_out`. The data is
+  // converted to double for rdft() and copied back afterwards.
+  void Compute(float *in_out) {
+    std::vector<double> d(in_out, in_out + n_);
+
+    Compute(d.data());
+
+    std::copy(d.begin(), d.end(), in_out);
+  }
+
+  // In-place forward transform of the n_ doubles in `in_out`.
+  void Compute(double *in_out) {
+    // 1 means forward fft
+    rdft(n_, 1, in_out, ip_.data(), w_.data());
+  }
+
+ private:
+  // Returns `n` after checking that it is a power of 2 and >= 2. It is
+  // called while initializing n_, the first member, so an invalid size is
+  // rejected before ip_ and w_ are constructed. The bare power-of-2 test
+  // alone would also accept 0 and 1.
+  static int32_t ValidatedSize(int32_t n) {
+    KNF_CHECK_GE(n, 2);
+    KNF_CHECK_EQ(n & (n - 1), 0);
+    return n;
+  }
+
+  int32_t n_;                // transform size
+  std::vector<int32_t> ip_;  // integer work area passed to rdft()
+  std::vector<double> w_;    // floating-point work area passed to rdft()
+};
+
+// Rfft only owns an RfftImpl; every call below is forwarded to it.
+Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
+
+// Defined in this file, where RfftImpl is a complete type, so that the
+// std::unique_ptr<RfftImpl> member can be destroyed.
+Rfft::~Rfft() = default;
+
+void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
+void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
+
+}  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
@ -0,0 +1,56 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+
+#include <memory>
+
+namespace knf {
+
+// n-point Real discrete Fourier transform
+// where n is a power of 2. n >= 2
+//
+//  R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+//  I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
+class Rfft {
+ public:
+  // @param n Number of fft bins. It should be a power of 2.
+  explicit Rfft(int32_t n);
+  ~Rfft();
+
+  /** Computes the transform in place.
+   *
+   * @param in_out A 1-D array of size n.
+   *             On return:
+   *               in_out[0] = R[0]
+   *               in_out[1] = R[n/2]
+   *               for 0 < k < n/2,
+   *                 in_out[2*k] = R[k]
+   *                 in_out[2*k+1] = I[k]
+   *
+   */
+  void Compute(float *in_out);
+  void Compute(double *in_out);
+
+ private:
+  // Pointer to the implementation, which is defined in rfft.cc.
+  class RfftImpl;
+  std::unique_ptr<RfftImpl> impl_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
@ -1,111 +0,0 @@
-# checkout the thirdparty/kaldi/base/kaldi-types.h
-# compile kaldi without openfst
-add_definitions("-DCOMPILE_WITHOUT_OPENFST")
-
-if ((NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base))
-    file(COPY ../../../../speechx/speechx/kaldi/base DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/feat DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/matrix DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/util DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-endif()
-
-# kaldi-base
-add_library(kaldi-base STATIC
-  base/io-funcs.cc
-  base/kaldi-error.cc
-  base/kaldi-math.cc
-  base/kaldi-utils.cc
-  base/timer.cc
-)
-target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-# kaldi-matrix
-add_library(kaldi-matrix STATIC
-  matrix/compressed-matrix.cc
-  matrix/matrix-functions.cc
-  matrix/kaldi-matrix.cc
-  matrix/kaldi-vector.cc
-  matrix/optimization.cc
-  matrix/packed-matrix.cc
-  matrix/qr.cc
-  matrix/sparse-matrix.cc
-  matrix/sp-matrix.cc
-  matrix/srfft.cc
-  matrix/tp-matrix.cc
-)
-target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (NOT MSVC)
-    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
-else()
-    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
-endif()
-
-# kaldi-util
-add_library(kaldi-util STATIC
-  util/kaldi-holder.cc
-  util/kaldi-io.cc
-  util/kaldi-semaphore.cc
-  util/kaldi-table.cc
-  util/kaldi-thread.cc
-  util/parse-options.cc
-  util/simple-io-funcs.cc
-  util/simple-options.cc
-  util/text-utils.cc
-)
-target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
-
-# kaldi-feat-common
-add_library(kaldi-feat-common STATIC
-  feat/cmvn.cc
-  feat/feature-functions.cc
-  feat/feature-window.cc
-  feat/mel-computations.cc
-  feat/pitch-functions.cc
-  feat/resample.cc
-  feat/signal.cc
-  feat/wave-reader.cc
-)
-target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
-
-
-# kaldi-mfcc
-add_library(kaldi-mfcc STATIC
-  feat/feature-mfcc.cc
-)
-target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
-
-
-# kaldi-fbank
-add_library(kaldi-fbank STATIC
-  feat/feature-fbank.cc
-)
-target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
-
-
-set(KALDI_LIBRARIES
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
-  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
-)
-
-add_library(libkaldi INTERFACE)
-add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
-target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (APPLE)
-    target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib)
-elseif (MSVC)
-    target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas)
-else()
-    target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group)
-endif()
-
-target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
--- a/audio/setup.py
+++ b/audio/setup.py
@ -51,8 +51,7 @@ base = [
 ]

 requirements = {
-    "install":
-    base,
+    "install": base,
    "develop": [
        "sox",
        "soxbindings",
@ -60,6 +59,7 @@ requirements = {
    ],
 }

+
 def check_call(cmd: str, shell=False, executable=None):
    try:
        sp.check_call(
@ -92,6 +92,7 @@ def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False):
            file=sys.stderr)
    return out_bytes.strip().decode('utf8')

+
 def _run_cmd(cmd):
    try:
        return subprocess.check_output(
@ -100,6 +101,7 @@ def _run_cmd(cmd):
    except Exception:
        return None

+
@contextlib.contextmanager
 def pushd(new_dir):
    old_dir = os.getcwd()
@ -109,22 +111,26 @@ def pushd(new_dir):
    os.chdir(old_dir)
    print(old_dir)

+
 def read(*names, **kwargs):
    with io.open(
            os.path.join(os.path.dirname(__file__), *names),
            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()

+
 def _remove(files: str):
    for f in files:
        f.unlink()

+
 ################################# Install ##################################


 def _post_install(install_lib_dir):
    pass

+
 class DevelopCommand(develop):
    def run(self):
        develop.run(self)
@ -142,7 +148,7 @@ class TestCommand(test):
        # Run nose ensuring that argv simulates running nosetests directly
        import nose
        nose.run_exit(argv=['nosetests', '-w', 'tests'])
-    
+
    def run_benchmark(self):
        for benchmark_item in glob.glob('tests/benchmark/*py'):
            os.system(f'pytest {benchmark_item}')
@ -188,6 +194,7 @@ def _make_version_file(version, sha):
    with open(version_path, "a") as f:
        f.write(f"__version__ = '{version}'\n")

+
 def _rm_version():
    file_ = ROOT_DIR / "paddleaudio" / "__init__.py"
    with open(file_, "r") as f:
@ -235,8 +242,8 @@ def main():
    if platform.system() != 'Windows' and platform.system() != 'Linux':
        lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']}

-    if platform.system() == 'Linux':
-        lib_package_data = {'paddleaudio': ['lib/lib*']}
+    #if platform.system() == 'Linux':
+    #    lib_package_data = {'paddleaudio': ['lib/lib*']}

    setup_info = dict(
        # Metadata
@ -254,8 +261,7 @@ def main():
        python_requires='>=3.7',
        install_requires=requirements["install"],
        extras_require={
-            'develop':
-            requirements["develop"],
+            'develop': requirements["develop"],
            #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"],
        },
        cmdclass={
@ -284,11 +290,11 @@ def main():
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: 3.10',
-        ],
-    )
+        ], )

    setup(**setup_info)
    _rm_version()

+
 if __name__ == '__main__':
    main()