[audio]replace kaldi fbank with kaldi-native-fbank in paddleaudio (#2799)

* replace kaldi_fbank with kaldi-native-fbank in paddleaudio * fix mac
3 years ago · d7a6268bcc
parent 964211a81b
commit d7a6268bcc
30 changed files with 5234 additions and 359 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -31,7 +31,7 @@ repos:
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.0.1
@ -53,13 +53,13 @@ repos:
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
    -   id: cpplint
        name: cpplint
        description: Static code analysis of C/C++ files
        language: python
        files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
-        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+        exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
        entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@ -41,24 +41,18 @@ option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
 # cmake
 set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external")
 if (NOT MSVC)
    find_package(GFortranLibs REQUIRED)
    include(FortranCInterface)
    include(FindGFortranLibs REQUIRED)
 endif()
 # fc_patch dir
 set(FETCHCONTENT_QUIET off)
 get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
 set(FETCHCONTENT_BASE_DIR ${fc_patch})
 set(THIRD_PARTY_PATH ${fc_patch})
 include(openblas)
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 include(cmake/pybind.cmake)
 include_directories(${PYTHON_INCLUDE_DIR})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/)
 # packages
 find_package(Python3 COMPONENTS Interpreter Development)
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@ -1,19 +1,3 @@
 add_subdirectory(third_party)
 add_subdirectory(src)
 if (APPLE) 
  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
 endif(APPLE)
 if (UNIX AND NOT APPLE)
  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgfortran.so.5
          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libquadmath.so.0
          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
  file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.so.1
          DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
 endif()
--- a/audio/paddleaudio/kaldi/init.py
+++ b/audio/paddleaudio/kaldi/init.py
@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .kaldi import fbank
-from .kaldi import pitch
+#from .kaldi import pitch
--- a/audio/paddleaudio/kaldi/kaldi.py
+++ b/audio/paddleaudio/kaldi/kaldi.py
@ -16,7 +16,6 @@ from paddleaudio._internal import module_utils
 __all__ = [
    'fbank',
    'pitch',
 ]
@ -33,8 +32,6 @@ def fbank(
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        allow_downsample: bool=False,
        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
@ -62,8 +59,6 @@ def fbank(
    frame_opts.round_to_power_of_two = round_to_power_of_two
    frame_opts.blackman_coeff = blackman_coeff
    frame_opts.snip_edges = snip_edges
    frame_opts.allow_downsample = allow_downsample
    frame_opts.allow_upsample = allow_upsample
    frame_opts.max_feature_vectors = max_feature_vectors
    mel_opts.num_bins = num_bins
@ -85,48 +80,48 @@ def fbank(
    return feat
-@module_utils.requires_kaldi()
+#@module_utils.requires_kaldi()
-def pitch(wav,
+#def pitch(wav,
-          samp_freq: int=16000,
+#samp_freq: int=16000,
-          frame_shift_ms: float=10.0,
+#frame_shift_ms: float=10.0,
-          frame_length_ms: float=25.0,
+#frame_length_ms: float=25.0,
-          preemph_coeff: float=0.0,
+#preemph_coeff: float=0.0,
-          min_f0: int=50,
+#min_f0: int=50,
-          max_f0: int=400,
+#max_f0: int=400,
-          soft_min_f0: float=10.0,
+#soft_min_f0: float=10.0,
-          penalty_factor: float=0.1,
+#penalty_factor: float=0.1,
-          lowpass_cutoff: int=1000,
+#lowpass_cutoff: int=1000,
-          resample_freq: int=4000,
+#resample_freq: int=4000,
-          delta_pitch: float=0.005,
+#delta_pitch: float=0.005,
-          nccf_ballast: int=7000,
+#nccf_ballast: int=7000,
-          lowpass_filter_width: int=1,
+#lowpass_filter_width: int=1,
-          upsample_filter_width: int=5,
+#upsample_filter_width: int=5,
-          max_frames_latency: int=0,
+#max_frames_latency: int=0,
-          frames_per_chunk: int=0,
+#frames_per_chunk: int=0,
-          simulate_first_pass_online: bool=False,
+#simulate_first_pass_online: bool=False,
-          recompute_frame: int=500,
+#recompute_frame: int=500,
-          nccf_ballast_online: bool=False,
+#nccf_ballast_online: bool=False,
-          snip_edges: bool=True):
+#snip_edges: bool=True):
-    pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
+#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
-    pitch_opts.samp_freq = samp_freq
+#pitch_opts.samp_freq = samp_freq
-    pitch_opts.frame_shift_ms = frame_shift_ms
+#pitch_opts.frame_shift_ms = frame_shift_ms
-    pitch_opts.frame_length_ms = frame_length_ms
+#pitch_opts.frame_length_ms = frame_length_ms
-    pitch_opts.preemph_coeff = preemph_coeff
+#pitch_opts.preemph_coeff = preemph_coeff
-    pitch_opts.min_f0 = min_f0
+#pitch_opts.min_f0 = min_f0
-    pitch_opts.max_f0 = max_f0
+#pitch_opts.max_f0 = max_f0
-    pitch_opts.soft_min_f0 = soft_min_f0
+#pitch_opts.soft_min_f0 = soft_min_f0
-    pitch_opts.penalty_factor = penalty_factor
+#pitch_opts.penalty_factor = penalty_factor
-    pitch_opts.lowpass_cutoff = lowpass_cutoff
+#pitch_opts.lowpass_cutoff = lowpass_cutoff
-    pitch_opts.resample_freq = resample_freq
+#pitch_opts.resample_freq = resample_freq
-    pitch_opts.delta_pitch = delta_pitch
+#pitch_opts.delta_pitch = delta_pitch
-    pitch_opts.nccf_ballast = nccf_ballast
+#pitch_opts.nccf_ballast = nccf_ballast
-    pitch_opts.lowpass_filter_width = lowpass_filter_width
+#pitch_opts.lowpass_filter_width = lowpass_filter_width
-    pitch_opts.upsample_filter_width = upsample_filter_width
+#pitch_opts.upsample_filter_width = upsample_filter_width
-    pitch_opts.max_frames_latency = max_frames_latency
+#pitch_opts.max_frames_latency = max_frames_latency
-    pitch_opts.frames_per_chunk = frames_per_chunk
+#pitch_opts.frames_per_chunk = frames_per_chunk
-    pitch_opts.simulate_first_pass_online = simulate_first_pass_online
+#pitch_opts.simulate_first_pass_online = simulate_first_pass_online
-    pitch_opts.recompute_frame = recompute_frame
+#pitch_opts.recompute_frame = recompute_frame
-    pitch_opts.nccf_ballast_online = nccf_ballast_online
+#pitch_opts.nccf_ballast_online = nccf_ballast_online
-    pitch_opts.snip_edges = snip_edges
+#pitch_opts.snip_edges = snip_edges
-    pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
+#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
-    return pitch
+#return pitch
--- a/audio/paddleaudio/src/CMakeLists.txt
+++ b/audio/paddleaudio/src/CMakeLists.txt
@ -52,7 +52,7 @@ if(BUILD_KALDI)
  list(
    APPEND
    LIBPADDLEAUDIO_LINK_LIBRARIES
-    libkaldi
+    kaldi-native-fbank-core
  )
  list(
    APPEND
@ -92,14 +92,6 @@ define_library(
  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
 )
 if (APPLE)
  add_custom_command(TARGET libpaddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/libgcc_s.1.1.dylib" libpaddleaudio.so)
 endif(APPLE)
 if (UNIX AND NOT APPLE)
  set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN")
 endif()
 if (APPLE)
  set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
 else()
@ -207,11 +199,3 @@ define_extension(
 #     )
 # endif()
 endif()
 if (APPLE)
  add_custom_command(TARGET _paddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/lib/libgcc_s.1.1.dylib" _paddleaudio.so)
 endif(APPLE)
 if (UNIX AND NOT APPLE)
  set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib")
 endif()
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h
@ -16,7 +16,7 @@
 #include "pybind11/pybind11.h"
 #include "pybind11/numpy.h"
-#include "feat/feature-window.h"
+#include "kaldi-native-fbank/csrc/feature-window.h"
 namespace paddleaudio {
 namespace kaldi {
@ -28,18 +28,18 @@ class StreamingFeatureTpl {
  public:
    typedef typename F::Options Options;
    StreamingFeatureTpl(const Options& opts);
-    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+    bool ComputeFeature(const std::vector<float>& wav,
-                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+                        std::vector<float>* feats);
-    void Reset() { remained_wav_.Resize(0); }
+    void Reset() { remained_wav_.resize(0); }
    int Dim() { return computer_.Dim(); }
  private:
-    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+    bool Compute(const std::vector<float>& waves,
-                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);
+                 std::vector<float>* feats);
    Options opts_;
-    ::kaldi::FeatureWindowFunction window_function_;
+    knf::FeatureWindowFunction window_function_;
-    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
+    std::vector<float> remained_wav_;
    F computer_;
 };
--- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
+++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "base/kaldi-common.h"
 namespace paddleaudio {
 namespace kaldi {
@ -25,24 +24,29 @@ StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
 template <class F>
 bool StreamingFeatureTpl<F>::ComputeFeature(
-    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
+    const std::vector<float>& wav,
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    std::vector<float>* feats) {
    // append remained waves
-    ::kaldi::int32 wav_len = wav.Dim();
+    int wav_len = wav.size();
    if (wav_len == 0) return false;
-    ::kaldi::int32 left_len = remained_wav_.Dim();
+    int left_len = remained_wav_.size();
-    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
+    std::vector<float> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    std::memcpy(waves.data(),
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
+                remained_wav_.data(),
                left_len * sizeof(float));
    std::memcpy(waves.data() + left_len,
                wav.data(),
                wav_len * sizeof(float));
    // cache remained waves
-    ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
+    int num_frames = knf::NumFrames(waves.size(), frame_opts);
-    ::kaldi::int32 frame_shift = frame_opts.WindowShift();
+    int frame_shift = frame_opts.WindowShift();
-    ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
+    int left_samples = waves.size() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
+    remained_wav_.resize(left_samples);
-    remained_wav_.CopyFromVec(
+    std::memcpy(remained_wav_.data(),
-        waves.Range(frame_shift * num_frames, left_samples));
+                waves.data() + frame_shift * num_frames,
                left_samples * sizeof(float));
    // compute speech feature
    Compute(waves, feats);
@ -51,40 +55,39 @@ bool StreamingFeatureTpl<F>::ComputeFeature(
 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(
+bool StreamingFeatureTpl<F>::Compute(const std::vector<float>& waves,
-    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
+                                     std::vector<float>* feats) {
-    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
+    const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions();
-    ::kaldi::BaseFloat vtln_warp = 1.0;
+    int num_samples = waves.size();
-    const ::kaldi::FrameExtractionOptions& frame_opts =
+    int frame_length = frame_opts.WindowSize();
-        computer_.GetFrameOptions();
+    int sample_rate = frame_opts.samp_freq;
    ::kaldi::int32 num_samples = waves.Dim();
    ::kaldi::int32 frame_length = frame_opts.WindowSize();
    ::kaldi::int32 sample_rate = frame_opts.samp_freq;
    if (num_samples < frame_length) {
-        return false;
+        return true;
    }
-    ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
+    int num_frames = knf::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
+    feats->resize(num_frames * Dim());
-    ::kaldi::Vector<::kaldi::BaseFloat> window;
+    std::vector<float> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
+    for (int frame = 0; frame < num_frames; frame++) {
-        ::kaldi::BaseFloat raw_log_energy = 0.0;
+        std::fill(window.begin(), window.end(), 0);
-        ::kaldi::ExtractWindow(0,
+        float raw_log_energy = 0.0;
-                               waves,
+        float vtln_warp = 1.0;
-                               frame,
+        knf::ExtractWindow(0,
-                               frame_opts,
+                           waves,
-                               window_function_,
+                           frame,
-                               &window,
+                           frame_opts,
-                               need_raw_log_energy ? &raw_log_energy : NULL);
+                           window_function_,
                           &window,
                           need_raw_log_energy ? &raw_log_energy : NULL);
-        ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
+        std::vector<float> this_feature(computer_.Dim());
-                                                         ::kaldi::kUndefined);
+        computer_.Compute(
-        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+            raw_log_energy, vtln_warp, &window, this_feature.data());
-        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
+        std::memcpy(feats->data() + frame * Dim(),
-            feats->Data() + frame * Dim(), Dim());
+                    this_feature.data(),
-        output_row.CopyFromVec(this_feature);
+                    sizeof(float) * Dim());
    }
    return true;
 }
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc
@ -13,16 +13,16 @@
 // limitations under the License.
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
 namespace paddleaudio {
 namespace kaldi {
 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
+    knf::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts) {
-    ::kaldi::FbankOptions opts;
+    knf::FbankOptions opts;
    opts.frame_opts = frame_opts;
    opts.mel_opts = mel_opts;
    opts.use_energy = fbank_opts.use_energy;
@ -41,8 +41,8 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
 }
 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
+    knf::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav) {
    InitFbank(frame_opts, mel_opts, fbank_opts);
@ -55,21 +55,21 @@ void ResetFbank() {
    paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank();
 }
-py::array_t<float> ComputeKaldiPitch(
+//py::array_t<float> ComputeKaldiPitch(
-  const ::kaldi::PitchExtractionOptions& opts,
+  //const ::kaldi::PitchExtractionOptions& opts,
-  const py::array_t<float>& wav) {
+  //const py::array_t<float>& wav) {
-    py::buffer_info info = wav.request();
+    //py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+    //::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
-    ::kaldi::Matrix<::kaldi::BaseFloat> features;
+    //::kaldi::Matrix<::kaldi::BaseFloat> features;
-    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+    //::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
-    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
+    //auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
-    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
+    //for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
-        std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
+        //std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(),
-                    sizeof(float)*features.NumCols());
+                    //sizeof(float)*features.NumCols());
-    }
+    //}
-   return result;
+   //return result;
-}
+//}
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h
@ -19,7 +19,7 @@
 #include <string>
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h"
-#include "feat/pitch-functions.h"
+//#include "feat/pitch-functions.h"
 namespace py = pybind11;
@ -42,13 +42,13 @@ struct FbankOptions{
 };
 bool InitFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
+    knf::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts);
 py::array_t<float> ComputeFbank(
-    ::kaldi::FrameExtractionOptions frame_opts,
+    knf::FrameExtractionOptions frame_opts,
-    ::kaldi::MelBanksOptions mel_opts,
+    knf::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav);
@ -56,9 +56,9 @@ py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
 void ResetFbank();
-py::array_t<float> ComputeKaldiPitch(
+//py::array_t<float> ComputeKaldiPitch(
-    const ::kaldi::PitchExtractionOptions& opts,
+    //const ::kaldi::PitchExtractionOptions& opts,
-    const py::array_t<float>& wav);
+    //const py::array_t<float>& wav);
 }  // namespace kaldi
 }  // namespace paddleaudio
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc
@ -22,7 +22,7 @@ KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    return &instance;
 }
-bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
+bool KaldiFeatureWrapper::InitFbank(knf::FbankOptions opts) {
    fbank_.reset(new Fbank(opts));
    return true;
 }
@ -30,21 +30,18 @@ bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
 py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
    const py::array_t<float> wav) {
    py::buffer_info info = wav.request();
-    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size);
+    std::vector<float> input_wav((float*)info.ptr, (float*)info.ptr + info.size);
-    ::kaldi::Vector<::kaldi::BaseFloat> feats;
+    std::vector<float> feats;
    bool flag = fbank_->ComputeFeature(input_wav, &feats);
-    if (flag == false || feats.Dim() == 0) return py::array_t<float>();
+    if (flag == false || feats.size() == 0) return py::array_t<float>();
-    auto result = py::array_t<float>(feats.Dim());
+    auto result = py::array_t<float>(feats.size());
    py::buffer_info xs = result.request();
    std::cout << std::endl;
    float* res_ptr = (float*)xs.ptr;
-    for (int idx = 0; idx < feats.Dim(); ++idx) {
+    std::memcpy(res_ptr, feats.data(), sizeof(float)*feats.size());
-        *res_ptr = feats(idx);
+    std::vector<int> shape{static_cast<int>(feats.size() / Dim()), 
-        res_ptr++;
+                           static_cast<int>(Dim())};
-    }
+    return result.reshape(shape);
    return result.reshape({feats.Dim() / Dim(), Dim()});
 }
 }  // namespace kaldi
--- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
+++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h
@ -14,20 +14,18 @@
 #pragma once
-#include "base/kaldi-common.h"
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #include "feat/feature-fbank.h"
 #include "paddleaudio/src/pybind/kaldi/feature_common.h"
 namespace paddleaudio {
 namespace kaldi {
-typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
+typedef StreamingFeatureTpl<knf::FbankComputer> Fbank;
 class KaldiFeatureWrapper {
  public:
    static KaldiFeatureWrapper* GetInstance();
-    bool InitFbank(::kaldi::FbankOptions opts);
+    bool InitFbank(knf::FbankOptions opts);
    py::array_t<float> ComputeFbank(const py::array_t<float> wav);
    int Dim() { return fbank_->Dim(); }
    void ResetFbank() { fbank_->Reset(); }
--- a/audio/paddleaudio/src/pybind/pybind.cpp
+++ b/audio/paddleaudio/src/pybind/pybind.cpp
@ -2,7 +2,7 @@
 #ifdef INCLUDE_KALDI
 #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h"
-#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h"
+#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h"
 #endif
 #ifdef INCLUDE_SOX
@ -89,53 +89,51 @@ PYBIND11_MODULE(_paddleaudio, m) {
 #ifdef INCLUDE_KALDI
    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
-    py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
+    //py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
        //.def(py::init<>())
        //.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
        //.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
        //.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
        //.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
        //.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
        //.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
        //.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
        //.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
        //.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
        //.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
        //.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
        //.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
        //.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
        //.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
        //.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
        //.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
        //.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
        //.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
        //.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
        //.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
    //m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
    py::class_<knf::FrameExtractionOptions>(m, "FrameExtractionOptions")
        .def(py::init<>())            
-        .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
+        .def_readwrite("samp_freq", &knf::FrameExtractionOptions::samp_freq)
-        .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
+        .def_readwrite("frame_shift_ms", &knf::FrameExtractionOptions::frame_shift_ms)            
-        .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
+        .def_readwrite("frame_length_ms", &knf::FrameExtractionOptions::frame_length_ms)
-        .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
+        .def_readwrite("dither", &knf::FrameExtractionOptions::dither)            
-        .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
+        .def_readwrite("preemph_coeff", &knf::FrameExtractionOptions::preemph_coeff)            
-        .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
+        .def_readwrite("remove_dc_offset", &knf::FrameExtractionOptions::remove_dc_offset)            
-        .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
+        .def_readwrite("window_type", &knf::FrameExtractionOptions::window_type)
-        .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
+        .def_readwrite("round_to_power_of_two", &knf::FrameExtractionOptions::round_to_power_of_two)           
-        .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
+        .def_readwrite("blackman_coeff", &knf::FrameExtractionOptions::blackman_coeff)          
-        .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
+        .def_readwrite("snip_edges", &knf::FrameExtractionOptions::snip_edges)
-        .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
+        .def_readwrite("max_feature_vectors", &knf::FrameExtractionOptions::max_feature_vectors);
-        .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
+    py::class_<knf::MelBanksOptions>(m, "MelBanksOptions")
        .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
        .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
        .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
        .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
        .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
        .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
        .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
        .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
    m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
    py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
        .def(py::init<>())
-        .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
+        .def_readwrite("num_bins", &knf::MelBanksOptions::num_bins)
-        .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)            
+        .def_readwrite("low_freq", &knf::MelBanksOptions::low_freq)
-        .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
+        .def_readwrite("high_freq", &knf::MelBanksOptions::high_freq)
-        .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)            
+        .def_readwrite("vtln_low", &knf::MelBanksOptions::vtln_low)
-        .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)            
+        .def_readwrite("vtln_high", &knf::MelBanksOptions::vtln_high)
-        .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)            
+        .def_readwrite("debug_mel", &knf::MelBanksOptions::debug_mel)
-        .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
+        .def_readwrite("htk_mode", &knf::MelBanksOptions::htk_mode);
        .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)           
        .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)          
        .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
        .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
        .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
        .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
    py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
        .def(py::init<>())
        .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
        .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
        .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
        .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
        .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
        .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
        .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
    py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
        .def(py::init<>())
--- a/audio/paddleaudio/third_party/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/CMakeLists.txt
@ -11,5 +11,6 @@ endif()
 # kaldi
 ################################################################################
 if (BUILD_KALDI)
-  add_subdirectory(kaldi)
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
  add_subdirectory(kaldi-native-fbank/csrc)
 endif()
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt
@ -0,0 +1,22 @@
 # Core library of kaldi-native-fbank: fbank feature extraction with no
 # dependency on the full Kaldi tree.
 add_library(kaldi-native-fbank-core
   feature-fbank.cc
   feature-functions.cc
   feature-window.cc
   fftsg.c
   log.cc
   mel-computations.cc
   rfft.cc
 )

 # The sources include their headers as "kaldi-native-fbank/csrc/<name>.h",
 # so the include root is two levels above this directory. Attaching it to
 # the target (instead of a directory-scoped include_directories) keeps the
 # path from leaking into unrelated targets while still propagating it to
 # anything that links against this library.
 target_include_directories(kaldi-native-fbank-core
   PUBLIC
     "${CMAKE_CURRENT_SOURCE_DIR}/../.."
 )

 # We are using std::call_once() in log.h, which requires us to link with
 # the platform thread library. Threads::Threads resolves to the right
 # flag/library for the toolchain; THREADS_PREFER_PTHREAD_FLAG asks for
 # -pthread where the compiler supports it.
 if(NOT WIN32)
   set(THREADS_PREFER_PTHREAD_FLAG ON)
   find_package(Threads REQUIRED)
   # PUBLIC keeps the transitive behaviour of the former keyword-less call.
   target_link_libraries(kaldi-native-fbank-core PUBLIC Threads::Threads)
 endif()

 # Optional platform headers; the KNF_HAVE_* variables are expected to be
 # set by an enclosing CMake scope.
 if(KNF_HAVE_EXECINFO_H)
   target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1)
 endif()
 if(KNF_HAVE_CXXABI_H)
   target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1)
 endif()
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc
@ -0,0 +1,117 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/feature-fbank.cc
 //
 #include "kaldi-native-fbank/csrc/feature-fbank.h"
 #include <cmath>
 #include "kaldi-native-fbank/csrc/feature-functions.h"
 namespace knf {
 // Replaces each of the first n entries of in_out with its square root.
 static void Sqrt(float *in_out, int32_t n) {
  float *const stop = in_out + n;
  for (float *cur = in_out; cur != stop; ++cur) {
    *cur = std::sqrt(*cur);
  }
 }
 // Streams the text produced by opts.ToString() into os and returns os.
 std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) {
  return os << opts.ToString();
 }
 // Stores a copy of the options, sizes the real FFT for the padded window,
 // and caches the log of the energy floor when a positive floor is set.
 FbankComputer::FbankComputer(const FbankOptions &opts)
    : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) {
  const bool has_energy_floor = opts.energy_floor > 0.0f;
  if (has_energy_floor) {
    log_energy_floor_ = std::log(opts.energy_floor);
  }
  // The filter banks for VTLN warp factor 1.0 are always needed; this call
  // creates them and stores them in the cache.
  GetMelBanks(1.0f);
 }
 // Frees every MelBanks object held in the per-warp cache.
 FbankComputer::~FbankComputer() {
  for (auto &entry : mel_banks_) {
    delete entry.second;
  }
 }
 // Returns the MelBanks for the given VTLN warp factor, creating it and
 // storing it in mel_banks_ on first use.
 const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) {
  auto cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end()) {
    return cached->second;
  }
  MelBanks *created = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
 }
 // Computes one frame of filter-bank features from one windowed signal frame.
 // signal_frame must have PaddedWindowSize() elements and is overwritten (it
 // is used as FFT workspace). feature receives the mel energies and, when
 // use_energy is set, the frame log-energy in the first slot (or the last
 // slot when htk_compat is set).
 void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp,
                             std::vector<float> *signal_frame, float *feature) {
  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
  KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize());
  const float tiny = std::numeric_limits<float>::epsilon();
  float *frame = signal_frame->data();

  // When the energy is requested but not the raw one, derive it from the
  // already-windowed frame.
  if (opts_.use_energy && !opts_.raw_energy) {
    const float windowed_energy =
        InnerProduct(frame, frame, signal_frame->size());
    signal_raw_log_energy = std::log(std::max<float>(windowed_energy, tiny));
  }

  // In-place FFT followed by in-place conversion to a power spectrum.
  rfft_.Compute(frame);
  ComputePowerSpectrum(signal_frame);
  if (!opts_.use_power) {
    // Magnitude spectrum was requested instead of power.
    Sqrt(frame, signal_frame->size() / 2 + 1);
  }

  // The mel energies (opts_.mel_opts.num_bins of them) start after the
  // energy slot when the energy is stored first.
  const int32_t mel_offset = (opts_.use_energy && !opts_.htk_compat) ? 1 : 0;
  float *mel_energies = feature + mel_offset;
  mel_banks.Compute(frame, mel_energies);

  if (opts_.use_log_fbank) {
    // Clamp before the log so that zero energies do not produce -inf.
    const int32_t num_bins = opts_.mel_opts.num_bins;
    for (int32_t bin = 0; bin != num_bins; ++bin) {
      mel_energies[bin] = std::log(std::max(mel_energies[bin], tiny));
    }
  }

  if (!opts_.use_energy) {
    return;
  }
  // Apply the energy floor (if any) and store the energy in its slot.
  if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
    signal_raw_log_energy = log_energy_floor_;
  }
  const int32_t energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
  feature[energy_index] = signal_raw_log_energy;
 }
 }  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h
@ -0,0 +1,132 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/feature-fbank.h
 #ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
 #define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
 #include <map>
 #include "kaldi-native-fbank/csrc/feature-window.h"
 #include "kaldi-native-fbank/csrc/mel-computations.h"
 #include "kaldi-native-fbank/csrc/rfft.h"
 namespace knf {
 // Options controlling filter-bank (fbank) feature computation.
 struct FbankOptions {
  // Framing and windowing settings.
  FrameExtractionOptions frame_opts;
  // Mel filter-bank settings; the constructor below sets num_bins to 23.
  MelBanksOptions mel_opts;
  // When true, an extra energy dimension is added to the filter banks.
  bool use_energy = false;
  // Only consulted when use_energy == true.
  float energy_floor = 0.0f;
  // Only consulted when use_energy == true.
  //   true:  log-energy is computed before preemphasis and windowing
  //   false: log-energy is computed after preemphasis and windowing
  bool raw_energy = true;
  // Only consulted when use_energy == true.
  //   true:  the energy is stored last
  //   false: the energy is stored first
  bool htk_compat = false;
  // true (default): produce log filter banks; false: linear filter banks.
  bool use_log_fbank = true;
  // true (default): filter-bank analysis on the power spectrum;
  // false: on the magnitude spectrum.
  bool use_power = true;

  FbankOptions() { mel_opts.num_bins = 23; }

  // Returns a multi-line, human-readable dump of all options.
  std::string ToString() const {
    std::ostringstream os;
    os << "frame_opts: \n" << frame_opts << "\n" << "\n";
    os << "mel_opts: \n" << mel_opts << "\n";
    os << "use_energy: " << use_energy << "\n"
       << "energy_floor: " << energy_floor << "\n"
       << "raw_energy: " << raw_energy << "\n"
       << "htk_compat: " << htk_compat << "\n"
       << "use_log_fbank: " << use_log_fbank << "\n"
       << "use_power: " << use_power << "\n";
    return os.str();
  }
 };
 std::ostream &operator<<(std::ostream &os, const FbankOptions &opts);
 // Computes filter-bank features one frame at a time. The MelBanks objects
 // it creates are cached per VTLN warp factor and freed by the destructor.
 // NOTE(review): the class owns raw MelBanks pointers but declares no copy
 // constructor / copy assignment, so a copied instance would delete the same
 // pointers twice -- confirm that no caller copies it.
 class FbankComputer {
 public:
  using Options = FbankOptions;
  explicit FbankComputer(const FbankOptions &opts);
  ~FbankComputer();
  // Number of values written per frame: num_bins, plus one when the energy
  // is included.
  int32_t Dim() const {
    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
  }
  // if true, compute log_energy_pre_window but after dithering and dc removal
  bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
  const FrameExtractionOptions &GetFrameOptions() const {
    return opts_.frame_opts;
  }
  const FbankOptions &GetOptions() const { return opts_; }
  /**
     Function that computes one frame of features from
     one frame of signal.
     @param [in] signal_raw_log_energy The log-energy of the frame of the signal
         prior to windowing and pre-emphasis, or
         log(numeric_limits<float>::min()), whichever is greater.  Must be
         ignored by this function if this class returns false from
         this->NeedRawLogEnergy().
     @param [in] vtln_warp  The VTLN warping factor that the user wants
         to be applied when computing features for this utterance.  Will
         normally be 1.0, meaning no warping is to be done.  The value will
         be ignored for feature types that don't support VTLN, such as
         spectrogram features.
     @param [in] signal_frame  One frame of the signal,
       as extracted using the function ExtractWindow() using the options
       returned by this->GetFrameOptions().  The function will use the
       vector as a workspace, which is why it's a non-const pointer.
     @param [out] feature  Pointer to a vector of size this->Dim(), to which
         the computed feature will be written. It should be pre-allocated.
  */
  void Compute(float signal_raw_log_energy, float vtln_warp,
               std::vector<float> *signal_frame, float *feature);
 private:
  const MelBanks *GetMelBanks(float vtln_warp);
  FbankOptions opts_;
  // Log of opts_.energy_floor; the constructor only assigns it when the floor
  // is positive, so it gets a defined default here instead of being left
  // indeterminate.
  float log_energy_floor_ = 0.0f;
  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
  Rfft rfft_;
 };
 }  // namespace knf
 #endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
@ -0,0 +1,49 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/feature-functions.cc
 #include "kaldi-native-fbank/csrc/feature-functions.h"
 #include <cstdint>
 #include <vector>
 namespace knf {
 // Converts a packed half-complex FFT, stored as
 // [real0, realN/2, real1, im1, real2, im2, ...], into a power spectrum
 // in place. On return the first (n/2) + 1 elements hold the bin energies;
 // the remaining elements are left as they were.
 // A vector with fewer than two elements cannot hold the packed
 // [real0, realN/2] pair, so it is left untouched instead of being read out
 // of bounds.
 void ComputePowerSpectrum(std::vector<float> *complex_fft) {
  int32_t dim = complex_fft->size();
  if (dim < 2) {
    return;
  }
  float *p = complex_fft->data();
  int32_t half_dim = dim / 2;
  // Slots 0 and 1 hold the two purely real bins; save their energies before
  // the loop overwrites the front of the array.
  float first_energy = p[0] * p[0];
  float last_energy = p[1] * p[1];  // handle this special case
  for (int32_t i = 1; i < half_dim; ++i) {
    float real = p[i * 2];
    float im = p[i * 2 + 1];
    p[i] = real * real + im * im;
  }
  p[0] = first_energy;
  p[half_dim] = last_energy;  // Will actually never be used, and anyway
  // if the signal has been bandlimited sensibly this should be zero.
 }
 }  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
@ -0,0 +1,38 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/feature-functions.h
 #ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
 #define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
 #include <vector>
 namespace knf {
 // ComputePowerSpectrum converts a complex FFT (as produced by the FFT
 // functions in csrc/rfft.h), and converts it into
 // a power spectrum.  If the complex FFT is a vector of size n (representing
 // half of the complex FFT of a real signal of size n, as described there),
 // this function computes in the first (n/2) + 1 elements of it, the
 // energies of the fft bins from zero to the Nyquist frequency.  Contents of the
 // remaining (n/2) - 1 elements are undefined at output.
 void ComputePowerSpectrum(std::vector<float> *complex_fft);
 }  // namespace knf
 #endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
@ -0,0 +1,236 @@
 // kaldi-native-fbank/csrc/feature-window.cc
 //
 // Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 // This file is copied/modified from kaldi/src/feat/feature-window.cc
 #include "kaldi-native-fbank/csrc/feature-window.h"
 #include <algorithm>
 #include <cmath>
 #include <limits>
 #include <vector>
 #ifndef M_2PI
 #define M_2PI 6.283185307179586476925286766559005
 #endif
 namespace knf {
 // Streams the text produced by opts.ToString() into os and returns os.
 std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
  return os << opts.ToString();
 }
 // Precomputes the window coefficients for opts.window_type, one per sample
 // of the (unpadded) frame. An unrecognised window type is reported through
 // KNF_LOG(FATAL).
 FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
    : window_(opts.WindowSize()) {
  const int32_t num_points = opts.WindowSize();
  KNF_CHECK_GT(num_points, 0);
  const std::string &type = opts.window_type;
  // Angular step between consecutive samples: 2*pi / (N - 1).
  const double step = M_2PI / (num_points - 1);
  float *coeffs = window_.data();
  for (int32_t k = 0; k < num_points; k++) {
    const double pos = static_cast<double>(k);
    double value;
    if (type == "hanning") {
      value = 0.5 - 0.5 * std::cos(step * pos);
    } else if (type == "sine") {
      // 0.5 * step equals pi / (N - 1), the form usually quoted for the
      // sine window.
      value = std::sin(0.5 * step * pos);
    } else if (type == "hamming") {
      value = 0.54 - 0.46 * std::cos(step * pos);
    } else if (type == "povey") {
      // Like hamming but goes to zero at the edges.
      value = std::pow(0.5 - 0.5 * std::cos(step * pos), 0.85);
    } else if (type == "rectangular") {
      value = 1.0;
    } else if (type == "blackman") {
      value = opts.blackman_coeff - 0.5 * std::cos(step * pos) +
              (0.5 - opts.blackman_coeff) * std::cos(2 * step * pos);
    } else {
      KNF_LOG(FATAL) << "Invalid window type " << opts.window_type;
      continue;
    }
    coeffs[k] = value;
  }
 }
 void FeatureWindowFunction::Apply(float *wave) const {
  int32_t window_size = window_.size();
  const float *p = window_.data();
  for (int32_t k = 0; k != window_size; ++k) {
    wave[k] *= p[k];
  }
 }
 // Returns the index of the first sample of frame number `frame`. With
 // snip_edges the frames start at multiples of the frame shift; otherwise
 // each frame is centred on the midpoint of its shift interval, so the
 // result can be negative.
 int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) {
  const int64_t shift = opts.WindowShift();
  if (!opts.snip_edges) {
    const int64_t center = shift * frame + shift / 2;
    return center - opts.WindowSize() / 2;
  }
  return frame * shift;
 }
 // Returns how many frames can be extracted from num_samples samples.
 int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
                   bool flush /*= true*/) {
  const int64_t shift = opts.WindowShift();
  const int64_t length = opts.WindowSize();
  if (opts.snip_edges) {
    // HTK-like counting: every frame must fit completely inside the
    // waveform and the first frame starts at sample zero.
    // 'num_samples - length' is the room available for shifting the frame
    // and integer division by 'shift' counts the whole shifts that fit.
    return num_samples < length ? 0 : (1 + ((num_samples - length) / shift));
  }
  // Without snip_edges the count is (num_samples / shift) rounded to the
  // nearest integer; half a shift is added because integer division
  // truncates.
  int32_t count = (num_samples + (shift / 2)) / shift;
  if (flush) {
    return count;
  }
  // flush == false: frames reaching past the end of the signal cannot be
  // produced yet. 'last_end' is one past the final sample of the last frame.
  int64_t last_end = FirstSampleOfFrame(count - 1, opts) + length;
  for (; count > 0 && last_end > num_samples; --count) {
    last_end -= shift;
  }
  return count;
 }
 // Extracts frame `f` from `wave` into `window` (resized to
 // opts.PaddedWindowSize() if needed), reflecting samples at the waveform
 // edges where necessary, zeroes the padding past opts.WindowSize(), and
 // then runs ProcessWindow() on the frame.
 //
 // sample_offset is the number of samples that precede wave[0] in the full
 // signal. If log_energy_pre_window is non-null it is forwarded to
 // ProcessWindow().
 void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
                    int32_t f, const FrameExtractionOptions &opts,
                    const FeatureWindowFunction &window_function,
                    std::vector<float> *window,
                    float *log_energy_pre_window /*= nullptr*/) {
  KNF_CHECK(sample_offset >= 0 && wave.size() != 0);
  int32_t frame_length = opts.WindowSize();
  int32_t frame_length_padded = opts.PaddedWindowSize();
  int64_t num_samples = sample_offset + wave.size();
  int64_t start_sample = FirstSampleOfFrame(f, opts);
  int64_t end_sample = start_sample + frame_length;
  if (opts.snip_edges) {
    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
  } else {
    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
  }
  if (static_cast<int64_t>(window->size()) != frame_length_padded) {
    window->resize(frame_length_padded);
  }
  // wave_start and wave_end are start and end indexes into 'wave', for the
  // piece of wave that we're trying to extract.
  int32_t wave_start = int32_t(start_sample - sample_offset);
  int32_t wave_end = wave_start + frame_length;
  if (wave_start >= 0 && wave_end <= static_cast<int64_t>(wave.size())) {
    // the normal case-- no edge effects to consider.
    std::copy(wave.begin() + wave_start,
              wave.begin() + wave_start + frame_length, window->data());
  } else {
    // Deal with any end effects by reflection, if needed.  This code will only
    // be reached for about two frames per utterance, so we don't concern
    // ourselves excessively with efficiency.
    int32_t wave_dim = wave.size();
    for (int32_t s = 0; s < frame_length; ++s) {
      int32_t s_in_wave = s + wave_start;
      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
        // reflect around the beginning or end of the wave.
        // e.g. -1 -> 0, -2 -> 1.
        // dim -> dim - 1, dim + 1 -> dim - 2.
        // the code supports repeated reflections, although this
        // would only be needed in pathological cases.
        if (s_in_wave < 0)
          s_in_wave = -s_in_wave - 1;
        else
          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
      }
      (*window)[s] = wave[s_in_wave];
    }
  }
  // Only the first frame_length samples were written above. Clear the
  // padding so that a reused buffer (for example one that still holds the
  // FFT output of the previous frame) cannot leak stale values into the next
  // transform. This is a no-op when no padding is configured.
  std::fill(window->begin() + frame_length, window->end(), 0.0f);
  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
 }
 // Subtracts the mean of the first n samples of d from each of them.
 static void RemoveDcOffset(float *d, int32_t n) {
  float total = 0;
  for (const float *p = d; p != d + n; ++p) {
    total += *p;
  }
  const float mean = total / n;
  for (float *p = d; p != d + n; ++p) {
    *p -= mean;
  }
 }
 // Returns the sum over k in [0, n) of a[k] * b[k], accumulated in float in
 // index order.
 float InnerProduct(const float *a, const float *b, int32_t n) {
  float acc = 0;
  int32_t k = 0;
  while (k != n) {
    acc += a[k] * b[k];
    ++k;
  }
  return acc;
 }
 // Applies the pre-emphasis filter d[i] -= preemph_coeff * d[i - 1] in place,
 // walking from the last sample to the first so that each step reads the
 // still-unfiltered predecessor; d[0] uses itself as predecessor. Does
 // nothing when the coefficient is zero.
 static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
  if (preemph_coeff == 0.0) {
    return;
  }
  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
  int32_t i = n - 1;
  while (i > 0) {
    d[i] -= preemph_coeff * d[i - 1];
    --i;
  }
  d[0] -= preemph_coeff * d[0];
 }
 // Runs the per-frame processing on the first opts.WindowSize() samples of
 // `window`: optional DC-offset removal, optional log-energy measurement,
 // optional pre-emphasis, then multiplication by the window function.
 // Dithering is not implemented; opts.dither is checked to be 0.
 void ProcessWindow(const FrameExtractionOptions &opts,
                    const FeatureWindowFunction &window_function, float *window,
                    float *log_energy_pre_window /*= nullptr*/) {
  const int32_t num_samples = opts.WindowSize();
  // TODO(fangjun): Remove dither
  KNF_CHECK_EQ(opts.dither, 0);
  if (opts.remove_dc_offset) {
    RemoveDcOffset(window, num_samples);
  }
  if (log_energy_pre_window != nullptr) {
    // Clamp the energy so that the log of an all-zero frame stays finite.
    const float raw_energy = InnerProduct(window, window, num_samples);
    const float clamped_energy =
        std::max<float>(raw_energy, std::numeric_limits<float>::epsilon());
    *log_energy_pre_window = std::log(clamped_energy);
  }
  if (opts.preemph_coeff != 0.0) {
    Preemphasize(window, num_samples, opts.preemph_coeff);
  }
  window_function.Apply(window);
 }
 }  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
@ -0,0 +1,178 @@
 // kaldi-native-fbank/csrc/feature-window.h
 //
 // Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 // This file is copied/modified from kaldi/src/feat/feature-window.h
 #ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
 #define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
 #include <sstream>
 #include <string>
 #include <vector>
 #include "kaldi-native-fbank/csrc/log.h"
 namespace knf {
 // Returns the smallest power of two that is >= n; n must be positive.
 // (Same algorithm as in kaldi/src/base/kaldi-math.cc.)
 inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
  KNF_CHECK_GT(n, 0);
  --n;
  // Smear the highest set bit into every lower position (shifts of
  // 1, 2, 4, 8 and 16); adding one then yields the next power of two.
  for (int32_t shift = 1; shift <= 16; shift *= 2) {
    n |= n >> shift;
  }
  return n + 1;
 }
 // Options describing how a waveform is cut into frames and windowed.
 struct FrameExtractionOptions {
  float samp_freq = 16000;
  float frame_shift_ms = 10.0f;   // Frame shift in milliseconds.
  float frame_length_ms = 25.0f;  // Frame length in milliseconds.
  // Amount of dithering, 0.0 means no dither.
  // NOTE(review): ProcessWindow() in feature-window.cc checks that dither is
  // 0, so the default of 1.0 has to be overridden before frames are
  // processed.
  float dither = 1.0f;
  float preemph_coeff = 0.97f;   // Preemphasis coefficient.
  bool remove_dc_offset = true;  // Subtract mean of wave before FFT.
  // One of "hamming", "rectangular", "povey", "hanning", "sine", "blackman".
  // "povey" is similar to Hamming but goes to zero at the edges:
  // pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85).
  std::string window_type = "povey";
  bool round_to_power_of_two = true;
  float blackman_coeff = 0.42f;
  bool snip_edges = true;
  // bool allow_downsample = false;
  // bool allow_upsample = false;
  // Used for streaming feature extraction. It indicates the number
  // of feature frames to keep in the recycling vector. -1 means to
  // keep all feature frames.
  int32_t max_feature_vectors = -1;

  // Frame shift in samples (truncated towards zero).
  int32_t WindowShift() const {
    const float samples = samp_freq * 0.001f * frame_shift_ms;
    return static_cast<int32_t>(samples);
  }
  // Frame length in samples (truncated towards zero).
  int32_t WindowSize() const {
    const float samples = samp_freq * 0.001f * frame_length_ms;
    return static_cast<int32_t>(samples);
  }
  // Frame length after optional rounding up to a power of two.
  int32_t PaddedWindowSize() const {
    if (round_to_power_of_two) {
      return RoundUpToNearestPowerOfTwo(WindowSize());
    }
    return WindowSize();
  }
  // Returns one "name: value" line per option.
  std::string ToString() const {
    std::ostringstream os;
    os << "samp_freq" << ": " << samp_freq << "\n";
    os << "frame_shift_ms" << ": " << frame_shift_ms << "\n";
    os << "frame_length_ms" << ": " << frame_length_ms << "\n";
    os << "dither" << ": " << dither << "\n";
    os << "preemph_coeff" << ": " << preemph_coeff << "\n";
    os << "remove_dc_offset" << ": " << remove_dc_offset << "\n";
    os << "window_type" << ": " << window_type << "\n";
    os << "round_to_power_of_two" << ": " << round_to_power_of_two << "\n";
    os << "blackman_coeff" << ": " << blackman_coeff << "\n";
    os << "snip_edges" << ": " << snip_edges << "\n";
    // allow_downsample / allow_upsample are not printed (fields disabled).
    os << "max_feature_vectors" << ": " << max_feature_vectors << "\n";
    return os.str();
  }
 };
 std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
 // Holds a table of window coefficients and applies it to signal frames.
 class FeatureWindowFunction {
 public:
  // Leaves the coefficient table empty.
  FeatureWindowFunction() = default;
  // Builds the coefficient table from the given frame-extraction options.
  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
  /**
   * @param wave Pointer to a 1-D array of shape [window_size].
   *             It is modified in-place: wave[i] = wave[i] * window_[i].
   */
  void Apply(float *wave) const;
 private:
  std::vector<float> window_;  // of size opts.WindowSize()
 };
 int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
 /**
   This function returns the number of frames that we can extract from a wave
   file with the given number of samples in it (assumed to have the same
   sampling rate as specified in 'opts').
      @param [in] num_samples  The number of samples in the wave file.
      @param [in] opts     The frame-extraction options class
      @param [in] flush   True if we are asserting that this number of samples
   is 'all there is', false if we are expecting more data to possibly come in.
   This only makes a difference to the answer
   if opts.snip_edges == false.  For offline feature extraction you always want
   flush == true.  In an online-decoding context, once you know (or decide) that
   no more data is coming in, you'd call it with flush == true at the end to
   flush out any remaining data.
 */
 int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
                  bool flush = true);
 /*
  ExtractWindow() extracts a windowed frame of waveform (possibly with a
  power-of-two, padded size, depending on the config), including all the
  processing done by ProcessWindow().
  @param [in] sample_offset  If 'wave' is not the entire waveform, but
                   part of it to the left has been discarded, then the
                   number of samples prior to 'wave' that we have
                   already discarded.  Set this to zero if you are
                   processing the entire waveform in one piece, or
                   if you get 'no matching function' compilation
                   errors when updating the code.
  @param [in] wave  The waveform
  @param [in] f     The frame index to be extracted, with
                    0 <= f < NumFrames(sample_offset + wave.size(), opts, true)
  @param [in] opts  The options class to be used
  @param [in] window_function  The windowing function, as derived from the
                    options class.
  @param [out] window  The windowed, possibly-padded waveform to be
                     extracted.  Will be resized as needed.
  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
                   the signal prior to pre-emphasis and multiplying by
                   the windowing function will be written to here.
 */
 void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
                   int32_t f, const FrameExtractionOptions &opts,
                   const FeatureWindowFunction &window_function,
                   std::vector<float> *window,
                   float *log_energy_pre_window = nullptr);
 /**
  This function does all the windowing steps after actually
  extracting the windowed signal: depending on the
  configuration, it does dithering, dc offset removal,
  preemphasis, and multiplication by the windowing function.
   @param [in] opts  The options class to be used
   @param [in] window_function  The windowing function-- should have
                    been initialized using 'opts'.
   @param [in,out] window  A vector of size opts.WindowSize().  Note:
      it will typically be a sub-vector of a larger vector of size
      opts.PaddedWindowSize(), with the remaining samples zero,
      as the FFT code is more efficient if it operates on data with
      power-of-two size.
   @param [out]   log_energy_pre_window If non-NULL, then after dithering and
      DC offset removal, this function will write to this pointer the log of
      the total energy (i.e. sum-squared) of the frame.
 */
 void ProcessWindow(const FrameExtractionOptions &opts,
                   const FeatureWindowFunction &window_function, float *window,
                   float *log_energy_pre_window = nullptr);
 // Compute the inner product of two vectors
 float InnerProduct(const float *a, const float *b, int32_t n);
 }  // namespace knf
 #endif  // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc
@ -0,0 +1,143 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /*
 * Stack trace related stuff is from kaldi.
 * Refer to
 * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc
 */
 #include "kaldi-native-fbank/csrc/log.h"
 #ifdef KNF_HAVE_EXECINFO_H
 #include <execinfo.h> // To get stack trace in error messages.
 #ifdef KNF_HAVE_CXXABI_H
 #include <cxxabi.h> // For name demangling.
 // Useful to decode the stack trace, but only used if we have execinfo.h
 #endif // KNF_HAVE_CXXABI_H
 #endif // KNF_HAVE_EXECINFO_H
 #include <stdlib.h>
 #include <ctime>
 #include <iomanip>
 #include <string>
 namespace knf {
 // Returns the current local time formatted as "yyyy-mm-dd hh:mm:ss".
 std::string GetDateTimeStr() {
  const std::time_t now = std::time(nullptr);
  const std::tm local_time = *std::localtime(&now);
  std::ostringstream formatted;
  formatted << std::put_time(&local_time, "%F %T");
  return formatted.str();
 }
 // Finds the candidate mangled symbol inside one backtrace line.
 // *begin is set to the index of the first '_' that directly follows a ' '
 // or '(' (std::string::npos when there is none); *end is set to the index
 // of the first ' ' or '+' at or after *begin. Returns true only when both
 // were found; *end is not written when *begin was not found.
 static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin,
                               std::size_t *end) {
  *begin = std::string::npos;
  // Start at index 1: an underscore at index 0 has no preceding character.
  std::size_t pos = trace_name.find('_', 1);
  while (pos != std::string::npos) {
    const char previous = trace_name[pos - 1];
    if (previous == ' ' || previous == '(') {
      *begin = pos;
      break;
    }
    pos = trace_name.find('_', pos + 1);
  }
  if (*begin == std::string::npos) {
    return false;
  }
  *end = trace_name.find_first_of(" +", *begin);
  return *end != std::string::npos;
 }
 #ifdef KNF_HAVE_EXECINFO_H
 // Returns trace_name with its mangled C++ symbol replaced by the demangled
 // form. The line is returned unchanged when cxxabi.h is unavailable, when no
 // symbol can be located, or when demangling fails.
 static std::string Demangle(const std::string &trace_name) {
 #ifndef KNF_HAVE_CXXABI_H
  return trace_name;
 #else  // KNF_HAVE_CXXABI_H
  // Backtrace line formats we try to support:
  //
  // Linux:
  //   ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
  //
  // Mac:
  //   0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813
  //
  // The goal is to extract the symbol, e.g. '_ZN5kaldi13UnitTestErrorEv',
  // and demangle it into a readable name like kaldi::UnitTestError.
  std::size_t first = 0;
  std::size_t last = 0;
  const bool found = LocateSymbolRange(trace_name, &first, &last);
  if (!found) {
    return trace_name;
  }
  std::string name = trace_name.substr(first, last - first);
  int status = 0;
  char *readable = abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status);
  if (status == 0 && readable != nullptr) {
    name = readable;
    free(readable);  // __cxa_demangle hands back a malloc()ed buffer.
  }
  const std::string head = trace_name.substr(0, first);
  const std::string tail = trace_name.substr(last, std::string::npos);
  return head + name + tail;
 #endif // KNF_HAVE_CXXABI_H
 }
 #endif // KNF_HAVE_EXECINFO_H
 // Returns a printable stack trace of the calling thread, one frame per
 // line. The result is empty when KNF_HAVE_EXECINFO_H is not defined or when
 // the frame symbols cannot be obtained.
 std::string GetStackTrace() {
  std::string ans;
 #ifdef KNF_HAVE_EXECINFO_H
  constexpr const std::size_t kMaxTraceSize = 50;
  constexpr const std::size_t kMaxTracePrint = 50; // Must be even.
  void *frames[kMaxTraceSize];  // Buffer for the raw return addresses.
  const std::size_t depth = backtrace(frames, kMaxTraceSize);
  char **symbols = backtrace_symbols(frames, depth);
  if (symbols == nullptr)
    return ans;
  // Appends the demangled frames with indexes in [first, last).
  auto append_frames = [&ans, symbols](std::size_t first, std::size_t last) {
    for (std::size_t i = first; i < last; ++i) {
      ans += Demangle(symbols[i]) + "\n";
    }
  };
  ans += "[ Stack-Trace: ]\n";
  if (depth > kMaxTracePrint) {
    // Too many frames: keep the first and the last kMaxTracePrint / 2.
    append_frames(0, kMaxTracePrint / 2);
    ans += ".\n.\n.\n";
    append_frames(depth - kMaxTracePrint / 2, depth);
    if (depth == kMaxTraceSize)
      ans += ".\n.\n.\n"; // Stack was too long, probably a bug.
  } else {
    append_frames(0, depth);
  }
  // backtrace_symbols() allocates only the pointer array; the strings it
  // points to must not be freed individually.
  free(symbols);
 #endif // KNF_HAVE_EXECINFO_H
  return ans;
 }
 } // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h
@ -0,0 +1,347 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // The content in this file is copied/modified from
 // https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h
 #ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_
 #define KALDI_NATIVE_FBANK_CSRC_LOG_H_
 #include <stdio.h>
 #include <cstdint>
 #include <cstdlib>
 #include <mutex>  // NOLINT
 #include <sstream>
 #include <stdexcept>
 #include <string>
 namespace knf {
 // Compile-time switch for the debug-only macros (KNF_DCHECK*, KNF_DLOG):
 // true when NDEBUG is defined for the including translation unit, false
 // otherwise. Those macros test this constant before evaluating their check.
 #if defined(NDEBUG)
 constexpr bool kDisableDebug = true;
 #else
 constexpr bool kDisableDebug = false;
 #endif
 // Severity of a log message. The numeric values increase with severity;
 // Logger emits a message only when the level returned by
 // GetCurrentLogLevel() compares <= the level of the message.
 enum class LogLevel {
  kTrace = 0,
  kDebug = 1,
  kInfo = 2,
  kWarning = 3,
  kError = 4,
  kFatal = 5,  // print message and abort the program
 };
 // They are used in KNF_LOG(xxx), so their names
 // do not follow the google c++ code style
 //
 // You can use them in the following way:
 //
 //  KNF_LOG(TRACE) << "some message";
 //  KNF_LOG(DEBUG) << "some message";
 // Short aliases for the LogLevel enumerators. KNF_LOG(x) and the KNF_CHECK
 // macros refer to them as ::knf::x.
 #ifndef _MSC_VER
 constexpr LogLevel TRACE = LogLevel::kTrace;
 constexpr LogLevel DEBUG = LogLevel::kDebug;
 constexpr LogLevel INFO = LogLevel::kInfo;
 constexpr LogLevel WARNING = LogLevel::kWarning;
 constexpr LogLevel ERROR = LogLevel::kError;
 constexpr LogLevel FATAL = LogLevel::kFatal;
 #else
 // With MSVC the aliases are macros instead of constants, so `::knf::INFO`
 // expands to `::knf::LogLevel::kInfo`.
 // NOTE(review): presumably this avoids a clash with macros such as ERROR
 // defined by Windows headers - confirm.
 #define TRACE LogLevel::kTrace
 #define DEBUG LogLevel::kDebug
 #define INFO LogLevel::kInfo
 #define WARNING LogLevel::kWarning
 #define ERROR LogLevel::kError
 #define FATAL LogLevel::kFatal
 #endif
 // Returns a printable stack trace. The result may be empty; the Logger
 // destructor checks for that before printing it.
 std::string GetStackTrace();
 /* Return the current log level.
   If the current log level is TRACE, then all logged messages are printed out.
   If the current log level is DEBUG, log messages with "TRACE" level are not
   shown and all other levels are printed out.
   Similarly, if the current log level is INFO, log message with "TRACE" and
   "DEBUG" are not shown and all other levels are printed out.
   If it is FATAL, then only FATAL messages are shown.
 */
 // Returns the active log level. The environment variable KNF_LOG_LEVEL is
 // read exactly once (guarded by std::call_once); later calls return the
 // cached value. When the variable is unset or holds an unrecognized name the
 // level stays at INFO; an unrecognized name is also reported on stderr.
 inline LogLevel GetCurrentLogLevel() {
  static LogLevel log_level = INFO;
  static std::once_flag init_flag;
  std::call_once(init_flag, []() {
    const char *env_log_level = std::getenv("KNF_LOG_LEVEL");
    if (env_log_level == nullptr) return;
    std::string s = env_log_level;
    if (s == "TRACE")
      log_level = TRACE;
    else if (s == "DEBUG")
      log_level = DEBUG;
    else if (s == "INFO")
      log_level = INFO;
    else if (s == "WARNING")
      log_level = WARNING;
    else if (s == "ERROR")
      log_level = ERROR;
    else if (s == "FATAL")
      log_level = FATAL;
    else
      // Terminate the diagnostic with a newline so that it does not run into
      // the log record that triggered this initialization.
      fprintf(stderr,
              "Unknown KNF_LOG_LEVEL: %s"
              "\nSupported values are: "
              "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL\n",
              s.c_str());
  });
  return log_level;
 }
 // Reports whether the environment variable KNF_ABORT is set (to any value).
 // The environment is inspected only on the first call; the answer is cached.
 inline bool EnableAbort() {
  static bool abort_requested = false;
  static std::once_flag env_checked;
  std::call_once(env_checked, []() {
    const char *value = std::getenv("KNF_ABORT");
    abort_requested = (value != nullptr);
  });
  return abort_requested;
 }
 // Writes one log record to stderr.
 //
 // The constructor prints the level tag and the source location, the
 // operator<< overloads print the message pieces, and the destructor
 // terminates the record. Nothing is printed when the message level is below
 // the level returned by GetCurrentLogLevel().
 //
 // A FATAL record additionally prints a stack trace and then, from the
 // destructor, either calls abort() or throws std::runtime_error (see
 // ~Logger()).
 class Logger {
 public:
  // @param filename   Source file of the log statement (e.g. __FILE__).
  // @param func_name  Enclosing function (e.g. KNF_FUNC).
  // @param line_num   Source line (e.g. __LINE__).
  // @param level      Severity of this record.
  Logger(const char *filename, const char *func_name, uint32_t line_num,
         LogLevel level)
      : filename_(filename),
        func_name_(func_name),
        line_num_(line_num),
        level_(level) {
    cur_level_ = GetCurrentLogLevel();
    switch (level) {
      case TRACE:
        if (cur_level_ <= TRACE) fprintf(stderr, "[T] ");
        break;
      case DEBUG:
        if (cur_level_ <= DEBUG) fprintf(stderr, "[D] ");
        break;
      case INFO:
        if (cur_level_ <= INFO) fprintf(stderr, "[I] ");
        break;
      case WARNING:
        if (cur_level_ <= WARNING) fprintf(stderr, "[W] ");
        break;
      case ERROR:
        if (cur_level_ <= ERROR) fprintf(stderr, "[E] ");
        break;
      case FATAL:
        if (cur_level_ <= FATAL) fprintf(stderr, "[F] ");
        break;
    }
    if (cur_level_ <= level_) {
      fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name);
    }
  }
  // Ends the record. For FATAL records this destructor does not return
  // normally: it prints the stack trace, flushes all streams and then
  //  - on Android: calls abort();
  //  - elsewhere: calls abort() if EnableAbort() is true, otherwise throws
  //    std::runtime_error (hence noexcept(false)).
  ~Logger() noexcept(false) {
    static constexpr const char *kErrMsg = R"(
    Some bad things happened. Please read the above error messages and stack
    trace. If you are using Python, the following command may be helpful:
      gdb --args python /path/to/your/code.py
    (You can use `gdb` to debug the code. Please consider compiling
    a debug version of KNF.).
    If you are unable to fix it, please open an issue at:
      https://github.com/csukuangfj/kaldi-native-fbank/issues/new
    )";
    // Only terminate records that were actually printed; a suppressed record
    // must not leave a blank line on stderr.
    if (cur_level_ <= level_) {
      fprintf(stderr, "\n");
    }
    if (level_ == FATAL) {
      std::string stack_trace = GetStackTrace();
      if (!stack_trace.empty()) {
        fprintf(stderr, "\n\n%s\n", stack_trace.c_str());
      }
      fflush(nullptr);
 #ifndef __ANDROID_API__
      if (EnableAbort()) {
        // NOTE: abort() will terminate the program immediately without
        // printing the Python stack backtrace.
        abort();
      }
      throw std::runtime_error(kErrMsg);
 #else
      abort();
 #endif
    }
  }
  // Each overload below prints its argument only when the record is enabled
  // and returns *this so that calls can be chained.
  const Logger &operator<<(bool b) const {
    if (cur_level_ <= level_) {
      fprintf(stderr, b ? "true" : "false");
    }
    return *this;
  }
  const Logger &operator<<(int8_t i) const {
    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
    return *this;
  }
  const Logger &operator<<(const char *s) const {
    if (cur_level_ <= level_) fprintf(stderr, "%s", s);
    return *this;
  }
  const Logger &operator<<(int32_t i) const {
    if (cur_level_ <= level_) fprintf(stderr, "%d", i);
    return *this;
  }
  const Logger &operator<<(uint32_t i) const {
    if (cur_level_ <= level_) fprintf(stderr, "%u", i);
    return *this;
  }
  const Logger &operator<<(uint64_t i) const {
    if (cur_level_ <= level_)
      fprintf(stderr, "%llu", (long long unsigned int)i);  // NOLINT
    return *this;
  }
  const Logger &operator<<(int64_t i) const {
    if (cur_level_ <= level_)
      fprintf(stderr, "%lli", (long long int)i);  // NOLINT
    return *this;
  }
  const Logger &operator<<(float f) const {
    if (cur_level_ <= level_) fprintf(stderr, "%f", f);
    return *this;
  }
  const Logger &operator<<(double d) const {
    if (cur_level_ <= level_) fprintf(stderr, "%f", d);
    return *this;
  }
  // Fallback for any type that can be streamed into a std::ostream: the value
  // is formatted through a std::ostringstream and forwarded to the
  // `const char *` overload.
  template <typename T>
  const Logger &operator<<(const T &t) const {
    // require T overloads operator<<
    std::ostringstream os;
    os << t;
    return *this << os.str().c_str();
  }
  // specialization to fix compile error: `stringstream << nullptr` is ambiguous
  const Logger &operator<<(const std::nullptr_t &null) const {
    if (cur_level_ <= level_) *this << "(null)";
    return *this;
  }
 private:
  const char *filename_;
  const char *func_name_;
  uint32_t line_num_;
  LogLevel level_;      // severity of this record
  LogLevel cur_level_;  // value of GetCurrentLogLevel() at construction
 };
 // Helper for the check/log macros: `Voidifier() & logger_expression` has
 // type void, which lets a Logger chain sit in the else-branch of a
 // `cond ? (void)0 : ...` expression. operator& is used because it binds less
 // tightly than operator<<, so the whole `Logger(...) << a << b` chain is
 // evaluated before it is discarded.
 class Voidifier {
 public:
  void operator&(const Logger &)const {}
 };
 }  // namespace knf
 // KNF_FUNC expands to the name of the enclosing function and is passed to
 // Logger as the `func_name` argument by the macros below.
 #if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \
    defined(__PRETTY_FUNCTION__)
 // for clang and GCC
 #define KNF_FUNC __PRETTY_FUNCTION__
 #else
 // for other compilers
 #define KNF_FUNC __func__
 #endif
 // Compile-time assertion with an empty message.
 #define KNF_STATIC_ASSERT(x) static_assert(x, "")
 // Runtime assertion. When `x` is false a FATAL Logger record starting with
 // "Check failed: <text of x> " is created; further details can be appended
 // with operator<<, e.g. KNF_CHECK(p != nullptr) << "extra info";
 // When `x` is true nothing is evaluated beyond `x` itself.
 #define KNF_CHECK(x)                                                  \
  (x) ? (void)0                                                       \
      : ::knf::Voidifier() &                                          \
            ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
                << "Check failed: " << #x << " "
 // WARNING: x and y may be evaluated multiple times, but this happens only
 // when the check fails. Since the program aborts if it fails, we don't think
 // the extra evaluation of x and y matters.
 //
 // CAUTION: we recommend the following use case:
 //
 //      auto x = Foo();
 //      auto y = Bar();
 //      KNF_CHECK_EQ(x, y) << "Some message";
 //
 //  And please avoid
 //
 //      KNF_CHECK_EQ(Foo(), Bar());
 //
 //  if `Foo()` or `Bar()` causes some side effects, e.g., changing some
 //  local static variables or global variables.
 // Implementation of the binary comparison checks below. When `(x) op (y)` is
 // false a FATAL Logger record is created that contains the source text of
 // both operands and of the operator, followed by the two operand values
 // (which is where `x` and `y` are evaluated a second time).
 #define _KNF_CHECK_OP(x, y, op)                                              \
  ((x)op(y)) ? (void)0                                                       \
             : ::knf::Voidifier() &                                          \
                   ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \
                       << "Check failed: " << #x << " " << #op << " " << #y  \
                       << " (" << (x) << " vs. " << (y) << ") "
 // Comparison checks: EQ ==, NE !=, LT <, LE <=, GT >, GE >=.
 #define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==)
 #define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=)
 #define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <)
 #define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=)
 #define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >)
 #define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=)
 // Creates a Logger record at level `x`, where `x` is one of TRACE, DEBUG,
 // INFO, WARNING, ERROR or FATAL, e.g. KNF_LOG(INFO) << "message";
 #define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x)
 // ------------------------------------------------------------
 //       For debug check
 // ------------------------------------------------------------
 // If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank,
 // the following macros short-circuit to `(void)0` and do nothing.
 // Debug-only counterparts of KNF_CHECK*, and of KNF_LOG. Each one tests
 // ::knf::kDisableDebug first: when it is true the check/log expression is
 // still compiled but never evaluated.
 #define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x)
 #define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y)
 #define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y)
 #define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y)
 #define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y)
 #define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y)
 #define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y)
 // The Voidifier is required here because KNF_LOG(x) alone yields a Logger,
 // while the other branch of the conditional is void.
 #define KNF_DLOG(x) \
  ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x)
 #endif  // KALDI_NATIVE_FBANK_CSRC_LOG_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc
@ -0,0 +1,256 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/mel-computations.cc
 #include "kaldi-native-fbank/csrc/mel-computations.h"
 #include <algorithm>
 #include <sstream>
 #include "kaldi-native-fbank/csrc/feature-window.h"
 namespace knf {
 // Streams the multi-line text produced by MelBanksOptions::ToString().
 std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) {
  return os << opts.ToString();
 }
 float MelBanks::VtlnWarpFreq(
    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
    float vtln_high_cutoff,
    float low_freq,  // upper+lower frequency cutoffs in mel computation
    float high_freq, float vtln_warp_factor, float freq) {
  /// VTLN warping function F(freq). It is not HTK's function, but takes
  /// similar inputs and never produces empty bins.
  ///
  /// F is continuous and piecewise linear on [low_freq, high_freq] with
  ///   F(low_freq) == low_freq and F(high_freq) == high_freq,
  /// and has two inflection points l < h (in unwarped frequency):
  ///   - between them, F(f) = f / vtln_warp_factor;
  ///   - h satisfies max(h, F(h)) == vtln_high_cutoff, hence
  ///       h = vtln_high_cutoff * min(1, vtln_warp_factor);
  ///   - l satisfies min(l, F(l)) == vtln_low_cutoff, hence
  ///       l = vtln_low_cutoff * max(1, vtln_warp_factor).
  /// Frequencies outside [low_freq, high_freq] are returned unchanged.
  const bool out_of_range = (freq < low_freq) || (freq > high_freq);
  if (out_of_range) {
    return freq;
  }
  KNF_CHECK_GT(vtln_low_cutoff, low_freq);
  KNF_CHECK_LT(vtln_high_cutoff, high_freq);
  const float unity = 1.0f;
  // The names `l` and `h` are kept because the check below prints them.
  const float l = vtln_low_cutoff * std::max(unity, vtln_warp_factor);
  const float h = vtln_high_cutoff * std::min(unity, vtln_warp_factor);
  const float center_slope = 1.0f / vtln_warp_factor;
  const float warped_l = center_slope * l;  // F(l)
  const float warped_h = center_slope * h;  // F(h)
  KNF_CHECK(l > low_freq && h < high_freq);
  if (freq < l) {
    // Left segment: joins (low_freq, low_freq) with (l, F(l)).
    const float left_slope = (warped_l - low_freq) / (l - low_freq);
    return low_freq + left_slope * (freq - low_freq);
  }
  if (freq < h) {
    // Center segment.
    return center_slope * freq;
  }
  // Right segment (freq >= h): joins (h, F(h)) with (high_freq, high_freq).
  const float right_slope = (high_freq - warped_h) / (high_freq - h);
  return high_freq + right_slope * (freq - high_freq);
 }
 // Mel-domain wrapper around VtlnWarpFreq(): converts `mel_freq` with
 // InverseMelScale(), warps the result, and converts back with MelScale().
 float MelBanks::VtlnWarpMelFreq(
    float vtln_low_cutoff,  // upper+lower frequency cutoffs for VTLN.
    float vtln_high_cutoff,
    float low_freq,  // upper+lower frequency cutoffs in mel computation
    float high_freq, float vtln_warp_factor, float mel_freq) {
  const float linear_freq = InverseMelScale(mel_freq);
  const float warped_freq =
      VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
                   vtln_warp_factor, linear_freq);
  return MelScale(warped_freq);
 }
 // Builds the triangular mel filters.
 //
 // @param opts             Number of bins, frequency range, VTLN cutoffs and
 //                         the debug/HTK flags.
 // @param frame_opts       Supplies the sampling frequency and the padded
 //                         window size.
 // @param vtln_warp_factor VTLN warp factor; 1.0 disables warping.
 //
 // Invalid options are reported through KNF_LOG(FATAL) / KNF_CHECK.
 MelBanks::MelBanks(const MelBanksOptions &opts,
                   const FrameExtractionOptions &frame_opts,
                   float vtln_warp_factor)
    : htk_mode_(opts.htk_mode) {
  int32_t num_bins = opts.num_bins;
  if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins";
  float sample_freq = frame_opts.samp_freq;
  int32_t window_length_padded = frame_opts.PaddedWindowSize();
  KNF_CHECK_EQ(window_length_padded % 2, 0);
  int32_t num_fft_bins = window_length_padded / 2;
  float nyquist = 0.5f * sample_freq;
  // A non-positive opts.high_freq is an offset that is added to the Nyquist
  // frequency.
  float low_freq = opts.low_freq, high_freq;
  if (opts.high_freq > 0.0f)
    high_freq = opts.high_freq;
  else
    high_freq = nyquist + opts.high_freq;
  if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
      high_freq > nyquist || high_freq <= low_freq) {
    KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
                   << " and high-freq " << high_freq << " vs. nyquist "
                   << nyquist;
  }
  float fft_bin_width = sample_freq / window_length_padded;
  // fft-bin width [think of it as Nyquist-freq / half-window-length]
  float mel_low_freq = MelScale(low_freq);
  float mel_high_freq = MelScale(high_freq);
  debug_ = opts.debug_mel;
  // divide by num_bins+1 in next line because of end-effects where the bins
  // spread out to the sides.
  float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);
  // A negative opts.vtln_high is likewise an offset from the Nyquist
  // frequency.
  float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high;
  if (vtln_high < 0.0f) {
    vtln_high += nyquist;
  }
  // The VTLN cutoffs are validated only when warping is actually requested.
  if (vtln_warp_factor != 1.0f &&
      (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq ||
       vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) {
    KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low
                   << " and vtln-high " << vtln_high << ", versus "
                   << "low-freq " << low_freq << " and high-freq " << high_freq;
  }
  bins_.resize(num_bins);
  center_freqs_.resize(num_bins);
  for (int32_t bin = 0; bin < num_bins; ++bin) {
    // Edges and center of this triangle on the mel scale; neighbouring
    // triangles overlap by one mel_freq_delta.
    float left_mel = mel_low_freq + bin * mel_freq_delta,
          center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
          right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
    if (vtln_warp_factor != 1.0f) {
      left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                 vtln_warp_factor, left_mel);
      center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                   vtln_warp_factor, center_mel);
      right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
                                  vtln_warp_factor, right_mel);
    }
    center_freqs_[bin] = InverseMelScale(center_mel);
    // this_bin will be a vector of coefficients that is only
    // nonzero where this mel bin is active.
    std::vector<float> this_bin(num_fft_bins);
    int32_t first_index = -1, last_index = -1;
    for (int32_t i = 0; i < num_fft_bins; ++i) {
      float freq = (fft_bin_width * i);  // Center frequency of this fft
                                         // bin.
      float mel = MelScale(freq);
      if (mel > left_mel && mel < right_mel) {
        // Linear ramp up to the center, linear ramp down after it.
        float weight;
        if (mel <= center_mel)
          weight = (mel - left_mel) / (center_mel - left_mel);
        else
          weight = (right_mel - mel) / (right_mel - center_mel);
        this_bin[i] = weight;
        if (first_index == -1) first_index = i;
        last_index = i;
      }
    }
    // Fails when no fft bin falls inside the triangle.
    KNF_CHECK(first_index != -1 && last_index >= first_index &&
              "You may have set num_mel_bins too large.");
    // Store only the [first_index, last_index] slice of the weights together
    // with its offset.
    bins_[bin].first = first_index;
    int32_t size = last_index + 1 - first_index;
    bins_[bin].second.insert(bins_[bin].second.end(),
                             this_bin.begin() + first_index,
                             this_bin.begin() + first_index + size);
    // Replicate a bug in HTK, for testing purposes.
    if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) {
      bins_[bin].second[0] = 0.0;
    }
  }  // for (int32_t bin = 0; bin < num_bins; ++bin) {
  // Optionally dump every filter through the INFO log.
  if (debug_) {
    std::ostringstream os;
    for (size_t i = 0; i < bins_.size(); i++) {
      os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
      for (auto k : bins_[i].second) os << k << ", ";
      os << "\n";
    }
    KNF_LOG(INFO) << os.str();
  }
 }
 // "power_spectrum" contains fft energies.
 // Applies every mel filter to `power_spectrum` and writes one energy per
 // filter to `mel_energies_out`.
 void MelBanks::Compute(const float *power_spectrum,
                       float *mel_energies_out) const {
  const int32_t num_bins = bins_.size();
  for (int32_t bin = 0; bin < num_bins; ++bin) {
    const std::vector<float> &weights = bins_[bin].second;
    // First spectrum element covered by this filter.
    const float *spectrum = power_spectrum + bins_[bin].first;
    float energy = 0;
    for (size_t k = 0; k < weights.size(); ++k) {
      energy += weights[k] * spectrum[k];
    }
    // HTK-like flooring- for testing purposes (we prefer dither)
    if (htk_mode_ && energy < 1.0) energy = 1.0;
    mel_energies_out[bin] = energy;
    // NaN is the only value that differs from itself, so this detects a NaN
    // energy early (it was added because of a past OpenBlas bug).
    KNF_CHECK_EQ(energy, energy);
  }
  if (!debug_) {
    return;
  }
  fprintf(stderr, "MEL BANKS:\n");
  for (int32_t bin = 0; bin < num_bins; ++bin) {
    fprintf(stderr, " %f", mel_energies_out[bin]);
  }
  fprintf(stderr, "\n");
 }
 }  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h
@ -0,0 +1,115 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 // This file is copied/modified from kaldi/src/feat/mel-computations.h
 #ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
 #define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
 #include <cmath>
 #include <string>
 #include "kaldi-native-fbank/csrc/feature-window.h"
 namespace knf {
 // Configuration of the mel filter bank.
 struct MelBanksOptions {
  // Number of triangular bins, e.g. 25.
  int32_t num_bins = 25;
  // Lower frequency cutoff, e.g. 20.
  float low_freq = 20;
  // Upper frequency cutoff: 0 means no cutoff, a negative value is added to
  // the Nyquist frequency to obtain the cutoff.
  float high_freq = 0;
  // Lower cutoff of the VTLN warping function.
  float vtln_low = 100;
  // Upper cutoff of the VTLN warping function; a negative value is added to
  // the Nyquist frequency to obtain the cutoff.
  float vtln_high = -500;
  bool debug_mel = false;
  // "Hidden" config that does not show up on the command line. Enables more
  // exact compatibility with HTK, for testing purposes: it affects the
  // mel-energy flooring and reproduces a bug in HTK.
  bool htk_mode = false;
  // Returns all options as "name: value" pairs, one per line.
  std::string ToString() const {
    std::ostringstream text;
    text << "num_bins: " << num_bins << "\n"
         << "low_freq: " << low_freq << "\n"
         << "high_freq: " << high_freq << "\n"
         << "vtln_low: " << vtln_low << "\n"
         << "vtln_high: " << vtln_high << "\n"
         << "debug_mel: " << debug_mel << "\n"
         << "htk_mode: " << htk_mode << "\n";
    return text.str();
  }
 };
 // Writes opts.ToString() to `os` and returns `os`.
 std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);
 // A bank of triangular mel filters that maps an FFT power spectrum to
 // per-bin mel energies.
 class MelBanks {
 public:
  // Converts a mel-scale value back to a linear frequency (inverse of
  // MelScale()).
  static inline float InverseMelScale(float mel_freq) {
    return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f);
  }
  // Converts a linear frequency to the mel scale:
  // 1127 * ln(1 + freq / 700).
  static inline float MelScale(float freq) {
    return 1127.0f * logf(1.0f + freq / 700.0f);
  }
  // Piecewise-linear VTLN warp of `freq`; frequencies outside
  // [low_freq, high_freq] are returned unchanged.
  static float VtlnWarpFreq(
      float vtln_low_cutoff,
      float vtln_high_cutoff,  // discontinuities in warp func
      float low_freq,
      float high_freq,  // upper+lower frequency cutoffs in
      // the mel computation
      float vtln_warp_factor, float freq);
  // Same as VtlnWarpFreq(), but input and output are on the mel scale.
  static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff,
                               float low_freq, float high_freq,
                               float vtln_warp_factor, float mel_freq);
  // TODO(fangjun): Remove vtln_warp_factor
  MelBanks(const MelBanksOptions &opts,
           const FrameExtractionOptions &frame_opts, float vtln_warp_factor);
  /// Compute Mel energies (note: not log energies).
  /// At input, "fft_energies" contains the FFT energies (not log).
  ///
  /// @param fft_energies 1-D array of size num_fft_bins/2+1
  /// @param mel_energies_out  1-D array of size num_mel_bins
  // NOTE(review): the constructor only assigns filter weights to spectrum
  // indices below PaddedWindowSize() / 2 - confirm what "num_fft_bins" means
  // in the size stated above.
  void Compute(const float *fft_energies, float *mel_energies_out) const;
  // Number of mel bins (size of bins_).
  int32_t NumBins() const { return bins_.size(); }
 private:
  // center frequencies of bins, numbered from 0 ... num_bins-1.
  // NOTE(review): this class declares no GetCenterFreqs(); the vector is
  // filled by the constructor but not exposed here.
  std::vector<float> center_freqs_;
  // the "bins_" vector is a vector, one for each bin, of a pair:
  // (the first nonzero fft-bin), (the vector of weights).
  std::vector<std::pair<int32_t, std::vector<float>>> bins_;
  // TODO(fangjun): Remove debug_ and htk_mode_
  bool debug_;     // copy of MelBanksOptions::debug_mel
  bool htk_mode_;  // copy of MelBanksOptions::htk_mode
 };
 }  // namespace knf
 #endif  // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.cc
@ -0,0 +1,66 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "kaldi-native-fbank/csrc/rfft.h"
 #include <cmath>
 #include <vector>
 #include "kaldi-native-fbank/csrc/log.h"
 // see fftsg.c
 // Declaration of the real-FFT routine (see fftsg.c). It is declared with C
 // linkage when this file is compiled as C++.
 //   n    - transform length
 //   isgn - transform direction; RfftImpl passes 1 for the forward transform
 //   a    - input/output data, transformed in place
 //   ip   - integer work area
 //   w    - double work area
 #ifdef __cplusplus
 extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
 #else
 void rdft(int n, int isgn, double *a, int *ip, double *w);
 #endif
 namespace knf {
 // Implementation of Rfft on top of rdft(). It owns the two work areas that
 // rdft() needs.
 class Rfft::RfftImpl {
 public:
  // @param n Transform length; checked to have at most one bit set.
  explicit RfftImpl(int32_t n) : n_(n), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {
    KNF_CHECK_EQ(n & (n - 1), 0);
  }
  // Single-precision entry point: widens the samples to double, runs the
  // double-precision transform, and narrows the result back in place.
  void Compute(float *in_out) {
    std::vector<double> buffer(n_);
    for (int32_t i = 0; i < n_; ++i) {
      buffer[i] = in_out[i];
    }
    Compute(buffer.data());
    for (int32_t i = 0; i < n_; ++i) {
      in_out[i] = static_cast<float>(buffer[i]);
    }
  }
  // In-place forward transform of n_ doubles.
  void Compute(double *in_out) {
    const int kForward = 1;  // 1 means forward fft
    rdft(n_, kForward, in_out, ip_.data(), w_.data());
  }
 private:
  int32_t n_;                // transform length
  std::vector<int32_t> ip_;  // integer work area for rdft()
  std::vector<double> w_;    // double work area for rdft()
 };
 // The public Rfft class only forwards to RfftImpl (pimpl idiom).
 Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
 // Defined here, where RfftImpl is a complete type, so that
 // std::unique_ptr<RfftImpl> can destroy it.
 Rfft::~Rfft() = default;
 void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
 void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
 }  // namespace knf
--- a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
@ -0,0 +1,56 @@
 /**
 * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
 *
 * See LICENSE for clarification regarding multiple authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
 #define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
 #include <memory>
 namespace knf {
 // n-point Real discrete Fourier transform
 // where n is a power of 2. n >= 2
 //
 //  R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
 //  I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
 // In-place real FFT. The implementation is hidden behind RfftImpl so that
 // this header does not expose the underlying FFT routine.
 class Rfft {
 public:
  // @param n Number of fft bins. it should be a power of 2.
  explicit Rfft(int32_t n);
  ~Rfft();
  /** @param in_out A 1-D array of size n.
   *             On return:
   *               in_out[0] = R[0]
   *               in_out[1] = R[n/2]
   *               for 1 <= k < n/2,
   *                 in_out[2*k] = R[k]
   *                 in_out[2*k+1] = I[k]
   *
   */
  void Compute(float *in_out);
  // Double-precision overload; same in/out layout as above.
  void Compute(double *in_out);
 private:
  class RfftImpl;
  std::unique_ptr<RfftImpl> impl_;
 };
 }  // namespace knf
 #endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
@ -1,111 +0,0 @@
 # checkout the thirdparty/kaldi/base/kaldi-types.h
 # compile kaldi without openfst
 # NOTE: add_definitions() is directory-scoped, so the define below applies to
 # every target created in this directory.
 add_definitions("-DCOMPILE_WITHOUT_OPENFST")
 # Copy the Kaldi sources (base, feat, matrix, util) from the speechx tree
 # next to this file. The copy is skipped when a `base` directory already
 # exists here; only `base` is tested, the other three are assumed to follow.
 if ((NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base))
    file(COPY ../../../../speechx/speechx/kaldi/base DESTINATION ${CMAKE_CURRENT_LIST_DIR})
    file(COPY ../../../../speechx/speechx/kaldi/feat DESTINATION ${CMAKE_CURRENT_LIST_DIR})
    file(COPY ../../../../speechx/speechx/kaldi/matrix DESTINATION ${CMAKE_CURRENT_LIST_DIR})
    file(COPY ../../../../speechx/speechx/kaldi/util DESTINATION ${CMAKE_CURRENT_LIST_DIR})
 endif()
 # Static libraries built from the copied Kaldi sources. Every library exports
 # this directory as a PUBLIC include path, and each one links PUBLIC against
 # the libraries it builds on:
 #   kaldi-base <- kaldi-matrix <- kaldi-util <- kaldi-feat-common
 #   kaldi-feat-common <- kaldi-mfcc, kaldi-fbank

 # kaldi-base: no dependency on the other Kaldi libraries
 add_library(kaldi-base STATIC
  base/io-funcs.cc
  base/kaldi-error.cc
  base/kaldi-math.cc
  base/kaldi-utils.cc
  base/timer.cc
 )
 target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 # kaldi-matrix: needs kaldi-base and OpenBLAS
 add_library(kaldi-matrix STATIC
  matrix/compressed-matrix.cc
  matrix/matrix-functions.cc
  matrix/kaldi-matrix.cc
  matrix/kaldi-vector.cc
  matrix/optimization.cc
  matrix/packed-matrix.cc
  matrix/qr.cc
  matrix/sparse-matrix.cc
  matrix/sp-matrix.cc
  matrix/srfft.cc
  matrix/tp-matrix.cc
 )
 target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 # The OpenBLAS dependency is named `libopenblas` outside MSVC and `openblas`
 # with MSVC.
 if (NOT MSVC)
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
 else()
    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
 endif()
 # kaldi-util: needs kaldi-base and kaldi-matrix
 add_library(kaldi-util STATIC
  util/kaldi-holder.cc
  util/kaldi-io.cc
  util/kaldi-semaphore.cc
  util/kaldi-table.cc
  util/kaldi-thread.cc
  util/parse-options.cc
  util/simple-io-funcs.cc
  util/simple-options.cc
  util/text-utils.cc
 )
 target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
 # kaldi-feat-common: feature sources shared by kaldi-mfcc and kaldi-fbank
 add_library(kaldi-feat-common STATIC
  feat/cmvn.cc
  feat/feature-functions.cc
  feat/feature-window.cc
  feat/mel-computations.cc
  feat/pitch-functions.cc
  feat/resample.cc
  feat/signal.cc
  feat/wave-reader.cc
 )
 target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
 # kaldi-mfcc: built from feat/feature-mfcc.cc
 add_library(kaldi-mfcc STATIC
  feat/feature-mfcc.cc
 )
 target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
 # kaldi-fbank: built from feat/feature-fbank.cc
 add_library(kaldi-fbank STATIC
  feat/feature-fbank.cc
 )
 target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
 # Paths of the static archives produced above, as they are expected to
 # appear in this directory's binary dir. The `lib*.a` names are hard-coded,
 # so they only match toolchains that use that naming scheme.
 set(KALDI_LIBRARIES
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
  ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
 )
 # `libkaldi` is the single INTERFACE target that consumers link against.
 add_library(libkaldi INTERFACE)
 # The archives are referenced by file path on some platforms below, which
 # creates no build-order edge; add_dependencies() makes sure they are built
 # first.
 add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
 target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 # Per-platform link line:
 #   APPLE - archive paths plus libopenblas and the gfortran runtime files
 #           found under GFORTRAN_LIBRARIES_DIR (set outside this file)
 #   MSVC  - the CMake targets themselves plus openblas
 #   other - archive paths wrapped in --whole-archive and a
 #           --start-group/--end-group pair, plus libopenblas.a and gfortran
 if (APPLE)
    target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib)
 elseif (MSVC)
    target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas)
 else()
    target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group)
 endif()
 # Propagate the define to consumers (CMake strips the leading -D here).
 target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
--- a/audio/setup.py
+++ b/audio/setup.py
@ -51,8 +51,7 @@ base = [
 ]
 requirements = {
-    "install":
+    "install": base,
    base,
    "develop": [
        "sox",
        "soxbindings",
@ -60,6 +59,7 @@ requirements = {
    ],
 }
 def check_call(cmd: str, shell=False, executable=None):
    try:
        sp.check_call(
@ -92,6 +92,7 @@ def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False):
            file=sys.stderr)
    return out_bytes.strip().decode('utf8')
 def _run_cmd(cmd):
    try:
        return subprocess.check_output(
@ -100,6 +101,7 @@ def _run_cmd(cmd):
    except Exception:
        return None
@contextlib.contextmanager
 def pushd(new_dir):
    old_dir = os.getcwd()
@ -109,22 +111,26 @@ def pushd(new_dir):
    os.chdir(old_dir)
    print(old_dir)
 def read(*names, **kwargs):
    with io.open(
            os.path.join(os.path.dirname(__file__), *names),
            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()
 def _remove(files: str):
    for f in files:
        f.unlink()
 ################################# Install ##################################
 def _post_install(install_lib_dir):
    pass
 class DevelopCommand(develop):
    def run(self):
        develop.run(self)
@ -188,6 +194,7 @@ def _make_version_file(version, sha):
    with open(version_path, "a") as f:
        f.write(f"__version__ = '{version}'\n")
 def _rm_version():
    file_ = ROOT_DIR / "paddleaudio" / "__init__.py"
    with open(file_, "r") as f:
@ -235,8 +242,8 @@ def main():
    if platform.system() != 'Windows' and platform.system() != 'Linux':
        lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']}
-    if platform.system() == 'Linux':
+    #if platform.system() == 'Linux':
-        lib_package_data = {'paddleaudio': ['lib/lib*']}
+    #    lib_package_data = {'paddleaudio': ['lib/lib*']}
    setup_info = dict(
        # Metadata
@ -254,8 +261,7 @@ def main():
        python_requires='>=3.7',
        install_requires=requirements["install"],
        extras_require={
-            'develop':
+            'develop': requirements["develop"],
-            requirements["develop"],
            #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"],
        },
        cmdclass={
@ -284,11 +290,11 @@ def main():
            'Programming Language :: Python :: 3.8',
            'Programming Language :: Python :: 3.9',
            'Programming Language :: Python :: 3.10',
-        ],
+        ], )
-    )
    setup(**setup_info)
    _rm_version()
 if __name__ == '__main__':
    main()