From 7ae36b219d737c2cd20acb9cae766632234384ea Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Thu, 7 Jul 2022 11:21:03 +0800
Subject: [PATCH] add kaldi feature pitch

---
 CMakeLists.txt                                |   2 +-
 paddlespeech/audio/kaldi/__init__.py          |  15 +++
 paddlespeech/audio/kaldi/kaldi.py             | 103 +++++++++++++++
 .../audio/src/pybind/kaldi/kaldi_feature.cc   |   6 +-
 paddlespeech/audio/src/pybind/pybind.cpp      |   4 +-
 .../audio/third_party/kaldi/CMakeLists.txt    |   3 +-
 speechx/speechx/kaldi/feat/feature-plp.h      |   2 +-
 .../speechx/kaldi/feat/online-feature-itf.h   | 125 ++++++++++++++++++
 speechx/speechx/kaldi/feat/online-feature.h   |   2 +-
 speechx/speechx/kaldi/feat/pitch-functions.h  |   2 +-
 10 files changed, 254 insertions(+), 10 deletions(-)
 create mode 100644 paddlespeech/audio/kaldi/__init__.py
 create mode 100644 paddlespeech/audio/kaldi/kaldi.py
 create mode 100644 speechx/speechx/kaldi/feat/online-feature-itf.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 764277d1b..0d260dd3d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,4 +65,4 @@ add_subdirectory(paddlespeech/audio)
 
 # Summary
 include(cmake/summary.cmake)
-onnx_print_configuration_summary()
\ No newline at end of file
+onnx_print_configuration_summary()
diff --git a/paddlespeech/audio/kaldi/__init__.py b/paddlespeech/audio/kaldi/__init__.py
new file mode 100644
index 000000000..2b52ad23d
--- /dev/null
+++ b/paddlespeech/audio/kaldi/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .kaldi import fbank
+from .kaldi import pitch
diff --git a/paddlespeech/audio/kaldi/kaldi.py b/paddlespeech/audio/kaldi/kaldi.py
new file mode 100644
index 000000000..53b82670c
--- /dev/null
+++ b/paddlespeech/audio/kaldi/kaldi.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.audio._internal import module_utils
+# ComputeFbank / ComputeKaldiPitch are called as functions below, so they must
+# be pulled in with `from ... import`; `import pkg.attr as name` only accepts
+# modules and raises ModuleNotFoundError for any other attribute.
+from paddlespeech.audio.ops.paddleaudio import ComputeFbank
+from paddlespeech.audio.ops.paddleaudio import ComputeKaldiPitch
+
+__all__ = ['fbank', 'pitch']
+
+@module_utils.requires_kaldi()
+def fbank(wav,
+          samp_freq: int=16000,
+          frame_shift_ms: float=10.0,
+          frame_length_ms: float=25.0,
+          dither: float=0.0,
+          preemph_coeff: float=0.97,
+          remove_dc_offset: bool=True,
+          window_type: str='povey',
+          round_to_power_of_two: bool=True,
+          blackman_coeff: float=0.42,
+          snip_edges: bool=True,
+          allow_downsample: bool=False,
+          allow_upsample: bool=False,
+          max_feature_vectors: int=-1,
+          num_bins: int=23,
+          low_freq: float=20,
+          high_freq: float=0,
+          vtln_low: float=100,
+          vtln_high: float=-500,
+          debug_mel: bool=False,
+          htk_mode: bool=False,
+          use_energy: bool=False, # fbank opts
+          energy_floor: float=0.0,
+          raw_energy: bool=True,
+          htk_compat: bool=False,
+          use_log_fbank: bool=True,
+          use_power: bool=True):
+    """Compute fbank features by delegating to the native ``ComputeFbank`` op.
+
+    Options are forwarded positionally in declaration order, ``wav`` last.
+    """
+    return ComputeFbank(
+        samp_freq, frame_shift_ms, frame_length_ms, dither, preemph_coeff,
+        remove_dc_offset, window_type, round_to_power_of_two, blackman_coeff,
+        snip_edges, allow_downsample, allow_upsample, max_feature_vectors, num_bins,
+        low_freq, high_freq, vtln_low, vtln_high, debug_mel, htk_mode, use_energy,
+        energy_floor, raw_energy, htk_compat, use_log_fbank, use_power, wav)
+
+@module_utils.requires_kaldi()
+def pitch(wav,
+          samp_freq: int=16000,
+          frame_shift_ms: float=10.0,
+          frame_length_ms: float=25.0,
+          preemph_coeff: float=0.0,
+          min_f0: int=50,
+          max_f0: int=400,
+          soft_min_f0: float=10.0,
+          penalty_factor: float=0.1,
+          lowpass_cutoff: int=1000,
+          resample_freq: int=4000,
+          delta_pitch: float=0.005,
+          nccf_ballast: int=7000,
+          lowpass_filter_width: int=1,
+          upsample_filter_width: int=5,
+          max_frames_latency: int=0,
+          frames_per_chunk: int=0,
+          simulate_first_pass_online: bool=False,
+          recompute_frame: int=500,
+          nccf_ballast_online: bool=False,
+          snip_edges: bool=True):
+    """Compute pitch features via the native ``ComputeKaldiPitch`` op.
+
+    Args:
+        wav: Input waveform, handed to the native op as its final argument.
+        samp_freq ... snip_edges: Pitch-extraction options, forwarded
+            positionally in exactly the order they are declared above.
+
+    Returns:
+        Whatever ``ComputeKaldiPitch`` returns, passed through unchanged.
+    """
+    pitch_opts = (
+        samp_freq, frame_shift_ms, frame_length_ms, preemph_coeff,
+        min_f0, max_f0, soft_min_f0, penalty_factor,
+        lowpass_cutoff, resample_freq, delta_pitch, nccf_ballast,
+        lowpass_filter_width, upsample_filter_width, max_frames_latency,
+        frames_per_chunk, simulate_first_pass_online, recompute_frame,
+        nccf_ballast_online, snip_edges,
+    )
+    # The waveform is passed after all of the options.
+    return ComputeKaldiPitch(*pitch_opts, wav)
\ No newline at end of file
diff --git a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc
index 58a498477..4ac9a93ce 100644
--- a/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc
+++ b/paddlespeech/audio/src/pybind/kaldi/kaldi_feature.cc
@@ -191,15 +191,15 @@ opts.preemph_coeff = preemph_coeff;
       opts.snip_edges = snip_edges;
 
    py::buffer_info info = wav.request();
-   kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size);
+   ::kaldi::Vector<::kaldi::BaseFloat> input_wav(info.size);
    double* wav_ptr = (double*)info.ptr;
    for (int idx = 0; idx < info.size; ++idx) {
        input_wav(idx) = *wav_ptr;
        wav_ptr++;
    }
    
-   kaldi::Matrix<kaldi::BaseFloat> features; 
-   kaldi::ComputeKaldiPitch(opts, input_wav, &features);
+   ::kaldi::Matrix<::kaldi::BaseFloat> features;
+   ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);
    auto result = py::array_t<double>({features.NumRows(), features.NumCols()});
    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
         for (int col_idx = 0; col_idx < features.NumCols(); ++col_idx) {
diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp
index e7f307ca3..d434cbda6 100644
--- a/paddlespeech/audio/src/pybind/pybind.cpp
+++ b/paddlespeech/audio/src/pybind/pybind.cpp
@@ -4,15 +4,15 @@
 #include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
 #include "paddlespeech/audio/src/pybind/sox/io.h"
 
-// Sox
 PYBIND11_MODULE(_paddleaudio, m) {
+// Sox
     m.def("get_info_file",
           &paddleaudio::sox_io::get_info_file,
           "Get metadata of audio file.");
     m.def("get_info_fileobj",
           &paddleaudio::sox_io::get_info_fileobj,
           "Get metadata of audio in file object.");
-
+// kaldi feat
     m.def("InitFbank", &paddleaudio::kaldi::InitFbank, "init fbank");
     m.def("ResetFbank", &paddleaudio::kaldi::ResetFbank, "reset fbank");
     m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
diff --git a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt
index 6934016ee..2304c59e6 100644
--- a/paddlespeech/audio/third_party/kaldi/CMakeLists.txt
+++ b/paddlespeech/audio/third_party/kaldi/CMakeLists.txt
@@ -70,6 +70,7 @@ add_library(kaldi-feat-common STATIC
   feat/feature-functions.cc
   feat/feature-window.cc
   feat/resample.cc
+  feat/pitch-functions.cc
   feat/mel-computations.cc
   feat/cmvn.cc
 )
@@ -113,4 +114,4 @@ target_link_libraries(libkaldi INTERFACE
   gfortran
   -Wl,--no-whole-archive -Wl,--end-group
 )
-target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
\ No newline at end of file
+target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
diff --git a/speechx/speechx/kaldi/feat/feature-plp.h b/speechx/speechx/kaldi/feat/feature-plp.h
index 4f156ca1e..cce6ee1c4 100644
--- a/speechx/speechx/kaldi/feat/feature-plp.h
+++ b/speechx/speechx/kaldi/feat/feature-plp.h
@@ -27,7 +27,7 @@
 #include "feat/feature-functions.h"
 #include "feat/feature-window.h"
 #include "feat/mel-computations.h"
-#include "itf/options-itf.h"
+#include "util/options-itf.h"
 
 namespace kaldi {
 /// @addtogroup  feat FeatureExtraction
diff --git a/speechx/speechx/kaldi/feat/online-feature-itf.h b/speechx/speechx/kaldi/feat/online-feature-itf.h
new file mode 100644
index 000000000..3d139b461
--- /dev/null
+++ b/speechx/speechx/kaldi/feat/online-feature-itf.h
@@ -0,0 +1,125 @@
+// feat/online-feature-itf.h
+
+// Copyright    2013  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_ITF_ONLINE_FEATURE_ITF_H_
+#define KALDI_ITF_ONLINE_FEATURE_ITF_H_ 1
+#include "base/kaldi-common.h"
+#include "matrix/matrix-lib.h"
+
+namespace kaldi {
+/// @ingroup Interfaces
+/// @{
+
+/**
+   OnlineFeatureInterface is an interface for online feature processing (it is
+   also usable in the offline setting, but currently we're not using it for
+   that).  This is for use in the online2/ directory, and it supersedes the
+   interface in ../online/online-feat-input.h.  We have a slightly different
+   model that puts more control in the hands of the calling thread, and won't
+   involve waiting on semaphores in the decoding thread.
+
+   This interface only specifies how the object *outputs* the features.
+   How it obtains the features, e.g. from a previous object or objects of type
+   OnlineFeatureInterface, is not specified in the interface and you will
+   likely define new constructors or methods in the derived type to do that.
+
+   You should appreciate that this interface is designed to allow random
+   access to features, as long as they are ready.  That is, the user
+   can call GetFrame for any frame less than NumFramesReady(), and when
+   implementing a child class you must not make assumptions about the
+   order in which the user makes these calls.
+*/
+
+class OnlineFeatureInterface {
+ public:
+  virtual int32 Dim() const = 0; ///< Returns the feature dimension.
+
+  /// Returns the total number of frames, since the start of the utterance, that
+  /// are now available.  In an online-decoding context, this will likely
+  /// increase with time as more data becomes available.
+  virtual int32 NumFramesReady() const = 0;
+
+  /// Returns true if this is the last frame.  Frame indices are zero-based, so the
+  /// first frame is zero.  IsLastFrame(-1) will return false, unless the file
+  /// is empty (which is a case that I'm not sure all the code will handle, so
+  /// be careful).  This function may return false for some frame if
+  /// we haven't yet decided to terminate decoding, but later true if we decide
+  /// to terminate decoding.  This function exists mainly to correctly handle
+  /// end effects in feature extraction, and is not a mechanism to determine how
+  /// many frames are in the decodable object (as it used to be, and for backward
+  /// compatibility, still is, in the Decodable interface).
+  virtual bool IsLastFrame(int32 frame) const = 0;
+
+  /// Gets the feature vector for this frame.  Before calling this for a given
+  /// frame, it is assumed that you called NumFramesReady() and it returned a
+  /// number greater than "frame".  Otherwise this call will likely crash with
+  /// an assert failure.  This function is not declared const, in case there is
+  /// some kind of caching going on, but most of the time it shouldn't modify
+  /// the class.
+  virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0;
+
+
+  /// This is like GetFrame() but for a collection of frames.  The default
+  /// implementation asserts that "feats" has one row per requested frame and
+  /// then fills row i by calling GetFrame(frames[i], ...); child classes may
+  /// override it for efficiency (since sometimes it's more efficient to do
+  /// things in a batch).
+  virtual void GetFrames(const std::vector<int32> &frames,
+                         MatrixBase<BaseFloat> *feats) {
+    KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
+    for (size_t i = 0; i < frames.size(); i++) {
+      SubVector<BaseFloat> feat(*feats, i);
+      GetFrame(frames[i], &feat);
+    }
+  }
+
+
+  /// Returns the frame shift in seconds.  Helps to estimate duration from
+  /// frame counts.
+  virtual BaseFloat FrameShiftInSeconds() const = 0;
+
+  /// Virtual destructor.  Note: constructors that take another member of
+  /// type OnlineFeatureInterface are not expected to take ownership of
+  /// that pointer; the caller needs to keep track of that manually.
+  virtual ~OnlineFeatureInterface() { }
+
+};
+
+
+/// Add a virtual class for "source" features such as MFCC or PLP or pitch
+/// features.
+class OnlineBaseFeature: public OnlineFeatureInterface {
+ public:
+  /// This would be called from the application, when you get more wave data.
+  /// Note: the sampling_rate is typically only provided so the code can assert
+  /// that it matches the sampling rate expected in the options.
+  virtual void AcceptWaveform(BaseFloat sampling_rate,
+                              const VectorBase<BaseFloat> &waveform) = 0;
+
+  /// InputFinished() tells the class you won't be providing any
+  /// more waveform.  This will help flush out the last few frames
+  /// of delta or LDA features (it will typically affect the return value
+  /// of IsLastFrame()).
+  virtual void InputFinished() = 0;
+};
+
+
+/// @}
+}  // namespace kaldi
+
+#endif  // KALDI_ITF_ONLINE_FEATURE_ITF_H_
diff --git a/speechx/speechx/kaldi/feat/online-feature.h b/speechx/speechx/kaldi/feat/online-feature.h
index f2ebe45bf..f9b26ecc0 100644
--- a/speechx/speechx/kaldi/feat/online-feature.h
+++ b/speechx/speechx/kaldi/feat/online-feature.h
@@ -34,7 +34,7 @@
 #include "feat/feature-mfcc.h"
 #include "feat/feature-plp.h"
 #include "feat/feature-fbank.h"
-#include "itf/online-feature-itf.h"
+#include "feat/online-feature-itf.h"
 
 namespace kaldi {
 /// @addtogroup  onlinefeat OnlineFeatureExtraction
diff --git a/speechx/speechx/kaldi/feat/pitch-functions.h b/speechx/speechx/kaldi/feat/pitch-functions.h
index 70e85380b..9edf6c9ff 100644
--- a/speechx/speechx/kaldi/feat/pitch-functions.h
+++ b/speechx/speechx/kaldi/feat/pitch-functions.h
@@ -31,7 +31,7 @@
 
 #include "base/kaldi-error.h"
 #include "feat/mel-computations.h"
-#include "itf/online-feature-itf.h"
+#include "feat/online-feature-itf.h"
 #include "matrix/matrix-lib.h"
 #include "util/common-utils.h"