add frontend cmakelist

4 years ago · c60277515b
parent f03d48f79b
commit c60277515b
12 changed files with 179 additions and 88 deletions
--- a/docs/source/reference.md
+++ b/docs/source/reference.md
@ -35,3 +35,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
 * [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md)
 - ISC License
 - Audio feature
 * [ThreadPool](https://github.com/progschj/ThreadPool/blob/master/COPYING)
 - zlib License
 - ThreadPool
--- a/speechx/CMakeLists.txt
+++ b/speechx/CMakeLists.txt
@ -65,7 +65,7 @@ FetchContent_Declare(
  URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
 )
 FetchContent_MakeAvailable(glog)
-include_directories(${glog_BINARY_DIR})
+include_directories(${glog_BINARY_DIR} ${glog_SOURCE_DIR}/src)
 # gtest
 FetchContent_Declare(googletest
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@ -4,11 +4,22 @@ project(speechx LANGUAGES CXX)
 link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/kaldi
 )
 add_subdirectory(kaldi)
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/frontend
 )
 add_subdirectory(frontend)
 add_executable(mfcc-test codelab/feat_test/feature-mfcc-test.cc)
 target_link_libraries(mfcc-test kaldi-mfcc)
 # Executable built from the linear-spectrogram codelab source. It links the
 # frontend library together with kaldi-util and kaldi-feat (the previous
 # spelling `kaildi-util` did not name any target).
 add_executable(linear_spectrogram_main codelab/feat_test/linear_spectrogram_main.cc)
 target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat)
--- a/speechx/speechx/base/basic_types.h
+++ b/speechx/speechx/base/basic_types.h
@ -16,7 +16,7 @@
 #include "kaldi/base/kaldi-types.h"
-#include <limits.h>
+#include <limits>
 typedef float               BaseFloat;
 typedef double              double64;
@ -35,7 +35,7 @@ typedef unsigned char      uint8;
 typedef unsigned short     uint16;
 typedef unsigned int       uint32;
-if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
+#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
 typedef unsigned long uint64;
 #else
 typedef unsigned long long uint64;
--- a/speechx/speechx/base/thread_pool.h
+++ b/speechx/speechx/base/thread_pool.h
@ -1,3 +1,23 @@
 // Copyright (c) 2012 Jakob Progsch, Václav Zeman
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 //   1. The origin of this software must not be misrepresented; you must not
 //   claim that you wrote the original software. If you use this software
 //   in a product, an acknowledgment in the product documentation would be
 //   appreciated but is not required.
 //   2. Altered source versions must be plainly marked as such, and must not be
 //   misrepresented as being the original software.
 //   3. This notice may not be removed or altered from any source
 //   distribution.
 // this code is from https://github.com/progschj/ThreadPool
 #ifndef BASE_THREAD_POOL_H
@ -97,4 +117,4 @@ inline ThreadPool::~ThreadPool()
        worker.join();
 }
-#endif
+#endif
--- a/speechx/speechx/codelab/feat_test/linear_spectrogram_main.cc
+++ b/speechx/speechx/codelab/feat_test/linear_spectrogram_main.cc
@ -2,6 +2,7 @@
 #include "frontend/linear_spectrogram.h"
 #include "frontend/normalizer.h"
 #include "frontend/feature_extractor_interface.h"
 #include "kaldi/util/table-types.h"
 #include "base/log.h"
 #include "base/flags.h"
@ -22,7 +23,7 @@ int main(int argc, char* argv[]) {
  ppspeech::LinearSpectrogramOptions opt;
  ppspeech::DecibelNormalizerOptions db_norm_opt;
  std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor =
-      new DecibelNormalizer(db_norm_opt);
+      new ppspeech::DecibelNormalizer(db_norm_opt);
  ppspeech::LinearSpectrogram linear_spectrogram(opt, base_featrue_extractor);
  for (; !wav_reader.Done(); wav_reader.Next()) {
--- a/speechx/speechx/frontend/CMakeLists.txt
+++ b/speechx/speechx/frontend/CMakeLists.txt
@ -0,0 +1,8 @@
 project(frontend)
 # Library target `frontend`, built from the normalizer and linear-spectrogram
 # sources in this directory.
 add_library(frontend
  normalizer.cc
  linear_spectrogram.cc  
 )
 # The frontend target links against the kaldi-matrix target.
 target_link_libraries(frontend kaldi-matrix)
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -15,16 +15,14 @@
 #pragma once
 #include "base/basic_types.h"
-#incldue "kaldi/matrix/kaldi-vector.h"
+#include "kaldi/matrix/kaldi-vector.h"
 namespace ppspeech {
 class FeatureExtractorInterface {
  public:
-    virtual void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) = 0;
    virtual void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
                         kaldi::VectorBae<kaldi::BaseFloat>* feature) = 0;
    virtual size_t Dim() const = 0;
 };
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@ -16,15 +16,36 @@
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/matrix/matrix-functions.h"
 namespace ppspeech {
 using kaldi::int32;
 using kaldi::BaseFloat;
 using kaldi::Vector;
 using kaldi::Matrix;
 using std::vector;
 // TODO: remove later.
 // Copies every element of `input` into `*output`, resizing `*output` to
 // input.Dim() first. An empty `input` is a no-op: `*output` is left untouched.
 //
 // Declared `static` (internal linkage) because normalizer.cc, which is built
 // into the same `frontend` library, defines a helper with the same name and
 // signature in namespace ppspeech; two external definitions would collide at
 // link time.
 static void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
                                  vector<BaseFloat>* output) {
   // Read the dimension once so the loop compares like-typed values.
   const size_t dim = static_cast<size_t>(input.Dim());
   if (dim == 0) return;
   output->resize(dim);
   for (size_t idx = 0; idx < dim; ++idx) {
     (*output)[idx] = input(idx);
   }
 }
 // Copies every element of `input` into `*output`, resizing `*output` to
 // input.size() first. An empty `input` is a no-op: `*output` is left untouched.
 //
 // Declared `static` (internal linkage) because normalizer.cc, which is built
 // into the same `frontend` library, defines a helper with the same name and
 // signature in namespace ppspeech; two external definitions would collide at
 // link time.
 static void CopyStdVector2Vector(const vector<BaseFloat>& input,
                                  Vector<BaseFloat>* output) {
   const size_t count = input.size();
   if (count == 0) return;
   output->Resize(count);
   for (size_t idx = 0; idx < count; ++idx) {
     (*output)(idx) = input[idx];
   }
 }
 LinearSpectrogram::LinearSpectrogram(
    const LinearSpectrogramOptions& opts,
-    const std::unique_ptr<FeatureExtractorInterface> base_extractor) {
+    std::unique_ptr<FeatureExtractorInterface> base_extractor) {
  base_extractor_ = std::move(base_extractor);
  int32 window_size = opts.frame_opts.WindowSize();
  int32 window_shift = opts.frame_opts.WindowShift();
@ -41,11 +62,8 @@ LinearSpectrogram::LinearSpectrogram(
  dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }
-void LinearSpectrogram::AcceptWavefrom(const Vector<BaseFloat>& input) {
+void LinearSpectrogram::AcceptWavefrom(const kaldi::VectorBase<BaseFloat>& input) {
-  wavefrom_.resize(input.Dim());
+  base_extractor_->AcceptWaveform(input);
  for (size_t idx = 0; idx < input.Dim(); ++idx) {
    waveform_[idx] = input(idx);
  }
 }
 void LinearSpectrogram::Hanning(vector<float>* data) const {
@ -58,11 +76,11 @@ void LinearSpectrogram::Hanning(vector<float>* data) const {
 bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                 vector<BaseFloat>* real,
-                                 vector<BaseFloat>* img) {
+                                 vector<BaseFloat>* img) const {
-  if (RealFft(v, true)) {
+  Vector<BaseFloat> v_tmp;
-    LOG(ERROR) << "compute the fft occurs error";
+  CopyStdVector2Vector(*v, &v_tmp);
-    return false;
+  RealFft(&v_tmp, true);
-  }
+  CopyVector2StdVector(v_tmp, v);
  real->push_back(v->at(0));
  img->push_back(0);
  for (int i = 1; i < v->size() / 2; i++) {
@ -75,36 +93,28 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
  return true;
 }
 //todo remove later
 // NOTE(review): this empty-bodied definition repeats the signature of the
 // CopyVector2StdVector helper defined earlier in this file. It looks like
 // leftover from the removed copy of that helper -- confirm and delete.
 void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
                           vector<BaseFloat>* output) {
 }
 // todo remove later
-bool LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) const {
+void LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) {
-  if (wavefrom_.Dim() == 0) {
+  Vector<BaseFloat> tmp;
-      return false;
+  Compute(tmp, &waveform_);
  }
  kaldi::Vector<BaseFloat> feats;
  Compute(wavefrom_, &feats);
  vector<vector<BaseFloat>> result;
  vector<BaseFloat> feats_vec; 
-  CopyVector2StdVector(feats, &feats_vec);
+  CopyVector2StdVector(waveform_, &feats_vec);
  Compute(feats_vec, result);
  feats->Resize(result.size(), result[0].size());
  for (int row_idx = 0; row_idx < result.size(); ++row_idx) {
    for (int col_idx = 0; col_idx < result.size(); ++col_idx) {
-        feats(row_idx, col_idx) = result[row_idx][col_idx];
+        (*feats)(row_idx, col_idx) = result[row_idx][col_idx];
    }
  }
-  wavefrom_.Resize(0);
+  waveform_.Resize(0);
  return true;
 }
 // only for test, remove later
 // todo: compute the feature frame by frame.
-void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
+void LinearSpectrogram::Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
-                                kaldi::VectorBae<kaldi::BaseFloat>* feature) {
+                                kaldi::Vector<kaldi::BaseFloat>* feature) {
-    base_extractor_->Compute(input, feature);
+    base_extractor_->Read(feature);
 }
 // Compute spectrogram feat, only for test, remove later
@ -112,9 +122,9 @@ void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                vector<vector<float>>& feat) {
  int num_samples = wave.size();
-  const int& frame_length = opts.frame_opts.WindowSize();
+  const int& frame_length = opts_.frame_opts.WindowSize();
-  const int& sample_rate = opts.frame_opts.samp_freq;
+  const int& sample_rate = opts_.frame_opts.samp_freq;
-  const int& frame_shift = opts.frame_opts.WindowShift();
+  const int& frame_shift = opts_.frame_opts.WindowShift();
  const int& fft_points = fft_points_;
  const float scale = hanning_window_energy_ * frame_shift;
@ -132,11 +142,11 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
  for (int i = 0; i < num_frames; ++i) {
    vector<float> data(wave.data() + i * frame_shift,
                       wave.data() + i * frame_shift + frame_length);
-    Hanning(data);
+    Hanning(&data);
    fft_img.clear();
    fft_real.clear();
    v.assign(data.begin(), data.end());
-    if (NumpyFft(&v, fft_real, fft_img)) {
+    if (NumpyFft(&v, &fft_real, &fft_img)) {
      LOG(ERROR)<< i  << " fft compute occurs error, please checkout the input data";
      return false;
    }
@ -155,5 +165,8 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
      // log added eps=1e-14
      feat[i][j] = std::log(feat[i][j] + 1e-14);
    }
  }
  return true;
 }
 }  // namespace ppspeech
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -8,7 +8,7 @@
 namespace ppspeech {
 struct LinearSpectrogramOptions {
-    kaldi::FrameExtrationOptions frame_opts;
+    kaldi::FrameExtractionOptions frame_opts;
    LinearSpectrogramOptions():
        frame_opts() {}
@ -19,19 +19,19 @@ struct LinearSpectrogramOptions {
 class LinearSpectrogram : public FeatureExtractorInterface {
  public:
-    explict LinearSpectrogram(const LinearSpectrogramOptions& opts,
+    explicit LinearSpectrogram(const LinearSpectrogramOptions& opts,
-                              const std::unique_ptr<FeatureExtractorInterface> base_extractor);
+                               std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
+    virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
    virtual size_t Dim() const { return dim_; }
-    void ReadFeats(kaldi::Matrix<kaldi::BaesFloat>* feats) const;
+    void ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);
  private: 
-    void Hanning(std::vector<kaldi::BaseFloat>& data) const;
+    void Hanning(std::vector<kaldi::BaseFloat>* data) const;
-    kaldi::int32 Compute(const std::vector<kaldi::BaseFloat>& wave,
+    bool Compute(const std::vector<kaldi::BaseFloat>& wave,
-                         std::vector<std::vector<kaldi::BaseFloat>>& feat);
+                 std::vector<std::vector<kaldi::BaseFloat>>& feat);
-    void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
+    void Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
-                         kaldi::VectorBae<kaldi::BaseFloat>* feature);
+                 kaldi::Vector<kaldi::BaseFloat>* feature);
    bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
                  std::vector<kaldi::BaseFloat>* real,
                  std::vector<kaldi::BaseFloat>* img) const;
@ -41,7 +41,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    std::vector<kaldi::BaseFloat> hanning_window_;
    kaldi::BaseFloat hanning_window_energy_;
    LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> wavefrom_; // remove later, todo(SmileGoat)
+    kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
 };
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@ -1,35 +1,62 @@
 #include "frontend/normalizer.h"
-DecibelNormalizer::DecibelNormalizer(
+namespace ppspeech {
    const DecibelNormalizerOptions& opts) {
 using kaldi::Vector;
 using kaldi::BaseFloat;
 using std::vector;
 // Builds a normalizer that keeps its own copy of `opts`; no other state is
 // set up here.
 DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts)
     : opts_(opts) {}
-void DecibelNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
+void DecibelNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {
  waveform_ = input;
 }
 // Runs Compute() on the stored waveform and writes the result into `feat`.
 // When no waveform is stored (dimension 0), `feat` is left untouched.
 void DecibelNormalizer::Read(Vector<BaseFloat>* feat) {
   const bool has_waveform = waveform_.Dim() != 0;
   if (has_waveform) {
     Compute(waveform_, feat);
   }
 }
-void DecibelNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+//todo remove later
 // Copies every element of `input` into `*output`, resizing `*output` to
 // input.Dim() first. An empty `input` is a no-op: `*output` is left untouched.
 //
 // Declared `static` (internal linkage) because linear_spectrogram.cc, which
 // is built into the same `frontend` library, defines a helper with the same
 // name and signature in namespace ppspeech; two external definitions would
 // collide at link time.
 static void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
                                  vector<BaseFloat>* output) {
   // Read the dimension once so the loop compares like-typed values.
   const size_t dim = static_cast<size_t>(input.Dim());
   if (dim == 0) return;
   output->resize(dim);
   for (size_t idx = 0; idx < dim; ++idx) {
     (*output)[idx] = input(idx);
   }
 }
 // Copies every element of `input` into `*output`, resizing `*output` to
 // input.size() first. An empty `input` is a no-op: `*output` is left untouched.
 //
 // Declared `static` (internal linkage) because linear_spectrogram.cc, which
 // is built into the same `frontend` library, defines a helper with the same
 // name and signature in namespace ppspeech; two external definitions would
 // collide at link time.
 static void CopyStdVector2Vector(const vector<BaseFloat>& input,
                                  Vector<BaseFloat>* output) {
   const size_t count = input.size();
   if (count == 0) return;
   output->Resize(count);
   for (size_t idx = 0; idx < count; ++idx) {
     (*output)(idx) = input[idx];
   }
 }
-bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
+bool DecibelNormalizer::Compute(const Vector<BaseFloat>& input,
-                                kaldi::Vector<kaldi::BaseFloat>* feat) {
+                                Vector<BaseFloat>* feat) const {
  // calculate db rms
-  float rms_db = 0.0;
+  BaseFloat rms_db = 0.0;
-  float mean_square = 0.0;
+  BaseFloat mean_square = 0.0;
-  float gain = 0.0;
+  BaseFloat gain = 0.0;
-  vector<BaseFloat> smaples;
+  BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
-  samples.resize(input.Size());
+
  vector<BaseFloat> samples;
  samples.resize(input.Dim());
  for (int32 i = 0; i < samples.size(); ++i) {
    samples[i] = input(i);
  }
  // square
  for (auto &d : samples) {
-    if (_opts.convert_int_float) {
+    if (opts_.convert_int_float) {
-    d = d * WAVE_FLOAT_NORMALIZATION;
+    d = d * wave_float_normlization;
    }
    mean_square += d * d;
  }
@ -37,12 +64,12 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
  // mean
  mean_square /= samples.size();
  rms_db = 10 * std::log10(mean_square);
-  gain = opts.target_db - rms_db;
+  gain = opts_.target_db - rms_db;
-  if (gain > opts.max_gain_db) {
+  if (gain > opts_.max_gain_db) {
-    LOG(ERROR) << "Unable to normalize segment to " << opts.target_db << "dB,"
+    LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB,"
-                << "because the the probable gain have exceeds opts.max_gain_db" 
+                << "because the the probable gain have exceeds opts_.max_gain_db" 
-                <<  opts.max_gain_db << "dB.";
+                <<  opts_.max_gain_db << "dB.";
    return false;
  }
@ -51,27 +78,28 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
    // python item *= 10.0 ** (gain / 20.0)
    item *= std::pow(10.0, gain / 20.0);
  }
-
+  
  CopyStdVector2Vector(samples, feat);
  return true;
 }
-
+/*
 PPNormalizer::PPNormalizer(
    const PPNormalizerOptions& opts,
    const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {
 }
-void PPNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
+void PPNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {
 }
-void PPNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+void PPNormalizer::Read(Vector<BaseFloat>* feat) {
 }
-bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
+bool PPNormalizer::Compute(const Vector<BaseFloat>& input,
-                           kaldi::Vector<kaldi::BaseFloat>>* feat) {
+                           Vector<BaseFloat>>* feat) {
   if ((input.Dim() % mean_.Dim()) == 0) {
        LOG(ERROR) << "CMVN dimension is wrong!";
        return false;
@ -93,4 +121,6 @@ bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
    }
    return true;
-}
+}*/
 } // namespace ppspeech
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@ -1,7 +1,9 @@
 #pragma once
 #include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 #include "kaldi/util/options-itf.h"
 namespace ppspeech {
@ -9,6 +11,7 @@ namespace ppspeech {
 struct DecibelNormalizerOptions {
  float target_db;
  float max_gain_db;
  bool convert_int_float;
  DecibelNormalizerOptions() :
    target_db(-20),
    max_gain_db(300.0),
@ -23,16 +26,19 @@ struct DecibelNormalizerOptions {
 class DecibelNormalizer : public FeatureExtractorInterface {
  public:
-    explict DecibelNormalizer(const DecibelNormalizerOptions& opts,
+    explicit DecibelNormalizer(const DecibelNormalizerOptions& opts);
-                              const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
+    virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual size_t Dim() const { return 0; }
    virtual size_t Dim() const;
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
-                 kaldi::Vector<kaldi::BaseFloat>>* feat);
+                 kaldi::Vector<kaldi::BaseFloat>* feat) const;
  private:
    DecibelNormalizerOptions opts_;
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    kaldi::Vector<kaldi::BaseFloat> waveform_;
 };
 /*
 struct NormalizerOptions {
  std::string mean_std_path;
  NormalizerOptions() :
@ -61,5 +67,5 @@ class PPNormalizer : public FeatureExtractorInterface {
    kaldi::Vector<float> variance_;
    NormalizerOptions _opts;
 };
-
+*/
 }  // namespace ppspeech