add frontend cmakelist

3 years ago · c60277515b
parent f03d48f79b
commit c60277515b
12 changed files with 179 additions and 88 deletions
--- a/docs/source/reference.md
+++ b/docs/source/reference.md
@ -35,3 +35,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
 * [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md)
 - ISC License
 - Audio feature
+
+* [ThreadPool](https://github.com/progschj/ThreadPool/blob/master/COPYING)
+- zlib License
+- ThreadPool
--- a/speechx/CMakeLists.txt
+++ b/speechx/CMakeLists.txt
@ -65,7 +65,7 @@ FetchContent_Declare(
  URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
 )
 FetchContent_MakeAvailable(glog)
-include_directories(${glog_BINARY_DIR})
+include_directories(${glog_BINARY_DIR} ${glog_SOURCE_DIR}/src)

 # gtest
 FetchContent_Declare(googletest
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@ -4,11 +4,22 @@ project(speechx LANGUAGES CXX)

 link_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)

+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/kaldi
 )
 add_subdirectory(kaldi)

+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/frontend
+)
+add_subdirectory(frontend)
+
 add_executable(mfcc-test codelab/feat_test/feature-mfcc-test.cc)
 target_link_libraries(mfcc-test kaldi-mfcc)
+
+# Executable built from codelab/feat_test/linear_spectrogram_main.cc.
+add_executable(linear_spectrogram_main codelab/feat_test/linear_spectrogram_main.cc)
+# Links the frontend library added above plus the Kaldi util/feat libraries.
+target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat)
--- a/speechx/speechx/base/basic_types.h
+++ b/speechx/speechx/base/basic_types.h
@ -16,7 +16,7 @@

 #include "kaldi/base/kaldi-types.h"

-#include <limits.h>
+#include <limits>

 typedef float               BaseFloat;
 typedef double              double64;
@ -35,7 +35,7 @@ typedef unsigned char      uint8;
 typedef unsigned short     uint16;
 typedef unsigned int       uint32;

-if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
+#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
 typedef unsigned long uint64;
 #else
 typedef unsigned long long uint64;
--- a/speechx/speechx/base/thread_pool.h
+++ b/speechx/speechx/base/thread_pool.h
@ -1,3 +1,23 @@
+// Copyright (c) 2012 Jakob Progsch, Václav Zeman
+
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+
+//   1. The origin of this software must not be misrepresented; you must not
+//   claim that you wrote the original software. If you use this software
+//   in a product, an acknowledgment in the product documentation would be
+//   appreciated but is not required.
+
+//   2. Altered source versions must be plainly marked as such, and must not be
+//   misrepresented as being the original software.
+
+//   3. This notice may not be removed or altered from any source
+//   distribution.
 // this code is from https://github.com/progschj/ThreadPool

 #ifndef BASE_THREAD_POOL_H
@ -97,4 +117,4 @@ inline ThreadPool::~ThreadPool()
        worker.join();
 }

-#endif
+#endif
--- a/speechx/speechx/codelab/feat_test/linear_spectrogram_main.cc
+++ b/speechx/speechx/codelab/feat_test/linear_spectrogram_main.cc
@ -2,6 +2,7 @@

 #include "frontend/linear_spectrogram.h"
 #include "frontend/normalizer.h"
+#include "frontend/feature_extractor_interface.h"
 #include "kaldi/util/table-types.h"
 #include "base/log.h"
 #include "base/flags.h"
@ -22,7 +23,7 @@ int main(int argc, char* argv[]) {
  ppspeech::LinearSpectrogramOptions opt;
  ppspeech::DecibelNormalizerOptions db_norm_opt;
  std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor =
-      new DecibelNormalizer(db_norm_opt);
+      new ppspeech::DecibelNormalizer(db_norm_opt);
  ppspeech::LinearSpectrogram linear_spectrogram(opt, base_featrue_extractor);

  for (; !wav_reader.Done(); wav_reader.Next()) {
--- a/speechx/speechx/frontend/CMakeLists.txt
+++ b/speechx/speechx/frontend/CMakeLists.txt
@ -0,0 +1,8 @@
+project(frontend)
+
+add_library(frontend
+  normalizer.cc
+  linear_spectrogram.cc  
+)
+
+target_link_libraries(frontend kaldi-matrix)
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -15,16 +15,14 @@
 #pragma once

 #include "base/basic_types.h"
-#incldue "kaldi/matrix/kaldi-vector.h"
+#include "kaldi/matrix/kaldi-vector.h"

 namespace ppspeech {

 class FeatureExtractorInterface {
  public:
-    virtual void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
-    virtual void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                         kaldi::VectorBae<kaldi::BaseFloat>* feature) = 0;
+    // Implementations are owned and destroyed through
+    // std::unique_ptr<FeatureExtractorInterface>, so the destructor must be
+    // virtual for the derived destructor to run.
+    virtual ~FeatureExtractorInterface() {}
+    // Hands a vector of input values to the extractor.
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
+    // Writes the extractor's output into `feat`.
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) = 0;
     virtual size_t Dim() const = 0;
  };

--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@ -16,15 +16,36 @@
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/matrix/matrix-functions.h"

+namespace ppspeech {
+
 using kaldi::int32;
 using kaldi::BaseFloat;
 using kaldi::Vector;
 using kaldi::Matrix;
 using std::vector;

+// TODO: remove later.
+// Copies every element of the Kaldi vector `input` into `output`.
+// `output` is always resized to input.Dim(), so an empty input yields an
+// empty output instead of leaving stale contents behind.
+// Declared static because normalizer.cc defines a helper with the same name
+// and signature in namespace ppspeech; two external definitions would clash
+// when both files are linked into the frontend library.
+static void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
+                                 vector<BaseFloat>* output) {
+  const kaldi::MatrixIndexT dim = input.Dim();
+  output->resize(dim);
+  for (kaldi::MatrixIndexT idx = 0; idx < dim; ++idx) {
+    (*output)[idx] = input(idx);
+  }
+}
+
+// Copies every element of the std::vector `input` into the Kaldi vector
+// `output`. `output` is always resized to input.size(), so an empty input
+// yields an empty output instead of leaving stale contents behind.
+// Declared static because normalizer.cc defines a helper with the same name
+// and signature in namespace ppspeech; two external definitions would clash
+// when both files are linked into the frontend library.
+static void CopyStdVector2Vector(const vector<BaseFloat>& input,
+                                 Vector<BaseFloat>* output) {
+  output->Resize(static_cast<kaldi::MatrixIndexT>(input.size()));
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    (*output)(static_cast<kaldi::MatrixIndexT>(idx)) = input[idx];
+  }
+}
+
 LinearSpectrogram::LinearSpectrogram(
    const LinearSpectrogramOptions& opts,
-    const std::unique_ptr<FeatureExtractorInterface> base_extractor) {
+    std::unique_ptr<FeatureExtractorInterface> base_extractor) {
  base_extractor_ = std::move(base_extractor);
  int32 window_size = opts.frame_opts.WindowSize();
  int32 window_shift = opts.frame_opts.WindowShift();
@ -41,11 +62,8 @@ LinearSpectrogram::LinearSpectrogram(
  dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }

-void LinearSpectrogram::AcceptWavefrom(const Vector<BaseFloat>& input) {
-  wavefrom_.resize(input.Dim());
-  for (size_t idx = 0; idx < input.Dim(); ++idx) {
-    waveform_[idx] = input(idx);
-  }
+void LinearSpectrogram::AcceptWavefrom(const kaldi::VectorBase<BaseFloat>& input) {
+  base_extractor_->AcceptWaveform(input);
 }

 void LinearSpectrogram::Hanning(vector<float>* data) const {
@ -58,11 +76,11 @@ void LinearSpectrogram::Hanning(vector<float>* data) const {

 bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                 vector<BaseFloat>* real,
-                                 vector<BaseFloat>* img) {
-  if (RealFft(v, true)) {
-    LOG(ERROR) << "compute the fft occurs error";
-    return false;
-  }
+                                 vector<BaseFloat>* img) const {
+  Vector<BaseFloat> v_tmp;
+  CopyStdVector2Vector(*v, &v_tmp);
+  RealFft(&v_tmp, true);
+  CopyVector2StdVector(v_tmp, v);
  real->push_back(v->at(0));
  img->push_back(0);
  for (int i = 1; i < v->size() / 2; i++) {
@ -75,36 +93,28 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
  return true;
 }

-//todo remove later
-void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
-                          vector<BaseFloat>* output) {
-}
-
 // todo remove later
-bool LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) const {
-  if (wavefrom_.Dim() == 0) {
-      return false;
-  }
-  kaldi::Vector<BaseFloat> feats;
-  Compute(wavefrom_, &feats);
+// Reads the base extractor's output into waveform_, runs the
+// std::vector-based Compute() on it and copies the resulting frames into
+// `feats`, one matrix row per frame. waveform_ is emptied afterwards.
+// When no frame was produced, `feats` is resized to 0 x 0.
+void LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) {
+  Vector<BaseFloat> tmp;
+  Compute(tmp, &waveform_);
   vector<vector<BaseFloat>> result;
   vector<BaseFloat> feats_vec; 
-  CopyVector2StdVector(feats, &feats_vec);
+  CopyVector2StdVector(waveform_, &feats_vec);
   Compute(feats_vec, result);
-  feats->Resize(result.size(), result[0].size());
-  for (int row_idx = 0; row_idx < result.size(); ++row_idx) {
-    for (int col_idx = 0; col_idx < result.size(); ++col_idx) {
-        feats(row_idx, col_idx) = result[row_idx][col_idx];
-  }
-  wavefrom_.Resize(0);
-  return true;
+  // feats_vec already holds a copy, so the buffered samples can be dropped.
+  waveform_.Resize(0);
+  // result[0] may only be touched when at least one frame exists.
+  if (result.empty()) {
+    feats->Resize(0, 0);
+    return;
+  }
+  // Assumes every frame has as many columns as the first one.
+  const size_t num_rows = result.size();
+  const size_t num_cols = result[0].size();
+  feats->Resize(num_rows, num_cols);
+  for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) {
+    // The column loop is bounded by the column count, not the row count.
+    for (size_t col_idx = 0; col_idx < num_cols; ++col_idx) {
+      (*feats)(row_idx, col_idx) = result[row_idx][col_idx];
+    }
+  }
 }

 // only for test, remove later
 // todo: compute the feature frame by frame.
-void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                                kaldi::VectorBae<kaldi::BaseFloat>* feature) {
-    base_extractor_->Compute(input, feature);
+void LinearSpectrogram::Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
+                                kaldi::Vector<kaldi::BaseFloat>* feature) {
+    base_extractor_->Read(feature);
 }

 // Compute spectrogram feat, only for test, remove later
@ -112,9 +122,9 @@ void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                vector<vector<float>>& feat) {
  int num_samples = wave.size();
-  const int& frame_length = opts.frame_opts.WindowSize();
-  const int& sample_rate = opts.frame_opts.samp_freq;
-  const int& frame_shift = opts.frame_opts.WindowShift();
+  const int& frame_length = opts_.frame_opts.WindowSize();
+  const int& sample_rate = opts_.frame_opts.samp_freq;
+  const int& frame_shift = opts_.frame_opts.WindowShift();
  const int& fft_points = fft_points_;
  const float scale = hanning_window_energy_ * frame_shift;

@ -132,11 +142,11 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
  for (int i = 0; i < num_frames; ++i) {
    vector<float> data(wave.data() + i * frame_shift,
                       wave.data() + i * frame_shift + frame_length);
-    Hanning(data);
+    Hanning(&data);
    fft_img.clear();
    fft_real.clear();
    v.assign(data.begin(), data.end());
-    if (NumpyFft(&v, fft_real, fft_img)) {
+    if (NumpyFft(&v, &fft_real, &fft_img)) {
      LOG(ERROR)<< i  << " fft compute occurs error, please checkout the input data";
      return false;
    }
@ -155,5 +165,8 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
      // log added eps=1e-14
      feat[i][j] = std::log(feat[i][j] + 1e-14);
    }
+  }
  return true;
 }
+
+}  // namespace ppspeech
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -8,7 +8,7 @@
 namespace ppspeech {

 struct LinearSpectrogramOptions {
-    kaldi::FrameExtrationOptions frame_opts;
+    kaldi::FrameExtractionOptions frame_opts;
    LinearSpectrogramOptions():
        frame_opts() {}

@ -19,19 +19,19 @@ struct LinearSpectrogramOptions {

 class LinearSpectrogram : public FeatureExtractorInterface {
  public:
-    explict LinearSpectrogram(const LinearSpectrogramOptions& opts,
-                              const std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    explicit LinearSpectrogram(const LinearSpectrogramOptions& opts,
+                               std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
    virtual size_t Dim() const { return dim_; }
-    void ReadFeats(kaldi::Matrix<kaldi::BaesFloat>* feats) const;
+    void ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);

  private: 
-    void Hanning(std::vector<kaldi::BaseFloat>& data) const;
-    kaldi::int32 Compute(const std::vector<kaldi::BaseFloat>& wave,
-                         std::vector<std::vector<kaldi::BaseFloat>>& feat);
-    void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                         kaldi::VectorBae<kaldi::BaseFloat>* feature);
+    void Hanning(std::vector<kaldi::BaseFloat>* data) const;
+    bool Compute(const std::vector<kaldi::BaseFloat>& wave,
+                 std::vector<std::vector<kaldi::BaseFloat>>& feat);
+    void Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
+                 kaldi::Vector<kaldi::BaseFloat>* feature);
    bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
                  std::vector<kaldi::BaseFloat>* real,
                  std::vector<kaldi::BaseFloat>* img) const;
@ -41,7 +41,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
    std::vector<kaldi::BaseFloat> hanning_window_;
    kaldi::BaseFloat hanning_window_energy_;
    LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> wavefrom_; // remove later, todo(SmileGoat)
+    kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
 };
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@ -1,35 +1,62 @@

 #include "frontend/normalizer.h"

-DecibelNormalizer::DecibelNormalizer(
-    const DecibelNormalizerOptions& opts) {
+namespace ppspeech {

+using kaldi::Vector;
+using kaldi::BaseFloat;
+using std::vector;
+
+DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts) {
+  opts_ = opts;
 }
                                    
-void DecibelNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
+void DecibelNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {
+  waveform_ = input;
+}

+void DecibelNormalizer::Read(Vector<BaseFloat>* feat) {
+  if (waveform_.Dim() == 0) return;
+  Compute(waveform_, feat);
 }

-void DecibelNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+// TODO: remove later.
+// Copies every element of the Kaldi vector `input` into `output`.
+// `output` is always resized to input.Dim(), so an empty input yields an
+// empty output instead of leaving stale contents behind.
+// Declared static because linear_spectrogram.cc defines a helper with the
+// same name and signature in namespace ppspeech; two external definitions
+// would clash when both files are linked into the frontend library.
+static void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
+                                 vector<BaseFloat>* output) {
+  const kaldi::MatrixIndexT dim = input.Dim();
+  output->resize(dim);
+  for (kaldi::MatrixIndexT idx = 0; idx < dim; ++idx) {
+    (*output)[idx] = input(idx);
+  }
+}

+// Copies every element of the std::vector `input` into the Kaldi vector
+// `output`. `output` is always resized to input.size(), so an empty input
+// yields an empty output instead of leaving stale contents behind.
+// Declared static because linear_spectrogram.cc defines a helper with the
+// same name and signature in namespace ppspeech; two external definitions
+// would clash when both files are linked into the frontend library.
+static void CopyStdVector2Vector(const vector<BaseFloat>& input,
+                                 Vector<BaseFloat>* output) {
+  output->Resize(static_cast<kaldi::MatrixIndexT>(input.size()));
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    (*output)(static_cast<kaldi::MatrixIndexT>(idx)) = input[idx];
+  }
 }

-bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
-                                kaldi::Vector<kaldi::BaseFloat>* feat) {
+bool DecibelNormalizer::Compute(const Vector<BaseFloat>& input,
+                                Vector<BaseFloat>* feat) const {
  // calculate db rms
-  float rms_db = 0.0;
-  float mean_square = 0.0;
-  float gain = 0.0;
-  vector<BaseFloat> smaples;
-  samples.resize(input.Size());
+  BaseFloat rms_db = 0.0;
+  BaseFloat mean_square = 0.0;
+  BaseFloat gain = 0.0;
+  BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
+
+  vector<BaseFloat> samples;
+  samples.resize(input.Dim());
  for (int32 i = 0; i < samples.size(); ++i) {
    samples[i] = input(i);
  }
  
  // square
  for (auto &d : samples) {
-    if (_opts.convert_int_float) {
-    d = d * WAVE_FLOAT_NORMALIZATION;
+    if (opts_.convert_int_float) {
+    d = d * wave_float_normlization;
    }
    mean_square += d * d;
  }
@ -37,12 +64,12 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
  // mean
  mean_square /= samples.size();
  rms_db = 10 * std::log10(mean_square);
-  gain = opts.target_db - rms_db;
+  gain = opts_.target_db - rms_db;

-  if (gain > opts.max_gain_db) {
-    LOG(ERROR) << "Unable to normalize segment to " << opts.target_db << "dB,"
-                << "because the the probable gain have exceeds opts.max_gain_db" 
-                <<  opts.max_gain_db << "dB.";
+  if (gain > opts_.max_gain_db) {
+    LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB,"
+                << "because the the probable gain have exceeds opts_.max_gain_db" 
+                <<  opts_.max_gain_db << "dB.";
    return false;
  }

@ -51,27 +78,28 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
    // python item *= 10.0 ** (gain / 20.0)
    item *= std::pow(10.0, gain / 20.0);
  }
-
+  
+  CopyStdVector2Vector(samples, feat);
  return true;
 }

-
+/*
 PPNormalizer::PPNormalizer(
    const PPNormalizerOptions& opts,
    const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {

 }
                                    
-void PPNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
+void PPNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {

 }

-void PPNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+void PPNormalizer::Read(Vector<BaseFloat>* feat) {

 }

-bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
-                           kaldi::Vector<kaldi::BaseFloat>>* feat) {
+bool PPNormalizer::Compute(const Vector<BaseFloat>& input,
+                           Vector<BaseFloat>>* feat) {
   if ((input.Dim() % mean_.Dim()) == 0) {
        LOG(ERROR) << "CMVN dimension is wrong!";
        return false;
@ -93,4 +121,6 @@ bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
    }

    return true;
-}
+}*/
+
+} // namespace ppspeech
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@ -1,7 +1,9 @@

 #pragma once

+#include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
+#include "kaldi/util/options-itf.h"

 namespace ppspeech {

@ -9,6 +11,7 @@ namespace ppspeech {
 struct DecibelNormalizerOptions {
  float target_db;
  float max_gain_db;
+  bool convert_int_float;
  DecibelNormalizerOptions() :
    target_db(-20),
    max_gain_db(300.0),
@ -23,16 +26,19 @@ struct DecibelNormalizerOptions {

 class DecibelNormalizer : public FeatureExtractorInterface {
  public:
-    explict DecibelNormalizer(const DecibelNormalizerOptions& opts,
-                              const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
-    virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
-    virtual size_t Dim() const;
+    explicit DecibelNormalizer(const DecibelNormalizerOptions& opts);
+    virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
+    virtual size_t Dim() const { return 0; }
    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
-                 kaldi::Vector<kaldi::BaseFloat>>* feat);
+                 kaldi::Vector<kaldi::BaseFloat>* feat) const;
  private:
+    DecibelNormalizerOptions opts_;
+    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
+    kaldi::Vector<kaldi::BaseFloat> waveform_;
 };

+/*
 struct NormalizerOptions {
  std::string mean_std_path;
  NormalizerOptions() :
@ -61,5 +67,5 @@ class PPNormalizer : public FeatureExtractorInterface {
    kaldi::Vector<float> variance_;
    NormalizerOptions _opts;
 };
-
+*/
 }  // namespace ppspeech