From c82cf0d0866dc34e9547fc02ab9e4d42ed16679b Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Mon, 7 Mar 2022 20:51:53 +0800
Subject: [PATCH 1/7] add raw_audio & feature_cache

---
 .../examples/feat/linear-spectrogram-main.cc  | 257 ------------------
 speechx/speechx/frontend/feature_cache.cc     |  38 +++
 speechx/speechx/frontend/feature_cache.h      |  21 ++
 speechx/speechx/frontend/raw_audio.cc         |  60 ++++
 speechx/speechx/frontend/raw_audio.h          |  34 +++
 5 files changed, 153 insertions(+), 257 deletions(-)
 delete mode 100644 speechx/examples/feat/linear-spectrogram-main.cc
 create mode 100644 speechx/speechx/frontend/feature_cache.cc
 create mode 100644 speechx/speechx/frontend/feature_cache.h
 create mode 100644 speechx/speechx/frontend/raw_audio.cc
 create mode 100644 speechx/speechx/frontend/raw_audio.h
diff --git a/speechx/examples/feat/linear-spectrogram-main.cc b/speechx/examples/feat/linear-spectrogram-main.cc
deleted file mode 100644
index 3e2342c2..00000000
--- a/speechx/examples/feat/linear-spectrogram-main.cc
+++ /dev/null
@@ -1,257 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "frontend/feature_extractor_interface.h"
-#include "frontend/linear_spectrogram.h"
-#include "frontend/normalizer.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/kaldi-io.h"
-#include "kaldi/util/table-types.h"
-
-DEFINE_string(wav_rspecifier, "", "test wav path");
-DEFINE_string(feature_wspecifier, "", "test wav ark");
-DEFINE_string(feature_check_wspecifier, "", "test wav ark");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
-
-
-std::vector<float> mean_{
-    -13730251.531853663, -12982852.199316509, -13673844.299583456,
-    -13089406.559646806, -12673095.524938712, -12823859.223276224,
-    -13590267.158903603, -14257618.467152044, -14374605.116185192,
-    -14490009.21822485,  -14849827.158924166, -15354435.470563512,
-    -15834149.206532761, -16172971.985514281, -16348740.496746974,
-    -16423536.699409386, -16556246.263649225, -16744088.772748645,
-    -16916184.08510357,  -17054034.840031497, -17165612.509455364,
-    -17255955.470915023, -17322572.527648456, -17408943.862033736,
-    -17521554.799865916, -17620623.254924215, -17699792.395918526,
-    -17723364.411134344, -17741483.4433254,   -17747426.888704527,
-    -17733315.928209435, -17748780.160905756, -17808336.883775543,
-    -17895918.671983004, -18009812.59173023,  -18098188.66548325,
-    -18195798.958462656, -18293617.62980999,  -18397432.92077201,
-    -18505834.787318766, -18585451.8100908,   -18652438.235649142,
-    -18700960.306275308, -18734944.58792185,  -18737426.313365128,
-    -18735347.165987637, -18738813.444170244, -18737086.848890636,
-    -18731576.2474336,   -18717405.44095871,  -18703089.25545657,
-    -18691014.546456724, -18692460.568905357, -18702119.628629155,
-    -18727710.621126678, -18761582.72034647,  -18806745.835547544,
-    -18850674.8692112,   -18884431.510951452, -18919999.992506847,
-    -18939303.799078144, -18952946.273760635, -18980289.22996379,
-    -19011610.17803294,  -19040948.61805145,  -19061021.429847397,
-    -19112055.53768819,  -19149667.414264943, -19201127.05091321,
-    -19270250.82564605,  -19334606.883057203, -19390513.336589377,
-    -19444176.259208687, -19502755.000038862, -19544333.014549147,
-    -19612668.183176614, -19681902.19006569,  -19771969.951249883,
-    -19873329.723376893, -19996752.59235844,  -20110031.131400537,
-    -20231658.612529557, -20319378.894054495, -20378534.45718066,
-    -20413332.089584175, -20438147.844177883, -20443710.248040095,
-    -20465457.02238927,  -20488610.969337028, -20516295.16424432,
-    -20541423.795738827, -20553192.874953747, -20573605.50701977,
-    -20577871.61936797,  -20571807.008916274, -20556242.38912231,
-    -20542199.30819195,  -20521239.063551214, -20519150.80004532,
-    -20527204.80248933,  -20536933.769257784, -20543470.522332076,
-    -20549700.089992985, -20551525.24958494,  -20554873.406493705,
-    -20564277.65794227,  -20572211.740052115, -20574305.69550465,
-    -20575494.450104576, -20567092.577932164, -20549302.929608088,
-    -20545445.11878376,  -20546625.326603737, -20549190.03499401,
-    -20554824.947828256, -20568341.378989458, -20577582.331383612,
-    -20577980.519402675, -20566603.03458152,  -20560131.592262644,
-    -20552166.469060015, -20549063.06763577,  -20544490.562339947,
-    -20539817.82346569,  -20528747.715731595, -20518026.24576161,
-    -20510977.844974525, -20506874.36087992,  -20506731.11977665,
-    -20510482.133420516, -20507760.92101862,  -20494644.834457114,
-    -20480107.89304893,  -20461312.091867123, -20442941.75080173,
-    -20426123.02834838,  -20424607.675283,    -20426810.369107097,
-    -20434024.50097819,  -20437404.75544205,  -20447688.63916367,
-    -20460893.335563846, -20482922.735127095, -20503610.119434915,
-    -20527062.76448319,  -20557830.035128627, -20593274.72068722,
-    -20632528.452965066, -20673637.471334763, -20733106.97143075,
-    -20842921.0447562,   -21054357.83621519,  -21416569.534189366,
-    -21978460.272811692, -22753170.052172784, -23671344.10563395,
-    -24613499.293358143, -25406477.12230188,  -25884377.82156489,
-    -26049040.62791664,  -26996879.104431007};
-std::vector<float> variance_{
-    213747175.10846674, 188395815.34302503, 212706429.10966414,
-    199109025.81461075, 189235901.23864496, 194901336.53253657,
-    217481594.29306737, 238689869.12327808, 243977501.24115244,
-    248479623.6431067,  259766741.47116545, 275516766.7790273,
-    291271202.3691234,  302693239.8220509,  308627358.3997694,
-    311143911.38788426, 315446105.07731867, 321705430.9341829,
-    327458907.4659941,  332245072.43223983, 336251717.5935284,
-    339694069.7639722,  342188204.4322228,  345587110.31313115,
-    349903086.2875232,  353660214.20643026, 356700344.5270885,
-    357665362.3529641,  358493352.05658793, 358857951.620328,
-    358375239.52774596, 358899733.6342954,  361051818.3511561,
-    364361716.05025816, 368750322.3771452,  372047800.6462831,
-    375655861.1349018,  379358519.1980013,  383327605.3935181,
-    387458599.282341,   390434692.3406868,  392994486.35057056,
-    394874418.04603153, 396230525.79763395, 396365592.0414835,
-    396334819.8242737,  396488353.19250053, 396438877.00744957,
-    396197980.4459586,  395590921.6672991,  395001107.62072515,
-    394528291.7318225,  394593110.424006,   395018405.59353715,
-    396110577.5415993,  397506704.0371068,  399400197.4657644,
-    401243568.2468382,  402687134.7805103,  404136047.2872507,
-    404883170.001883,   405522253.219517,   406660365.3626476,
-    407919346.0991902,  409045348.5384909,  409759588.7889818,
-    411974821.8564483,  413489718.78201455, 415535392.56684107,
-    418466481.97674364, 421104678.35678065, 423405392.5200779,
-    425550570.40798235, 427929423.9579701,  429585274.253478,
-    432368493.55181056, 435193587.13513297, 438886855.20476013,
-    443058876.8633751,  448181232.5093362,  452883835.6332396,
-    458056721.77926534, 461816531.22735566, 464363620.1970998,
-    465886343.5057493,  466928872.0651,     467180536.42647296,
-    468111848.70714295, 469138695.3071312,  470378429.6930793,
-    471517958.7132626,  472109050.4262365,  473087417.0177867,
-    473381322.04648733, 473220195.85483915, 472666071.8998819,
-    472124669.87879956, 471298571.411737,   471251033.2902761,
-    471672676.43128747, 472177147.2193172,  472572361.7711908,
-    472968783.7751127,  473156295.4164052,  473398034.82676554,
-    473897703.5203811,  474328271.33112127, 474452670.98002136,
-    474549003.99284613, 474252887.13567275, 473557462.909069,
-    473483385.85193115, 473609738.04855174, 473746944.82085115,
-    474016729.91696435, 474617321.94138587, 475045097.237122,
-    475125402.586558,   474664112.9824912,  474426247.5800283,
-    474104075.42796475, 473978219.7273978,  473773171.7798875,
-    473578534.69508696, 473102924.16904145, 472651240.5232615,
-    472374383.1810912,  472209479.6956096,  472202298.8921673,
-    472370090.76781124, 472220933.99374026, 471625467.37106377,
-    470994646.51883453, 470182428.9637543,  469348211.5939578,
-    468570387.4467277,  468540442.7225135,  468672018.90414184,
-    468994346.9533251,  469138757.58201426, 469553915.95710236,
-    470134523.38582784, 471082421.62055486, 471962316.51804745,
-    472939745.1708408,  474250621.5944825,  475773933.43199486,
-    477465399.71087736, 479218782.61382693, 481752299.7930922,
-    486608947.8984568,  496119403.2067917,  512730085.5704984,
-    539048915.2641417,  576285298.3548826,  621610270.2240586,
-    669308196.4436442,  710656993.5957186,  736344437.3725077,
-    745481288.0241544,  801121432.9925804};
-int count_ = 912592;
-
-void WriteMatrix() {
-    kaldi::Matrix<double> cmvn_stats(2, mean_.size() + 1);
-    for (size_t idx = 0; idx < mean_.size(); ++idx) {
-        cmvn_stats(0, idx) = mean_[idx];
-        cmvn_stats(1, idx) = variance_[idx];
-    }
-    cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
-}
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
-        FLAGS_wav_rspecifier);
-    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
-    kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer(
-        FLAGS_feature_check_wspecifier);
-    WriteMatrix();
-
-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
-    // window -->linear_spectrogram --> cmvn
-    int32 num_done = 0, num_err = 0;
-    ppspeech::LinearSpectrogramOptions opt;
-    opt.frame_opts.frame_length_ms = 20;
-    opt.frame_opts.frame_shift_ms = 10;
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
-    std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt));
-    ppspeech::LinearSpectrogram linear_spectrogram(
-        opt, std::move(base_feature_extractor));
-
-    ppspeech::CMVN cmvn(FLAGS_cmvn_write_path);
-
-    float streaming_chunk = 0.36;
-    int sample_rate = 16000;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-
-    LOG(INFO) << mean_.size();
-    for (size_t i = 0; i < mean_.size(); i++) {
-        mean_[i] /= count_;
-        variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i];
-        if (variance_[i] < 1.0e-20) {
-            variance_[i] = 1.0e-20;
-        }
-        variance_[i] = 1.0 / std::sqrt(variance_[i]);
-    }
-
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
-        const kaldi::WaveData& wave_data = wav_reader.Value();
-
-        int32 this_channel = 0;
-        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
-                                                    this_channel);
-        int tot_samples = waveform.Dim();
-        int sample_offset = 0;
-        std::vector<kaldi::Matrix<BaseFloat>> feats;
-        int feature_rows = 0;
-        while (sample_offset < tot_samples) {
-            int cur_chunk_size =
-                std::min(chunk_sample_size, tot_samples - sample_offset);
-            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
-            for (int i = 0; i < cur_chunk_size; ++i) {
-                wav_chunk(i) = waveform(sample_offset + i);
-            }
-            kaldi::Matrix<BaseFloat> features;
-            linear_spectrogram.AcceptWaveform(wav_chunk);
-            linear_spectrogram.ReadFeats(&features);
-
-            feats.push_back(features);
-            sample_offset += cur_chunk_size;
-            feature_rows += features.NumRows();
-        }
-
-        int cur_idx = 0;
-        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
-                                                 feats[0].NumCols());
-        for (auto feat : feats) {
-            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
-                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
-                    features(cur_idx, col_idx) =
-                        (feat(row_idx, col_idx) - mean_[col_idx]) *
-                        variance_[col_idx];
-                }
-                ++cur_idx;
-            }
-        }
-        feat_writer.Write(utt, features);
-
-        cur_idx = 0;
-        kaldi::Matrix<kaldi::BaseFloat> features_check(feature_rows,
-                                                       feats[0].NumCols());
-        for (auto feat : feats) {
-            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
-                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
-                    features_check(cur_idx, col_idx) = feat(row_idx, col_idx);
-                }
-                kaldi::SubVector<BaseFloat> row_feat(features_check, cur_idx);
-                cmvn.ApplyCMVN(true, &row_feat);
-                ++cur_idx;
-            }
-        }
-        feat_cmvn_check_writer.Write(utt, features_check);
-
-        if (num_done % 50 == 0 && num_done != 0)
-            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
-        num_done++;
-    }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc
new file mode 100644
index 00000000..07f2cbf7
--- /dev/null
+++ b/speechx/speechx/frontend/feature_cache.cc
@@ -0,0 +1,38 @@
+#include "frontend/feature_cache.h"
+
+void FeatureCache::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
+    base_extractor_->AcceptWaveform(input);  // hand the new input to the wrapped extractor
+    // Then call Compute() for as long as the extractor reports IsLastFrame().
+    while (base_extractor_->IsLastFrame()) {  // NOTE(review): looks inverted (!IsLastFrame?) — confirm
+      Compute();
+    }
+}
+
+// Pops the oldest cached feature chunk into `feat`, blocking while cache_ is empty.
+void FeatureCache::Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) {
+    // condition_variable::wait() only accepts std::unique_lock, which it
+    // releases while blocked and re-acquires before returning.
+    std::unique_lock<std::mutex> lock(mutex_);
+    ready_read_condition_.wait(lock, [this] { return !cache_.empty(); });
+    feat->CopyFromVec(cache_.front());
+    cache_.pop();
+    ready_feed_condition_.notify_one();  // a slot was freed for Compute()
+}
+
+// Reads one chunk of Dim() values from base_extractor_ and queues it in cache_.
+void FeatureCache::Compute() {
+    kaldi::Vector<kaldi::BaseFloat> feature_chunk(base_extractor_->Dim());
+    base_extractor_->Read(&feature_chunk);
+    // unique_lock, not lock_guard: condition_variable::wait() requires it.
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (cache_.size() >= max_size_) {  // cache full: wait for Read() to pop
+        ready_feed_condition_.wait(lock);
+    }
+    cache_.push(feature_chunk);
+    ready_read_condition_.notify_one();  // wake a Read() blocked on an empty cache
+}
+
+// Calls Compute() once more. NOTE(review): feature_cache.h declares SetFinished(), not this name.
+void FeatureCache::InputFinishd() {
+    Compute();  // NOTE(review): finished_ is never set here although the old comment said so
+}
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
new file mode 100644
index 00000000..71dc455c
--- /dev/null
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -0,0 +1,21 @@
+#include "frontend/feature_extractor_interface.h"
+
+class FeatureCache {  // blocking queue of feature chunks read from base_extractor_
+  public:
+    explicit FeatureCache(std::shared_ptr<ppspeech::FeatureExtractorInterface> base_extractor);
+    void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);  // blocks while the cache is empty
+    size_t Dim() { return base_extractor_->Dim(); }  // was `void`, which cannot return a value
+    void SetFinished();
+    bool IsFinished();
+  private:
+    void Compute();  // defined in feature_cache.cc; moves one chunk into cache_
+    bool finished_;
+    mutable std::mutex mutex_;  // guards cache_
+    size_t max_size_;  // capacity checked by Compute(); spelled max_size_ in feature_cache.cc
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> cache_;
+    std::shared_ptr<ppspeech::FeatureExtractorInterface> base_extractor_;
+    std::condition_variable ready_feed_condition_;  // notified by Read() after a pop
+    std::condition_variable ready_read_condition_;  // notified by Compute() after a push
+    DISALLOW_COPY_AND_ASSIGN(FeatureCache);  // NOTE(review): assumes the project macro uses this spelling
+};
\ No newline at end of file
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc
new file mode 100644
index 00000000..f4584828
--- /dev/null
+++ b/speechx/speechx/frontend/raw_audio.cc
@@ -0,0 +1,60 @@
+#include "frontend/raw_audio.h"
+#include "kaldi/base/timer.h"
+
+namespace ppspeech {
+
+// buffer_size's default is given in raw_audio.h only; repeating one here is ill-formed.
+RawAudioSource::RawAudioSource(int buffer_size)
+    : ring_buffer_(buffer_size),  // zero-filled, same as resize() on an empty vector
+      start_(0),                  // initializers follow the member declaration order
+      data_length_(0),
+      finished_(false),
+      timeout_(5) {}
+
+// Appends `data` after the unread samples. todo length > buffer size, condition_var
+bool RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  for (size_t idx = 0; idx < data.Dim(); ++idx)  // write slot = end of unread data, wrapped
+    ring_buffer_[(start_ + data_length_ + idx) % ring_buffer_.size()] = data(idx);
+  data_length_ += data.Dim();  // was `length`, which is not declared in this function
+  return true;  // the bool function previously fell off the end without a value
+}
+
+// todo length > buffer size
+//bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
+  //std::lock_guard<std::mutex> lock(mutex_);
+  //for (size_t idx = 0; idx < length; ++idx) {
+      //ring_buffer_[idx % ring_buffer_.size()] = data[idx];
+  //}
+  //data_length_ += length;
+  //finish_condition_.notify_one();
+//}
+
+// Reads feats->Dim() samples, or fewer once finished_ is set or timeout_ has passed.
+bool RawAudioSource::Read(Vector<BaseFloat>* feats) {
+  size_t chunk_size = feats->Dim();
+  Timer timer;
+  while (chunk_size > data_length_) {  // re-checked, so arriving data ends the wait
+    int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
+    if (finished_ || elapsed > timeout_) {
+      chunk_size = data_length_;  // hand back whatever is buffered
+      feats->Resize(chunk_size);
+      break;
+    }
+    sleep(1);  // NOTE(review): sleeps 1 s while timeout_ is compared in ms — confirm units
+  }
+  std::lock_guard<std::mutex> lock(mutex_);
+  for (size_t idx = 0; idx < chunk_size; ++idx) {
+    feats->Data()[idx] = ring_buffer_[(start_ + idx) % ring_buffer_.size()];  // oldest first
+  }
+  data_length_ -= chunk_size;
+  start_ = (start_ + chunk_size) % ring_buffer_.size();
+  ready_feed_condition_.notify_one();  // finish_condition_ is not a declared member
+  return true;  // previously no value was returned from this bool function
+}
+
+//size_t RawAudioSource::GetDataLength() {
+//  return data_length_;
+//}
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
new file mode 100644
index 00000000..24a4b2e8
--- /dev/null
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -0,0 +1,34 @@
+
+#pragma once
+
+#include "frontend/feature_extractor_interface.h"
+#include "base/common.h"
+
+#pragma once
+
+namespace ppspeech {
+
+class RawAudioSource {  // buffers incoming samples in ring_buffer_ until Read() drains them
+  public:
+    RawAudioSource(int buffer_size = kint16max);
+    virtual void AcceptWaveform(kaldi::BaseFloat* data, int length);
+    bool AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& data);  // the overload raw_audio.cc defines
+    void AcceptWaveformByByte(char* data, int length) {}
+    void AcceptWaveformByShort(kaldi::int16* data, int length) {}
+    // read chunk data in buffer; takes Vector (not VectorBase) because the definition calls Resize()
+    bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    void SetFinished() { finished_ = true; }
+    bool IsFinished() { return finished_; }
+
+  private:
+    std::vector<kaldi::BaseFloat> ring_buffer_;
+    size_t start_;        // index of the oldest unread sample
+    size_t data_length_;  // number of unread samples
+    bool finished_;
+    mutable std::mutex mutex_;  // spelled mutex_ in raw_audio.cc (was mutext_)
+    std::condition_variable ready_read_condition_;
+    std::condition_variable ready_feed_condition_;
+    kaldi::int32 timeout_;  // compared against elapsed milliseconds in Read()
+};
+
+} // namespace ppspeech
\ No newline at end of file

From 8ec6a8c0a8fc521ebe68fca4cde8cd3eb8b538d2 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Mon, 7 Mar 2022 20:57:40 +0800
Subject: [PATCH 2/7] add streaming_feat_main

---
 .../examples/feat/linear_spectrogram_main.cc  | 257 ++++++++++++++++++
 speechx/examples/feat/streaming_feat_main.cc  |  56 ++++
 2 files changed, 313 insertions(+)
 create mode 100644 speechx/examples/feat/linear_spectrogram_main.cc
 create mode 100644 speechx/examples/feat/streaming_feat_main.cc

diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
new file mode 100644
index 00000000..3e2342c2
--- /dev/null
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// todo refactor, repalce with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "frontend/feature_extractor_interface.h"
+#include "frontend/linear_spectrogram.h"
+#include "frontend/normalizer.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/kaldi-io.h"
+#include "kaldi/util/table-types.h"
+
+DEFINE_string(wav_rspecifier, "", "test wav path");
+DEFINE_string(feature_wspecifier, "", "test wav ark");
+DEFINE_string(feature_check_wspecifier, "", "test wav ark");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
+
+
+std::vector<float> mean_{
+    -13730251.531853663, -12982852.199316509, -13673844.299583456,
+    -13089406.559646806, -12673095.524938712, -12823859.223276224,
+    -13590267.158903603, -14257618.467152044, -14374605.116185192,
+    -14490009.21822485,  -14849827.158924166, -15354435.470563512,
+    -15834149.206532761, -16172971.985514281, -16348740.496746974,
+    -16423536.699409386, -16556246.263649225, -16744088.772748645,
+    -16916184.08510357,  -17054034.840031497, -17165612.509455364,
+    -17255955.470915023, -17322572.527648456, -17408943.862033736,
+    -17521554.799865916, -17620623.254924215, -17699792.395918526,
+    -17723364.411134344, -17741483.4433254,   -17747426.888704527,
+    -17733315.928209435, -17748780.160905756, -17808336.883775543,
+    -17895918.671983004, -18009812.59173023,  -18098188.66548325,
+    -18195798.958462656, -18293617.62980999,  -18397432.92077201,
+    -18505834.787318766, -18585451.8100908,   -18652438.235649142,
+    -18700960.306275308, -18734944.58792185,  -18737426.313365128,
+    -18735347.165987637, -18738813.444170244, -18737086.848890636,
+    -18731576.2474336,   -18717405.44095871,  -18703089.25545657,
+    -18691014.546456724, -18692460.568905357, -18702119.628629155,
+    -18727710.621126678, -18761582.72034647,  -18806745.835547544,
+    -18850674.8692112,   -18884431.510951452, -18919999.992506847,
+    -18939303.799078144, -18952946.273760635, -18980289.22996379,
+    -19011610.17803294,  -19040948.61805145,  -19061021.429847397,
+    -19112055.53768819,  -19149667.414264943, -19201127.05091321,
+    -19270250.82564605,  -19334606.883057203, -19390513.336589377,
+    -19444176.259208687, -19502755.000038862, -19544333.014549147,
+    -19612668.183176614, -19681902.19006569,  -19771969.951249883,
+    -19873329.723376893, -19996752.59235844,  -20110031.131400537,
+    -20231658.612529557, -20319378.894054495, -20378534.45718066,
+    -20413332.089584175, -20438147.844177883, -20443710.248040095,
+    -20465457.02238927,  -20488610.969337028, -20516295.16424432,
+    -20541423.795738827, -20553192.874953747, -20573605.50701977,
+    -20577871.61936797,  -20571807.008916274, -20556242.38912231,
+    -20542199.30819195,  -20521239.063551214, -20519150.80004532,
+    -20527204.80248933,  -20536933.769257784, -20543470.522332076,
+    -20549700.089992985, -20551525.24958494,  -20554873.406493705,
+    -20564277.65794227,  -20572211.740052115, -20574305.69550465,
+    -20575494.450104576, -20567092.577932164, -20549302.929608088,
+    -20545445.11878376,  -20546625.326603737, -20549190.03499401,
+    -20554824.947828256, -20568341.378989458, -20577582.331383612,
+    -20577980.519402675, -20566603.03458152,  -20560131.592262644,
+    -20552166.469060015, -20549063.06763577,  -20544490.562339947,
+    -20539817.82346569,  -20528747.715731595, -20518026.24576161,
+    -20510977.844974525, -20506874.36087992,  -20506731.11977665,
+    -20510482.133420516, -20507760.92101862,  -20494644.834457114,
+    -20480107.89304893,  -20461312.091867123, -20442941.75080173,
+    -20426123.02834838,  -20424607.675283,    -20426810.369107097,
+    -20434024.50097819,  -20437404.75544205,  -20447688.63916367,
+    -20460893.335563846, -20482922.735127095, -20503610.119434915,
+    -20527062.76448319,  -20557830.035128627, -20593274.72068722,
+    -20632528.452965066, -20673637.471334763, -20733106.97143075,
+    -20842921.0447562,   -21054357.83621519,  -21416569.534189366,
+    -21978460.272811692, -22753170.052172784, -23671344.10563395,
+    -24613499.293358143, -25406477.12230188,  -25884377.82156489,
+    -26049040.62791664,  -26996879.104431007};
+std::vector<float> variance_{
+    213747175.10846674, 188395815.34302503, 212706429.10966414,
+    199109025.81461075, 189235901.23864496, 194901336.53253657,
+    217481594.29306737, 238689869.12327808, 243977501.24115244,
+    248479623.6431067,  259766741.47116545, 275516766.7790273,
+    291271202.3691234,  302693239.8220509,  308627358.3997694,
+    311143911.38788426, 315446105.07731867, 321705430.9341829,
+    327458907.4659941,  332245072.43223983, 336251717.5935284,
+    339694069.7639722,  342188204.4322228,  345587110.31313115,
+    349903086.2875232,  353660214.20643026, 356700344.5270885,
+    357665362.3529641,  358493352.05658793, 358857951.620328,
+    358375239.52774596, 358899733.6342954,  361051818.3511561,
+    364361716.05025816, 368750322.3771452,  372047800.6462831,
+    375655861.1349018,  379358519.1980013,  383327605.3935181,
+    387458599.282341,   390434692.3406868,  392994486.35057056,
+    394874418.04603153, 396230525.79763395, 396365592.0414835,
+    396334819.8242737,  396488353.19250053, 396438877.00744957,
+    396197980.4459586,  395590921.6672991,  395001107.62072515,
+    394528291.7318225,  394593110.424006,   395018405.59353715,
+    396110577.5415993,  397506704.0371068,  399400197.4657644,
+    401243568.2468382,  402687134.7805103,  404136047.2872507,
+    404883170.001883,   405522253.219517,   406660365.3626476,
+    407919346.0991902,  409045348.5384909,  409759588.7889818,
+    411974821.8564483,  413489718.78201455, 415535392.56684107,
+    418466481.97674364, 421104678.35678065, 423405392.5200779,
+    425550570.40798235, 427929423.9579701,  429585274.253478,
+    432368493.55181056, 435193587.13513297, 438886855.20476013,
+    443058876.8633751,  448181232.5093362,  452883835.6332396,
+    458056721.77926534, 461816531.22735566, 464363620.1970998,
+    465886343.5057493,  466928872.0651,     467180536.42647296,
+    468111848.70714295, 469138695.3071312,  470378429.6930793,
+    471517958.7132626,  472109050.4262365,  473087417.0177867,
+    473381322.04648733, 473220195.85483915, 472666071.8998819,
+    472124669.87879956, 471298571.411737,   471251033.2902761,
+    471672676.43128747, 472177147.2193172,  472572361.7711908,
+    472968783.7751127,  473156295.4164052,  473398034.82676554,
+    473897703.5203811,  474328271.33112127, 474452670.98002136,
+    474549003.99284613, 474252887.13567275, 473557462.909069,
+    473483385.85193115, 473609738.04855174, 473746944.82085115,
+    474016729.91696435, 474617321.94138587, 475045097.237122,
+    475125402.586558,   474664112.9824912,  474426247.5800283,
+    474104075.42796475, 473978219.7273978,  473773171.7798875,
+    473578534.69508696, 473102924.16904145, 472651240.5232615,
+    472374383.1810912,  472209479.6956096,  472202298.8921673,
+    472370090.76781124, 472220933.99374026, 471625467.37106377,
+    470994646.51883453, 470182428.9637543,  469348211.5939578,
+    468570387.4467277,  468540442.7225135,  468672018.90414184,
+    468994346.9533251,  469138757.58201426, 469553915.95710236,
+    470134523.38582784, 471082421.62055486, 471962316.51804745,
+    472939745.1708408,  474250621.5944825,  475773933.43199486,
+    477465399.71087736, 479218782.61382693, 481752299.7930922,
+    486608947.8984568,  496119403.2067917,  512730085.5704984,
+    539048915.2641417,  576285298.3548826,  621610270.2240586,
+    669308196.4436442,  710656993.5957186,  736344437.3725077,
+    745481288.0241544,  801121432.9925804};
+int count_ = 912592;
+
+void WriteMatrix() {
+    kaldi::Matrix<double> cmvn_stats(2, mean_.size() + 1);
+    for (size_t idx = 0; idx < mean_.size(); ++idx) {
+        cmvn_stats(0, idx) = mean_[idx];
+        cmvn_stats(1, idx) = variance_[idx];
+    }
+    cmvn_stats(0, mean_.size()) = count_;
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+}
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
+        FLAGS_wav_rspecifier);
+    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+    kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer(
+        FLAGS_feature_check_wspecifier);
+    WriteMatrix();
+
+    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
+    // window -->linear_spectrogram --> cmvn
+    int32 num_done = 0, num_err = 0;
+    ppspeech::LinearSpectrogramOptions opt;
+    opt.frame_opts.frame_length_ms = 20;
+    opt.frame_opts.frame_shift_ms = 10;
+    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
+        new ppspeech::DecibelNormalizer(db_norm_opt));
+    ppspeech::LinearSpectrogram linear_spectrogram(
+        opt, std::move(base_feature_extractor));
+
+    ppspeech::CMVN cmvn(FLAGS_cmvn_write_path);
+
+    float streaming_chunk = 0.36;
+    int sample_rate = 16000;
+    int chunk_sample_size = streaming_chunk * sample_rate;
+
+    LOG(INFO) << mean_.size();
+    for (size_t i = 0; i < mean_.size(); i++) {
+        mean_[i] /= count_;
+        variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i];
+        if (variance_[i] < 1.0e-20) {
+            variance_[i] = 1.0e-20;
+        }
+        variance_[i] = 1.0 / std::sqrt(variance_[i]);
+    }
+
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string utt = wav_reader.Key();
+        const kaldi::WaveData& wave_data = wav_reader.Value();
+
+        int32 this_channel = 0;
+        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
+                                                    this_channel);
+        int tot_samples = waveform.Dim();
+        int sample_offset = 0;
+        std::vector<kaldi::Matrix<BaseFloat>> feats;
+        int feature_rows = 0;
+        while (sample_offset < tot_samples) {
+            int cur_chunk_size =
+                std::min(chunk_sample_size, tot_samples - sample_offset);
+            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
+            for (int i = 0; i < cur_chunk_size; ++i) {
+                wav_chunk(i) = waveform(sample_offset + i);
+            }
+            kaldi::Matrix<BaseFloat> features;
+            linear_spectrogram.AcceptWaveform(wav_chunk);
+            linear_spectrogram.ReadFeats(&features);
+
+            feats.push_back(features);
+            sample_offset += cur_chunk_size;
+            feature_rows += features.NumRows();
+        }
+
+        int cur_idx = 0;
+        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
+                                                 feats[0].NumCols());
+        for (auto feat : feats) {
+            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
+                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
+                    features(cur_idx, col_idx) =
+                        (feat(row_idx, col_idx) - mean_[col_idx]) *
+                        variance_[col_idx];
+                }
+                ++cur_idx;
+            }
+        }
+        feat_writer.Write(utt, features);
+
+        cur_idx = 0;
+        kaldi::Matrix<kaldi::BaseFloat> features_check(feature_rows,
+                                                       feats[0].NumCols());
+        for (auto feat : feats) {
+            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
+                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
+                    features_check(cur_idx, col_idx) = feat(row_idx, col_idx);
+                }
+                kaldi::SubVector<BaseFloat> row_feat(features_check, cur_idx);
+                cmvn.ApplyCMVN(true, &row_feat);
+                ++cur_idx;
+            }
+        }
+        feat_cmvn_check_writer.Write(utt, features_check);
+
+        if (num_done % 50 == 0 && num_done != 0)
+            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+        num_done++;
+    }
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc
new file mode 100644
index 00000000..29133045
--- /dev/null
+++ b/speechx/examples/feat/streaming_feat_main.cc
@@ -0,0 +1,56 @@
+// todo refactor, repalce with gtest
+
+#include "frontend/linear_spectrogram.h"
+#include "frontend/normalizer.h"
+#include "frontend/feature_extractor_interface.h"
+#include "kaldi/util/table-types.h"
+#include "base/log.h"
+#include "base/flags.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/kaldi-io.h"
+
+DEFINE_string(wav_rspecifier, "", "test wav path");
+DEFINE_string(feature_wspecifier, "", "test wav ark");
+DEFINE_string(cmvn_path, "./cmvn.ark", "test wav ark");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, false);
+  google::InitGoogleLogging(argv[0]);
+  
+  kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(FLAGS_wav_rspecifier);
+  kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+
+  // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn
+  // --> feature_cache
+  int32 num_done = 0, num_err = 0;
+  ppspeech::LinearSpectrogramOptions opt;
+  opt.frame_opts.frame_length_ms = 20;
+  opt.frame_opts.frame_shift_ms = 10;
+  ppspeech::DecibelNormalizerOptions db_norm_opt;
+  std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
+      new ppspeech::DecibelNormalizer(db_norm_opt));
+
+  std::shared_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
+      new ppspeech::LinearSpectrogram(opt, base_feature_extractor));
+
+  std::shared_ptr<ppspeech::FeatureExtractorInterface> cmvn(
+      new ppspeech::CMVN(FLAGS_cmvn_path, linear_spectrogram);
+  ppspeech::FeatureCache(cmvn);
+
+  float streaming_chunk = 0.36;
+  int sample_rate = 16000;
+  int chunk_sample_size = streaming_chunk * sample_rate;
+  // thread 1 feed feature
+
+  for (; !wav_reader.Done(); wav_reader.Next()) {
+    std::string utt = wav_reader.Key();
+    const kaldi::WaveData &wave_data = wav_reader.Value();
+
+    if (num_done % 50 == 0 && num_done != 0)
+    KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+    num_done++;
+  }
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+}

From ac0e417032601e78621b845c1431b173362428d0 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Tue, 8 Mar 2022 20:40:20 +0800
Subject: [PATCH 3/7] make streaming pipeline work

---
 speechx/examples/feat/streaming_feat_main.cc  |  75 ++++-
 speechx/speechx/base/common.h                 |  11 +-
 speechx/speechx/frontend/CMakeLists.txt       |   3 +-
 .../frontend/feature_extractor_interface.h    |   5 +-
 .../speechx/frontend/linear_spectrogram.cc    | 229 +++++++-------
 speechx/speechx/frontend/linear_spectrogram.h |  43 ++-
 speechx/speechx/frontend/normalizer.cc        | 289 +++++++++---------
 speechx/speechx/frontend/normalizer.h         |  70 ++---
 8 files changed, 366 insertions(+), 359 deletions(-)

diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc
index 29133045..b3ee9842 100644
--- a/speechx/examples/feat/streaming_feat_main.cc
+++ b/speechx/examples/feat/streaming_feat_main.cc
@@ -1,17 +1,34 @@
 // todo refactor, repalce with gtest
 
+#include "base/log.h"
+#include "base/flags.h"
 #include "frontend/linear_spectrogram.h"
 #include "frontend/normalizer.h"
 #include "frontend/feature_extractor_interface.h"
+#include "frontend/raw_audio.h"
 #include "kaldi/util/table-types.h"
-#include "base/log.h"
-#include "base/flags.h"
 #include "kaldi/feat/wave-reader.h"
 #include "kaldi/util/kaldi-io.h"
 
 DEFINE_string(wav_rspecifier, "", "test wav path");
 DEFINE_string(feature_wspecifier, "", "test wav ark");
-DEFINE_string(cmvn_path, "./cmvn.ark", "test wav ark");
+DEFINE_string(feature_check_wspecifier, "", "test wav ark");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
+
+
+std::vector<float> mean_{-13730251.531853663, -12982852.199316509, -13673844.299583456, -13089406.559646806, -12673095.524938712, -12823859.223276224, -13590267.158903603, -14257618.467152044, -14374605.116185192, -14490009.21822485, -14849827.158924166, -15354435.470563512, -15834149.206532761, -16172971.985514281, -16348740.496746974, -16423536.699409386, -16556246.263649225, -16744088.772748645, -16916184.08510357, -17054034.840031497, -17165612.509455364, -17255955.470915023, -17322572.527648456, -17408943.862033736, -17521554.799865916, -17620623.254924215, -17699792.395918526, -17723364.411134344, -17741483.4433254, -17747426.888704527, -17733315.928209435, -17748780.160905756, -17808336.883775543, -17895918.671983004, -18009812.59173023, -18098188.66548325, -18195798.958462656, -18293617.62980999, -18397432.92077201, -18505834.787318766, -18585451.8100908, -18652438.235649142, -18700960.306275308, -18734944.58792185, -18737426.313365128, -18735347.165987637, -18738813.444170244, -18737086.848890636, -18731576.2474336, -18717405.44095871, -18703089.25545657, -18691014.546456724, -18692460.568905357, -18702119.628629155, -18727710.621126678, -18761582.72034647, -18806745.835547544, -18850674.8692112, -18884431.510951452, -18919999.992506847, -18939303.799078144, -18952946.273760635, -18980289.22996379, -19011610.17803294, -19040948.61805145, -19061021.429847397, -19112055.53768819, -19149667.414264943, -19201127.05091321, -19270250.82564605, -19334606.883057203, -19390513.336589377, -19444176.259208687, -19502755.000038862, -19544333.014549147, -19612668.183176614, -19681902.19006569, -19771969.951249883, -19873329.723376893, -19996752.59235844, -20110031.131400537, -20231658.612529557, -20319378.894054495, -20378534.45718066, -20413332.089584175, -20438147.844177883, -20443710.248040095, -20465457.02238927, -20488610.969337028, -20516295.16424432, -20541423.795738827, -20553192.874953747, -20573605.50701977, -20577871.61936797, -20571807.008916274, 
-20556242.38912231, -20542199.30819195, -20521239.063551214, -20519150.80004532, -20527204.80248933, -20536933.769257784, -20543470.522332076, -20549700.089992985, -20551525.24958494, -20554873.406493705, -20564277.65794227, -20572211.740052115, -20574305.69550465, -20575494.450104576, -20567092.577932164, -20549302.929608088, -20545445.11878376, -20546625.326603737, -20549190.03499401, -20554824.947828256, -20568341.378989458, -20577582.331383612, -20577980.519402675, -20566603.03458152, -20560131.592262644, -20552166.469060015, -20549063.06763577, -20544490.562339947, -20539817.82346569, -20528747.715731595, -20518026.24576161, -20510977.844974525, -20506874.36087992, -20506731.11977665, -20510482.133420516, -20507760.92101862, -20494644.834457114, -20480107.89304893, -20461312.091867123, -20442941.75080173, -20426123.02834838, -20424607.675283, -20426810.369107097, -20434024.50097819, -20437404.75544205, -20447688.63916367, -20460893.335563846, -20482922.735127095, -20503610.119434915, -20527062.76448319, -20557830.035128627, -20593274.72068722, -20632528.452965066, -20673637.471334763, -20733106.97143075, -20842921.0447562, -21054357.83621519, -21416569.534189366, -21978460.272811692, -22753170.052172784, -23671344.10563395, -24613499.293358143, -25406477.12230188, -25884377.82156489, -26049040.62791664, -26996879.104431007};
+std::vector<float> variance_{213747175.10846674, 188395815.34302503, 212706429.10966414, 199109025.81461075, 189235901.23864496, 194901336.53253657, 217481594.29306737, 238689869.12327808, 243977501.24115244, 248479623.6431067, 259766741.47116545, 275516766.7790273, 291271202.3691234, 302693239.8220509, 308627358.3997694, 311143911.38788426, 315446105.07731867, 321705430.9341829, 327458907.4659941, 332245072.43223983, 336251717.5935284, 339694069.7639722, 342188204.4322228, 345587110.31313115, 349903086.2875232, 353660214.20643026, 356700344.5270885, 357665362.3529641, 358493352.05658793, 358857951.620328, 358375239.52774596, 358899733.6342954, 361051818.3511561, 364361716.05025816, 368750322.3771452, 372047800.6462831, 375655861.1349018, 379358519.1980013, 383327605.3935181, 387458599.282341, 390434692.3406868, 392994486.35057056, 394874418.04603153, 396230525.79763395, 396365592.0414835, 396334819.8242737, 396488353.19250053, 396438877.00744957, 396197980.4459586, 395590921.6672991, 395001107.62072515, 394528291.7318225, 394593110.424006, 395018405.59353715, 396110577.5415993, 397506704.0371068, 399400197.4657644, 401243568.2468382, 402687134.7805103, 404136047.2872507, 404883170.001883, 405522253.219517, 406660365.3626476, 407919346.0991902, 409045348.5384909, 409759588.7889818, 411974821.8564483, 413489718.78201455, 415535392.56684107, 418466481.97674364, 421104678.35678065, 423405392.5200779, 425550570.40798235, 427929423.9579701, 429585274.253478, 432368493.55181056, 435193587.13513297, 438886855.20476013, 443058876.8633751, 448181232.5093362, 452883835.6332396, 458056721.77926534, 461816531.22735566, 464363620.1970998, 465886343.5057493, 466928872.0651, 467180536.42647296, 468111848.70714295, 469138695.3071312, 470378429.6930793, 471517958.7132626, 472109050.4262365, 473087417.0177867, 473381322.04648733, 473220195.85483915, 472666071.8998819, 472124669.87879956, 471298571.411737, 471251033.2902761, 471672676.43128747, 472177147.2193172, 472572361.7711908, 
472968783.7751127, 473156295.4164052, 473398034.82676554, 473897703.5203811, 474328271.33112127, 474452670.98002136, 474549003.99284613, 474252887.13567275, 473557462.909069, 473483385.85193115, 473609738.04855174, 473746944.82085115, 474016729.91696435, 474617321.94138587, 475045097.237122, 475125402.586558, 474664112.9824912, 474426247.5800283, 474104075.42796475, 473978219.7273978, 473773171.7798875, 473578534.69508696, 473102924.16904145, 472651240.5232615, 472374383.1810912, 472209479.6956096, 472202298.8921673, 472370090.76781124, 472220933.99374026, 471625467.37106377, 470994646.51883453, 470182428.9637543, 469348211.5939578, 468570387.4467277, 468540442.7225135, 468672018.90414184, 468994346.9533251, 469138757.58201426, 469553915.95710236, 470134523.38582784, 471082421.62055486, 471962316.51804745, 472939745.1708408, 474250621.5944825, 475773933.43199486, 477465399.71087736, 479218782.61382693, 481752299.7930922, 486608947.8984568, 496119403.2067917, 512730085.5704984, 539048915.2641417, 576285298.3548826, 621610270.2240586, 669308196.4436442, 710656993.5957186, 736344437.3725077, 745481288.0241544, 801121432.9925804};
+int count_ = 912592;
+
+void WriteMatrix() {  // Packs the file-level mean_/variance_/count_ globals into one matrix and writes it to FLAGS_cmvn_write_path.
+  kaldi::Matrix<double> cmvn_stats(2, mean_.size()+ 1);  // 2 rows; one column per mean_ entry plus one extra column for count_
+  for (size_t idx = 0; idx < mean_.size(); ++idx) {  // assumes variance_ has at least mean_.size() entries -- not checked here
+    cmvn_stats(0, idx) = mean_[idx];  // row 0 <- mean_ values
+    cmvn_stats(1, idx) = variance_[idx];  // row 1 <- variance_ values
+  }
+  cmvn_stats(0, mean_.size()) = count_;  // last column of row 0 holds count_; row 1's last column is never assigned in this function
+  kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);  // NOTE(review): third argument is presumably Kaldi's "binary" flag -- confirm
+}
 
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, false);
@@ -19,33 +36,69 @@ int main(int argc, char* argv[]) {
   
   kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(FLAGS_wav_rspecifier);
   kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+  WriteMatrix();
 
   // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn
-  // --> feature_cache
   int32 num_done = 0, num_err = 0;
+  std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new ppspeech::RawDataSource());
+
   ppspeech::LinearSpectrogramOptions opt;
   opt.frame_opts.frame_length_ms = 20;
   opt.frame_opts.frame_shift_ms = 10;
   ppspeech::DecibelNormalizerOptions db_norm_opt;
   std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-      new ppspeech::DecibelNormalizer(db_norm_opt));
+    new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
 
-  std::shared_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
-      new ppspeech::LinearSpectrogram(opt, base_feature_extractor));
+  std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
+    new ppspeech::LinearSpectrogram(opt, std::move(base_feature_extractor)));
 
-  std::shared_ptr<ppspeech::FeatureExtractorInterface> cmvn(
-      new ppspeech::CMVN(FLAGS_cmvn_path, linear_spectrogram);
-  ppspeech::FeatureCache(cmvn);
+  ppspeech::CMVN cmvn(FLAGS_cmvn_write_path, std::move(linear_spectrogram));
 
   float streaming_chunk = 0.36;
   int sample_rate = 16000;
   int chunk_sample_size = streaming_chunk * sample_rate;
-  // thread 1 feed feature
 
   for (; !wav_reader.Done(); wav_reader.Next()) {
     std::string utt = wav_reader.Key();
     const kaldi::WaveData &wave_data = wav_reader.Value();
 
+    int32 this_channel = 0;
+    kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(), this_channel);
+    int tot_samples = waveform.Dim(); 
+    int sample_offset = 0;
+    std::vector<kaldi::Vector<BaseFloat>> feats;
+    int feature_rows = 0;
+    while (sample_offset < tot_samples) {
+      int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset);
+      kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
+      for (int i = 0; i < cur_chunk_size; ++i) {
+        wav_chunk(i) = waveform(sample_offset + i); 
+      }
+      kaldi::Vector<BaseFloat> features;
+      cmvn.AcceptWaveform(wav_chunk);
+      cmvn.Read(&features);
+
+      std::cout << wav_chunk(0) << std::endl;
+      std::cout << features(0) << std::endl;
+
+      feats.push_back(features);
+      sample_offset += cur_chunk_size;
+      feature_rows += features.Dim() / cmvn.Dim();
+    }
+
+    int cur_idx = 0;
+    kaldi::Matrix<kaldi::BaseFloat> features(feature_rows, cmvn.Dim()); 
+    for (auto feat : feats) {
+      int num_rows = feat.Dim() / cmvn.Dim();
+      for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+          for (int col_idx = 0; col_idx < cmvn.Dim(); ++col_idx) {
+              features(cur_idx, col_idx) = feat(row_idx*cmvn.Dim() + col_idx);
+          }
+          ++cur_idx;
+      }
+    }
+    feat_writer.Write(utt, features);
+
     if (num_done % 50 == 0 && num_done != 0)
     KALDI_VLOG(2) << "Processed " << num_done << " utterances";
     num_done++;
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h
index 3b58f73c..ac01a977 100644
--- a/speechx/speechx/base/common.h
+++ b/speechx/speechx/base/common.h
@@ -15,22 +15,23 @@
 #pragma once
 
 #include <deque>
-#include <fstream>
 #include <iostream>
 #include <istream>
+#include <fstream>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <ostream>
 #include <set>
 #include <sstream>
 #include <stack>
 #include <string>
+#include <vector>
 #include <unordered_map>
 #include <unordered_set>
-#include <vector>
+#include <mutex>
+#include <condition_variable>
 
-#include "base/basic_types.h"
-#include "base/flags.h"
 #include "base/log.h"
+#include "base/flags.h"
+#include "base/basic_types.h"
 #include "base/macros.h"
diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt
index da81a481..e43bd182 100644
--- a/speechx/speechx/frontend/CMakeLists.txt
+++ b/speechx/speechx/frontend/CMakeLists.txt
@@ -2,7 +2,8 @@ project(frontend)
 
 add_library(frontend STATIC
   normalizer.cc
-  linear_spectrogram.cc  
+  linear_spectrogram.cc
+  raw_audio.cc
 )
 
 target_link_libraries(frontend PUBLIC kaldi-matrix)
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
index e39f5e46..fc06f24a 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -21,9 +21,8 @@ namespace ppspeech {
 
 class FeatureExtractorInterface {
   public:
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) = 0;
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
     virtual size_t Dim() const = 0;
 };
 
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
index 6c008c39..ed4c2977 100644
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@@ -25,153 +25,146 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-// todo remove later
+//todo remove later
 void CopyVector2StdVector_(const VectorBase<BaseFloat>& input,
-                           vector<BaseFloat>* output) {
-    if (input.Dim() == 0) return;
-    output->resize(input.Dim());
-    for (size_t idx = 0; idx < input.Dim(); ++idx) {
-        (*output)[idx] = input(idx);
-    }
+                          vector<BaseFloat>* output) {
+  if (input.Dim() == 0) return;
+  output->resize(input.Dim());
+  for (size_t idx = 0; idx < input.Dim(); ++idx) {
+    (*output)[idx] = input(idx);
+  }
 }
 
 void CopyStdVector2Vector_(const vector<BaseFloat>& input,
-                           Vector<BaseFloat>* output) {
-    if (input.empty()) return;
-    output->Resize(input.size());
-    for (size_t idx = 0; idx < input.size(); ++idx) {
-        (*output)(idx) = input[idx];
-    }
+                          Vector<BaseFloat>* output) {
+  if (input.empty()) return;
+  output->Resize(input.size());
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    (*output)(idx) = input[idx];
+  }
 }
 
 LinearSpectrogram::LinearSpectrogram(
     const LinearSpectrogramOptions& opts,
     std::unique_ptr<FeatureExtractorInterface> base_extractor) {
-    opts_ = opts;
-    base_extractor_ = std::move(base_extractor);
-    int32 window_size = opts.frame_opts.WindowSize();
-    int32 window_shift = opts.frame_opts.WindowShift();
-    fft_points_ = window_size;
-    hanning_window_.resize(window_size);
-
-    double a = M_2PI / (window_size - 1);
-    hanning_window_energy_ = 0;
-    for (int i = 0; i < window_size; ++i) {
-        hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
-        hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
-    }
-
-    dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
+  opts_ = opts;
+  base_extractor_ = std::move(base_extractor);
+  int32 window_size = opts.frame_opts.WindowSize();
+  int32 window_shift = opts.frame_opts.WindowShift();
+  fft_points_ = window_size;
+  chunk_sample_size_ = 
+      static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
+  hanning_window_.resize(window_size);
+
+  double a = M_2PI / (window_size - 1);
+  hanning_window_energy_ = 0;
+  for (int i = 0; i < window_size; ++i) {
+    hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
+    hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
+  }
+
+  dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }
 
 void LinearSpectrogram::AcceptWaveform(const VectorBase<BaseFloat>& input) {
     base_extractor_->AcceptWaveform(input);
 }
 
+void LinearSpectrogram::Read(Vector<BaseFloat>* feat) {  // Reads from base_extractor_, runs Compute() on it, and stores the frames flattened into *feat.
+  Vector<BaseFloat> input_feats(chunk_sample_size_);  // starts with chunk_sample_size_ elements (set in the constructor from streaming_chunk * samp_freq)
+  base_extractor_->Read(&input_feats);  // NOTE(review): the upstream Read presumably resizes this to what it actually has -- confirm
+  vector<BaseFloat> input_feats_vec(input_feats.Dim());  // std::vector copy, because Compute() takes std::vector
+  CopyVector2StdVector_(input_feats, &input_feats_vec);  // helper returns without touching the output when the input is empty
+  //for (int idx = 0; idx < input_feats.Dim(); ++idx) {
+  //  input_feats_vec[idx] = input_feats(idx); 
+  //}
+  vector<vector<BaseFloat>> result;  // filled by Compute(): one inner vector per frame
+  Compute(input_feats_vec, result);  // leaves result empty when there are fewer samples than one frame length
+  int32 feat_size = 0;  // total number of output values; stays 0 when no frame was produced
+  if (result.size() != 0) {  // guard so result[0] is only read when a frame exists
+    feat_size = result.size() * result[0].size();  // frames * values per frame
+  }
+  feat->Resize(feat_size);  // output becomes empty when feat_size is 0
+  for (size_t idx = 0; idx < feat_size; ++idx) {  // note: compares size_t idx against int32 feat_size
+    (*feat)(idx) = result[idx / dim_][idx % dim_];  // frame-major flattening: idx / dim_ selects the frame, idx % dim_ the bin
+  }
+  return;  // NOTE(review): nothing here keeps leftover samples for the next call, so frames seem to be computed per chunk -- confirm
+}
+
 void LinearSpectrogram::Hanning(vector<float>* data) const {
-    CHECK_GE(data->size(), hanning_window_.size());
+  CHECK_GE(data->size(), hanning_window_.size());
 
-    for (size_t i = 0; i < hanning_window_.size(); ++i) {
-        data->at(i) *= hanning_window_[i];
-    }
+  for (size_t i = 0; i < hanning_window_.size(); ++i) {
+      data->at(i) *= hanning_window_[i];
+  }
 }
 
 bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                  vector<BaseFloat>* real,
                                  vector<BaseFloat>* img) const {
-    Vector<BaseFloat> v_tmp;
-    CopyStdVector2Vector_(*v, &v_tmp);
-    RealFft(&v_tmp, true);
-    CopyVector2StdVector_(v_tmp, v);
-    real->push_back(v->at(0));
-    img->push_back(0);
-    for (int i = 1; i < v->size() / 2; i++) {
-        real->push_back(v->at(2 * i));
-        img->push_back(v->at(2 * i + 1));
-    }
-    real->push_back(v->at(1));
-    img->push_back(0);
-
-    return true;
-}
-
-// todo remove later
-void LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) {
-    Vector<BaseFloat> tmp;
-    waveform_.Resize(base_extractor_->Dim());
-    Compute(tmp, &waveform_);
-    vector<vector<BaseFloat>> result;
-    vector<BaseFloat> feats_vec;
-    CopyVector2StdVector_(waveform_, &feats_vec);
-    Compute(feats_vec, result);
-    feats->Resize(result.size(), result[0].size());
-    for (int row_idx = 0; row_idx < result.size(); ++row_idx) {
-        for (int col_idx = 0; col_idx < result[0].size(); ++col_idx) {
-            (*feats)(row_idx, col_idx) = result[row_idx][col_idx];
-        }
-    }
-    waveform_.Resize(0);
-}
-
-void LinearSpectrogram::Read(VectorBase<BaseFloat>* feat) {
-    // todo
-    return;
-}
-
-// only for test, remove later
-// todo: compute the feature frame by frame.
-void LinearSpectrogram::Compute(const VectorBase<kaldi::BaseFloat>& input,
-                                VectorBase<kaldi::BaseFloat>* feature) {
-    base_extractor_->Read(feature);
+  Vector<BaseFloat> v_tmp;
+  CopyStdVector2Vector_(*v, &v_tmp);
+  RealFft(&v_tmp, true);
+  CopyVector2StdVector_(v_tmp, v);
+  real->push_back(v->at(0));
+  img->push_back(0);
+  for (int i = 1; i < v->size() / 2; i++) {
+    real->push_back(v->at(2 * i));
+    img->push_back(v->at(2 * i + 1));
+  }
+  real->push_back(v->at(1));
+  img->push_back(0);
+
+  return true;
 }
 
 // Compute spectrogram feat, only for test, remove later
 // todo: refactor later (SmileGoat)
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                 vector<vector<float>>& feat) {
-    int num_samples = wave.size();
-    const int& frame_length = opts_.frame_opts.WindowSize();
-    const int& sample_rate = opts_.frame_opts.samp_freq;
-    const int& frame_shift = opts_.frame_opts.WindowShift();
-    const int& fft_points = fft_points_;
-    const float scale = hanning_window_energy_ * sample_rate;
-
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
-    feat.resize(num_frames);
-    vector<float> fft_real((fft_points_ / 2 + 1), 0);
-    vector<float> fft_img((fft_points_ / 2 + 1), 0);
-    vector<float> v(frame_length, 0);
-    vector<float> power((fft_points / 2 + 1));
-
-    for (int i = 0; i < num_frames; ++i) {
-        vector<float> data(wave.data() + i * frame_shift,
-                           wave.data() + i * frame_shift + frame_length);
-        Hanning(&data);
-        fft_img.clear();
-        fft_real.clear();
-        v.assign(data.begin(), data.end());
-        NumpyFft(&v, &fft_real, &fft_img);
-
-        feat[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
-        for (int j = 0; j < (fft_points / 2 + 1); ++j) {
-            power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
-            feat[i][j] = power[j];
-
-            if (j == 0 || j == feat[0].size() - 1) {
-                feat[i][j] /= scale;
-            } else {
-                feat[i][j] *= (2.0 / scale);
-            }
-
-            // log added eps=1e-14
-            feat[i][j] = std::log(feat[i][j] + 1e-14);
-        }
+  int num_samples = wave.size();
+  const int& frame_length = opts_.frame_opts.WindowSize();
+  const int& sample_rate = opts_.frame_opts.samp_freq;
+  const int& frame_shift = opts_.frame_opts.WindowShift();
+  const int& fft_points = fft_points_;
+  const float scale = hanning_window_energy_ * sample_rate;
+
+  if (num_samples < frame_length) {
+          return true;
+  }
+
+  int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
+  feat.resize(num_frames);
+  vector<float> fft_real((fft_points_ / 2 + 1), 0);
+  vector<float> fft_img((fft_points_ / 2 + 1), 0);
+  vector<float> v(frame_length, 0);
+  vector<float> power((fft_points / 2 + 1));
+
+  for (int i = 0; i < num_frames; ++i) {
+    vector<float> data(wave.data() + i * frame_shift,
+                       wave.data() + i * frame_shift + frame_length);
+    Hanning(&data);
+    fft_img.clear();
+    fft_real.clear();
+    v.assign(data.begin(), data.end());
+    NumpyFft(&v, &fft_real, &fft_img);
+
+    feat[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
+    for (int j = 0; j < (fft_points / 2 + 1); ++j) {
+      power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
+      feat[i][j] = power[j];
+
+      if (j == 0 || j == feat[0].size() - 1) {
+          feat[i][j] /= scale;
+      } else {
+          feat[i][j] *= (2.0 / scale);
+      }
+
+      // log added eps=1e-14
+      feat[i][j] = std::log(feat[i][j] + 1e-14);
     }
-    return true;
+  }
+  return true;
 }
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h
index 20b5e4b5..e4dc3e33 100644
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -1,45 +1,35 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 
 #pragma once
 
-#include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 #include "kaldi/feat/feature-window.h"
+#include "base/common.h"
 
 namespace ppspeech {
 
 struct LinearSpectrogramOptions {
     kaldi::FrameExtractionOptions frame_opts;
-    LinearSpectrogramOptions() : frame_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) { frame_opts.Register(opts); }
+    kaldi::BaseFloat streaming_chunk;
+    LinearSpectrogramOptions():
+        streaming_chunk(0.36),
+        frame_opts() {}
+
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("streaming-chunk", &streaming_chunk, "streaming chunk size");
+        frame_opts.Register(opts);
+    }
 };
 
 class LinearSpectrogram : public FeatureExtractorInterface {
   public:
-    explicit LinearSpectrogram(
-        const LinearSpectrogramOptions& opts,
-        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
+    explicit LinearSpectrogram(const LinearSpectrogramOptions& opts,
+                               std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
     virtual size_t Dim() const { return dim_; }
     void ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);
 
-  private:
+  private: 
     void Hanning(std::vector<kaldi::BaseFloat>* data) const;
     bool Compute(const std::vector<kaldi::BaseFloat>& wave,
                  std::vector<std::vector<kaldi::BaseFloat>>& feat);
@@ -54,8 +44,9 @@ class LinearSpectrogram : public FeatureExtractorInterface {
     std::vector<kaldi::BaseFloat> hanning_window_;
     kaldi::BaseFloat hanning_window_energy_;
     LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> waveform_;  // remove later, todo(SmileGoat)
+    kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
+    int chunk_sample_size_;
     DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
 };
 
diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc
index abf798e5..69c9ab59 100644
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@@ -1,17 +1,3 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 
 #include "frontend/normalizer.h"
 #include "kaldi/feat/cmvn.h"
@@ -24,175 +10,176 @@ using kaldi::VectorBase;
 using kaldi::BaseFloat;
 using std::vector;
 using kaldi::SubVector;
+using std::unique_ptr;
 
-DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts) {
-    opts_ = opts;
-    dim_ = 0;
+DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts,
+    std::unique_ptr<FeatureExtractorInterface> base_extractor) {
+  base_extractor_ = std::move(base_extractor);
+  opts_ = opts;
+  dim_ = 0;
 }
-
-void DecibelNormalizer::AcceptWaveform(
-    const kaldi::VectorBase<BaseFloat>& input) {
-    dim_ = input.Dim();
-    waveform_.Resize(input.Dim());
-    waveform_.CopyFromVec(input);
+                                    
+void DecibelNormalizer::AcceptWaveform(const kaldi::VectorBase<BaseFloat>& input) {
+  //dim_ = input.Dim();
+  //waveform_.Resize(input.Dim());
+  //waveform_.CopyFromVec(input);
+  base_extractor_->AcceptWaveform(input);
 }
 
-void DecibelNormalizer::Read(kaldi::VectorBase<BaseFloat>* feat) {
-    if (waveform_.Dim() == 0) return;
-    Compute(waveform_, feat);
+void DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) {
+  // if (waveform_.Dim() == 0) return;
+  base_extractor_->Read(feat);
+  Compute(feat);
 }
 
-// todo remove later
+//todo remove later
 void CopyVector2StdVector(const kaldi::VectorBase<BaseFloat>& input,
                           vector<BaseFloat>* output) {
-    if (input.Dim() == 0) return;
-    output->resize(input.Dim());
-    for (size_t idx = 0; idx < input.Dim(); ++idx) {
-        (*output)[idx] = input(idx);
-    }
+  if (input.Dim() == 0) return;
+  output->resize(input.Dim());
+  for (size_t idx = 0; idx < input.Dim(); ++idx) {
+    (*output)[idx] = input(idx);
+  }
 }
 
 void CopyStdVector2Vector(const vector<BaseFloat>& input,
                           VectorBase<BaseFloat>* output) {
-    if (input.empty()) return;
-    assert(input.size() == output->Dim());
-    for (size_t idx = 0; idx < input.size(); ++idx) {
-        (*output)(idx) = input[idx];
-    }
+  if (input.empty()) return;
+  assert(input.size() == output->Dim());
+  for (size_t idx = 0; idx < input.size(); ++idx) {
+    (*output)(idx) = input[idx];
+  }
 }
 
-bool DecibelNormalizer::Compute(const VectorBase<BaseFloat>& input,
-                                VectorBase<BaseFloat>* feat) const {
-    // calculate db rms
-    BaseFloat rms_db = 0.0;
-    BaseFloat mean_square = 0.0;
-    BaseFloat gain = 0.0;
-    BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
-
-    vector<BaseFloat> samples;
-    samples.resize(input.Dim());
-    for (int32 i = 0; i < samples.size(); ++i) {
-        samples[i] = input(i);
-    }
-
-    // square
-    for (auto& d : samples) {
-        if (opts_.convert_int_float) {
-            d = d * wave_float_normlization;
-        }
-        mean_square += d * d;
+bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
+  // calculate db rms
+  BaseFloat rms_db = 0.0;
+  BaseFloat mean_square = 0.0;
+  BaseFloat gain = 0.0;
+  BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
+
+  vector<BaseFloat> samples;
+  samples.resize(feat->Dim());
+  for (size_t i = 0; i < samples.size(); ++i) {
+    samples[i] = (*feat)(i);
+  }
+  
+  // square
+  for (auto &d : samples) {
+    if (opts_.convert_int_float) {
+    d = d * wave_float_normlization;
     }
-
-    // mean
-    mean_square /= samples.size();
-    rms_db = 10 * std::log10(mean_square);
-    gain = opts_.target_db - rms_db;
-
-    if (gain > opts_.max_gain_db) {
-        LOG(ERROR)
-            << "Unable to normalize segment to " << opts_.target_db << "dB,"
-            << "because the the probable gain have exceeds opts_.max_gain_db"
-            << opts_.max_gain_db << "dB.";
-        return false;
-    }
-
-    // Note that this is an in-place transformation.
-    for (auto& item : samples) {
-        // python item *= 10.0 ** (gain / 20.0)
-        item *= std::pow(10.0, gain / 20.0);
-    }
-
-    CopyStdVector2Vector(samples, feat);
-    return true;
+    mean_square += d * d;
+  }
+
+  // mean
+  mean_square /= samples.size();
+  rms_db = 10 * std::log10(mean_square);
+  gain = opts_.target_db - rms_db;
+
+  if (gain > opts_.max_gain_db) {
+    LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB,"
+                << "because the the probable gain have exceeds opts_.max_gain_db" 
+                <<  opts_.max_gain_db << "dB.";
+    return false;
+  }
+
+  // Note that this is an in-place transformation.
+  for (auto &item : samples) {
+    // python item *= 10.0 ** (gain / 20.0)
+    item *= std::pow(10.0, gain / 20.0);
+  }
+  
+  CopyStdVector2Vector(samples, feat);
+  return true;
 }
 
-CMVN::CMVN(std::string cmvn_file) : var_norm_(true) {
+CMVN::CMVN(std::string cmvn_file, 
+           unique_ptr<FeatureExtractorInterface> base_extractor) 
+    : var_norm_(true) {
+    base_extractor_ = std::move(base_extractor);
     bool binary;
     kaldi::Input ki(cmvn_file, &binary);
     stats_.Read(ki.Stream(), binary);
+    dim_ = stats_.NumCols() - 1;
 }
 
 void CMVN::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
+    base_extractor_->AcceptWaveform(input);
     return;
 }
 
-void CMVN::Read(kaldi::VectorBase<BaseFloat>* feat) { return; }
+void CMVN::Read(kaldi::Vector<BaseFloat>* feat) {
+    base_extractor_->Read(feat);
+    Compute(feat);
+    return;
+}
 
 // feats contain num_frames feature.
-void CMVN::ApplyCMVN(bool var_norm, VectorBase<BaseFloat>* feats) {
-    KALDI_ASSERT(feats != NULL);
-    int32 dim = stats_.NumCols() - 1;
-    if (stats_.NumRows() > 2 || stats_.NumRows() < 1 ||
-        feats->Dim() % dim != 0) {
-        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x'
-                  << stats_.NumCols() << ", feats " << feats->Dim() << 'x';
+void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
+  KALDI_ASSERT(feats != NULL); 
+  int32 dim = stats_.NumCols() - 1;
+  if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim != 0) {
+    KALDI_ERR << "Dim mismatch: cmvn "
+              << stats_.NumRows() << 'x' << stats_.NumCols()
+              << ", feats " << feats->Dim() << 'x';
+  }
+  if (stats_.NumRows() == 1 && var_norm_) {
+    KALDI_ERR << "You requested variance normalization but no variance stats_ "
+              << "are supplied.";
+  }
+
+  double count = stats_(0, dim);
+  // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
+  // computing an offset and representing it as stats_, we use a count of one.
+  if (count < 1.0)
+    KALDI_ERR << "Insufficient stats_ for cepstral mean and variance normalization: "
+              << "count = " << count;
+
+  if (!var_norm_) {
+    Vector<BaseFloat> offset(feats->Dim());
+    SubVector<double> mean_stats(stats_.RowData(0), dim);
+    Vector<double> mean_stats_apply(feats->Dim());
+    //fill the datat of mean_stats in mean_stats_appy whose dim is equal with the dim of feature.
+    //the dim of feats = dim * num_frames;
+    for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
+      SubVector<double> stats_tmp(mean_stats_apply.Data() + dim*idx, dim);
+      stats_tmp.CopyFromVec(mean_stats);
     }
-    if (stats_.NumRows() == 1 && var_norm) {
-        KALDI_ERR
-            << "You requested variance normalization but no variance stats_ "
-            << "are supplied.";
-    }
-
-    double count = stats_(0, dim);
-    // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
-    // computing an offset and representing it as stats_, we use a count of one.
-    if (count < 1.0)
-        KALDI_ERR << "Insufficient stats_ for cepstral mean and variance "
-                     "normalization: "
-                  << "count = " << count;
-
-    if (!var_norm) {
-        Vector<BaseFloat> offset(feats->Dim());
-        SubVector<double> mean_stats(stats_.RowData(0), dim);
-        Vector<double> mean_stats_apply(feats->Dim());
-        // fill the datat of mean_stats in mean_stats_appy whose dim is equal
-        // with the dim of feature.
-        // the dim of feats = dim * num_frames;
-        for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
-            SubVector<double> stats_tmp(mean_stats_apply.Data() + dim * idx,
-                                        dim);
-            stats_tmp.CopyFromVec(mean_stats);
-        }
-        offset.AddVec(-1.0 / count, mean_stats_apply);
-        feats->AddVec(1.0, offset);
-        return;
+    offset.AddVec(-1.0 / count, mean_stats_apply);
+    feats->AddVec(1.0, offset);
+    return;
+  }
+  // norm(0, d) = mean offset;
+  // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
+  kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
+  for (int32 d = 0; d < dim; d++) {
+    double mean, offset, scale;
+    mean = stats_(0, d)/count;
+    double var = (stats_(1, d)/count) - mean*mean,
+        floor = 1.0e-20;
+    if (var < floor) {
+      KALDI_WARN << "Flooring cepstral variance from " << var << " to "
+                 << floor;
+      var = floor;
     }
-    // norm(0, d) = mean offset;
-    // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
-    kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
-    for (int32 d = 0; d < dim; d++) {
-        double mean, offset, scale;
-        mean = stats_(0, d) / count;
-        double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20;
-        if (var < floor) {
-            KALDI_WARN << "Flooring cepstral variance from " << var << " to "
-                       << floor;
-            var = floor;
-        }
-        scale = 1.0 / sqrt(var);
-        if (scale != scale || 1 / scale == 0.0)
-            KALDI_ERR
-                << "NaN or infinity in cepstral mean/variance computation";
-        offset = -(mean * scale);
-        for (int32 d_skip = d; d_skip < feats->Dim();) {
-            norm(0, d_skip) = offset;
-            norm(1, d_skip) = scale;
-            d_skip = d_skip + dim;
-        }
+    scale = 1.0 / sqrt(var);
+    if (scale != scale || 1/scale == 0.0)
+      KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
+    offset = -(mean*scale);
+    for (int32 d_skip = d; d_skip < feats->Dim();) {
+      norm(0, d_skip) = offset;
+      norm(1, d_skip) = scale;
+      d_skip = d_skip + dim;
     }
-    // Apply the normalization.
-    feats->MulElements(norm.Row(1));
-    feats->AddVec(1.0, norm.Row(0));
+  }
+  // Apply the normalization.
+  feats->MulElements(norm.Row(1));
+  feats->AddVec(1.0, norm.Row(0));
 }
 
-void CMVN::ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase<BaseFloat>* feats) {
-    ApplyCmvn(stats_, var_norm, feats);
+void CMVN::ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats) {
+  ApplyCmvn(stats_, var_norm_, feats);
 }
 
-bool CMVN::Compute(const VectorBase<BaseFloat>& input,
-                   VectorBase<BaseFloat>* feat) const {
-    return false;
-}
-
-
-}  // namespace ppspeech
+} // namespace ppspeech
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h
index 6af5cdd8..13c5b8df 100644
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@@ -1,56 +1,40 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 
 #pragma once
 
 #include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
-#include "kaldi/matrix/kaldi-matrix.h"
 #include "kaldi/util/options-itf.h"
+#include "kaldi/matrix/kaldi-matrix.h"
 
 namespace ppspeech {
 
 struct DecibelNormalizerOptions {
-    float target_db;
-    float max_gain_db;
-    bool convert_int_float;
-    DecibelNormalizerOptions()
-        : target_db(-20), max_gain_db(300.0), convert_int_float(false) {}
+  float target_db;
+  float max_gain_db;
+  bool convert_int_float;
+  DecibelNormalizerOptions() :
+    target_db(-20),
+    max_gain_db(300.0),
+    convert_int_float(false){}
 
     void Register(kaldi::OptionsItf* opts) {
-        opts->Register(
-            "target-db", &target_db, "target db for db normalization");
-        opts->Register(
-            "max-gain-db", &max_gain_db, "max gain db for db normalization");
-        opts->Register("convert-int-float",
-                       &convert_int_float,
-                       "if convert int samples to float");
+      opts->Register("target-db", &target_db, "target db for db normalization");
+      opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization");
+      opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float");
     }
 };
 
 class DecibelNormalizer : public FeatureExtractorInterface {
   public:
-    explicit DecibelNormalizer(const DecibelNormalizerOptions& opts);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
+    explicit DecibelNormalizer(
+        const DecibelNormalizerOptions& opts,
+        std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
     virtual size_t Dim() const { return dim_; }
-    bool Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                 kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
 
   private:
+    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
     DecibelNormalizerOptions opts_;
     size_t dim_;
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
@@ -60,20 +44,18 @@ class DecibelNormalizer : public FeatureExtractorInterface {
 
 class CMVN : public FeatureExtractorInterface {
   public:
-    explicit CMVN(std::string cmvn_file);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
-    virtual size_t Dim() const { return stats_.NumCols() - 1; }
-    bool Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                 kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
-    // for test
-    void ApplyCMVN(bool var_norm, kaldi::VectorBase<BaseFloat>* feats);
-    void ApplyCMVNMatrix(bool var_norm, kaldi::MatrixBase<BaseFloat>* feats);
+    explicit CMVN(
+        std::string cmvn_file,
+        std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual size_t Dim() const { return dim_; }
 
   private:
+    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
+    void ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats);
     kaldi::Matrix<double> stats_;
-    std::shared_ptr<FeatureExtractorInterface> base_extractor_;
+    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
     size_t dim_;
     bool var_norm_;
 };

From c769d9078124d2419cf72493945ea9f75ef7f300 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Wed, 9 Mar 2022 21:01:39 +0800
Subject: [PATCH 4/7] make feature cache & raw audio work

---
 speechx/examples/feat/CMakeLists.txt          |   6 +-
 .../examples/feat/linear_spectrogram_main.cc  |  76 +++--
 speechx/examples/feat/streaming_feat_main.cc  | 109 -------
 speechx/speechx/base/common.h                 |  13 +-
 speechx/speechx/frontend/CMakeLists.txt       |   3 +-
 speechx/speechx/frontend/feature_cache.cc     |  78 ++++-
 speechx/speechx/frontend/feature_cache.h      |  51 +++-
 .../frontend/feature_extractor_interface.h    |   8 +-
 .../speechx/frontend/linear_spectrogram.cc    | 217 +++++++-------
 speechx/speechx/frontend/linear_spectrogram.h |  40 ++-
 speechx/speechx/frontend/normalizer.cc        | 280 ++++++++++--------
 speechx/speechx/frontend/normalizer.h         |  57 ++--
 speechx/speechx/frontend/raw_audio.cc         | 119 +++++---
 speechx/speechx/frontend/raw_audio.h          |  71 ++++-
 14 files changed, 610 insertions(+), 518 deletions(-)
 delete mode 100644 speechx/examples/feat/streaming_feat_main.cc

diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt
index 44738e60..b8f516af 100644
--- a/speechx/examples/feat/CMakeLists.txt
+++ b/speechx/examples/feat/CMakeLists.txt
@@ -5,6 +5,6 @@ add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc)
 target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
 target_link_libraries(mfcc-test kaldi-mfcc)
 
-add_executable(linear-spectrogram-main ${CMAKE_CURRENT_SOURCE_DIR}/linear-spectrogram-main.cc)
-target_include_directories(linear-spectrogram-main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(linear-spectrogram-main frontend kaldi-util kaldi-feat-common gflags glog)
\ No newline at end of file
+add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc)
+target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog)
\ No newline at end of file
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index 3e2342c2..c2ca6187 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -14,11 +14,13 @@
 
 // todo refactor, repalce with gtest
 
+#include "frontend/linear_spectrogram.h"
 #include "base/flags.h"
 #include "base/log.h"
+#include "frontend/feature_cache.h"
 #include "frontend/feature_extractor_interface.h"
-#include "frontend/linear_spectrogram.h"
 #include "frontend/normalizer.h"
+#include "frontend/raw_audio.h"
 #include "kaldi/feat/wave-reader.h"
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
@@ -158,38 +160,37 @@ int main(int argc, char* argv[]) {
     kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
         FLAGS_wav_rspecifier);
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
-    kaldi::BaseFloatMatrixWriter feat_cmvn_check_writer(
-        FLAGS_feature_check_wspecifier);
     WriteMatrix();
 
     // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
     // window -->linear_spectrogram --> cmvn
     int32 num_done = 0, num_err = 0;
+    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
+    // ppspeech::RawDataSource());
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
+        new ppspeech::RawAudioSource());
+
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
     ppspeech::DecibelNormalizerOptions db_norm_opt;
     std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt));
-    ppspeech::LinearSpectrogram linear_spectrogram(
-        opt, std::move(base_feature_extractor));
+        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
+        new ppspeech::LinearSpectrogram(opt,
+                                        std::move(base_feature_extractor)));
 
-    ppspeech::CMVN cmvn(FLAGS_cmvn_write_path);
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn(
+        new ppspeech::CMVN(FLAGS_cmvn_write_path,
+                           std::move(linear_spectrogram)));
+
+    ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
 
     float streaming_chunk = 0.36;
     int sample_rate = 16000;
     int chunk_sample_size = streaming_chunk * sample_rate;
 
-    LOG(INFO) << mean_.size();
-    for (size_t i = 0; i < mean_.size(); i++) {
-        mean_[i] /= count_;
-        variance_[i] = variance_[i] / count_ - mean_[i] * mean_[i];
-        if (variance_[i] < 1.0e-20) {
-            variance_[i] = 1.0e-20;
-        }
-        variance_[i] = 1.0 / std::sqrt(variance_[i]);
-    }
-
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
@@ -199,54 +200,45 @@ int main(int argc, char* argv[]) {
                                                     this_channel);
         int tot_samples = waveform.Dim();
         int sample_offset = 0;
-        std::vector<kaldi::Matrix<BaseFloat>> feats;
+        std::vector<kaldi::Vector<BaseFloat>> feats;
         int feature_rows = 0;
         while (sample_offset < tot_samples) {
             int cur_chunk_size =
                 std::min(chunk_sample_size, tot_samples - sample_offset);
+
             kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
-            kaldi::Matrix<BaseFloat> features;
-            linear_spectrogram.AcceptWaveform(wav_chunk);
-            linear_spectrogram.ReadFeats(&features);
+            kaldi::Vector<BaseFloat> features;
+            feature_cache.AcceptWaveform(wav_chunk);
+            if (cur_chunk_size < chunk_sample_size) {
+                feature_cache.SetFinished();
+            }
+            feature_cache.Read(&features);
+            if (features.Dim() == 0) break;
 
             feats.push_back(features);
             sample_offset += cur_chunk_size;
-            feature_rows += features.NumRows();
+            feature_rows += features.Dim() / feature_cache.Dim();
         }
 
         int cur_idx = 0;
         kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
-                                                 feats[0].NumCols());
+                                                 feature_cache.Dim());
         for (auto feat : feats) {
-            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
-                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
+            int num_rows = feat.Dim() / feature_cache.Dim();
+            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
+                     ++col_idx) {
                     features(cur_idx, col_idx) =
-                        (feat(row_idx, col_idx) - mean_[col_idx]) *
-                        variance_[col_idx];
+                        feat(row_idx * feature_cache.Dim() + col_idx);
                 }
                 ++cur_idx;
             }
         }
         feat_writer.Write(utt, features);
 
-        cur_idx = 0;
-        kaldi::Matrix<kaldi::BaseFloat> features_check(feature_rows,
-                                                       feats[0].NumCols());
-        for (auto feat : feats) {
-            for (int row_idx = 0; row_idx < feat.NumRows(); ++row_idx) {
-                for (int col_idx = 0; col_idx < feat.NumCols(); ++col_idx) {
-                    features_check(cur_idx, col_idx) = feat(row_idx, col_idx);
-                }
-                kaldi::SubVector<BaseFloat> row_feat(features_check, cur_idx);
-                cmvn.ApplyCMVN(true, &row_feat);
-                ++cur_idx;
-            }
-        }
-        feat_cmvn_check_writer.Write(utt, features_check);
-
         if (num_done % 50 == 0 && num_done != 0)
             KALDI_VLOG(2) << "Processed " << num_done << " utterances";
         num_done++;
diff --git a/speechx/examples/feat/streaming_feat_main.cc b/speechx/examples/feat/streaming_feat_main.cc
deleted file mode 100644
index b3ee9842..00000000
--- a/speechx/examples/feat/streaming_feat_main.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// todo refactor, repalce with gtest
-
-#include "base/log.h"
-#include "base/flags.h"
-#include "frontend/linear_spectrogram.h"
-#include "frontend/normalizer.h"
-#include "frontend/feature_extractor_interface.h"
-#include "frontend/raw_audio.h"
-#include "kaldi/util/table-types.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/kaldi-io.h"
-
-DEFINE_string(wav_rspecifier, "", "test wav path");
-DEFINE_string(feature_wspecifier, "", "test wav ark");
-DEFINE_string(feature_check_wspecifier, "", "test wav ark");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
-
-
-std::vector<float> mean_{-13730251.531853663, -12982852.199316509, -13673844.299583456, -13089406.559646806, -12673095.524938712, -12823859.223276224, -13590267.158903603, -14257618.467152044, -14374605.116185192, -14490009.21822485, -14849827.158924166, -15354435.470563512, -15834149.206532761, -16172971.985514281, -16348740.496746974, -16423536.699409386, -16556246.263649225, -16744088.772748645, -16916184.08510357, -17054034.840031497, -17165612.509455364, -17255955.470915023, -17322572.527648456, -17408943.862033736, -17521554.799865916, -17620623.254924215, -17699792.395918526, -17723364.411134344, -17741483.4433254, -17747426.888704527, -17733315.928209435, -17748780.160905756, -17808336.883775543, -17895918.671983004, -18009812.59173023, -18098188.66548325, -18195798.958462656, -18293617.62980999, -18397432.92077201, -18505834.787318766, -18585451.8100908, -18652438.235649142, -18700960.306275308, -18734944.58792185, -18737426.313365128, -18735347.165987637, -18738813.444170244, -18737086.848890636, -18731576.2474336, -18717405.44095871, -18703089.25545657, -18691014.546456724, -18692460.568905357, -18702119.628629155, -18727710.621126678, -18761582.72034647, -18806745.835547544, -18850674.8692112, -18884431.510951452, -18919999.992506847, -18939303.799078144, -18952946.273760635, -18980289.22996379, -19011610.17803294, -19040948.61805145, -19061021.429847397, -19112055.53768819, -19149667.414264943, -19201127.05091321, -19270250.82564605, -19334606.883057203, -19390513.336589377, -19444176.259208687, -19502755.000038862, -19544333.014549147, -19612668.183176614, -19681902.19006569, -19771969.951249883, -19873329.723376893, -19996752.59235844, -20110031.131400537, -20231658.612529557, -20319378.894054495, -20378534.45718066, -20413332.089584175, -20438147.844177883, -20443710.248040095, -20465457.02238927, -20488610.969337028, -20516295.16424432, -20541423.795738827, -20553192.874953747, -20573605.50701977, -20577871.61936797, -20571807.008916274, 
-20556242.38912231, -20542199.30819195, -20521239.063551214, -20519150.80004532, -20527204.80248933, -20536933.769257784, -20543470.522332076, -20549700.089992985, -20551525.24958494, -20554873.406493705, -20564277.65794227, -20572211.740052115, -20574305.69550465, -20575494.450104576, -20567092.577932164, -20549302.929608088, -20545445.11878376, -20546625.326603737, -20549190.03499401, -20554824.947828256, -20568341.378989458, -20577582.331383612, -20577980.519402675, -20566603.03458152, -20560131.592262644, -20552166.469060015, -20549063.06763577, -20544490.562339947, -20539817.82346569, -20528747.715731595, -20518026.24576161, -20510977.844974525, -20506874.36087992, -20506731.11977665, -20510482.133420516, -20507760.92101862, -20494644.834457114, -20480107.89304893, -20461312.091867123, -20442941.75080173, -20426123.02834838, -20424607.675283, -20426810.369107097, -20434024.50097819, -20437404.75544205, -20447688.63916367, -20460893.335563846, -20482922.735127095, -20503610.119434915, -20527062.76448319, -20557830.035128627, -20593274.72068722, -20632528.452965066, -20673637.471334763, -20733106.97143075, -20842921.0447562, -21054357.83621519, -21416569.534189366, -21978460.272811692, -22753170.052172784, -23671344.10563395, -24613499.293358143, -25406477.12230188, -25884377.82156489, -26049040.62791664, -26996879.104431007};
-std::vector<float> variance_{213747175.10846674, 188395815.34302503, 212706429.10966414, 199109025.81461075, 189235901.23864496, 194901336.53253657, 217481594.29306737, 238689869.12327808, 243977501.24115244, 248479623.6431067, 259766741.47116545, 275516766.7790273, 291271202.3691234, 302693239.8220509, 308627358.3997694, 311143911.38788426, 315446105.07731867, 321705430.9341829, 327458907.4659941, 332245072.43223983, 336251717.5935284, 339694069.7639722, 342188204.4322228, 345587110.31313115, 349903086.2875232, 353660214.20643026, 356700344.5270885, 357665362.3529641, 358493352.05658793, 358857951.620328, 358375239.52774596, 358899733.6342954, 361051818.3511561, 364361716.05025816, 368750322.3771452, 372047800.6462831, 375655861.1349018, 379358519.1980013, 383327605.3935181, 387458599.282341, 390434692.3406868, 392994486.35057056, 394874418.04603153, 396230525.79763395, 396365592.0414835, 396334819.8242737, 396488353.19250053, 396438877.00744957, 396197980.4459586, 395590921.6672991, 395001107.62072515, 394528291.7318225, 394593110.424006, 395018405.59353715, 396110577.5415993, 397506704.0371068, 399400197.4657644, 401243568.2468382, 402687134.7805103, 404136047.2872507, 404883170.001883, 405522253.219517, 406660365.3626476, 407919346.0991902, 409045348.5384909, 409759588.7889818, 411974821.8564483, 413489718.78201455, 415535392.56684107, 418466481.97674364, 421104678.35678065, 423405392.5200779, 425550570.40798235, 427929423.9579701, 429585274.253478, 432368493.55181056, 435193587.13513297, 438886855.20476013, 443058876.8633751, 448181232.5093362, 452883835.6332396, 458056721.77926534, 461816531.22735566, 464363620.1970998, 465886343.5057493, 466928872.0651, 467180536.42647296, 468111848.70714295, 469138695.3071312, 470378429.6930793, 471517958.7132626, 472109050.4262365, 473087417.0177867, 473381322.04648733, 473220195.85483915, 472666071.8998819, 472124669.87879956, 471298571.411737, 471251033.2902761, 471672676.43128747, 472177147.2193172, 472572361.7711908, 
472968783.7751127, 473156295.4164052, 473398034.82676554, 473897703.5203811, 474328271.33112127, 474452670.98002136, 474549003.99284613, 474252887.13567275, 473557462.909069, 473483385.85193115, 473609738.04855174, 473746944.82085115, 474016729.91696435, 474617321.94138587, 475045097.237122, 475125402.586558, 474664112.9824912, 474426247.5800283, 474104075.42796475, 473978219.7273978, 473773171.7798875, 473578534.69508696, 473102924.16904145, 472651240.5232615, 472374383.1810912, 472209479.6956096, 472202298.8921673, 472370090.76781124, 472220933.99374026, 471625467.37106377, 470994646.51883453, 470182428.9637543, 469348211.5939578, 468570387.4467277, 468540442.7225135, 468672018.90414184, 468994346.9533251, 469138757.58201426, 469553915.95710236, 470134523.38582784, 471082421.62055486, 471962316.51804745, 472939745.1708408, 474250621.5944825, 475773933.43199486, 477465399.71087736, 479218782.61382693, 481752299.7930922, 486608947.8984568, 496119403.2067917, 512730085.5704984, 539048915.2641417, 576285298.3548826, 621610270.2240586, 669308196.4436442, 710656993.5957186, 736344437.3725077, 745481288.0241544, 801121432.9925804};
-int count_ = 912592;
-
-void WriteMatrix() {
-  kaldi::Matrix<double> cmvn_stats(2, mean_.size()+ 1); 
-  for (size_t idx = 0; idx < mean_.size(); ++idx) {
-    cmvn_stats(0, idx) = mean_[idx];
-    cmvn_stats(1, idx) = variance_[idx];
-  }
-  cmvn_stats(0, mean_.size()) = count_;
-  kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
-}
-
-int main(int argc, char* argv[]) {
-  gflags::ParseCommandLineFlags(&argc, &argv, false);
-  google::InitGoogleLogging(argv[0]);
-  
-  kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(FLAGS_wav_rspecifier);
-  kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
-  WriteMatrix();
-
-  // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning window -->linear_spectrogram --> cmvn
-  int32 num_done = 0, num_err = 0;
-  std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new ppspeech::RawDataSource());
-
-  ppspeech::LinearSpectrogramOptions opt;
-  opt.frame_opts.frame_length_ms = 20;
-  opt.frame_opts.frame_shift_ms = 10;
-  ppspeech::DecibelNormalizerOptions db_norm_opt;
-  std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-    new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
-
-  std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
-    new ppspeech::LinearSpectrogram(opt, std::move(base_feature_extractor)));
-
-  ppspeech::CMVN cmvn(FLAGS_cmvn_write_path, std::move(linear_spectrogram));
-
-  float streaming_chunk = 0.36;
-  int sample_rate = 16000;
-  int chunk_sample_size = streaming_chunk * sample_rate;
-
-  for (; !wav_reader.Done(); wav_reader.Next()) {
-    std::string utt = wav_reader.Key();
-    const kaldi::WaveData &wave_data = wav_reader.Value();
-
-    int32 this_channel = 0;
-    kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(), this_channel);
-    int tot_samples = waveform.Dim(); 
-    int sample_offset = 0;
-    std::vector<kaldi::Vector<BaseFloat>> feats;
-    int feature_rows = 0;
-    while (sample_offset < tot_samples) {
-      int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset);
-      kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
-      for (int i = 0; i < cur_chunk_size; ++i) {
-        wav_chunk(i) = waveform(sample_offset + i); 
-      }
-      kaldi::Vector<BaseFloat> features;
-      cmvn.AcceptWaveform(wav_chunk);
-      cmvn.Read(&features);
-
-      std::cout << wav_chunk(0) << std::endl;
-      std::cout << features(0) << std::endl;
-
-      feats.push_back(features);
-      sample_offset += cur_chunk_size;
-      feature_rows += features.Dim() / cmvn.Dim();
-    }
-
-    int cur_idx = 0;
-    kaldi::Matrix<kaldi::BaseFloat> features(feature_rows, cmvn.Dim()); 
-    for (auto feat : feats) {
-      int num_rows = feat.Dim() / cmvn.Dim();
-      for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-          for (int col_idx = 0; col_idx < cmvn.Dim(); ++col_idx) {
-              features(cur_idx, col_idx) = feat(row_idx*cmvn.Dim() + col_idx);
-          }
-          ++cur_idx;
-      }
-    }
-    feat_writer.Write(utt, features);
-
-    if (num_done % 50 == 0 && num_done != 0)
-    KALDI_VLOG(2) << "Processed " << num_done << " utterances";
-    num_done++;
-  }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h
index ac01a977..7502bc5e 100644
--- a/speechx/speechx/base/common.h
+++ b/speechx/speechx/base/common.h
@@ -14,24 +14,25 @@
 
 #pragma once
 
+#include <condition_variable>
 #include <deque>
+#include <fstream>
 #include <iostream>
 #include <istream>
-#include <fstream>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <ostream>
+#include <queue>
 #include <set>
 #include <sstream>
 #include <stack>
 #include <string>
-#include <vector>
 #include <unordered_map>
 #include <unordered_set>
-#include <mutex>
-#include <condition_variable>
+#include <vector>
 
-#include "base/log.h"
-#include "base/flags.h"
 #include "base/basic_types.h"
+#include "base/flags.h"
+#include "base/log.h"
 #include "base/macros.h"
diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt
index e43bd182..44ca52cd 100644
--- a/speechx/speechx/frontend/CMakeLists.txt
+++ b/speechx/speechx/frontend/CMakeLists.txt
@@ -4,6 +4,7 @@ add_library(frontend STATIC
   normalizer.cc
   linear_spectrogram.cc
   raw_audio.cc
+  feature_cache.cc
 )
 
-target_link_libraries(frontend PUBLIC kaldi-matrix)
\ No newline at end of file
+target_link_libraries(frontend PUBLIC kaldi-matrix)
diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc
index 07f2cbf7..df366a06 100644
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@@ -1,38 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "frontend/feature_cache.h"
 
-void FeatureCache::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::vector;
+using kaldi::SubVector;
+using std::unique_ptr;
+
+// Wraps base_extractor (ownership transferred); caches <= max_size chunks.
+FeatureCache::FeatureCache(
+    int max_size, unique_ptr<FeatureExtractorInterface> base_extractor)
+    : finished_(false), max_size_(max_size),
+      base_extractor_(std::move(base_extractor)) {}
+
+void FeatureCache::AcceptWaveform(
+    const kaldi::VectorBase<kaldi::BaseFloat>& input) {
     base_extractor_->AcceptWaveform(input);
     // feed current data
-    while (base_extractor_->IsLastFrame()) {
-      Compute();
-    }
+    bool result = false;
+    do {
+        result = Compute();
+    } while (result);
 }
 
-// pop feature chunk 
-void FeatureCache::Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    while (cache_.empty()) {
+// pop feature chunk
+bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+    kaldi::Timer timer;
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (cache_.empty() && base_extractor_->IsFinished() == false) {
         ready_read_condition_.wait(lock);
+        BaseFloat elapsed = timer.Elapsed() * 1000;
+        // todo replace 1.0 with timeout_
+        if (elapsed > 1.0) {
+            return false;
+        }
+        usleep(1000);  // sleep 1 ms
     }
+    if (cache_.empty()) return false;
+    feat->Resize(cache_.front().Dim());
     feat->CopyFromVec(cache_.front());
     cache_.pop();
     ready_feed_condition_.notify_one();
+    return true;
 }
 
 // read all data from base_feature_extractor_ into cache_
-void FeatureCache::Compute() {
+bool FeatureCache::Compute() {
     // compute and feed
-    Vector<BaseFloat> feature_chunk(base_extractor_->Dim());
-    base_extractor_->Read(&feature_chunk);
-    std::lock_guard<std::mutex> lock(mutex_);
+    Vector<BaseFloat> feature_chunk;
+    bool result = base_extractor_->Read(&feature_chunk);
+    std::unique_lock<std::mutex> lock(mutex_);
     while (cache_.size() >= max_size_) {
         ready_feed_condition_.wait(lock);
     }
-    cache_.push(feature_chunk);
+    if (feature_chunk.Dim() != 0) {
+        cache_.push(feature_chunk);
+    }
     ready_read_condition_.notify_one();
+    return result;
 }
 
-// compute the last chunk data && set feed finished 
-void FeatureCache::InputFinishd() {
-    Compute();
+void Reset() {
+    // std::lock_guard<std::mutex> lock(mutex_);
+    return;
 }
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index 71dc455c..c7d66251 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -1,21 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 
-class FeatureCache {
+namespace ppspeech {
+
+class FeatureCache : public FeatureExtractorInterface {
   public:
-    explicit FeatureCache(FeatureExtractorInterface base_extractor); 
-    void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
-    void Dim() { return base_extractor_->Dim(); }
-    void SetFinished();
-    bool IsFinished();
+    explicit FeatureCache(  // takes ownership of base_extractor
+        int32 max_size = kint16max,
+        std::unique_ptr<FeatureExtractorInterface> base_extractor = nullptr);
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual size_t Dim() const { return base_extractor_->Dim(); }
+    virtual void SetFinished() {
+        base_extractor_->SetFinished();
+        Compute();
+    }
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
+    bool Compute();
+
     bool finished_;
-    mutable std::mutex mutex_;
-    size_t max_size;
+    std::mutex mutex_;
+    size_t max_size_;
     std::queue<kaldi::Vector<BaseFloat>> cache_;
-    std::shared_ptr<FeatureExtractorInterface> base_extractor_;
+    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
     std::condition_variable ready_feed_condition_;
     std::condition_variable ready_read_condition_;
-    DISALLOW_COPY_AND_ASSGIN(FeatureCache);
-};
\ No newline at end of file
+    //    DISALLOW_COPY_AND_ASSIGN(FeatureCache);
+};
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
index fc06f24a..e490bc75 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -21,9 +21,23 @@ namespace ppspeech {
 
+// Interface shared by the chained feature extraction stages. A stage
+// typically wraps another stage (its base extractor) and forwards calls
+// to it.
 class FeatureExtractorInterface {
   public:
+    // Stages are owned and destroyed through
+    // std::unique_ptr<FeatureExtractorInterface>, so the destructor has to
+    // be virtual; deleting a derived stage via the base is otherwise UB.
+    virtual ~FeatureExtractorInterface() {}
-    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    // Feeds a chunk of input samples into the stage.
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
+    // Writes the next output into feat; returns false if none is available.
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
     virtual size_t Dim() const = 0;
+    // Signals that no more input will be fed.
+    virtual void SetFinished() = 0;
+    virtual bool IsFinished() const = 0;
+    // virtual void Reset();
 };
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
index ed4c2977..73cffea5 100644
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@@ -25,146 +25,145 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-//todo remove later
+// todo remove later
 void CopyVector2StdVector_(const VectorBase<BaseFloat>& input,
-                          vector<BaseFloat>* output) {
-  if (input.Dim() == 0) return;
-  output->resize(input.Dim());
-  for (size_t idx = 0; idx < input.Dim(); ++idx) {
-    (*output)[idx] = input(idx);
-  }
+                           vector<BaseFloat>* output) {
+    if (input.Dim() == 0) return;
+    output->resize(input.Dim());
+    for (size_t idx = 0; idx < input.Dim(); ++idx) {
+        (*output)[idx] = input(idx);
+    }
 }
 
 void CopyStdVector2Vector_(const vector<BaseFloat>& input,
-                          Vector<BaseFloat>* output) {
-  if (input.empty()) return;
-  output->Resize(input.size());
-  for (size_t idx = 0; idx < input.size(); ++idx) {
-    (*output)(idx) = input[idx];
-  }
+                           Vector<BaseFloat>* output) {
+    if (input.empty()) return;
+    output->Resize(input.size());
+    for (size_t idx = 0; idx < input.size(); ++idx) {
+        (*output)(idx) = input[idx];
+    }
 }
 
 LinearSpectrogram::LinearSpectrogram(
     const LinearSpectrogramOptions& opts,
     std::unique_ptr<FeatureExtractorInterface> base_extractor) {
-  opts_ = opts;
-  base_extractor_ = std::move(base_extractor);
-  int32 window_size = opts.frame_opts.WindowSize();
-  int32 window_shift = opts.frame_opts.WindowShift();
-  fft_points_ = window_size;
-  chunk_sample_size_ = 
-      static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
-  hanning_window_.resize(window_size);
-
-  double a = M_2PI / (window_size - 1);
-  hanning_window_energy_ = 0;
-  for (int i = 0; i < window_size; ++i) {
-    hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
-    hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
-  }
-
-  dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
+    opts_ = opts;
+    base_extractor_ = std::move(base_extractor);
+    int32 window_size = opts.frame_opts.WindowSize();
+    int32 window_shift = opts.frame_opts.WindowShift();
+    fft_points_ = window_size;
+    chunk_sample_size_ =
+        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
+    hanning_window_.resize(window_size);
+
+    double a = M_2PI / (window_size - 1);
+    hanning_window_energy_ = 0;
+    for (int i = 0; i < window_size; ++i) {
+        hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
+        hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
+    }
+
+    dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }
 
 void LinearSpectrogram::AcceptWaveform(const VectorBase<BaseFloat>& input) {
     base_extractor_->AcceptWaveform(input);
 }
 
-void LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
-  Vector<BaseFloat> input_feats(chunk_sample_size_);
-  base_extractor_->Read(&input_feats);
-  vector<BaseFloat> input_feats_vec(input_feats.Dim());
-  CopyVector2StdVector_(input_feats, &input_feats_vec);
-  //for (int idx = 0; idx < input_feats.Dim(); ++idx) {
-  //  input_feats_vec[idx] = input_feats(idx); 
-  //}
-  vector<vector<BaseFloat>> result;
-  Compute(input_feats_vec, result);
-  int32 feat_size = 0;
-  if (result.size() != 0) {
-    feat_size = result.size() * result[0].size();
-  }
-  feat->Resize(feat_size);
-  for (size_t idx = 0; idx < feat_size; ++idx) {
-    (*feat)(idx) = result[idx / dim_][idx % dim_];
-  }
-  return;
+bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
+    Vector<BaseFloat> input_feats(chunk_sample_size_);
+    bool flag = base_extractor_->Read(&input_feats);
+    if (flag == false || input_feats.Dim() == 0) return false;
+
+    vector<BaseFloat> input_feats_vec(input_feats.Dim());
+    CopyVector2StdVector_(input_feats, &input_feats_vec);
+    vector<vector<BaseFloat>> result;
+    Compute(input_feats_vec, result);
+    int32 feat_size = 0;
+    if (result.size() != 0) {
+        feat_size = result.size() * result[0].size();
+    }
+    feat->Resize(feat_size);
+    for (size_t idx = 0; idx < feat_size; ++idx) {
+        (*feat)(idx) = result[idx / dim_][idx % dim_];
+    }
+    return true;
 }
 
 void LinearSpectrogram::Hanning(vector<float>* data) const {
-  CHECK_GE(data->size(), hanning_window_.size());
+    CHECK_GE(data->size(), hanning_window_.size());
 
-  for (size_t i = 0; i < hanning_window_.size(); ++i) {
-      data->at(i) *= hanning_window_[i];
-  }
+    for (size_t i = 0; i < hanning_window_.size(); ++i) {
+        data->at(i) *= hanning_window_[i];
+    }
 }
 
 bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
                                  vector<BaseFloat>* real,
                                  vector<BaseFloat>* img) const {
-  Vector<BaseFloat> v_tmp;
-  CopyStdVector2Vector_(*v, &v_tmp);
-  RealFft(&v_tmp, true);
-  CopyVector2StdVector_(v_tmp, v);
-  real->push_back(v->at(0));
-  img->push_back(0);
-  for (int i = 1; i < v->size() / 2; i++) {
-    real->push_back(v->at(2 * i));
-    img->push_back(v->at(2 * i + 1));
-  }
-  real->push_back(v->at(1));
-  img->push_back(0);
-
-  return true;
+    Vector<BaseFloat> v_tmp;
+    CopyStdVector2Vector_(*v, &v_tmp);
+    RealFft(&v_tmp, true);
+    CopyVector2StdVector_(v_tmp, v);
+    real->push_back(v->at(0));
+    img->push_back(0);
+    for (int i = 1; i < v->size() / 2; i++) {
+        real->push_back(v->at(2 * i));
+        img->push_back(v->at(2 * i + 1));
+    }
+    real->push_back(v->at(1));
+    img->push_back(0);
+
+    return true;
 }
 
 // Compute spectrogram feat, only for test, remove later
 // todo: refactor later (SmileGoat)
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                 vector<vector<float>>& feat) {
-  int num_samples = wave.size();
-  const int& frame_length = opts_.frame_opts.WindowSize();
-  const int& sample_rate = opts_.frame_opts.samp_freq;
-  const int& frame_shift = opts_.frame_opts.WindowShift();
-  const int& fft_points = fft_points_;
-  const float scale = hanning_window_energy_ * sample_rate;
-
-  if (num_samples < frame_length) {
-          return true;
-  }
-
-  int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
-  feat.resize(num_frames);
-  vector<float> fft_real((fft_points_ / 2 + 1), 0);
-  vector<float> fft_img((fft_points_ / 2 + 1), 0);
-  vector<float> v(frame_length, 0);
-  vector<float> power((fft_points / 2 + 1));
-
-  for (int i = 0; i < num_frames; ++i) {
-    vector<float> data(wave.data() + i * frame_shift,
-                       wave.data() + i * frame_shift + frame_length);
-    Hanning(&data);
-    fft_img.clear();
-    fft_real.clear();
-    v.assign(data.begin(), data.end());
-    NumpyFft(&v, &fft_real, &fft_img);
-
-    feat[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
-    for (int j = 0; j < (fft_points / 2 + 1); ++j) {
-      power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
-      feat[i][j] = power[j];
-
-      if (j == 0 || j == feat[0].size() - 1) {
-          feat[i][j] /= scale;
-      } else {
-          feat[i][j] *= (2.0 / scale);
-      }
-
-      // log added eps=1e-14
-      feat[i][j] = std::log(feat[i][j] + 1e-14);
+    int num_samples = wave.size();
+    const int& frame_length = opts_.frame_opts.WindowSize();
+    const int& sample_rate = opts_.frame_opts.samp_freq;
+    const int& frame_shift = opts_.frame_opts.WindowShift();
+    const int& fft_points = fft_points_;
+    const float scale = hanning_window_energy_ * sample_rate;
+
+    if (num_samples < frame_length) {
+        return true;
+    }
+
+    int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
+    feat.resize(num_frames);
+    vector<float> fft_real((fft_points_ / 2 + 1), 0);
+    vector<float> fft_img((fft_points_ / 2 + 1), 0);
+    vector<float> v(frame_length, 0);
+    vector<float> power((fft_points / 2 + 1));
+
+    for (int i = 0; i < num_frames; ++i) {
+        vector<float> data(wave.data() + i * frame_shift,
+                           wave.data() + i * frame_shift + frame_length);
+        Hanning(&data);
+        fft_img.clear();
+        fft_real.clear();
+        v.assign(data.begin(), data.end());
+        NumpyFft(&v, &fft_real, &fft_img);
+
+        feat[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
+        for (int j = 0; j < (fft_points / 2 + 1); ++j) {
+            power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
+            feat[i][j] = power[j];
+
+            if (j == 0 || j == feat[0].size() - 1) {
+                feat[i][j] /= scale;
+            } else {
+                feat[i][j] *= (2.0 / scale);
+            }
+
+            // log added eps=1e-14
+            feat[i][j] = std::log(feat[i][j] + 1e-14);
+        }
     }
-  }
-  return true;
+    return true;
 }
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h
index e4dc3e33..c18438eb 100644
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -1,35 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 
 #pragma once
 
+#include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 #include "kaldi/feat/feature-window.h"
-#include "base/common.h"
 
 namespace ppspeech {
 
 struct LinearSpectrogramOptions {
     kaldi::FrameExtractionOptions frame_opts;
     kaldi::BaseFloat streaming_chunk;
-    LinearSpectrogramOptions():
-        streaming_chunk(0.36),
-        frame_opts() {}
+    LinearSpectrogramOptions() : streaming_chunk(0.36), frame_opts() {}
 
     void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk", &streaming_chunk, "streaming chunk size");
+        opts->Register(
+            "streaming-chunk", &streaming_chunk, "streaming chunk size");
         frame_opts.Register(opts);
     }
 };
 
 class LinearSpectrogram : public FeatureExtractorInterface {
   public:
-    explicit LinearSpectrogram(const LinearSpectrogramOptions& opts,
-                               std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    explicit LinearSpectrogram(
+        const LinearSpectrogramOptions& opts,
+        std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
     virtual size_t Dim() const { return dim_; }
-    void ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
-  private: 
+  private:
     void Hanning(std::vector<kaldi::BaseFloat>* data) const;
     bool Compute(const std::vector<kaldi::BaseFloat>& wave,
                  std::vector<std::vector<kaldi::BaseFloat>>& feat);
@@ -44,7 +60,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
     std::vector<kaldi::BaseFloat> hanning_window_;
     kaldi::BaseFloat hanning_window_energy_;
     LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
+    kaldi::Vector<kaldi::BaseFloat> waveform_;  // remove later, todo(SmileGoat)
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
     int chunk_sample_size_;
     DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc
index 69c9ab59..8aaf33de 100644
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@@ -1,3 +1,17 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 
 #include "frontend/normalizer.h"
 #include "kaldi/feat/cmvn.h"
@@ -12,90 +26,96 @@ using std::vector;
 using kaldi::SubVector;
 using std::unique_ptr;
 
-DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts,
+DecibelNormalizer::DecibelNormalizer(
+    const DecibelNormalizerOptions& opts,
     std::unique_ptr<FeatureExtractorInterface> base_extractor) {
-  base_extractor_ = std::move(base_extractor);
-  opts_ = opts;
-  dim_ = 0;
+    base_extractor_ = std::move(base_extractor);
+    opts_ = opts;
+    dim_ = 0;
 }
-                                    
-void DecibelNormalizer::AcceptWaveform(const kaldi::VectorBase<BaseFloat>& input) {
-  //dim_ = input.Dim();
-  //waveform_.Resize(input.Dim());
-  //waveform_.CopyFromVec(input);
-  base_extractor_->AcceptWaveform(input);
+
+void DecibelNormalizer::AcceptWaveform(
+    const kaldi::VectorBase<BaseFloat>& input) {
+    // dim_ = input.Dim();
+    // waveform_.Resize(input.Dim());
+    // waveform_.CopyFromVec(input);
+    base_extractor_->AcceptWaveform(input);
 }
 
-void DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) {
-  // if (waveform_.Dim() == 0) return;
-  base_extractor_->Read(feat);
-  Compute(feat);
+bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) {
+    // if (waveform_.Dim() == 0) return;
+    if (base_extractor_->Read(feat) == false || feat->Dim() == 0) {
+        return false;
+    }
+    Compute(feat);
+    return true;
 }
 
-//todo remove later
+// todo remove later
 void CopyVector2StdVector(const kaldi::VectorBase<BaseFloat>& input,
                           vector<BaseFloat>* output) {
-  if (input.Dim() == 0) return;
-  output->resize(input.Dim());
-  for (size_t idx = 0; idx < input.Dim(); ++idx) {
-    (*output)[idx] = input(idx);
-  }
+    if (input.Dim() == 0) return;
+    output->resize(input.Dim());
+    for (size_t idx = 0; idx < input.Dim(); ++idx) {
+        (*output)[idx] = input(idx);
+    }
 }
 
 void CopyStdVector2Vector(const vector<BaseFloat>& input,
                           VectorBase<BaseFloat>* output) {
-  if (input.empty()) return;
-  assert(input.size() == output->Dim());
-  for (size_t idx = 0; idx < input.size(); ++idx) {
-    (*output)(idx) = input[idx];
-  }
+    if (input.empty()) return;
+    assert(input.size() == output->Dim());
+    for (size_t idx = 0; idx < input.size(); ++idx) {
+        (*output)(idx) = input[idx];
+    }
 }
 
 bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
-  // calculate db rms
-  BaseFloat rms_db = 0.0;
-  BaseFloat mean_square = 0.0;
-  BaseFloat gain = 0.0;
-  BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
-
-  vector<BaseFloat> samples;
-  samples.resize(feat->Dim());
-  for (size_t i = 0; i < samples.size(); ++i) {
-    samples[i] = (*feat)(i);
-  }
-  
-  // square
-  for (auto &d : samples) {
-    if (opts_.convert_int_float) {
-    d = d * wave_float_normlization;
+    // calculate db rms
+    BaseFloat rms_db = 0.0;
+    BaseFloat mean_square = 0.0;
+    BaseFloat gain = 0.0;
+    BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
+
+    vector<BaseFloat> samples;
+    samples.resize(feat->Dim());
+    for (size_t i = 0; i < samples.size(); ++i) {
+        samples[i] = (*feat)(i);
+    }
+
+    // square
+    for (auto& d : samples) {
+        if (opts_.convert_int_float) {
+            d = d * wave_float_normlization;
+        }
+        mean_square += d * d;
     }
-    mean_square += d * d;
-  }
-
-  // mean
-  mean_square /= samples.size();
-  rms_db = 10 * std::log10(mean_square);
-  gain = opts_.target_db - rms_db;
-
-  if (gain > opts_.max_gain_db) {
-    LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB,"
-                << "because the the probable gain have exceeds opts_.max_gain_db" 
-                <<  opts_.max_gain_db << "dB.";
-    return false;
-  }
-
-  // Note that this is an in-place transformation.
-  for (auto &item : samples) {
-    // python item *= 10.0 ** (gain / 20.0)
-    item *= std::pow(10.0, gain / 20.0);
-  }
-  
-  CopyStdVector2Vector(samples, feat);
-  return true;
+
+    // mean
+    mean_square /= samples.size();
+    rms_db = 10 * std::log10(mean_square);
+    gain = opts_.target_db - rms_db;
+
+    if (gain > opts_.max_gain_db) {
+        LOG(ERROR)
+            << "Unable to normalize segment to " << opts_.target_db << "dB,"
+            << "because the the probable gain have exceeds opts_.max_gain_db"
+            << opts_.max_gain_db << "dB.";
+        return false;
+    }
+
+    // Note that this is an in-place transformation.
+    for (auto& item : samples) {
+        // python item *= 10.0 ** (gain / 20.0)
+        item *= std::pow(10.0, gain / 20.0);
+    }
+
+    CopyStdVector2Vector(samples, feat);
+    return true;
 }
 
-CMVN::CMVN(std::string cmvn_file, 
-           unique_ptr<FeatureExtractorInterface> base_extractor) 
+CMVN::CMVN(std::string cmvn_file,
+           unique_ptr<FeatureExtractorInterface> base_extractor)
     : var_norm_(true) {
     base_extractor_ = std::move(base_extractor);
     bool binary;
@@ -109,77 +129,83 @@ void CMVN::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
     return;
 }
 
-void CMVN::Read(kaldi::Vector<BaseFloat>* feat) {
-    base_extractor_->Read(feat);
+bool CMVN::Read(kaldi::Vector<BaseFloat>* feat) {
+    if (base_extractor_->Read(feat) == false) {
+        return false;
+    }
     Compute(feat);
-    return;
+    return true;
 }
 
 // feats contain num_frames feature.
 void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
-  KALDI_ASSERT(feats != NULL); 
-  int32 dim = stats_.NumCols() - 1;
-  if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || feats->Dim() % dim != 0) {
-    KALDI_ERR << "Dim mismatch: cmvn "
-              << stats_.NumRows() << 'x' << stats_.NumCols()
-              << ", feats " << feats->Dim() << 'x';
-  }
-  if (stats_.NumRows() == 1 && var_norm_) {
-    KALDI_ERR << "You requested variance normalization but no variance stats_ "
-              << "are supplied.";
-  }
-
-  double count = stats_(0, dim);
-  // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
-  // computing an offset and representing it as stats_, we use a count of one.
-  if (count < 1.0)
-    KALDI_ERR << "Insufficient stats_ for cepstral mean and variance normalization: "
-              << "count = " << count;
-
-  if (!var_norm_) {
-    Vector<BaseFloat> offset(feats->Dim());
-    SubVector<double> mean_stats(stats_.RowData(0), dim);
-    Vector<double> mean_stats_apply(feats->Dim());
-    //fill the datat of mean_stats in mean_stats_appy whose dim is equal with the dim of feature.
-    //the dim of feats = dim * num_frames;
-    for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
-      SubVector<double> stats_tmp(mean_stats_apply.Data() + dim*idx, dim);
-      stats_tmp.CopyFromVec(mean_stats);
+    KALDI_ASSERT(feats != NULL);
+    int32 dim = stats_.NumCols() - 1;
+    if (stats_.NumRows() > 2 || stats_.NumRows() < 1 ||
+        feats->Dim() % dim != 0) {
+        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x'
+                  << stats_.NumCols() << ", feats " << feats->Dim() << 'x';
     }
-    offset.AddVec(-1.0 / count, mean_stats_apply);
-    feats->AddVec(1.0, offset);
-    return;
-  }
-  // norm(0, d) = mean offset;
-  // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
-  kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
-  for (int32 d = 0; d < dim; d++) {
-    double mean, offset, scale;
-    mean = stats_(0, d)/count;
-    double var = (stats_(1, d)/count) - mean*mean,
-        floor = 1.0e-20;
-    if (var < floor) {
-      KALDI_WARN << "Flooring cepstral variance from " << var << " to "
-                 << floor;
-      var = floor;
+    if (stats_.NumRows() == 1 && var_norm_) {
+        KALDI_ERR
+            << "You requested variance normalization but no variance stats_ "
+            << "are supplied.";
+    }
+
+    double count = stats_(0, dim);
+    // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
+    // computing an offset and representing it as stats_, we use a count of one.
+    if (count < 1.0)
+        KALDI_ERR << "Insufficient stats_ for cepstral mean and variance "
+                     "normalization: "
+                  << "count = " << count;
+
+    if (!var_norm_) {
+        Vector<BaseFloat> offset(feats->Dim());
+        SubVector<double> mean_stats(stats_.RowData(0), dim);
+        Vector<double> mean_stats_apply(feats->Dim());
+        // tile mean_stats into mean_stats_apply, whose dim equals the dim of
+        // feats (one copy of mean_stats per frame).
+        // the dim of feats = dim * num_frames;
+        for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
+            SubVector<double> stats_tmp(mean_stats_apply.Data() + dim * idx,
+                                        dim);
+            stats_tmp.CopyFromVec(mean_stats);
+        }
+        offset.AddVec(-1.0 / count, mean_stats_apply);
+        feats->AddVec(1.0, offset);
+        return;
     }
-    scale = 1.0 / sqrt(var);
-    if (scale != scale || 1/scale == 0.0)
-      KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
-    offset = -(mean*scale);
-    for (int32 d_skip = d; d_skip < feats->Dim();) {
-      norm(0, d_skip) = offset;
-      norm(1, d_skip) = scale;
-      d_skip = d_skip + dim;
+    // norm(0, d) = mean offset;
+    // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
+    kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
+    for (int32 d = 0; d < dim; d++) {
+        double mean, offset, scale;
+        mean = stats_(0, d) / count;
+        double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20;
+        if (var < floor) {
+            KALDI_WARN << "Flooring cepstral variance from " << var << " to "
+                       << floor;
+            var = floor;
+        }
+        scale = 1.0 / sqrt(var);
+        if (scale != scale || 1 / scale == 0.0)
+            KALDI_ERR
+                << "NaN or infinity in cepstral mean/variance computation";
+        offset = -(mean * scale);
+        for (int32 d_skip = d; d_skip < feats->Dim();) {
+            norm(0, d_skip) = offset;
+            norm(1, d_skip) = scale;
+            d_skip = d_skip + dim;
+        }
     }
-  }
-  // Apply the normalization.
-  feats->MulElements(norm.Row(1));
-  feats->AddVec(1.0, norm.Row(0));
+    // Apply the normalization.
+    feats->MulElements(norm.Row(1));
+    feats->AddVec(1.0, norm.Row(0));
 }
 
 void CMVN::ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats) {
-  ApplyCmvn(stats_, var_norm_, feats);
+    ApplyCmvn(stats_, var_norm_, feats);
 }
 
-} // namespace ppspeech
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h
index 13c5b8df..189e0e2b 100644
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@@ -1,26 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 
 #pragma once
 
 #include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
-#include "kaldi/util/options-itf.h"
 #include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/options-itf.h"
 
 namespace ppspeech {
 
 struct DecibelNormalizerOptions {
-  float target_db;
-  float max_gain_db;
-  bool convert_int_float;
-  DecibelNormalizerOptions() :
-    target_db(-20),
-    max_gain_db(300.0),
-    convert_int_float(false){}
+    float target_db;
+    float max_gain_db;
+    bool convert_int_float;
+    DecibelNormalizerOptions()
+        : target_db(-20), max_gain_db(300.0), convert_int_float(false) {}
 
     void Register(kaldi::OptionsItf* opts) {
-      opts->Register("target-db", &target_db, "target db for db normalization");
-      opts->Register("max-gain-db", &max_gain_db, "max gain db for db normalization");
-      opts->Register("convert-int-float", &convert_int_float, "if convert int samples to float");
+        opts->Register(
+            "target-db", &target_db, "target db for db normalization");
+        opts->Register(
+            "max-gain-db", &max_gain_db, "max gain db for db normalization");
+        opts->Register("convert-int-float",
+                       &convert_int_float,
+                       "if convert int samples to float");
     }
 };
 
@@ -29,9 +45,12 @@ class DecibelNormalizer : public FeatureExtractorInterface {
     explicit DecibelNormalizer(
         const DecibelNormalizerOptions& opts,
         std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
     virtual size_t Dim() const { return dim_; }
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
     bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
@@ -44,12 +63,14 @@ class DecibelNormalizer : public FeatureExtractorInterface {
 
 class CMVN : public FeatureExtractorInterface {
   public:
-    explicit CMVN(
-        std::string cmvn_file,
-        std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    explicit CMVN(std::string cmvn_file,
+                  std::unique_ptr<FeatureExtractorInterface> base_extractor);
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
     virtual size_t Dim() const { return dim_; }
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
     void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc
index f4584828..0c7f4d21 100644
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@@ -1,60 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "frontend/raw_audio.h"
 #include "kaldi/base/timer.h"
 
 namespace ppspeech {
 
-RawAudioSource::RawAudioSource(int buffer_size = 65536) 
-    : finished_(false),
-      data_length_(0),
-      start_(0),
-      timeout_(5) {
-  ring_buffer_.resize(buffer_size);
-} 
-
-// todo length > buffer size, condition_var
-bool RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  for (size_t idx = 0; idx < data.Dim(); ++idx) {
-      ring_buffer_[idx % ring_buffer_.size()] = data(idx);
-  }
-  data_length_ += length;
+using kaldi::BaseFloat;
+using kaldi::VectorBase;
+using kaldi::Vector;
+
+RawAudioSource::RawAudioSource(int buffer_size)
+    : finished_(false), data_length_(0), start_(0), timeout_(1) {
+    ring_buffer_.resize(buffer_size);
 }
 
-// todo length > buffer size
-//bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
-  //std::lock_guard<std::mutex> lock(mutex_);
-  //for (size_t idx = 0; idx < length; ++idx) {
-      //ring_buffer_[idx % ring_buffer_.size()] = data[idx];
-  //}
-  //data_length_ += length;
-  //finish_condition_.notify_one();
+void RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (data_length_ + data.Dim() > ring_buffer_.size()) {
+        ready_feed_condition_.wait(lock);
+    }
+    for (size_t idx = 0; idx < data.Dim(); ++idx) {
+        ring_buffer_[idx % ring_buffer_.size()] = data(idx);
+    }
+    data_length_ += data.Dim();
+    ready_read_condition_.notify_one();
+}
+
+// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
+// std::unique_lock<std::mutex> lock(mutex_);
+// for (size_t idx = 0; idx < length; ++idx) {
+// ring_buffer_[idx % ring_buffer_.size()] = data[idx];
+//}
+// data_length_ += length;
+// finish_condition_.notify_one();
 //}
 
-bool RawAudioSource::Read(Vector<BaseFloat>* feats) {
-  size_t chunk_size = feats->Dim();
-  Timer timer;
-  if (chunk_size > data_length_) {
-    while (true) {
-      int32 elapsed = static_cat<int32>(timer.Elapsed() * 1000);
-      if (finished_ || > timeout_) {
+bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
+    size_t chunk_size = feat->Dim();
+    kaldi::Timer timer;
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (chunk_size > data_length_) {
+        // when audio is empty and no more data feed
+        // ready_read_condition will block in dead lock.
+        // ready_read_condition_.wait(lock);
+        int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
+        if (elapsed > timeout_) {
+            if (finished_ == true) {  // read last chunk data
+                break;
+            }
+            if (chunk_size > data_length_) {
+                return false;
+            }
+        }
+        usleep(100);  // sleep 0.1 ms
+    }
+
+    // read last chunk data
+    if (chunk_size > data_length_) {
         chunk_size = data_length_;
-        feats->Resize(chunk_size);
-        break;
-      }
-      sleep(1); 
+        feat->Resize(chunk_size);
     }
-  }
-  std::lock_guard<std::mutex> lock(mutex_);
-  for (size_t idx = 0; idx < chunk_size; ++idx) {
-    feats->Data()[idx] = ring_buffer_[idx];
-  }
-  data_length_ -= chunk_size;
-  start_ = (start_ + chunk_size) % ring_buffer_.size();
-  finish_condition_.notify_one();
-}
 
-//size_t RawAudioSource::GetDataLength() {
-//  return data_length_;
-//}
+    for (size_t idx = 0; idx < chunk_size; ++idx) {
+        feat->Data()[idx] = ring_buffer_[idx];
+    }
+    data_length_ -= chunk_size;
+    start_ = (start_ + chunk_size) % ring_buffer_.size();
+    ready_feed_condition_.notify_one();
+    return true;
+}
 
-} // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index 24a4b2e8..1893da25 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -1,34 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 
 #pragma once
 
-#include "frontend/feature_extractor_interface.h"
 #include "base/common.h"
+#include "frontend/feature_extractor_interface.h"
 
 #pragma once
 
 namespace ppspeech {
 
-class RawAudioSource {
+class RawAudioSource : public FeatureExtractorInterface {
   public:
-    RawAudioSource(int buffer_size = kint16max);
-    virtual void AcceptWaveform(kaldi::BaseFloat* data, int length);
-    void AcceptWaveformByByte(char* data, lnt length) {}
-    void AcceptWaveformByShort(kaldi::int16* data, int length) {}
-
-    // read chunk data in buffer
-    bool Read(VectorBase<BaseFloat>* feats);
-    void SetFinished() { finished_ = true; }
-    bool IsFinished() { return finished_; }
+    explicit RawAudioSource(int buffer_size = kint16max);
+    virtual void AcceptWaveform(const kaldi::VectorBase<BaseFloat>& data);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual size_t Dim() const { return data_length_; }
+    virtual void SetFinished() {
+        std::lock_guard<std::mutex> lock(mutex_);
+        finished_ = true;
+    }
+    virtual bool IsFinished() const { return finished_; }
 
   private:
-    vector<kaldi::BaseFloat> ring_buffer_;
+    std::vector<kaldi::BaseFloat> ring_buffer_;
     size_t start_;
     size_t data_length_;
     bool finished_;
-    mutable std::mutex mutext_;
+    mutable std::mutex mutex_;
     std::condition_variable ready_read_condition_;
     std::condition_variable ready_feed_condition_;
     kaldi::int32 timeout_;
+
+    DISALLOW_COPY_AND_ASSIGN(RawAudioSource);
+};
+
+// it is a datasource for testing different frontend module.
+class RawDataSource : public FeatureExtractorInterface {
+  public:
+    explicit RawDataSource() { finished_ = false; }
+    virtual void AcceptWaveform(
+        const kaldi::VectorBase<kaldi::BaseFloat>& input) {
+        data_ = input;
+    }
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+        if (data_.Dim() == 0) {
+            return false;
+        }
+        (*feat) = data_;
+        data_.Resize(0);
+        return true;
+    }
+    virtual size_t Dim() const { return data_.Dim(); }
+    virtual void SetFinished() { finished_ = true; }
+    virtual bool IsFinished() const { return finished_; }
+
+  private:
+    kaldi::Vector<kaldi::BaseFloat> data_;
+    bool finished_;
+
+    DISALLOW_COPY_AND_ASSIGN(RawDataSource);
 };
 
-} // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
\ No newline at end of file

From 7c1b432830b4a2680875b628097b4ce27f08e440 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Thu, 10 Mar 2022 09:54:32 +0800
Subject: [PATCH 5/7] format

---
 speechx/speechx/frontend/feature_cache.h | 4 +++-
 speechx/speechx/frontend/raw_audio.cc    | 5 ++---
 speechx/speechx/frontend/raw_audio.h     | 5 +----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index c7d66251..5849cc5c 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#pragma once
+
 #include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 
@@ -45,4 +47,4 @@ class FeatureCache : public FeatureExtractorInterface {
     //    DISALLOW_COPY_AND_ASSGIN(FeatureCache);
 };
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc
index 0c7f4d21..1e265a57 100644
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@@ -35,7 +35,6 @@ void RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
         ring_buffer_[idx % ring_buffer_.size()] = data(idx);
     }
     data_length_ += data.Dim();
-    ready_read_condition_.notify_one();
 }
 
 // bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
@@ -53,7 +52,7 @@ bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
     std::unique_lock<std::mutex> lock(mutex_);
     while (chunk_size > data_length_) {
         // when audio is empty and no more data feed
-        // ready_read_condition will block in dead lock.
+        // ready_read_condition will block in dead lock. so replace with timeout_
         // ready_read_condition_.wait(lock);
         int32 elapsed = static_cast<int32>(timer.Elapsed() * 1000);
         if (elapsed > timeout_) {
@@ -82,4 +81,4 @@ bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
     return true;
 }
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index 1893da25..c3ebe559 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -18,8 +18,6 @@
 #include "base/common.h"
 #include "frontend/feature_extractor_interface.h"
 
-#pragma once
-
 namespace ppspeech {
 
 class RawAudioSource : public FeatureExtractorInterface {
@@ -40,7 +38,6 @@ class RawAudioSource : public FeatureExtractorInterface {
     size_t data_length_;
     bool finished_;
     mutable std::mutex mutex_;
-    std::condition_variable ready_read_condition_;
     std::condition_variable ready_feed_condition_;
     kaldi::int32 timeout_;
 
@@ -74,4 +71,4 @@ class RawDataSource : public FeatureExtractorInterface {
     DISALLOW_COPY_AND_ASSIGN(RawDataSource);
 };
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech

From 22fe1c9dbeaee33c46931ad11748d6346c472854 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Thu, 10 Mar 2022 15:01:45 +0800
Subject: [PATCH 6/7] rename interface & add comment to Dim()

---
 .../examples/feat/linear_spectrogram_main.cc  | 15 ++++----
 speechx/speechx/frontend/feature_cache.cc     | 12 +++---
 speechx/speechx/frontend/feature_cache.h      | 11 ++++--
 .../frontend/feature_extractor_interface.h    | 13 +++++--
 .../speechx/frontend/linear_spectrogram.cc    | 13 ++++---
 speechx/speechx/frontend/linear_spectrogram.h | 10 ++---
 speechx/speechx/frontend/normalizer.cc        | 37 +++++++++----------
 speechx/speechx/frontend/normalizer.h         | 21 +++++++----
 speechx/speechx/frontend/raw_audio.cc         | 31 ++++++----------
 speechx/speechx/frontend/raw_audio.h          | 22 ++++++-----
 10 files changed, 94 insertions(+), 91 deletions(-)

diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index c2ca6187..f137a52c 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -25,10 +25,9 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
 
-DEFINE_string(wav_rspecifier, "", "test wav path");
-DEFINE_string(feature_wspecifier, "", "test wav ark");
-DEFINE_string(feature_check_wspecifier, "", "test wav ark");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "test wav ark");
+DEFINE_string(wav_rspecifier, "", "test wav scp path");
+DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
 
 
 std::vector<float> mean_{
@@ -165,10 +164,10 @@ int main(int argc, char* argv[]) {
     // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
     // window -->linear_spectrogram --> cmvn
     int32 num_done = 0, num_err = 0;
-    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
-    // ppspeech::RawDataSource());
+    //std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
+     //ppspeech::RawDataCache());
     std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
-        new ppspeech::RawAudioSource());
+        new ppspeech::RawAudioCache());
 
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
@@ -211,7 +210,7 @@ int main(int argc, char* argv[]) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
             kaldi::Vector<BaseFloat> features;
-            feature_cache.AcceptWaveform(wav_chunk);
+            feature_cache.Accept(wav_chunk);
             if (cur_chunk_size < chunk_sample_size) {
                 feature_cache.SetFinished();
             }
diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc
index df366a06..27982f64 100644
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@@ -29,9 +29,9 @@ FeatureCache::FeatureCache(
     base_extractor_ = std::move(base_extractor);
 }
 
-void FeatureCache::AcceptWaveform(
-    const kaldi::VectorBase<kaldi::BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void FeatureCache::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
     // feed current data
     bool result = false;
     do {
@@ -40,7 +40,7 @@ void FeatureCache::AcceptWaveform(
 }
 
 // pop feature chunk
-bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* output_feats) {
     kaldi::Timer timer;
     std::unique_lock<std::mutex> lock(mutex_);
     while (cache_.empty() && base_extractor_->IsFinished() == false) {
@@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
         usleep(1000);  // sleep 1 ms
     }
     if (cache_.empty()) return false;
-    feat->Resize(cache_.front().Dim());
-    feat->CopyFromVec(cache_.front());
+    output_feats->Resize(cache_.front().Dim());
+    output_feats->CopyFromVec(cache_.front());
     cache_.pop();
     ready_feed_condition_.notify_one();
     return true;
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index 5849cc5c..9442fe1f 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -24,12 +24,15 @@ class FeatureCache : public FeatureExtractorInterface {
     explicit FeatureCache(
         int32 max_size = kint16max,
         std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    // output_feats dim = num_frames * feature_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    // the feature cache only caches features produced by the base extractor
     virtual size_t Dim() const { return base_extractor_->Dim(); }
     virtual void SetFinished() {
         base_extractor_->SetFinished();
+        // read the last chunk data
         Compute();
     }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
@@ -44,7 +47,7 @@ class FeatureCache : public FeatureExtractorInterface {
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
     std::condition_variable ready_feed_condition_;
     std::condition_variable ready_read_condition_;
-    //    DISALLOW_COPY_AND_ASSGIN(FeatureCache);
+    //DISALLOW_COPY_AND_ASSGIN(FeatureCache);
 };
 
 }  // namespace ppspeech
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
index e490bc75..70fa93ae 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -21,13 +21,18 @@ namespace ppspeech {
 
 class FeatureExtractorInterface {
   public:
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    // accept input data
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
+    // get the processed result
+    // the length of output = feature_row * feature_dim,
+    // the Matrix is squashed into Vector
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs) = 0;
+    // the Dim is the feature dim
     virtual size_t Dim() const = 0;
     virtual void SetFinished() = 0;
     virtual bool IsFinished() const = 0;
     // virtual void Reset();
 };
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
index 73cffea5..c0ae553f 100644
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@@ -66,11 +66,11 @@ LinearSpectrogram::LinearSpectrogram(
     dim_ = fft_points_ / 2 + 1;  // the dimension is Fs/2 Hz
 }
 
-void LinearSpectrogram::AcceptWaveform(const VectorBase<BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
 }
 
-bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
+bool LinearSpectrogram::Read(Vector<BaseFloat>* output_feats) {
     Vector<BaseFloat> input_feats(chunk_sample_size_);
     bool flag = base_extractor_->Read(&input_feats);
     if (flag == false || input_feats.Dim() == 0) return false;
@@ -83,9 +83,10 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* feat) {
     if (result.size() != 0) {
         feat_size = result.size() * result[0].size();
     }
-    feat->Resize(feat_size);
+    output_feats->Resize(feat_size);
+    // todo refactor (SimleGoat)
     for (size_t idx = 0; idx < feat_size; ++idx) {
-        (*feat)(idx) = result[idx / dim_][idx % dim_];
+        (*output_feats)(idx) = result[idx / dim_][idx % dim_];
     }
     return true;
 }
@@ -117,7 +118,7 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
     return true;
 }
 
-// Compute spectrogram feat, only for test, remove later
+// Compute spectrogram feat
 // todo: refactor later (SmileGoat)
 bool LinearSpectrogram::Compute(const vector<float>& wave,
                                 vector<vector<float>>& feat) {
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h
index c18438eb..5c73f207 100644
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -38,9 +38,10 @@ class LinearSpectrogram : public FeatureExtractorInterface {
     explicit LinearSpectrogram(
         const LinearSpectrogramOptions& opts,
         std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    // the dim_ is the dim of single frame feature
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
@@ -49,8 +50,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
     void Hanning(std::vector<kaldi::BaseFloat>* data) const;
     bool Compute(const std::vector<kaldi::BaseFloat>& wave,
                  std::vector<std::vector<kaldi::BaseFloat>>& feat);
-    void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
-                 kaldi::VectorBase<kaldi::BaseFloat>* feature);
     bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
                   std::vector<kaldi::BaseFloat>* real,
                   std::vector<kaldi::BaseFloat>* img) const;
@@ -60,7 +59,6 @@ class LinearSpectrogram : public FeatureExtractorInterface {
     std::vector<kaldi::BaseFloat> hanning_window_;
     kaldi::BaseFloat hanning_window_energy_;
     LinearSpectrogramOptions opts_;
-    kaldi::Vector<kaldi::BaseFloat> waveform_;  // remove later, todo(SmileGoat)
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
     int chunk_sample_size_;
     DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc
index 8aaf33de..3af44c38 100644
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@@ -31,23 +31,20 @@ DecibelNormalizer::DecibelNormalizer(
     std::unique_ptr<FeatureExtractorInterface> base_extractor) {
     base_extractor_ = std::move(base_extractor);
     opts_ = opts;
-    dim_ = 0;
+    dim_ = 1;
 }
 
-void DecibelNormalizer::AcceptWaveform(
-    const kaldi::VectorBase<BaseFloat>& input) {
-    // dim_ = input.Dim();
-    // waveform_.Resize(input.Dim());
-    // waveform_.CopyFromVec(input);
-    base_extractor_->AcceptWaveform(input);
+void DecibelNormalizer::Accept(
+    const kaldi::VectorBase<BaseFloat>& inputs_wave) {
+    base_extractor_->Accept(inputs_wave);
 }
 
-bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* feat) {
-    // if (waveform_.Dim() == 0) return;
-    if (base_extractor_->Read(feat) == false || feat->Dim() == 0) {
+bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* outputs_wave) {
+    if (base_extractor_->Read(outputs_wave) == false || 
+        outputs_wave->Dim() == 0) {
         return false;
     }
-    Compute(feat);
+    Compute(outputs_wave);
     return true;
 }
 
@@ -70,7 +67,7 @@ void CopyStdVector2Vector(const vector<BaseFloat>& input,
     }
 }
 
-bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
+bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
     // calculate db rms
     BaseFloat rms_db = 0.0;
     BaseFloat mean_square = 0.0;
@@ -78,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
     BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
 
     vector<BaseFloat> samples;
-    samples.resize(feat->Dim());
+    samples.resize(feats->Dim());
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = (*feat)(i);
+        samples[i] = (*feats)(i);
     }
 
     // square
@@ -110,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feat) const {
         item *= std::pow(10.0, gain / 20.0);
     }
 
-    CopyStdVector2Vector(samples, feat);
+    CopyStdVector2Vector(samples, feats);
     return true;
 }
 
@@ -124,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file,
     dim_ = stats_.NumCols() - 1;
 }
 
-void CMVN::AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) {
-    base_extractor_->AcceptWaveform(input);
+void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& feats) {
+    base_extractor_->Accept(feats);
     return;
 }
 
-bool CMVN::Read(kaldi::Vector<BaseFloat>* feat) {
-    if (base_extractor_->Read(feat) == false) {
+bool CMVN::Read(kaldi::Vector<BaseFloat>* outputs) {
+    if (base_extractor_->Read(outputs) == false) {
         return false;
     }
-    Compute(feat);
+    Compute(outputs);
     return true;
 }
 
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h
index 189e0e2b..ab333624 100644
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@@ -45,15 +45,16 @@ class DecibelNormalizer : public FeatureExtractorInterface {
     explicit DecibelNormalizer(
         const DecibelNormalizerOptions& opts,
         std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs_wave);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs_wave);
+    // noramlize audio, the dim is 1.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
-    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
+    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
     DecibelNormalizerOptions opts_;
     size_t dim_;
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
@@ -65,15 +66,19 @@ class CMVN : public FeatureExtractorInterface {
   public:
     explicit CMVN(std::string cmvn_file,
                   std::unique_ptr<FeatureExtractorInterface> base_extractor);
-    virtual void AcceptWaveform(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
+    virtual void Accept(
+        const kaldi::VectorBase<kaldi::BaseFloat>& feats);
+
+    // the length of outputs = feature_row * feature_dim,
+    // the Matrix is squashed into Vector
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs);
+    // the dim_ is the feautre dim.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
-    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feat) const;
+    void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
     void ApplyCMVN(kaldi::MatrixBase<BaseFloat>* feats);
     kaldi::Matrix<double> stats_;
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc
index 1e265a57..7cfeb9e4 100644
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@@ -21,33 +21,25 @@ using kaldi::BaseFloat;
 using kaldi::VectorBase;
 using kaldi::Vector;
 
-RawAudioSource::RawAudioSource(int buffer_size)
+RawAudioCache::RawAudioCache(int buffer_size)
     : finished_(false), data_length_(0), start_(0), timeout_(1) {
     ring_buffer_.resize(buffer_size);
 }
 
-void RawAudioSource::AcceptWaveform(const VectorBase<BaseFloat>& data) {
+void RawAudioCache::Accept(const VectorBase<BaseFloat>& input_audio) {
     std::unique_lock<std::mutex> lock(mutex_);
-    while (data_length_ + data.Dim() > ring_buffer_.size()) {
+    while (data_length_ + input_audio.Dim() > ring_buffer_.size()) {
         ready_feed_condition_.wait(lock);
     }
-    for (size_t idx = 0; idx < data.Dim(); ++idx) {
-        ring_buffer_[idx % ring_buffer_.size()] = data(idx);
+    for (size_t idx = 0; idx < input_audio.Dim(); ++idx) {
+        int32 buffer_idx = (idx + start_) % ring_buffer_.size(); 
+        ring_buffer_[buffer_idx] = input_audio(idx);
     }
-    data_length_ += data.Dim();
+    data_length_ += input_audio.Dim();
 }
 
-// bool RawAudioSource::AcceptWaveform(BaseFloat* data, int length) {
-// std::unique_lock<std::mutex> lock(mutex_);
-// for (size_t idx = 0; idx < length; ++idx) {
-// ring_buffer_[idx % ring_buffer_.size()] = data[idx];
-//}
-// data_length_ += length;
-// finish_condition_.notify_one();
-//}
-
-bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
-    size_t chunk_size = feat->Dim();
+bool RawAudioCache::Read(Vector<BaseFloat>* output_audio) {
+    size_t chunk_size = output_audio->Dim();
     kaldi::Timer timer;
     std::unique_lock<std::mutex> lock(mutex_);
     while (chunk_size > data_length_) {
@@ -69,11 +61,12 @@ bool RawAudioSource::Read(Vector<BaseFloat>* feat) {
     // read last chunk data
     if (chunk_size > data_length_) {
         chunk_size = data_length_;
-        feat->Resize(chunk_size);
+        output_audio->Resize(chunk_size);
     }
 
     for (size_t idx = 0; idx < chunk_size; ++idx) {
-        feat->Data()[idx] = ring_buffer_[idx];
+        int buff_idx = (start_ + idx) % ring_buffer_.size();
+        output_audio->Data()[idx] = ring_buffer_[buff_idx];
     }
     data_length_ -= chunk_size;
     start_ = (start_ + chunk_size) % ring_buffer_.size();
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index c3ebe559..c3f5a0e1 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -20,12 +20,13 @@
 
 namespace ppspeech {
 
-class RawAudioSource : public FeatureExtractorInterface {
+class RawAudioCache : public FeatureExtractorInterface {
   public:
-    explicit RawAudioSource(int buffer_size = kint16max);
-    virtual void AcceptWaveform(const kaldi::VectorBase<BaseFloat>& data);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat);
-    virtual size_t Dim() const { return data_length_; }
+    explicit RawAudioCache(int buffer_size = kint16max);
+    virtual void Accept(const kaldi::VectorBase<BaseFloat>& input_audio);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_audio);
+    // the audio dim is 1
+    virtual size_t Dim() const { return 1; }
     virtual void SetFinished() {
         std::lock_guard<std::mutex> lock(mutex_);
         finished_ = true;
@@ -41,14 +42,14 @@ class RawAudioSource : public FeatureExtractorInterface {
     std::condition_variable ready_feed_condition_;
     kaldi::int32 timeout_;
 
-    DISALLOW_COPY_AND_ASSIGN(RawAudioSource);
+    DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
 };
 
 // it is a datasource for testing different frontend module.
-class RawDataSource : public FeatureExtractorInterface {
+class RawDataCache: public FeatureExtractorInterface {
   public:
-    explicit RawDataSource() { finished_ = false; }
-    virtual void AcceptWaveform(
+    explicit RawDataCache() { finished_ = false; }
+    virtual void Accept(
         const kaldi::VectorBase<kaldi::BaseFloat>& input) {
         data_ = input;
     }
@@ -60,6 +61,7 @@ class RawDataSource : public FeatureExtractorInterface {
         data_.Resize(0);
         return true;
     }
+    // the dim is the length of data_
     virtual size_t Dim() const { return data_.Dim(); }
     virtual void SetFinished() { finished_ = true; }
     virtual bool IsFinished() const { return finished_; }
@@ -68,7 +70,7 @@ class RawDataSource : public FeatureExtractorInterface {
     kaldi::Vector<kaldi::BaseFloat> data_;
     bool finished_;
 
-    DISALLOW_COPY_AND_ASSIGN(RawDataSource);
+    DISALLOW_COPY_AND_ASSIGN(RawDataCache);
 };
 
 }  // namespace ppspeech

From 027feae9f233014b842f1fa2d6921e77f38f69d4 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Thu, 10 Mar 2022 16:28:11 +0800
Subject: [PATCH 7/7] rename arg of Accept & Read

---
 speechx/speechx/frontend/feature_cache.cc     |  6 ++--
 speechx/speechx/frontend/feature_cache.h      |  4 +--
 .../frontend/feature_extractor_interface.h    |  3 +-
 .../speechx/frontend/linear_spectrogram.cc    | 30 +++++++++----------
 speechx/speechx/frontend/linear_spectrogram.h |  6 ++--
 speechx/speechx/frontend/normalizer.cc        | 30 +++++++++----------
 speechx/speechx/frontend/normalizer.h         | 12 ++++----
 speechx/speechx/frontend/raw_audio.cc         | 18 +++++------
 speechx/speechx/frontend/raw_audio.h          | 15 +++++-----
 9 files changed, 63 insertions(+), 61 deletions(-)

diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/feature_cache.cc
index 27982f64..b353df16 100644
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@@ -40,7 +40,7 @@ void FeatureCache::Accept(
 }
 
 // pop feature chunk
-bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* output_feats) {
+bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
     kaldi::Timer timer;
     std::unique_lock<std::mutex> lock(mutex_);
     while (cache_.empty() && base_extractor_->IsFinished() == false) {
@@ -53,8 +53,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* output_feats) {
         usleep(1000);  // sleep 1 ms
     }
     if (cache_.empty()) return false;
-    output_feats->Resize(cache_.front().Dim());
-    output_feats->CopyFromVec(cache_.front());
+    feats->Resize(cache_.front().Dim());
+    feats->CopyFromVec(cache_.front());
     cache_.pop();
     ready_feed_condition_.notify_one();
     return true;
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index 9442fe1f..03b11f57 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -26,8 +26,8 @@ class FeatureCache : public FeatureExtractorInterface {
         std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
     virtual void Accept(
         const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    // output_feats dim = num_frames * feature_dim
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    // feats dim = num_frames * feature_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
     // feature cache only cache feature which from base extractor
     virtual size_t Dim() const { return base_extractor_->Dim(); }
     virtual void SetFinished() {
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
index 70fa93ae..64cc67f3 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -21,7 +21,8 @@ namespace ppspeech {
 
 class FeatureExtractorInterface {
   public:
-    // accept input data
+    // accept input data; whether it is features or raw waves is decided
+    // by the base_extractor
     virtual void Accept(
         const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
     // get the processed result
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
index c0ae553f..7491716c 100644
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@@ -70,7 +70,7 @@ void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
     base_extractor_->Accept(inputs);
 }
 
-bool LinearSpectrogram::Read(Vector<BaseFloat>* output_feats) {
+bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
     Vector<BaseFloat> input_feats(chunk_sample_size_);
     bool flag = base_extractor_->Read(&input_feats);
     if (flag == false || input_feats.Dim() == 0) return false;
@@ -83,10 +83,10 @@ bool LinearSpectrogram::Read(Vector<BaseFloat>* output_feats) {
     if (result.size() != 0) {
         feat_size = result.size() * result[0].size();
     }
-    output_feats->Resize(feat_size);
+    feats->Resize(feat_size);
     // todo refactor (SimleGoat)
     for (size_t idx = 0; idx < feat_size; ++idx) {
-        (*output_feats)(idx) = result[idx / dim_][idx % dim_];
+        (*feats)(idx) = result[idx / dim_][idx % dim_];
     }
     return true;
 }
@@ -120,9 +120,9 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
 
 // Compute spectrogram feat
 // todo: refactor later (SmileGoat)
-bool LinearSpectrogram::Compute(const vector<float>& wave,
-                                vector<vector<float>>& feat) {
-    int num_samples = wave.size();
+bool LinearSpectrogram::Compute(const vector<float>& waves,
+                                vector<vector<float>>& feats) {
+    int num_samples = waves.size();
     const int& frame_length = opts_.frame_opts.WindowSize();
     const int& sample_rate = opts_.frame_opts.samp_freq;
     const int& frame_shift = opts_.frame_opts.WindowShift();
@@ -134,34 +134,34 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
     }
 
     int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
-    feat.resize(num_frames);
+    feats.resize(num_frames);
     vector<float> fft_real((fft_points_ / 2 + 1), 0);
     vector<float> fft_img((fft_points_ / 2 + 1), 0);
     vector<float> v(frame_length, 0);
     vector<float> power((fft_points / 2 + 1));
 
     for (int i = 0; i < num_frames; ++i) {
-        vector<float> data(wave.data() + i * frame_shift,
-                           wave.data() + i * frame_shift + frame_length);
+        vector<float> data(waves.data() + i * frame_shift,
+                           waves.data() + i * frame_shift + frame_length);
         Hanning(&data);
         fft_img.clear();
         fft_real.clear();
         v.assign(data.begin(), data.end());
         NumpyFft(&v, &fft_real, &fft_img);
 
-        feat[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
+        feats[i].resize(fft_points / 2 + 1);  // the last dimension is Fs/2 Hz
         for (int j = 0; j < (fft_points / 2 + 1); ++j) {
             power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
-            feat[i][j] = power[j];
+            feats[i][j] = power[j];
 
-            if (j == 0 || j == feat[0].size() - 1) {
-                feat[i][j] /= scale;
+            if (j == 0 || j == feats[0].size() - 1) {
+                feats[i][j] /= scale;
             } else {
-                feat[i][j] *= (2.0 / scale);
+                feats[i][j] *= (2.0 / scale);
             }
 
             // log added eps=1e-14
-            feat[i][j] = std::log(feat[i][j] + 1e-14);
+            feats[i][j] = std::log(feats[i][j] + 1e-14);
         }
     }
     return true;
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h
index 5c73f207..790263d9 100644
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -40,7 +40,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
         std::unique_ptr<FeatureExtractorInterface> base_extractor);
     virtual void Accept(
         const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_feats);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
     // the dim_ is the dim of single frame feature
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
@@ -48,8 +48,8 @@ class LinearSpectrogram : public FeatureExtractorInterface {
 
   private:
     void Hanning(std::vector<kaldi::BaseFloat>* data) const;
-    bool Compute(const std::vector<kaldi::BaseFloat>& wave,
-                 std::vector<std::vector<kaldi::BaseFloat>>& feat);
+    bool Compute(const std::vector<kaldi::BaseFloat>& waves,
+                 std::vector<std::vector<kaldi::BaseFloat>>& feats);
     bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
                   std::vector<kaldi::BaseFloat>* real,
                   std::vector<kaldi::BaseFloat>* img) const;
diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc
index 3af44c38..fbb2b645 100644
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/normalizer.cc
@@ -35,16 +35,16 @@ DecibelNormalizer::DecibelNormalizer(
 }
 
 void DecibelNormalizer::Accept(
-    const kaldi::VectorBase<BaseFloat>& inputs_wave) {
-    base_extractor_->Accept(inputs_wave);
+    const kaldi::VectorBase<BaseFloat>& waves) {
+    base_extractor_->Accept(waves);
 }
 
-bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* outputs_wave) {
-    if (base_extractor_->Read(outputs_wave) == false || 
-        outputs_wave->Dim() == 0) {
+bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
+    if (base_extractor_->Read(waves) == false || 
+        waves->Dim() == 0) {
         return false;
     }
-    Compute(outputs_wave);
+    Compute(waves);
     return true;
 }
 
@@ -67,7 +67,7 @@ void CopyStdVector2Vector(const vector<BaseFloat>& input,
     }
 }
 
-bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
+bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
     // calculate db rms
     BaseFloat rms_db = 0.0;
     BaseFloat mean_square = 0.0;
@@ -75,9 +75,9 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
     BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
 
     vector<BaseFloat> samples;
-    samples.resize(feats->Dim());
+    samples.resize(waves->Dim());
     for (size_t i = 0; i < samples.size(); ++i) {
-        samples[i] = (*feats)(i);
+        samples[i] = (*waves)(i);
     }
 
     // square
@@ -107,7 +107,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* feats) const {
         item *= std::pow(10.0, gain / 20.0);
     }
 
-    CopyStdVector2Vector(samples, feats);
+    CopyStdVector2Vector(samples, waves);
     return true;
 }
 
@@ -121,16 +121,16 @@ CMVN::CMVN(std::string cmvn_file,
     dim_ = stats_.NumCols() - 1;
 }
 
-void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& feats) {
-    base_extractor_->Accept(feats);
+void CMVN::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
     return;
 }
 
-bool CMVN::Read(kaldi::Vector<BaseFloat>* outputs) {
-    if (base_extractor_->Read(outputs) == false) {
+bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
+    if (base_extractor_->Read(feats) == false) {
         return false;
     }
-    Compute(outputs);
+    Compute(feats);
     return true;
 }
 
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/normalizer.h
index ab333624..b9daa853 100644
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/normalizer.h
@@ -46,15 +46,15 @@ class DecibelNormalizer : public FeatureExtractorInterface {
         const DecibelNormalizerOptions& opts,
         std::unique_ptr<FeatureExtractorInterface> base_extractor);
     virtual void Accept(
-        const kaldi::VectorBase<kaldi::BaseFloat>& inputs_wave);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs_wave);
+        const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
     // noramlize audio, the dim is 1.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
 
   private:
-    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
+    bool Compute(kaldi::VectorBase<kaldi::BaseFloat>* waves) const;
     DecibelNormalizerOptions opts_;
     size_t dim_;
     std::unique_ptr<FeatureExtractorInterface> base_extractor_;
@@ -67,11 +67,11 @@ class CMVN : public FeatureExtractorInterface {
     explicit CMVN(std::string cmvn_file,
                   std::unique_ptr<FeatureExtractorInterface> base_extractor);
     virtual void Accept(
-        const kaldi::VectorBase<kaldi::BaseFloat>& feats);
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
 
-    // the length of outputs = feature_row * feature_dim,
+    // the length of feats = feature_row * feature_dim,
     // the Matrix is squashed into Vector
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
     // the dim_ is the feautre dim.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/raw_audio.cc
index 7cfeb9e4..0f3d83ec 100644
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/raw_audio.cc
@@ -26,20 +26,20 @@ RawAudioCache::RawAudioCache(int buffer_size)
     ring_buffer_.resize(buffer_size);
 }
 
-void RawAudioCache::Accept(const VectorBase<BaseFloat>& input_audio) {
+void RawAudioCache::Accept(const VectorBase<BaseFloat>& waves) {
     std::unique_lock<std::mutex> lock(mutex_);
-    while (data_length_ + input_audio.Dim() > ring_buffer_.size()) {
+    while (data_length_ + waves.Dim() > ring_buffer_.size()) {
         ready_feed_condition_.wait(lock);
     }
-    for (size_t idx = 0; idx < input_audio.Dim(); ++idx) {
+    for (size_t idx = 0; idx < waves.Dim(); ++idx) {
         int32 buffer_idx = (idx + start_) % ring_buffer_.size(); 
-        ring_buffer_[buffer_idx] = input_audio(idx);
+        ring_buffer_[buffer_idx] = waves(idx);
     }
-    data_length_ += input_audio.Dim();
+    data_length_ += waves.Dim();
 }
 
-bool RawAudioCache::Read(Vector<BaseFloat>* output_audio) {
-    size_t chunk_size = output_audio->Dim();
+bool RawAudioCache::Read(Vector<BaseFloat>* waves) {
+    size_t chunk_size = waves->Dim();
     kaldi::Timer timer;
     std::unique_lock<std::mutex> lock(mutex_);
     while (chunk_size > data_length_) {
@@ -61,12 +61,12 @@ bool RawAudioCache::Read(Vector<BaseFloat>* output_audio) {
     // read last chunk data
     if (chunk_size > data_length_) {
         chunk_size = data_length_;
-        output_audio->Resize(chunk_size);
+        waves->Resize(chunk_size);
     }
 
     for (size_t idx = 0; idx < chunk_size; ++idx) {
         int buff_idx = (start_ + idx) % ring_buffer_.size();
-        output_audio->Data()[idx] = ring_buffer_[buff_idx];
+        waves->Data()[idx] = ring_buffer_[buff_idx];
     }
     data_length_ -= chunk_size;
     start_ = (start_ + chunk_size) % ring_buffer_.size();
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index c3f5a0e1..996b6e78 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -23,8 +23,8 @@ namespace ppspeech {
 class RawAudioCache : public FeatureExtractorInterface {
   public:
     explicit RawAudioCache(int buffer_size = kint16max);
-    virtual void Accept(const kaldi::VectorBase<BaseFloat>& input_audio);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* output_audio);
+    virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
     // the audio dim is 1
     virtual size_t Dim() const { return 1; }
     virtual void SetFinished() {
@@ -45,19 +45,20 @@ class RawAudioCache : public FeatureExtractorInterface {
     DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
 };
 
-// it is a datasource for testing different frontend module.
+// it is a data source for testing different frontend modules.
+// it accepts waves or feats.
 class RawDataCache: public FeatureExtractorInterface {
   public:
     explicit RawDataCache() { finished_ = false; }
     virtual void Accept(
-        const kaldi::VectorBase<kaldi::BaseFloat>& input) {
-        data_ = input;
+        const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+        data_ = inputs;
     }
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
         if (data_.Dim() == 0) {
             return false;
         }
-        (*feat) = data_;
+        (*feats) = data_;
         data_.Resize(0);
         return true;
     }