From 156ccfe4e3f0f768b96ef296a373d81afeeb2f97 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 1 Jun 2022 22:07:04 +0800
Subject: [PATCH 1/5] refactor frontend

---
 speechx/speechx/decoder/param.h               |   5 +-
 speechx/speechx/frontend/audio/audio_cache.h  |   4 +-
 .../frontend/audio/compute_fbank_main.cc      |  11 +-
 .../audio/compute_linear_spectrogram_main.cc  |   1 -
 speechx/speechx/frontend/audio/fbank.cc       | 103 ++++--------------
 speechx/speechx/frontend/audio/fbank.h        |  57 +++-------
 .../speechx/frontend/audio/feature_common.h   |  54 +++++++++
 .../frontend/audio/feature_common_inl.h       |  95 ++++++++++++++++
 .../speechx/frontend/audio/feature_pipeline.h |   2 +-
 .../frontend/audio/linear_spectrogram.cc      |  91 ++++------------
 .../frontend/audio/linear_spectrogram.h       |  57 ++++------
 11 files changed, 238 insertions(+), 242 deletions(-)
 create mode 100644 speechx/speechx/frontend/audio/feature_common.h
 create mode 100644 speechx/speechx/frontend/audio/feature_common_inl.h

diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index 495e5236..c8396a58 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -62,7 +62,6 @@ namespace ppspeech {
 FeaturePipelineOptions InitFeaturePipelineOptions() {
     FeaturePipelineOptions opts;
     opts.cmvn_file = FLAGS_cmvn_file;
-    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
     kaldi::FrameExtractionOptions frame_opts;
     frame_opts.dither = 0.0;
     frame_opts.frame_shift_ms = 10;
@@ -71,8 +70,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         opts.to_float32 = false;
         frame_opts.window_type = "povey";
         frame_opts.frame_length_ms = 25;
-        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+        opts.fbank_opts.frame_opts = frame_opts;
     } else {
         opts.to_float32 = true;
         frame_opts.remove_dc_offset = false;
diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
index 4ebcd947..45498e46 100644
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -30,8 +30,8 @@ class AudioCache : public FrontendInterface {
 
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
 
-    // the audio dim is 1, one sample
-    virtual size_t Dim() const { return 1; }
+    // a single audio sample has dim 1, but Dim() reports size_ instead.
+    virtual size_t Dim() const { return size_; }
 
     virtual void SetFinished() {
         std::lock_guard<std::mutex> lock(mutex_);
diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
index 18024719..f7a42315 100644
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
     std::unique_ptr<ppspeech::FrontendInterface> data_source(
         new ppspeech::AudioCache(3600 * 1600, false));
 
-    ppspeech::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.frame_opts.dither = 0.0;
+    kaldi::FbankOptions opt;
+    opt.frame_opts.frame_length_ms = 25;
+    opt.frame_opts.frame_shift_ms = 10;
+    opt.mel_opts.num_bins = FLAGS_num_bins;
+    opt.frame_opts.dither = 0.0;
 
     std::unique_ptr<ppspeech::FrontendInterface> fbank(
         new ppspeech::Fbank(opt, std::move(data_source)));
diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
index cc7a5e17..162c3529 100644
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
     opt.frame_opts.dither = 0.0;
     opt.frame_opts.remove_dc_offset = false;
     opt.frame_opts.window_type = "hanning";
diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc
index fea9032a..1f22263a 100644
--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 #include "frontend/audio/fbank.h"
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/feat/feature-common.h"
@@ -29,95 +28,33 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-// todo refactor later:(SmileGoat)
-
-Fbank::Fbank(const FbankOptions& opts,
-             std::unique_ptr<FrontendInterface> base_extractor)
+FbankComputer::FbankComputer(const Options& opts)
     : opts_(opts),
-      computer_(opts.fbank_opts),
-      window_function_(opts.fbank_opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
-    chunk_sample_size_ = static_cast<int32>(
-        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
-}
+    computer_(opts) {}
 
-void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
+int32 FbankComputer::Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }
 
-bool Fbank::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> wav(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&wav);
-    if (flag == false || wav.Dim() == 0) return false;
-
-    // append remaned waves
-    int32 wav_len = wav.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
-
-    // compute speech feature
-    Compute(waves, feats);
-
-    // cache remaned waves
-    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
-    int32 frame_shift = frame_opts.WindowShift();
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
+bool FbankComputer::NeedRawLogEnergy() {
+    return opts_.use_energy && opts_.raw_energy; 
 }
 
-// Compute spectrogram feat
-bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
-    const kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    int32 num_samples = waves.Dim();
-    int32 frame_length = frame_opts.WindowSize();
-    int32 sample_rate = frame_opts.samp_freq;
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
-
-    Vector<BaseFloat> window;
-    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (int32 frame = 0; frame < num_frames; frame++) {
-        BaseFloat raw_log_energy = 0.0;
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame,
-                             frame_opts,
-                             window_function_,
-                             &window,
-                             need_raw_log_energy ? &raw_log_energy : NULL);
-
-
-        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
-        // note: this online feature-extraction code does not support VTLN.
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
-        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
-        if (!opts_.fbank_opts.use_power) {
-            power_spectrum.ApplyPow(0.5);
-        }
-        int32 mel_offset =
-            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
-                                                                           : 0);
-        SubVector<BaseFloat> mel_energies(
-            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
-        mel_bank.Compute(power_spectrum, &mel_energies);
-        mel_energies.ApplyFloor(1e-07);
-        mel_energies.ApplyLog();
-        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
+// Compute feat
+bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
+    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
+    if (!opts_.use_power) {
+        power_spectrum.ApplyPow(0.5);
     }
+    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+    SubVector<BaseFloat> mel_energies(
+        *feat, mel_offset, opts_.mel_opts.num_bins);
+    mel_bank.Compute(power_spectrum, &mel_energies);
+    mel_energies.ApplyFloor(1e-07);
+    mel_energies.ApplyLog();
     return true;
 }
 
diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
index 66957dc6..e513b969 100644
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -16,62 +16,35 @@
 
 #include "base/common.h"
 #include "frontend/audio/frontend_itf.h"
+#include "frontend/audio/feature_common.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
 #include "kaldi/matrix/kaldi-vector.h"
 
 namespace ppspeech {
 
-struct FbankOptions {
-    kaldi::FbankOptions fbank_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        fbank_opts.Register(opts);
-    }
-};
-
-
-class Fbank : public FrontendInterface {
+class FbankComputer {
   public:
-    explicit Fbank(const FbankOptions& opts,
-                   std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    typedef kaldi::FbankOptions Options;     
+    explicit FbankComputer(const Options& opts);
 
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return computer_.Dim(); }
-
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+      return opts_.frame_opts;
+    }
 
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
+    int32 Dim() const;
 
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
-    }
+    bool NeedRawLogEnergy();
 
   private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    Options opts_;
 
-    FbankOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-
-    kaldi::FeatureWindowFunction window_function_;
     kaldi::FbankComputer computer_;
-    // features_ is the Mfcc or Plp or Fbank features that we have already
-    // computed.
-    kaldi::Vector<kaldi::BaseFloat> features_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    kaldi::int32 chunk_sample_size_;
-
-    DISALLOW_COPY_AND_ASSIGN(Fbank);
+    //DISALLOW_COPY_AND_ASSIGN(FbankComputer);
 };
 
+typedef StreamingFeatureTpl<FbankComputer> Fbank;
+
 }  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h
new file mode 100644
index 00000000..0763f012
--- /dev/null
+++ b/speechx/speechx/frontend/audio/feature_common.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "frontend_itf.h"
+#include "kaldi/feat/feature-window.h"
+
+namespace ppspeech {
+
+template <class F>
+class StreamingFeatureTpl : public FrontendInterface {
+  public:
+    typedef typename F::Options Options;
+    StreamingFeatureTpl(const Options& opts, 
+                        std::unique_ptr<FrontendInterface> base_extractor);
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    
+    // Dim() is the dimension of a single frame's feature vector
+    virtual size_t Dim() const { return computer_.Dim(); }
+
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }
+  private:
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, 
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    Options opts_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::FeatureWindowFunction window_function_;
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+
+}  // namespace ppspeech
+
+#include "frontend/audio/feature_common_inl.h"
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h
new file mode 100644
index 00000000..0199ab41
--- /dev/null
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+namespace ppspeech {
+
+template <class F>
+StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts, 
+                        std::unique_ptr<FrontendInterface> base_extractor):
+                        opts_(opts),
+                        computer_(opts),
+                        window_function_(opts.frame_opts) {
+    base_extractor_ = std::move(base_extractor);
+}
+
+template <class F>
+void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
+}
+
+template <class F>
+bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
+    bool flag = base_extractor_->Read(&wav);
+    if (flag == false || wav.Dim() == 0) return false;
+
+    // append remained waves
+    int32 wav_len = wav.Dim();
+    int32 left_len = remained_wav_.Dim();
+    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // compute speech feature
+    Compute(waves, feats);
+
+    // cache remained waves
+    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
+    int32 frame_shift = frame_opts.WindowShift();
+    int32 left_samples = waves.Dim() - frame_shift * num_frames;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(
+        waves.Range(frame_shift * num_frames, left_samples));
+    return true;
+}
+
+// Compute feat
+template <class F>
+bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                                     kaldi::Vector<kaldi::BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    int32 num_samples = waves.Dim();
+    int32 frame_length = frame_opts.WindowSize();
+    int32 sample_rate = frame_opts.samp_freq;
+    if (num_samples < frame_length) {
+        return true;
+    }
+
+    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * Dim());
+
+    kaldi::Vector<kaldi::BaseFloat> window;
+    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (int32 frame = 0; frame < num_frames; frame++) {
+        kaldi::BaseFloat raw_log_energy = 0.0;
+        kaldi::ExtractWindow(0,
+                             waves,
+                             frame,
+                             frame_opts,
+                             window_function_,
+                             &window,
+                             need_raw_log_energy ? &raw_log_energy : NULL);
+
+        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        computer_.Compute(&window, &this_feature);
+        kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index b848f548..49b2f267 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
     bool to_float32;  // true, only for linear feature
     bool use_fbank;
     LinearSpectrogramOptions linear_spectrogram_opts;
-    FbankOptions fbank_opts;
+    kaldi::FbankOptions fbank_opts;
     FeatureCacheOptions feature_cache_opts;
     AssemblerOptions assembler_opts;
 
diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc
index 9ef5e766..76580fd5 100644
--- a/speechx/speechx/frontend/audio/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc
@@ -28,81 +28,32 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
-LinearSpectrogram::LinearSpectrogram(
-    const LinearSpectrogramOptions& opts,
-    std::unique_ptr<FrontendInterface> base_extractor)
-    : opts_(opts), feature_window_funtion_(opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
+LinearSpectrogramComputer::LinearSpectrogramComputer(
+    const Options& opts)
+    : opts_(opts) {
+    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
     int32 window_size = opts.frame_opts.WindowSize();
-    int32 window_shift = opts.frame_opts.WindowShift();
+    frame_length_ = window_size;
     dim_ = window_size / 2 + 1;
-    chunk_sample_size_ =
-        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
-    hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
-                                           feature_window_funtion_.window);
-}
-
-void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
-}
-
-bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> input_feats(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&input_feats);
-    if (flag == false || input_feats.Dim() == 0) return false;
-
-    int32 feat_len = input_feats.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(feat_len + left_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, feat_len).CopyFromVec(input_feats);
-    Compute(waves, feats);
-    int32 frame_shift = opts_.frame_opts.WindowShift();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
+    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
+                                          feature_window_function.window);
+    int32 sample_rate = opts.frame_opts.samp_freq;
+    scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }
 
 // Compute spectrogram feat
-bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
-                                Vector<BaseFloat>* feats) {
-    int32 num_samples = waves.Dim();
-    int32 frame_length = opts_.frame_opts.WindowSize();
-    int32 sample_rate = opts_.frame_opts.samp_freq;
-    BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
-
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
-    feats->Resize(num_frames * dim_);
-    Vector<BaseFloat> window;
-
-    for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame_idx,
-                             opts_.frame_opts,
-                             feature_window_funtion_,
-                             &window,
-                             NULL);
-
-        SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
-        window.Resize(frame_length, kaldi::kCopyData);
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        SubVector<BaseFloat> power_spectrum(window, 0, dim_);
-        power_spectrum.Scale(scale);
-        power_spectrum(0) = power_spectrum(0) / 2;
-        power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
-        power_spectrum.Add(1e-14);
-        power_spectrum.ApplyLog();
-        output_row.CopyFromVec(power_spectrum);
-    }
+bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
+                                Vector<BaseFloat>* feat) {
+    window->Resize(frame_length_, kaldi::kCopyData);
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
+    power_spectrum.Scale(scale_);
+    power_spectrum(0) = power_spectrum(0) / 2;
+    power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
+    power_spectrum.Add(1e-14);
+    power_spectrum.ApplyLog();
+    feat->CopyFromVec(power_spectrum);
     return true;
 }
 
diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h
index 2764b7cf..7ef30dbc 100644
--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@@ -18,52 +18,41 @@
 #include "base/common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-window.h"
+#include "frontend/audio/feature_common.h"
 
 namespace ppspeech {
 
 struct LinearSpectrogramOptions {
     kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        frame_opts.Register(opts);
-    }
+    LinearSpectrogramOptions(): frame_opts() {}
 };
 
-class LinearSpectrogram : public FrontendInterface {
+class LinearSpectrogramComputer {
   public:
-    explicit LinearSpectrogram(
-        const LinearSpectrogramOptions& opts,
-        std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return dim_; }
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
+    typedef LinearSpectrogramOptions Options;
+    explicit LinearSpectrogramComputer(const Options& opts);
+
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+      return opts_.frame_opts;
     }
 
-  private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
 
-    size_t dim_;
-    kaldi::FeatureWindowFunction feature_window_funtion_;
-    kaldi::BaseFloat hanning_window_energy_;
-    LinearSpectrogramOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    int chunk_sample_size_;
-    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+    int32 Dim() const { return dim_; }
+
+    bool NeedRawLogEnergy() { return false; }
+    
+  private:
+    kaldi::BaseFloat scale_;
+    Options opts_;
+    int32 frame_length_;
+    int32 dim_;
 };
 
+typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
+
+    //DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+
 
 }  // namespace ppspeech
\ No newline at end of file

From ce322a60f8105895857ee8997f8feec35be70e7f Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 1 Jun 2022 22:45:23 +0800
Subject: [PATCH 2/5] remove streaming opt, test=doc

---
 speechx/examples/custom_asr/run.sh                    | 1 -
 speechx/examples/ds2_ol/aishell/run.sh                | 2 --
 speechx/examples/ds2_ol/aishell/run_fbank.sh          | 1 -
 speechx/examples/ds2_ol/websocket/websocket_client.sh | 2 +-
 speechx/examples/ds2_ol/websocket/websocket_server.sh | 1 -
 speechx/speechx/decoder/param.h                       | 1 -
 speechx/speechx/decoder/recognizer_main.cc            | 4 ++--
 7 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh
index dddcf9fd..ed67a52b 100644
--- a/speechx/examples/custom_asr/run.sh
+++ b/speechx/examples/custom_asr/run.sh
@@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   recognizer_test_main \
     --wav_rspecifier=scp:$wav_scp \
     --cmvn_file=$cmvn \
-    --streaming_chunk=30 \
     --use_fbank=true \
     --model_path=$model_dir/avg_10.jit.pdmodel \
     --param_path=$model_dir/avg_10.jit.pdiparams \
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index e1001e25..97ce1e65 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
-        --cmvn_file=$cmvn \
-        --streaming_chunk=0.36
+        --cmvn_file=$cmvn
     echo "feature make have finished!!!"
 fi
 
@@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_1.jit.pdmodel \
-        --streaming_chunk=30 \
         --param_path=$model_dir/avg_1.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 6e131677..6712eb1c 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_5.jit.pdmodel \
-        --streaming_chunk=30 \
         --use_fbank=true \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh
index 2a52d2a3..7cd0fdab 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
 
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
\ No newline at end of file
+    --wav_rspecifier=scp:$data/$aishell_wav_scp
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
index f798dfd4..45dbf303 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -62,7 +62,6 @@ fi
 websocket_server_main \
     --cmvn_file=$cmvn \
     --model_path=$model_dir/avg_1.jit.pdmodel \
-    --streaming_chunk=0.1 \
     --param_path=$model_dir/avg_1.jit.pdiparams \
     --word_symbol_table=$wfst/words.txt \
     --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index c8396a58..f3560343 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
-DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
              7,
diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc
index 7aef73f7..027dadd6 100644
--- a/speechx/speechx/decoder/recognizer_main.cc
+++ b/speechx/speechx/decoder/recognizer_main.cc
@@ -33,7 +33,7 @@ int main(int argc, char* argv[]) {
     kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
 
     int sample_rate = FLAGS_sample_rate;
-    float streaming_chunk = FLAGS_streaming_chunk;
+    float streaming_chunk = 0.1;
     int chunk_sample_size = streaming_chunk * sample_rate;
     LOG(INFO) << "sr: " << sample_rate;
     LOG(INFO) << "chunk size (s): " << streaming_chunk;
@@ -96,4 +96,4 @@ int main(int argc, char* argv[]) {
     KALDI_LOG << " cost:" << elapsed << " s";
     KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
     KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
-}
\ No newline at end of file
+}

From 1a10c4cde89eafe9de7dc4c191bcabbb6ff80ca0 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 1 Jun 2022 22:58:30 +0800
Subject: [PATCH 3/5] format

---
 speechx/speechx/frontend/audio/fbank.h              |  8 ++++----
 speechx/speechx/frontend/audio/linear_spectrogram.h | 11 +++++------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
index e513b969..a1e65413 100644
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -15,8 +15,8 @@
 #pragma once
 
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/feature_common.h"
+#include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
 #include "kaldi/matrix/kaldi-vector.h"
@@ -25,11 +25,11 @@ namespace ppspeech {
 
 class FbankComputer {
   public:
-    typedef kaldi::FbankOptions Options;     
+    typedef kaldi::FbankOptions Options;
     explicit FbankComputer(const Options& opts);
 
     kaldi::FrameExtractionOptions& GetFrameOptions() {
-      return opts_.frame_opts;
+        return opts_.frame_opts;
     }
 
     bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
@@ -42,7 +42,7 @@ class FbankComputer {
     Options opts_;
 
     kaldi::FbankComputer computer_;
-    //DISALLOW_COPY_AND_ASSIGN(FbankComputer);
+    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
 };
 
 typedef StreamingFeatureTpl<FbankComputer> Fbank;
diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h
index 7ef30dbc..de957c23 100644
--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@@ -16,15 +16,15 @@
 #pragma once
 
 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-window.h"
-#include "frontend/audio/feature_common.h"
 
 namespace ppspeech {
 
 struct LinearSpectrogramOptions {
     kaldi::FrameExtractionOptions frame_opts;
-    LinearSpectrogramOptions(): frame_opts() {}
+    LinearSpectrogramOptions() : frame_opts() {}
 };
 
 class LinearSpectrogramComputer {
@@ -33,7 +33,7 @@ class LinearSpectrogramComputer {
     explicit LinearSpectrogramComputer(const Options& opts);
 
     kaldi::FrameExtractionOptions& GetFrameOptions() {
-      return opts_.frame_opts;
+        return opts_.frame_opts;
     }
 
     bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
@@ -42,17 +42,16 @@ class LinearSpectrogramComputer {
     int32 Dim() const { return dim_; }
 
     bool NeedRawLogEnergy() { return false; }
-    
+
   private:
     kaldi::BaseFloat scale_;
     Options opts_;
     int32 frame_length_;
     int32 dim_;
+    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
 };
 
 typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
 
-    //DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
-
 
 }  // namespace ppspeech
\ No newline at end of file

From 583c62db772870097692d1ce19af1b3189a21ca8 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Thu, 2 Jun 2022 09:30:32 +0800
Subject: [PATCH 4/5] add streaming chunk,test=doc

---
 speechx/examples/ds2_ol/websocket/websocket_client.sh | 2 +-
 speechx/speechx/decoder/recognizer_main.cc            | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/speechx/examples/ds2_ol/websocket/websocket_client.sh b/speechx/examples/ds2_ol/websocket/websocket_client.sh
index 7cd0fdab..a508adfb 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@@ -32,4 +32,4 @@ export GLOG_logtostderr=1
 
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc
index 027dadd6..23251353 100644
--- a/speechx/speechx/decoder/recognizer_main.cc
+++ b/speechx/speechx/decoder/recognizer_main.cc
@@ -19,6 +19,7 @@
 
 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");
 
 int main(int argc, char* argv[]) {
@@ -33,7 +34,7 @@ int main(int argc, char* argv[]) {
     kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
 
     int sample_rate = FLAGS_sample_rate;
-    float streaming_chunk = 0.1;
+    float streaming_chunk = FLAGS_streaming_chunk;
     int chunk_sample_size = streaming_chunk * sample_rate;
     LOG(INFO) << "sr: " << sample_rate;
     LOG(INFO) << "chunk size (s): " << streaming_chunk;

From 6dbf30816b5b83f7f35c29e20f45cd411846950e Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Thu, 2 Jun 2022 11:04:58 +0800
Subject: [PATCH 5/5] fix comment

---
 speechx/speechx/frontend/audio/audio_cache.h        | 3 ++-
 speechx/speechx/frontend/audio/feature_common.h     | 4 ++--
 speechx/speechx/frontend/audio/feature_common_inl.h | 6 +++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
index 45498e46..e31a8aeb 100644
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -30,7 +30,8 @@ class AudioCache : public FrontendInterface {
 
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
 
-    // the audio dim is 1, one sample, we return size_ instead.
+    // a single audio sample has dim 1, which is not useful here,
+    // so we return size_ (cache samples) instead.
     virtual size_t Dim() const { return size_; }
 
     virtual void SetFinished() {
diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h
index 0763f012..e03634d3 100644
--- a/speechx/speechx/frontend/audio/feature_common.h
+++ b/speechx/speechx/frontend/audio/feature_common.h
@@ -25,7 +25,7 @@ class StreamingFeatureTpl : public FrontendInterface {
     typedef typename F::Options Options;
     StreamingFeatureTpl(const Options& opts, 
                         std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
     
     // the dim_ is the dim of single frame feature
@@ -51,4 +51,4 @@ class StreamingFeatureTpl : public FrontendInterface {
 
 }  // namespace ppspeech
 
-#include "frontend/audio/feature_common_inl.h"
\ No newline at end of file
+#include "frontend/audio/feature_common_inl.h"
diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h
index 0199ab41..a482ef55 100644
--- a/speechx/speechx/frontend/audio/feature_common_inl.h
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@@ -25,8 +25,8 @@ StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts,
 }
 
 template <class F>
-void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
+void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
+    base_extractor_->Accept(waves);
 }
 
 template <class F>
@@ -92,4 +92,4 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
     return true;
 }
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech