Merge pull request #2003 from SmileGoat/refactor_file_struct

[speechx] refactor frontend
2 years ago · 182858bf88
parent 82c1f4c508 6dbf30816b
commit 182858bf88
17 changed files with 241 additions and 250 deletions
--- a/speechx/examples/custom_asr/run.sh
+++ b/speechx/examples/custom_asr/run.sh
@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  recognizer_test_main \
    --wav_rspecifier=scp:$wav_scp \
    --cmvn_file=$cmvn \
-    --streaming_chunk=30 \
    --use_fbank=true \
    --model_path=$model_dir/avg_10.jit.pdmodel \
    --param_path=$model_dir/avg_10.jit.pdiparams \
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
        --cmvn_file=$cmvn \
-        --streaming_chunk=0.36
    echo "feature make have finished!!!"
 fi

@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_1.jit.pdmodel \
-        --streaming_chunk=30 \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_5.jit.pdmodel \
-        --streaming_chunk=30 \
        --use_fbank=true \
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@ -32,4 +32,4 @@ export GLOG_logtostderr=1

 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@ -62,7 +62,6 @@ fi
 websocket_server_main \
    --cmvn_file=$cmvn \
    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --streaming_chunk=0.1 \
    --param_path=$model_dir/avg_1.jit.pdiparams \
    --word_symbol_table=$wfst/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
-DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
             7,
@ -62,7 +61,6 @@ namespace ppspeech {
 FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
-    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
-        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+        opts.fbank_opts.frame_opts = frame_opts;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;
--- a/speechx/speechx/decoder/recognizer_main.cc
+++ b/speechx/speechx/decoder/recognizer_main.cc
@ -19,6 +19,7 @@

 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");

 int main(int argc, char* argv[]) {
@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
    KALDI_LOG << " cost:" << elapsed << " s";
    KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
    KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
-}
+}
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {

    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

-    // the audio dim is 1, one sample
-    virtual size_t Dim() const { return 1; }
+    // Returns size_ (the cached sample count) rather than the per-sample
+    // dimension, which would always be 1 and tell the caller nothing.
+    // NOTE(review): size_ is read here without taking mutex_, while
+    // SetFinished() below locks it -- confirm an unsynchronized read is OK.
+    virtual size_t Dim() const { return size_; }

    virtual void SetFinished() {
        std::lock_guard<std::mutex> lock(mutex_);
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
    std::unique_ptr<ppspeech::FrontendInterface> data_source(
        new ppspeech::AudioCache(3600 * 1600, false));

-    ppspeech::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.frame_opts.dither = 0.0;
+    kaldi::FbankOptions opt;
+    opt.frame_opts.frame_length_ms = 25;
+    opt.frame_opts.frame_shift_ms = 10;
+    opt.mel_opts.num_bins = FLAGS_num_bins;
+    opt.frame_opts.dither = 0.0;

    std::unique_ptr<ppspeech::FrontendInterface> fbank(
        new ppspeech::Fbank(opt, std::move(data_source)));
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-
 #include "frontend/audio/fbank.h"
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/feat/feature-common.h"
@ -29,95 +28,33 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;

-// todo refactor later:(SmileGoat)
-
-Fbank::Fbank(const FbankOptions& opts,
-             std::unique_ptr<FrontendInterface> base_extractor)
+// Keeps a copy of the options and builds the wrapped kaldi::FbankComputer
+// from those same options.
+FbankComputer::FbankComputer(const Options& opts)
    : opts_(opts),
-      computer_(opts.fbank_opts),
-      window_function_(opts.fbank_opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
-    chunk_sample_size_ = static_cast<int32>(
-        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
-}
+    computer_(opts) {}

-void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
+// Size of one frame's feature vector: one value per mel bin, plus one extra
+// slot when use_energy is set.
+int32 FbankComputer::Dim() const {
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }

-bool Fbank::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> wav(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&wav);
-    if (flag == false || wav.Dim() == 0) return false;
-
-    // append remaned waves
-    int32 wav_len = wav.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(left_len + wav_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, wav_len).CopyFromVec(wav);
-
-    // compute speech feature
-    Compute(waves, feats);
-
-    // cache remaned waves
-    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
-    int32 frame_shift = frame_opts.WindowShift();
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
+// True only when both use_energy and raw_energy are enabled in the options.
+bool FbankComputer::NeedRawLogEnergy() {
+    return opts_.use_energy && opts_.raw_energy; 
 }

-// Compute spectrogram feat
-bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
-    const kaldi::FrameExtractionOptions& frame_opts =
-        computer_.GetFrameOptions();
-    int32 num_samples = waves.Dim();
-    int32 frame_length = frame_opts.WindowSize();
-    int32 sample_rate = frame_opts.samp_freq;
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
-    feats->Resize(num_frames * Dim());
-
-    Vector<BaseFloat> window;
-    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
-    for (int32 frame = 0; frame < num_frames; frame++) {
-        BaseFloat raw_log_energy = 0.0;
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame,
-                             frame_opts,
-                             window_function_,
-                             &window,
-                             need_raw_log_energy ? &raw_log_energy : NULL);
-
-
-        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
-        // note: this online feature-extraction code does not support VTLN.
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
-        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
-        if (!opts_.fbank_opts.use_power) {
-            power_spectrum.ApplyPow(0.5);
-        }
-        int32 mel_offset =
-            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
-                                                                           : 0);
-        SubVector<BaseFloat> mel_energies(
-            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
-        mel_bank.Compute(power_spectrum, &mel_energies);
-        mel_energies.ApplyFloor(1e-07);
-        mel_energies.ApplyLog();
-        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
-        output_row.CopyFromVec(this_feature);
+// Computes the log mel filterbank energies of a single frame.
+//   window: samples of one frame; it is handed to the FFT and power-spectrum
+//           routines and then read back, so its contents are not preserved.
+//   feat:   output vector; must already be sized by the caller, because only
+//           a sub-range of it is written here.
+// Always returns true.
+// NOTE(review): when use_energy is set, Dim() reserves one extra slot but
+// nothing in this function writes it -- confirm where the energy value is
+// supposed to be filled in.
+bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    // NOTE(review): 1.0 is presumably the VTLN warp factor (no warping) -- confirm.
+    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
+    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
+    // Take the square root of the power spectrum when use_power is off.
+    if (!opts_.use_power) {
+        power_spectrum.ApplyPow(0.5);
    }
+    // The mel values start at index 1 when energy is enabled and htk_compat
+    // is off, otherwise at index 0.
+    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
+    SubVector<BaseFloat> mel_energies(
+        *feat, mel_offset, opts_.mel_opts.num_bins);
+    mel_bank.Compute(power_spectrum, &mel_energies);
+    // Floor before taking the log so that zero-energy bins stay finite.
+    mel_energies.ApplyFloor(1e-07);
+    mel_energies.ApplyLog();
    return true;
 }

--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@ -15,6 +15,7 @@
 #pragma once

 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
@ -22,56 +23,28 @@

 namespace ppspeech {

-struct FbankOptions {
-    kaldi::FbankOptions fbank_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        fbank_opts.Register(opts);
-    }
-};
-
-
-class Fbank : public FrontendInterface {
+// Per-frame fbank computation built on kaldi::FbankComputer. It holds no
+// streaming state of its own; it exposes the members (Options,
+// GetFrameOptions, Compute, Dim, NeedRawLogEnergy) that StreamingFeatureTpl
+// uses on its computer, see the Fbank typedef at the end of this header.
+class FbankComputer {
  public:
-    explicit Fbank(const FbankOptions& opts,
-                   std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    typedef kaldi::FbankOptions Options;
+    explicit FbankComputer(const Options& opts);

-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return computer_.Dim(); }
-
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
+    // Framing options taken from the stored fbank options. Returned by
+    // non-const reference, so callers are able to modify them.
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+        return opts_.frame_opts;
+    }

-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+    // Computes the features of one frame from `window` into `feat`.
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
+    // Number of values in one frame's feature vector.
+    int32 Dim() const;

-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
-    }
+    // Whether the options ask for the raw log energy of each frame.
+    bool NeedRawLogEnergy();

  private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    Options opts_;

-    FbankOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-
-    kaldi::FeatureWindowFunction window_function_;
    kaldi::FbankComputer computer_;
-    // features_ is the Mfcc or Plp or Fbank features that we have already
-    // computed.
-    kaldi::Vector<kaldi::BaseFloat> features_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    kaldi::int32 chunk_sample_size_;
-
-    DISALLOW_COPY_AND_ASSIGN(Fbank);
+    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
 };

+typedef StreamingFeatureTpl<FbankComputer> Fbank;
+
 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_common.h
+++ b/speechx/speechx/frontend/audio/feature_common.h
@ -0,0 +1,54 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "frontend_itf.h"
+#include "kaldi/feat/feature-window.h"
+
+namespace ppspeech {
+
+// Streaming front-end stage shared by the feature types. It pulls samples
+// from an upstream FrontendInterface, splits them into frames and hands each
+// frame to a computer of type F.
+// F must provide: a nested Options type with a frame_opts member, a
+// constructor taking Options, GetFrameOptions(), Dim(), NeedRawLogEnergy()
+// and Compute(window, feat).
+template <class F>
+class StreamingFeatureTpl : public FrontendInterface {
+  public:
+    typedef typename F::Options Options;
+    // Takes ownership of base_extractor, the stage samples are read from.
+    StreamingFeatureTpl(const Options& opts, 
+                        std::unique_ptr<FrontendInterface> base_extractor);
+    // Passes samples on to the upstream stage.
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+    // Reads samples from upstream and writes the resulting features to feats.
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+    
+    // Dimension of a single frame's feature vector, as reported by the computer.
+    virtual size_t Dim() const { return computer_.Dim(); }
+
+    virtual void SetFinished() { base_extractor_->SetFinished(); }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    // Resets the upstream stage and drops the samples kept from the last Read.
+    virtual void Reset() {
+        base_extractor_->Reset();
+        remained_wav_.Resize(0);
+    }
+  private:
+    // Frames `waves` and fills `feats` with one feature row per frame.
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, 
+                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    Options opts_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    kaldi::FeatureWindowFunction window_function_;
+    // Samples left over from the previous Read, prepended to the next chunk.
+    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    F computer_;
+};
+
+}  // namespace ppspeech
+
+#include "frontend/audio/feature_common_inl.h"
--- a/speechx/speechx/frontend/audio/feature_common_inl.h
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+namespace ppspeech {
+
+// Takes ownership of base_extractor (the upstream stage) and builds the
+// window function and the computer from opts. The initializers are listed in
+// member declaration order, which is the order they actually run in.
+template <class F>
+StreamingFeatureTpl<F>::StreamingFeatureTpl(
+    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts),
+      base_extractor_(std::move(base_extractor)),
+      window_function_(opts.frame_opts),
+      computer_(opts) {}
+
+// Forwards the incoming samples unchanged to the upstream extractor.
+template <class F>
+void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
+    base_extractor_->Accept(waves);
+}
+
+// Reads one chunk of samples from upstream, prepends the samples left over
+// from the previous call, computes features into `feats`, and keeps the
+// unconsumed tail for the next call.
+// Returns false when the upstream Read fails or yields no samples, true
+// otherwise. Note that Compute leaves `feats` untouched when there are fewer
+// samples than one window, and this function still returns true then.
+template <class F>
+bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    // The read buffer is sized from the upstream Dim(); its length is
+    // re-checked after the upstream Read.
+    kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
+    bool flag = base_extractor_->Read(&wav);
+    if (flag == false || wav.Dim() == 0) return false;
+
+    // prepend the samples remaining from the previous call
+    int32 wav_len = wav.Dim();
+    int32 left_len = remained_wav_.Dim();
+    kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
+    waves.Range(0, left_len).CopyFromVec(remained_wav_);
+    waves.Range(left_len, wav_len).CopyFromVec(wav);
+
+    // compute speech feature
+    Compute(waves, feats);
+
+    // cache the remaining samples: everything past frame_shift * num_frames
+    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
+    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
+    int32 frame_shift = frame_opts.WindowShift();
+    int32 left_samples = waves.Dim() - frame_shift * num_frames;
+    remained_wav_.Resize(left_samples);
+    remained_wav_.CopyFromVec(
+        waves.Range(frame_shift * num_frames, left_samples));
+    return true;
+}
+
+// Splits `waves` into frames and computes one feature vector per frame.
+// `feats` is resized to num_frames * Dim() and filled row by row, each row
+// holding the Dim() values of one frame.
+// When `waves` holds fewer samples than one window, `feats` is left untouched.
+// Always returns true.
+template <class F>
+bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                                     kaldi::Vector<kaldi::BaseFloat>* feats) {
+    const kaldi::FrameExtractionOptions& frame_opts =
+        computer_.GetFrameOptions();
+    int32 num_samples = waves.Dim();
+    int32 frame_length = frame_opts.WindowSize();
+    if (num_samples < frame_length) {
+        return true;
+    }
+
+    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
+    feats->Resize(num_frames * Dim());
+
+    kaldi::Vector<kaldi::BaseFloat> window;
+    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
+    for (int32 frame = 0; frame < num_frames; frame++) {
+        // NOTE(review): raw_log_energy is requested from ExtractWindow when
+        // needed, but it is never passed on to computer_.Compute -- confirm
+        // whether the energy value should reach the feature vector.
+        kaldi::BaseFloat raw_log_energy = 0.0;
+        kaldi::ExtractWindow(0,
+                             waves,
+                             frame,
+                             frame_opts,
+                             window_function_,
+                             &window,
+                             need_raw_log_energy ? &raw_log_energy : nullptr);
+
+        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        computer_.Compute(&window, &this_feature);
+        // Copy this frame's features into its row of the flat output vector.
+        kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        output_row.CopyFromVec(this_feature);
+    }
+    return true;
+}
+
+}  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
    bool to_float32;  // true, only for linear feature
    bool use_fbank;
    LinearSpectrogramOptions linear_spectrogram_opts;
-    FbankOptions fbank_opts;
+    kaldi::FbankOptions fbank_opts;
    FeatureCacheOptions feature_cache_opts;
    AssemblerOptions assembler_opts;

--- a/speechx/speechx/frontend/audio/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc
@ -28,81 +28,32 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;

-LinearSpectrogram::LinearSpectrogram(
-    const LinearSpectrogramOptions& opts,
-    std::unique_ptr<FrontendInterface> base_extractor)
-    : opts_(opts), feature_window_funtion_(opts.frame_opts) {
-    base_extractor_ = std::move(base_extractor);
+// Precomputes everything that does not change from frame to frame: the frame
+// length, the output dimension and the scale applied to the power spectrum.
+LinearSpectrogramComputer::LinearSpectrogramComputer(
+    const Options& opts)
+    : opts_(opts) {
+    // The window function is only needed here to derive scale_, so it is a
+    // local rather than a member.
+    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
    int32 window_size = opts.frame_opts.WindowSize();
-    int32 window_shift = opts.frame_opts.WindowShift();
+    frame_length_ = window_size;
    dim_ = window_size / 2 + 1;
-    chunk_sample_size_ =
-        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
-    hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
-                                           feature_window_funtion_.window);
-}
-
-void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
-    base_extractor_->Accept(inputs);
-}
-
-bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
-    Vector<BaseFloat> input_feats(chunk_sample_size_);
-    bool flag = base_extractor_->Read(&input_feats);
-    if (flag == false || input_feats.Dim() == 0) return false;
-
-    int32 feat_len = input_feats.Dim();
-    int32 left_len = remained_wav_.Dim();
-    Vector<BaseFloat> waves(feat_len + left_len);
-    waves.Range(0, left_len).CopyFromVec(remained_wav_);
-    waves.Range(left_len, feat_len).CopyFromVec(input_feats);
-    Compute(waves, feats);
-    int32 frame_shift = opts_.frame_opts.WindowShift();
-    int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
-    int32 left_samples = waves.Dim() - frame_shift * num_frames;
-    remained_wav_.Resize(left_samples);
-    remained_wav_.CopyFromVec(
-        waves.Range(frame_shift * num_frames, left_samples));
-    return true;
+    // Despite the name, this is computed from whatever window frame_opts
+    // selects. NOTE(review): VecVec is presumably the dot product, i.e. the
+    // sum of squared window coefficients -- confirm.
+    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
+                                          feature_window_function.window);
+    int32 sample_rate = opts.frame_opts.samp_freq;
+    scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }

 // Compute spectrogram feat
-bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
-                                Vector<BaseFloat>* feats) {
-    int32 num_samples = waves.Dim();
-    int32 frame_length = opts_.frame_opts.WindowSize();
-    int32 sample_rate = opts_.frame_opts.samp_freq;
-    BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
-
-    if (num_samples < frame_length) {
-        return true;
-    }
-
-    int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
-    feats->Resize(num_frames * dim_);
-    Vector<BaseFloat> window;
-
-    for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
-        kaldi::ExtractWindow(0,
-                             waves,
-                             frame_idx,
-                             opts_.frame_opts,
-                             feature_window_funtion_,
-                             &window,
-                             NULL);
-
-        SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
-        window.Resize(frame_length, kaldi::kCopyData);
-        RealFft(&window, true);
-        kaldi::ComputePowerSpectrum(&window);
-        SubVector<BaseFloat> power_spectrum(window, 0, dim_);
-        power_spectrum.Scale(scale);
-        power_spectrum(0) = power_spectrum(0) / 2;
-        power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
-        power_spectrum.Add(1e-14);
-        power_spectrum.ApplyLog();
-        output_row.CopyFromVec(power_spectrum);
-    }
+// Computes the log linear spectrogram of a single frame.
+//   window: samples of one frame; resized and overwritten by the FFT and
+//           power-spectrum steps, so its contents are not preserved.
+//   feat:   output; receives the first dim_ spectrum values. It is expected
+//           to be sized by the caller (presumably to Dim() -- confirm).
+// Always returns true.
+bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
+                                Vector<BaseFloat>* feat) {
+    // Force the window to exactly frame_length_ samples, keeping its data.
+    window->Resize(frame_length_, kaldi::kCopyData);
+    RealFft(window, true);
+    kaldi::ComputePowerSpectrum(window);
+    SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
+    power_spectrum.Scale(scale_);
+    // The first and last bins get half the scale of the others.
+    power_spectrum(0) = power_spectrum(0) / 2;
+    power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
+    // Small offset so that the log of an all-zero bin stays finite.
+    power_spectrum.Add(1e-14);
+    power_spectrum.ApplyLog();
+    feat->CopyFromVec(power_spectrum);
    return true;
 }

--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@ -16,6 +16,7 @@
 #pragma once

 #include "base/common.h"
+#include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-window.h"

@ -23,47 +24,34 @@ namespace ppspeech {

 struct LinearSpectrogramOptions {
+    // Framing and windowing configuration; the only option left in this
+    // struct after streaming_chunk was removed.
    kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
-
-    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("streaming-chunk",
-                       &streaming_chunk,
-                       "streaming chunk size, default: 0.1 sec");
-        frame_opts.Register(opts);
-    }
+    LinearSpectrogramOptions() : frame_opts() {}
 };

-class LinearSpectrogram : public FrontendInterface {
+// Per-frame linear spectrogram computation. It holds no streaming state of
+// its own; it exposes the members (Options, GetFrameOptions, Compute, Dim,
+// NeedRawLogEnergy) that StreamingFeatureTpl uses on its computer, see the
+// LinearSpectrogram typedef after this class.
+class LinearSpectrogramComputer {
  public:
-    explicit LinearSpectrogram(
-        const LinearSpectrogramOptions& opts,
-        std::unique_ptr<FrontendInterface> base_extractor);
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the dim of single frame feature
-    virtual size_t Dim() const { return dim_; }
-    virtual void SetFinished() { base_extractor_->SetFinished(); }
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
-    virtual void Reset() {
-        base_extractor_->Reset();
-        remained_wav_.Resize(0);
+    typedef LinearSpectrogramOptions Options;
+    explicit LinearSpectrogramComputer(const Options& opts);
+
+    // Framing options from the stored options. Returned by non-const
+    // reference, so callers are able to modify them.
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
+        return opts_.frame_opts;
    }

-  private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                 kaldi::Vector<kaldi::BaseFloat>* feats);
+    // Computes the features of one frame from `window` into `feat`.
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);

-    size_t dim_;
-    kaldi::FeatureWindowFunction feature_window_funtion_;
-    kaldi::BaseFloat hanning_window_energy_;
-    LinearSpectrogramOptions opts_;
-    std::unique_ptr<FrontendInterface> base_extractor_;
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
-    int chunk_sample_size_;
-    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+    // Number of values in one frame's feature vector.
+    int32 Dim() const { return dim_; }
+
+    // This feature type never asks for the raw log energy of a frame.
+    bool NeedRawLogEnergy() { return false; }
+
+  private:
+    // Factor applied to the power spectrum; set once in the constructor.
+    kaldi::BaseFloat scale_;
+    Options opts_;
+    // Window size in samples, and the number of spectrum values kept per frame.
+    int32 frame_length_;
+    int32 dim_;
+    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
 };

+typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
+

 }  // namespace ppspeech