Merge pull request #2003 from SmileGoat/refactor_file_struct

[speechx] refactor frontend
4 years ago · 182858bf88
parent 82c1f4c508 6dbf30816b
commit 182858bf88
17 changed files with 241 additions and 250 deletions
--- a/speechx/examples/custom_asr/run.sh
+++ b/speechx/examples/custom_asr/run.sh
@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  recognizer_test_main \
    --wav_rspecifier=scp:$wav_scp \
    --cmvn_file=$cmvn \
    --streaming_chunk=30 \
    --use_fbank=true \
    --model_path=$model_dir/avg_10.jit.pdmodel \
    --param_path=$model_dir/avg_10.jit.pdiparams \
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
        --cmvn_file=$cmvn \
        --streaming_chunk=0.36
    echo "feature make have finished!!!"
 fi
@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --streaming_chunk=30 \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --streaming_chunk=30 \
        --use_fbank=true \
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
--- a/speechx/examples/ds2_ol/websocket/websocket_client.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_client.sh
@ -32,4 +32,4 @@ export GLOG_logtostderr=1
 # websocket client
 websocket_client_main \
-    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36
+    --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@ -62,7 +62,6 @@ fi
 websocket_server_main \
    --cmvn_file=$cmvn \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --streaming_chunk=0.1 \
    --param_path=$model_dir/avg_1.jit.pdiparams \
    --word_symbol_table=$wfst/words.txt \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
 // feature sliding window
 DEFINE_int32(receptive_field_length,
             7,
@ -62,7 +61,6 @@ namespace ppspeech {
 FeaturePipelineOptions InitFeaturePipelineOptions() {
    FeaturePipelineOptions opts;
    opts.cmvn_file = FLAGS_cmvn_file;
    opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
    kaldi::FrameExtractionOptions frame_opts;
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
-        opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-        opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+        opts.fbank_opts.frame_opts = frame_opts;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;
--- a/speechx/speechx/decoder/recognizer_main.cc
+++ b/speechx/speechx/decoder/recognizer_main.cc
@ -19,6 +19,7 @@
 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(sample_rate, 16000, "sample rate");
 int main(int argc, char* argv[]) {
@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
    KALDI_LOG << " cost:" << elapsed << " s";
    KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
    KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
-}
+}
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
-    // the audio dim is 1, one sample
+    // the audio dim is 1, one sample, which is useless, 
-    virtual size_t Dim() const { return 1; }
+    // so we return size_(cache samples) instead.
    virtual size_t Dim() const { return size_; }
    virtual void SetFinished() {
        std::lock_guard<std::mutex> lock(mutex_);
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
    std::unique_ptr<ppspeech::FrontendInterface> data_source(
        new ppspeech::AudioCache(3600 * 1600, false));
-    ppspeech::FbankOptions opt;
+    kaldi::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
+    opt.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
+    opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
+    opt.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+    opt.frame_opts.dither = 0.0;
    opt.fbank_opts.frame_opts.dither = 0.0;
    std::unique_ptr<ppspeech::FrontendInterface> fbank(
        new ppspeech::Fbank(opt, std::move(data_source)));
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
    opt.streaming_chunk = FLAGS_streaming_chunk;
    opt.frame_opts.dither = 0.0;
    opt.frame_opts.remove_dc_offset = false;
    opt.frame_opts.window_type = "hanning";
--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "frontend/audio/fbank.h"
 #include "kaldi/base/kaldi-math.h"
 #include "kaldi/feat/feature-common.h"
@ -29,95 +28,33 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
-// todo refactor later:(SmileGoat)
+FbankComputer::FbankComputer(const Options& opts)
 Fbank::Fbank(const FbankOptions& opts,
             std::unique_ptr<FrontendInterface> base_extractor)
    : opts_(opts),
-      computer_(opts.fbank_opts),
+    computer_(opts) {}
      window_function_(opts.fbank_opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
    chunk_sample_size_ = static_cast<int32>(
        opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
 }
-void Fbank::Accept(const VectorBase<BaseFloat>& inputs) {
+int32 FbankComputer::Dim() const {
-    base_extractor_->Accept(inputs);
+    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }
-bool Fbank::Read(Vector<BaseFloat>* feats) {
+bool FbankComputer::NeedRawLogEnergy() {
-    Vector<BaseFloat> wav(chunk_sample_size_);
+    return opts_.use_energy && opts_.raw_energy; 
    bool flag = base_extractor_->Read(&wav);
    if (flag == false || wav.Dim() == 0) return false;
    // append remaned waves
    int32 wav_len = wav.Dim();
    int32 left_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);
    // compute speech feature
    Compute(waves, feats);
    // cache remaned waves
    kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
    int32 frame_shift = frame_opts.WindowShift();
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
    return true;
 }
-// Compute spectrogram feat
+// Compute feat
-bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
+bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
-    const kaldi::FrameExtractionOptions& frame_opts =
+    RealFft(window, true);
-        computer_.GetFrameOptions();
+    kaldi::ComputePowerSpectrum(window);
-    int32 num_samples = waves.Dim();
+    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
-    int32 frame_length = frame_opts.WindowSize();
+    SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
-    int32 sample_rate = frame_opts.samp_freq;
+    if (!opts_.use_power) {
-    if (num_samples < frame_length) {
+        power_spectrum.ApplyPow(0.5);
        return true;
    }
    int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
    feats->Resize(num_frames * Dim());
    Vector<BaseFloat> window;
    bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    for (int32 frame = 0; frame < num_frames; frame++) {
        BaseFloat raw_log_energy = 0.0;
        kaldi::ExtractWindow(0,
                             waves,
                             frame,
                             frame_opts,
                             window_function_,
                             &window,
                             need_raw_log_energy ? &raw_log_energy : NULL);
        Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
        // note: this online feature-extraction code does not support VTLN.
        RealFft(&window, true);
        kaldi::ComputePowerSpectrum(&window);
        const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
        if (!opts_.fbank_opts.use_power) {
            power_spectrum.ApplyPow(0.5);
        }
        int32 mel_offset =
            ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
                                                                           : 0);
        SubVector<BaseFloat> mel_energies(
            this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
        mel_bank.Compute(power_spectrum, &mel_energies);
        mel_energies.ApplyFloor(1e-07);
        mel_energies.ApplyLog();
        SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
        output_row.CopyFromVec(this_feature);
    }
    int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
    SubVector<BaseFloat> mel_energies(
        *feat, mel_offset, opts_.mel_opts.num_bins);
    mel_bank.Compute(power_spectrum, &mel_energies);
    mel_energies.ApplyFloor(1e-07);
    mel_energies.ApplyLog();
    return true;
 }
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@ -15,6 +15,7 @@
 #pragma once
 #include "base/common.h"
 #include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-fbank.h"
 #include "kaldi/feat/feature-mfcc.h"
@ -22,56 +23,28 @@
 namespace ppspeech {
-struct FbankOptions {
+class FbankComputer {
    kaldi::FbankOptions fbank_opts;
    kaldi::BaseFloat streaming_chunk;  // second
    FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        fbank_opts.Register(opts);
    }
 };
 class Fbank : public FrontendInterface {
  public:
-    explicit Fbank(const FbankOptions& opts,
+    typedef kaldi::FbankOptions Options;
-                   std::unique_ptr<FrontendInterface> base_extractor);
+    explicit FbankComputer(const Options& opts);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the dim of single frame feature
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
-    virtual size_t Dim() const { return computer_.Dim(); }
+        return opts_.frame_opts;
-
+    }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
-    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
                 kaldi::Vector<kaldi::BaseFloat>* feat);
    int32 Dim() const;
-    virtual void Reset() {
+    bool NeedRawLogEnergy();
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }
  private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+    Options opts_;
                 kaldi::Vector<kaldi::BaseFloat>* feats);
    FbankOptions opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
    kaldi::FeatureWindowFunction window_function_;
    kaldi::FbankComputer computer_;
-    // features_ is the Mfcc or Plp or Fbank features that we have already
+    DISALLOW_COPY_AND_ASSIGN(FbankComputer);
    // computed.
    kaldi::Vector<kaldi::BaseFloat> features_;
    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
    kaldi::int32 chunk_sample_size_;
    DISALLOW_COPY_AND_ASSIGN(Fbank);
 };
 typedef StreamingFeatureTpl<FbankComputer> Fbank;
 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_common.h
+++ b/speechx/speechx/frontend/audio/feature_common.h
@ -0,0 +1,54 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "frontend_itf.h"
 #include "kaldi/feat/feature-window.h"
 namespace ppspeech {
 // Streaming feature stage built around a per-frame computer F.
 //
 // The stage pulls waveform samples from an upstream FrontendInterface,
 // frames them, and lets F turn each windowed frame into one feature vector.
 // As used by this template, F has to provide:
 //   - a nested type F::Options with a frame_opts member,
 //   - a constructor taking const Options&,
 //   - Dim(), GetFrameOptions(), NeedRawLogEnergy(), and
 //   - Compute(window, feat) for a single frame.
 template <class F>
 class StreamingFeatureTpl : public FrontendInterface {
   public:
     using Options = typename F::Options;
     // Takes ownership of base_extractor, the upstream sample source.
     StreamingFeatureTpl(const Options& opts,
                         std::unique_ptr<FrontendInterface> base_extractor);
     // Passes the samples on to the upstream extractor.
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
     // Reads upstream samples and writes the resulting frame features,
     // concatenated frame after frame, into feats.
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
     // Dimension of a single frame's feature vector, as reported by F.
     virtual size_t Dim() const { return computer_.Dim(); }
     // Finished state is delegated to the upstream extractor.
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
     // Resets the upstream extractor and drops the samples carried over
     // between Read() calls.
     virtual void Reset() {
         base_extractor_->Reset();
         remained_wav_.Resize(0);
     }
   private:
     // Frames waves and fills feats with one feature vector per frame.
     bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                  kaldi::Vector<kaldi::BaseFloat>* feats);
     Options opts_;
     std::unique_ptr<FrontendInterface> base_extractor_;
     kaldi::FeatureWindowFunction window_function_;
     // Samples left over after the last complete frame shift; they are
     // prepended to the next chunk in Read().
     kaldi::Vector<kaldi::BaseFloat> remained_wav_;
     F computer_;
 };
 }  // namespace ppspeech
 #include "frontend/audio/feature_common_inl.h"
--- a/speechx/speechx/frontend/audio/feature_common_inl.h
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@ -0,0 +1,95 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 namespace ppspeech {
 // Builds the stage from the computer options and takes ownership of the
 // upstream extractor.
 //
 // Members are listed in the order the class declares them (opts_,
 // base_extractor_, window_function_, computer_), which is the order the
 // compiler initialises them in regardless of how the list is written.
 // Keeping the two in sync avoids -Wreorder warnings and a misleading list.
 template <class F>
 StreamingFeatureTpl<F>::StreamingFeatureTpl(
     const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
     : opts_(opts),
       base_extractor_(std::move(base_extractor)),
       window_function_(opts.frame_opts),
       computer_(opts) {}
 // Hands the incoming samples to the upstream extractor unchanged; this
 // stage does no work of its own here.
 template <class F>
 void StreamingFeatureTpl<F>::Accept(
     const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
     base_extractor_->Accept(waves);
 }
 // Reads one chunk from the upstream extractor, prepends the samples carried
 // over from the previous call, computes features for the combined signal,
 // and stores the unconsumed tail for the next call.
 //
 // Returns false when the upstream read fails or yields no samples, and true
 // otherwise.
 template <class F>
 bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
     kaldi::Vector<kaldi::BaseFloat> chunk(base_extractor_->Dim());
     if (!base_extractor_->Read(&chunk) || chunk.Dim() == 0) {
         return false;
     }
     // Lay out [carried-over samples | new chunk] in one contiguous buffer.
     const int32 chunk_len = chunk.Dim();
     const int32 carried_len = remained_wav_.Dim();
     kaldi::Vector<kaldi::BaseFloat> waves(carried_len + chunk_len);
     waves.Range(0, carried_len).CopyFromVec(remained_wav_);
     waves.Range(carried_len, chunk_len).CopyFromVec(chunk);
     // Turn the combined signal into frame features.
     Compute(waves, feats);
     // Everything past num_frames * frame_shift has not been consumed yet;
     // keep it so the next call continues from there.
     const kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     const int32 consumed =
         frame_opts.WindowShift() * kaldi::NumFrames(waves.Dim(), frame_opts);
     const int32 leftover = waves.Dim() - consumed;
     remained_wav_.Resize(leftover);
     remained_wav_.CopyFromVec(waves.Range(consumed, leftover));
     return true;
 }
 // Frames `waves` and computes one feature vector per frame.
 //
 // On return `feats` holds num_frames * Dim() values, frame after frame.
 // When `waves` is shorter than one analysis window no frame can be formed;
 // `feats` is then resized to 0 so that a reused output vector never carries
 // stale features from an earlier call. Always returns true.
 template <class F>
 bool StreamingFeatureTpl<F>::Compute(
     const kaldi::Vector<kaldi::BaseFloat>& waves,
     kaldi::Vector<kaldi::BaseFloat>* feats) {
     const kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     const int32 num_samples = waves.Dim();
     if (num_samples < frame_opts.WindowSize()) {
         // Not enough samples for a single frame: report "no features".
         feats->Resize(0);
         return true;
     }
     const int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
     const int32 feat_dim = computer_.Dim();
     feats->Resize(num_frames * feat_dim);
     const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
     // Scratch buffers reused across frames to avoid per-frame allocation.
     kaldi::Vector<kaldi::BaseFloat> window;
     kaldi::Vector<kaldi::BaseFloat> this_feature(feat_dim, kaldi::kUndefined);
     for (int32 frame = 0; frame < num_frames; frame++) {
         // NOTE(review): raw_log_energy is filled in when requested but is
         // not passed on to the computer; confirm whether energy output is
         // meant to be supported.
         kaldi::BaseFloat raw_log_energy = 0.0;
         kaldi::ExtractWindow(0,
                              waves,
                              frame,
                              frame_opts,
                              window_function_,
                              &window,
                              need_raw_log_energy ? &raw_log_energy : nullptr);
         computer_.Compute(&window, &this_feature);
         // Copy this frame's feature into its slot of the flat output.
         kaldi::SubVector<kaldi::BaseFloat> output_row(
             feats->Data() + frame * feat_dim, feat_dim);
         output_row.CopyFromVec(this_feature);
     }
     return true;
 }
 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
    bool to_float32;  // true, only for linear feature
    bool use_fbank;
    LinearSpectrogramOptions linear_spectrogram_opts;
-    FbankOptions fbank_opts;
+    kaldi::FbankOptions fbank_opts;
    FeatureCacheOptions feature_cache_opts;
    AssemblerOptions assembler_opts;
--- a/speechx/speechx/frontend/audio/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc
@ -28,81 +28,32 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
-LinearSpectrogram::LinearSpectrogram(
+LinearSpectrogramComputer::LinearSpectrogramComputer(
-    const LinearSpectrogramOptions& opts,
+    const Options& opts)
-    std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts) {
-    : opts_(opts), feature_window_funtion_(opts.frame_opts) {
+    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
    base_extractor_ = std::move(base_extractor);
    int32 window_size = opts.frame_opts.WindowSize();
-    int32 window_shift = opts.frame_opts.WindowShift();
+    frame_length_ = window_size;
    dim_ = window_size / 2 + 1;
-    chunk_sample_size_ =
+    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
-        static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
+                                          feature_window_function.window);
-    hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
+    int32 sample_rate = opts.frame_opts.samp_freq;
-                                           feature_window_funtion_.window);
+    scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }
 void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
 }
 bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
    Vector<BaseFloat> input_feats(chunk_sample_size_);
    bool flag = base_extractor_->Read(&input_feats);
    if (flag == false || input_feats.Dim() == 0) return false;
    int32 feat_len = input_feats.Dim();
    int32 left_len = remained_wav_.Dim();
    Vector<BaseFloat> waves(feat_len + left_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, feat_len).CopyFromVec(input_feats);
    Compute(waves, feats);
    int32 frame_shift = opts_.frame_opts.WindowShift();
    int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
    int32 left_samples = waves.Dim() - frame_shift * num_frames;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
    return true;
 }
 // Compute spectrogram feat
-bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
+bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
-                                Vector<BaseFloat>* feats) {
+                                Vector<BaseFloat>* feat) {
-    int32 num_samples = waves.Dim();
+    window->Resize(frame_length_, kaldi::kCopyData);
-    int32 frame_length = opts_.frame_opts.WindowSize();
+    RealFft(window, true);
-    int32 sample_rate = opts_.frame_opts.samp_freq;
+    kaldi::ComputePowerSpectrum(window);
-    BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
+    SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
-
+    power_spectrum.Scale(scale_);
-    if (num_samples < frame_length) {
+    power_spectrum(0) = power_spectrum(0) / 2;
-        return true;
+    power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
-    }
+    power_spectrum.Add(1e-14);
-
+    power_spectrum.ApplyLog();
-    int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
+    feat->CopyFromVec(power_spectrum);
    feats->Resize(num_frames * dim_);
    Vector<BaseFloat> window;
    for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
        kaldi::ExtractWindow(0,
                             waves,
                             frame_idx,
                             opts_.frame_opts,
                             feature_window_funtion_,
                             &window,
                             NULL);
        SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
        window.Resize(frame_length, kaldi::kCopyData);
        RealFft(&window, true);
        kaldi::ComputePowerSpectrum(&window);
        SubVector<BaseFloat> power_spectrum(window, 0, dim_);
        power_spectrum.Scale(scale);
        power_spectrum(0) = power_spectrum(0) / 2;
        power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
        power_spectrum.Add(1e-14);
        power_spectrum.ApplyLog();
        output_row.CopyFromVec(power_spectrum);
    }
    return true;
 }
--- a/speechx/speechx/frontend/audio/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@ -16,6 +16,7 @@
 #pragma once
 #include "base/common.h"
 #include "frontend/audio/feature_common.h"
 #include "frontend/audio/frontend_itf.h"
 #include "kaldi/feat/feature-window.h"
@ -23,47 +24,34 @@ namespace ppspeech {
 struct LinearSpectrogramOptions {
    kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;  // second
+    LinearSpectrogramOptions() : frame_opts() {}
    LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("streaming-chunk",
                       &streaming_chunk,
                       "streaming chunk size, default: 0.1 sec");
        frame_opts.Register(opts);
    }
 };
-class LinearSpectrogram : public FrontendInterface {
+class LinearSpectrogramComputer {
  public:
-    explicit LinearSpectrogram(
+    typedef LinearSpectrogramOptions Options;
-        const LinearSpectrogramOptions& opts,
+    explicit LinearSpectrogramComputer(const Options& opts);
-        std::unique_ptr<FrontendInterface> base_extractor);
+
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+    kaldi::FrameExtractionOptions& GetFrameOptions() {
-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+        return opts_.frame_opts;
    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return dim_; }
    virtual void SetFinished() { base_extractor_->SetFinished(); }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }
-  private:
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+                 kaldi::Vector<kaldi::BaseFloat>* feat);
                 kaldi::Vector<kaldi::BaseFloat>* feats);
-    size_t dim_;
+    int32 Dim() const { return dim_; }
-    kaldi::FeatureWindowFunction feature_window_funtion_;
+
-    kaldi::BaseFloat hanning_window_energy_;
+    bool NeedRawLogEnergy() { return false; }
-    LinearSpectrogramOptions opts_;
+
-    std::unique_ptr<FrontendInterface> base_extractor_;
+  private:
-    kaldi::Vector<kaldi::BaseFloat> remained_wav_;
+    kaldi::BaseFloat scale_;
-    int chunk_sample_size_;
+    Options opts_;
-    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+    int32 frame_length_;
    int32 dim_;
    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
 };
 typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
 }  // namespace ppspeech