Merge pull request #1993 from SmileGoat/refactor_file_struct

[speechx]add assembler in frontend,test=doc
2 years ago · 174cf0765f
parent 0c6dddb101 952b1a1451
commit 174cf0765f
10 changed files with 160 additions and 47 deletions
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
        frame_opts.preemph_coeff = 0.0;
        opts.linear_spectrogram_opts.frame_opts = frame_opts;
    }
-    opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
-    opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
+    opts.assembler_opts.frame_chunk_size = FLAGS_receptive_field_length;
+    opts.assembler_opts.frame_chunk_stride = FLAGS_downsampling_rate;
    return opts;
 }

--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@ -8,6 +8,7 @@ add_library(frontend STATIC
  feature_cache.cc
  feature_pipeline.cc
  fbank.cc
+  assembler.cc
 )
 target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)

--- a/speechx/speechx/frontend/audio/assembler.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/audio/assembler.h"
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::unique_ptr;
+
+// Builds an assembler on top of `base_extractor` and takes ownership of it.
+// The chunk length and stride are copied from `opts`; the per-frame
+// dimension is queried once from the upstream extractor.
+Assembler::Assembler(AssemblerOptions opts,
+                     unique_ptr<FrontendInterface> base_extractor)
+    : frame_chunk_size_(opts.frame_chunk_size),
+      frame_chunk_stride_(opts.frame_chunk_stride) {
+    // The extractor has to be stored before it can be asked for its Dim().
+    base_extractor_ = std::move(base_extractor);
+    dim_ = base_extractor_->Dim();
+}
+
+// Hands `inputs` straight to the upstream extractor; this stage does no
+// buffering of its own on the input side.
+void Assembler::Accept(const VectorBase<BaseFloat>& inputs) {
+    base_extractor_->Accept(inputs);
+}
+
+// Produces one feature chunk.
+// `feats` is resized to dim_ * frame_chunk_size_ values and then filled by
+// Compute(); the return value is whatever Compute() reports.
+bool Assembler::Read(Vector<BaseFloat>* feats) {
+    feats->Resize(dim_ * frame_chunk_size_);
+    return Compute(feats);
+}
+
+// Assembles one chunk of frame_chunk_size_ frames into `feats`.
+//
+// Frames are read one at a time from the upstream extractor until
+// feature_cache_ holds frame_chunk_size_ of them. They are then copied
+// back-to-back into `feats`, which the caller must already have sized to
+// hold frame_chunk_size_ frames. Frames at position >= frame_chunk_stride_
+// within the chunk are pushed back onto the queue so that the next chunk
+// starts with them (overlapping windows).
+//
+// Returns false, leaving already-read frames in the queue, as soon as the
+// upstream read fails or yields an empty frame. Otherwise returns the
+// result of the last upstream read.
+// NOTE(review): if no upstream read is needed (e.g. a stride of 0 keeps the
+// whole chunk cached) the function fills `feats` but still returns false —
+// confirm that a stride >= 1 is always configured.
+bool Assembler::Compute(Vector<BaseFloat>* feats) {
+    bool read_ok = false;
+
+    // Top up the queue until a full window is available.
+    while (feature_cache_.size() < frame_chunk_size_) {
+        Vector<BaseFloat> frame;
+        read_ok = base_extractor_->Read(&frame);
+        if (!read_ok || frame.Dim() == 0) {
+            return false;
+        }
+        feature_cache_.push(frame);
+    }
+
+    const int32 frame_dim = base_extractor_->Dim();
+    for (int32 idx = 0; idx < frame_chunk_size_; ++idx) {
+        Vector<BaseFloat>& head = feature_cache_.front();
+        feats->Range(idx * frame_dim, frame_dim).CopyFromVec(head);
+        // Frames past the stride belong to the next window as well, so they
+        // are re-queued at the back before the front copy is dropped.
+        if (idx >= frame_chunk_stride_) {
+            feature_cache_.push(head);
+        }
+        feature_cache_.pop();
+    }
+
+    return read_ok;
+}
+
+}  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/assembler.h
+++ b/speechx/speechx/frontend/audio/assembler.h
@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/audio/frontend_itf.h"
+
+namespace ppspeech {
+
+// Chunking parameters consumed by Assembler. Both values default to 1.
+struct AssemblerOptions {
+    // number of frames in one output chunk
+    int32 frame_chunk_size{1};
+    // number of frames the chunk window advances between outputs
+    int32 frame_chunk_stride{1};
+
+    AssemblerOptions() = default;
+};
+
+// Frontend stage that reads frames from an upstream extractor and groups
+// them into chunks of frame_chunk_size frames, advancing the chunk window by
+// frame_chunk_stride frames between outputs.
+class Assembler : public FrontendInterface {
+  public:
+    // Takes ownership of `base_extractor`. A non-null extractor is required:
+    // the constructor queries its Dim() and every other method forwards to
+    // it.
+    explicit Assembler(
+        AssemblerOptions opts,
+        std::unique_ptr<FrontendInterface> base_extractor = nullptr);
+
+    // Feed feats or waves; forwarded to the upstream extractor.
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+
+    // Reads one chunk. feats size = frame_chunk_size * feat_dim.
+    // Returns false when a full chunk could not be assembled.
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // Dimension of a single frame, as reported by the upstream extractor at
+    // construction time.
+    virtual size_t Dim() const { return dim_; }
+
+    virtual void SetFinished() {
+        base_extractor_->SetFinished();
+    }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    // Resets this stage and the upstream extractor. The frames retained for
+    // chunk overlap are discarded as well, so that the first chunk read
+    // after a reset does not contain frames from before it.
+    virtual void Reset() {
+        feature_cache_ = std::queue<kaldi::Vector<kaldi::BaseFloat>>();
+        base_extractor_->Reset();
+    }
+
+  private:
+    // Fills `feats` with one chunk; see the definition for details.
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    int32 dim_;                 // per-frame feature dimension
+    int32 frame_chunk_size_;    // window
+    int32 frame_chunk_stride_;  // stride
+    // Frames read from upstream but not yet fully consumed, including the
+    // ones carried over between consecutive chunks.
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    DISALLOW_COPY_AND_ASSIGN(Assembler);
+};
+
+}  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@ -64,10 +64,6 @@ int main(int argc, char* argv[]) {

    ppspeech::FeatureCacheOptions feat_cache_opts;
    // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
    LOG(INFO) << "fbank: " << true;
    LOG(INFO) << "feat dim: " << feature_cache.Dim();
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@ -66,10 +66,6 @@ int main(int argc, char* argv[]) {

    ppspeech::FeatureCacheOptions feat_cache_opts;
    // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
    LOG(INFO) << "feat dim: " << feature_cache.Dim();

--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@ -26,8 +26,6 @@ using std::unique_ptr;
 FeatureCache::FeatureCache(FeatureCacheOptions opts,
                           unique_ptr<FrontendInterface> base_extractor) {
    max_size_ = opts.max_size;
-    frame_chunk_stride_ = opts.frame_chunk_stride;
-    frame_chunk_size_ = opts.frame_chunk_size;
    timeout_ = opts.timeout;  // ms
    base_extractor_ = std::move(base_extractor);
    dim_ = base_extractor_->Dim();
@ -74,24 +72,11 @@ bool FeatureCache::Compute() {
    bool result = base_extractor_->Read(&feature);
    if (result == false || feature.Dim() == 0) return false;

-    // join with remained
-    int32 joint_len = feature.Dim() + remained_feature_.Dim();
-    Vector<BaseFloat> joint_feature(joint_len);
-    joint_feature.Range(0, remained_feature_.Dim())
-        .CopyFromVec(remained_feature_);
-    joint_feature.Range(remained_feature_.Dim(), feature.Dim())
-        .CopyFromVec(feature);
-
-    // one by one, or stride with window
-    // controlled by frame_chunk_stride_ and frame_chunk_size_
-    int32 num_chunk =
-        ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
+    int32 num_chunk = feature.Dim() / dim_ ;
    for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
-        int32 start = chunk_idx * frame_chunk_stride_ * dim_;
-
-        Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
-        SubVector<BaseFloat> tmp(joint_feature.Data() + start,
-                                 frame_chunk_size_ * dim_);
+        int32 start = chunk_idx *  dim_;
+        Vector<BaseFloat> feature_chunk(dim_);
+        SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
        feature_chunk.CopyFromVec(tmp);

        std::unique_lock<std::mutex> lock(mutex_);
@ -104,13 +89,6 @@ bool FeatureCache::Compute() {
        cache_.push(feature_chunk);
        ready_read_condition_.notify_one();
    }
-
-    // cache remained feats
-    int32 remained_feature_len =
-        joint_len - num_chunk * frame_chunk_stride_ * dim_;
-    remained_feature_.Resize(remained_feature_len);
-    remained_feature_.CopyFromVec(joint_feature.Range(
-        frame_chunk_stride_ * num_chunk * dim_, remained_feature_len));
    return result;
 }

--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@ -21,13 +21,9 @@ namespace ppspeech {

 struct FeatureCacheOptions {
    int32 max_size;
-    int32 frame_chunk_size;
-    int32 frame_chunk_stride;
    int32 timeout;  // ms
    FeatureCacheOptions()
        : max_size(kint16max),
-          frame_chunk_size(1),
-          frame_chunk_stride(1),
          timeout(1) {}
 };

@ -80,7 +76,7 @@ class FeatureCache : public FrontendInterface {
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;

-    // DISALLOW_COPY_AND_ASSGIN(FeatureCache);
+    DISALLOW_COPY_AND_ASSIGN(FeatureCache);
 };

 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@ -35,8 +35,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
    unique_ptr<FrontendInterface> cmvn(
        new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));

-    base_extractor_.reset(
+    unique_ptr<FrontendInterface> cache(
        new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
+
+    base_extractor_.reset(
+        new ppspeech::Assembler(opts.assembler_opts, std::move(cache)));
 }

 }  // ppspeech
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -23,6 +23,7 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
+#include "frontend/audio/assembler.h"

 namespace ppspeech {

@ -33,13 +34,16 @@ struct FeaturePipelineOptions {
    LinearSpectrogramOptions linear_spectrogram_opts;
    FbankOptions fbank_opts;
    FeatureCacheOptions feature_cache_opts;
+    AssemblerOptions assembler_opts;
+
    FeaturePipelineOptions()
        : cmvn_file(""),
          to_float32(false),  // true, only for linear feature
          use_fbank(true),
          linear_spectrogram_opts(),
          fbank_opts(),
-          feature_cache_opts() {}
+          feature_cache_opts(),
+          assembler_opts() {}
 };

 class FeaturePipeline : public FrontendInterface {