Merge pull request #1993 from SmileGoat/refactor_file_struct

[speechx]add assembler in frontend,test=doc
pull/1996/head
Hui Zhang 2 years ago committed by GitHub
commit 174cf0765f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
frame_opts.preemph_coeff = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts;
}
opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
opts.assembler_opts.frame_chunk_size = FLAGS_receptive_field_length;
opts.assembler_opts.frame_chunk_stride = FLAGS_downsampling_rate;
return opts;
}

@ -8,6 +8,7 @@ add_library(frontend STATIC
feature_cache.cc
feature_pipeline.cc
fbank.cc
assembler.cc
)
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)

@ -0,0 +1,72 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/assembler.h"
namespace ppspeech {
using kaldi::Vector;
using kaldi::VectorBase;
using kaldi::BaseFloat;
using std::unique_ptr;
// Builds an assembler on top of `base_extractor` and takes ownership of it.
// The chunk geometry (window size and stride, both counted in frames) is
// copied from `opts`; the per-frame dimension is queried from the wrapped
// extractor. `base_extractor` is dereferenced unconditionally, so it must
// not be null.
Assembler::Assembler(AssemblerOptions opts,
                     unique_ptr<FrontendInterface> base_extractor) {
    // Take ownership first: Dim() is read through the stored pointer.
    base_extractor_ = std::move(base_extractor);
    dim_ = base_extractor_->Dim();

    frame_chunk_size_ = opts.frame_chunk_size;
    frame_chunk_stride_ = opts.frame_chunk_stride;
}
// Hands `inputs` straight to the wrapped extractor. Nothing is buffered
// here; chunk assembly happens on the Read() side.
void Assembler::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    base_extractor_->Accept(inputs);
}
// Pops the next assembled feature chunk into `feats`.
// `feats` is resized to dim_ * frame_chunk_size_ elements before Compute()
// runs, so it keeps that length even when false is returned.
// Returns whatever Compute() returns.
bool Assembler::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    feats->Resize(dim_ * frame_chunk_size_);
    return Compute(feats);
}
// Assembles one chunk of frame_chunk_size_ frames into `feats`.
//
// Step 1: frames are pulled one at a time from the wrapped extractor until
// the queue holds a full window. If a read fails or yields an empty frame,
// false is returned right away; frames queued so far stay in the queue.
//
// Step 2: the window is copied frame by frame into `feats` (frame i lands at
// offset i * frame dim). Every frame is popped, and frames at index
// >= frame_chunk_stride_ are pushed back so that they open the next window.
// If the stride is larger than the window, nothing is carried over and no
// input frames are skipped.
//
// `feats` must already hold frame_chunk_size_ * Dim() elements.
// On the success path the return value is the status of the last read in
// step 1; it stays false if step 1 had nothing to read.
bool Assembler::Compute(Vector<BaseFloat>* feats) {
    bool read_ok = false;

    // Same unsigned comparison the implicit conversion would perform.
    const size_t window = static_cast<size_t>(frame_chunk_size_);
    while (feature_cache_.size() < window) {
        Vector<BaseFloat> frame;
        read_ok = base_extractor_->Read(&frame);
        if (!read_ok || frame.Dim() == 0) {
            return false;
        }
        feature_cache_.push(frame);
    }

    const int32 frame_dim = base_extractor_->Dim();
    for (int32 idx = 0; idx < frame_chunk_size_; ++idx) {
        Vector<BaseFloat>& oldest = feature_cache_.front();
        feats->Range(idx * frame_dim, frame_dim).CopyFromVec(oldest);

        // Frames past the stride overlap with the next window: keep them.
        if (idx >= frame_chunk_stride_) {
            feature_cache_.push(oldest);
        }
        feature_cache_.pop();
    }

    return read_ok;
}
} // namespace ppspeech

@ -0,0 +1,67 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
namespace ppspeech {
// Chunking parameters for Assembler; both values are counted in frames.
struct AssemblerOptions {
    // Number of frames in each emitted chunk (the window).
    int32 frame_chunk_size = 1;
    // Sliding-window stride between consecutive chunks.
    int32 frame_chunk_stride = 1;

    // Defaults come from the member initializers above: one frame per
    // chunk, advancing one frame at a time.
    AssemblerOptions() {}
};
class Assembler : public FrontendInterface {
public:
explicit Assembler(
AssemblerOptions opts,
std::unique_ptr<FrontendInterface> base_extractor = NULL);
// Feed feats or waves
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
// feats size = num_frames * feat_dim
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// feat dim
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() {
base_extractor_->SetFinished();
}
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
}
private:
bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);
int32 dim_;
int32 frame_chunk_size_; // window
int32 frame_chunk_stride_; // stride
std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
std::unique_ptr<FrontendInterface> base_extractor_;
DISALLOW_COPY_AND_ASSIGN(Assembler);
};
} // namespace ppspeech

@ -64,10 +64,6 @@ int main(int argc, char* argv[]) {
ppspeech::FeatureCacheOptions feat_cache_opts;
// the feature cache output feature chunk by chunk.
// frame_chunk_size : num frame of a chunk.
// frame_chunk_stride: chunk sliding window stride.
feat_cache_opts.frame_chunk_stride = 1;
feat_cache_opts.frame_chunk_size = 1;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "fbank: " << true;
LOG(INFO) << "feat dim: " << feature_cache.Dim();

@ -66,10 +66,6 @@ int main(int argc, char* argv[]) {
ppspeech::FeatureCacheOptions feat_cache_opts;
// the feature cache output feature chunk by chunk.
// frame_chunk_size : num frame of a chunk.
// frame_chunk_stride: chunk sliding window stride.
feat_cache_opts.frame_chunk_stride = 1;
feat_cache_opts.frame_chunk_size = 1;
ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
LOG(INFO) << "feat dim: " << feature_cache.Dim();

@ -26,8 +26,6 @@ using std::unique_ptr;
FeatureCache::FeatureCache(FeatureCacheOptions opts,
unique_ptr<FrontendInterface> base_extractor) {
max_size_ = opts.max_size;
frame_chunk_stride_ = opts.frame_chunk_stride;
frame_chunk_size_ = opts.frame_chunk_size;
timeout_ = opts.timeout; // ms
base_extractor_ = std::move(base_extractor);
dim_ = base_extractor_->Dim();
@ -74,24 +72,11 @@ bool FeatureCache::Compute() {
bool result = base_extractor_->Read(&feature);
if (result == false || feature.Dim() == 0) return false;
// join with remained
int32 joint_len = feature.Dim() + remained_feature_.Dim();
Vector<BaseFloat> joint_feature(joint_len);
joint_feature.Range(0, remained_feature_.Dim())
.CopyFromVec(remained_feature_);
joint_feature.Range(remained_feature_.Dim(), feature.Dim())
.CopyFromVec(feature);
// one by one, or stride with window
// controlled by frame_chunk_stride_ and frame_chunk_size_
int32 num_chunk =
((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
int32 num_chunk = feature.Dim() / dim_ ;
for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
int32 start = chunk_idx * frame_chunk_stride_ * dim_;
Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
SubVector<BaseFloat> tmp(joint_feature.Data() + start,
frame_chunk_size_ * dim_);
int32 start = chunk_idx * dim_;
Vector<BaseFloat> feature_chunk(dim_);
SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
feature_chunk.CopyFromVec(tmp);
std::unique_lock<std::mutex> lock(mutex_);
@ -104,13 +89,6 @@ bool FeatureCache::Compute() {
cache_.push(feature_chunk);
ready_read_condition_.notify_one();
}
// cache remained feats
int32 remained_feature_len =
joint_len - num_chunk * frame_chunk_stride_ * dim_;
remained_feature_.Resize(remained_feature_len);
remained_feature_.CopyFromVec(joint_feature.Range(
frame_chunk_stride_ * num_chunk * dim_, remained_feature_len));
return result;
}

@ -21,13 +21,9 @@ namespace ppspeech {
struct FeatureCacheOptions {
int32 max_size;
int32 frame_chunk_size;
int32 frame_chunk_stride;
int32 timeout; // ms
FeatureCacheOptions()
: max_size(kint16max),
frame_chunk_size(1),
frame_chunk_stride(1),
timeout(1) {}
};
@ -80,7 +76,7 @@ class FeatureCache : public FrontendInterface {
std::condition_variable ready_feed_condition_;
std::condition_variable ready_read_condition_;
// DISALLOW_COPY_AND_ASSGIN(FeatureCache);
DISALLOW_COPY_AND_ASSIGN(FeatureCache);
};
} // namespace ppspeech

@ -35,8 +35,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr<FrontendInterface> cmvn(
new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
base_extractor_.reset(
unique_ptr<FrontendInterface> cache(
new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
base_extractor_.reset(
new ppspeech::Assembler(opts.assembler_opts, std::move(cache)));
}
} // ppspeech

@ -23,6 +23,7 @@
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/normalizer.h"
#include "frontend/audio/assembler.h"
namespace ppspeech {
@ -33,13 +34,16 @@ struct FeaturePipelineOptions {
LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts;
AssemblerOptions assembler_opts;
FeaturePipelineOptions()
: cmvn_file(""),
to_float32(false), // true, only for linear feature
use_fbank(true),
linear_spectrogram_opts(),
fbank_opts(),
feature_cache_opts() {}
feature_cache_opts(),
assembler_opts() {}
};
class FeaturePipeline : public FrontendInterface {

Loading…
Cancel
Save