From 7e334ce890a512f067af9a0918632a1c3c45001e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 19 Oct 2022 12:43:47 +0000 Subject: [PATCH] fix assembler buf, which not clear cache, and fill zero default --- .../u2pp_ol/wenetspeech/local/recognizer.sh | 2 +- speechx/speechx/frontend/audio/assembler.cc | 50 ++++++++++++++----- speechx/speechx/frontend/audio/assembler.h | 31 ++++++------ speechx/speechx/frontend/audio/audio_cache.cc | 4 ++ speechx/speechx/frontend/audio/audio_cache.h | 4 +- .../speechx/frontend/audio/feature_cache.cc | 3 ++ .../speechx/frontend/audio/feature_cache.h | 10 ++-- speechx/speechx/nnet/u2_nnet.cc | 1 + .../speechx/recognizer/u2_recognizer_main.cc | 6 ++- 9 files changed, 77 insertions(+), 34 deletions(-) diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh index bf4635458..f71a8003e 100755 --- a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh +++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh @@ -5,7 +5,7 @@ set -e data=data exp=exp -nj=20 +nj=40 mkdir -p $exp diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 37eeec80f..ff1b1f28f 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -23,9 +23,11 @@ using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, unique_ptr base_extractor) { + fill_zero_ = opts.fill_zero; frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk; frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length; + cache_size_ = frame_chunk_size_ - frame_chunk_stride_; receptive_filed_length_ = opts.receptive_filed_length; base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -38,14 +40,13 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { - 
feats->Resize(dim_ * frame_chunk_size_); bool result = Compute(feats); return result; } -// read all data from base_feature_extractor_ into cache_ +// read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { - // compute and feed + // compute and feed frame by frame bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; @@ -54,33 +55,58 @@ bool Assembler::Compute(Vector* feats) { if (IsFinished() == false) return false; break; } + + CHECK(feature.Dim() == dim_); + nframes_ += 1; + VLOG(1) << "nframes: " << nframes_; + feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { + VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_; return false; } - while (feature_cache_.size() < frame_chunk_size_) { - Vector feature(dim_, kaldi::kSetZero); - feature_cache_.push(feature); + + if (fill_zero_){ + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature(dim_, kaldi::kSetZero); + nframes_ += 1; + feature_cache_.push(feature); + } } + int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + feats->Resize(dim_ * this_chunk_size); + int32 counter = 0; - int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; - int32 elem_dim = base_extractor_->Dim(); - while (counter < frame_chunk_size_) { + while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); - int32 start = counter * elem_dim; - feats->Range(start, elem_dim).CopyFromVec(val); - if (frame_chunk_size_ - counter <= cache_size) { + CHECK(val.Dim() == dim_) << val.Dim(); + + int32 start = counter * dim_; + feats->Range(start, dim_).CopyFromVec(val); + + if (this_chunk_size - counter <= cache_size_) { feature_cache_.push(val); } + + // val is reference, so we should pop here feature_cache_.pop(); + counter++; } return result; } + + void Assembler::Reset() { + std::queue> empty; + 
std::swap(feature_cache_, empty); + nframes_ = 0; + base_extractor_->Reset(); +} + } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 258e61f2b..4f165ea80 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -22,14 +22,10 @@ namespace ppspeech { struct AssemblerOptions { // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py // the nnet batch forward - int32 receptive_filed_length; - int32 subsampling_rate; - int32 nnet_decoder_chunk; - - AssemblerOptions() - : receptive_filed_length(1), - subsampling_rate(1), - nnet_decoder_chunk(1) {} + int32 receptive_filed_length{1}; + int32 subsampling_rate{1}; + int32 nnet_decoder_chunk{1}; + bool fill_zero{false}; // whether to pad with zero frames when the last chunk is shorter than frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -39,29 +35,34 @@ class Assembler : public FrontendInterface { std::unique_ptr base_extractor = NULL); // Feed feats or waves - virtual void Accept(const kaldi::VectorBase& inputs); + void Accept(const kaldi::VectorBase& inputs) override; // feats size = num_frames * feat_dim - virtual bool Read(kaldi::Vector* feats); + bool Read(kaldi::Vector* feats) override; // feat dim - virtual size_t Dim() const { return dim_; } + size_t Dim() const override { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } + void SetFinished() override { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool IsFinished() const override { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } + void Reset() override; private: bool Compute(kaldi::Vector* feats); - int32 dim_; + bool fill_zero_{false}; + + int32 dim_; // feat dim int32 frame_chunk_size_; // window int32 frame_chunk_stride_; // stride + int32 
cache_size_; // window - stride int32 receptive_filed_length_; std::queue> feature_cache_; std::unique_ptr base_extractor_; + + int32 nframes_; // number of frames computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index b7a15acd7..71e5d09eb 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -83,6 +83,10 @@ bool AudioCache::Read(Vector* waves) { } size_ -= chunk_size; offset_ = (offset_ + chunk_size) % ring_buffer_.size(); + + nsamples_ += chunk_size; + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index fc07d4bab..da422daa5 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -41,10 +41,11 @@ class AudioCache : public FrontendInterface { virtual bool IsFinished() const { return finished_; } - virtual void Reset() { + void Reset() override { offset_ = 0; size_ = 0; finished_ = false; + nsamples_ = 0; } private: @@ -61,6 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram + int32 nsamples_; // number of samples read. 
DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 509a98c3b..c712e48e6 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -73,6 +73,9 @@ bool FeatureCache::Compute() { if (result == false || feature.Dim() == 0) return false; int32 num_chunk = feature.Dim() / dim_; + nframe_ += num_chunk; + VLOG(1) << "nframe computed: " << nframe_; + for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; Vector feature_chunk(dim_); diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b922de12c..09d7f7ebf 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -51,11 +51,12 @@ class FeatureCache : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { + void Reset() override { + std::queue> empty; + std::swap(cache_, empty); + nframe_ = 0; base_extractor_->Reset(); - while (!cache_.empty()) { - cache_.pop(); - } + VLOG(1) << "feature cache reset: cache size: " << cache_.size(); } private: @@ -74,6 +75,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; + int32 nframe_; // number of feature frames computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index baae2ce8f..63a8a793a 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -153,6 +153,7 @@ void U2Nnet::Reset() { std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); encoder_outs_.clear(); + VLOG(1) << "u2nnet reset"; } // Debug API diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc 
b/speechx/speechx/recognizer/u2_recognizer_main.cc index 38bd5ccca..2375586ea 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -82,9 +82,13 @@ int main(int argc, char* argv[]) { // no overlap sample_offset += cur_chunk_size; } + CHECK(sample_offset == tot_samples); + + // recognizer.SetFinished(); + // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset();