diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 03a7c1336..07e8e5608 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -69,20 +69,30 @@ void CTCPrefixBeamSearch::InitDecoder() { Reset(); } void CTCPrefixBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { + double search_cost = 0.0; + double feat_nnet_cost = 0.0; while (1) { // forward frame by frame + kaldi::Timer timer; std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); + feat_nnet_cost += timer.Elapsed(); if (flag == false) { - VLOG(1) << "decoder advance decode exit." << frame_prob.size(); + VLOG(3) << "decoder advance decode exit." << frame_prob.size(); break; } + timer.Reset(); std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); + search_cost += timer.Elapsed(); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; } + VLOG(1) << "AdvanceDecode feat + forward cost: " << feat_nnet_cost + << " sec."; + VLOG(1) << "AdvanceDecode search cost: " << search_cost << " sec."; } static bool PrefixScoreCompare( diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 56dfc3aaf..9d5fc4036 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -40,7 +40,9 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { + kaldi::Timer timer; bool result = Compute(feats); + VLOG(1) << "Assembler::Read cost: " << timer.Elapsed() << " sec."; return result; } @@ -51,14 +53,14 @@ bool Assembler::Compute(Vector* feats) { Vector feature; bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result + VLOG(3) << "result: " << result << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - VLOG(1) << "finished reading feature. cache size: " + VLOG(3) << "finished reading feature. cache size: " << feature_cache_.size(); return false; } else { - VLOG(1) << "break"; + VLOG(3) << "break"; break; } } @@ -67,11 +69,11 @@ bool Assembler::Compute(Vector* feats) { feature_cache_.push(feature); nframes_ += 1; - VLOG(1) << "nframes: " << nframes_; + VLOG(3) << "nframes: " << nframes_; } if (feature_cache_.size() < receptive_filed_length_) { - VLOG(1) << "feature_cache less than receptive_filed_lenght. " + VLOG(3) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_; return false; } @@ -87,7 +89,7 @@ bool Assembler::Compute(Vector* feats) { int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); - VLOG(1) << "read " << this_chunk_size << " feat."; + VLOG(3) << "read " << this_chunk_size << " feat."; int32 counter = 0; while (counter < this_chunk_size) { diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index 61ef8841d..c6a91f4b3 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -38,6 +38,7 @@ BaseFloat AudioCache::Convert2PCM32(BaseFloat val) { } void AudioCache::Accept(const VectorBase& waves) { + kaldi::Timer timer; std::unique_lock lock(mutex_); while (size_ + waves.Dim() > ring_buffer_.size()) { ready_feed_condition_.wait(lock); @@ -48,11 +49,13 @@ void AudioCache::Accept(const VectorBase& waves) { if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx)); } size_ += waves.Dim(); + VLOG(1) << "AudioCache::Accept cost: " << timer.Elapsed() << " sec. " + << waves.Dim() << " samples."; } bool AudioCache::Read(Vector* waves) { - size_t chunk_size = waves->Dim(); kaldi::Timer timer; + size_t chunk_size = waves->Dim(); std::unique_lock lock(mutex_); while (chunk_size > size_) { // when audio is empty and no more data feed @@ -86,9 +89,11 @@ bool AudioCache::Read(Vector* waves) { offset_ = (offset_ + chunk_size) % ring_buffer_.size(); nsamples_ += chunk_size; - VLOG(1) << "nsamples readed: " << nsamples_; + VLOG(3) << "nsamples readed: " << nsamples_; ready_feed_condition_.notify_one(); + VLOG(1) << "AudioCache::Read cost: " << timer.Elapsed() << " sec. " + << chunk_size << " samples."; return true; } diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 3d80e0011..a4d861d2d 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -50,8 +50,11 @@ bool CMVN::Read(kaldi::Vector* feats) { if (base_extractor_->Read(feats) == false || feats->Dim() == 0) { return false; } + // appply cmvn + kaldi::Timer timer; Compute(feats); + VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec."; return true; } diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h index 5f5cd51b4..f538df1dd 100644 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -27,27 +27,32 @@ namespace ppspeech { // pre-recorded audio/feature class DataCache : public FrontendInterface { public: - DataCache() { finished_ = false; } + DataCache() : finished_{false}, dim_{0} {} // accept waves/feats - virtual void Accept(const kaldi::VectorBase& inputs) { + void Accept(const kaldi::VectorBase& inputs) override { data_ = inputs; + SetDim(data_.Dim()); } - virtual bool Read(kaldi::Vector* feats) { + bool Read(kaldi::Vector* feats) override { if (data_.Dim() == 0) { return false; } (*feats) = data_; data_.Resize(0); + SetDim(data_.Dim()); return true; } - virtual void SetFinished() { finished_ = true; } - virtual bool IsFinished() const { return finished_; } - virtual size_t Dim() const { return dim_; } + void SetFinished() override { finished_ = true; } + bool IsFinished() const override { return finished_; } + size_t Dim() const override { return dim_; } void SetDim(int32 dim) { dim_ = dim; } - virtual void Reset() { finished_ = true; } + void Reset() override { + finished_ = true; + dim_ = 0; + } private: kaldi::Vector data_; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index 3f05eae62..5110d7046 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -34,6 +34,7 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts, void FeatureCache::Accept(const kaldi::VectorBase& inputs) { // read inputs base_extractor_->Accept(inputs); + // feed current data bool result = false; do { @@ -62,6 +63,7 @@ bool FeatureCache::Read(kaldi::Vector* feats) { feats->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); + VLOG(1) << "FeatureCache::Read cost: " << timer.Elapsed() << " sec."; return true; } @@ -72,9 +74,11 @@ bool FeatureCache::Compute() { bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) return false; + kaldi::Timer timer; + int32 num_chunk = feature.Dim() / dim_; nframe_ += num_chunk; - VLOG(1) << "nframe computed: " << nframe_; + VLOG(3) << "nframe computed: " << nframe_; for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; @@ -92,7 +96,10 @@ bool FeatureCache::Compute() { cache_.push(feature_chunk); ready_read_condition_.notify_one(); } - return result; + + VLOG(1) << "FeatureCache::Compute cost: " << timer.Elapsed() << " sec. " + << num_chunk << " feats."; + return true; } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index bd8692251..a4ebd6047 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -58,7 +58,7 @@ class FeatureCache : public FrontendInterface { std::swap(cache_, empty); nframe_ = 0; base_extractor_->Reset(); - VLOG(1) << "feature cache reset: cache size: " << cache_.size(); + VLOG(3) << "feature cache reset: cache size: " << cache_.size(); } private: diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h index b86f79918..dcf44ef61 100644 --- a/speechx/speechx/frontend/audio/feature_common_inl.h +++ b/speechx/speechx/frontend/audio/feature_common_inl.h @@ -34,6 +34,7 @@ bool StreamingFeatureTpl::Read(kaldi::Vector* feats) { bool flag = base_extractor_->Read(&wav); if (flag == false || wav.Dim() == 0) return false; + kaldi::Timer timer; // append remaned waves int32 wav_len = wav.Dim(); int32 left_len = remained_wav_.Dim(); @@ -52,6 +53,8 @@ bool StreamingFeatureTpl::Read(kaldi::Vector* feats) { remained_wav_.Resize(left_samples); remained_wav_.CopyFromVec( waves.Range(frame_shift * num_frames, left_samples)); + VLOG(1) << "StreamingFeatureTpl::Read cost: " << timer.Elapsed() + << " sec."; return true; } diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 11d60d3e2..7f6859082 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -68,9 +68,10 @@ bool Decodable::AdvanceChunk() { Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. - VLOG(1) << "decodable exit;"; + VLOG(3) << "decodable exit;"; return false; } + VLOG(1) << "AdvanceChunk feat cost: " << timer.Elapsed() << " sec."; VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; // forward feats @@ -88,7 +89,8 @@ bool Decodable::AdvanceChunk() { // update state, decoding frame. frame_offset_ = frames_ready_; frames_ready_ += nnet_out_cache_.NumRows(); - VLOG(2) << "Forward feat chunk cost: " << timer.Elapsed() << " sec."; + VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed() + << " sec."; return true; } @@ -115,7 +117,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == false) { - VLOG(1) << "framelikehood exit."; + VLOG(3) << "framelikehood exit."; return false; } @@ -168,7 +170,9 @@ void Decodable::Reset() { void Decodable::AttentionRescoring(const std::vector>& hyps, float reverse_weight, std::vector* rescoring_score) { + kaldi::Timer timer; nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); + VLOG(1) << "Attention Rescoring cost: " << timer.Elapsed() << " sec."; } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 19cb85fda..7707406a1 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -154,7 +154,7 @@ void U2Nnet::Reset() { std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); encoder_outs_.clear(); - VLOG(1) << "u2nnet reset"; + VLOG(3) << "u2nnet reset"; } // Debug API @@ -168,6 +168,7 @@ void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { void U2Nnet::FeedForward(const kaldi::Vector& features, const int32& feature_dim, NnetOut* out) { + kaldi::Timer timer; std::vector chunk_feats(features.Data(), features.Data() + features.Dim()); @@ -179,6 +180,8 @@ void U2Nnet::FeedForward(const kaldi::Vector& features, std::memcpy(out->logprobs.Data(), ctc_probs.data(), ctc_probs.size() * sizeof(kaldi::BaseFloat)); + VLOG(1) << "FeedForward cost: " << timer.Elapsed() << " sec. " + << chunk_feats.size() / feature_dim << " frames."; } @@ -415,7 +418,6 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, #ifdef USE_PROFILING RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); #endif - CHECK(rescoring_score != nullptr); int num_hyps = hyps.size(); @@ -627,7 +629,7 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score + VLOG(3) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; @@ -639,7 +641,7 @@ void U2Nnet::EncoderOuts( std::vector>* encoder_out) const { // list of (B=1,T,D) int size = encoder_outs_.size(); - VLOG(1) << "encoder_outs_ size: " << size; + VLOG(3) << "encoder_outs_ size: " << size; for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; @@ -649,7 +651,7 @@ void U2Nnet::EncoderOuts( const int& T = shape[1]; const int& D = shape[2]; CHECK(B == 1) << "Only support batch one."; - VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << "," + VLOG(3) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; const float* this_tensor_ptr = item.data(); diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 382f622f2..d1d308ebd 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -67,7 +67,10 @@ void U2Recognizer::ResetContinuousDecoding() { void U2Recognizer::Accept(const VectorBase& waves) { + kaldi::Timer timer; feature_pipeline_->Accept(waves); + VLOG(1) << "feed waves cost: " << timer.Elapsed() << " sec. " << waves.Dim() + << " samples."; } @@ -78,9 +81,7 @@ void U2Recognizer::Decode() { void U2Recognizer::Rescoring() { // Do attention Rescoring - kaldi::Timer timer; AttentionRescoring(); - VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec."; } void U2Recognizer::UpdateResult(bool finish) { @@ -181,15 +182,13 @@ void U2Recognizer::AttentionRescoring() { return; } - kaldi::Timer timer; std::vector rescoring_score; decodable_->AttentionRescoring( hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); - VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec."; // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { - VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + VLOG(3) << "hyp " << i << " rescoring_score: " << rescoring_score[i] << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; @@ -197,12 +196,12 @@ void U2Recognizer::AttentionRescoring() { opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; - VLOG(1) << "hyp: " << result_[0].sentence + VLOG(3) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); - VLOG(1) << "result: " << result_[0].sentence + VLOG(3) << "result: " << result_[0].sentence << " score: " << result_[0].score; } diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 5cb8dbb15..d7c584074 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -31,6 +31,7 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; double tot_wav_duration = 0.0; + double tot_decode_time = 0.0; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); @@ -47,9 +48,7 @@ int main(int argc, char* argv[]) { ppspeech::U2RecognizerResource::InitFromFlags(); ppspeech::U2Recognizer recognizer(resource); - kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { - kaldi::Timer local_timer; std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); LOG(INFO) << "utt: " << utt; @@ -65,6 +64,8 @@ int main(int argc, char* argv[]) { int sample_offset = 0; int cnt = 0; + kaldi::Timer timer; + kaldi::Timer local_timer; while (sample_offset < tot_samples) { int cur_chunk_size = @@ -95,6 +96,8 @@ int main(int argc, char* argv[]) { // second pass decoding recognizer.Rescoring(); + tot_decode_time += timer.Elapsed(); + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); @@ -115,10 +118,8 @@ int main(int argc, char* argv[]) { ++num_done; } - double elapsed = timer.Elapsed(); - LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); - LOG(INFO) << "total cost:" << elapsed << " sec"; LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; - LOG(INFO) << "RTF is: " << elapsed / tot_wav_duration; + LOG(INFO) << "total decode cost:" << tot_decode_time << " sec"; + LOG(INFO) << "RTF is: " << tot_decode_time / tot_wav_duration; }