diff --git a/speechx/build.sh b/speechx/build.sh index e0a386752..7655f9635 100755 --- a/speechx/build.sh +++ b/speechx/build.sh @@ -20,4 +20,4 @@ fi mkdir -p build cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR} -cmake --build build +cmake --build build -j diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index a0fe5b2ac..04530fb9d 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode( // forward frame by frame std::vector frame_prob; bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); - if (flag == false) break; + if (flag == false) { + LOG(INFO) << "decoder advance decode exit." << frame_prob.size(); + break; + } std::vector> likelihood; likelihood.push_back(frame_prob); AdvanceDecoding(likelihood); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; } } @@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - + VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++){ + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + } + // 2. 
token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); } +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + int cnt = 0; + for (int i = 0; i < hypotheses_.size(); i ++){ + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j ++){ + VLOG(2) << hypotheses_[i][j]; + } + } +} void CTCPrefixBeamSearch::UpdateFinalContext() { if (context_graph_ == nullptr) return; diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index ff1b1f28f..afee3a6a3 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -52,15 +52,21 @@ bool Assembler::Compute(Vector* feats) { Vector feature; result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - if (IsFinished() == false) return false; - break; + VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + if (IsFinished() == false) { + LOG(INFO) << "finished reading feature. 
cache size: " << feature_cache_.size(); + return false; + } else { + LOG(INFO) << "break"; + break; + } } CHECK(feature.Dim() == dim_); + feature_cache_.push(feature); + nframes_ += 1; VLOG(1) << "nframes: " << nframes_; - - feature_cache_.push(feature); } if (feature_cache_.size() < receptive_filed_length_) { @@ -68,8 +74,7 @@ bool Assembler::Compute(Vector* feats) { return false; } - - if (fill_zero_){ + if (fill_zero_) { while (feature_cache_.size() < frame_chunk_size_) { Vector feature(dim_, kaldi::kSetZero); nframes_ += 1; @@ -79,6 +84,7 @@ bool Assembler::Compute(Vector* feats) { int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); + VLOG(1) << "read " << this_chunk_size << " feat."; int32 counter = 0; while (counter < this_chunk_size) { @@ -97,6 +103,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } + CHECK(feature_cache_.size() == cache_size_ ); return result; } diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index 09d7f7ebf..b4ed58fff 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { + LOG(INFO) << "set finished"; // std::unique_lock lock(mutex_); base_extractor_->SetFinished(); - LOG(INFO) << "set finished"; + // read the last chunk data Compute(); // ready_feed_condition_.notify_one(); + LOG(INFO) << "compute last feats done."; } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 9bad8ed45..6956a2cb8 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix& likelihood) { frames_ready_ += likelihood.NumRows(); 
} -// Decodable::Init(DecodableConfig config) { -//} // return the size of frame have computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } @@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() { Vector features; if (frontend_ == NULL || frontend_->Read(&features) == false) { // no feat or frontend_ not init. + VLOG(1) << "decodable exit;"; return false; } - VLOG(2) << "Forward with " << features.Dim() << " frames."; + VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; // forward feats NnetOut out; @@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == false) { + LOG(INFO) << "framelikehood exit."; return false; } + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; + + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 63a8a793a..07e2dde2a 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, max_hyps_len = std::max(max_hyps_len, len); hyps_len_ptr[i] = static_cast(len); } + VLOG(2) << 
"max_hyps_len: " << max_hyps_len; paddle::Tensor hyps_tensor = paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); @@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score + << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index 75834aa5d..b4a1257b6 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -52,7 +52,6 @@ void U2Recognizer::Reset() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() { num_frames_ = 0; result_.clear(); - feature_pipeline_->Reset(); decodable_->Reset(); decoder_->Reset(); } @@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score; + << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; + + VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc 
b/speechx/speechx/recognizer/u2_recognizer_main.cc index 2375586ea..bfb37fb8e 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -62,6 +62,7 @@ int main(int argc, char* argv[]) { LOG(INFO) << "wav len (sample): " << tot_samples; int sample_offset = 0; + int cnt = 0; while (sample_offset < tot_samples) { int cur_chunk_size = std::min(chunk_sample_size, tot_samples - sample_offset); @@ -77,12 +78,14 @@ recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult(); + LOG(INFO) << "Partial result: " << cnt << " " << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; + cnt++; } CHECK(sample_offset == tot_samples); + VLOG(1) << "num decode: " << cnt; // recognizer.SetFinished();