|
|
|
@ -1,3 +1,5 @@
|
|
|
|
|
// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu)
|
|
|
|
|
// 2022 Binbin Zhang (binbzha@qq.com)
|
|
|
|
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
|
//
|
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
@ -13,11 +15,12 @@
|
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "decoder/ctc_prefix_beam_search_decoder.h"
|
|
|
|
|
#include "base/common.h"
|
|
|
|
|
#include "decoder/ctc_beam_search_opt.h"
|
|
|
|
|
#include "decoder/ctc_prefix_beam_search_score.h"
|
|
|
|
|
#include "decoder/ctc_prefix_beam_search_decoder.h"
|
|
|
|
|
#include "utils/math.h"
|
|
|
|
|
#include "absl/strings/str_join.h"
|
|
|
|
|
|
|
|
|
|
#ifdef USE_PROFILING
|
|
|
|
|
#include "paddle/fluid/platform/profiler.h"
|
|
|
|
@ -29,10 +32,10 @@ namespace ppspeech {
|
|
|
|
|
|
|
|
|
|
// Constructs a decoder from `opts`.
//
// The options are copied into opts_; the body then runs InitDecoder()
// followed by Reset(), in that order.
CTCPrefixBeamSearch::CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts)
    : opts_(opts) {
    InitDecoder();
    Reset();
}
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::InitDecoder() {
|
|
|
|
|
void CTCPrefixBeamSearch::Reset() {
|
|
|
|
|
num_frame_decoded_ = 0;
|
|
|
|
|
|
|
|
|
|
cur_hyps_.clear();
|
|
|
|
@ -43,8 +46,6 @@ void CTCPrefixBeamSearch::InitDecoder() {
|
|
|
|
|
times_.clear();
|
|
|
|
|
outputs_.clear();
|
|
|
|
|
|
|
|
|
|
abs_time_step_ = 0;
|
|
|
|
|
|
|
|
|
|
// empty hyp with Score
|
|
|
|
|
std::vector<int> empty;
|
|
|
|
|
PrefixScore prefix_score;
|
|
|
|
@ -58,56 +59,20 @@ void CTCPrefixBeamSearch::InitDecoder() {
|
|
|
|
|
hypotheses_.emplace_back(empty);
|
|
|
|
|
likelihood_.emplace_back(prefix_score.TotalScore());
|
|
|
|
|
times_.emplace_back(empty);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Resets the decoder by delegating to InitDecoder().
// NOTE(review): InitDecoder() must not call back into Reset(), otherwise the
// two functions recurse without bound -- confirm against its definition.
void CTCPrefixBeamSearch::Reset() { InitDecoder(); }
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::Decode(
|
|
|
|
|
std::shared_ptr<kaldi::DecodableInterface> decodable) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// Initializes the decoder by delegating to Reset().
// NOTE(review): Reset() must not call back into InitDecoder(), otherwise the
// two functions recurse without bound -- confirm against its definition.
void CTCPrefixBeamSearch::InitDecoder() {
    Reset();
}
|
|
|
|
|
|
|
|
|
|
// Returns num_frame_decoded_ + 1.
// NOTE(review): the +1 presumably converts a zero-based frame index into a
// count -- confirm, since the value is 1 while num_frame_decoded_ is still 0.
int32 CTCPrefixBeamSearch::NumFrameDecoded() {
    return num_frame_decoded_ + 1;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::UpdateOutputs(
|
|
|
|
|
const std::pair<std::vector<int>, PrefixScore>& prefix) {
|
|
|
|
|
const std::vector<int>& input = prefix.first;
|
|
|
|
|
// const std::vector<int>& start_boundaries = prefix.second.start_boundaries;
|
|
|
|
|
// const std::vector<int>& end_boundaries = prefix.second.end_boundaries;
|
|
|
|
|
|
|
|
|
|
std::vector<int> output;
|
|
|
|
|
int s = 0;
|
|
|
|
|
int e = 0;
|
|
|
|
|
for (int i = 0; i < input.size(); ++i) {
|
|
|
|
|
// if (s < start_boundaries.size() && i == start_boundaries[s]){
|
|
|
|
|
// // <context>
|
|
|
|
|
// output.emplace_back(context_graph_->start_tag_id());
|
|
|
|
|
// ++s;
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
output.emplace_back(input[i]);
|
|
|
|
|
|
|
|
|
|
// if (e < end_boundaries.size() && i == end_boundaries[e]){
|
|
|
|
|
// // </context>
|
|
|
|
|
// output.emplace_back(context_graph_->end_tag_id());
|
|
|
|
|
// ++e;
|
|
|
|
|
// }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
outputs_.emplace_back(output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::AdvanceDecode(
|
|
|
|
|
const std::shared_ptr<kaldi::DecodableInterface>& decodable) {
|
|
|
|
|
while (1) {
|
|
|
|
|
// forward frame by frame
|
|
|
|
|
std::vector<kaldi::BaseFloat> frame_prob;
|
|
|
|
|
bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob);
|
|
|
|
|
if (flag == false) break;
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<kaldi::BaseFloat>> likelihood;
|
|
|
|
|
likelihood.push_back(frame_prob);
|
|
|
|
|
AdvanceDecoding(likelihood);
|
|
|
|
@ -122,10 +87,12 @@ static bool PrefixScoreCompare(
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>& logp) {
|
|
|
|
|
void CTCPrefixBeamSearch::AdvanceDecoding(
|
|
|
|
|
const std::vector<std::vector<kaldi::BaseFloat>>& logp) {
|
|
|
|
|
#ifdef USE_PROFILING
|
|
|
|
|
RecordEvent event(
|
|
|
|
|
"CtcPrefixBeamSearch::AdvanceDecoding", TracerEventType::UserDefined, 1);
|
|
|
|
|
RecordEvent event("CtcPrefixBeamSearch::AdvanceDecoding",
|
|
|
|
|
TracerEventType::UserDefined,
|
|
|
|
|
1);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
if (logp.size() == 0) return;
|
|
|
|
@ -133,9 +100,10 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
int first_beam_size =
|
|
|
|
|
std::min(static_cast<int>(logp[0].size()), opts_.first_beam_size);
|
|
|
|
|
|
|
|
|
|
for (int t = 0; t < logp.size(); ++t, ++abs_time_step_) {
|
|
|
|
|
for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) {
|
|
|
|
|
const std::vector<float>& logp_t = logp[t];
|
|
|
|
|
std::unordered_map<std::vector<int>, PrefixScore, PrefixScoreHash> next_hyps;
|
|
|
|
|
std::unordered_map<std::vector<int>, PrefixScore, PrefixScoreHash>
|
|
|
|
|
next_hyps;
|
|
|
|
|
|
|
|
|
|
// 1. first beam prune, only select topk candidates
|
|
|
|
|
std::vector<float> topk_score;
|
|
|
|
@ -151,16 +119,20 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
const std::vector<int>& prefix = it.first;
|
|
|
|
|
const PrefixScore& prefix_score = it.second;
|
|
|
|
|
|
|
|
|
|
// If prefix doesn't exist in next_hyps, next_hyps[prefix] will insert
|
|
|
|
|
// PrefixScore(-inf, -inf) by default, since the default constructor
|
|
|
|
|
// If prefix doesn't exist in next_hyps, next_hyps[prefix] will
|
|
|
|
|
// insert
|
|
|
|
|
// PrefixScore(-inf, -inf) by default, since the default
|
|
|
|
|
// constructor
|
|
|
|
|
// of PrefixScore will set fields b(blank ending Score) and
|
|
|
|
|
// nb(none blank ending Score) to -inf, respectively.
|
|
|
|
|
|
|
|
|
|
if (id == opts_.blank) {
|
|
|
|
|
// case 0: *a + <blank> => *a, *a<blank> + <blank> => *a, prefix not
|
|
|
|
|
// case 0: *a + <blank> => *a, *a<blank> + <blank> => *a,
|
|
|
|
|
// prefix not
|
|
|
|
|
// change
|
|
|
|
|
PrefixScore& next_score = next_hyps[prefix];
|
|
|
|
|
next_score.b = LogSumExp(next_score.b, prefix_score.Score() + prob);
|
|
|
|
|
next_score.b =
|
|
|
|
|
LogSumExp(next_score.b, prefix_score.Score() + prob);
|
|
|
|
|
|
|
|
|
|
// timestamp: blank is silence and does not affect the timestamp
|
|
|
|
|
next_score.v_b = prefix_score.ViterbiScore() + prob;
|
|
|
|
@ -175,7 +147,8 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
} else if (!prefix.empty() && id == prefix.back()) {
|
|
|
|
|
// case 1: *a + a => *a, prefix not changed
|
|
|
|
|
PrefixScore& next_score1 = next_hyps[prefix];
|
|
|
|
|
next_score1.nb = LogSumExp(next_score1.nb, prefix_score.nb + prob);
|
|
|
|
|
next_score1.nb =
|
|
|
|
|
LogSumExp(next_score1.nb, prefix_score.nb + prob);
|
|
|
|
|
|
|
|
|
|
// timestamp: a non-blank symbol affects the timestamp
|
|
|
|
|
if (next_score1.v_nb < prefix_score.v_nb + prob) {
|
|
|
|
@ -187,7 +160,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
// update this timestamp as token appeared here.
|
|
|
|
|
next_score1.times_nb = prefix_score.times_nb;
|
|
|
|
|
assert(next_score1.times_nb.size() > 0);
|
|
|
|
|
next_score1.times_nb.back() = abs_time_step_;
|
|
|
|
|
next_score1.times_nb.back() = num_frame_decoded_;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -201,7 +174,8 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
std::vector<int> new_prefix(prefix);
|
|
|
|
|
new_prefix.emplace_back(id);
|
|
|
|
|
PrefixScore& next_score2 = next_hyps[new_prefix];
|
|
|
|
|
next_score2.nb = LogSumExp(next_score2.nb, prefix_score.b + prob);
|
|
|
|
|
next_score2.nb =
|
|
|
|
|
LogSumExp(next_score2.nb, prefix_score.b + prob);
|
|
|
|
|
|
|
|
|
|
// timestamp: a non-blank symbol affects the timestamp
|
|
|
|
|
if (next_score2.v_nb < prefix_score.v_b + prob) {
|
|
|
|
@ -210,7 +184,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
// new token added
|
|
|
|
|
next_score2.cur_token_prob = prob;
|
|
|
|
|
next_score2.times_nb = prefix_score.times_b;
|
|
|
|
|
next_score2.times_nb.emplace_back(abs_time_step_);
|
|
|
|
|
next_score2.times_nb.emplace_back(num_frame_decoded_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Prefix changed, calculate the context Score.
|
|
|
|
@ -226,7 +200,8 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
std::vector<int> new_prefix(prefix);
|
|
|
|
|
new_prefix.emplace_back(id);
|
|
|
|
|
PrefixScore& next_score = next_hyps[new_prefix];
|
|
|
|
|
next_score.nb = LogSumExp(next_score.nb, prefix_score.Score() + prob);
|
|
|
|
|
next_score.nb =
|
|
|
|
|
LogSumExp(next_score.nb, prefix_score.Score() + prob);
|
|
|
|
|
|
|
|
|
|
// timestamp: a non-blank symbol affects the timestamp
|
|
|
|
|
if (next_score.v_nb < prefix_score.ViterbiScore() + prob) {
|
|
|
|
@ -234,7 +209,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
|
|
|
|
|
next_score.cur_token_prob = prob;
|
|
|
|
|
next_score.times_nb = prefix_score.Times();
|
|
|
|
|
next_score.times_nb.emplace_back(abs_time_step_);
|
|
|
|
|
next_score.times_nb.emplace_back(num_frame_decoded_);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Prefix changed, calculate the context Score.
|
|
|
|
@ -248,8 +223,8 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
} // end for (int i = 0; i < topk_index.size(); ++i)
|
|
|
|
|
|
|
|
|
|
// 3. second beam prune, only keep top n best paths
|
|
|
|
|
std::vector<std::pair<std::vector<int>, PrefixScore>> arr(next_hyps.begin(),
|
|
|
|
|
next_hyps.end());
|
|
|
|
|
std::vector<std::pair<std::vector<int>, PrefixScore>> arr(
|
|
|
|
|
next_hyps.begin(), next_hyps.end());
|
|
|
|
|
int second_beam_size =
|
|
|
|
|
std::min(static_cast<int>(arr.size()), opts_.second_beam_size);
|
|
|
|
|
std::nth_element(arr.begin(),
|
|
|
|
@ -261,9 +236,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(const std::vector<std::vector<float>>&
|
|
|
|
|
|
|
|
|
|
// 4. update cur_hyps by next_hyps, and get new result
|
|
|
|
|
UpdateHypotheses(arr);
|
|
|
|
|
|
|
|
|
|
num_frame_decoded_++;
|
|
|
|
|
} // end for (int t = 0; t < logp.size(); ++t, ++abs_time_step_)
|
|
|
|
|
} // end for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -288,21 +261,52 @@ void CTCPrefixBeamSearch::UpdateHypotheses(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Finishes the search; all of the work is done by UpdateFinalContext().
void CTCPrefixBeamSearch::FinalizeSearch() {
    UpdateFinalContext();
}
|
|
|
|
|
void CTCPrefixBeamSearch::UpdateOutputs(
|
|
|
|
|
const std::pair<std::vector<int>, PrefixScore>& prefix) {
|
|
|
|
|
const std::vector<int>& input = prefix.first;
|
|
|
|
|
const std::vector<int>& start_boundaries = prefix.second.start_boundaries;
|
|
|
|
|
const std::vector<int>& end_boundaries = prefix.second.end_boundaries;
|
|
|
|
|
|
|
|
|
|
// add <context> </context> tag
|
|
|
|
|
std::vector<int> output;
|
|
|
|
|
int s = 0;
|
|
|
|
|
int e = 0;
|
|
|
|
|
for (int i = 0; i < input.size(); ++i) {
|
|
|
|
|
// if (s < start_boundaries.size() && i == start_boundaries[s]){
|
|
|
|
|
// // <context>
|
|
|
|
|
// output.emplace_back(context_graph_->start_tag_id());
|
|
|
|
|
// ++s;
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
output.emplace_back(input[i]);
|
|
|
|
|
|
|
|
|
|
// if (e < end_boundaries.size() && i == end_boundaries[e]){
|
|
|
|
|
// // </context>
|
|
|
|
|
// output.emplace_back(context_graph_->end_tag_id());
|
|
|
|
|
// ++e;
|
|
|
|
|
// }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
outputs_.emplace_back(output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Finishes the search; all of the work is done by UpdateFinalContext().
void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); }
|
|
|
|
|
|
|
|
|
|
void CTCPrefixBeamSearch::UpdateFinalContext() {
|
|
|
|
|
if (context_graph_ == nullptr) return;
|
|
|
|
|
assert(hypotheses_.size() == cur_hyps_.size());
|
|
|
|
|
assert(hypotheses_.size() == likelihood_.size());
|
|
|
|
|
|
|
|
|
|
CHECK(hypotheses_.size() == cur_hyps_.size());
|
|
|
|
|
CHECK(hypotheses_.size() == likelihood_.size());
|
|
|
|
|
|
|
|
|
|
// We should backoff the context Score/state when the context is
|
|
|
|
|
// not fully matched at the last time.
|
|
|
|
|
for (const auto& prefix : hypotheses_) {
|
|
|
|
|
PrefixScore& prefix_score = cur_hyps_[prefix];
|
|
|
|
|
if (prefix_score.context_score != 0) {
|
|
|
|
|
// prefix_score.UpdateContext(context_graph_, prefix_score, 0,
|
|
|
|
|
// prefix.size());
|
|
|
|
|
prefix_score.UpdateContext(context_graph_, prefix_score, 0,
|
|
|
|
|
prefix.size());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
std::vector<std::pair<std::vector<int>, PrefixScore>> arr(cur_hyps_.begin(),
|
|
|
|
@ -313,5 +317,44 @@ void CTCPrefixBeamSearch::UpdateFinalContext() {
|
|
|
|
|
UpdateHypotheses(arr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string CTCPrefixBeamSearch::GetBestPath(int index) {
|
|
|
|
|
int n_hyps = Outputs().size();
|
|
|
|
|
CHECK(n_hyps > 0);
|
|
|
|
|
CHECK(index < n_hyps);
|
|
|
|
|
std::vector<int> one = Outputs()[index];
|
|
|
|
|
return std::string(absl::StrJoin(one, kSpaceSymbol));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string CTCPrefixBeamSearch::GetBestPath() {
|
|
|
|
|
return GetBestPath(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::pair<double, std::string>> CTCPrefixBeamSearch::GetNBestPath(int n) {
|
|
|
|
|
int hyps_size = hypotheses_.size();
|
|
|
|
|
CHECK(hyps_size > 0);
|
|
|
|
|
|
|
|
|
|
int min_n = n == -1 ? hypotheses_.size() : std::min(n, hyps_size);
|
|
|
|
|
|
|
|
|
|
std::vector<std::pair<double, std::string>> n_best;
|
|
|
|
|
n_best.reserve(min_n);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < min_n; i++){
|
|
|
|
|
n_best.emplace_back(Likelihood()[i], GetBestPath(i) );
|
|
|
|
|
}
|
|
|
|
|
return n_best;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::pair<double, std::string>> CTCPrefixBeamSearch::GetNBestPath() {
|
|
|
|
|
return GetNBestPath(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string CTCPrefixBeamSearch::GetFinalBestPath() {
|
|
|
|
|
return GetBestPath();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string CTCPrefixBeamSearch::GetPartialResult() {
|
|
|
|
|
return GetBestPath();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace ppspeech
|