You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
137 lines
4.0 KiB
137 lines
4.0 KiB
3 years ago
|
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
3 years ago
|
#include "nnet/decodable.h"
|
||
|
|
||
|
namespace ppspeech {
|
||
|
|
||
3 years ago
|
using kaldi::BaseFloat;
|
||
|
using kaldi::Matrix;
|
||
3 years ago
|
using kaldi::Vector;
|
||
2 years ago
|
using std::vector;
|
||
3 years ago
|
|
||
2 years ago
|
// Builds a decodable that reads per-frame network output from
// `nnet_producer` and multiplies every score returned by LogLikelihood() by
// `acoustic_scale`. Both frame counters start at zero because no frame has
// been read yet.
Decodable::Decodable(const std::shared_ptr<NnetProducer>& nnet_producer,
                     BaseFloat acoustic_scale)
    : nnet_producer_(nnet_producer),
      frame_offset_(0),
      frames_ready_(0),
      acoustic_scale_(acoustic_scale) {
}
|
||
3 years ago
|
|
||
2 years ago
|
// for debug
|
||
3 years ago
|
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
|
||
2 years ago
|
nnet_producer_->Acceptlikelihood(likelihood);
|
||
3 years ago
|
}
|
||
|
|
||
3 years ago
|
// return the size of frame have computed.
|
||
|
int32 Decodable::NumFramesReady() const { return frames_ready_; }
|
||
|
|
||
2 years ago
|
|
||
3 years ago
|
// frame idx is from 0 to frame_ready_ -1;
|
||
3 years ago
|
bool Decodable::IsLastFrame(int32 frame) {
|
||
2 years ago
|
EnsureFrameHaveComputed(frame);
|
||
3 years ago
|
return frame >= frames_ready_;
|
||
3 years ago
|
}
|
||
|
|
||
3 years ago
|
// Always reports zero indices.
int32 Decodable::NumIndices() const {
    return 0;
}
|
||
3 years ago
|
|
||
3 years ago
|
// the ilable(TokenId) of wfst(TLG) insert <eps>(id = 0) in front of Nnet prob
|
||
|
// id.
|
||
|
int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; }
|
||
3 years ago
|
|
||
2 years ago
|
|
||
3 years ago
|
// Makes sure `frame` has been read from the producer.
// Returns true when the frame is already available, otherwise the result of
// reading one more frame via AdvanceChunk().
bool Decodable::EnsureFrameHaveComputed(int32 frame) {
    // Frame was decoded earlier: nothing to fetch.
    if (frame < frames_ready_) {
        return true;
    }
    // Decoding frame: pull the next one from the producer.
    return AdvanceChunk();
}
|
||
|
|
||
|
bool Decodable::AdvanceChunk() {
|
||
2 years ago
|
kaldi::Timer timer;
|
||
2 years ago
|
bool flag = nnet_producer_->Read(&framelikelihood_);
|
||
|
if (flag == false) return false;
|
||
3 years ago
|
frame_offset_ = frames_ready_;
|
||
2 years ago
|
frames_ready_ += 1;
|
||
2 years ago
|
VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed()
|
||
|
<< " sec.";
|
||
2 years ago
|
return true;
|
||
|
}
|
||
|
|
||
2 years ago
|
// Reads the next frame and copies its scores into `logprobs`.
//
// logprobs:  resized to the number of scores in the frame, then filled.
// vocab_dim: receives that same number of scores.
//
// Returns false when no frame could be read or the cached frame is empty;
// in that case neither output is written.
bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
                             int* vocab_dim) {
    if (!AdvanceChunk()) {
        return false;
    }

    if (framelikelihood_.empty()) {
        LOG(WARNING) << "No new nnet out in cache.";
        return false;
    }

    const size_t num_scores = framelikelihood_.size();
    const size_t num_bytes = num_scores * sizeof(kaldi::BaseFloat);
    logprobs->Resize(num_scores);
    std::memcpy(logprobs->Data(), framelikelihood_.data(), num_bytes);
    *vocab_dim = num_scores;
    return true;
}
|
||
|
|
||
2 years ago
|
// read one frame likelihood
|
||
3 years ago
|
bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
|
||
2 years ago
|
if (EnsureFrameHaveComputed(frame) == false) {
|
||
2 years ago
|
VLOG(3) << "framelikehood exit.";
|
||
2 years ago
|
return false;
|
||
|
}
|
||
|
|
||
2 years ago
|
CHECK_EQ(1, (frames_ready_ - frame_offset_));
|
||
|
*likelihood = framelikelihood_;
|
||
3 years ago
|
return true;
|
||
3 years ago
|
}
|
||
|
|
||
2 years ago
|
// Returns the acoustic-scaled network score of token `index` at `frame`.
//
// frame: frame to score; it must be the single frame currently cached
//        (frame - frame_offset_ == 0).
// index: WFST ilabel / token id. Id 0 is <eps>, so valid ids start at 1 and
//        are mapped to the nnet output position by TokenId2NnetId().
//
// Returns 0 when `frame` cannot be computed (the producer has no further
// output). Aborts via CHECK when `index` or `frame` is out of range.
BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
    if (!EnsureFrameHaveComputed(frame)) {
        // The original code wrote `return false;`, which converts to 0; the
        // value is kept but spelled out as a float.
        // NOTE(review): 0 is also a legitimate score — confirm that callers
        // never reach this path with a frame that is not available.
        return static_cast<BaseFloat>(0);
    }

    // index 0 (<eps>) would map to nnet id -1 and read before the buffer.
    CHECK_GT(index, 0);
    CHECK_LE(static_cast<size_t>(index), framelikelihood_.size());
    CHECK_LE(frame, frames_ready_);

    // Only one frame is cached, so the offset inside the cache must be 0.
    const int32 frame_idx = frame - frame_offset_;
    CHECK_EQ(frame_idx, 0);

    // Original author's note: the nnet output is a probability rather than a
    // log probability. The lookup uses index - 1 because of the <eps> ilabel.
    const BaseFloat logprob = framelikelihood_[TokenId2NnetId(index)];
    return acoustic_scale_ * logprob;
}
|
||
|
|
||
3 years ago
|
void Decodable::Reset() {
|
||
2 years ago
|
if (nnet_producer_ != nullptr) nnet_producer_->Reset();
|
||
3 years ago
|
frame_offset_ = 0;
|
||
3 years ago
|
frames_ready_ = 0;
|
||
2 years ago
|
framelikelihood_.clear();
|
||
3 years ago
|
}
|
||
|
|
||
2 years ago
|
void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
|
||
2 years ago
|
float reverse_weight,
|
||
|
std::vector<float>* rescoring_score) {
|
||
2 years ago
|
kaldi::Timer timer;
|
||
2 years ago
|
nnet_producer_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
|
||
2 years ago
|
VLOG(1) << "Attention Rescoring cost: " << timer.Elapsed() << " sec.";
|
||
2 years ago
|
}
|
||
|
|
||
2 years ago
|
} // namespace ppspeech
|