format code

pull/2524/head
Hui Zhang 2 years ago
parent 28dafea0e0
commit 850096a3a0

@ -118,11 +118,13 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
std::vector<float> topk_score;
std::vector<int32_t> topk_index;
TopK(logp_t, first_beam_size, &topk_score, &topk_index);
VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0];
for (int i = 0; i < topk_score.size(); i++){
VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
VLOG(2) << "topk: " << num_frame_decoded_ << " "
<< *std::max_element(logp_t.begin(), logp_t.end()) << " "
<< topk_score[0];
for (int i = 0; i < topk_score.size(); i++) {
VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
}
// 2. token passing
for (int i = 0; i < topk_index.size(); ++i) {
int id = topk_index[i];
@ -303,15 +305,16 @@ void CTCPrefixBeamSearch::UpdateOutputs(
outputs_.emplace_back(output);
}
void CTCPrefixBeamSearch::FinalizeSearch() {
UpdateFinalContext();
void CTCPrefixBeamSearch::FinalizeSearch() {
UpdateFinalContext();
VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
int cnt = 0;
for (int i = 0; i < hypotheses_.size(); i ++){
VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i];
for (int j = 0; j < hypotheses_[i].size(); j ++){
VLOG(2) << hypotheses_[i][j];
for (int i = 0; i < hypotheses_.size(); i++) {
VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size()
<< " ctc score: " << likelihood_[i];
for (int j = 0; j < hypotheses_[i].size(); j++) {
VLOG(2) << hypotheses_[i][j];
}
}
}

@ -13,7 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc
// modified from
// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc
#pragma once

@ -13,7 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h
// modified from
// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h
#pragma once

@ -20,7 +20,9 @@
// feature
DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size");
DEFINE_bool(fill_zero,
false,
"fill zero at last chunk, when chunk < chunk_size");
// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear
// feature, or fbank");
DEFINE_int32(num_bins, 161, "num bins of mel");

@ -16,9 +16,9 @@
namespace ppspeech {
using kaldi::BaseFloat;
using kaldi::Vector;
using kaldi::VectorBase;
using kaldi::BaseFloat;
using std::unique_ptr;
Assembler::Assembler(AssemblerOptions opts,
@ -51,9 +51,11 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
Vector<BaseFloat> feature;
bool result = base_extractor_->Read(&feature);
if (result == false || feature.Dim() == 0) {
VLOG(1) << "result: " << result << " feature dim: " << feature.Dim();
VLOG(1) << "result: " << result
<< " feature dim: " << feature.Dim();
if (IsFinished() == false) {
VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size();
VLOG(1) << "finished reading feature. cache size: "
<< feature_cache_.size();
return false;
} else {
VLOG(1) << "break";
@ -69,7 +71,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
}
if (feature_cache_.size() < receptive_filed_length_) {
VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_;
VLOG(1) << "feature_cache less than receptive_filed_lenght. "
<< feature_cache_.size() << ": " << receptive_filed_length_;
return false;
}
@ -81,7 +84,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
}
}
int32 this_chunk_size = std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
int32 this_chunk_size =
std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
feats->Resize(dim_ * this_chunk_size);
VLOG(1) << "read " << this_chunk_size << " feat.";
@ -89,7 +93,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
while (counter < this_chunk_size) {
Vector<BaseFloat>& val = feature_cache_.front();
CHECK(val.Dim() == dim_) << val.Dim();
int32 start = counter * dim_;
feats->Range(start, dim_).CopyFromVec(val);
@ -99,7 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
// val is reference, so we should pop here
feature_cache_.pop();
counter++;
}
CHECK(feature_cache_.size() == cache_size_);
@ -108,11 +112,11 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
}
void Assembler::Reset() {
void Assembler::Reset() {
std::queue<kaldi::Vector<kaldi::BaseFloat>> empty;
std::swap(feature_cache_, empty);
nframes_ = 0;
base_extractor_->Reset();
base_extractor_->Reset();
}
} // namespace ppspeech

@ -25,7 +25,8 @@ struct AssemblerOptions {
int32 receptive_filed_length{1};
int32 subsampling_rate{1};
int32 nnet_decoder_chunk{1};
bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_
bool fill_zero{false}; // whether fill zero when last chunk is not equal to
// frame_chunk_size_
};
class Assembler : public FrontendInterface {
@ -62,7 +63,7 @@ class Assembler : public FrontendInterface {
std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
std::unique_ptr<FrontendInterface> base_extractor_;
int32 nframes_; // num frame computed
int32 nframes_; // num frame computed
DISALLOW_COPY_AND_ASSIGN(Assembler);
};

@ -13,13 +13,14 @@
// limitations under the License.
#include "frontend/audio/audio_cache.h"
#include "kaldi/base/timer.h"
namespace ppspeech {
using kaldi::BaseFloat;
using kaldi::VectorBase;
using kaldi::Vector;
using kaldi::VectorBase;
AudioCache::AudioCache(int buffer_size, bool to_float32)
: finished_(false),
@ -85,8 +86,8 @@ bool AudioCache::Read(Vector<BaseFloat>* waves) {
offset_ = (offset_ + chunk_size) % ring_buffer_.size();
nsamples_ += chunk_size;
VLOG(1) << "nsamples readed: " << nsamples_;
VLOG(1) << "nsamples readed: " << nsamples_;
ready_feed_condition_.notify_one();
return true;
}

@ -62,7 +62,7 @@ class AudioCache : public FrontendInterface {
kaldi::int32 timeout_; // millisecond
bool to_float32_; // int16 -> float32. used in linear_spectrogram
int32 nsamples_; // number of samples read.
int32 nsamples_; // number of samples read.
DISALLOW_COPY_AND_ASSIGN(AudioCache);
};

@ -16,12 +16,12 @@
namespace ppspeech {
using kaldi::Vector;
using kaldi::VectorBase;
using kaldi::BaseFloat;
using std::vector;
using kaldi::SubVector;
using kaldi::Vector;
using kaldi::VectorBase;
using std::unique_ptr;
using std::vector;
FeatureCache::FeatureCache(FeatureCacheOptions opts,
unique_ptr<FrontendInterface> base_extractor) {

@ -77,7 +77,7 @@ class FeatureCache : public FrontendInterface {
std::condition_variable ready_feed_condition_;
std::condition_variable ready_read_condition_;
int32 nframe_; // num of feature computed
int32 nframe_; // num of feature computed
DISALLOW_COPY_AND_ASSIGN(FeatureCache);
};

@ -91,8 +91,7 @@ struct FeaturePipelineOptions {
<< opts.assembler_opts.receptive_filed_length;
LOG(INFO) << "nnet chunk size: "
<< opts.assembler_opts.nnet_decoder_chunk;
LOG(INFO) << "frontend fill zeros: "
<< opts.assembler_opts.fill_zero;
LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero;
return opts;
}
};

@ -79,7 +79,8 @@ bool Decodable::AdvanceChunk() {
int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& logprobs = out.logprobs;
VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames.";
VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim
<< " decoder frames.";
// cache nnet outputs
nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim);
nnet_out_cache_.CopyRowsFromVec(logprobs);
@ -127,7 +128,9 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
(*likelihood)[idx] =
nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " "
<< nnet_out_cache_.NumRows()
<< " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
}
return true;
}

@ -13,7 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc
// modified from
// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc
#include "nnet/u2_nnet.h"
@ -129,7 +130,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) {
forward_attention_decoder_ = other.forward_attention_decoder_;
ctc_activation_ = other.ctc_activation_;
offset_ = other.offset_;
offset_ = other.offset_;
// copy model ptr
model_ = other.model_;
@ -626,8 +627,10 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
// combined left-to-right and right-to-left score
(*rescoring_score)[i] =
score * (1 - reverse_weight) + r_score * reverse_weight;
VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score
<< " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i];
VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score
<< " r_score: " << r_score
<< " reverse_weight: " << reverse_weight
<< " final score: " << (*rescoring_score)[i];
}
}

@ -13,7 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h
// modified from
// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h
#pragma once
#include "base/common.h"

@ -190,12 +190,15 @@ void U2Recognizer::AttentionRescoring() {
// combine ctc score and rescoring score
for (size_t i = 0; i < num_hyps; i++) {
VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
<< " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight;
<< " ctc_score: " << result_[i].score
<< " rescoring_weight: " << opts_.decoder_opts.rescoring_weight
<< " ctc_weight: " << opts_.decoder_opts.ctc_weight;
result_[i].score =
opts_.decoder_opts.rescoring_weight * rescoring_score[i] +
opts_.decoder_opts.ctc_weight * result_[i].score;
VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score;
VLOG(1) << "hyp: " << result_[0].sentence
<< " score: " << result_[0].score;
}
std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc);

@ -96,13 +96,14 @@ struct U2RecognizerResource {
U2RecognizerResource resource;
resource.vocab_path = FLAGS_vocab_path;
resource.acoustic_scale = FLAGS_acoustic_scale;
LOG(INFO) << "vocab path: " << resource.vocab_path;
LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale;
LOG(INFO) << "vocab path: " << resource.vocab_path;
LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale;
resource.feature_pipeline_opts =
ppspeech::FeaturePipelineOptions::InitFromFlags();
resource.feature_pipeline_opts.assembler_opts.fill_zero = false;
LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero;
LOG(INFO) << "u2 need fill zero be false: "
<< resource.feature_pipeline_opts.assembler_opts.fill_zero;
resource.model_opts = ppspeech::ModelOptions::InitFromFlags();
resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags();
return resource;

@ -78,7 +78,8 @@ int main(int argc, char* argv[]) {
recognizer.SetFinished();
}
recognizer.Decode();
LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult();
LOG(INFO) << "Pratial result: " << cnt << " "
<< recognizer.GetPartialResult();
// no overlap
sample_offset += cur_chunk_size;
@ -88,7 +89,7 @@ int main(int argc, char* argv[]) {
// second pass decoding
recognizer.Rescoring();
std::string result = recognizer.GetFinalResult();
recognizer.Reset();

@ -79,10 +79,10 @@ void TopK(const std::vector<T>& data,
int cur = values->size() - 1;
while (!pq.empty()) {
const auto& item = pq.top();
(*values)[cur] = item.first;
(*indices)[cur] = item.second;
// item is a reference, must pop here
pq.pop();

Loading…
Cancel
Save