diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc index 04530fb9..2986ea7e 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -118,11 +118,13 @@ void CTCPrefixBeamSearch::AdvanceDecoding( std::vector topk_score; std::vector topk_index; TopK(logp_t, first_beam_size, &topk_score, &topk_index); - VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0]; - for (int i = 0; i < topk_score.size(); i++){ - VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + VLOG(2) << "topk: " << num_frame_decoded_ << " " + << *std::max_element(logp_t.begin(), logp_t.end()) << " " + << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++) { + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; } - + // 2. token passing for (int i = 0; i < topk_index.size(); ++i) { int id = topk_index[i]; @@ -303,15 +305,16 @@ void CTCPrefixBeamSearch::UpdateOutputs( outputs_.emplace_back(output); } -void CTCPrefixBeamSearch::FinalizeSearch() { - UpdateFinalContext(); - +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; int cnt = 0; - for (int i = 0; i < hypotheses_.size(); i ++){ - VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i]; - for (int j = 0; j < hypotheses_[i].size(); j ++){ - VLOG(2) << hypotheses_[i][j]; + for (int i = 0; i < hypotheses_.size(); i++) { + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() + << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j++) { + VLOG(2) << hypotheses_[i][j]; } } } diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h index ef96ecd9..475b4d35 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc #pragma once diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h index 908be1d6..3547b2b7 100644 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h #pragma once diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1a332755..ebdd7119 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,7 +20,9 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); -DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); +DEFINE_bool(fill_zero, + false, + "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index 26a3905b..56dfc3aa 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -16,9 +16,9 @@ namespace ppspeech { +using kaldi::BaseFloat; using kaldi::Vector; using kaldi::VectorBase; -using kaldi::BaseFloat; using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, @@ -51,9 +51,11 @@ bool Assembler::Compute(Vector* feats) { Vector feature; bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result + << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. cache size: " + << feature_cache_.size(); return false; } else { VLOG(1) << "break"; @@ -69,7 +71,8 @@ bool Assembler::Compute(Vector* feats) { } if (feature_cache_.size() < receptive_filed_length_) { - VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_; + VLOG(1) << "feature_cache less than receptive_filed_lenght. " + << feature_cache_.size() << ": " << receptive_filed_length_; return false; } @@ -81,7 +84,8 @@ bool Assembler::Compute(Vector* feats) { } } - int32 this_chunk_size = std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + int32 this_chunk_size = + std::min(static_cast(feature_cache_.size()), frame_chunk_size_); feats->Resize(dim_ * this_chunk_size); VLOG(1) << "read " << this_chunk_size << " feat."; @@ -89,7 +93,7 @@ bool Assembler::Compute(Vector* feats) { while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); CHECK(val.Dim() == dim_) << val.Dim(); - + int32 start = counter * dim_; feats->Range(start, dim_).CopyFromVec(val); @@ -99,7 +103,7 @@ bool Assembler::Compute(Vector* feats) { // val is reference, so we should pop here feature_cache_.pop(); - + counter++; } CHECK(feature_cache_.size() == cache_size_); @@ -108,11 +112,11 @@ bool Assembler::Compute(Vector* feats) { } - void Assembler::Reset() { +void Assembler::Reset() { std::queue> empty; std::swap(feature_cache_, empty); nframes_ = 0; - base_extractor_->Reset(); + base_extractor_->Reset(); } } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h index 4f165ea8..72e6f635 100644 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -25,7 +25,8 @@ struct AssemblerOptions { int32 receptive_filed_length{1}; int32 subsampling_rate{1}; int32 nnet_decoder_chunk{1}; - bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_ + bool fill_zero{false}; // whether fill zero when last chunk is not equal to + // frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -62,7 +63,7 @@ class Assembler : public FrontendInterface { std::queue> feature_cache_; std::unique_ptr base_extractor_; - int32 nframes_; // num frame computed + int32 nframes_; // num frame computed DISALLOW_COPY_AND_ASSIGN(Assembler); }; diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc index 71e5d09e..61ef8841 100644 --- a/speechx/speechx/frontend/audio/audio_cache.cc +++ b/speechx/speechx/frontend/audio/audio_cache.cc @@ -13,13 +13,14 @@ // limitations under the License. #include "frontend/audio/audio_cache.h" + #include "kaldi/base/timer.h" namespace ppspeech { using kaldi::BaseFloat; -using kaldi::VectorBase; using kaldi::Vector; +using kaldi::VectorBase; AudioCache::AudioCache(int buffer_size, bool to_float32) : finished_(false), @@ -85,8 +86,8 @@ bool AudioCache::Read(Vector* waves) { offset_ = (offset_ + chunk_size) % ring_buffer_.size(); nsamples_ += chunk_size; - VLOG(1) << "nsamples readed: " << nsamples_; - + VLOG(1) << "nsamples readed: " << nsamples_; + ready_feed_condition_.notify_one(); return true; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index da422daa..4708a6e0 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -62,7 +62,7 @@ class AudioCache : public FrontendInterface { kaldi::int32 timeout_; // millisecond bool to_float32_; // int16 -> float32. used in linear_spectrogram - int32 nsamples_; // number samples readed. + int32 nsamples_; // number samples readed. DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index c712e48e..3f05eae6 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -16,12 +16,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h index b4ed58ff..bd869225 100644 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -77,7 +77,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; - int32 nframe_; // num of feature computed + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index e06995b1..e83a3f31 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -91,8 +91,7 @@ struct FeaturePipelineOptions { << opts.assembler_opts.receptive_filed_length; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; - LOG(INFO) << "frontend fill zeros: " - << opts.assembler_opts.fill_zero; + LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index a7de58b5..11d60d3e 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -79,7 +79,8 @@ bool Decodable::AdvanceChunk() { int32& vocab_dim = out.vocab_dim; Vector& logprobs = out.logprobs; - VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames."; + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim + << " decoder frames."; // cache nnet outupts nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.CopyRowsFromVec(logprobs); @@ -127,7 +128,9 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { (*likelihood)[idx] = nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; - VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " + << nnet_out_cache_.NumRows() + << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 07e2dde2..636e2ad4 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc #include "nnet/u2_nnet.h" @@ -129,7 +130,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) { forward_attention_decoder_ = other.forward_attention_decoder_; ctc_activation_ = other.ctc_activation_; - offset_ = other.offset_; + offset_ = other.offset_; // copy model ptr model_ = other.model_; @@ -626,8 +627,10 @@ void U2Nnet::AttentionRescoring(const std::vector>& hyps, // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; - VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score - << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; + VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score + << " r_score: " << r_score + << " reverse_weight: " << reverse_weight + << " final score: " << (*rescoring_score)[i]; } } diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 6cbc0570..e548d4c0 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -13,7 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h #pragma once #include "base/common.h" diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc index b4a1257b..4ec64665 100644 --- a/speechx/speechx/recognizer/u2_recognizer.cc +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -190,12 +190,15 @@ void U2Recognizer::AttentionRescoring() { // combine ctc score and rescoring score for (size_t i = 0; i < num_hyps; i++) { VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i] - << " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight; + << " ctc_score: " << result_[i].score + << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight + << " ctc_weight: " << opts_.decoder_opts.ctc_weight; result_[i].score = opts_.decoder_opts.rescoring_weight * rescoring_score[i] + opts_.decoder_opts.ctc_weight * result_[i].score; - VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score; + VLOG(1) << "hyp: " << result_[0].sentence + << " score: " << result_[0].score; } std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 4746d86f..9b43b08f 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -96,13 +96,14 @@ struct U2RecognizerResource { U2RecognizerResource resource; resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - LOG(INFO) << "vocab path: " << resource.vocab_path; - LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; + LOG(INFO) << "vocab path: " << resource.vocab_path; + LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale; resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); resource.feature_pipeline_opts.assembler_opts.fill_zero = false; - LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; + LOG(INFO) << "u2 need fill zero be false: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index 7e59d6cb..c02e1c23 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -78,7 +78,8 @@ int main(int argc, char* argv[]) { recognizer.SetFinished(); } recognizer.Decode(); - LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult(); + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); // no overlap sample_offset += cur_chunk_size; @@ -88,7 +89,7 @@ int main(int argc, char* argv[]) { // second pass decoding recognizer.Rescoring(); - + std::string result = recognizer.GetFinalResult(); recognizer.Reset(); diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc index 959740a0..71656cb3 100644 --- a/speechx/speechx/utils/math.cc +++ b/speechx/speechx/utils/math.cc @@ -79,10 +79,10 @@ void TopK(const std::vector& data, int cur = values->size() - 1; while (!pq.empty()) { const auto& item = pq.top(); - + (*values)[cur] = item.first; (*indices)[cur] = item.second; - + // item if reference, must pop here pq.pop();