// Copyright 2022 Horizon Robotics. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // modified from // https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc #include "nnet/u2_nnet.h" #ifdef WITH_PROFILING #include "paddle/fluid/platform/profiler.h" using paddle::platform::RecordEvent; using paddle::platform::TracerEventType; #endif // end WITH_PROFILING namespace ppspeech { void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { paddle::jit::utils::InitKernelSignatureMap(); #ifdef WITH_GPU dev_ = phi::GPUPlace(); #else dev_ = phi::CPUPlace(); #endif paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); model_ = std::make_shared(std::move(model)); subsampling_rate_ = model_->Attribute("subsampling_rate"); right_context_ = model_->Attribute("right_context"); sos_ = model_->Attribute("sos_symbol"); eos_ = model_->Attribute("eos_symbol"); is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); forward_attention_decoder_ = model_->Function("forward_attention_decoder"); ctc_activation_ = model_->Function("ctc_activation"); CHECK(forward_encoder_chunk_.IsValid()); CHECK(forward_attention_decoder_.IsValid()); CHECK(ctc_activation_.IsValid()); LOG(INFO) << "Paddle Model Info: "; LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; LOG(INFO) << "\tright context " << right_context_; LOG(INFO) << "\tsos " << sos_; LOG(INFO) << "\teos " << eos_; LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; Warmup(); } void U2Nnet::Warmup() { #ifdef WITH_PROFILING RecordEvent event("warmup", TracerEventType::UserDefined, 1); #endif { #ifdef WITH_PROFILING RecordEvent event( "warmup-encoder-ctc", TracerEventType::UserDefined, 1); #endif int feat_dim = 80; int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + // (receptive_field - downsample_rate) paddle::Tensor feats = paddle::full( {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); paddle::Tensor att_cache = paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); paddle::Tensor cnn_cache = paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; std::vector outputs = forward_encoder_chunk_(inputs); auto chunk_out = outputs[0]; inputs = std::move(std::vector({chunk_out})); outputs = ctc_activation_(inputs); } { #ifdef WITH_PROFILING RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); #endif auto hyps = paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); auto hyps_lens = paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); auto encoder_out = paddle::ones( {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); std::vector inputs{ hyps, hyps_lens, encoder_out}; std::vector outputs = forward_attention_decoder_(inputs); } Reset(); } U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) { LoadModel(opts_.model_path); } // shallow copy U2Nnet::U2Nnet(const U2Nnet& other) { // copy meta chunk_size_ = other.chunk_size_; num_left_chunks_ = other.num_left_chunks_; offset_ = other.offset_; // copy model ptr model_ = other.model_->Clone(); ctc_activation_ = model_->Function("ctc_activation"); subsampling_rate_ = model_->Attribute("subsampling_rate"); right_context_ = model_->Attribute("right_context"); sos_ = model_->Attribute("sos_symbol"); eos_ = model_->Attribute("eos_symbol"); is_bidecoder_ = model_->Attribute("is_bidirectional_decoder"); forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); forward_attention_decoder_ = model_->Function("forward_attention_decoder"); ctc_activation_ = model_->Function("ctc_activation"); CHECK(forward_encoder_chunk_.IsValid()); CHECK(forward_attention_decoder_.IsValid()); CHECK(ctc_activation_.IsValid()); LOG(INFO) << "Paddle Model Info: "; LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; LOG(INFO) << "\tright context " << right_context_; LOG(INFO) << "\tsos " << sos_; LOG(INFO) << "\teos " << eos_; LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; // ignore inner states } std::shared_ptr U2Nnet::Clone() const { auto asr_model = std::make_shared(*this); // reset inner state for new decoding asr_model->Reset(); return asr_model; } void U2Nnet::Reset() { offset_ = 0; att_cache_ = std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); cnn_cache_ = std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); encoder_outs_.clear(); VLOG(3) << "u2nnet reset"; } // Debug API void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { // encoder_out (T,D) encoder_outs_.clear(); encoder_outs_.push_back(encoder_out); } void U2Nnet::FeedForward(const std::vector& features, const int32& feature_dim, NnetOut* out) { kaldi::Timer timer; std::vector ctc_probs; ForwardEncoderChunkImpl( features, feature_dim, &out->logprobs, &out->vocab_dim); VLOG(1) << "FeedForward cost: " << timer.Elapsed() << " sec. " << features.size() / feature_dim << " frames."; } void U2Nnet::ForwardEncoderChunkImpl( const std::vector& chunk_feats, const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { #ifdef WITH_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); #endif // 1. splice cached_feature, and chunk_feats // First dimension is B, which is 1. // int num_frames = cached_feats_.size() + chunk_feats.size(); int num_frames = chunk_feats.size() / feat_dim; VLOG(3) << "num_frames: " << num_frames; VLOG(3) << "feat_dim: " << feat_dim; // feats (B=1,T,D) paddle::Tensor feats = paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); float* feats_ptr = feats.mutable_data(); // not cache feature in nnet CHECK_EQ(cached_feats_.size(), 0); // CHECK_EQ(std::is_same::value, true); std::memcpy(feats_ptr, chunk_feats.data(), chunk_feats.size() * sizeof(kaldi::BaseFloat)); VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] << ", " << feats.shape()[2]; #ifndef NDEBUG { std::stringstream path("feat", std::ios_base::app | std::ios_base::out); path << offset_; std::ofstream feat_fobj(path.str().c_str(), std::ios::out); CHECK(feat_fobj.is_open()); // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " // << feats.shape()[2] << "\n"; for (int i = 0; i < feats.numel(); i++) { feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; if ((i + 1) % feat_dim == 0) { feat_fobj << "\n"; } } feat_fobj << "\n"; } #endif // Endocer chunk forward #ifdef WITH_GPU feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); att_cache_ = att_cache_.copy_to(paddle::GPUPlace()), /*blocking*/ false; cnn_cache_ = cnn_cache_.copy_to(Paddle::GPUPlace(), /*blocking*/ false); #endif int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 // must be scalar, but paddle do not have scalar. paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); // freeze `required_cache_size` in graph, so not specific it in function // call. std::vector inputs = { feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; CHECK_EQ(inputs.size(), 4); std::vector outputs = forward_encoder_chunk_(inputs); CHECK_EQ(outputs.size(), 3); #ifdef WITH_GPU paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); #else paddle::Tensor chunk_out = outputs[0]; att_cache_ = outputs[1]; cnn_cache_ = outputs[2]; #endif #ifndef NDEBUG { std::stringstream path("encoder_logits", std::ios_base::app | std::ios_base::out); auto i = offset_ - chunk_out.shape()[1]; path << std::max(i, 0L); std::ofstream logits_fobj(path.str().c_str(), std::ios::out); CHECK(logits_fobj.is_open()); logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] << " " << chunk_out.shape()[2] << "\n"; const float* chunk_out_ptr = chunk_out.data(); logits_fobj << chunk_out_ptr << std::endl; for (int i = 0; i < chunk_out.numel(); i++) { logits_fobj << chunk_out_ptr[i] << " "; } logits_fobj << "\n"; } #endif // end TEST_DEBUG // current offset in decoder frame // not used in nnet offset_ += chunk_out.shape()[1]; VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] << " total: " << offset_; // collects encoder outs. encoder_outs_.push_back(chunk_out); VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); #ifndef NDEBUG { std::stringstream path("encoder_logits_list", std::ios_base::app | std::ios_base::out); path << offset_ - encoder_outs_[0].shape()[1]; std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); CHECK(logits_out_fobj.is_open()); logits_out_fobj << encoder_outs_[0].shape()[0] << " " << encoder_outs_[0].shape()[1] << " " << encoder_outs_[0].shape()[2] << "\n"; const float* encoder_outs_ptr = encoder_outs_[0].data(); logits_out_fobj << encoder_outs_ptr << std::endl; for (int i = 0; i < encoder_outs_[0].numel(); i++) { logits_out_fobj << encoder_outs_ptr[i] << " "; } logits_out_fobj << "\n"; } #endif // end TEST_DEBUG #ifdef WITH_GPU #error "Not implementation." #else // compute ctc_activation == log_softmax inputs.clear(); outputs.clear(); inputs.push_back(chunk_out); CHECK_EQ(inputs.size(), 1); outputs = ctc_activation_(inputs); CHECK_EQ(outputs.size(), 1); paddle::Tensor ctc_log_probs = outputs[0]; #ifndef NDEBUG { std::stringstream path("encoder_logprob", std::ios_base::app | std::ios_base::out); path << offset_ - chunk_out.shape()[1]; std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); CHECK(logprob_fobj.is_open()); logprob_fobj << ctc_log_probs.shape()[0] << " " << ctc_log_probs.shape()[1] << " " << ctc_log_probs.shape()[2] << "\n"; const float* logprob_ptr = ctc_log_probs.data(); for (int i = 0; i < ctc_log_probs.numel(); i++) { logprob_fobj << logprob_ptr[i] << " "; if ((i + 1) % ctc_log_probs.shape()[2] == 0) { logprob_fobj << "\n"; } } logprob_fobj << "\n"; } #endif // end TEST_DEBUG #endif // end WITH_GPU // Copy to output, (B=1,T,D) std::vector ctc_log_probs_shape = ctc_log_probs.shape(); CHECK_EQ(ctc_log_probs_shape.size(), 3); int B = ctc_log_probs_shape[0]; CHECK_EQ(B, 1); int T = ctc_log_probs_shape[1]; int D = ctc_log_probs_shape[2]; *vocab_dim = D; float* ctc_log_probs_ptr = ctc_log_probs.data(); out_prob->resize(T * D); std::memcpy( out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); #ifndef NDEBUG { std::stringstream path("encoder_logits_list_ctc", std::ios_base::app | std::ios_base::out); path << offset_ - encoder_outs_[0].shape()[1]; std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); CHECK(logits_out_fobj.is_open()); logits_out_fobj << encoder_outs_[0].shape()[0] << " " << encoder_outs_[0].shape()[1] << " " << encoder_outs_[0].shape()[2] << "\n"; const float* encoder_outs_ptr = encoder_outs_[0].data(); logits_out_fobj << encoder_outs_ptr << std::endl; for (int i = 0; i < encoder_outs_[0].numel(); i++) { logits_out_fobj << encoder_outs_ptr[i] << " "; } logits_out_fobj << "\n"; } #endif // end TEST_DEBUG return; } float U2Nnet::ComputePathScore(const paddle::Tensor& prob, const std::vector& hyp, int eos) { // sum `hyp` path scores in `prob` // prob (1, Umax, V) // hyp (U,) float score = 0.0f; std::vector dims = prob.shape(); CHECK_EQ(dims.size(), 3); VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; CHECK_EQ(dims[0], 1); int vocab_dim = static_cast(dims[2]); const float* prob_ptr = prob.data(); for (size_t i = 0; i < hyp.size(); ++i) { const float* row = prob_ptr + i * vocab_dim; score += row[hyp[i]]; } const float* row = prob_ptr + hyp.size() * vocab_dim; score += row[eos]; return score; } void U2Nnet::AttentionRescoring(const std::vector>& hyps, float reverse_weight, std::vector* rescoring_score) { #ifdef WITH_PROFILING RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); #endif CHECK(rescoring_score != nullptr); int num_hyps = hyps.size(); rescoring_score->resize(num_hyps, 0.0f); if (num_hyps == 0) return; VLOG(2) << "num hyps: " << num_hyps; if (encoder_outs_.size() == 0) { // no encoder outs std::cerr << "encoder_outs_.size() is zero. Please check it." << std::endl; return; } // prepare input paddle::Tensor hyps_lens = paddle::zeros({num_hyps}, paddle::DataType::INT64); int64_t* hyps_len_ptr = hyps_lens.mutable_data(); int max_hyps_len = 0; for (size_t i = 0; i < num_hyps; ++i) { int len = hyps[i].size() + 1; // eos max_hyps_len = std::max(max_hyps_len, len); hyps_len_ptr[i] = static_cast(len); } VLOG(2) << "max_hyps_len: " << max_hyps_len; paddle::Tensor hyps_tensor = paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); int64_t* hyps_ptr = hyps_tensor.mutable_data(); for (size_t i = 0; i < num_hyps; ++i) { const std::vector& hyp = hyps[i]; int64_t* row = hyps_ptr + max_hyps_len * i; row[0] = sos_; for (size_t j = 0; j < hyp.size(); ++j) { row[j + 1] = hyp[j]; } } #ifndef NDEBUG { std::stringstream path("encoder_logits_concat", std::ios_base::app | std::ios_base::out); for (int j = 0; j < encoder_outs_.size(); j++) { path << j; std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); CHECK(logits_out_fobj.is_open()); logits_out_fobj << encoder_outs_[j].shape()[0] << " " << encoder_outs_[j].shape()[1] << " " << encoder_outs_[j].shape()[2] << "\n"; const float* encoder_outs_ptr = encoder_outs_[j].data(); for (int i = 0; i < encoder_outs_[j].numel(); i++) { logits_out_fobj << encoder_outs_ptr[i] << " "; } logits_out_fobj << "\n"; } } #endif // end TEST_DEBUG // forward attention decoder by hyps and correspoinding encoder_outs_ paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); #ifndef NDEBUG { std::stringstream path("encoder_out0", std::ios_base::app | std::ios_base::out); std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); CHECK(encoder_out_fobj.is_open()); encoder_out_fobj << encoder_outs_[0].shape()[0] << " " << encoder_outs_[0].shape()[1] << " " << encoder_outs_[0].shape()[2] << "\n"; const float* enc_logprob_ptr = encoder_outs_[0].data(); size_t size = encoder_outs_[0].numel(); for (int i = 0; i < size; i++) { encoder_out_fobj << enc_logprob_ptr[i] << "\n"; } } #endif // end TEST_DEBUG #ifndef NDEBUG { std::stringstream path("encoder_out", std::ios_base::app | std::ios_base::out); std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); CHECK(encoder_out_fobj.is_open()); encoder_out_fobj << encoder_out.shape()[0] << " " << encoder_out.shape()[1] << " " << encoder_out.shape()[2] << "\n"; const float* enc_logprob_ptr = encoder_out.data(); size_t size = encoder_out.numel(); for (int i = 0; i < size; i++) { encoder_out_fobj << enc_logprob_ptr[i] << "\n"; } } #endif // end TEST_DEBUG std::vector inputs{ hyps_tensor, hyps_lens, encoder_out}; std::vector outputs = forward_attention_decoder_(inputs); CHECK_EQ(outputs.size(), 2); // (B, Umax, V) paddle::Tensor probs = outputs[0]; std::vector probs_shape = probs.shape(); CHECK_EQ(probs_shape.size(), 3); CHECK_EQ(probs_shape[0], num_hyps); CHECK_EQ(probs_shape[1], max_hyps_len); #ifndef NDEBUG { std::stringstream path("decoder_logprob", std::ios_base::app | std::ios_base::out); std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); CHECK(dec_logprob_fobj.is_open()); dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " << probs.shape()[2] << "\n"; const float* dec_logprob_ptr = probs.data(); size_t size = probs.numel(); for (int i = 0; i < size; i++) { dec_logprob_fobj << dec_logprob_ptr[i] << "\n"; } } #endif // end TEST_DEBUG #ifndef NDEBUG { std::stringstream path("hyps_lens", std::ios_base::app | std::ios_base::out); std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out); CHECK(hyps_len_fobj.is_open()); const int64_t* hyps_lens_ptr = hyps_lens.data(); size_t size = hyps_lens.numel(); for (int i = 0; i < size; i++) { hyps_len_fobj << hyps_lens_ptr[i] << "\n"; } } #endif // end TEST_DEBUG #ifndef NDEBUG { std::stringstream path("hyps_tensor", std::ios_base::app | std::ios_base::out); std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); CHECK(hyps_tensor_fobj.is_open()); const int64_t* hyps_tensor_ptr = hyps_tensor.data(); size_t size = hyps_tensor.numel(); for (int i = 0; i < size; i++) { hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; } } #endif // end TEST_DEBUG paddle::Tensor r_probs = outputs[1]; std::vector r_probs_shape = r_probs.shape(); if (is_bidecoder_ && reverse_weight > 0) { CHECK_EQ(r_probs_shape.size(), 3); CHECK_EQ(r_probs_shape[0], num_hyps); CHECK_EQ(r_probs_shape[1], max_hyps_len); } else { // dump r_probs CHECK_EQ(r_probs_shape.size(), 1); CHECK_EQ(r_probs_shape[0], 1) << r_probs_shape[0]; } // compute rescoring score using IntArray = paddle::experimental::IntArray; std::vector probs_v = paddle::experimental::split_with_num(probs, num_hyps, 0); VLOG(2) << "split prob: " << probs_v.size() << " " << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; CHECK(static_cast(probs_v.size()) == num_hyps) << ": is " << probs_v.size() << " expect: " << num_hyps; std::vector r_probs_v; if (is_bidecoder_ && reverse_weight > 0) { r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); CHECK(static_cast(r_probs_v.size()) == num_hyps) << "r_probs_v size: is " << r_probs_v.size() << " expect: " << num_hyps; } for (int i = 0; i < num_hyps; ++i) { const std::vector& hyp = hyps[i]; // left-to-right decoder score float score = 0.0f; score = ComputePathScore(probs_v[i], hyp, eos_); // right-to-left decoder score float r_score = 0.0f; if (is_bidecoder_ && reverse_weight > 0) { std::vector r_hyp(hyp.size()); std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); } // combinded left-to-right and right-to-lfet score (*rescoring_score)[i] = score * (1 - reverse_weight) + r_score * reverse_weight; VLOG(3) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i]; } } void U2Nnet::EncoderOuts( std::vector>* encoder_out) const { // list of (B=1,T,D) int size = encoder_outs_.size(); VLOG(3) << "encoder_outs_ size: " << size; for (int i = 0; i < size; i++) { const paddle::Tensor& item = encoder_outs_[i]; const std::vector shape = item.shape(); CHECK_EQ(shape.size(), 3); const int& B = shape[0]; const int& T = shape[1]; const int& D = shape[2]; CHECK(B == 1) << "Only support batch one."; VLOG(3) << "encoder out " << i << " shape: (" << B << "," << T << "," << D << ")"; const float* this_tensor_ptr = item.data(); for (int j = 0; j < T; j++) { const float* cur = this_tensor_ptr + j * D; std::vector out(D); std::memcpy(out.data(), cur, D * sizeof(kaldi::BaseFloat)); encoder_out->emplace_back(out); } } } } // namespace ppspeech