From cd1ced4ea0f9f85835a63b7afd2b47f8f14a963f Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 11 Oct 2022 06:43:07 +0000
Subject: [PATCH] add NnetOut struct to carry nnet FeedForward outputs

---
 speechx/examples/ds2_ol/aishell/run.sh |  4 ++--
 speechx/speechx/nnet/CMakeLists.txt    |  1 -
 speechx/speechx/nnet/decodable.cc      | 25 +++++++++++++------------
 speechx/speechx/nnet/decodable.h       |  2 +-
 speechx/speechx/nnet/ds2_nnet.cc       | 15 +++++++++------
 speechx/speechx/nnet/ds2_nnet.h        |  5 ++---
 speechx/speechx/nnet/nnet_itf.h        | 17 ++++++++++++++---
 speechx/speechx/nnet/u2_nnet.cc        | 18 ++++++++++--------
 speechx/speechx/nnet/u2_nnet.h         | 15 ++++++++-------
 9 files changed, 59 insertions(+), 43 deletions(-)
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 82e889ce5..a29be17bf 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-set +x
+set -x
 set -e
 
 . path.sh
@@ -11,7 +11,7 @@ stop_stage=100
 . utils/parse_options.sh
 
 # 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+if [ ! -d ${SPEECHX_BUILD} ]; then
     pushd ${SPEECHX_ROOT} 
     bash build.sh
     popd
diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt
index 2a1812fdf..435666163 100644
--- a/speechx/speechx/nnet/CMakeLists.txt
+++ b/speechx/speechx/nnet/CMakeLists.txt
@@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
 if(USING_U2)
   target_compile_options(nnet  PUBLIC ${PADDLE_COMPILE_FLAGS})
   target_include_directories(nnet  PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
-  # target_link_libraries(nnet  ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
 endif()
 
 
diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc
index 7780e5ae6..40fac182f 100644
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
 
 // for debug
 void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
-    nnet_cache_ = likelihood;
+    nnet_out_cache_ = likelihood;
     frames_ready_ += likelihood.NumRows();
 }
 
@@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
 int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; }
 
 BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
-    CHECK_LE(index, nnet_cache_.NumCols());
-    CHECK_LE(frame, frames_ready_);
+    CHECK_LE(index, nnet_out_cache_.NumCols());
+    CHECK_LT(frame, frames_ready_);
     int32 frame_idx = frame - frame_offset_;
     // the nnet output is prob ranther than log prob
     // the index - 1, because the ilabel
     return acoustic_scale_ *
-           std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) +
+           std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) +
                     std::numeric_limits<float>::min());
 }
 
@@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
     }
 
     // forward feats
-    int32 vocab_dim = 0;
-    Vector<BaseFloat> probs;
-    nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim);
+    NnetOut out;
+    nnet_->FeedForward(features, frontend_->Dim(), &out);
+    int32& vocab_dim = out.vocab_dim;
+    Vector<BaseFloat>& probs = out.logprobs;
 
     // cache nnet outupts
-    nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
-    nnet_cache_.CopyRowsFromVec(probs);
+    nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
+    nnet_out_cache_.CopyRowsFromVec(probs);
 
     // update state
     frame_offset_ = frames_ready_;
-    frames_ready_ += nnet_cache_.NumRows();
+    frames_ready_ += nnet_out_cache_.NumRows();
     return true;
 }
 
@@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
         return false;
     }
 
-    int vocab_size = nnet_cache_.NumCols();
+    int vocab_size = nnet_out_cache_.NumCols();
     likelihood->resize(vocab_size);
 
     for (int32 idx = 0; idx < vocab_size; ++idx) {
         (*likelihood)[idx] =
-            nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_;
+            nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
     }
     return true;
 }
@@ -117,7 +118,7 @@ void Decodable::Reset() {
     if (nnet_ != nullptr) nnet_->Reset();
     frame_offset_ = 0;
     frames_ready_ = 0;
-    nnet_cache_.Resize(0, 0);
+    nnet_out_cache_.Resize(0, 0);
 }
 
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h
index 241d04198..8786e4f20 100644
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
     std::shared_ptr<NnetInterface> nnet_;
 
     // nnet outputs' cache
-    kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
+    kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_;
 
     // the frame is nnet prob frame rather than audio feature frame
     // nnet frame subsample the feature frame
diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc
index a89c0f20e..c6add03c3 100644
--- a/speechx/speechx/nnet/ds2_nnet.cc
+++ b/speechx/speechx/nnet/ds2_nnet.cc
@@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
 }
 
 void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
-                             int32 feature_dim,
-                             Vector<BaseFloat>* inferences,
-                             int32* inference_dim) {
+                             const int32& feature_dim,
+                             NnetOut* out) {
     paddle_infer::Predictor* predictor = GetPredictor();
 
     int feat_row = features.Dim() / feature_dim;
@@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
     std::vector<int> output_shape = output_tensor->shape();
     int32 row = output_shape[1];
     int32 col = output_shape[2];
-    inferences->Resize(row * col);
-    *inference_dim = col;
-    output_tensor->CopyToCpu(inferences->Data());
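+
+    // Copy the row x col output (output_shape[1] x output_shape[2]) into
+    // out->logprobs as one flat vector; out->vocab_dim records the width of
+    // each row so callers can restore the 2-D layout.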
+    out->logprobs.Resize(row*col);
+    out->vocab_dim = col;
+    output_tensor->CopyToCpu(out->logprobs.Data());
 
     ReleasePredictor(predictor);
 }
diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h
index e2b3d5bc4..717bdb721 100644
--- a/speechx/speechx/nnet/ds2_nnet.h
+++ b/speechx/speechx/nnet/ds2_nnet.h
@@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
     PaddleNnet(const ModelOptions& opts);
 
     virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                             int32 feature_dim,
-                             kaldi::Vector<kaldi::BaseFloat>* inferences,
-                             int32* inference_dim);
+                             const int32& feature_dim,
+                             NnetOut* out);
 
     void Dim();
     virtual void Reset();
diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h
index ac040fbaa..12fe3c272 100644
--- a/speechx/speechx/nnet/nnet_itf.h
+++ b/speechx/speechx/nnet/nnet_itf.h
@@ -21,12 +21,23 @@
 
 namespace ppspeech {
 
+struct NnetOut {
+  // Flattened nnet output, frames x vocab_dim; may hold log-probs or probs.
+  kaldi::Vector<kaldi::BaseFloat> logprobs;
+  int32 vocab_dim;  // width of one output frame; -1 until FeedForward sets it
+
+  // nnet state. Only used by attention models.
+  std::vector<std::vector<kaldi::BaseFloat>> encoder_outs;
+
+  NnetOut() : logprobs(), vocab_dim(-1), encoder_outs() {}
+};
+
+
 class NnetInterface {
   public:
     virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                             int32 feature_dim,
-                             kaldi::Vector<kaldi::BaseFloat>* inferences,
-                             int32* inference_dim) = 0;
+                             const int32& feature_dim,
+                             NnetOut* out) = 0;
     virtual void Reset() = 0;
     virtual ~NnetInterface() {}
 };
diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc
index 67ef0952a..26d7da8f9 100644
--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
 
 void U2NnetBase::ForwardEncoderChunk(
     const std::vector<kaldi::BaseFloat>& chunk_feats,
-    int32 feat_dim,
+    const int32& feat_dim,
     std::vector<kaldi::BaseFloat>* ctc_probs,
     int32* vocab_dim) {
     ctc_probs->clear();
@@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
 
 
 void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
-                         int32 feature_dim,
-                         kaldi::Vector<BaseFloat>* inferences,
-                         int32* inference_dim) {
+                         const int32& feature_dim,
+                         NnetOut* out) {
     std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
                                               features.Data() + features.Dim());
+
     std::vector<kaldi::BaseFloat> ctc_probs;
     ForwardEncoderChunkImpl(
-        chunk_feats, feature_dim, &ctc_probs, inference_dim);
-    inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
-    std::memcpy(inferences->Data(),
+        chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim);
+
+    out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero);
+    std::memcpy(out->logprobs.Data(),
                 ctc_probs.data(),
                 ctc_probs.size() * sizeof(kaldi::BaseFloat));
 }
@@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
 
 void U2Nnet::ForwardEncoderChunkImpl(
     const std::vector<kaldi::BaseFloat>& chunk_feats,
-    int32 feat_dim,
+    const int32& feat_dim,
     std::vector<kaldi::BaseFloat>* out_prob,
     int32* vocab_dim) {
+
 #ifdef USE_PROFILING
     RecordEvent event(
         "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);
diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h
index ddc85b45f..874429599 100644
--- a/speechx/speechx/nnet/u2_nnet.h
+++ b/speechx/speechx/nnet/u2_nnet.h
@@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
 
     virtual void ForwardEncoderChunk(
         const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
         std::vector<kaldi::BaseFloat>* ctc_probs,
         int32* vocab_dim);
 
@@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
   protected:
     virtual void ForwardEncoderChunkImpl(
         const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
         std::vector<kaldi::BaseFloat>* ctc_probs,
         int32* vocab_dim) = 0;
 
@@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
                           // case. Otherwise, none streaming case
     int num_left_chunks_{-1};  // -1 means all left chunks
 
-    // asr decoder state
+    // asr decoder state, not used in nnet
     int offset_{0};  // current offset in encoder output time stamp. Used by
                      // position embedding.
     std::vector<std::vector<float>> cached_feats_{};  // features cache
@@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
     U2Nnet(const U2Nnet& other);
 
     void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                     int32 feature_dim,
-                     kaldi::Vector<kaldi::BaseFloat>* inferences,
-                     int32* inference_dim) override;
+                     const int32& feature_dim,
+                     NnetOut* out) override;
 
     void Reset() override;
 
@@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
 
     void ForwardEncoderChunkImpl(
         const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
         std::vector<kaldi::BaseFloat>* ctc_probs,
         int32* vocab_dim) override;
 
@@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
     // debug
     void FeedEncoderOuts(paddle::Tensor& encoder_out);
 
+    const std::vector<paddle::Tensor>& EncoderOuts() const {return encoder_outs_; }
+
   private:
     U2ModelOptions opts_;