diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 82e889ce5..a29be17bf 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -set +x +set -x set -e . path.sh @@ -11,7 +11,7 @@ stop_stage=100 . utils/parse_options.sh # 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then +if [ ! -d ${SPEECHX_BUILD} ]; then pushd ${SPEECHX_ROOT} bash build.sh popd diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt index 2a1812fdf..435666163 100644 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings) if(USING_U2) target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) - # target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 7780e5ae6..40fac182f 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr& nnet, // for debug void Decodable::Acceptlikelihood(const Matrix& likelihood) { - nnet_cache_ = likelihood; + nnet_out_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); } @@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_cache_.NumCols()); + CHECK_LE(index, nnet_out_cache_.NumCols()); CHECK_LE(frame, frames_ready_); int32 frame_idx = frame - frame_offset_; // the nnet output is prob ranther than log prob // the index - 1, because the ilabel return acoustic_scale_ * - std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + + std::log(nnet_out_cache_(frame_idx, 
TokenId2NnetId(index)) + std::numeric_limits::min()); } @@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() { } // forward feats - int32 vocab_dim = 0; - Vector probs; - nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); + NnetOut out; + nnet_->FeedForward(features, frontend_->Dim(), &out); + int32& vocab_dim = out.vocab_dim; + Vector& probs = out.logprobs; // cache nnet outupts - nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); - nnet_cache_.CopyRowsFromVec(probs); + nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(probs); // update state frame_offset_ = frames_ready_; - frames_ready_ += nnet_cache_.NumRows(); + frames_ready_ += nnet_out_cache_.NumRows(); return true; } @@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { return false; } - int vocab_size = nnet_cache_.NumCols(); + int vocab_size = nnet_out_cache_.NumCols(); likelihood->resize(vocab_size); for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = - nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; + nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; } return true; } @@ -117,7 +118,7 @@ void Decodable::Reset() { if (nnet_ != nullptr) nnet_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_cache_.Resize(0, 0); + nnet_out_cache_.Resize(0, 0); } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h index 241d04198..8786e4f20 100644 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface { std::shared_ptr nnet_; // nnet outputs' cache - kaldi::Matrix nnet_cache_; + kaldi::Matrix nnet_out_cache_; // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc index 
a89c0f20e..c6add03c3 100644 --- a/speechx/speechx/nnet/ds2_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -143,9 +143,8 @@ shared_ptr> PaddleNnet::GetCacheEncoder(const string& name) { } void PaddleNnet::FeedForward(const Vector& features, - int32 feature_dim, - Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { paddle_infer::Predictor* predictor = GetPredictor(); int feat_row = features.Dim() / feature_dim; @@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector& features, std::vector output_shape = output_tensor->shape(); int32 row = output_shape[1]; int32 col = output_shape[2]; - inferences->Resize(row * col); - *inference_dim = col; - output_tensor->CopyToCpu(inferences->Data()); + + + // inferences->Resize(row * col); + // *inference_dim = col; + out->logprobs.Resize(row*col); + out->vocab_dim = col; + output_tensor->CopyToCpu(out->logprobs.Data()); ReleasePredictor(predictor); } diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h index e2b3d5bc4..717bdb721 100644 --- a/speechx/speechx/nnet/ds2_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface { PaddleNnet(const ModelOptions& opts); virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim); + const int32& feature_dim, + NnetOut* out); void Dim(); virtual void Reset(); diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h index ac040fbaa..12fe3c272 100644 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -21,12 +21,23 @@ namespace ppspeech { +struct NnetOut{ + // nnet output, may be logprob or prob + kaldi::Vector logprobs; + int32 vocab_dim; + + // nnet state. Only used by the attention model. 
+ std::vector> encoder_outs; + + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} +}; + + class NnetInterface { public: virtual void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) = 0; + const int32& feature_dim, + NnetOut* out) = 0; virtual void Reset() = 0; virtual ~NnetInterface() {} }; diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc index 67ef0952a..26d7da8f9 100644 --- a/speechx/speechx/nnet/u2_nnet.cc +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector& chunk_feats, void U2NnetBase::ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) { ctc_probs->clear(); @@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) { void U2Nnet::FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { std::vector chunk_feats(features.Data(), features.Data() + features.Dim()); + std::vector ctc_probs; ForwardEncoderChunkImpl( - chunk_feats, feature_dim, &ctc_probs, inference_dim); - inferences->Resize(ctc_probs.size(), kaldi::kSetZero); - std::memcpy(inferences->Data(), + chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim); + + out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(out->logprobs.Data(), ctc_probs.data(), ctc_probs.size() * sizeof(kaldi::BaseFloat)); } @@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector& features, void U2Nnet::ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* out_prob, int32* vocab_dim) { + #ifdef USE_PROFILING RecordEvent event( "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h index 
ddc85b45f..874429599 100644 --- a/speechx/speechx/nnet/u2_nnet.h +++ b/speechx/speechx/nnet/u2_nnet.h @@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface { virtual void ForwardEncoderChunk( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim); @@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface { protected: virtual void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) = 0; @@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface { // case. Otherwise, none streaming case int num_left_chunks_{-1}; // -1 means all left chunks - // asr decoder state + // asr decoder state, not used in nnet int offset_{0}; // current offset in encoder output time stamp. Used by // position embedding. std::vector> cached_feats_{}; // features cache @@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase { U2Nnet(const U2Nnet& other); void FeedForward(const kaldi::Vector& features, - int32 feature_dim, - kaldi::Vector* inferences, - int32* inference_dim) override; + const int32& feature_dim, + NnetOut* out) override; void Reset() override; @@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase { void ForwardEncoderChunkImpl( const std::vector& chunk_feats, - int32 feat_dim, + const int32& feat_dim, std::vector* ctc_probs, int32* vocab_dim) override; @@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase { // debug void FeedEncoderOuts(paddle::Tensor& encoder_out); + const std::vector& EncoderOuts() const {return encoder_outs_; } + private: U2ModelOptions opts_;