add nnetout struct

pull/2524/head
Hui Zhang 2 years ago
parent 290c23b9d7
commit cd1ced4ea0

@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
set +x set -x
set -e set -e
. path.sh . path.sh
@ -11,7 +11,7 @@ stop_stage=100
. utils/parse_options.sh . utils/parse_options.sh
# 1. compile # 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then if [ ! -d ${SPEECHX_BUILD} ]; then
pushd ${SPEECHX_ROOT} pushd ${SPEECHX_ROOT}
bash build.sh bash build.sh
popd popd

@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
if(USING_U2) if(USING_U2)
target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
# target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif() endif()

@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
// for debug // for debug
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) { void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
nnet_cache_ = likelihood; nnet_out_cache_ = likelihood;
frames_ready_ += likelihood.NumRows(); frames_ready_ += likelihood.NumRows();
} }
@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; }
BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
CHECK_LE(index, nnet_cache_.NumCols()); CHECK_LE(index, nnet_out_cache_.NumCols());
CHECK_LE(frame, frames_ready_); CHECK_LE(frame, frames_ready_);
int32 frame_idx = frame - frame_offset_; int32 frame_idx = frame - frame_offset_;
// the nnet output is prob rather than log prob // the nnet output is prob rather than log prob
// the index - 1, because the ilabel // the index - 1, because the ilabel
return acoustic_scale_ * return acoustic_scale_ *
std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) +
std::numeric_limits<float>::min()); std::numeric_limits<float>::min());
} }
@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
} }
// forward feats // forward feats
int32 vocab_dim = 0; NnetOut out;
Vector<BaseFloat> probs; nnet_->FeedForward(features, frontend_->Dim(), &out);
nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim); int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& probs = out.logprobs;
// cache nnet outputs // cache nnet outputs
nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim); nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
nnet_cache_.CopyRowsFromVec(probs); nnet_out_cache_.CopyRowsFromVec(probs);
// update state // update state
frame_offset_ = frames_ready_; frame_offset_ = frames_ready_;
frames_ready_ += nnet_cache_.NumRows(); frames_ready_ += nnet_out_cache_.NumRows();
return true; return true;
} }
@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
return false; return false;
} }
int vocab_size = nnet_cache_.NumCols(); int vocab_size = nnet_out_cache_.NumCols();
likelihood->resize(vocab_size); likelihood->resize(vocab_size);
for (int32 idx = 0; idx < vocab_size; ++idx) { for (int32 idx = 0; idx < vocab_size; ++idx) {
(*likelihood)[idx] = (*likelihood)[idx] =
nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
} }
return true; return true;
} }
@ -117,7 +118,7 @@ void Decodable::Reset() {
if (nnet_ != nullptr) nnet_->Reset(); if (nnet_ != nullptr) nnet_->Reset();
frame_offset_ = 0; frame_offset_ = 0;
frames_ready_ = 0; frames_ready_ = 0;
nnet_cache_.Resize(0, 0); nnet_out_cache_.Resize(0, 0);
} }
} // namespace ppspeech } // namespace ppspeech

@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
std::shared_ptr<NnetInterface> nnet_; std::shared_ptr<NnetInterface> nnet_;
// nnet outputs' cache // nnet outputs' cache
kaldi::Matrix<kaldi::BaseFloat> nnet_cache_; kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_;
// the frame is nnet prob frame rather than audio feature frame // the frame is nnet prob frame rather than audio feature frame
// nnet frame subsample the feature frame // nnet frame subsample the feature frame

@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
} }
void PaddleNnet::FeedForward(const Vector<BaseFloat>& features, void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
int32 feature_dim, const int32& feature_dim,
Vector<BaseFloat>* inferences, NnetOut* out) {
int32* inference_dim) {
paddle_infer::Predictor* predictor = GetPredictor(); paddle_infer::Predictor* predictor = GetPredictor();
int feat_row = features.Dim() / feature_dim; int feat_row = features.Dim() / feature_dim;
@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
std::vector<int> output_shape = output_tensor->shape(); std::vector<int> output_shape = output_tensor->shape();
int32 row = output_shape[1]; int32 row = output_shape[1];
int32 col = output_shape[2]; int32 col = output_shape[2];
inferences->Resize(row * col);
*inference_dim = col;
output_tensor->CopyToCpu(inferences->Data()); // inferences->Resize(row * col);
// *inference_dim = col;
out->logprobs.Resize(row*col);
out->vocab_dim = col;
output_tensor->CopyToCpu(out->logprobs.Data());
ReleasePredictor(predictor); ReleasePredictor(predictor);
} }

@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
PaddleNnet(const ModelOptions& opts); PaddleNnet(const ModelOptions& opts);
virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim, const int32& feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences, NnetOut* out);
int32* inference_dim);
void Dim(); void Dim();
virtual void Reset(); virtual void Reset();

@ -21,12 +21,23 @@
namespace ppspeech { namespace ppspeech {
// Output bundle that NnetInterface::FeedForward implementations fill in.
struct NnetOut {
    // nnet output stored as one flat vector. Despite the member name, this
    // may hold log-probs or plain probs depending on the model.
    // Default-constructed, i.e. empty, until a forward pass resizes it.
    kaldi::Vector<kaldi::BaseFloat> logprobs;

    // Number of columns (vocabulary entries) per output frame in `logprobs`.
    // -1 marks "not set yet"; FeedForward overwrites it.
    int32 vocab_dim{-1};

    // nnet state. Only used by attention models.
    std::vector<std::vector<kaldi::BaseFloat>> encoder_outs;
};
class NnetInterface { class NnetInterface {
public: public:
virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim, const int32& feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences, NnetOut* out) = 0;
int32* inference_dim) = 0;
virtual void Reset() = 0; virtual void Reset() = 0;
virtual ~NnetInterface() {} virtual ~NnetInterface() {}
}; };

@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
void U2NnetBase::ForwardEncoderChunk( void U2NnetBase::ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats, const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim, const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs, std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) { int32* vocab_dim) {
ctc_probs->clear(); ctc_probs->clear();
@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features, void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
int32 feature_dim, const int32& feature_dim,
kaldi::Vector<BaseFloat>* inferences, NnetOut* out) {
int32* inference_dim) {
std::vector<kaldi::BaseFloat> chunk_feats(features.Data(), std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
features.Data() + features.Dim()); features.Data() + features.Dim());
std::vector<kaldi::BaseFloat> ctc_probs; std::vector<kaldi::BaseFloat> ctc_probs;
ForwardEncoderChunkImpl( ForwardEncoderChunkImpl(
chunk_feats, feature_dim, &ctc_probs, inference_dim); chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim);
inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
std::memcpy(inferences->Data(), out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero);
std::memcpy(out->logprobs.Data(),
ctc_probs.data(), ctc_probs.data(),
ctc_probs.size() * sizeof(kaldi::BaseFloat)); ctc_probs.size() * sizeof(kaldi::BaseFloat));
} }
@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
void U2Nnet::ForwardEncoderChunkImpl( void U2Nnet::ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats, const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim, const int32& feat_dim,
std::vector<kaldi::BaseFloat>* out_prob, std::vector<kaldi::BaseFloat>* out_prob,
int32* vocab_dim) { int32* vocab_dim) {
#ifdef USE_PROFILING #ifdef USE_PROFILING
RecordEvent event( RecordEvent event(
"ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);

@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
virtual void ForwardEncoderChunk( virtual void ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats, const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim, const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs, std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim); int32* vocab_dim);
@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
protected: protected:
virtual void ForwardEncoderChunkImpl( virtual void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats, const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim, const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs, std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) = 0; int32* vocab_dim) = 0;
@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
// case. Otherwise, non-streaming case // case. Otherwise, non-streaming case
int num_left_chunks_{-1}; // -1 means all left chunks int num_left_chunks_{-1}; // -1 means all left chunks
// asr decoder state // asr decoder state, not used in nnet
int offset_{0}; // current offset in encoder output time stamp. Used by int offset_{0}; // current offset in encoder output time stamp. Used by
// position embedding. // position embedding.
std::vector<std::vector<float>> cached_feats_{}; // features cache std::vector<std::vector<float>> cached_feats_{}; // features cache
@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
U2Nnet(const U2Nnet& other); U2Nnet(const U2Nnet& other);
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim, const int32& feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences, NnetOut* out) override;
int32* inference_dim) override;
void Reset() override; void Reset() override;
@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
void ForwardEncoderChunkImpl( void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats, const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim, const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs, std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) override; int32* vocab_dim) override;
@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
// debug // debug
void FeedEncoderOuts(paddle::Tensor& encoder_out); void FeedEncoderOuts(paddle::Tensor& encoder_out);
const std::vector<paddle::Tensor>& EncoderOuts() const {return encoder_outs_; }
private: private:
U2ModelOptions opts_; U2ModelOptions opts_;

Loading…
Cancel
Save