add nnetout struct

pull/2524/head
Hui Zhang 2 years ago
parent 290c23b9d7
commit cd1ced4ea0

@ -1,5 +1,5 @@
#!/bin/bash
set +x
set -x
set -e
. path.sh
@ -11,7 +11,7 @@ stop_stage=100
. utils/parse_options.sh
# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
if [ ! -d ${SPEECHX_BUILD} ]; then
pushd ${SPEECHX_ROOT}
bash build.sh
popd

@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
# Extra settings for the nnet target, applied only when USING_U2 is set:
# adds the Paddle compile flags and the pybind11 / project-root include paths.
if(USING_U2)
target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS})
target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
# The link step against the Python / Paddle libraries below is commented out.
# target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif()

@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
// for debug
// Stores a caller-supplied likelihood matrix in the nnet output cache and
// adds its row count to frames_ready_.
// NOTE(review): this listing shows assignments to both nnet_cache_ and
// nnet_out_cache_; this looks like the old and new line of a rename shown
// side by side — confirm that only the nnet_out_cache_ assignment is live.
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
nnet_cache_ = likelihood;
nnet_out_cache_ = likelihood;
frames_ready_ += likelihood.NumRows();
}
@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
// Converts a token id into an nnet output index by subtracting 1.
int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; }
// Returns acoustic_scale_ * log(cached nnet output + epsilon) for the given
// frame and token index.
// The cache row is `frame - frame_offset_`; the cache column is
// TokenId2NnetId(index).
// NOTE(review): both CHECK_LE bounds accept equality, and index == 0 maps to
// column -1; frame == frames_ready_ may also address a row past the cache —
// confirm whether stricter checks are intended.
// NOTE(review): lines referring to nnet_cache_ and nnet_out_cache_ both appear
// here; presumably the old and new side of a rename — confirm.
BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
CHECK_LE(index, nnet_cache_.NumCols());
CHECK_LE(index, nnet_out_cache_.NumCols());
CHECK_LE(frame, frames_ready_);
int32 frame_idx = frame - frame_offset_;
// the nnet output is a prob rather than a log prob, so take the log here
// the column is index - 1 (see TokenId2NnetId); presumably because ilabels
// start at 1 — TODO confirm
return acoustic_scale_ *
std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) +
std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) +
// smallest positive normalized float, added so a cached value of 0 does not
// produce log(0)
std::numeric_limits<float>::min());
}
@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
}
// forward feats
int32 vocab_dim = 0;
Vector<BaseFloat> probs;
nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim);
NnetOut out;
nnet_->FeedForward(features, frontend_->Dim(), &out);
int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& probs = out.logprobs;
// cache nnet outputs
nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
nnet_cache_.CopyRowsFromVec(probs);
nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
nnet_out_cache_.CopyRowsFromVec(probs);
// update state
frame_offset_ = frames_ready_;
frames_ready_ += nnet_cache_.NumRows();
frames_ready_ += nnet_out_cache_.NumRows();
return true;
}
@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
return false;
}
int vocab_size = nnet_cache_.NumCols();
int vocab_size = nnet_out_cache_.NumCols();
likelihood->resize(vocab_size);
for (int32 idx = 0; idx < vocab_size; ++idx) {
(*likelihood)[idx] =
nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_;
nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
}
return true;
}
@ -117,7 +118,7 @@ void Decodable::Reset() {
if (nnet_ != nullptr) nnet_->Reset();
frame_offset_ = 0;
frames_ready_ = 0;
nnet_cache_.Resize(0, 0);
nnet_out_cache_.Resize(0, 0);
}
} // namespace ppspeech

@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
std::shared_ptr<NnetInterface> nnet_;
// nnet outputs' cache
kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_;
// a frame here is an nnet output (prob) frame rather than an audio feature frame;
// nnet frames are a subsampling of the feature frames

@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
}
void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
int32 feature_dim,
Vector<BaseFloat>* inferences,
int32* inference_dim) {
const int32& feature_dim,
NnetOut* out) {
paddle_infer::Predictor* predictor = GetPredictor();
int feat_row = features.Dim() / feature_dim;
@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
std::vector<int> output_shape = output_tensor->shape();
int32 row = output_shape[1];
int32 col = output_shape[2];
inferences->Resize(row * col);
*inference_dim = col;
output_tensor->CopyToCpu(inferences->Data());
// inferences->Resize(row * col);
// *inference_dim = col;
out->logprobs.Resize(row*col);
out->vocab_dim = col;
output_tensor->CopyToCpu(out->logprobs.Data());
ReleasePredictor(predictor);
}

@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
PaddleNnet(const ModelOptions& opts);
virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim);
const int32& feature_dim,
NnetOut* out);
void Dim();
virtual void Reset();

@ -21,12 +21,23 @@
namespace ppspeech {
// Bundle of values that a FeedForward call writes for its caller.
struct NnetOut{
// nnet output values stored as one flat vector; despite the name they may be
// log-probs or probs
kaldi::Vector<kaldi::BaseFloat> logprobs;
// number of columns per output row; the constructor sets it to -1
int32 vocab_dim;
// nnet state. Only used in attention models.
std::vector<std::vector<kaldi::BaseFloat>> encoder_outs;
// Initializes logprobs and encoder_outs from empty braces and vocab_dim to -1.
NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {}
};
// Abstract interface for nnet backends: a forward pass and a reset hook.
class NnetInterface {
public:
// Runs a forward pass over `features` and writes the result for the caller.
// `features` is presumably a flat buffer with feature_dim values per frame —
// TODO confirm against implementations.
// NOTE(review): this listing shows two parameter lists (feature_dim by value
// with inferences / inference_dim, and feature_dim by const reference with
// NnetOut* out); presumably the old and new signature side by side — confirm.
virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim) = 0;
const int32& feature_dim,
NnetOut* out) = 0;
// Reset hook; its exact semantics are defined by each implementation.
virtual void Reset() = 0;
// Virtual destructor so implementations can be destroyed through this type.
virtual ~NnetInterface() {}
};

@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
void U2NnetBase::ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) {
ctc_probs->clear();
@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
// Forward pass entry point: copies `features` into a std::vector, runs
// ForwardEncoderChunkImpl on it, then copies the resulting ctc_probs into the
// output vector and reports the vocab dimension through the output argument.
// NOTE(review): this listing shows both an (inferences, inference_dim) form
// and an (out) form of the parameters and body lines; presumably the old and
// new side of the same change — confirm which is live.
void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<BaseFloat>* inferences,
int32* inference_dim) {
const int32& feature_dim,
NnetOut* out) {
// copy the kaldi vector into the std::vector type the encoder call takes
std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
features.Data() + features.Dim());
std::vector<kaldi::BaseFloat> ctc_probs;
ForwardEncoderChunkImpl(
chunk_feats, feature_dim, &ctc_probs, inference_dim);
inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
std::memcpy(inferences->Data(),
chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim);
// size the destination to ctc_probs.size() elements, then copy the raw bytes
out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero);
std::memcpy(out->logprobs.Data(),
ctc_probs.data(),
ctc_probs.size() * sizeof(kaldi::BaseFloat));
}
@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
void U2Nnet::ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
const int32& feat_dim,
std::vector<kaldi::BaseFloat>* out_prob,
int32* vocab_dim) {
#ifdef USE_PROFILING
RecordEvent event(
"ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);

@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
virtual void ForwardEncoderChunk(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim);
@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
protected:
virtual void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) = 0;
@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
// case. Otherwise, non-streaming case
int num_left_chunks_{-1}; // -1 means all left chunks
// asr decoder state
// asr decoder state, not used in nnet
int offset_{0}; // current offset in encoder output time stamp. Used by
// position embedding.
std::vector<std::vector<float>> cached_feats_{}; // features cache
@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
U2Nnet(const U2Nnet& other);
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
int32 feature_dim,
kaldi::Vector<kaldi::BaseFloat>* inferences,
int32* inference_dim) override;
const int32& feature_dim,
NnetOut* out) override;
void Reset() override;
@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
int32 feat_dim,
const int32& feat_dim,
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim) override;
@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
// debug
void FeedEncoderOuts(paddle::Tensor& encoder_out);
const std::vector<paddle::Tensor>& EncoderOuts() const {return encoder_outs_; }
private:
U2ModelOptions opts_;

Loading…
Cancel
Save