add profiling timer

3 years ago · 4638c35cbc
parent 67424bedc5
commit 4638c35cbc
12 changed files with 81 additions and 40 deletions
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
@ -69,20 +69,30 @@ void CTCPrefixBeamSearch::InitDecoder() { Reset(); }

 void CTCPrefixBeamSearch::AdvanceDecode(
    const std::shared_ptr<kaldi::DecodableInterface>& decodable) {
+    double search_cost = 0.0;
+    double feat_nnet_cost = 0.0;
    while (1) {
        // forward frame by frame
+        kaldi::Timer timer;
        std::vector<kaldi::BaseFloat> frame_prob;
        bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob);
+        feat_nnet_cost += timer.Elapsed();
        if (flag == false) {
-            VLOG(1) << "decoder advance decode exit." << frame_prob.size();
+            VLOG(3) << "decoder advance decode exit." << frame_prob.size();
            break;
        }

+        timer.Reset();
        std::vector<std::vector<kaldi::BaseFloat>> likelihood;
        likelihood.push_back(frame_prob);
        AdvanceDecoding(likelihood);
+        search_cost += timer.Elapsed();
+
        VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
    }
+    VLOG(1) << "AdvanceDecode feat + forward  cost: " << feat_nnet_cost
+            << " sec.";
+    VLOG(1) << "AdvanceDecode search  cost: " << search_cost << " sec.";
 }

 static bool PrefixScoreCompare(
--- a/speechx/speechx/frontend/audio/assembler.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@ -40,7 +40,9 @@ void Assembler::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {

 // pop feature chunk
 bool Assembler::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    kaldi::Timer timer;
    bool result = Compute(feats);
+    VLOG(1) << "Assembler::Read cost: " << timer.Elapsed() << " sec.";
    return result;
 }

@ -51,14 +53,14 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
        Vector<BaseFloat> feature;
        bool result = base_extractor_->Read(&feature);
        if (result == false || feature.Dim() == 0) {
-            VLOG(1) << "result: " << result
+            VLOG(3) << "result: " << result
                    << " feature dim: " << feature.Dim();
            if (IsFinished() == false) {
-                VLOG(1) << "finished reading feature. cache size: "
+                VLOG(3) << "finished reading feature. cache size: "
                        << feature_cache_.size();
                return false;
            } else {
-                VLOG(1) << "break";
+                VLOG(3) << "break";
                break;
            }
        }
@ -67,11 +69,11 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
        feature_cache_.push(feature);

        nframes_ += 1;
-        VLOG(1) << "nframes: " << nframes_;
+        VLOG(3) << "nframes: " << nframes_;
    }

    if (feature_cache_.size() < receptive_filed_length_) {
-        VLOG(1) << "feature_cache less than receptive_filed_lenght. "
+        VLOG(3) << "feature_cache less than receptive_filed_lenght. "
                << feature_cache_.size() << ": " << receptive_filed_length_;
        return false;
    }
@ -87,7 +89,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
    int32 this_chunk_size =
        std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
    feats->Resize(dim_ * this_chunk_size);
-    VLOG(1) << "read " << this_chunk_size << " feat.";
+    VLOG(3) << "read " << this_chunk_size << " feat.";

    int32 counter = 0;
    while (counter < this_chunk_size) {
--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@ -38,6 +38,7 @@ BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
 }

 void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
+    kaldi::Timer timer;
    std::unique_lock<std::mutex> lock(mutex_);
    while (size_ + waves.Dim() > ring_buffer_.size()) {
        ready_feed_condition_.wait(lock);
@ -48,11 +49,13 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
        if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
    }
    size_ += waves.Dim();
+    VLOG(1) << "AudioCache::Accept cost: " << timer.Elapsed() << " sec. "
+            << waves.Dim() << " samples.";
 }

 bool AudioCache::Read(Vector<BaseFloat>* waves) {
-    size_t chunk_size = waves->Dim();
    kaldi::Timer timer;
+    size_t chunk_size = waves->Dim();
    std::unique_lock<std::mutex> lock(mutex_);
    while (chunk_size > size_) {
        // when audio is empty and no more data feed
@ -86,9 +89,11 @@ bool AudioCache::Read(Vector<BaseFloat>* waves) {
    offset_ = (offset_ + chunk_size) % ring_buffer_.size();

    nsamples_ += chunk_size;
-    VLOG(1) << "nsamples readed: " << nsamples_;
+    VLOG(3) << "nsamples readed: " << nsamples_;

    ready_feed_condition_.notify_one();
+    VLOG(1) << "AudioCache::Read cost: " << timer.Elapsed() << " sec. "
+            << chunk_size << " samples.";
    return true;
 }

--- a/speechx/speechx/frontend/audio/cmvn.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@ -50,8 +50,11 @@ bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
    if (base_extractor_->Read(feats) == false || feats->Dim() == 0) {
        return false;
    }
+
    // appply cmvn
+    kaldi::Timer timer;
    Compute(feats);
+    VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec.";
    return true;
 }

--- a/speechx/speechx/frontend/audio/data_cache.h
+++ b/speechx/speechx/frontend/audio/data_cache.h
@ -27,27 +27,32 @@ namespace ppspeech {
 // pre-recorded audio/feature
 class DataCache : public FrontendInterface {
  public:
-    DataCache() { finished_ = false; }
+    DataCache() : finished_{false}, dim_{0} {}

    // accept waves/feats
-    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) override {
        data_ = inputs;
+        SetDim(data_.Dim());
    }

-    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    bool Read(kaldi::Vector<kaldi::BaseFloat>* feats) override {
        if (data_.Dim() == 0) {
            return false;
        }
        (*feats) = data_;
        data_.Resize(0);
+        SetDim(data_.Dim());
        return true;
    }

-    virtual void SetFinished() { finished_ = true; }
-    virtual bool IsFinished() const { return finished_; }
-    virtual size_t Dim() const { return dim_; }
+    void SetFinished() override { finished_ = true; }
+    bool IsFinished() const override { return finished_; }
+    size_t Dim() const override { return dim_; }
    void SetDim(int32 dim) { dim_ = dim; }
-    virtual void Reset() { finished_ = true; }
+    void Reset() override {
+        finished_ = true;
+        dim_ = 0;
+    }

  private:
    kaldi::Vector<kaldi::BaseFloat> data_;
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@ -34,6 +34,7 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts,
 void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
    // read inputs
    base_extractor_->Accept(inputs);
+
    // feed current data
    bool result = false;
    do {
@ -62,6 +63,7 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    feats->CopyFromVec(cache_.front());
    cache_.pop();
    ready_feed_condition_.notify_one();
+    VLOG(1) << "FeatureCache::Read cost: " << timer.Elapsed() << " sec.";
    return true;
 }

@ -72,9 +74,11 @@ bool FeatureCache::Compute() {
    bool result = base_extractor_->Read(&feature);
    if (result == false || feature.Dim() == 0) return false;

+    kaldi::Timer timer;
+
    int32 num_chunk = feature.Dim() / dim_;
    nframe_ += num_chunk;
-    VLOG(1) << "nframe computed: " << nframe_;
+    VLOG(3) << "nframe computed: " << nframe_;

    for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
        int32 start = chunk_idx * dim_;
@ -92,7 +96,10 @@ bool FeatureCache::Compute() {
        cache_.push(feature_chunk);
        ready_read_condition_.notify_one();
    }
-    return result;
+
+    VLOG(1) << "FeatureCache::Compute cost: " << timer.Elapsed() << " sec. "
+            << num_chunk << " feats.";
+    return true;
 }

 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@ -58,7 +58,7 @@ class FeatureCache : public FrontendInterface {
        std::swap(cache_, empty);
        nframe_ = 0;
        base_extractor_->Reset();
-        VLOG(1) << "feature cache reset: cache size: " << cache_.size();
+        VLOG(3) << "feature cache reset: cache size: " << cache_.size();
    }

  private:
--- a/speechx/speechx/frontend/audio/feature_common_inl.h
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@ -34,6 +34,7 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    bool flag = base_extractor_->Read(&wav);
    if (flag == false || wav.Dim() == 0) return false;

+    kaldi::Timer timer;
    // append remaned waves
    int32 wav_len = wav.Dim();
    int32 left_len = remained_wav_.Dim();
@ -52,6 +53,8 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(
        waves.Range(frame_shift * num_frames, left_samples));
+    VLOG(1) << "StreamingFeatureTpl<F>::Read cost: " << timer.Elapsed()
+            << " sec.";
    return true;
 }

--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@ -68,9 +68,10 @@ bool Decodable::AdvanceChunk() {
    Vector<BaseFloat> features;
    if (frontend_ == NULL || frontend_->Read(&features) == false) {
        // no feat or frontend_ not init.
-        VLOG(1) << "decodable exit;";
+        VLOG(3) << "decodable exit;";
        return false;
    }
+    VLOG(1) << "AdvanceChunk feat cost: " << timer.Elapsed() << " sec.";
    VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";

    // forward feats
@ -88,7 +89,8 @@ bool Decodable::AdvanceChunk() {
    // update state, decoding frame.
    frame_offset_ = frames_ready_;
    frames_ready_ += nnet_out_cache_.NumRows();
-    VLOG(2) << "Forward feat chunk cost: " << timer.Elapsed() << " sec.";
+    VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed()
+            << " sec.";
    return true;
 }

@ -115,7 +117,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
 // read one frame likelihood
 bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
    if (EnsureFrameHaveComputed(frame) == false) {
-        VLOG(1) << "framelikehood exit.";
+        VLOG(3) << "framelikehood exit.";
        return false;
    }

@ -168,7 +170,9 @@ void Decodable::Reset() {
 void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
                                   float reverse_weight,
                                   std::vector<float>* rescoring_score) {
+    kaldi::Timer timer;
    nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
+    VLOG(1) << "Attention Rescoring cost:  " << timer.Elapsed() << " sec.";
 }

 }  // namespace ppspeech
--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@ -154,7 +154,7 @@ void U2Nnet::Reset() {
        std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32));

    encoder_outs_.clear();
-    VLOG(1) << "u2nnet reset";
+    VLOG(3) << "u2nnet reset";
 }

 // Debug API
@ -168,6 +168,7 @@ void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) {
 void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
                         const int32& feature_dim,
                         NnetOut* out) {
+    kaldi::Timer timer;
    std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
                                              features.Data() + features.Dim());

@ -179,6 +180,8 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
    std::memcpy(out->logprobs.Data(),
                ctc_probs.data(),
                ctc_probs.size() * sizeof(kaldi::BaseFloat));
+    VLOG(1) << "FeedForward cost: " << timer.Elapsed() << " sec. "
+            << chunk_feats.size() / feature_dim << " frames.";
 }


@ -415,7 +418,6 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
 #ifdef USE_PROFILING
    RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1);
 #endif
-
    CHECK(rescoring_score != nullptr);

    int num_hyps = hyps.size();
@ -627,7 +629,7 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
        // combinded left-to-right and right-to-lfet score
        (*rescoring_score)[i] =
            score * (1 - reverse_weight) + r_score * reverse_weight;
-        VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score
+        VLOG(3) << "hyp " << i << " " << hyp.size() << " score: " << score
                << " r_score: " << r_score
                << " reverse_weight: " << reverse_weight
                << " final score: " << (*rescoring_score)[i];
@ -639,7 +641,7 @@ void U2Nnet::EncoderOuts(
    std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) const {
    // list of (B=1,T,D)
    int size = encoder_outs_.size();
-    VLOG(1) << "encoder_outs_ size: " << size;
+    VLOG(3) << "encoder_outs_ size: " << size;

    for (int i = 0; i < size; i++) {
        const paddle::Tensor& item = encoder_outs_[i];
@ -649,7 +651,7 @@ void U2Nnet::EncoderOuts(
        const int& T = shape[1];
        const int& D = shape[2];
        CHECK(B == 1) << "Only support batch one.";
-        VLOG(1) << "encoder out " << i << " shape: (" << B << "," << T << ","
+        VLOG(3) << "encoder out " << i << " shape: (" << B << "," << T << ","
                << D << ")";

        const float* this_tensor_ptr = item.data<float>();
--- a/speechx/speechx/recognizer/u2_recognizer.cc
+++ b/speechx/speechx/recognizer/u2_recognizer.cc
@ -67,7 +67,10 @@ void U2Recognizer::ResetContinuousDecoding() {


 void U2Recognizer::Accept(const VectorBase<BaseFloat>& waves) {
+    kaldi::Timer timer;
    feature_pipeline_->Accept(waves);
+    VLOG(1) << "feed waves cost: " << timer.Elapsed() << " sec. " << waves.Dim()
+            << " samples.";
 }


@ -78,9 +81,7 @@ void U2Recognizer::Decode() {

 void U2Recognizer::Rescoring() {
    // Do attention Rescoring
-    kaldi::Timer timer;
    AttentionRescoring();
-    VLOG(1) << "Rescoring cost latency: " << timer.Elapsed() << " sec.";
 }

 void U2Recognizer::UpdateResult(bool finish) {
@ -181,15 +182,13 @@ void U2Recognizer::AttentionRescoring() {
        return;
    }

-    kaldi::Timer timer;
    std::vector<float> rescoring_score;
    decodable_->AttentionRescoring(
        hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score);
-    VLOG(1) << "Attention Rescoring takes " << timer.Elapsed() << " sec.";

    // combine ctc score and rescoring score
    for (size_t i = 0; i < num_hyps; i++) {
-        VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
+        VLOG(3) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
                << " ctc_score: " << result_[i].score
                << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight
                << " ctc_weight: " << opts_.decoder_opts.ctc_weight;
@ -197,12 +196,12 @@ void U2Recognizer::AttentionRescoring() {
            opts_.decoder_opts.rescoring_weight * rescoring_score[i] +
            opts_.decoder_opts.ctc_weight * result_[i].score;

-        VLOG(1) << "hyp: " << result_[0].sentence
+        VLOG(3) << "hyp: " << result_[0].sentence
                << " score: " << result_[0].score;
    }

    std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc);
-    VLOG(1) << "result: " << result_[0].sentence
+    VLOG(3) << "result: " << result_[0].sentence
            << " score: " << result_[0].score;
 }

--- a/speechx/speechx/recognizer/u2_recognizer_main.cc
+++ b/speechx/speechx/recognizer/u2_recognizer_main.cc
@ -31,6 +31,7 @@ int main(int argc, char* argv[]) {

    int32 num_done = 0, num_err = 0;
    double tot_wav_duration = 0.0;
+    double tot_decode_time = 0.0;

    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
        FLAGS_wav_rspecifier);
@ -47,9 +48,7 @@ int main(int argc, char* argv[]) {
        ppspeech::U2RecognizerResource::InitFromFlags();
    ppspeech::U2Recognizer recognizer(resource);

-    kaldi::Timer timer;
    for (; !wav_reader.Done(); wav_reader.Next()) {
-        kaldi::Timer local_timer;
        std::string utt = wav_reader.Key();
        const kaldi::WaveData& wave_data = wav_reader.Value();
        LOG(INFO) << "utt: " << utt;
@ -65,6 +64,8 @@ int main(int argc, char* argv[]) {

        int sample_offset = 0;
        int cnt = 0;
+        kaldi::Timer timer;
+        kaldi::Timer local_timer;

        while (sample_offset < tot_samples) {
            int cur_chunk_size =
@ -95,6 +96,8 @@ int main(int argc, char* argv[]) {
        // second pass decoding
        recognizer.Rescoring();

+        tot_decode_time += timer.Elapsed();
+
        std::string result = recognizer.GetFinalResult();

        recognizer.Reset();
@ -115,10 +118,8 @@ int main(int argc, char* argv[]) {
        ++num_done;
    }

-    double elapsed = timer.Elapsed();
-
    LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done);
-    LOG(INFO) << "total cost:" << elapsed << " sec";
    LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec";
-    LOG(INFO) << "RTF is: " << elapsed / tot_wav_duration;
+    LOG(INFO) << "total decode cost:" << tot_decode_time << " sec";
+    LOG(INFO) << "RTF is: " << tot_decode_time / tot_wav_duration;
 }