fix u2 nnet out frames num

pull/2524/head
Hui Zhang 3 years ago
parent cd1ced4ea0
commit a75abc1828

@@ -1 +1,2 @@
 tools/valgrind*
+*log

@@ -21,15 +21,15 @@
 namespace ppspeech {

-struct NnetOut{
+struct NnetOut {
     // nnet out, maybe logprob or prob
     kaldi::Vector<kaldi::BaseFloat> logprobs;
     int32 vocab_dim;

     // nnet state. Only using in Attention model.
     std::vector<std::vector<kaldi::BaseFloat>> encoder_outs;

     NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {}
 };

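For reference, NnetOut packs the per-frame outputs into a single flat vector, so the number of nnet output frames is implied by logprobs.Dim() / vocab_dim. Below is only an illustrative sketch of that layout (the helper name is made up and is not part of this patch), assuming logprobs is stored row-major as [num_frames, vocab_dim]:

    // Illustrative sketch (not in the patch): recover the frame count from a
    // NnetOut, assuming a row-major [num_frames, vocab_dim] layout.
    int32 NumNnetOutFrames(const ppspeech::NnetOut& out) {
        if (out.vocab_dim <= 0) return 0;
        return out.logprobs.Dim() / out.vocab_dim;
    }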
@@ -313,10 +313,8 @@ void U2Nnet::ForwardEncoderChunkImpl(
     // call.
     std::vector<paddle::Tensor> inputs = {
         feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_};
-    VLOG(3) << "inputs size: " << inputs.size();
     CHECK(inputs.size() == 4);
     std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs);
-    VLOG(3) << "outputs size: " << outputs.size();
     CHECK(outputs.size() == 3);
 #ifdef USE_GPU
@@ -351,10 +349,12 @@ void U2Nnet::ForwardEncoderChunkImpl(
     // current offset in decoder frame
     // not used in nnet
     offset_ += chunk_out.shape()[1];
+    VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1]
+            << " total: " << offset_;

     // collects encoder outs.
-    VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();
     encoder_outs_.push_back(chunk_out);
+    VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size();

 #ifdef TEST_DEBUG
     {

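The two u2_nnet.cc hunks above tie the running offset_ to the encoder output: each forward call advances offset_ by the chunk's time dimension and caches the chunk tensor in encoder_outs_. A minimal sketch of that bookkeeping, assuming the usual [batch, time, dim] tensor layout (not part of the patch):

    // Illustrative sketch (not in the patch): after all chunks have been fed,
    // offset_ should equal the sum of the per-chunk time dimensions.
    int total_encoder_frames = 0;
    for (const paddle::Tensor& chunk : encoder_outs_) {
        total_encoder_frames += chunk.shape()[1];  // dim 1 == chunk time steps
    }
    CHECK(total_encoder_frames == offset_);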
@@ -137,7 +137,9 @@ class U2Nnet : public U2NnetBase {
     // debug
     void FeedEncoderOuts(paddle::Tensor& encoder_out);

-    const std::vector<paddle::Tensor>& EncoderOuts() const {return encoder_outs_; }
+    const std::vector<paddle::Tensor>& EncoderOuts() const {
+        return encoder_outs_;
+    }

   private:
     U2ModelOptions opts_;

@@ -95,29 +95,29 @@ int main(int argc, char* argv[]) {
         // kaldi::kCopyData);
         // }
-        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
         int32 frame_idx = 0;
         std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
         int32 ori_feature_len = feature.NumRows();
+        int32 num_chunks = feature.NumRows() / chunk_stride + 1;
+        LOG(INFO) << "num_chunks: " << num_chunks;

         for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
-            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
-                                                          feat_dim);
-            int32 feature_chunk_size = 0;
+            int32 this_chunk_size = 0;
             if (ori_feature_len > chunk_idx * chunk_stride) {
-                feature_chunk_size = std::min(
+                this_chunk_size = std::min(
                     ori_feature_len - chunk_idx * chunk_stride, chunk_size);
             }
-            if (feature_chunk_size < receptive_field_length) {
+            if (this_chunk_size < receptive_field_length) {
                 LOG(WARNING) << "utt: " << utt << " skip last "
-                             << feature_chunk_size << " frames, expect is "
+                             << this_chunk_size << " frames, expect is "
                              << receptive_field_length;
                 break;
             }
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(this_chunk_size *
+                                                          feat_dim);

             int32 start = chunk_idx * chunk_stride;
-            for (int row_id = 0; row_id < chunk_size; ++row_id) {
+            for (int row_id = 0; row_id < this_chunk_size; ++row_id) {
                 kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
                 kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
                     feature_chunk.Data() + row_id * feat_dim, feat_dim);

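The driver change above is the actual frame-count fix: the old num_chunks = (NumRows() - chunk_size) / chunk_stride + 1 silently dropped any tail shorter than a full chunk, while the new num_chunks = NumRows() / chunk_stride + 1 together with the per-chunk clamp on this_chunk_size feeds the tail as a shorter final chunk, skipping it only when it falls below receptive_field_length. A standalone sketch of the arithmetic with made-up sizes (90 frames, chunk_size 35, chunk_stride 32; not part of the patch):

    #include <algorithm>
    #include <cstdio>

    // Illustrative sketch (not in the patch): old vs. new chunk accounting.
    int main() {
        int ori_feature_len = 90, chunk_size = 35, chunk_stride = 32;
        int old_chunks = (ori_feature_len - chunk_size) / chunk_stride + 1;  // 2
        int new_chunks = ori_feature_len / chunk_stride + 1;                 // 3
        // Old: the last chunk starts at 32 and covers frames [32, 67), so
        // frames 67..89 never reach the nnet.
        // New: a third chunk starts at 64 and is clamped to the remaining frames.
        int last_start = (new_chunks - 1) * chunk_stride;                     // 64
        int tail = std::min(ori_feature_len - last_start, chunk_size);        // 26
        std::printf("old=%d new=%d tail=%d\n", old_chunks, new_chunks, tail);
        return 0;
    }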
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+find speechx -name '*.c' -o -name '*.h' -not -path "*kaldi*" | xargs -I{} clang-format -i {}