format code

2 years ago · 850096a3a0
parent 28dafea0e0
commit 850096a3a0
18 changed files with 75 additions and 51 deletions
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
@ -118,11 +118,13 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
        std::vector<float> topk_score;
        std::vector<int32_t> topk_index;
        TopK(logp_t, first_beam_size, &topk_score, &topk_index);
-        VLOG(2) << "topk: " << num_frame_decoded_ << " " <<  *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0];
-        for (int i = 0; i < topk_score.size(); i++){
-             VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
+        VLOG(2) << "topk: " << num_frame_decoded_ << " "
+                << *std::max_element(logp_t.begin(), logp_t.end()) << " "
+                << topk_score[0];
+        for (int i = 0; i < topk_score.size(); i++) {
+            VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
        }
-       
+
        // 2. token passing
        for (int i = 0; i < topk_index.size(); ++i) {
            int id = topk_index[i];
@ -303,15 +305,16 @@ void CTCPrefixBeamSearch::UpdateOutputs(
    outputs_.emplace_back(output);
 }

-void CTCPrefixBeamSearch::FinalizeSearch() { 
-    UpdateFinalContext(); 
-    
+void CTCPrefixBeamSearch::FinalizeSearch() {
+    UpdateFinalContext();
+
    VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
    int cnt = 0;
-    for (int i = 0; i < hypotheses_.size(); i ++){
-        VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i];
-        for (int j = 0; j < hypotheses_[i].size(); j ++){
-            VLOG(2) <<  hypotheses_[i][j];
+    for (int i = 0; i < hypotheses_.size(); i++) {
+        VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size()
+                << " ctc score: " << likelihood_[i];
+        for (int j = 0; j < hypotheses_[i].size(); j++) {
+            VLOG(2) << hypotheses_[i][j];
        }
    }
 }
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h
@ -13,7 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc
+// modified from
+// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc

 #pragma once

--- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
@ -13,7 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h
+// modified from
+// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h

 #pragma once

--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -20,7 +20,9 @@

 // feature
 DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
-DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size");
+DEFINE_bool(fill_zero,
+            false,
+            "fill zero at last chunk, when chunk < chunk_size");
 // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");
--- a/speechx/speechx/frontend/audio/assembler.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@ -16,9 +16,9 @@

 namespace ppspeech {

+using kaldi::BaseFloat;
 using kaldi::Vector;
 using kaldi::VectorBase;
-using kaldi::BaseFloat;
 using std::unique_ptr;

 Assembler::Assembler(AssemblerOptions opts,
@ -51,9 +51,11 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
        Vector<BaseFloat> feature;
        bool result = base_extractor_->Read(&feature);
        if (result == false || feature.Dim() == 0) {
-            VLOG(1) << "result: " << result << " feature dim: " << feature.Dim();
+            VLOG(1) << "result: " << result
+                    << " feature dim: " << feature.Dim();
            if (IsFinished() == false) {
-                VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size();
+                VLOG(1) << "finished reading feature. cache size: "
+                        << feature_cache_.size();
                return false;
            } else {
                VLOG(1) << "break";
@ -69,7 +71,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
    }

    if (feature_cache_.size() < receptive_filed_length_) {
-        VLOG(1) << "feature_cache less than receptive_filed_lenght. " << feature_cache_.size() << ": " << receptive_filed_length_;
+        VLOG(1) << "feature_cache less than receptive_filed_lenght. "
+                << feature_cache_.size() << ": " << receptive_filed_length_;
        return false;
    }

@ -81,7 +84,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
        }
    }

-    int32 this_chunk_size = std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
+    int32 this_chunk_size =
+        std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
    feats->Resize(dim_ * this_chunk_size);
    VLOG(1) << "read " << this_chunk_size << " feat.";

@ -89,7 +93,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
    while (counter < this_chunk_size) {
        Vector<BaseFloat>& val = feature_cache_.front();
        CHECK(val.Dim() == dim_) << val.Dim();
-      
+
        int32 start = counter * dim_;
        feats->Range(start, dim_).CopyFromVec(val);

@ -99,7 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {

        // val is reference, so we should pop here
        feature_cache_.pop();
-  
+
        counter++;
    }
    CHECK(feature_cache_.size() == cache_size_);
@ -108,11 +112,11 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
 }


- void Assembler::Reset() { 
+void Assembler::Reset() {
    std::queue<kaldi::Vector<kaldi::BaseFloat>> empty;
    std::swap(feature_cache_, empty);
    nframes_ = 0;
-    base_extractor_->Reset(); 
+    base_extractor_->Reset();
 }

 }  // namespace ppspeech
--- a/speechx/speechx/frontend/audio/assembler.h
+++ b/speechx/speechx/frontend/audio/assembler.h
@ -25,7 +25,8 @@ struct AssemblerOptions {
    int32 receptive_filed_length{1};
    int32 subsampling_rate{1};
    int32 nnet_decoder_chunk{1};
-    bool fill_zero{false}; // whether fill zero when last chunk is not equal to frame_chunk_size_
+    bool fill_zero{false};  // whether to fill zeros when the last chunk is not
+                            // equal to frame_chunk_size_
 };

 class Assembler : public FrontendInterface {
@ -62,7 +63,7 @@ class Assembler : public FrontendInterface {
    std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
    std::unique_ptr<FrontendInterface> base_extractor_;

-    int32 nframes_; // num frame computed
+    int32 nframes_;  // number of frames computed
    DISALLOW_COPY_AND_ASSIGN(Assembler);
 };

--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@ -13,13 +13,14 @@
 // limitations under the License.

 #include "frontend/audio/audio_cache.h"
+
 #include "kaldi/base/timer.h"

 namespace ppspeech {

 using kaldi::BaseFloat;
-using kaldi::VectorBase;
 using kaldi::Vector;
+using kaldi::VectorBase;

 AudioCache::AudioCache(int buffer_size, bool to_float32)
    : finished_(false),
@ -85,8 +86,8 @@ bool AudioCache::Read(Vector<BaseFloat>* waves) {
    offset_ = (offset_ + chunk_size) % ring_buffer_.size();

    nsamples_ += chunk_size;
-    VLOG(1) << "nsamples readed: " <<  nsamples_;
-    
+    VLOG(1) << "nsamples readed: " << nsamples_;
+
    ready_feed_condition_.notify_one();
    return true;
 }
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -62,7 +62,7 @@ class AudioCache : public FrontendInterface {
    kaldi::int32 timeout_;  // millisecond
    bool to_float32_;       // int16 -> float32. used in linear_spectrogram

-    int32 nsamples_; // number samples readed.
+    int32 nsamples_;  // number of samples read.
    DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };

--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@ -16,12 +16,12 @@

 namespace ppspeech {

-using kaldi::Vector;
-using kaldi::VectorBase;
 using kaldi::BaseFloat;
-using std::vector;
 using kaldi::SubVector;
+using kaldi::Vector;
+using kaldi::VectorBase;
 using std::unique_ptr;
+using std::vector;

 FeatureCache::FeatureCache(FeatureCacheOptions opts,
                           unique_ptr<FrontendInterface> base_extractor) {
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@ -77,7 +77,7 @@ class FeatureCache : public FrontendInterface {
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;

-    int32 nframe_; // num of feature computed
+    int32 nframe_;  // number of features computed
    DISALLOW_COPY_AND_ASSIGN(FeatureCache);
 };

--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -91,8 +91,7 @@ struct FeaturePipelineOptions {
                  << opts.assembler_opts.receptive_filed_length;
        LOG(INFO) << "nnet chunk size: "
                  << opts.assembler_opts.nnet_decoder_chunk;
-        LOG(INFO) << "frontend fill zeros: "
-                  << opts.assembler_opts.fill_zero;
+        LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero;
        return opts;
    }
 };
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@ -79,7 +79,8 @@ bool Decodable::AdvanceChunk() {
    int32& vocab_dim = out.vocab_dim;
    Vector<BaseFloat>& logprobs = out.logprobs;

-    VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim  << " decoder frames.";
+    VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim
+            << " decoder frames.";
    // cache nnet outupts
    nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim);
    nnet_out_cache_.CopyRowsFromVec(logprobs);
@ -127,7 +128,9 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
        (*likelihood)[idx] =
            nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;

-        VLOG(4) << "nnet out: " << frame  << " offset:" << frame_offset_  << " " << nnet_out_cache_.NumRows() << " logprob: " <<  nnet_out_cache_(frame - frame_offset_, idx);
+        VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " "
+                << nnet_out_cache_.NumRows()
+                << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
    }
    return true;
 }
--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@ -13,7 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc
+// modified from
+// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc

 #include "nnet/u2_nnet.h"

@ -129,7 +130,7 @@ U2Nnet::U2Nnet(const U2Nnet& other) {
    forward_attention_decoder_ = other.forward_attention_decoder_;
    ctc_activation_ = other.ctc_activation_;

-    offset_ = other.offset_; 
+    offset_ = other.offset_;

    // copy model ptr
    model_ = other.model_;
@ -626,8 +627,10 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
        // combinded left-to-right and right-to-lfet score
        (*rescoring_score)[i] =
            score * (1 - reverse_weight) + r_score * reverse_weight;
-        VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score
-                << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i];
+        VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score
+                << " r_score: " << r_score
+                << " reverse_weight: " << reverse_weight
+                << " final score: " << (*rescoring_score)[i];
    }
 }

--- a/speechx/speechx/nnet/u2_nnet.h
+++ b/speechx/speechx/nnet/u2_nnet.h
@ -13,7 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// modified from https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h
+// modified from
+// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h
 #pragma once

 #include "base/common.h"
--- a/speechx/speechx/recognizer/u2_recognizer.cc
+++ b/speechx/speechx/recognizer/u2_recognizer.cc
@ -190,12 +190,15 @@ void U2Recognizer::AttentionRescoring() {
    // combine ctc score and rescoring score
    for (size_t i = 0; i < num_hyps; i++) {
        VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
-                << " ctc_score: " << result_[i].score << " rescoring_weight: " <<  opts_.decoder_opts.rescoring_weight << " ctc_weight: " <<  opts_.decoder_opts.ctc_weight;
+                << " ctc_score: " << result_[i].score
+                << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight
+                << " ctc_weight: " << opts_.decoder_opts.ctc_weight;
        result_[i].score =
            opts_.decoder_opts.rescoring_weight * rescoring_score[i] +
            opts_.decoder_opts.ctc_weight * result_[i].score;

-        VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score;
+        VLOG(1) << "hyp: " << result_[0].sentence
+                << " score: " << result_[0].score;
    }

    std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc);
--- a/speechx/speechx/recognizer/u2_recognizer.h
+++ b/speechx/speechx/recognizer/u2_recognizer.h
@ -96,13 +96,14 @@ struct U2RecognizerResource {
        U2RecognizerResource resource;
        resource.vocab_path = FLAGS_vocab_path;
        resource.acoustic_scale = FLAGS_acoustic_scale;
-        LOG(INFO) << "vocab path: " <<  resource.vocab_path;
-        LOG(INFO) << "acoustic_scale: " <<  resource.acoustic_scale;
+        LOG(INFO) << "vocab path: " << resource.vocab_path;
+        LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale;

        resource.feature_pipeline_opts =
            ppspeech::FeaturePipelineOptions::InitFromFlags();
        resource.feature_pipeline_opts.assembler_opts.fill_zero = false;
-        LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero;
+        LOG(INFO) << "u2 need fill zero be false: "
+                  << resource.feature_pipeline_opts.assembler_opts.fill_zero;
        resource.model_opts = ppspeech::ModelOptions::InitFromFlags();
        resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags();
        return resource;
--- a/speechx/speechx/recognizer/u2_recognizer_main.cc
+++ b/speechx/speechx/recognizer/u2_recognizer_main.cc
@ -78,7 +78,8 @@ int main(int argc, char* argv[]) {
                recognizer.SetFinished();
            }
            recognizer.Decode();
-            LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult();
+            LOG(INFO) << "Pratial result: " << cnt << " "
+                      << recognizer.GetPartialResult();

            // no overlap
            sample_offset += cur_chunk_size;
@ -88,7 +89,7 @@ int main(int argc, char* argv[]) {

        // second pass decoding
        recognizer.Rescoring();
-    
+
        std::string result = recognizer.GetFinalResult();

        recognizer.Reset();
--- a/speechx/speechx/utils/math.cc
+++ b/speechx/speechx/utils/math.cc
@ -79,10 +79,10 @@ void TopK(const std::vector<T>& data,
    int cur = values->size() - 1;
    while (!pq.empty()) {
        const auto& item = pq.top();
-    
+
        (*values)[cur] = item.first;
        (*indices)[cur] = item.second;
-         
+
        // item if reference, must pop here
        pq.pop();