From cb66b742ab75287e35915e13b59cf71bf42d4146 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 31 Mar 2022 09:03:01 +0000
Subject: [PATCH] more comment

---
 .../examples/decoder/offline_decoder_main.cc  | 41 ++++++++++++++-----
 speechx/examples/decoder/run.sh               |  2 +-
 speechx/examples/feat/feature-mfcc-test.cc    |  1 -
 .../examples/feat/linear_spectrogram_main.cc  | 38 +++++++++--------
 speechx/speechx/frontend/feature_cache.h      |  4 +-
 speechx/speechx/frontend/raw_audio.h          |  3 +-
 6 files changed, 57 insertions(+), 32 deletions(-)
diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc
index eccd7c099..3a94cc947 100644
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -22,11 +22,12 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
 
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(chunk_size, 35, "feat chunk size");
 
 
 using kaldi::BaseFloat;
@@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
     std::string model_params = FLAGS_param_path;
     std::string dict_file = FLAGS_dict_file;
     std::string lm_path = FLAGS_lm_path;
+    int32 chunk_size = FLAGS_chunk_size;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
 
     int32 num_done = 0, num_err = 0;
 
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
+    // decodable = frontend (raw data cache) + nnet
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.params_path = model_params;
@@ -60,34 +63,50 @@ int main(int argc, char* argv[]) {
         new ppspeech::RawDataCache());
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data));
+    LOG(INFO) << "Init decodeable.";
 
-    int32 chunk_size = 35;
-    decoder.InitDecoder();
-    LOG(INFO) << "chunk size: " << chunk_size;
+    // init decoder
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+    LOG(INFO) << "Init decoder.";
 
+    decoder.InitDecoder();
     for (; !feature_reader.Done(); feature_reader.Next()) {
         string utt = feature_reader.Key();
         const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+        LOG(INFO) << "utt: " << utt;
+
+        // feat dim
         raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "dim: " << raw_data->Dim();
+
         int32 row_idx = 0;
         int32 num_chunks = feature.NumRows() / chunk_size;
+        LOG(INFO) << "n chunks: " << num_chunks;
         for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            // feat chunk
             kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
                                                           feature.NumCols());
             for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, row_idx);
+                kaldi::SubVector<kaldi::BaseFloat> feat_one_row(feature,
+                                                                row_idx);
                 kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
                     feature_chunk.Data() + row_id * feature.NumCols(),
                     feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                f_chunk_tmp.CopyFromVec(feat_one_row);
                 row_idx++;
             }
+            // feed to raw cache
             raw_data->Accept(feature_chunk);
             if (chunk_idx == num_chunks - 1) {
                 raw_data->SetFinished();
             }
+            // decode step
             decoder.AdvanceDecode(decodable);
         }
+
         std::string result;
         result = decoder.GetFinalBestPath();
         KALDI_LOG << " the result of " << utt << " is " << result;
diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh
index 1e5a678c2..ddda89702 100755
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -28,7 +28,7 @@ cmvn=./cmvn.ark
 
 export GLOG_logtostderr=1
 
-# 3. run feat
+# 3. generate linear spectrogram features
 linear_spectrogram_main \
     --wav_rspecifier=scp:$model_dir/wav.scp \
     --feature_wspecifier=ark,t:$feat_wspecifier \
diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc
index ae32aba9e..48a9e1c29 100644
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@
 
 using namespace kaldi;
 
-
 static void UnitTestReadWave() {
     std::cout << "=== UnitTestReadWave() ===\n";
 
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index a27db56fd..cde78c4d3 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -151,7 +151,7 @@ void WriteMatrix() {
         cmvn_stats(1, idx) = variance_[idx];
     }
     cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
 }
 
 int main(int argc, char* argv[]) {
@@ -163,51 +163,56 @@ int main(int argc, char* argv[]) {
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
     WriteMatrix();
 
-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
-    // window -->linear_spectrogram --> cmvn
+
     int32 num_done = 0, num_err = 0;
+
+    // feature pipeline: wave cache --> decibel_normalizer --> hanning
+    // window --> linear_spectrogram --> global cmvn --> feat cache
+
     // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
     // ppspeech::RawDataCache());
     std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
         new ppspeech::RawAudioCache());
 
+    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> db_norm(
+        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+
     ppspeech::LinearSpectrogramOptions opt;
     opt.frame_opts.frame_length_ms = 20;
     opt.frame_opts.frame_shift_ms = 10;
-    LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms;
-    LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms;
-
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
-    std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
 
     std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt,
-                                        std::move(base_feature_extractor)));
+        new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
 
     std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn(
         new ppspeech::CMVN(FLAGS_cmvn_write_path,
                            std::move(linear_spectrogram)));
 
     ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
-    float streaming_chunk = 0.36;
     int sample_rate = 16000;
+    float streaming_chunk = 0.36;
     int chunk_sample_size = streaming_chunk * sample_rate;
-
-    LOG(INFO) << "sr:" << sample_rate;
-    LOG(INFO) << "chunk size (s):" << streaming_chunk;
-    LOG(INFO) << "chunk size (sample):" << chunk_sample_size;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
 
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
 
         int32 this_channel = 0;
         kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                     this_channel);
         int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
         int sample_offset = 0;
         std::vector<kaldi::Vector<BaseFloat>> feats;
         int feature_rows = 0;
@@ -219,6 +224,7 @@ int main(int argc, char* argv[]) {
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
+
             kaldi::Vector<BaseFloat> features;
             feature_cache.Accept(wav_chunk);
             if (cur_chunk_size < chunk_sample_size) {
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h
index b6bbdf3cb..f52b9b0f6 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
     // Feed feats or waves
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
 
-    // feats dim = num_frames * feature_dim
+    // feats size = num_frames * feat_dim
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
 
-    // feature cache only cache feature which from base extractor
+    // feature dimension, as reported by the base extractor
     virtual size_t Dim() const { return base_extractor_->Dim(); }
 
     virtual void SetFinished() {
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h
index ce75c137c..7a28f2c98 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
         data_.Resize(0);
         return true;
     }
-    virtual size_t Dim() const { return dim_; }
+
     virtual void SetFinished() { finished_ = true; }
     virtual bool IsFinished() const { return finished_; }
+    virtual size_t Dim() const { return dim_; }
     void SetDim(int32 dim) { dim_ = dim; }
     virtual void Reset() { finished_ = true; }