From 17ea30e7cac2367e2d7850e38d7db7fb7dd50558 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 05:56:38 +0000 Subject: [PATCH] u2 recog test main ok --- .../examples/codelab/u2/local/recognizer.sh | 22 ++++++++++++++ speechx/speechx/decoder/param.h | 8 +++-- speechx/speechx/decoder/u2_recognizer.cc | 4 +++ speechx/speechx/decoder/u2_recognizer.h | 5 ++-- speechx/speechx/decoder/u2_recognizer_main.cc | 13 ++++---- speechx/speechx/frontend/audio/cmvn.cc | 30 ++++++++++--------- .../frontend/audio/feature_pipeline.cc | 1 + 7 files changed, 59 insertions(+), 24 deletions(-) create mode 100755 speechx/examples/codelab/u2/local/recognizer.sh diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh new file mode 100755 index 000000000..a73597538 --- /dev/null +++ b/speechx/examples/codelab/u2/local/recognizer.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --wav_rspecifier=scp:$data/wav.scp \ + --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index e0f22d8c6..1827e82d6 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -52,11 +52,12 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_string(vocab_path, "", "nnet vocab path."); // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); 
DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); @@ -72,13 +73,14 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; + LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear"); if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.frame_opts = frame_opts; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; } else { opts.to_float32 = true; frame_opts.remove_dc_offset = false; diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 0ace086c4..8fcc5d79b 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -33,12 +33,15 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + CHECK(resource.vocab_path != ""); decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; input_finished_ = false; + + Reset(); } void U2Recognizer::Reset() { @@ -69,6 +72,7 @@ void U2Recognizer::Accept(const VectorBase& waves) { void U2Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); + UpdateResult(false); } void U2Recognizer::Rescoring() { diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 
0947e5933..a65cae3b3 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -92,12 +92,13 @@ struct DecodeOptions { struct U2RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; + FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; - std::string vocab_path{}; }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index 70bc7d675..ab2c66950 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -25,13 +25,16 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::U2RecognizerResource InitOpts() { ppspeech::U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + LOG(INFO) << "feature!"; ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; resource.model_opts = model_opts; + LOG(INFO) << "model!"; ppspeech::DecodeOptions decoder_opts; decoder_opts.chunk_size=16; @@ -44,6 +47,7 @@ ppspeech::U2RecognizerResource InitOpts() { decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; resource.decoder_opts = decoder_opts; + LOG(INFO) << "decoder!"; return resource; } @@ -57,9 +61,6 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; double tot_wav_duration = 0.0; - ppspeech::U2RecognizerResource resource = InitOpts(); - ppspeech::U2Recognizer recognizer(resource); - kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); @@ -71,8 +72,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << 
streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - kaldi::Timer timer; + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 1ea83aba5..5e84a1a12 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -29,7 +29,9 @@ using std::unique_ptr; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { + CHECK(cmvn_file != ""); base_extractor_ = std::move(base_extractor); + bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); @@ -55,11 +57,11 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; + feats->Dim() % dim_ != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' + << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x'; } if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR @@ -67,7 +69,7 @@ void CMVN::Compute(VectorBase* feats) const { << "are supplied."; } - double count = stats_(0, dim); + double count = stats_(0, dim_); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats_, we use a count of one. 
if (count < 1.0) @@ -77,14 +79,14 @@ void CMVN::Compute(VectorBase* feats) const { if (!var_norm_) { Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); + SubVector mean_stats(stats_.RowData(0), dim_); Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); + // fill the data of mean_stats in mean_stats_apply whose dim_ is equal + // with the dim_ of feature. + // the dim_ of feats = dim_ * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim_ * idx, + dim_); + stats_tmp.CopyFromVec(mean_stats); } offset.AddVec(-1.0 / count, mean_stats_apply); @@ -94,7 +96,7 @@ void CMVN::Compute(VectorBase* feats) const { // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { + for (int32 d = 0; d < dim_; d++) { double mean, offset, scale; mean = stats_(0, d) / count; double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; @@ -111,7 +113,7 @@ void CMVN::Compute(VectorBase* feats) const { for (int32 d_skip = d; d_skip < feats->Dim();) { norm(0, d_skip) = offset; norm(1, d_skip) = scale; - d_skip = d_skip + dim; + d_skip = d_skip + dim_; } } // Apply the normalization. 
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9fc35c958..7232efc44 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -32,6 +32,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK(opts.cmvn_file != ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));