From cb66b742ab75287e35915e13b59cf71bf42d4146 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 31 Mar 2022 09:03:01 +0000 Subject: [PATCH] more comment --- .../examples/decoder/offline_decoder_main.cc | 41 ++++++++++++++----- speechx/examples/decoder/run.sh | 2 +- speechx/examples/feat/feature-mfcc-test.cc | 1 - .../examples/feat/linear_spectrogram_main.cc | 38 +++++++++-------- speechx/speechx/frontend/feature_cache.h | 4 +- speechx/speechx/frontend/raw_audio.h | 3 +- 6 files changed, 57 insertions(+), 32 deletions(-) diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc index eccd7c099..3a94cc947 100644 --- a/speechx/examples/decoder/offline_decoder_main.cc +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -22,11 +22,12 @@ #include "nnet/decodable.h" #include "nnet/paddle_nnet.h" -DEFINE_string(feature_respecifier, "", "test feature rspecifier"); +DEFINE_string(feature_respecifier, "", "feature matrix rspecifier"); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); DEFINE_string(lm_path, "lm.klm", "language model"); +DEFINE_int32(chunk_size, 35, "feat chunk size"); using kaldi::BaseFloat; @@ -43,14 +44,16 @@ int main(int argc, char* argv[]) { std::string model_params = FLAGS_param_path; std::string dict_file = FLAGS_dict_file; std::string lm_path = FLAGS_lm_path; + int32 chunk_size = FLAGS_chunk_size; + LOG(INFO) << "model path: " << model_graph; + LOG(INFO) << "model param: " << model_params; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; + LOG(INFO) << "chunk size (frame): " << chunk_size; int32 num_done = 0, num_err = 0; - ppspeech::CTCBeamSearchOptions opts; - opts.dict_file = dict_file; - opts.lm_path = lm_path; - ppspeech::CTCBeamSearch decoder(opts); - + // frontend + nnet is decodable ppspeech::ModelOptions model_opts; model_opts.model_path = model_graph; model_opts.params_path = model_params; @@ -60,34 +63,50 @@ int main(int argc, char* argv[]) { new ppspeech::RawDataCache()); std::shared_ptr decodable( new ppspeech::Decodable(nnet, raw_data)); + LOG(INFO) << "Init decodeable."; - int32 chunk_size = 35; - decoder.InitDecoder(); - LOG(INFO) << "chunk size: " << chunk_size; + // init decoder + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + LOG(INFO) << "Init decoder."; + decoder.InitDecoder(); for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); const kaldi::Matrix feature = feature_reader.Value(); + LOG(INFO) << "utt: " << utt; + + // feat dim raw_data->SetDim(feature.NumCols()); + LOG(INFO) << "dim: " << raw_data->Dim(); + int32 row_idx = 0; int32 num_chunks = feature.NumRows() / chunk_size; + LOG(INFO) << "n chunks: " << num_chunks; for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + // feat chunk kaldi::Vector feature_chunk(chunk_size * feature.NumCols()); for (int row_id = 0; row_id < chunk_size; ++row_id) { - kaldi::SubVector tmp(feature, row_idx); + kaldi::SubVector feat_one_row(feature, + row_idx); kaldi::SubVector f_chunk_tmp( feature_chunk.Data() + row_id * feature.NumCols(), feature.NumCols()); - f_chunk_tmp.CopyFromVec(tmp); + f_chunk_tmp.CopyFromVec(feat_one_row); row_idx++; } + // feed to raw cache raw_data->Accept(feature_chunk); if (chunk_idx == num_chunks - 1) { raw_data->SetFinished(); } + // decode step decoder.AdvanceDecode(decodable); } + std::string result; result = decoder.GetFinalBestPath(); KALDI_LOG << " the result of " << utt << " is " << result; diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh index 1e5a678c2..ddda89702 100755 --- a/speechx/examples/decoder/run.sh +++ b/speechx/examples/decoder/run.sh @@ -28,7 +28,7 @@ cmvn=./cmvn.ark export GLOG_logtostderr=1 -# 3. run feat +# 3. gen linear feat linear_spectrogram_main \ --wav_rspecifier=scp:$model_dir/wav.scp \ --feature_wspecifier=ark,t:$feat_wspecifier \ diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc index ae32aba9e..48a9e1c29 100644 --- a/speechx/examples/feat/feature-mfcc-test.cc +++ b/speechx/examples/feat/feature-mfcc-test.cc @@ -41,7 +41,6 @@ using namespace kaldi; - static void UnitTestReadWave() { std::cout << "=== UnitTestReadWave() ===\n"; diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc index a27db56fd..cde78c4d3 100644 --- a/speechx/examples/feat/linear_spectrogram_main.cc +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -151,7 +151,7 @@ void WriteMatrix() { cmvn_stats(1, idx) = variance_[idx]; } cmvn_stats(0, mean_.size()) = count_; - kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true); + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false); } int main(int argc, char* argv[]) { @@ -163,51 +163,56 @@ int main(int argc, char* argv[]) { kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); WriteMatrix(); - // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning - // window -->linear_spectrogram --> cmvn + int32 num_done = 0, num_err = 0; + + // feature pipeline: wave cache --> decibel_normalizer --> hanning + // window -->linear_spectrogram --> global cmvn -> feat cache + // std::unique_ptr data_source(new // ppspeech::RawDataCache()); std::unique_ptr data_source( new ppspeech::RawAudioCache()); + ppspeech::DecibelNormalizerOptions db_norm_opt; + std::unique_ptr db_norm( + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + ppspeech::LinearSpectrogramOptions opt; opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_shift_ms = 10; - LOG(INFO) << "frame length (ms):" << opt.frame_opts.frame_length_ms; - LOG(INFO) << "frame shift (ms):" << opt.frame_opts.frame_shift_ms; - - ppspeech::DecibelNormalizerOptions db_norm_opt; - std::unique_ptr base_feature_extractor( - new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; std::unique_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opt, - std::move(base_feature_extractor))); + new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); std::unique_ptr cmvn( new ppspeech::CMVN(FLAGS_cmvn_write_path, std::move(linear_spectrogram))); ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); + LOG(INFO) << "feat dim: " << feature_cache.Dim(); - float streaming_chunk = 0.36; int sample_rate = 16000; + float streaming_chunk = 0.36; int chunk_sample_size = streaming_chunk * sample_rate; - - LOG(INFO) << "sr:" << sample_rate; - LOG(INFO) << "chunk size (s):" << streaming_chunk; - LOG(INFO) << "chunk size (sample):" << chunk_sample_size; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "process utt: " << utt; int32 this_channel = 0; kaldi::SubVector waveform(wave_data.Data(), this_channel); int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + int sample_offset = 0; std::vector> feats; int feature_rows = 0; @@ -219,6 +224,7 @@ int main(int argc, char* argv[]) { for (int i = 0; i < cur_chunk_size; ++i) { wav_chunk(i) = waveform(sample_offset + i); } + kaldi::Vector features; feature_cache.Accept(wav_chunk); if (cur_chunk_size < chunk_sample_size) { diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/feature_cache.h index b6bbdf3cb..f52b9b0f6 100644 --- a/speechx/speechx/frontend/feature_cache.h +++ b/speechx/speechx/frontend/feature_cache.h @@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface { // Feed feats or waves virtual void Accept(const kaldi::VectorBase& inputs); - // feats dim = num_frames * feature_dim + // feats size = num_frames * feat_dim virtual bool Read(kaldi::Vector* feats); - // feature cache only cache feature which from base extractor + // feat dim virtual size_t Dim() const { return base_extractor_->Dim(); } virtual void SetFinished() { diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/raw_audio.h index ce75c137c..7a28f2c98 100644 --- a/speechx/speechx/frontend/raw_audio.h +++ b/speechx/speechx/frontend/raw_audio.h @@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface { data_.Resize(0); return true; } - virtual size_t Dim() const { return dim_; } + virtual void SetFinished() { finished_ = true; } virtual bool IsFinished() const { return finished_; } + virtual size_t Dim() const { return dim_; } void SetDim(int32 dim) { dim_ = dim; } virtual void Reset() { finished_ = true; }