From 17ea30e7cac2367e2d7850e38d7db7fb7dd50558 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Oct 2022 05:56:38 +0000 Subject: [PATCH] u2 recog test main ok --- .../examples/codelab/u2/local/recognizer.sh | 22 ++++++++++++++ speechx/speechx/decoder/param.h | 8 +++-- speechx/speechx/decoder/u2_recognizer.cc | 4 +++ speechx/speechx/decoder/u2_recognizer.h | 5 ++-- speechx/speechx/decoder/u2_recognizer_main.cc | 13 ++++---- speechx/speechx/frontend/audio/cmvn.cc | 30 ++++++++++--------- .../frontend/audio/feature_pipeline.cc | 1 + 7 files changed, 59 insertions(+), 24 deletions(-) create mode 100755 speechx/examples/codelab/u2/local/recognizer.sh diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh new file mode 100755 index 000000000..a73597538 --- /dev/null +++ b/speechx/examples/codelab/u2/local/recognizer.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -e + +. path.sh + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + +u2_recognizer_main \ + --use_fbank=true \ + --num_bins=80 \ + --cmvn_file=$exp/cmvn.ark \ + --model_path=$model_dir/export.jit \ + --nnet_decoder_chunk=16 \ + --receptive_field_length=7 \ + --downsampling_rate=4 \ + --vocab_path=$model_dir/unit.txt \ + --wav_rspecifier=scp:$data/wav.scp \ + --result_wspecifier=ark,t:$exp/result.ark diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index e0f22d8c6..1827e82d6 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -52,11 +52,12 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_string(vocab_path, "", "nnet vocab path."); // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); 
DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); @@ -72,13 +73,14 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; - LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear"; + LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear"); if (opts.use_fbank) { opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.frame_opts = frame_opts; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; } else { opts.to_float32 = true; frame_opts.remove_dc_offset = false; diff --git a/speechx/speechx/decoder/u2_recognizer.cc b/speechx/speechx/decoder/u2_recognizer.cc index 0ace086c4..8fcc5d79b 100644 --- a/speechx/speechx/decoder/u2_recognizer.cc +++ b/speechx/speechx/decoder/u2_recognizer.cc @@ -33,12 +33,15 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource BaseFloat am_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + CHECK(resource.vocab_path != ""); decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); unit_table_ = decoder_->VocabTable(); symbol_table_ = unit_table_; input_finished_ = false; + + Reset(); } void U2Recognizer::Reset() { @@ -69,6 +72,7 @@ void U2Recognizer::Accept(const VectorBase& waves) { void U2Recognizer::Decode() { decoder_->AdvanceDecode(decodable_); + UpdateResult(false); } void U2Recognizer::Rescoring() { diff --git a/speechx/speechx/decoder/u2_recognizer.h b/speechx/speechx/decoder/u2_recognizer.h index 
0947e5933..a65cae3b3 100644 --- a/speechx/speechx/decoder/u2_recognizer.h +++ b/speechx/speechx/decoder/u2_recognizer.h @@ -92,12 +92,13 @@ struct DecodeOptions { struct U2RecognizerResource { + kaldi::BaseFloat acoustic_scale{1.0}; + std::string vocab_path{}; + FeaturePipelineOptions feature_pipeline_opts{}; ModelOptions model_opts{}; DecodeOptions decoder_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale{1.0}; - std::string vocab_path{}; }; diff --git a/speechx/speechx/decoder/u2_recognizer_main.cc b/speechx/speechx/decoder/u2_recognizer_main.cc index 70bc7d675..ab2c66950 100644 --- a/speechx/speechx/decoder/u2_recognizer_main.cc +++ b/speechx/speechx/decoder/u2_recognizer_main.cc @@ -25,13 +25,16 @@ DEFINE_int32(sample_rate, 16000, "sample rate"); ppspeech::U2RecognizerResource InitOpts() { ppspeech::U2RecognizerResource resource; + resource.vocab_path = FLAGS_vocab_path; resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions(); + LOG(INFO) << "feature!"; ppspeech::ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; resource.model_opts = model_opts; + LOG(INFO) << "model!"; ppspeech::DecodeOptions decoder_opts; decoder_opts.chunk_size=16; @@ -44,6 +47,7 @@ ppspeech::U2RecognizerResource InitOpts() { decoder_opts.ctc_prefix_search_opts.second_beam_size = 10; resource.decoder_opts = decoder_opts; + LOG(INFO) << "decoder!"; return resource; } @@ -57,9 +61,6 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; double tot_wav_duration = 0.0; - ppspeech::U2RecognizerResource resource = InitOpts(); - ppspeech::U2Recognizer recognizer(resource); - kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); @@ -71,8 +72,10 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << 
streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - kaldi::Timer timer; + ppspeech::U2RecognizerResource resource = InitOpts(); + ppspeech::U2Recognizer recognizer(resource); + kaldi::Timer timer; for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc index 1ea83aba5..5e84a1a12 100644 --- a/speechx/speechx/frontend/audio/cmvn.cc +++ b/speechx/speechx/frontend/audio/cmvn.cc @@ -29,7 +29,9 @@ using std::unique_ptr; CMVN::CMVN(std::string cmvn_file, unique_ptr base_extractor) : var_norm_(true) { + CHECK(cmvn_file != ""); base_extractor_ = std::move(base_extractor); + bool binary; kaldi::Input ki(cmvn_file, &binary); stats_.Read(ki.Stream(), binary); @@ -55,11 +57,11 @@ bool CMVN::Read(kaldi::Vector* feats) { // feats contain num_frames feature. void CMVN::Compute(VectorBase* feats) const { KALDI_ASSERT(feats != NULL); - int32 dim = stats_.NumCols() - 1; + if (stats_.NumRows() > 2 || stats_.NumRows() < 1 || - feats->Dim() % dim != 0) { - KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x' - << stats_.NumCols() << ", feats " << feats->Dim() << 'x'; + feats->Dim() % dim_ != 0) { + KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ',' + << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x'; } if (stats_.NumRows() == 1 && var_norm_) { KALDI_ERR @@ -67,7 +69,7 @@ void CMVN::Compute(VectorBase* feats) const { << "are supplied."; } - double count = stats_(0, dim); + double count = stats_(0, dim_); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats_, we use a count of one. 
if (count < 1.0) @@ -77,14 +79,14 @@ void CMVN::Compute(VectorBase* feats) const { if (!var_norm_) { Vector offset(feats->Dim()); - SubVector mean_stats(stats_.RowData(0), dim); + SubVector mean_stats(stats_.RowData(0), dim_); Vector mean_stats_apply(feats->Dim()); - // fill the datat of mean_stats in mean_stats_appy whose dim is equal - // with the dim of feature. - // the dim of feats = dim * num_frames; - for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) { - SubVector stats_tmp(mean_stats_apply.Data() + dim * idx, - dim); + // fill the data of mean_stats in mean_stats_apply whose dim_ is equal + // with the dim_ of feature. + // the dim_ of feats = dim_ * num_frames; + for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) { + SubVector stats_tmp(mean_stats_apply.Data() + dim_ * idx, + dim_); + stats_tmp.CopyFromVec(mean_stats); } offset.AddVec(-1.0 / count, mean_stats_apply); @@ -94,7 +96,7 @@ void CMVN::Compute(VectorBase* feats) const { // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). kaldi::Matrix norm(2, feats->Dim()); - for (int32 d = 0; d < dim; d++) { + for (int32 d = 0; d < dim_; d++) { double mean, offset, scale; mean = stats_(0, d) / count; double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20; @@ -111,7 +113,7 @@ void CMVN::Compute(VectorBase* feats) const { for (int32 d_skip = d; d_skip < feats->Dim();) { norm(0, d_skip) = offset; norm(1, d_skip) = scale; - d_skip = d_skip + dim; + d_skip = d_skip + dim_; } } // Apply the normalization. 
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 9fc35c958..7232efc44 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -32,6 +32,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK(opts.cmvn_file != ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));