From 5d5266abff63a32c8f1c97351a299371b4b40abc Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 6 May 2022 02:36:37 +0000 Subject: [PATCH] rm to_float32 flags, default is fbank --- speechx/examples/ds2_ol/aishell/run.sh | 1 - .../ds2_ol/decoder/recognizer_test_main.cc | 4 ++- .../ds2_ol/feat/compute_fbank_main.cc | 1 + .../feat/linear-spectrogram-wo-db-norm-ol.cc | 3 +- .../ds2_ol/websocket/websocket_server.sh | 1 - speechx/speechx/decoder/param.h | 31 ++++++++++++------- .../speechx/frontend/audio/feature_pipeline.h | 6 ++-- 7 files changed, 28 insertions(+), 19 deletions(-) diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index b44200b0..650cb140 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -155,7 +155,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ - --to_float32=true \ --streaming_chunk=30 \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc index 00764f53..476fac05 100644 --- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc +++ b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc @@ -19,6 +19,7 @@ DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_int32(sample_rate, 16000, "sample rate"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); @@ -30,7 +31,8 @@ int main(int argc, char* argv[]) { kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - int sample_rate = 16000; + + int sample_rate = FLAGS_sample_rate; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; LOG(INFO) << "sr: " << sample_rate; diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 7beaa587..67683eeb 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -69,6 +69,7 @@ int main(int argc, char* argv[]) { feat_cache_opts.frame_chunk_stride = 1; feat_cache_opts.frame_chunk_size = 1; ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn)); + LOG(INFO) << "fbank: " << true; LOG(INFO) << "feat dim: " << feature_cache.Dim(); int sample_rate = 16000; diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index c3652ad4..bbf0e690 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -56,6 +56,7 @@ int main(int argc, char* argv[]) { opt.frame_opts.remove_dc_offset = false; opt.frame_opts.window_type = "hanning"; opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "linear feature: " << true; LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; @@ -77,7 +78,7 @@ int main(int argc, char* argv[]) { int sample_rate = 16000; float streaming_chunk = FLAGS_streaming_chunk; int chunk_sample_size = streaming_chunk * sample_rate; - LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "sample rate: " << sample_rate; LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh index 0e389f89..fc57e326 100755 --- a/speechx/examples/ds2_ol/websocket/websocket_server.sh +++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh @@ -63,7 +63,6 @@ websocket_server_main \ --cmvn_file=$cmvn \ --model_path=$model_dir/avg_1.jit.pdmodel \ --streaming_chunk=0.1 \ - --to_float32=true \ --param_path=$model_dir/avg_1.jit.pdiparams \ --word_symbol_table=$wfst/words.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 85de08ca..9905bc6e 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -19,23 +19,23 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" +// feature +DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +// DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear feature, or fbank"); +DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size"); -DEFINE_bool(to_float32, true, "audio convert to pcm32"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "max active"); -DEFINE_double(beam, 15.0, "decoder beam"); -DEFINE_double(lattice_beam, 7.5, "decoder beam"); +// feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=5) downsampling module."); DEFINE_int32(downsampling_rate, 4, "two CNN(kernel=5) module downsampling rate."); + +// nnet +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( model_input_names, "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", @@ -47,8 +47,14 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_bool(use_fbank, false, "use fbank or linear feature"); -DEFINE_int32(num_bins, 161, "num bins of mel"); + +// decoder +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 7500, "max active"); +DEFINE_double(beam, 15.0, "decoder beam"); +DEFINE_double(lattice_beam, 7.5, "decoder beam"); namespace ppspeech { // todo refactor later @@ -56,17 +62,18 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions opts; opts.cmvn_file = FLAGS_cmvn_file; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; - opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; frame_opts.dither = 0.0; frame_opts.frame_shift_ms = 10; opts.use_fbank = FLAGS_use_fbank; if (opts.use_fbank) { + opts.to_float32 = false; frame_opts.window_type = "povey"; frame_opts.frame_length_ms = 25; opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.fbank_opts.frame_opts = frame_opts; } else { + opts.to_float32 = true; frame_opts.remove_dc_offset = false; frame_opts.frame_length_ms = 20; frame_opts.window_type = "hanning"; diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 4868d37e..1acf62a9 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -28,15 +28,15 @@ namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; - bool to_float32; + bool to_float32; // true, only for linear feature bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), - to_float32(false), - use_fbank(false), + to_float32(false), // true, only for linear feature + use_fbank(true), linear_spectrogram_opts(), fbank_opts(), feature_cache_opts() {}