From 147338eec684b2241f280acc0df7053b651fa378 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sun, 24 Apr 2022 09:25:33 +0000 Subject: [PATCH] config param for nnet --- paddlespeech/__init__.py | 2 ++ .../ctc-prefix-beam-search-decoder-ol.cc | 6 ++-- .../ds2_ol/decoder/wfst-decoder-ol.cc | 6 ++-- .../feat/linear-spectrogram-wo-db-norm-ol.cc | 4 +-- speechx/speechx/common/CMakeLists.txt | 0 speechx/speechx/decoder/param.h | 7 ++-- speechx/speechx/frontend/audio/audio_cache.h | 4 +-- .../speechx/frontend/audio/feature_cache.cc | 11 +++++-- speechx/speechx/nnet/paddle_nnet.cc | 26 +++++++++++---- speechx/speechx/nnet/paddle_nnet.h | 33 +++++++++---------- 10 files changed, 62 insertions(+), 37 deletions(-) delete mode 100644 speechx/speechx/common/CMakeLists.txt diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index b781c4a8..4b1c0ef3 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -14,3 +14,5 @@ import _locale _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) + + diff --git a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc index 6a6495aa..a04b1c86 100644 --- a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc +++ b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc @@ -41,7 +41,8 @@ DEFINE_string( DEFINE_string(model_output_names, "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", "model output names"); -DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names"); +DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); +DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -77,7 +78,8 @@ int main(int argc, char* argv[]) { ppspeech::ModelOptions model_opts; model_opts.model_path = model_path; model_opts.param_path = model_params; - model_opts.cache_shape = 
FLAGS_model_cache_names; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; model_opts.input_names = FLAGS_model_input_names; model_opts.output_names = FLAGS_model_output_names; std::shared_ptr nnet( diff --git a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc index 544e59cb..bbb27b39 100644 --- a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc +++ b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc @@ -44,7 +44,8 @@ DEFINE_string( DEFINE_string(model_output_names, "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", "model output names"); -DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names"); +DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); +DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -80,7 +81,8 @@ int main(int argc, char* argv[]) { ppspeech::ModelOptions model_opts; model_opts.model_path = model_graph; model_opts.param_path = model_params; - model_opts.cache_shape = FLAGS_model_cache_names; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; model_opts.input_names = FLAGS_model_input_names; model_opts.output_names = FLAGS_model_output_names; std::shared_ptr nnet( diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc index f8f62f84..0b066d6a 100644 --- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc +++ b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc @@ -42,8 +42,8 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - // feature pipeline: wave cache --> hanning - // window -->linear_spectrogram --> global cmvn -> feat cache + // feature pipeline: wave cache --> hanning window + // -->linear_spectrogram 
--> global cmvn -> feat cache std::unique_ptr data_source( new ppspeech::AudioCache(3600 * 1600, true)); diff --git a/speechx/speechx/common/CMakeLists.txt b/speechx/speechx/common/CMakeLists.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index f6bd77ca..cd68e5e9 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -43,7 +43,8 @@ DEFINE_string( DEFINE_string(model_output_names, "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", "model output names"); -DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names"); +DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); +DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); namespace ppspeech { @@ -70,7 +71,9 @@ ModelOptions InitModelOptions() { ModelOptions model_opts; model_opts.model_path = FLAGS_model_path; model_opts.param_path = FLAGS_param_path; - model_opts.cache_shape = FLAGS_model_cache_names; + model_opts.cache_names = FLAGS_model_cache_names; + model_opts.cache_shape = FLAGS_model_cache_shapes; + model_opts.input_names = FLAGS_model_input_names; model_opts.output_names = FLAGS_model_output_names; return model_opts; } diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h index 371b2cc2..c44deddd 100644 --- a/speechx/speechx/frontend/audio/audio_cache.h +++ b/speechx/speechx/frontend/audio/audio_cache.h @@ -24,7 +24,7 @@ namespace ppspeech { class AudioCache : public FrontendInterface { public: explicit AudioCache(int buffer_size = 1000 * kint16max, - bool to_float32 = true); + bool to_float32 = false); virtual void Accept(const kaldi::VectorBase& waves); @@ -58,7 +58,7 @@ class AudioCache : public FrontendInterface { std::mutex mutex_; std::condition_variable ready_feed_condition_; kaldi::int32 timeout_; // millisecond - bool to_float32_; + bool to_float32_; // 
int16 -> float32. used in linear_spectrogram DISALLOW_COPY_AND_ASSIGN(AudioCache); }; diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc index e1e1043b..05283bb7 100644 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -58,6 +58,8 @@ bool FeatureCache::Read(kaldi::Vector* feats) { usleep(100); // sleep 0.1 ms } if (cache_.empty()) return false; + + // read from cache feats->Resize(cache_.front().Dim()); feats->CopyFromVec(cache_.front()); cache_.pop(); @@ -74,15 +76,16 @@ bool FeatureCache::Compute() { // join with remained int32 joint_len = feature.Dim() + remained_feature_.Dim(); - int32 num_chunk = - ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1; - Vector joint_feature(joint_len); joint_feature.Range(0, remained_feature_.Dim()) .CopyFromVec(remained_feature_); joint_feature.Range(remained_feature_.Dim(), feature.Dim()) .CopyFromVec(feature); + // one by one, or stride with window + // controlled by frame_chunk_stride_ and frame_chunk_size_ + int32 num_chunk = + ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1; for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * frame_chunk_stride_ * dim_; @@ -101,6 +104,8 @@ bool FeatureCache::Compute() { cache_.push(feature_chunk); ready_read_condition_.notify_one(); } + + // cache remained feats int32 remained_feature_len = joint_len - num_chunk * frame_chunk_stride_ * dim_; remained_feature_.Resize(remained_feature_len); diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/paddle_nnet.cc index f8e1f697..6cf6aa5e 100644 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/paddle_nnet.cc @@ -74,6 +74,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { LOG(INFO) << "output names: " << opts.output_names; vector input_names_vec = absl::StrSplit(opts.input_names, ","); vector 
output_names_vec = absl::StrSplit(opts.output_names, ","); + paddle_infer::Predictor* predictor = GetPredictor(); std::vector model_input_names = predictor->GetInputNames(); @@ -87,6 +88,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { for (size_t i = 0; i < output_names_vec.size(); i++) { assert(output_names_vec[i] == model_output_names[i]); } + ReleasePredictor(predictor); InitCacheEncouts(opts); } @@ -95,6 +97,7 @@ void PaddleNnet::Reset() { InitCacheEncouts(opts_); } paddle_infer::Predictor* PaddleNnet::GetPredictor() { paddle_infer::Predictor* predictor = nullptr; + std::lock_guard guard(pool_mutex); int pred_id = 0; @@ -144,15 +147,19 @@ void PaddleNnet::FeedForward(const Vector& features, Vector* inferences, int32* inference_dim) { paddle_infer::Predictor* predictor = GetPredictor(); + int feat_row = features.Dim() / feature_dim; + std::vector input_names = predictor->GetInputNames(); std::vector output_names = predictor->GetOutputNames(); + // feed inputs std::unique_ptr input_tensor = predictor->GetInputHandle(input_names[0]); std::vector INPUT_SHAPE = {1, feat_row, feature_dim}; input_tensor->Reshape(INPUT_SHAPE); input_tensor->CopyFromCpu(features.Data()); + std::unique_ptr input_len = predictor->GetInputHandle(input_names[1]); std::vector input_len_size = {1}; @@ -161,32 +168,36 @@ void PaddleNnet::FeedForward(const Vector& features, audio_len.push_back(feat_row); input_len->CopyFromCpu(audio_len.data()); - std::unique_ptr h_box = + std::unique_ptr state_h = predictor->GetInputHandle(input_names[2]); shared_ptr> h_cache = GetCacheEncoder(input_names[2]); - h_box->Reshape(h_cache->get_shape()); - h_box->CopyFromCpu(h_cache->get_data().data()); - std::unique_ptr c_box = + state_h->Reshape(h_cache->get_shape()); + state_h->CopyFromCpu(h_cache->get_data().data()); + + std::unique_ptr state_c = predictor->GetInputHandle(input_names[3]); shared_ptr> c_cache = GetCacheEncoder(input_names[3]); - c_box->Reshape(c_cache->get_shape()); - 
c_box->CopyFromCpu(c_cache->get_data().data()); + state_c->Reshape(c_cache->get_shape()); + state_c->CopyFromCpu(c_cache->get_data().data()); + + // forward bool success = predictor->Run(); if (success == false) { LOG(INFO) << "predictor run occurs error"; } + // fetch outputs std::unique_ptr h_out = predictor->GetOutputHandle(output_names[2]); assert(h_cache->get_shape() == h_out->shape()); h_out->CopyToCpu(h_cache->get_data().data()); + std::unique_ptr c_out = predictor->GetOutputHandle(output_names[3]); assert(c_cache->get_shape() == c_out->shape()); c_out->CopyToCpu(c_cache->get_data().data()); - // get result std::unique_ptr output_tensor = predictor->GetOutputHandle(output_names[0]); std::vector output_shape = output_tensor->shape(); @@ -195,6 +206,7 @@ void PaddleNnet::FeedForward(const Vector& features, inferences->Resize(row * col); *inference_dim = col; output_tensor->CopyToCpu(inferences->Data()); + ReleasePredictor(predictor); } diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h index 8b4ed478..fdd51048 100644 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/paddle_nnet.h @@ -24,7 +24,7 @@ namespace ppspeech { struct ModelOptions { std::string model_path; std::string param_path; - int thread_num; + int thread_num; // predictor thread pool size bool use_gpu; bool switch_ir_optim; std::string input_names; @@ -34,19 +34,14 @@ struct ModelOptions { bool enable_fc_padding; bool enable_profile; ModelOptions() - : model_path("avg_1.jit.pdmodel"), - param_path("avg_1.jit.pdiparams"), + : model_path(""), + param_path(""), thread_num(2), use_gpu(false), - input_names( "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_" "box"), - output_names( "save_infer_model/scale_0.tmp_1,save_infer_model/" "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/" "scale_3.tmp_1"), - cache_names("chunk_state_h_box,chunk_state_c_box"), - cache_shape("3-1-1024,3-1-1024"), + input_names(""), 
output_names(""), + cache_names(""), + cache_shape(""), switch_ir_optim(false), enable_fc_padding(false), enable_profile(false) {} @@ -76,17 +71,19 @@ class Tensor { public: Tensor() {} Tensor(const std::vector& shape) : _shape(shape) { - int data_size = std::accumulate( + int numel = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies()); - LOG(INFO) << "data size: " << data_size; - _data.resize(data_size, 0); + LOG(INFO) << "Tensor numel: " << numel; + _data.resize(numel, 0); } + void reshape(const std::vector& shape) { _shape = shape; - int data_size = std::accumulate( + int numel = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies()); - _data.resize(data_size, 0); + _data.resize(numel, 0); } + const std::vector& get_shape() const { return _shape; } std::vector& get_data() { return _data; } @@ -98,10 +95,12 @@ class PaddleNnet : public NnetInterface { public: PaddleNnet(const ModelOptions& opts); + virtual void FeedForward(const kaldi::Vector& features, int32 feature_dim, kaldi::Vector* inferences, int32* inference_dim); + void Dim(); virtual void Reset(); std::shared_ptr> GetCacheEncoder(