diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh
index a911eb033..1a9e3cd7e 100755
--- a/speechx/examples/codelab/decoder/run.sh
+++ b/speechx/examples/codelab/decoder/run.sh
@@ -69,7 +69,7 @@ compute_linear_spectrogram_main \
 echo "compute linear spectrogram feature."
 
 # run ctc beam search decoder as streaming
-ctc_prefix_beam_search_decoder_main \
+ctc_beam_search_decoder_main \
     --result_wspecifier=ark,t:$exp_dir/result.txt \
     --feature_rspecifier=ark:$feat_wspecifier \
     --model_path=$model_dir/avg_1.jit.pdmodel \
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index a29be17bf..e5fccc03d 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -84,7 +84,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # recognizer
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
@@ -103,7 +103,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # decode with lm
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 720728354..88ed62872 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -84,7 +84,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # recognizer
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --param_path=$model_dir/avg_5.jit.pdiparams \
@@ -102,7 +102,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # decode with lm
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --param_path=$model_dir/avg_5.jit.pdiparams \
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
index 1df935112..8d04a9976 100644
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -12,7 +12,7 @@ add_library(decoder STATIC
 target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
 
 set(BINS
-    ctc_prefix_beam_search_decoder_main
+    ctc_beam_search_decoder_main
     nnet_logprob_decoder_main
     recognizer_main
     tlg_decoder_main
diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h
index 9d0a5d142..19dbf2f69 100644
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+// used by deepspeech2
+
 #include "base/common.h"
 #include "decoder/ctc_decoders/path_trie.h"
 #include "decoder/ctc_decoders/scorer.h"
diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc
similarity index 99%
rename from speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
rename to speechx/speechx/decoder/ctc_beam_search_decoder_main.cc
index 445f470f9..7e245e9b8 100644
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// todo refactor, repalce with gtest
+// used by deepspeech2
 
 #include "base/flags.h"
 #include "base/log.h"
diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search.cc b/speechx/speechx/decoder/ctc_prefix_beam_search.cc
new file mode 100644
index 000000000..e69de29bb
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index ed895aeda..8a5990dc8 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -67,6 +67,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
     frame_opts.dither = 0.0;
     frame_opts.frame_shift_ms = 10;
     opts.use_fbank = FLAGS_use_fbank;
+    LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear");
     if (opts.use_fbank) {
         opts.to_float32 = false;
         frame_opts.window_type = "povey";
diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc
index 1483949b9..b76c6280a 100644
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -157,4 +157,10 @@ void Decodable::Reset() {
     nnet_out_cache_.Resize(0, 0);
 }
 
+void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                   float reverse_weight,
+                                   std::vector<float>* rescoring_score) {
+    nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
+}
+
 }  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h
index 1ee6afbf8..bfb750675 100644
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -30,23 +30,31 @@ class Decodable : public kaldi::DecodableInterface {
     // void Init(DecodableOpts config);
 
-    // nnet logprob output
+    // nnet logprob output, used by wfst
     virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
 
+    // nnet output
+    virtual bool FrameLikelihood(int32 frame,
+                                 std::vector<kaldi::BaseFloat>* likelihood);
+
+    // forward nnet with feats
+    bool AdvanceChunk();
+
+    // forward nnet with feats, and get nnet output
+    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
+                      int* vocab_dim);
+
+    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                            float reverse_weight,
+                            std::vector<float>* rescoring_score);
+
     virtual bool IsLastFrame(int32 frame);
 
     // nnet output dim, e.g. vocab size
     virtual int32 NumIndices() const;
 
-    // nnet prob output
-    virtual bool FrameLikelihood(int32 frame,
-                                 std::vector<kaldi::BaseFloat>* likelihood);
-
     virtual int32 NumFramesReady() const;
 
-    // for offline test
-    void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
-
     void Reset();
 
     bool IsInputFinished() const { return frontend_->IsFinished(); }
@@ -57,11 +65,8 @@ class Decodable : public kaldi::DecodableInterface {
 
     std::shared_ptr<NnetInterface> Nnet() { return nnet_; }
 
-    // forward nnet with feats
-    bool AdvanceChunk();
-
-    // forward nnet with feats, and get nnet output
-    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
-                      int* vocab_dim);
+    // for offline test
+    void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
 
   private:
     std::shared_ptr<FrontendInterface> frontend_;
diff --git a/speechx/speechx/nnet/ds2_nnet.h b/speechx/speechx/nnet/ds2_nnet.h
index 9e2cb77b7..cd1648b44 100644
--- a/speechx/speechx/nnet/ds2_nnet.h
+++ b/speechx/speechx/nnet/ds2_nnet.h
@@ -15,56 +15,11 @@
 #include 
 #include "base/common.h"
 #include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/options-itf.h"
 #include "nnet/nnet_itf.h"
 #include "paddle_inference_api.h"
 
 namespace ppspeech {
 
-struct ModelOptions {
-    std::string model_path;
-    std::string param_path;
-    int thread_num;  // predictor thread pool size
-    bool use_gpu;
-    bool switch_ir_optim;
-    std::string input_names;
-    std::string output_names;
-    std::string cache_names;
-    std::string cache_shape;
-    bool enable_fc_padding;
-    bool enable_profile;
-    ModelOptions()
-        : model_path(""),
-          param_path(""),
-          thread_num(2),
-          use_gpu(false),
-          input_names(""),
-          output_names(""),
-          cache_names(""),
-          cache_shape(""),
-          switch_ir_optim(false),
-          enable_fc_padding(false),
-          enable_profile(false) {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("model-path", &model_path, "model file path");
-        opts->Register("model-param", &param_path, "params model file path");
-        opts->Register("thread-num", &thread_num, "thread num");
-        opts->Register("use-gpu", &use_gpu, "if use gpu");
-        opts->Register("input-names", &input_names, "paddle input names");
-        opts->Register("output-names", &output_names, "paddle output names");
-        opts->Register("cache-names", &cache_names, "cache names");
-        opts->Register("cache-shape", &cache_shape, "cache shape");
-        opts->Register("switch-ir-optiom",
-                       &switch_ir_optim,
-                       "paddle SwitchIrOptim option");
-        opts->Register("enable-fc-padding",
-                       &enable_fc_padding,
-                       "paddle EnableFCPadding option");
-        opts->Register(
-            "enable-profile", &enable_profile, "paddle EnableProfile option");
-    }
-};
 
 template <typename T>
 class Tensor {
@@ -100,6 +55,12 @@ class PaddleNnet : public NnetInterface {
                      const int32& feature_dim,
                      NnetOut* out) override;
 
+    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                            float reverse_weight,
+                            std::vector<float>* rescoring_score) override {
+        VLOG(2) << "deepspeech2 has no AttentionRescoring.";
+    }
+
     void Dim();
 
     void Reset() override;
diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h
index d05aabea4..2e21ff9bf 100644
--- a/speechx/speechx/nnet/nnet_itf.h
+++ b/speechx/speechx/nnet/nnet_itf.h
@@ -18,9 +18,56 @@
 #include "base/basic_types.h"
 #include "kaldi/base/kaldi-types.h"
 #include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/options-itf.h"
 
 namespace ppspeech {
 
+
+struct ModelOptions {
+    std::string model_path;
+    std::string param_path;
+    int thread_num;  // predictor thread pool size for ds2;
+    bool use_gpu;
+    bool switch_ir_optim;
+    std::string input_names;
+    std::string output_names;
+    std::string cache_names;
+    std::string cache_shape;
+    bool enable_fc_padding;
+    bool enable_profile;
+    ModelOptions()
+        : model_path(""),
+          param_path(""),
+          thread_num(1),
+          use_gpu(false),
+          input_names(""),
+          output_names(""),
+          cache_names(""),
+          cache_shape(""),
+          switch_ir_optim(false),
+          enable_fc_padding(false),
+          enable_profile(false) {}
+
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("model-path", &model_path, "model file path");
+        opts->Register("model-param", &param_path, "params model file path");
+        opts->Register("thread-num", &thread_num, "thread num");
+        opts->Register("use-gpu", &use_gpu, "if use gpu");
+        opts->Register("input-names", &input_names, "paddle input names");
+        opts->Register("output-names", &output_names, "paddle output names");
+        opts->Register("cache-names", &cache_names, "cache names");
+        opts->Register("cache-shape", &cache_shape, "cache shape");
+        opts->Register("switch-ir-optiom",
+                       &switch_ir_optim,
+                       "paddle SwitchIrOptim option");
+        opts->Register("enable-fc-padding",
+                       &enable_fc_padding,
+                       "paddle EnableFCPadding option");
+        opts->Register(
+            "enable-profile", &enable_profile, "paddle EnableProfile option");
+    }
+};
+
 struct NnetOut {
     // nnet out. maybe logprob or prob. Almost time this is logprob.
     kaldi::Vector<kaldi::BaseFloat> logprobs;
@@ -45,6 +92,10 @@ class NnetInterface {
                              const int32& feature_dim,
                              NnetOut* out) = 0;
 
+    virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                    float reverse_weight,
+                                    std::vector<float>* rescoring_score) = 0;
+
     // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_.
     virtual void Reset() = 0;
diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc
index 74f8cf788..71252477e 100644
--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@@ -166,7 +166,7 @@ void U2Nnet::Warmup() {
     Reset();
 }
 
-U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {
+U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) {
     LoadModel(opts_.model_path);
 }
diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h
index 4ecbac26f..1bac652e8 100644
--- a/speechx/speechx/nnet/u2_nnet.h
+++ b/speechx/speechx/nnet/u2_nnet.h
@@ -17,28 +17,14 @@
 #include "base/common.h"
 #include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/options-itf.h"
-#include "nnet/nnet_itf.h"
+#include "nnet/nnet_itf.h"
 #include "paddle/extension.h"
 #include "paddle/jit/all.h"
 #include "paddle/phi/api/all.h"
 
 namespace ppspeech {
 
-struct U2ModelOptions {
-    std::string model_path;
-    int thread_num;
-    bool use_gpu;
-    U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {}
-
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("model-path", &model_path, "model file path");
-        opts->Register("thread-num", &thread_num, "thread num");
-        opts->Register("use-gpu", &use_gpu, "if use gpu");
-    }
-};
-
 
 class U2NnetBase : public NnetInterface {
   public:
@@ -65,10 +51,6 @@ class U2NnetBase : public NnetInterface {
                               std::vector<kaldi::BaseFloat>* ctc_probs,
                               int32* vocab_dim);
 
-    virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
-                                    float reverse_weight,
-                                    std::vector<float>* rescoring_score) = 0;
-
   protected:
     virtual void ForwardEncoderChunkImpl(
         const std::vector<kaldi::BaseFloat>& chunk_feats,
@@ -102,7 +84,7 @@ class U2NnetBase : public NnetInterface {
 
 class U2Nnet : public U2NnetBase {
   public:
-    U2Nnet(const U2ModelOptions& opts);
+    U2Nnet(const ModelOptions& opts);
     U2Nnet(const U2Nnet& other);
 
     void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
@@ -143,7 +125,7 @@ class U2Nnet : public U2NnetBase {
         std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) const;
 
   private:
-    U2ModelOptions opts_;
+    ModelOptions opts_;
     phi::Place dev_;
     std::shared_ptr<paddle::jit::Layer> model_{nullptr};
diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc
index 0c5aed54e..2dd1fa0d3 100644
--- a/speechx/speechx/nnet/u2_nnet_main.cc
+++ b/speechx/speechx/nnet/u2_nnet_main.cc
@@ -58,7 +58,7 @@ int main(int argc, char* argv[]) {
     kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
     kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier);
 
-    ppspeech::U2ModelOptions model_opts;
+    ppspeech::ModelOptions model_opts;
     model_opts.model_path = FLAGS_model_path;
 
     int32 chunk_size =
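
Summary of the API reshuffle in this patch: the DeepSpeech2 CTC beam-search binary is renamed from ctc_prefix_beam_search_decoder_main to ctc_beam_search_decoder_main, ModelOptions moves out of ds2_nnet.h into the shared nnet_itf.h (replacing U2ModelOptions everywhere), and AttentionRescoring becomes a pure-virtual on NnetInterface so Decodable can forward it to the underlying nnet; PaddleNnet (DeepSpeech2) implements it as a no-op that only logs. Below is a minimal caller-side sketch of the resulting surface, assuming this speechx tree builds as-is; the model path, hypothesis token ids, and reverse_weight value are illustrative only and are not taken from the patch.

// Sketch only: exercises ModelOptions (now declared in nnet/nnet_itf.h) and
// the NnetInterface::AttentionRescoring hook introduced by this patch.
// Illustrative values throughout; not a test from the repository.
#include <vector>

#include "nnet/u2_nnet.h"  // brings in ModelOptions via nnet/nnet_itf.h

void RescoreNBestSketch() {
    ppspeech::ModelOptions opts;            // shared struct, was U2ModelOptions
    opts.model_path = "exp/u2/export.jit";  // illustrative path to a jit model

    ppspeech::U2Nnet nnet(opts);  // ctor now takes ModelOptions

    // n-best hypotheses (token ids) from the CTC prefix beam search.
    std::vector<std::vector<int>> hyps = {{12, 34, 56}, {12, 34, 57}};
    std::vector<float> rescoring_score;

    // Decodable::AttentionRescoring simply forwards to this call on the nnet;
    // for PaddleNnet (deepspeech2) the same call only logs and returns.
    nnet.AttentionRescoring(hyps, /*reverse_weight=*/0.3f, &rescoring_score);
}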