unify model opts; add attention rescore in decodable; rename ds2 ctc beam search

pull/2524/head
Hui Zhang 3 years ago
parent 6987751ff8
commit 5c8725e8cd

@ -69,7 +69,7 @@ compute_linear_spectrogram_main \
echo "compute linear spectrogram feature."
# run ctc beam search decoder as streaming
ctc_prefix_beam_search_decoder_main \
ctc_beam_search_decoder_main \
--result_wspecifier=ark,t:$exp_dir/result.txt \
--feature_rspecifier=ark:$feat_wspecifier \
--model_path=$model_dir/avg_1.jit.pdmodel \

@ -84,7 +84,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
ctc_prefix_beam_search_decoder_main \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
@ -103,7 +103,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc_prefix_beam_search_decoder_main \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \

@ -84,7 +84,7 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# recognizer
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
ctc_prefix_beam_search_decoder_main \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \
@ -102,7 +102,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
ctc_prefix_beam_search_decoder_main \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \

@ -12,7 +12,7 @@ add_library(decoder STATIC
target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
set(BINS
ctc_prefix_beam_search_decoder_main
ctc_beam_search_decoder_main
nnet_logprob_decoder_main
recognizer_main
tlg_decoder_main

@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// used by deepspeech2
#include "base/common.h"
#include "decoder/ctc_decoders/path_trie.h"
#include "decoder/ctc_decoders/scorer.h"

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// todo refactor, replace with gtest
// used by deepspeech2
#include "base/flags.h"
#include "base/log.h"

@ -67,6 +67,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
frame_opts.dither = 0.0;
frame_opts.frame_shift_ms = 10;
opts.use_fbank = FLAGS_use_fbank;
// The conditional must be parenthesized: `<<` binds tighter than `?:`, so
// without the parentheses the bool itself is streamed and the selected
// "fbank"/"linear" string is evaluated and then discarded.
LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear");
if (opts.use_fbank) {
opts.to_float32 = false;
frame_opts.window_type = "povey";

@ -157,4 +157,10 @@ void Decodable::Reset() {
nnet_out_cache_.Resize(0, 0);
}
// Forwards an attention-rescoring request to the wrapped nnet (nnet_).
// All three arguments are passed through unchanged; this method neither
// reads nor writes `rescoring_score` itself.
// NOTE(review): nnet_ is dereferenced without a null check — confirm it is
// always set before this is called.
void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score){
nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
}
} // namespace ppspeech

@ -30,23 +30,31 @@ class Decodable : public kaldi::DecodableInterface {
// void Init(DecodableOpts config);
// nnet logprob output
// nnet logprob output, used by wfst
virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
// nnet output
virtual bool FrameLikelihood(int32 frame,
std::vector<kaldi::BaseFloat>* likelihood);
// forward nnet with feats
bool AdvanceChunk();
// forward nnet with feats, and get nnet output
bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
int* vocab_dim);
void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score);
virtual bool IsLastFrame(int32 frame);
// nnet output dim, e.g. vocab size
virtual int32 NumIndices() const;
// nnet prob output
virtual bool FrameLikelihood(int32 frame,
std::vector<kaldi::BaseFloat>* likelihood);
virtual int32 NumFramesReady() const;
// for offline test
void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
void Reset();
bool IsInputFinished() const { return frontend_->IsFinished(); }
@ -57,11 +65,8 @@ class Decodable : public kaldi::DecodableInterface {
std::shared_ptr<NnetInterface> Nnet() { return nnet_; }
// forward nnet with feats
bool AdvanceChunk();
// forward nnet with feats, and get nnet output
bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
int* vocab_dim);
// for offline test
void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
private:
std::shared_ptr<FrontendInterface> frontend_;

@ -15,56 +15,11 @@
#include <numeric>
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
#include "nnet/nnet_itf.h"
#include "paddle_inference_api.h"
namespace ppspeech {
// Model configuration options. Every field carries an in-class default, so a
// default-constructed object is fully initialized; Register() exposes each
// field as a named option on a kaldi::OptionsItf.
struct ModelOptions {
    std::string model_path;         // option "model-path"
    std::string param_path;         // option "model-param"
    int thread_num{2};              // predictor thread pool size
    bool use_gpu{false};            // option "use-gpu"
    bool switch_ir_optim{false};    // option "switch-ir-optiom"
    std::string input_names;        // option "input-names"
    std::string output_names;       // option "output-names"
    std::string cache_names;        // option "cache-names"
    std::string cache_shape;        // option "cache-shape"
    bool enable_fc_padding{false};  // option "enable-fc-padding"
    bool enable_profile{false};     // option "enable-profile"

    // Defaults live on the member declarations above. The previous
    // constructor initializer list named the members in a different order
    // than they are declared (-Wreorder); the default values are unchanged.
    ModelOptions() = default;

    // Registers every field with `opts` under its option name.
    // `opts` must be non-null; it is dereferenced unconditionally.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("model-path", &model_path, "model file path");
        opts->Register("model-param", &param_path, "params model file path");
        opts->Register("thread-num", &thread_num, "thread num");
        opts->Register("use-gpu", &use_gpu, "if use gpu");
        opts->Register("input-names", &input_names, "paddle input names");
        opts->Register("output-names", &output_names, "paddle output names");
        opts->Register("cache-names", &cache_names, "cache names");
        opts->Register("cache-shape", &cache_shape, "cache shape");
        // NOTE(review): the option name is misspelled ("optiom"). It is kept
        // byte-for-byte because existing command lines may depend on it.
        opts->Register("switch-ir-optiom",
                       &switch_ir_optim,
                       "paddle SwitchIrOptim option");
        opts->Register("enable-fc-padding",
                       &enable_fc_padding,
                       "paddle EnableFCPadding option");
        opts->Register(
            "enable-profile", &enable_profile, "paddle EnableProfile option");
    }
};
template <typename T>
class Tensor {
@ -100,6 +55,12 @@ class PaddleNnet : public NnetInterface {
const int32& feature_dim,
NnetOut* out) override;
// Stub override: this nnet provides no attention rescoring.
// It only emits a VLOG(2) message; `hyps` and `reverse_weight` are ignored
// and `rescoring_score` is left exactly as the caller passed it in.
void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) override {
VLOG(2) << "deepspeech2 not has AttentionRescoring.";
}
void Dim();
void Reset() override;

@ -18,9 +18,56 @@
#include "base/basic_types.h"
#include "kaldi/base/kaldi-types.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
namespace ppspeech {
// Model configuration options. Every field carries an in-class default, so a
// default-constructed object is fully initialized; Register() exposes each
// field as a named option on a kaldi::OptionsItf.
struct ModelOptions {
    std::string model_path;         // option "model-path"
    std::string param_path;         // option "model-param"
    int thread_num{1};              // predictor thread pool size for ds2;
    bool use_gpu{false};            // option "use-gpu"
    bool switch_ir_optim{false};    // option "switch-ir-optiom"
    std::string input_names;        // option "input-names"
    std::string output_names;       // option "output-names"
    std::string cache_names;        // option "cache-names"
    std::string cache_shape;        // option "cache-shape"
    bool enable_fc_padding{false};  // option "enable-fc-padding"
    bool enable_profile{false};     // option "enable-profile"

    // Defaults live on the member declarations above. The previous
    // constructor initializer list named the members in a different order
    // than they are declared (-Wreorder); the default values are unchanged.
    ModelOptions() = default;

    // Registers every field with `opts` under its option name.
    // `opts` must be non-null; it is dereferenced unconditionally.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("model-path", &model_path, "model file path");
        opts->Register("model-param", &param_path, "params model file path");
        opts->Register("thread-num", &thread_num, "thread num");
        opts->Register("use-gpu", &use_gpu, "if use gpu");
        opts->Register("input-names", &input_names, "paddle input names");
        opts->Register("output-names", &output_names, "paddle output names");
        opts->Register("cache-names", &cache_names, "cache names");
        opts->Register("cache-shape", &cache_shape, "cache shape");
        // NOTE(review): the option name is misspelled ("optiom"). It is kept
        // byte-for-byte because existing command lines may depend on it.
        opts->Register("switch-ir-optiom",
                       &switch_ir_optim,
                       "paddle SwitchIrOptim option");
        opts->Register("enable-fc-padding",
                       &enable_fc_padding,
                       "paddle EnableFCPadding option");
        opts->Register(
            "enable-profile", &enable_profile, "paddle EnableProfile option");
    }
};
struct NnetOut {
// nnet output; may be logprob or prob. Most of the time this is logprob.
kaldi::Vector<kaldi::BaseFloat> logprobs;
@ -45,6 +92,10 @@ class NnetInterface {
const int32& feature_dim,
NnetOut* out) = 0;
// Pure-virtual hook for attention-based rescoring; every concrete nnet must
// override it.
// hyps: hypotheses as sequences of ints (presumably token ids — confirm).
// reverse_weight: presumably the weight of a right-to-left decoder score —
//   TODO confirm against the implementations.
// rescoring_score: output pointer; what is written to it (if anything) is
//   defined by the overriding class.
virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) = 0;
// reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_.
virtual void Reset() = 0;

@ -166,7 +166,7 @@ void U2Nnet::Warmup() {
Reset();
}
U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {
// Constructs the nnet from `opts`: keeps a copy in opts_ and immediately
// loads the model found at opts_.model_path.
U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) {
LoadModel(opts_.model_path);
}

@ -17,28 +17,14 @@
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
#include "nnet/nnet_itf.h"
#include "nnet/nnet_itf.h"
#include "paddle/extension.h"
#include "paddle/jit/all.h"
#include "paddle/phi/api/all.h"
namespace ppspeech {
// Options for a U2 model: where the model file is, how many threads to use
// and whether to use the GPU. Register() exposes each field as a named
// option on a kaldi::OptionsItf.
struct U2ModelOptions {
    std::string model_path{""};  // option "model-path"
    int thread_num{1};           // option "thread-num"
    bool use_gpu{false};         // option "use-gpu"

    // Defaults are supplied by the member initializers above.
    U2ModelOptions() = default;

    // Registers every field with `opts`; `opts` is dereferenced
    // unconditionally.
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("model-path", &model_path, "model file path");
        opts->Register("thread-num", &thread_num, "thread num");
        opts->Register("use-gpu", &use_gpu, "if use gpu");
    }
};
class U2NnetBase : public NnetInterface {
public:
@ -65,10 +51,6 @@ class U2NnetBase : public NnetInterface {
std::vector<kaldi::BaseFloat>* ctc_probs,
int32* vocab_dim);
virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
float reverse_weight,
std::vector<float>* rescoring_score) = 0;
protected:
virtual void ForwardEncoderChunkImpl(
const std::vector<kaldi::BaseFloat>& chunk_feats,
@ -102,7 +84,7 @@ class U2NnetBase : public NnetInterface {
class U2Nnet : public U2NnetBase {
public:
U2Nnet(const U2ModelOptions& opts);
U2Nnet(const ModelOptions& opts);
U2Nnet(const U2Nnet& other);
void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
@ -143,7 +125,7 @@ class U2Nnet : public U2NnetBase {
std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) const;
private:
U2ModelOptions opts_;
ModelOptions opts_;
phi::Place dev_;
std::shared_ptr<paddle::jit::Layer> model_{nullptr};

@ -58,7 +58,7 @@ int main(int argc, char* argv[]) {
kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier);
ppspeech::U2ModelOptions model_opts;
ppspeech::ModelOptions model_opts;
model_opts.model_path = FLAGS_model_path;
int32 chunk_size =

Loading…
Cancel
Save