refactor ctc opts, extract decoder interface, add ctc beamsearch score

3 years ago · bc1b6c2e7c
parent 5c8725e8cd
commit bc1b6c2e7c
16 changed files with 351 additions and 78 deletions
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@ -135,7 +135,7 @@ fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    #  TLG decoder
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
-    tlg_decoder_main \
+    ctc_tlg_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@ -133,7 +133,7 @@ fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    #  TLG decoder
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
-    tlg_decoder_main \
+    ctc_tlg_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --param_path=$model_dir/avg_5.jit.pdiparams \
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@ -15,7 +15,7 @@ set(BINS
  ctc_beam_search_decoder_main
  nnet_logprob_decoder_main
  recognizer_main
-  tlg_decoder_main
+  ctc_tlg_decoder_main
 )
 foreach(bin_name IN LISTS BINS)
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc
@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "decoder/ctc_beam_search_decoder.h"
-#include "base/basic_types.h"
+#include "base/common.h"
 #include "decoder/ctc_decoders/decoder_utils.h"
 #include "decoder/ctc_beam_search_decoder.h"
 #include "utils/file_utils.h"
 namespace ppspeech {
@ -26,7 +26,7 @@ using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
 CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
    : opts_(opts),
      init_ext_scorer_(nullptr),
-      blank_id_(-1),
+      blank_id_(opts.blank),
      space_id_(-1),
      num_frame_decoded_(0),
      root_(nullptr) {
@ -43,9 +43,9 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
            opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_);
    }
-    blank_id_ = 0;
+    CHECK(blank_id_==0);
    auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " ");
    auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " ");
    space_id_ = it - vocabulary_.begin();
    // if no space in vocabulary
    if ((size_t)space_id_ >= vocabulary_.size()) {
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h
@ -14,67 +14,48 @@
 // used by deepspeech2
-#include "base/common.h"
+#pragma once
 #include "decoder/ctc_beam_search_opt.h"
 #include "decoder/ctc_decoders/path_trie.h"
 #include "decoder/ctc_decoders/scorer.h"
-#include "kaldi/decoder/decodable-itf.h"
+#include "decoder/decoder_itf.h"
 #include "util/parse-options.h"
 #pragma once
 namespace ppspeech {
-struct CTCBeamSearchOptions {
+class CTCBeamSearch : public DecoderInterface {
    std::string dict_file;
    std::string lm_path;
    BaseFloat alpha;
    BaseFloat beta;
    BaseFloat cutoff_prob;
    int beam_size;
    int cutoff_top_n;
    int num_proc_bsearch;
    CTCBeamSearchOptions()
        : dict_file("vocab.txt"),
          lm_path(""),
          alpha(1.9f),
          beta(5.0),
          beam_size(300),
          cutoff_prob(0.99f),
          cutoff_top_n(40),
          num_proc_bsearch(10) {}
    void Register(kaldi::OptionsItf* opts) {
        opts->Register("dict", &dict_file, "dict file ");
        opts->Register("lm-path", &lm_path, "language model file");
        opts->Register("alpha", &alpha, "alpha");
        opts->Register("beta", &beta, "beta");
        opts->Register(
            "beam-size", &beam_size, "beam size for beam search method");
        opts->Register("cutoff-prob", &cutoff_prob, "cutoff probs");
        opts->Register("cutoff-top-n", &cutoff_top_n, "cutoff top n");
        opts->Register(
            "num-proc-bsearch", &num_proc_bsearch, "num proc bsearch");
    }
 };
 class CTCBeamSearch {
  public:
    explicit CTCBeamSearch(const CTCBeamSearchOptions& opts);
    ~CTCBeamSearch() {}
    void InitDecoder();
    void Reset();
    void AdvanceDecode(
        const std::shared_ptr<kaldi::DecodableInterface>& decodable);
    std::string GetFinalBestPath();
    std::string GetPartialResult() {
        CHECK(false) << "Not implement.";
        return {};
    }
    void Decode(std::shared_ptr<kaldi::DecodableInterface> decodable);
    std::string GetBestPath();
    std::vector<std::pair<double, std::string>> GetNBestPath();
-    std::string GetFinalBestPath();
+
    int NumFrameDecoded();
    int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
                          std::vector<std::string>& nbest_words);
    void AdvanceDecode(
        const std::shared_ptr<kaldi::DecodableInterface>& decodable);
    void Reset();
  private:
    void ResetPrefixes();
    int32 SearchOneChar(const bool& full_beam,
                        const std::pair<size_t, BaseFloat>& log_prob_idx,
                        const BaseFloat& min_cutoff);
@ -93,4 +74,4 @@ class CTCBeamSearch {
    DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch);
 };
-}  // namespace basr
+}  // namespace ppspeech
--- a/speechx/speechx/decoder/ctc_beam_search_opt.h
+++ b/speechx/speechx/decoder/ctc_beam_search_opt.h
@ -0,0 +1,78 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "base/common.h"
 #include "util/parse-options.h"
 #pragma once
 namespace ppspeech {
 // Options for the CTC beam search decoders. Every field has a default set
 // by the constructor and can be overridden from the command line once
 // Register() has been called on an options parser.
 struct CTCBeamSearchOptions {
    // common
    int blank;  // blank id, default is 0

    // ds2
    std::string dict_file;  // vocab file path
    std::string lm_path;    // ngram language model path
    int beam_size;          // beam size for beam search method
    // NOTE(review): alpha/beta are presumably the language-model and
    // word-insertion weights handed to the scorer -- confirm against the
    // scorer implementation.
    BaseFloat alpha;
    BaseFloat beta;
    BaseFloat cutoff_prob;
    int cutoff_top_n;
    int num_proc_bsearch;

    // u2
    int first_beam_size;
    int second_beam_size;

    // Members are always initialized in declaration order, so the
    // initializer list is written in that same order (beam_size before
    // alpha/beta) to avoid -Wreorder warnings and a misleading reading order.
    CTCBeamSearchOptions()
        : blank(0),
          dict_file("vocab.txt"),
          lm_path(""),
          beam_size(300),
          alpha(1.9f),
          beta(5.0),
          cutoff_prob(0.99f),
          cutoff_top_n(40),
          num_proc_bsearch(10),
          first_beam_size(10),
          second_beam_size(10) {}

    // Registers every option with `opts` so it can be set by name.
    // The help text of each option is prefixed with the config group
    // ("Ds2BeamSearchConfig: " or "U2BeamSearchConfig: ") it belongs to;
    // the common "blank" option carries no prefix.
    void Register(kaldi::OptionsItf* opts) {
        std::string module = "Ds2BeamSearchConfig: ";
        opts->Register("dict", &dict_file, module + "vocab file path.");
        opts->Register(
            "lm-path", &lm_path, module + "ngram language model path.");
        opts->Register("alpha", &alpha, module + "alpha");
        opts->Register("beta", &beta, module + "beta");
        opts->Register("beam-size",
                       &beam_size,
                       module + "beam size for beam search method");
        opts->Register("cutoff-prob", &cutoff_prob, module + "cutoff probs");
        opts->Register("cutoff-top-n", &cutoff_top_n, module + "cutoff top n");
        opts->Register(
            "num-proc-bsearch", &num_proc_bsearch, module + "num proc bsearch");

        opts->Register("blank", &blank, "blank id, default is 0.");

        module = "U2BeamSearchConfig: ";
        opts->Register(
            "first-beam-size", &first_beam_size, module + "first beam size.");
        opts->Register("second-beam-size",
                       &second_beam_size,
                       module + "second beam size.");
    }
 };
 }  // namespace ppspeech
--- a/speechx/speechx/decoder/ctc_prefix_beam_search.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search.cc
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc
@ -0,0 +1,13 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h
@ -0,0 +1,64 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "decoder/ctc_beam_search_opt.h"
 #include "decoder/ctc_prefix_beam_search_score.h"
 #include "decoder/decoder_itf.h"
 #include "kaldi/decoder/decodable-itf.h"
 namespace ppspeech {
 // CTC prefix beam search decoder, configured through CTCBeamSearchOptions.
 // Implements the DecoderInterface entry points (InitDecoder, Reset,
 // AdvanceDecode, GetFinalBestPath, GetPartialResult) and additionally
 // declares the batch-style helpers shared with the other CTC decoders.
 class CTCPrefixBeamSearch : public DecoderInterface {
  public:
    explicit CTCPrefixBeamSearch(const CTCBeamSearchOptions& opts);
    ~CTCPrefixBeamSearch() {}

    // DecoderInterface entry points.
    void InitDecoder();
    void Reset();
    void AdvanceDecode(
        const std::shared_ptr<kaldi::DecodableInterface>& decodable);
    std::string GetFinalBestPath();
    // DecoderInterface declares GetPartialResult() as pure virtual; without
    // this declaration the class would remain abstract and could not be
    // instantiated.
    std::string GetPartialResult();

    // Additional decoding helpers that are not part of DecoderInterface.
    void Decode(std::shared_ptr<kaldi::DecodableInterface> decodable);
    std::string GetBestPath();
    std::vector<std::pair<double, std::string>> GetNBestPath();
    int NumFrameDecoded();
    int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
                          std::vector<std::string>& nbest_words);

  private:
    void ResetPrefixes();
    int32 SearchOneChar(const bool& full_beam,
                        const std::pair<size_t, BaseFloat>& log_prob_idx,
                        const BaseFloat& min_cutoff);
    void CalculateApproxScore();
    void LMRescore();
    void AdvanceDecoding(const std::vector<std::vector<BaseFloat>>& probs);

    CTCBeamSearchOptions opts_;  // decoding configuration
    size_t blank_id_;            // id of the CTC blank symbol
    int num_frame_decoded_;      // number of frames decoded so far

    DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch);
 };
 }  // namespace ppspeech
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
@ -0,0 +1,68 @@
 // Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "base/common.h"
 #include "utils/math.h"
 namespace ppspeech {
 // Per-prefix bookkeeping for CTC prefix beam search. All scores are kept in
 // the log domain and start at -kFloatMax.
 struct PrefxiScore {
    // Decoding scores (log scale).
    float b = -kFloatMax;   // score of paths ending in blank
    float nb = -kFloatMax;  // score of paths ending in non-blank

    // Timestamp tracking (log scale).
    float v_b = -kFloatMax;             // viterbi score, blank ending
    float v_nb = -kFloatMax;            // viterbi score, non-blank ending
    float cur_token_prob = -kFloatMax;  // prob of current token
    std::vector<int> times_b;           // times of viterbi blank path
    std::vector<int> times_nb;          // times of viterbi non-blank path

    // Context state.
    bool has_context = false;
    int context_state = 0;
    float context_score = 0;

    // Decoding score: log-sum of the blank and non-blank ending scores.
    float Score() const {
        const float combined = LogSumExp(b, nb);
        return combined;
    }

    // Decoding score with the context bias added.
    float TotalScore() const {
        const float decoding_score = Score();
        return decoding_score + context_score;
    }

    // Timestamp score: the larger of the two viterbi scores.
    float ViterbiScore() const { return (v_b < v_nb) ? v_nb : v_b; }

    // Timestamps of the better viterbi path; the non-blank path is
    // returned on ties.
    const std::vector<int>& Times() const {
        if (v_b > v_nb) {
            return times_b;
        }
        return times_nb;
    }
 };
 // Hash functor for token-id prefixes, so a std::vector<int> can be used as
 // a key in unordered containers.
 struct PrefixScoreHash {
    // Combines the elements in order, seeded with the prefix length. See
    // https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector
    std::size_t operator()(const std::vector<int>& prefix) const {
        std::size_t digest = prefix.size();
        for (std::size_t pos = 0; pos < prefix.size(); ++pos) {
            // The token id and the constant are summed in unsigned int
            // arithmetic before being widened to std::size_t.
            const unsigned int salted =
                static_cast<unsigned int>(prefix[pos]) + 0x9e3779b9u;
            digest ^= salted + (digest << 6) + (digest >> 2);
        }
        return digest;
    }
 };
 using PrefixWithScoreType = std::pair<std::vector<int>, PrefixScoreHash>;
 }  // namespace ppspeech
--- a/speechx/speechx/decoder/ctc_tlg_decoder.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc
@ -22,24 +22,24 @@ TLGDecoder::TLGDecoder(TLGDecoderOptions opts) {
        fst::SymbolTable::ReadText(opts.word_symbol_table));
    decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts));
    decoder_->InitDecoding();
-    frame_decoded_size_ = 0;
+    num_frame_decoded_ = 0;
 }
 void TLGDecoder::InitDecoder() {
    decoder_->InitDecoding();
-    frame_decoded_size_ = 0;
+    num_frame_decoded_ = 0;
 }
 void TLGDecoder::AdvanceDecode(
    const std::shared_ptr<kaldi::DecodableInterface>& decodable) {
-    while (!decodable->IsLastFrame(frame_decoded_size_)) {
+    while (!decodable->IsLastFrame(num_frame_decoded_)) {
        AdvanceDecoding(decodable.get());
    }
 }
 void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) {
    decoder_->AdvanceDecoding(decodable, 1);
-    frame_decoded_size_++;
+    num_frame_decoded_++;
 }
 void TLGDecoder::Reset() {
@ -48,7 +48,7 @@ void TLGDecoder::Reset() {
 }
 std::string TLGDecoder::GetPartialResult() {
-    if (frame_decoded_size_ == 0) {
+    if (num_frame_decoded_ == 0) {
        // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
        // BestPathEnd if no frames were decoded.")
        return std::string("");
@ -68,7 +68,7 @@ std::string TLGDecoder::GetPartialResult() {
 }
 std::string TLGDecoder::GetFinalBestPath() {
-    if (frame_decoded_size_ == 0) {
+    if (num_frame_decoded_ == 0) {
        // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
        // BestPathEnd if no frames were decoded.")
        return std::string("");
--- a/speechx/speechx/decoder/ctc_tlg_decoder.h
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.h
@ -14,8 +14,9 @@
 #pragma once
-#include "base/basic_types.h"
+#include "base/common.h"
-#include "kaldi/decoder/decodable-itf.h"
+#include "decoder/decoder_itf.h"
 #include "kaldi/decoder/lattice-faster-online-decoder.h"
 #include "util/parse-options.h"
@ -30,21 +31,31 @@ struct TLGDecoderOptions {
    TLGDecoderOptions() : word_symbol_table(""), fst_path("") {}
 };
-class TLGDecoder {
+class TLGDecoder : public DecoderInterface {
  public:
    explicit TLGDecoder(TLGDecoderOptions opts);
    ~TLGDecoder() = default;
    void InitDecoder();
    void Reset();
    void AdvanceDecode(
        const std::shared_ptr<kaldi::DecodableInterface>& decodable);
    std::string GetFinalBestPath();
    std::string GetPartialResult();
    void Decode();
    std::string GetBestPath();
    std::vector<std::pair<double, std::string>> GetNBestPath();
-    std::string GetFinalBestPath();
+
    std::string GetPartialResult();
    int NumFrameDecoded();
    int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
                          std::vector<std::string>& nbest_words);
-    void AdvanceDecode(
+
        const std::shared_ptr<kaldi::DecodableInterface>& decodable);
    void Reset();
  private:
    void AdvanceDecoding(kaldi::DecodableInterface* decodable);
@ -53,7 +64,7 @@ class TLGDecoder {
    std::shared_ptr<fst::Fst<fst::StdArc>> fst_;
    std::shared_ptr<fst::SymbolTable> word_symbol_table_;
    // the frame size which have decoded starts from 0.
-    int32 frame_decoded_size_;
+    int32 num_frame_decoded_;
 };
--- a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc
@ -14,13 +14,15 @@
 // todo refactor, replace with gtest
-#include "base/flags.h"
+#include "base/common.h"
-#include "base/log.h"
+
 #include "decoder/ctc_tlg_decoder.h"
 #include "frontend/audio/data_cache.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
 #include "nnet/ds2_nnet.h"
 #include "decoder/ctc_tlg_decoder.h"
 #include "kaldi/util/table-types.h"
 DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
 DEFINE_string(result_wspecifier, "", "test result wspecifier");
--- a/speechx/speechx/decoder/decoder_itf.h
+++ b/speechx/speechx/decoder/decoder_itf.h
@ -0,0 +1,56 @@
 // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "base/common.h"
 #include "kaldi/decoder/decodable-itf.h"
 namespace ppspeech {
 // Abstract interface implemented by the decoders in this directory. A
 // decoder is driven through a kaldi::DecodableInterface and reports its
 // hypotheses as strings.
 class DecoderInterface {
  public:
    virtual ~DecoderInterface() = default;

    // Prepares the decoder for decoding.
    virtual void InitDecoder() = 0;

    // Resets the decoder state.
    virtual void Reset() = 0;

    // Advances decoding using the given decodable.
    virtual void AdvanceDecode(
        const std::shared_ptr<kaldi::DecodableInterface>& decodable) = 0;

    // Returns the final best path as a string.
    virtual std::string GetFinalBestPath() = 0;

    // Returns the partial (intermediate) result as a string.
    virtual std::string GetPartialResult() = 0;

    // Currently not part of the interface:
    // void Decode();
    // std::string GetBestPath();
    // std::vector<std::pair<double, std::string>> GetNBestPath();
    // int NumFrameDecoded();
    // int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
    //                       std::vector<std::string>& nbest_words);

  private:
    // void AdvanceDecoding(kaldi::DecodableInterface* decodable);

    // current decoding frame number
    int32 num_frame_decoded_;
 };
 }  // namespace ppspeech
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@ -39,14 +39,14 @@ class Decodable : public kaldi::DecodableInterface {
    // forward nnet with feats
    bool AdvanceChunk();
-    
+
    // forward nnet with feats, and get nnet output
    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
                      int* vocab_dim);
-                      
+
    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
-                          float reverse_weight,
+                            float reverse_weight,
-                          std::vector<float>* rescoring_score);
+                            std::vector<float>* rescoring_score);
    virtual bool IsLastFrame(int32 frame);
--- a/speechx/speechx/nnet/ds2_nnet.h
+++ b/speechx/speechx/nnet/ds2_nnet.h
@ -56,9 +56,9 @@ class PaddleNnet : public NnetInterface {
                     NnetOut* out) override;
    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
-                                    float reverse_weight,
+                            float reverse_weight,
-                                    std::vector<float>* rescoring_score) override {
+                            std::vector<float>* rescoring_score) override {
-      VLOG(2) << "deepspeech2 not has AttentionRescoring.";
+        VLOG(2) << "deepspeech2 not has AttentionRescoring.";
    }
    void Dim();