// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the
// "COPYING.LESSER.3");

#ifndef SCORER_H_
#define SCORER_H_

#include &lt;memory&gt;
#include &lt;string&gt;
#include &lt;unordered_map&gt;
#include &lt;vector&gt;

#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"

#include "path_trie.h"

const double OOV_SCORE = -1000.0;
const std::string START_TOKEN = "&lt;s&gt;";
const std::string UNK_TOKEN = "&lt;unk&gt;";
const std::string END_TOKEN = "&lt;/s&gt;";

// Implement a callback to retrieve the dictionary of the language model.
class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
public:
  RetriveStrEnumerateVocab() {}

  void Add(lm::WordIndex index, const StringPiece &str) {
    vocabulary.push_back(std::string(str.data(), str.length()));
  }

  std::vector&lt;std::string&gt; vocabulary;
};

/* External scorer to query score for n-gram or sentence, including language
 * model scoring and word insertion.
 *
 * Example:
 *     Scorer scorer(alpha, beta, "path_of_language_model", vocabulary);
 *     scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
 *     scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
 */
class Scorer {
public:
  Scorer(double alpha,
         double beta,
         const std::string &lm_path,
         const std::vector&lt;std::string&gt; &vocabulary);
  ~Scorer();

  double get_log_cond_prob(const std::vector&lt;std::string&gt; &words);

  double get_sent_log_prob(const std::vector&lt;std::string&gt; &words);

  // return the max order
  size_t get_max_order() const { return max_order_; }

  // return the dictionary size of the language model
  size_t get_dict_size() const { return dict_size_; }

  // return true if the language model is character based
  bool is_character_based() const { return is_character_based_; }

  // reset params alpha & beta
  void reset_params(float alpha, float beta);

  // make ngram for a given prefix
  std::vector&lt;std::string&gt; make_ngram(PathTrie *prefix);

  // transform the labels in index to the vector of words (word based lm) or
  // the vector of characters (character based lm)
  std::vector&lt;std::string&gt; split_labels(const std::vector&lt;int&gt; &labels);

  // language model weight
  double alpha;
  // word insertion weight
  double beta;

  // pointer to the dictionary of FST
  void *dictionary;

protected:
  // necessary setup: load language model, set char map, fill FST's dictionary
  void setup(const std::string &lm_path,
             const std::vector&lt;std::string&gt; &vocab_list);

  // load language model from the given path
  void load_lm(const std::string &lm_path);

  // fill the dictionary for FST
  void fill_dictionary(bool add_space);

  // set char map
  void set_char_map(const std::vector&lt;std::string&gt; &char_list);

  double get_log_prob(const std::vector&lt;std::string&gt; &words);

  // translate the vector in index to string
  std::string vec2str(const std::vector&lt;int&gt; &input);

private:
  void *language_model_;
  bool is_character_based_;
  size_t max_order_;
  size_t dict_size_;

  int SPACE_ID_;
  std::vector&lt;std::string&gt; char_list_;
  std::unordered_map&lt;std::string, int&gt; char_map_;

  std::vector&lt;std::string&gt; vocabulary_;
};

#endif  // SCORER_H_
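
// ---------------------------------------------------------------------------
// Illustrative usage (a minimal sketch, not part of this header). The LM path
// "lm.binary", the label list, and the alpha/beta values below are placeholder
// assumptions for the example, not recommended settings.
//
//   #include &lt;iostream&gt;
//   #include "scorer.h"
//
//   int main() {
//     // CTC output labels; a real setup passes the full output alphabet,
//     // including " " as the word separator. Only a few entries shown here.
//     std::vector&lt;std::string&gt; labels = {"h", "e", "l", "o", "w", "r", "d", " "};
//     Scorer scorer(0.5 /* alpha */, 1.0 /* beta */, "lm.binary", labels);
//
//     // Conditional log prob of the last word given the preceding context,
//     // and the total log prob of the sentence.
//     double cond = scorer.get_log_cond_prob({"hello", "world"});
//     double sent = scorer.get_sent_log_prob({"hello", "world"});
//     std::cout &lt;&lt; cond &lt;&lt; " " &lt;&lt; sent &lt;&lt; std::endl;
//
//     // Weights can be re-tuned without reloading the language model.
//     scorer.reset_params(0.75f, 1.85f);
//     return 0;
//   }
// ---------------------------------------------------------------------------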