PaddleSpeech/deepspeech/decoders/swig/scorer.h

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SCORER_H_
#define SCORER_H_

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"

#include "path_trie.h"

// Fixed score constant. Judging by its name it is the score assigned to
// out-of-vocabulary words; the use site is not in this header.
const double OOV_SCORE = -1000.0;
// Special token strings. Judging by their names: sentence start marker,
// unknown word marker and sentence end marker.
const std::string START_TOKEN = "<s>";
const std::string UNK_TOKEN = "<unk>";
const std::string END_TOKEN = "</s>";

// Callback used to retrieve the dictionary of the language model: every
// word reported through Add() is appended to `vocabulary`.
class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
public:
  RetriveStrEnumerateVocab() {}

  // Keeps a copy of the word text; the word index itself is not stored.
  void Add(lm::WordIndex index, const StringPiece &str) {
    // Build the std::string in place from (pointer, length).
    vocabulary.emplace_back(str.data(), str.length());
  }

  // Collected words, in the order Add() received them.
  std::vector<std::string> vocabulary;
};

/* External scorer to query score for n-gram or sentence, including language
 * model scoring and word insertion.
 *
 * Example:
 *     Scorer scorer(alpha, beta, "path_of_language_model", vocabulary);
 *     scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
 *     scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
 */
class Scorer {
public:
  // alpha and beta correspond to the public weights of the same names below
  // (language model weight and word insertion weight); lm_path is the path
  // of the language model.
  // NOTE(review): vocabulary is presumably the decoder's label list that is
  // handed on to setup() -- confirm in the implementation file.
  Scorer(double alpha,
         double beta,
         const std::string &lm_path,
         const std::vector<std::string> &vocabulary);
  ~Scorer();

  // query the score for an n-gram given as a sequence of words
  // NOTE(review): presumably the log probability of the last word
  // conditioned on the preceding ones -- confirm in the implementation file.
  double get_log_cond_prob(const std::vector<std::string> &words);

  // query the score for a sentence given as a sequence of words
  double get_sent_log_prob(const std::vector<std::string> &words);

  // return the max order
  size_t get_max_order() const { return max_order_; }

  // return the dictionary size of language model
  size_t get_dict_size() const { return dict_size_; }

  // return true if the language model is character based
  bool is_character_based() const { return is_character_based_; }

  // reset params alpha & beta
  // NOTE(review): the parameters are float although the alpha and beta
  // members are double.
  void reset_params(float alpha, float beta);

  // make ngram for a given prefix
  std::vector<std::string> make_ngram(PathTrie *prefix);

  // transform the labels in index to the vector of words (word based lm) or
  // the vector of characters (character based lm)
  std::vector<std::string> split_labels(const std::vector<int> &labels);

  // language model weight
  double alpha;
  // word insertion weight
  double beta;

  // pointer to the dictionary of FST, stored as an untyped void*
  // NOTE(review): the concrete type and who releases it are not visible in
  // this header -- check the implementation file.
  void *dictionary;

protected:
  // necessary setup: load language model, set char map, fill FST's dictionary
  void setup(const std::string &lm_path,
             const std::vector<std::string> &vocab_list);

  // load language model from given path
  void load_lm(const std::string &lm_path);

  // fill dictionary for FST
  // NOTE(review): add_space presumably controls whether a space label is
  // appended to each dictionary entry -- confirm in the implementation file.
  void fill_dictionary(bool add_space);

  // set char map
  void set_char_map(const std::vector<std::string> &char_list);

  // NOTE(review): presumably the shared scoring helper behind the public
  // get_*_log_prob functions -- confirm in the implementation file.
  double get_log_prob(const std::vector<std::string> &words);

  // translate the vector in index to string
  std::string vec2str(const std::vector<int> &input);

private:
  // language model handle, stored as an untyped void*; the concrete type is
  // only known to the implementation file
  void *language_model_;
  // value returned by is_character_based()
  bool is_character_based_;
  // value returned by get_max_order()
  size_t max_order_;
  // value returned by get_dict_size()
  size_t dict_size_;

  // NOTE(review): presumably the label index of the space character --
  // confirm where it is assigned.
  int SPACE_ID_;
  // NOTE(review): presumably the label list and its string -> index lookup
  // built by set_char_map() -- confirm in the implementation file.
  std::vector<std::string> char_list_;
  std::unordered_map<std::string, int> char_map_;

  // NOTE(review): presumably the word list of the loaded language model --
  // confirm in load_lm().
  std::vector<std::string> vocabulary_;
};

#endif  // SCORER_H_
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

add initial files for deployment 7 years ago			`#ifndef SCORER_H_`
			`#define SCORER_H_`

Refactor scorer and move utility functions to decoder_util.h 7 years ago			`#include <memory>`
format C++ source code 7 years ago			`#include <string>`
convert data structure for prefix from map to trie tree 7 years ago			`#include <unordered_map>`
format C++ source code 7 years ago			`#include <vector>`
format header includes & update setup info 7 years ago
Refactor scorer and move utility functions to decoder_util.h 7 years ago			`#include "lm/enumerate_vocab.hh"`
			`#include "lm/virtual_interface.hh"`
format C++ source code 7 years ago			`#include "lm/word_index.hh"`
			`#include "util/string_piece.hh"`
add initial files for deployment 7 years ago
format header includes & update setup info 7 years ago			`#include "path_trie.h"`

streamline source code 7 years ago			`const double OOV_SCORE = -1000.0;`
Refactor scorer and move utility functions to decoder_util.h 7 years ago			`const std::string START_TOKEN = "<s>";`
			`const std::string UNK_TOKEN = "<unk>";`
			`const std::string END_TOKEN = "</s>";`
add initial files for deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// Implement a callback to retrive the dictionary of language model.`
Refactor scorer and move utility functions to decoder_util.h 7 years ago			`class RetriveStrEnumerateVocab : public lm::EnumerateVocab {`
			`public:`
format C++ source code 7 years ago			`RetriveStrEnumerateVocab() {}`
add initial files for deployment 7 years ago
append some comments 7 years ago			`void Add(lm::WordIndex index, const StringPiece &str) {`
format C++ source code 7 years ago			`vocabulary.push_back(std::string(str.data(), str.length()));`
			`}`
Refactor scorer and move utility functions to decoder_util.h 7 years ago
format C++ source code 7 years ago			`std::vector<std::string> vocabulary;`
Refactor scorer and move utility functions to decoder_util.h 7 years ago			`};`
code cleanup for the deployment decoder 7 years ago
append some comments 7 years ago			`/* External scorer to query score for n-gram or sentence, including language`
			`* model scoring and word insertion.`
format header includes & update setup info 7 years ago			`*`
			`* Example:`
			`* Scorer scorer(alpha, beta, "path_of_language_model");`
			`* scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });`
			`* scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });`
			`*/`
format C++ source code 7 years ago			`class Scorer {`
add initial files for deployment 7 years ago			`public:`
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`Scorer(double alpha,`
			`double beta,`
			`const std::string &lm_path,`
			`const std::vector<std::string> &vocabulary);`
format C++ source code 7 years ago			`~Scorer();`
clean up code & update README for decoder in deployment 7 years ago
append some comments 7 years ago			`double get_log_cond_prob(const std::vector<std::string> &words);`
clean up code & update README for decoder in deployment 7 years ago
append some comments 7 years ago			`double get_sent_log_prob(const std::vector<std::string> &words);`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// return the max order`
			`size_t get_max_order() const { return max_order_; }`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// return the dictionary size of language model`
			`size_t get_dict_size() const { return dict_size_; }`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// retrun true if the language model is character based`
			`bool is_character_based() const { return is_character_based_; }`
clean up code & update README for decoder in deployment 7 years ago
format C++ source code 7 years ago			`// reset params alpha & beta`
			`void reset_params(float alpha, float beta);`
clean up code & update README for decoder in deployment 7 years ago
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`// make ngram for a given prefix`
append some comments 7 years ago			`std::vector<std::string> make_ngram(PathTrie *prefix);`
clean up code & update README for decoder in deployment 7 years ago
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`// trransform the labels in index to the vector of words (word based lm) or`
			`// the vector of characters (character based lm)`
append some comments 7 years ago			`std::vector<std::string> split_labels(const std::vector<int> &labels);`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// language model weight`
format C++ source code 7 years ago			`double alpha;`
format varabiables' name & add more comments 7 years ago			`// word insertion weight`
format C++ source code 7 years ago			`double beta;`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// pointer to the dictionary of FST`
append some comments 7 years ago			`void *dictionary;`
code cleanup for the deployment decoder 7 years ago
Refactor scorer and move utility functions to decoder_util.h 7 years ago			`protected:`
format varabiables' name & add more comments 7 years ago			`// necessary setup: load language model, set char map, fill FST's dictionary`
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`void setup(const std::string &lm_path,`
			`const std::vector<std::string> &vocab_list);`

format varabiables' name & add more comments 7 years ago			`// load language model from given path`
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`void load_lm(const std::string &lm_path);`

format varabiables' name & add more comments 7 years ago			`// fill dictionary for FST`
adjust scorer's init & add logging for scorer & separate long functions 7 years ago			`void fill_dictionary(bool add_space);`

			`// set char map`
			`void set_char_map(const std::vector<std::string> &char_list);`
clean up code & update README for decoder in deployment 7 years ago
append some comments 7 years ago			`double get_log_prob(const std::vector<std::string> &words);`
clean up code & update README for decoder in deployment 7 years ago
format varabiables' name & add more comments 7 years ago			`// translate the vector in index to string`
append some comments 7 years ago			`std::string vec2str(const std::vector<int> &input);`
Refactor scorer and move utility functions to decoder_util.h 7 years ago
			`private:`
format varabiables' name & add more comments 7 years ago			`void *language_model_;`
			`bool is_character_based_;`
			`size_t max_order_;`
			`size_t dict_size_;`
convert data structure for prefix from map to trie tree 7 years ago
format varabiables' name & add more comments 7 years ago			`int SPACE_ID_;`
			`std::vector<std::string> char_list_;`
fix decoders: force indices in FST starting from one & add version check in setup 7 years ago			`std::unordered_map<std::string, int> char_map_;`
convert data structure for prefix from map to trie tree 7 years ago
format varabiables' name & add more comments 7 years ago			`std::vector<std::string> vocabulary_;`
add initial files for deployment 7 years ago			`};`

format C++ source code 7 years ago			`#endif // SCORER_H_`