PaddleSpeech/deploy/decoder_utils.cpp

#include <limits>
#include <algorithm>
#include <cmath>
#include "decoder_utils.h"

// Counts the UTF-8 characters in `str` by counting every byte that is
// not a continuation byte (continuation bytes have the bit pattern 10xxxxxx).
size_t get_utf8_str_len(const std::string& str) {
    const auto starts_character = [](char byte) {
        return (byte & 0xc0) != 0x80;
    };
    return static_cast<size_t>(
        std::count_if(str.begin(), str.end(), starts_character));
}

//------------------------------------------------------
// Breaks `str` into one std::string per UTF-8 character
// (a character may span several chars/bytes).
//
// A new piece starts at every byte that is not a continuation
// byte (10xxxxxx). Continuation bytes are appended to the piece
// currently being built, so continuation bytes at the very start
// of the input end up together in the first piece.
// The piece under construction is always pushed at the end, so an
// empty input yields a vector holding one empty string.
//------------------------------------------------------
std::vector<std::string> UTF8_split(const std::string& str)
{
  std::vector<std::string> pieces;
  std::string current;

  for (const char byte : str)
    {
      const bool is_continuation = (byte & 0xc0) == 0x80;
      // A lead byte closes the previous character, if there is one.
      if (!is_continuation && !current.empty())
        {
          pieces.push_back(current);
          current.clear();
        }
      current.push_back(byte);
    }

  pieces.push_back(current);
  return pieces;
}

//-------------------------------------------------------
//  Ordering predicate for sorting prefixes:
//  higher _score comes first; when the scores are equal the
//  prefix with the smaller _character comes first.
//  Returns false when both fields are equal.
//-------------------------------------------------------
bool prefix_compare(const PathTrie* x,  const PathTrie* y) {
    if (x->_score != y->_score) {
        return x->_score > y->_score;
    }
    // Scores tie: fall back to the character value.
    return x->_character < y->_character;
}  //---------- End prefix_compare -------------------------

// --------------------------------------------------------------
// Adds word to fst without copying entire dictionary
// --------------------------------------------------------------
void add_word_to_fst(const std::vector<int>& word,
                     fst::StdVectorFst* dictionary) {
    if (dictionary->NumStates() == 0) {
        fst::StdVectorFst::StateId start = dictionary->AddState();
        assert(start == 0);
        dictionary->SetStart(start);
    }
    fst::StdVectorFst::StateId src = dictionary->Start();
    fst::StdVectorFst::StateId dst;
    for (auto c : word) {
        dst = dictionary->AddState();
        dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
        src = dst;
    }
    dictionary->SetFinal(dst, fst::StdArc::Weight::One());
}  // ------------ End of add_word_to_fst -----------------------

// ---------------------------------------------------------
// Adds a word to the dictionary FST based on char_map
//
//   word:       UTF-8 text, split into characters with UTF8_split
//   char_map:   maps a UTF-8 character to its integer label
//   add_space:  when true, SPACE is appended after the word
//   SPACE:      label used for " " (no char_map lookup is done
//               for the space character)
//   dictionary: FST that receives the word via add_word_to_fst
//
// Returns true if the word was added. Returns false, without
// calling add_word_to_fst, as soon as a character is not found
// in char_map.
// ---------------------------------------------------------
bool add_word_to_dictionary(const std::string& word,
                         const std::unordered_map<std::string, int>& char_map,
                         bool add_space,
                         int SPACE,
                         fst::StdVectorFst* dictionary) {
    std::vector<int> labels;

    for (const auto& utf8_char : UTF8_split(word)) {
        if (utf8_char == " ") {
            labels.push_back(SPACE);
            continue;
        }

        const auto entry = char_map.find(utf8_char);
        if (entry == char_map.end()) {
            return false;  // unknown character: give up without adding
        }
        labels.push_back(entry->second);
    }

    if (add_space) {
        labels.push_back(SPACE);
    }

    add_word_to_fst(labels, dictionary);
    return true;
}  // -------------- End of add_word_to_dictionary ---------
reorganize cpp files 7 years ago			`#include <limits>`
			`#include <algorithm>`
			`#include <cmath>`
			`#include "decoder_utils.h"`

Refactor scorer and move utility functions to decoder_util.h 7 years ago			`size_t get_utf8_str_len(const std::string& str) {`
			`size_t str_len = 0;`
			`for (char c : str) {`
			`str_len += ((c & 0xc0) != 0x80);`
			`}`
			`return str_len;`
			`}`
convert data structure for prefix from map to trie tree 7 years ago
enable finite-state transducer in beam search decoding 7 years ago			`//------------------------------------------------------`
			`//Splits string into vector of strings representing`
			`//UTF-8 characters (not same as chars)`
			`//------------------------------------------------------`
			`std::vector<std::string> UTF8_split(const std::string& str)`
			`{`
			`std::vector<std::string> result;`
			`std::string out_str;`

			`for (char c : str)`
			`{`
			`if ((c & 0xc0) != 0x80) //new UTF-8 character`
			`{`
			`if (!out_str.empty())`
			`{`
			`result.push_back(out_str);`
			`out_str.clear();`
			`}`
			`}`

			`out_str.append(1, c);`
			`}`
			`result.push_back(out_str);`
			`return result;`
			`}`

convert data structure for prefix from map to trie tree 7 years ago			`//-------------------------------------------------------`
			`// Overriding less than operator for sorting`
			`//-------------------------------------------------------`
			`bool prefix_compare(const PathTrie* x, const PathTrie* y) {`
			`if (x->_score == y->_score) {`
			`if (x->_character == y->_character) {`
			`return false;`
			`} else {`
			`return (x->_character < y->_character);`
			`}`
			`} else {`
			`return x->_score > y->_score;`
			`}`
			`} //---------- End path_compare ---------------------------`

			`// --------------------------------------------------------------`
			`// Adds word to fst without copying entire dictionary`
			`// --------------------------------------------------------------`
			`void add_word_to_fst(const std::vector<int>& word,`
			`fst::StdVectorFst* dictionary) {`
			`if (dictionary->NumStates() == 0) {`
			`fst::StdVectorFst::StateId start = dictionary->AddState();`
			`assert(start == 0);`
			`dictionary->SetStart(start);`
			`}`
			`fst::StdVectorFst::StateId src = dictionary->Start();`
			`fst::StdVectorFst::StateId dst;`
			`for (auto c : word) {`
			`dst = dictionary->AddState();`
			`dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));`
			`src = dst;`
			`}`
			`dictionary->SetFinal(dst, fst::StdArc::Weight::One());`
			`} // ------------ End of add_word_to_fst -----------------------`

			`// ---------------------------------------------------------`
			`// Adds a word to the dictionary FST based on char_map`
			`// ---------------------------------------------------------`
enable finite-state transducer in beam search decoding 7 years ago			`bool add_word_to_dictionary(const std::string& word,`
convert data structure for prefix from map to trie tree 7 years ago			`const std::unordered_map<std::string, int>& char_map,`
			`bool add_space,`
			`int SPACE,`
			`fst::StdVectorFst* dictionary) {`
			`auto characters = UTF8_split(word);`

			`std::vector<int> int_word;`

			`for (auto& c : characters) {`
			`if (c == " ") {`
			`int_word.push_back(SPACE);`
			`} else {`
			`auto int_c = char_map.find(c);`
			`if (int_c != char_map.end()) {`
			`int_word.push_back(int_c->second);`
			`} else {`
			`return false; // return without adding`
			`}`
			`}`
			`}`

			`if (add_space) {`
			`int_word.push_back(SPACE);`
			`}`

			`add_word_to_fst(int_word, dictionary);`
			`return true;`
			`} // -------------- End of addWordToDictionary ------------`