You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/deploy/decoder_utils.cpp

107 lines
3.2 KiB

#include <limits>
#include <algorithm>
#include <cmath>
#include "decoder_utils.h"
size_t get_utf8_str_len(const std::string& str) {
size_t str_len = 0;
for (char c : str) {
str_len += ((c & 0xc0) != 0x80);
}
return str_len;
}
//------------------------------------------------------
//Splits string into vector of strings representing
//UTF-8 characters (not same as chars)
//------------------------------------------------------
std::vector<std::string> UTF8_split(const std::string& str)
{
std::vector<std::string> result;
std::string out_str;
for (char c : str)
{
if ((c & 0xc0) != 0x80) //new UTF-8 character
{
if (!out_str.empty())
{
result.push_back(out_str);
out_str.clear();
}
}
out_str.append(1, c);
}
result.push_back(out_str);
return result;
}
//-------------------------------------------------------
// Overriding less than operator for sorting
//-------------------------------------------------------
bool prefix_compare(const PathTrie* x, const PathTrie* y) {
if (x->_score == y->_score) {
if (x->_character == y->_character) {
return false;
} else {
return (x->_character < y->_character);
}
} else {
return x->_score > y->_score;
}
} //---------- End path_compare ---------------------------
// --------------------------------------------------------------
// Adds word to fst without copying entire dictionary
// --------------------------------------------------------------
void add_word_to_fst(const std::vector<int>& word,
fst::StdVectorFst* dictionary) {
if (dictionary->NumStates() == 0) {
fst::StdVectorFst::StateId start = dictionary->AddState();
assert(start == 0);
dictionary->SetStart(start);
}
fst::StdVectorFst::StateId src = dictionary->Start();
fst::StdVectorFst::StateId dst;
for (auto c : word) {
dst = dictionary->AddState();
dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
src = dst;
}
dictionary->SetFinal(dst, fst::StdArc::Weight::One());
} // ------------ End of add_word_to_fst -----------------------
// ---------------------------------------------------------
// Adds a word to the dictionary FST based on char_map
// ---------------------------------------------------------
bool add_word_to_dictionary(const std::string& word,
const std::unordered_map<std::string, int>& char_map,
bool add_space,
int SPACE,
fst::StdVectorFst* dictionary) {
auto characters = UTF8_split(word);
std::vector<int> int_word;
for (auto& c : characters) {
if (c == " ") {
int_word.push_back(SPACE);
} else {
auto int_c = char_map.find(c);
if (int_c != char_map.end()) {
int_word.push_back(int_c->second);
} else {
return false; // return without adding
}
}
}
if (add_space) {
int_word.push_back(SPACE);
}
add_word_to_fst(int_word, dictionary);
return true;
} // -------------- End of addWordToDictionary ------------