// Implementation of Vocab: loads a token table from a text file (one token per
// line) and converts sequences of token ids back into display text.
//
// NOTE(review): the copy of this file that was reviewed had lost everything
// written inside angle brackets (standard header names and template
// arguments). The header names and `vector<int>` below were restored from how
// the code uses them -- confirm they match Vocab.h.
#include "Vocab.h"

#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>

using namespace std;

/**
 * Decodes the first three bytes of `str` as one 3-byte UTF-8 sequence.
 *
 * @param str Text whose leading bytes are inspected.
 * @return The decoded code point, or 0 when `str` has fewer than three bytes
 *         or its first three bytes do not have the 1110xxxx 10xxxxxx 10xxxxxx
 *         layout.
 */
int str2int(string str)
{
    if (str.size() < 3) {
        return 0;
    }

    const unsigned char lead = static_cast<unsigned char>(str[0]);
    const unsigned char mid = static_cast<unsigned char>(str[1]);
    const unsigned char tail = static_cast<unsigned char>(str[2]);

    const bool well_formed = (lead & 0xf0) == 0xe0 && (mid & 0xc0) == 0x80 && (tail & 0xc0) == 0x80;
    if (!well_formed) {
        return 0;
    }
    return ((lead & 0x0f) << 12) | ((mid & 0x3f) << 6) | (tail & 0x3f);
}

namespace {

// Code point range accepted by the CJK check (inclusive on both ends).
const int kCjkFirst = 0x4E00; // 19968
const int kCjkLast = 0x9FFF;  // 40959

// Returns true when `id` can index a table that holds `count` entries.
bool is_valid_id(long long id, size_t count)
{
    return id >= 0 && static_cast<unsigned long long>(id) < count;
}

// Upper-cases `ch` when it is an ASCII lowercase letter and returns every
// other byte unchanged, so digits, capitals and UTF-8 lead bytes are never
// altered.
char ascii_upper(char ch)
{
    if (ch >= 'a' && ch <= 'z') {
        return static_cast<char>(ch - ('a' - 'A'));
    }
    return ch;
}

// Returns true when `text` is exactly one 3-byte UTF-8 character whose code
// point lies in [kCjkFirst, kCjkLast].
bool is_cjk_char(const string &text)
{
    if (text.size() != 3) {
        return false;
    }
    const int code_point = str2int(text);
    return code_point >= kCjkFirst && code_point <= kCjkLast;
}

// Returns true for tokens that contribute no text to the output.
// NOTE(review): the original condition compared against "~~" twice and then
// against the empty string. The duplicate suggests the three literals were
// once distinct marker tokens (possibly "<s>", "</s>" and "<unk>") whose
// angle-bracket text was lost. The literals are kept exactly as found; confirm
// against the vocabulary file.
bool is_skipped_token(const string &token)
{
    return token == "~~" || token.empty();
}

// Builds display text from a stream of vocabulary tokens: merges "@@"
// continuation pieces into whole words, inserts spaces between non-CJK words
// and upper-cases selected leading letters.
class TextAssembler
{
  public:
    // Appends `text` to the output verbatim, without touching the word state.
    void append_raw(const string &text) { output_ += text; }

    // Feeds one vocabulary token into the assembler.
    void add_token(string token)
    {
        if (is_skipped_token(token)) {
            return;
        }

        // A token containing "@@" is a continuation piece: its last two
        // characters are dropped and the remainder is buffered until a token
        // without "@@" completes the word. find() succeeded, so the token has
        // at least two characters and the erase cannot go out of range.
        if (token.find("@@") != string::npos) {
            pending_ += token.erase(token.length() - 2);
            merging_ = true;
            return;
        }
        if (merging_) {
            pending_ += token;
            token.swap(pending_);
            pending_.clear();
            merging_ = false;
        }

        // CJK characters are emitted as-is and reset the "previous word was
        // non-CJK" state.
        if (is_cjk_char(token)) {
            output_ += token;
            prev_is_english_ = false;
            return;
        }

        if (!prev_is_english_) {
            // First non-CJK word of a run: capitalise its first letter.
            token[0] = ascii_upper(token[0]);
        } else {
            // Single letters are upper-cased (abbreviations spelled letter by
            // letter).
            if (token.size() == 1) {
                token[0] = ascii_upper(token[0]);
            }
            // A space is inserted unless both neighbours are single letters,
            // so consecutive single letters stay joined.
            if (prev_english_len_ > 1 || token.size() > 1) {
                output_ += " ";
            }
        }
        output_ += token;
        prev_english_len_ = token.size();
        prev_is_english_ = true;
    }

    // Returns the text assembled so far. Text still buffered from "@@"
    // continuation pieces is not included: a sequence that ends on a
    // continuation piece loses that tail, as it did before this rewrite.
    const string &str() const { return output_; }

  private:
    string output_;
    string pending_;              // concatenated continuation pieces of the current word
    bool merging_ = false;        // true while pending_ holds an unfinished word
    bool prev_is_english_ = false; // true when the last emitted word was non-CJK
    size_t prev_english_len_ = 0; // byte length of the last emitted non-CJK word
};

} // namespace

/**
 * Loads the vocabulary from `filename`, one token per line; a line's index
 * becomes its token id.
 *
 * If the file cannot be opened (or `filename` is null) the vocabulary is left
 * empty and no error is reported.
 *
 * @param filename Path of the vocabulary file.
 */
Vocab::Vocab(const char *filename)
{
    if (filename == nullptr) {
        return;
    }

    ifstream in(filename);
    if (!in) {
        return; // no such file: keep the vocabulary empty
    }

    string line;
    // getline strips the trailing newline from each line.
    while (getline(in, line)) {
        vocab.push_back(line);
    }
}

Vocab::~Vocab()
{
}

/**
 * Concatenates the tokens for `in` with no separators or post-processing.
 *
 * @param in Token ids; ids outside [0, size()) are skipped.
 * @return The concatenated token text.
 */
string Vocab::vector2string(vector<int> in)
{
    string text;
    for (int id : in) {
        if (is_valid_id(id, vocab.size())) {
            text += vocab[id];
        }
    }
    return text;
}

/**
 * Reports whether `ch` is a single 3-byte UTF-8 character with a code point
 * in the range 19968..40959 (U+4E00..U+9FFF).
 *
 * @param ch Text to test.
 * @return true only when `ch` is exactly three bytes long and decodes into
 *         that range.
 */
bool Vocab::isChinese(string ch)
{
    return is_cjk_char(ch);
}

/**
 * Converts token ids to display text: skips marker tokens, merges "@@"
 * continuation pieces into words, separates non-CJK words with spaces and
 * upper-cases the first letter of a non-CJK run as well as single letters.
 *
 * @param in Token ids; ids outside [0, size()) are skipped.
 * @return The assembled text.
 */
string Vocab::vector2stringV2(vector<int> in)
{
    TextAssembler assembler;
    for (int id : in) {
        if (is_valid_id(id, vocab.size())) {
            assembler.add_token(vocab[id]);
        }
    }
    return assembler.str();
}

/**
 * Returns `in` (up to its first NUL byte) followed by the text obtained by
 * treating every byte of `in` as a token id and decoding those ids the same
 * way as vector2stringV2.
 *
 * NOTE(review): using the bytes of a string as token ids looks unintended;
 * the behaviour is preserved as found -- confirm with the callers.
 *
 * @param in Input text; bytes that are negative as `char` or not below size()
 *           are skipped as ids.
 * @return The input text followed by the decoded text.
 */
string Vocab::vector2stringV3(string in)
{
    TextAssembler assembler;
    // c_str() keeps the original truncation at the first embedded NUL byte.
    assembler.append_raw(in.c_str());
    for (char byte : in) {
        if (is_valid_id(byte, vocab.size())) {
            assembler.add_token(vocab[byte]);
        }
    }
    return assembler.str();
}

/**
 * @return The number of tokens loaded into the vocabulary.
 */
int Vocab::size()
{
    return static_cast<int>(vocab.size());
}