pull/3030/head
TianYuan 3 years ago
parent c5417c32bf
commit b4f56ef67f

@ -60,7 +60,7 @@ int main(int argc, char** argv) {
for (int i = 0; i < sentence_part.size(); i++) {
LOG(INFO) << "Raw sentence is: "
<< ppspeech::wstring2utf8string(sentence_part[i]);
front_inst->SentenceNormalize(sentence_part[i]);
front_inst->SentenceNormalize(&sentence_part[i]);
s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
LOG(INFO) << "After normalization sentence is: " << s_sentence;

@ -199,14 +199,14 @@ std::string TextNormalizer::Digits2Text(const std::wstring &num) {
}
// 日期2021年8月18日 --> 二零二一年八月十八日
int TextNormalizer::ReData(std::wstring &sentence) {
int TextNormalizer::ReData(std::wstring *sentence) {
std::wregex reg(
L"(\\d{4}|\\d{2})年((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)"
L"([日号]))?");
std::wsmatch match;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "";
rep += SingleDigit2Text(match[1]) + "";
if (match[3] != L"") {
@ -217,7 +217,7 @@ int TextNormalizer::ReData(std::wstring &sentence) {
wstring2utf8string(match[9]);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -228,18 +228,18 @@ int TextNormalizer::ReData(std::wstring &sentence) {
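// Numeric dates written as YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD or YYYY MM DD (four-digit year, two-digit month and day, the same separator used twice), e.g. 2021/08/18 --> 二零二一年八月十八日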
int TextNormalizer::ReData2(std::wstring &sentence) {
int TextNormalizer::ReData2(std::wstring *sentence) {
std::wregex reg(
L"(\\d{4})([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])");
std::wsmatch match;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "";
rep += (SingleDigit2Text(match[1]) + "");
rep += (MultiDigit2Text(match[3], false, false) + "");
rep += (MultiDigit2Text(match[4], false, false) + "");
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -249,12 +249,12 @@ int TextNormalizer::ReData2(std::wstring &sentence) {
}
// XX:XX:XX 09:09:02 --> 九点零九分零二秒
int TextNormalizer::ReTime(std::wstring &sentence) {
int TextNormalizer::ReTime(std::wstring *sentence) {
std::wregex reg(L"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?");
std::wsmatch match;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "";
rep += (MultiDigit2Text(match[1], false, false) + "");
if (absl::StartsWith(wstring2utf8string(match[2]), "0")) {
@ -266,7 +266,7 @@ int TextNormalizer::ReTime(std::wstring &sentence) {
}
rep += (MultiDigit2Text(match[4]) + "");
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -276,7 +276,7 @@ int TextNormalizer::ReTime(std::wstring &sentence) {
}
// 温度,例如:-24.3℃ --> 零下二十四点三度
int TextNormalizer::ReTemperature(std::wstring &sentence) {
int TextNormalizer::ReTemperature(std::wstring *sentence) {
std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)(°C|℃|度|摄氏度)");
std::wsmatch match;
std::string rep;
@ -284,12 +284,12 @@ int TextNormalizer::ReTemperature(std::wstring &sentence) {
std::vector<std::string> integer_decimal;
std::string unit;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
match[4] == L"摄氏度" ? unit = "摄氏度" : unit = "";
rep = sign + Digits2Text(match[2]) + unit;
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -299,16 +299,16 @@ int TextNormalizer::ReTemperature(std::wstring &sentence) {
}
// 分数,例如: 1/3 --> 三分之一
int TextNormalizer::ReFrac(std::wstring &sentence) {
int TextNormalizer::ReFrac(std::wstring *sentence) {
std::wregex reg(L"(-?)(\\d+)/(\\d+)");
std::wsmatch match;
std::string sign;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
rep = sign + MultiDigit2Text(match[3]) + "分之" +
MultiDigit2Text(match[2]);
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -318,18 +318,18 @@ int TextNormalizer::ReFrac(std::wstring &sentence) {
}
// 百分数例如45.5% --> 百分之四十五点五
int TextNormalizer::RePercentage(std::wstring &sentence) {
int TextNormalizer::RePercentage(std::wstring *sentence) {
std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)%");
std::wsmatch match;
std::string sign;
std::string rep;
std::vector<std::string> integer_decimal;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
rep = sign + "百分之" + Digits2Text(match[2]);
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -339,21 +339,21 @@ int TextNormalizer::RePercentage(std::wstring &sentence) {
}
// 手机号码,例如:+86 18883862235 --> 八六幺八八八三八六二二三五
int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
int TextNormalizer::ReMobilePhone(std::wstring *sentence) {
std::wregex reg(
L"(\\d)?((\\+?86 ?)?1([38]\\d|5[0-35-9]|7[678]|9[89])\\d{8})(\\d)?");
std::wsmatch match;
std::string rep;
std::vector<std::string> country_phonenum;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
country_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "+");
rep = "";
for (int i = 0; i < country_phonenum.size(); i++) {
LOG(INFO) << country_phonenum[i];
rep += SingleDigit2Text(country_phonenum[i], true);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -363,20 +363,20 @@ int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
}
// 座机号码例如010-51093154 --> 零幺零五幺零九三幺五四
int TextNormalizer::RePhone(std::wstring &sentence) {
int TextNormalizer::RePhone(std::wstring *sentence) {
std::wregex reg(
L"(\\d)?((0(10|2[1-3]|[3-9]\\d{2})-?)?[1-9]\\d{6,7})(\\d)?");
std::wsmatch match;
std::vector<std::string> zone_phonenum;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "";
zone_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "-");
for (int i = 0; i < zone_phonenum.size(); i++) {
rep += SingleDigit2Text(zone_phonenum[i], true);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -386,7 +386,7 @@ int TextNormalizer::RePhone(std::wstring &sentence) {
}
// 范围例如60~90 --> 六十到九十
int TextNormalizer::ReRange(std::wstring &sentence) {
int TextNormalizer::ReRange(std::wstring *sentence) {
std::wregex reg(
L"((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))[-~]((-?)((\\d+)(\\.\\d+)?)|(\\.("
L"\\d+)))");
@ -395,7 +395,7 @@ int TextNormalizer::ReRange(std::wstring &sentence) {
std::string sign1;
std::string sign2;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "";
match[2] == L"-" ? sign1 = "" : sign1 = "";
if (match[6] != L"") {
@ -410,7 +410,7 @@ int TextNormalizer::ReRange(std::wstring &sentence) {
rep += sign2 + Digits2Text(match[10]);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -420,13 +420,13 @@ int TextNormalizer::ReRange(std::wstring &sentence) {
}
// 带负号的整数,例如:-10 --> 负十
int TextNormalizer::ReInterger(std::wstring &sentence) {
int TextNormalizer::ReInterger(std::wstring *sentence) {
std::wregex reg(L"(-)(\\d+)");
std::wsmatch match;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = "" + MultiDigit2Text(match[2]);
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -436,13 +436,13 @@ int TextNormalizer::ReInterger(std::wstring &sentence) {
}
// 纯小数
int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
int TextNormalizer::ReDecimalNum(std::wstring *sentence) {
std::wregex reg(L"(-?)((\\d+)(\\.\\d+))|(\\.(\\d+))");
std::wsmatch match;
std::string sign;
std::string rep;
// std::vector<std::string> integer_decimal;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
if (match[5] != L"") {
rep = sign + Digits2Text(match[5]);
@ -450,7 +450,7 @@ int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
rep = sign + Digits2Text(match[2]);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -460,7 +460,7 @@ int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
}
// 正整数 + 量词
int TextNormalizer::RePositiveQuantifiers(std::wstring &sentence) {
int TextNormalizer::RePositiveQuantifiers(std::wstring *sentence) {
std::wstring common_quantifiers =
L"(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|"
L"担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|"
@ -475,9 +475,9 @@ int TextNormalizer::RePositiveQuantifiers(std::wstring &sentence) {
std::wregex reg(L"(\\d+)([多余几])?" + common_quantifiers);
std::wsmatch match;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
rep = MultiDigit2Text(match[1]);
Replace(&sentence,
Replace(sentence,
match.position(1),
match.length(1),
utf8string2wstring(rep));
@ -487,11 +487,11 @@ int TextNormalizer::RePositiveQuantifiers(std::wstring &sentence) {
}
// 编号类数字,例如: 89757 --> 八九七五七
int TextNormalizer::ReDefalutNum(std::wstring &sentence) {
int TextNormalizer::ReDefalutNum(std::wstring *sentence) {
std::wregex reg(L"\\d{3}\\d*");
std::wsmatch match;
while (std::regex_search(sentence, match, reg)) {
Replace(&sentence,
while (std::regex_search(*sentence, match, reg)) {
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(SingleDigit2Text(match[0])));
@ -500,12 +500,12 @@ int TextNormalizer::ReDefalutNum(std::wstring &sentence) {
return 0;
}
int TextNormalizer::ReNumber(std::wstring &sentence) {
int TextNormalizer::ReNumber(std::wstring *sentence) {
std::wregex reg(L"(-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+))");
std::wsmatch match;
std::string sign;
std::string rep;
while (std::regex_search(sentence, match, reg)) {
while (std::regex_search(*sentence, match, reg)) {
match[1] == L"-" ? sign = "" : sign = "";
if (match[5] != L"") {
rep = sign + Digits2Text(match[5]);
@ -513,7 +513,7 @@ int TextNormalizer::ReNumber(std::wstring &sentence) {
rep = sign + Digits2Text(match[2]);
}
Replace(&sentence,
Replace(sentence,
match.position(0),
match.length(0),
utf8string2wstring(rep));
@ -522,7 +522,7 @@ int TextNormalizer::ReNumber(std::wstring &sentence) {
}
// 整体正则,按顺序
int TextNormalizer::SentenceNormalize(std::wstring &sentence) {
int TextNormalizer::SentenceNormalize(std::wstring *sentence) {
ReData(sentence);
ReData2(sentence);
ReTime(sentence);

@ -51,21 +51,21 @@ class TextNormalizer {
std::string Digits2Text(const std::string &num_str);
std::string Digits2Text(const std::wstring &num);
int ReData(std::wstring &sentence);
int ReData2(std::wstring &sentence);
int ReTime(std::wstring &sentence);
int ReTemperature(std::wstring &sentence);
int ReFrac(std::wstring &sentence);
int RePercentage(std::wstring &sentence);
int ReMobilePhone(std::wstring &sentence);
int RePhone(std::wstring &sentence);
int ReRange(std::wstring &sentence);
int ReInterger(std::wstring &sentence);
int ReDecimalNum(std::wstring &sentence);
int RePositiveQuantifiers(std::wstring &sentence);
int ReDefalutNum(std::wstring &sentence);
int ReNumber(std::wstring &sentence);
int SentenceNormalize(std::wstring &sentence);
int ReData(std::wstring *sentence);
int ReData2(std::wstring *sentence);
int ReTime(std::wstring *sentence);
int ReTemperature(std::wstring *sentence);
int ReFrac(std::wstring *sentence);
int RePercentage(std::wstring *sentence);
int ReMobilePhone(std::wstring *sentence);
int RePhone(std::wstring *sentence);
int ReRange(std::wstring *sentence);
int ReInterger(std::wstring *sentence);
int ReDecimalNum(std::wstring *sentence);
int RePositiveQuantifiers(std::wstring *sentence);
int ReDefalutNum(std::wstring *sentence);
int ReNumber(std::wstring *sentence);
int SentenceNormalize(std::wstring *sentence);
private:

Loading…
Cancel
Save