|
|
@ -1,3 +1,16 @@
|
|
|
|
|
|
|
|
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
|
|
|
|
// limitations under the License.
|
|
|
|
#include "front/front_interface.h"
|
|
|
|
#include "front/front_interface.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace ppspeech {
|
|
|
|
namespace ppspeech {
|
|
|
@ -11,66 +24,93 @@ int FrontEngineInterface::init() {
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
_jieba = new cppjieba::Jieba(_jieba_dict_path, _jieba_hmm_path, _jieba_user_dict_path,
|
|
|
|
_jieba = new cppjieba::Jieba(_jieba_dict_path,
|
|
|
|
_jieba_idf_path, _jieba_stop_word_path);
|
|
|
|
_jieba_hmm_path,
|
|
|
|
|
|
|
|
_jieba_user_dict_path,
|
|
|
|
_punc = {",", "。", "、", "?", ":", ";", "~", "!",
|
|
|
|
_jieba_idf_path,
|
|
|
|
",", ".", "?", "!", ":", ";", "/", "\\"};
|
|
|
|
_jieba_stop_word_path);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_punc = {",",
|
|
|
|
|
|
|
|
"。",
|
|
|
|
|
|
|
|
"、",
|
|
|
|
|
|
|
|
"?",
|
|
|
|
|
|
|
|
":",
|
|
|
|
|
|
|
|
";",
|
|
|
|
|
|
|
|
"~",
|
|
|
|
|
|
|
|
"!",
|
|
|
|
|
|
|
|
",",
|
|
|
|
|
|
|
|
".",
|
|
|
|
|
|
|
|
"?",
|
|
|
|
|
|
|
|
"!",
|
|
|
|
|
|
|
|
":",
|
|
|
|
|
|
|
|
";",
|
|
|
|
|
|
|
|
"/",
|
|
|
|
|
|
|
|
"\\"};
|
|
|
|
_punc_omit = {"“", "”", "\"", "\""};
|
|
|
|
_punc_omit = {"“", "”", "\"", "\""};
|
|
|
|
|
|
|
|
|
|
|
|
// 需要儿化音处理的词语
|
|
|
|
// 需要儿化音处理的词语
|
|
|
|
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"};
|
|
|
|
must_erhua = {
|
|
|
|
not_erhua = {
|
|
|
|
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"};
|
|
|
|
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
|
|
|
|
not_erhua = {"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿",
|
|
|
|
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
|
|
|
|
"有儿", "一儿", "我儿", "俺儿", "妻儿", "拐儿",
|
|
|
|
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
|
|
|
|
"聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿",
|
|
|
|
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
|
|
|
|
"婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿",
|
|
|
|
"狗儿"
|
|
|
|
"蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿",
|
|
|
|
};
|
|
|
|
"侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿",
|
|
|
|
|
|
|
|
"花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
|
|
|
|
must_not_neural_tone_words = {"男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"};
|
|
|
|
"狗儿"};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
must_not_neural_tone_words = {
|
|
|
|
|
|
|
|
"男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"};
|
|
|
|
// 需要轻声处理的词语
|
|
|
|
// 需要轻声处理的词语
|
|
|
|
must_neural_tone_words = {
|
|
|
|
must_neural_tone_words = {
|
|
|
|
"麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", "风筝",
|
|
|
|
"麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头",
|
|
|
|
"难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", "钥匙", "里脊",
|
|
|
|
"馄饨", "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖",
|
|
|
|
"里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", "这个", "运气", "过去",
|
|
|
|
"铃铛", "铁匠", "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化",
|
|
|
|
"软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号",
|
|
|
|
"迷糊", "连累", "这么", "这个", "运气", "过去", "软和", "转悠", "踏实",
|
|
|
|
"认识", "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", "行当",
|
|
|
|
"跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号", "认识",
|
|
|
|
"蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻",
|
|
|
|
"规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李",
|
|
|
|
"舒服", "舒坦", "舌头", "自在", "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂",
|
|
|
|
"行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条",
|
|
|
|
"胡萝", "胡琴", "胡同", "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆",
|
|
|
|
"苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", "膏药", "脾气",
|
|
|
|
"老头", "老太", "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂",
|
|
|
|
"脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", "聪明",
|
|
|
|
"精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
|
|
|
|
"耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
|
|
|
|
"窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", "砚台",
|
|
|
|
"翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂",
|
|
|
|
"码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算",
|
|
|
|
"精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语",
|
|
|
|
"白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨",
|
|
|
|
"笑话", "窟窿", "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气",
|
|
|
|
"琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", "爽快",
|
|
|
|
"秀才", "福气", "祖宗", "砚台", "码头", "石榴", "石头", "石匠", "知识",
|
|
|
|
"爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", "火候", "漂亮", "滑溜",
|
|
|
|
"眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算", "白净", "痢疾", "痛快",
|
|
|
|
"溜达", "温和", "清楚", "消息", "浪头", "活泼", "比方", "正经", "欺负", "模糊", "槟榔",
|
|
|
|
"疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨", "琉璃",
|
|
|
|
"棺材", "棒槌", "棉花", "核桃", "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事",
|
|
|
|
"玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼",
|
|
|
|
"木头", "木匠", "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾",
|
|
|
|
"爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼",
|
|
|
|
"收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼",
|
|
|
|
"火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", "活泼",
|
|
|
|
"抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
|
|
|
|
"比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
|
|
|
|
"扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", "念头",
|
|
|
|
"栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠",
|
|
|
|
"念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼",
|
|
|
|
"朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾",
|
|
|
|
"干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", "岁数",
|
|
|
|
"收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄",
|
|
|
|
"屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", "寡妇", "家伙", "客气",
|
|
|
|
"招牌", "招呼", "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点",
|
|
|
|
"实在", "官司", "学问", "学生", "字号", "嫁妆", "媳妇", "媒人", "婆家", "娘家", "委屈",
|
|
|
|
"打扮", "打听", "打发", "扎实", "扁担", "戒指", "懒得", "意识", "意思",
|
|
|
|
"姑娘", "姐夫", "妯娌", "妥当", "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方",
|
|
|
|
"情形", "悟性", "怪物", "思量", "怎么", "念头", "念叨", "快活", "忙活",
|
|
|
|
"大意", "大夫", "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴",
|
|
|
|
"志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼", "干事",
|
|
|
|
"嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦",
|
|
|
|
"帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫",
|
|
|
|
"咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝",
|
|
|
|
"岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付",
|
|
|
|
"叫唤", "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
|
|
|
|
"寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
|
|
|
|
"功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息",
|
|
|
|
"媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当",
|
|
|
|
"凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", "使唤",
|
|
|
|
"妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫",
|
|
|
|
"佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", "人家", "亲戚", "亲家",
|
|
|
|
"多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴",
|
|
|
|
"交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", "两口", "东西", "东家", "世故",
|
|
|
|
"嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴",
|
|
|
|
"不由", "不在", "下水", "下巴", "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨",
|
|
|
|
"哈欠", "哆嗦", "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头",
|
|
|
|
"父亲", "母亲", "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅",
|
|
|
|
"名字", "名堂", "合同", "吆喝", "叫唤", "口袋", "厚道", "厉害", "千斤",
|
|
|
|
"幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱",
|
|
|
|
"包袱", "包涵", "匀称", "勤快", "动静", "动弹", "功夫", "力气", "前头",
|
|
|
|
"凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱",
|
|
|
|
"刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息", "凑合",
|
|
|
|
"扫把", "惦记"
|
|
|
|
"凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜",
|
|
|
|
};
|
|
|
|
"使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
|
|
|
|
|
|
|
|
"人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头",
|
|
|
|
|
|
|
|
"丧气", "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴",
|
|
|
|
|
|
|
|
"上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲",
|
|
|
|
|
|
|
|
"咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅",
|
|
|
|
|
|
|
|
"幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他",
|
|
|
|
|
|
|
|
"弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑",
|
|
|
|
|
|
|
|
"盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记"};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 生成词典(词到音素的映射)
|
|
|
|
// 生成词典(词到音素的映射)
|
|
|
@ -137,7 +177,8 @@ int FrontEngineInterface::ReadConfFile() {
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, std::wstring &sentence_simp) {
|
|
|
|
int FrontEngineInterface::Trand2Simp(const std::wstring &sentence,
|
|
|
|
|
|
|
|
std::wstring &sentence_simp) {
|
|
|
|
// sentence_simp = sentence;
|
|
|
|
// sentence_simp = sentence;
|
|
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
|
|
std::wstring temp(1, sentence[i]);
|
|
|
|
std::wstring temp(1, sentence[i]);
|
|
|
@ -146,14 +187,16 @@ int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, std::wstring
|
|
|
|
if (trand_simp_map.find(sigle_word) == trand_simp_map.end()) {
|
|
|
|
if (trand_simp_map.find(sigle_word) == trand_simp_map.end()) {
|
|
|
|
sentence_simp += temp;
|
|
|
|
sentence_simp += temp;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
sentence_simp += (ppspeech::utf8string2wstring(trand_simp_map[sigle_word]));
|
|
|
|
sentence_simp +=
|
|
|
|
|
|
|
|
(ppspeech::utf8string2wstring(trand_simp_map[sigle_word]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::GenDict(const std::string &dict_file, std::map<std::string, std::string> &map) {
|
|
|
|
int FrontEngineInterface::GenDict(const std::string &dict_file,
|
|
|
|
|
|
|
|
std::map<std::string, std::string> &map) {
|
|
|
|
std::ifstream is(dict_file.c_str(), std::ifstream::in);
|
|
|
|
std::ifstream is(dict_file.c_str(), std::ifstream::in);
|
|
|
|
if (!is.good()) {
|
|
|
|
if (!is.good()) {
|
|
|
|
LOG(ERROR) << "Cannot open dict file: " << dict_file;
|
|
|
|
LOG(ERROR) << "Cannot open dict file: " << dict_file;
|
|
|
@ -169,7 +212,8 @@ int FrontEngineInterface::GenDict(const std::string &dict_file, std::map<std::st
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::GetSegResult(std::vector<std::pair<std::string, std::string>> &seg,
|
|
|
|
int FrontEngineInterface::GetSegResult(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg,
|
|
|
|
std::vector<std::string> &seg_words) {
|
|
|
|
std::vector<std::string> &seg_words) {
|
|
|
|
std::vector<std::pair<std::string, std::string>>::iterator iter;
|
|
|
|
std::vector<std::pair<std::string, std::string>>::iterator iter;
|
|
|
|
for (iter = seg.begin(); iter != seg.end(); iter++) {
|
|
|
|
for (iter = seg.begin(); iter != seg.end(); iter++) {
|
|
|
@ -178,8 +222,11 @@ int FrontEngineInterface::GetSegResult(std::vector<std::pair<std::string, std::s
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::GetSentenceIds(const std::string &sentence, std::vector<int> &phoneids, std::vector<int> &toneids) {
|
|
|
|
int FrontEngineInterface::GetSentenceIds(const std::string &sentence,
|
|
|
|
std::vector<std::pair<std::string, std::string>> cut_result; //分词结果包含词和词性
|
|
|
|
std::vector<int> &phoneids,
|
|
|
|
|
|
|
|
std::vector<int> &toneids) {
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>>
|
|
|
|
|
|
|
|
cut_result; //分词结果包含词和词性
|
|
|
|
if (0 != Cut(sentence, cut_result)) {
|
|
|
|
if (0 != Cut(sentence, cut_result)) {
|
|
|
|
LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed";
|
|
|
|
LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed";
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
@ -192,7 +239,9 @@ int FrontEngineInterface::GetSentenceIds(const std::string &sentence, std::vecto
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, std::string>> &cut_result, std::vector<int> &phoneids,
|
|
|
|
int FrontEngineInterface::GetWordsIds(
|
|
|
|
|
|
|
|
const std::vector<std::pair<std::string, std::string>> &cut_result,
|
|
|
|
|
|
|
|
std::vector<int> &phoneids,
|
|
|
|
std::vector<int> &toneids) {
|
|
|
|
std::vector<int> &toneids) {
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -202,15 +251,19 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
|
|
|
|
for (int i = 0; i < cut_result.size(); i++) {
|
|
|
|
for (int i = 0; i < cut_result.size(); i++) {
|
|
|
|
word = cut_result[i].first;
|
|
|
|
word = cut_result[i].first;
|
|
|
|
pos = cut_result[i].second;
|
|
|
|
pos = cut_result[i].second;
|
|
|
|
if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) { // 非可忽略的标点
|
|
|
|
if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
|
|
|
|
|
|
|
|
_punc_omit.end()) { // 非可忽略的标点
|
|
|
|
word_initials = {};
|
|
|
|
word_initials = {};
|
|
|
|
word_finals = {};
|
|
|
|
word_finals = {};
|
|
|
|
phone = "";
|
|
|
|
phone = "";
|
|
|
|
// 判断是否在标点符号集合中
|
|
|
|
// 判断是否在标点符号集合中
|
|
|
|
if (std::find(_punc.begin(), _punc.end(), word) == _punc.end()) { // 文字
|
|
|
|
if (std::find(_punc.begin(), _punc.end(), word) ==
|
|
|
|
|
|
|
|
_punc.end()) { // 文字
|
|
|
|
// 获取字词的声母韵母列表
|
|
|
|
// 获取字词的声母韵母列表
|
|
|
|
if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
|
|
|
|
if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
|
|
|
|
LOG(ERROR) << "Genarate the word_initials and word_finals of " << word << " failed";
|
|
|
|
LOG(ERROR)
|
|
|
|
|
|
|
|
<< "Genarate the word_initials and word_finals of "
|
|
|
|
|
|
|
|
<< word << " failed";
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -220,7 +273,8 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 对儿化音进行修改
|
|
|
|
// 对儿化音进行修改
|
|
|
|
std::vector<std::vector<std::string>> new_initals_finals = MergeErhua(word_initials, word_finals, word, pos);
|
|
|
|
std::vector<std::vector<std::string>> new_initals_finals =
|
|
|
|
|
|
|
|
MergeErhua(word_initials, word_finals, word, pos);
|
|
|
|
word_initials = new_initals_finals[0];
|
|
|
|
word_initials = new_initals_finals[0];
|
|
|
|
word_finals = new_initals_finals[1];
|
|
|
|
word_finals = new_initals_finals[1];
|
|
|
|
|
|
|
|
|
|
|
@ -256,10 +310,11 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::Cut(const std::string &sentence, std::vector<std::pair<std::string, std::string>> &cut_result) {
|
|
|
|
int FrontEngineInterface::Cut(
|
|
|
|
|
|
|
|
const std::string &sentence,
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &cut_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> cut_result_jieba;
|
|
|
|
std::vector<std::pair<std::string, std::string>> cut_result_jieba;
|
|
|
|
|
|
|
|
|
|
|
|
// 结巴分词
|
|
|
|
// 结巴分词
|
|
|
@ -274,7 +329,8 @@ int FrontEngineInterface::Cut(const std::string &sentence, std::vector<std::pair
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::GetPhone(const std::string &word, std::string &phone) {
|
|
|
|
int FrontEngineInterface::GetPhone(const std::string &word,
|
|
|
|
|
|
|
|
std::string &phone) {
|
|
|
|
// 判断 word 在不在 词典里,如果不在,进行CutAll分词
|
|
|
|
// 判断 word 在不在 词典里,如果不在,进行CutAll分词
|
|
|
|
if (word_phone_map.find(word) == word_phone_map.end()) {
|
|
|
|
if (word_phone_map.find(word) == word_phone_map.end()) {
|
|
|
|
std::vector<std::string> wordcut;
|
|
|
|
std::vector<std::string> wordcut;
|
|
|
@ -290,26 +346,33 @@ int FrontEngineInterface::GetPhone(const std::string &word, std::string &phone)
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::Phone2Phoneid(const std::string &phone, std::vector<int> &phoneid, std::vector<int> &toneid) {
|
|
|
|
int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
|
|
|
|
|
|
|
|
std::vector<int> &phoneid,
|
|
|
|
|
|
|
|
std::vector<int> &toneid) {
|
|
|
|
std::vector<std::string> phone_vec;
|
|
|
|
std::vector<std::string> phone_vec;
|
|
|
|
phone_vec = absl::StrSplit(phone, " ");
|
|
|
|
phone_vec = absl::StrSplit(phone, " ");
|
|
|
|
std::string temp_phone;
|
|
|
|
std::string temp_phone;
|
|
|
|
for (int i = 0; i < phone_vec.size(); i++) {
|
|
|
|
for (int i = 0; i < phone_vec.size(); i++) {
|
|
|
|
temp_phone = phone_vec[i];
|
|
|
|
temp_phone = phone_vec[i];
|
|
|
|
if (_seperate_tone == "true") {
|
|
|
|
if (_seperate_tone == "true") {
|
|
|
|
phoneid.push_back(atoi((phone_id_map[temp_phone.substr(0, temp_phone.length()-1)]).c_str()));
|
|
|
|
phoneid.push_back(atoi(
|
|
|
|
toneid.push_back(atoi((tone_id_map[temp_phone.substr(temp_phone.length()-1, temp_phone.length())]).c_str()));
|
|
|
|
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
|
|
|
|
|
|
|
|
.c_str()));
|
|
|
|
|
|
|
|
toneid.push_back(
|
|
|
|
|
|
|
|
atoi((tone_id_map[temp_phone.substr(temp_phone.length() - 1,
|
|
|
|
|
|
|
|
temp_phone.length())])
|
|
|
|
|
|
|
|
.c_str()));
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
phoneid.push_back(atoi((phone_id_map[temp_phone]).c_str()));
|
|
|
|
phoneid.push_back(atoi((phone_id_map[temp_phone]).c_str()));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声
|
|
|
|
// 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声
|
|
|
|
bool FrontEngineInterface::AllToneThree(const std::vector<std::string> &finals) {
|
|
|
|
bool FrontEngineInterface::AllToneThree(
|
|
|
|
|
|
|
|
const std::vector<std::string> &finals) {
|
|
|
|
bool flags = true;
|
|
|
|
bool flags = true;
|
|
|
|
for (int i = 0; i < finals.size(); i++) {
|
|
|
|
for (int i = 0; i < finals.size(); i++) {
|
|
|
|
if ((int)finals[i].back() != 51) { //如果读音不为第三声
|
|
|
|
if ((int)finals[i].back() != 51) { //如果读音不为第三声
|
|
|
@ -317,7 +380,6 @@ bool FrontEngineInterface::AllToneThree(const std::vector<std::string> &finals)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return flags;
|
|
|
|
return flags;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 判断词是否是叠词
|
|
|
|
// 判断词是否是叠词
|
|
|
@ -329,11 +391,14 @@ bool FrontEngineInterface::IsReduplication(const std::string &word) {
|
|
|
|
flags = true;
|
|
|
|
flags = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return flags;
|
|
|
|
return flags;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取每个字词的声母和韵母列表, word_initials 为声母列表,word_finals 为韵母列表
|
|
|
|
// 获取每个字词的声母和韵母列表, word_initials 为声母列表,word_finals
|
|
|
|
int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector<std::string> &word_initials, std::vector<std::string> &word_finals) {
|
|
|
|
// 为韵母列表
|
|
|
|
|
|
|
|
int FrontEngineInterface::GetInitialsFinals(
|
|
|
|
|
|
|
|
const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &word_initials,
|
|
|
|
|
|
|
|
std::vector<std::string> &word_finals) {
|
|
|
|
std::string phone;
|
|
|
|
std::string phone;
|
|
|
|
GetPhone(word, phone); //获取字词对应的音素
|
|
|
|
GetPhone(word, phone); //获取字词对应的音素
|
|
|
|
std::vector<std::string> phone_vec = absl::StrSplit(phone, " ");
|
|
|
|
std::vector<std::string> phone_vec = absl::StrSplit(phone, " ");
|
|
|
@ -344,7 +409,8 @@ int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector
|
|
|
|
start += 1;
|
|
|
|
start += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// 最后一位不是数字或者最后一位的数字是0,均表示声母,第二个是韵母
|
|
|
|
// 最后一位不是数字或者最后一位的数字是0,均表示声母,第二个是韵母
|
|
|
|
else if(isdigit(phone_vec[start].back()) == 0 || (int)phone_vec[start].back() == 48) {
|
|
|
|
else if (isdigit(phone_vec[start].back()) == 0 ||
|
|
|
|
|
|
|
|
(int)phone_vec[start].back() == 48) {
|
|
|
|
word_initials.push_back(phone_vec[start]);
|
|
|
|
word_initials.push_back(phone_vec[start]);
|
|
|
|
word_finals.push_back(phone_vec[start + 1]);
|
|
|
|
word_finals.push_back(phone_vec[start + 1]);
|
|
|
|
start += 2;
|
|
|
|
start += 2;
|
|
|
@ -355,13 +421,15 @@ int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert(word_finals.size() == ppspeech::utf8string2wstring(word).length() && word_finals.size() == word_initials.size());
|
|
|
|
assert(word_finals.size() == ppspeech::utf8string2wstring(word).length() &&
|
|
|
|
|
|
|
|
word_finals.size() == word_initials.size());
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取每个字词的韵母列表
|
|
|
|
// 获取每个字词的韵母列表
|
|
|
|
int FrontEngineInterface::GetFinals(const std::string &word, std::vector<std::string> &word_finals) {
|
|
|
|
int FrontEngineInterface::GetFinals(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &word_finals) {
|
|
|
|
std::vector<std::string> word_initials;
|
|
|
|
std::vector<std::string> word_initials;
|
|
|
|
if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
|
|
|
|
if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
|
|
|
|
LOG(ERROR) << "Failed to get word finals";
|
|
|
|
LOG(ERROR) << "Failed to get word finals";
|
|
|
@ -371,23 +439,26 @@ int FrontEngineInterface::GetFinals(const std::string &word, std::vector<std::st
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::Word2WordVec(const std::string &word, std::vector<std::wstring> &wordvec) {
|
|
|
|
int FrontEngineInterface::Word2WordVec(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::wstring> &wordvec) {
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
for (int i = 0; i < word_wstr.length(); i++) {
|
|
|
|
for (int i = 0; i < word_wstr.length(); i++) {
|
|
|
|
std::wstring word_sigle(1, word_wstr[i]);
|
|
|
|
std::wstring word_sigle(1, word_wstr[i]);
|
|
|
|
wordvec.push_back(word_sigle);
|
|
|
|
wordvec.push_back(word_sigle);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// yuantian01解释:把一个词再进行分词找到。例子:小雨伞 --> 小 雨伞 或者 小雨 伞
|
|
|
|
// yuantian01解释:把一个词再进行分词找到。例子:小雨伞 --> 小 雨伞 或者 小雨 伞
|
|
|
|
int FrontEngineInterface::SplitWord(const std::string &word, std::vector<std::string> &new_word_vec) {
|
|
|
|
int FrontEngineInterface::SplitWord(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &new_word_vec) {
|
|
|
|
std::vector<std::string> word_vec;
|
|
|
|
std::vector<std::string> word_vec;
|
|
|
|
std::string second_subword;
|
|
|
|
std::string second_subword;
|
|
|
|
_jieba->CutForSearch(word, word_vec);
|
|
|
|
_jieba->CutForSearch(word, word_vec);
|
|
|
|
// 升序
|
|
|
|
// 升序
|
|
|
|
std::sort(word_vec.begin(), word_vec.end(), [](std::string a, std::string b ) {return a.size() > b.size();});
|
|
|
|
std::sort(word_vec.begin(),
|
|
|
|
|
|
|
|
word_vec.end(),
|
|
|
|
|
|
|
|
[](std::string a, std::string b) { return a.size() > b.size(); });
|
|
|
|
std::string first_subword = word_vec[0]; // 提取长度最短的字符串
|
|
|
|
std::string first_subword = word_vec[0]; // 提取长度最短的字符串
|
|
|
|
int first_begin_idx = word.find_first_of(first_subword);
|
|
|
|
int first_begin_idx = word.find_first_of(first_subword);
|
|
|
|
if (first_begin_idx == 0) {
|
|
|
|
if (first_begin_idx == 0) {
|
|
|
@ -401,12 +472,12 @@ int FrontEngineInterface::SplitWord(const std::string &word, std::vector<std::st
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// example: 不 一起 --> 不一起
|
|
|
|
// example: 不 一起 --> 不一起
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -432,7 +503,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(s
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result_temp;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result_temp;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -442,10 +514,13 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(s
|
|
|
|
word = seg_result[i].first;
|
|
|
|
word = seg_result[i].first;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
if ((i - 1 >= 0) && (word == "一") && (i + 1 < seg_result.size()) &&
|
|
|
|
if ((i - 1 >= 0) && (word == "一") && (i + 1 < seg_result.size()) &&
|
|
|
|
(seg_result[i - 1].first == seg_result[i + 1].first) && seg_result[i - 1].second == "v") {
|
|
|
|
(seg_result[i - 1].first == seg_result[i + 1].first) &&
|
|
|
|
result_temp[i - 1].first = result_temp[i - 1].first + "一" + result_temp[i - 1].first;
|
|
|
|
seg_result[i - 1].second == "v") {
|
|
|
|
|
|
|
|
result_temp[i - 1].first =
|
|
|
|
|
|
|
|
result_temp[i - 1].first + "一" + result_temp[i - 1].first;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
if((i - 2 >= 0) && (seg_result[i - 1].first == "一") && (seg_result[i - 2].first == word) && (pos == "v")) {
|
|
|
|
if ((i - 2 >= 0) && (seg_result[i - 1].first == "一") &&
|
|
|
|
|
|
|
|
(seg_result[i - 2].first == word) && (pos == "v")) {
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
result_temp.push_back(make_pair(word, pos));
|
|
|
|
result_temp.push_back(make_pair(word, pos));
|
|
|
@ -463,14 +538,15 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(s
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
result.push_back(make_pair(word, pos));
|
|
|
|
result.push_back(make_pair(word, pos));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// example: 你 你 --> 你你
|
|
|
|
// example: 你 你 --> 你你
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeReduplication(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>>
|
|
|
|
|
|
|
|
FrontEngineInterface::MergeReduplication(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -489,7 +565,9 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeRedu
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// the first and the second words are all_tone_three
|
|
|
|
// the first and the second words are all_tone_three
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThreeTones(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>>
|
|
|
|
|
|
|
|
FrontEngineInterface::MergeThreeTones(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -499,7 +577,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
|
|
|
|
|
|
|
|
// 判断最后一个分词结果是不是标点,不看标点的声母韵母
|
|
|
|
// 判断最后一个分词结果是不是标点,不看标点的声母韵母
|
|
|
|
int word_num = seg_result.size() - 1;
|
|
|
|
int word_num = seg_result.size() - 1;
|
|
|
|
if(std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) == _punc.end()){ // 最后一个分词结果不是标点
|
|
|
|
if (std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) ==
|
|
|
|
|
|
|
|
_punc.end()) { // 最后一个分词结果不是标点
|
|
|
|
word_num += 1;
|
|
|
|
word_num += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -508,7 +587,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
word_final = {};
|
|
|
|
word_final = {};
|
|
|
|
word = seg_result[i].first;
|
|
|
|
word = seg_result[i].first;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
if(std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) { // 非可忽略的标点,即文字
|
|
|
|
if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
|
|
|
|
|
|
|
|
_punc_omit.end()) { // 非可忽略的标点,即文字
|
|
|
|
if (0 != GetFinals(word, word_final)) {
|
|
|
|
if (0 != GetFinals(word, word_final)) {
|
|
|
|
LOG(ERROR) << "Failed to get the final of word.";
|
|
|
|
LOG(ERROR) << "Failed to get the final of word.";
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -522,10 +602,15 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
for (int i = 0; i < word_num; i++) {
|
|
|
|
for (int i = 0; i < word_num; i++) {
|
|
|
|
word = seg_result[i].first;
|
|
|
|
word = seg_result[i].first;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
if(i - 1 >= 0 && AllToneThree(finals[i - 1]) && AllToneThree(finals[i]) && !merge_last[i - 1]) {
|
|
|
|
if (i - 1 >= 0 && AllToneThree(finals[i - 1]) &&
|
|
|
|
// if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
|
|
|
|
AllToneThree(finals[i]) && !merge_last[i - 1]) {
|
|
|
|
|
|
|
|
// if the last word is reduplication, not merge, because
|
|
|
|
|
|
|
|
// reduplication need to be _neural_sandhi
|
|
|
|
if (!IsReduplication(seg_result[i - 1].first) &&
|
|
|
|
if (!IsReduplication(seg_result[i - 1].first) &&
|
|
|
|
(ppspeech::utf8string2wstring(seg_result[i - 1].first)).length() + (ppspeech::utf8string2wstring(word)).length() <= 3) {
|
|
|
|
(ppspeech::utf8string2wstring(seg_result[i - 1].first))
|
|
|
|
|
|
|
|
.length() +
|
|
|
|
|
|
|
|
(ppspeech::utf8string2wstring(word)).length() <=
|
|
|
|
|
|
|
|
3) {
|
|
|
|
result.back().first = result.back().first + seg_result[i].first;
|
|
|
|
result.back().first = result.back().first + seg_result[i].first;
|
|
|
|
merge_last[i] = true;
|
|
|
|
merge_last[i] = true;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
@ -538,14 +623,17 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
|
|
|
|
|
|
|
|
//把标点的分词结果补上
|
|
|
|
//把标点的分词结果补上
|
|
|
|
if (word_num < seg_result.size()) {
|
|
|
|
if (word_num < seg_result.size()) {
|
|
|
|
result.push_back(make_pair(seg_result[word_num].first, seg_result[word_num].second));
|
|
|
|
result.push_back(
|
|
|
|
|
|
|
|
make_pair(seg_result[word_num].first, seg_result[word_num].second));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// the last char of first word and the first char of second word is tone_three
|
|
|
|
// the last char of first word and the first char of second word is tone_three
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThreeTones2(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>>
|
|
|
|
|
|
|
|
FrontEngineInterface::MergeThreeTones2(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -555,7 +643,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
|
|
|
|
|
|
|
|
// 判断最后一个分词结果是不是标点
|
|
|
|
// 判断最后一个分词结果是不是标点
|
|
|
|
int word_num = seg_result.size() - 1;
|
|
|
|
int word_num = seg_result.size() - 1;
|
|
|
|
if(std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) == _punc.end()){ // 最后一个分词结果不是标点
|
|
|
|
if (std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) ==
|
|
|
|
|
|
|
|
_punc.end()) { // 最后一个分词结果不是标点
|
|
|
|
word_num += 1;
|
|
|
|
word_num += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -565,7 +654,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
word = seg_result[i].first;
|
|
|
|
word = seg_result[i].first;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
// 如果是文字,则获取韵母,如果是可忽略的标点,例如引号,则跳过
|
|
|
|
// 如果是文字,则获取韵母,如果是可忽略的标点,例如引号,则跳过
|
|
|
|
if(std::find(_punc_omit.begin(), _punc_omit.end(), word) == _punc_omit.end()) {
|
|
|
|
if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
|
|
|
|
|
|
|
|
_punc_omit.end()) {
|
|
|
|
if (0 != GetFinals(word, word_final)) {
|
|
|
|
if (0 != GetFinals(word, word_final)) {
|
|
|
|
LOG(ERROR) << "Failed to get the final of word.";
|
|
|
|
LOG(ERROR) << "Failed to get the final of word.";
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -579,11 +669,18 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
for (int i = 0; i < word_num; i++) {
|
|
|
|
for (int i = 0; i < word_num; i++) {
|
|
|
|
word = seg_result[i].first;
|
|
|
|
word = seg_result[i].first;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
pos = seg_result[i].second;
|
|
|
|
if(i - 1 >= 0 && !finals[i - 1].empty() && absl::EndsWith(finals[i - 1].back(), "3") == true &&
|
|
|
|
if (i - 1 >= 0 && !finals[i - 1].empty() &&
|
|
|
|
!finals[i].empty() && absl::EndsWith(finals[i].front(), "3") == true && !merge_last[i - 1]) {
|
|
|
|
absl::EndsWith(finals[i - 1].back(), "3") == true &&
|
|
|
|
// if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
|
|
|
|
!finals[i].empty() &&
|
|
|
|
|
|
|
|
absl::EndsWith(finals[i].front(), "3") == true &&
|
|
|
|
|
|
|
|
!merge_last[i - 1]) {
|
|
|
|
|
|
|
|
// if the last word is reduplication, not merge, because
|
|
|
|
|
|
|
|
// reduplication need to be _neural_sandhi
|
|
|
|
if (!IsReduplication(seg_result[i - 1].first) &&
|
|
|
|
if (!IsReduplication(seg_result[i - 1].first) &&
|
|
|
|
(ppspeech::utf8string2wstring(seg_result[i - 1].first)).length() + ppspeech::utf8string2wstring(word).length() <= 3) {
|
|
|
|
(ppspeech::utf8string2wstring(seg_result[i - 1].first))
|
|
|
|
|
|
|
|
.length() +
|
|
|
|
|
|
|
|
ppspeech::utf8string2wstring(word).length() <=
|
|
|
|
|
|
|
|
3) {
|
|
|
|
result.back().first = result.back().first + seg_result[i].first;
|
|
|
|
result.back().first = result.back().first + seg_result[i].first;
|
|
|
|
merge_last[i] = true;
|
|
|
|
merge_last[i] = true;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
@ -596,14 +693,16 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
|
|
|
|
|
|
|
|
|
|
|
|
//把标点的分词结果补上
|
|
|
|
//把标点的分词结果补上
|
|
|
|
if (word_num < seg_result.size()) {
|
|
|
|
if (word_num < seg_result.size()) {
|
|
|
|
result.push_back(make_pair(seg_result[word_num].first, seg_result[word_num].second));
|
|
|
|
result.push_back(
|
|
|
|
|
|
|
|
make_pair(seg_result[word_num].first, seg_result[word_num].second));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// example: 吃饭 儿 --> 吃饭儿
|
|
|
|
// example: 吃饭 儿 --> 吃饭儿
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_result) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::vector<std::pair<std::string, std::string>> result;
|
|
|
|
std::string word;
|
|
|
|
std::string word;
|
|
|
|
std::string pos;
|
|
|
|
std::string pos;
|
|
|
@ -621,12 +720,13 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(s
|
|
|
|
return result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::MergeforModify(std::vector<std::pair<std::string, std::string>> &seg_word_type,
|
|
|
|
int FrontEngineInterface::MergeforModify(
|
|
|
|
|
|
|
|
std::vector<std::pair<std::string, std::string>> &seg_word_type,
|
|
|
|
std::vector<std::pair<std::string, std::string>> &modify_seg_word_type) {
|
|
|
|
std::vector<std::pair<std::string, std::string>> &modify_seg_word_type) {
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::string> seg_result;
|
|
|
|
std::vector<std::string> seg_result;
|
|
|
|
GetSegResult(seg_word_type, seg_result);
|
|
|
|
GetSegResult(seg_word_type, seg_result);
|
|
|
|
LOG(INFO) << "Before merge, seg result is: " << limonp::Join(seg_result.begin(), seg_result.end(), "/");
|
|
|
|
LOG(INFO) << "Before merge, seg result is: "
|
|
|
|
|
|
|
|
<< limonp::Join(seg_result.begin(), seg_result.end(), "/");
|
|
|
|
|
|
|
|
|
|
|
|
modify_seg_word_type = MergeBu(seg_word_type);
|
|
|
|
modify_seg_word_type = MergeBu(seg_word_type);
|
|
|
|
modify_seg_word_type = Mergeyi(modify_seg_word_type);
|
|
|
|
modify_seg_word_type = Mergeyi(modify_seg_word_type);
|
|
|
@ -637,13 +737,15 @@ int FrontEngineInterface::MergeforModify(std::vector<std::pair<std::string, std:
|
|
|
|
|
|
|
|
|
|
|
|
seg_result = {};
|
|
|
|
seg_result = {};
|
|
|
|
GetSegResult(modify_seg_word_type, seg_result);
|
|
|
|
GetSegResult(modify_seg_word_type, seg_result);
|
|
|
|
LOG(INFO) << "After merge, seg result is: " << limonp::Join(seg_result.begin(), seg_result.end(), "/");
|
|
|
|
LOG(INFO) << "After merge, seg result is: "
|
|
|
|
|
|
|
|
<< limonp::Join(seg_result.begin(), seg_result.end(), "/");
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::BuSandi(const std::string &word, std::vector<std::string> &finals) {
|
|
|
|
int FrontEngineInterface::BuSandi(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &finals) {
|
|
|
|
std::wstring bu = L"不";
|
|
|
|
std::wstring bu = L"不";
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
// 一个词转成向量形式
|
|
|
|
// 一个词转成向量形式
|
|
|
@ -669,7 +771,8 @@ int FrontEngineInterface::BuSandi(const std::string &word, std::vector<std::stri
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::string> &finals) {
|
|
|
|
int FrontEngineInterface::YiSandhi(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &finals) {
|
|
|
|
std::wstring yi = L"一";
|
|
|
|
std::wstring yi = L"一";
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
// 一个词转成向量形式
|
|
|
|
// 一个词转成向量形式
|
|
|
@ -692,7 +795,8 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
|
|
|
|
if (flags == 0) {
|
|
|
|
if (flags == 0) {
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if(wordvec.size() == 3 && wordvec[1] == yi && wordvec[0] == wordvec[2]) {
|
|
|
|
} else if (wordvec.size() == 3 && wordvec[1] == yi &&
|
|
|
|
|
|
|
|
wordvec[0] == wordvec[2]) {
|
|
|
|
// "一" between reduplication words shold be yi5, e.g. 看一看
|
|
|
|
// "一" between reduplication words shold be yi5, e.g. 看一看
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "5");
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "5");
|
|
|
|
} else if (wordvec[0] == L"第" && wordvec[1] == yi) { //以第一位开始
|
|
|
|
} else if (wordvec[0] == L"第" && wordvec[1] == yi) { //以第一位开始
|
|
|
@ -702,10 +806,12 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
|
|
|
|
if (wordvec[i] == yi && i + 1 < wordvec.size()) {
|
|
|
|
if (wordvec[i] == yi && i + 1 < wordvec.size()) {
|
|
|
|
if (absl::EndsWith(finals[i + 1], "4") == true) {
|
|
|
|
if (absl::EndsWith(finals[i + 1], "4") == true) {
|
|
|
|
// "一" before tone4 should be yi2, e.g. 一段
|
|
|
|
// "一" before tone4 should be yi2, e.g. 一段
|
|
|
|
finals[i] = finals[i].replace(finals[i].length() - 1, 1, "2");
|
|
|
|
finals[i] =
|
|
|
|
|
|
|
|
finals[i].replace(finals[i].length() - 1, 1, "2");
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// "一" before non-tone4 should be yi4, e.g. 一天
|
|
|
|
// "一" before non-tone4 should be yi4, e.g. 一天
|
|
|
|
finals[i] = finals[i].replace(finals[i].length() - 1, 1, "4");
|
|
|
|
finals[i] =
|
|
|
|
|
|
|
|
finals[i].replace(finals[i].length() - 1, 1, "4");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -714,7 +820,9 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::string &pos, std::vector<std::string> &finals) {
|
|
|
|
int FrontEngineInterface::NeuralSandhi(const std::string &word,
|
|
|
|
|
|
|
|
const std::string &pos,
|
|
|
|
|
|
|
|
std::vector<std::string> &finals) {
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
std::vector<std::wstring> wordvec;
|
|
|
|
// 一个词转成向量形式
|
|
|
|
// 一个词转成向量形式
|
|
|
@ -728,7 +836,8 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
|
|
|
|
// 情况1:reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
|
|
|
|
// 情况1:reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
|
|
|
|
for (int j = 0; j < wordvec.size(); j++) {
|
|
|
|
for (int j = 0; j < wordvec.size(); j++) {
|
|
|
|
std::string inits = "nva";
|
|
|
|
std::string inits = "nva";
|
|
|
|
if(j - 1 >= 0 && wordvec[j] == wordvec[j - 1] && inits.find(pos[0]) != inits.npos) {
|
|
|
|
if (j - 1 >= 0 && wordvec[j] == wordvec[j - 1] &&
|
|
|
|
|
|
|
|
inits.find(pos[0]) != inits.npos) {
|
|
|
|
finals[j] = finals[j].replace(finals[j].length() - 1, 1, "5");
|
|
|
|
finals[j] = finals[j].replace(finals[j].length() - 1, 1, "5");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -749,27 +858,49 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
|
|
|
|
auto ge_idx = word_wstr.find_first_of(ge); // 出现“个”的第一个位置
|
|
|
|
auto ge_idx = word_wstr.find_first_of(ge); // 出现“个”的第一个位置
|
|
|
|
|
|
|
|
|
|
|
|
if (word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) {
|
|
|
|
if (word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) {
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
} else if (word_num >= 1 && de.find(wordvec.back()) != de.npos) {
|
|
|
|
} else if (word_num >= 1 && de.find(wordvec.back()) != de.npos) {
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
finals.back() =
|
|
|
|
} else if(word_num == 1 && le.find(wordvec[0]) != le.npos && find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) {
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
} else if (word_num == 1 && le.find(wordvec[0]) != le.npos &&
|
|
|
|
} else if(word_num > 1 && men.find(wordvec.back()) != men.npos && find(men_pos.begin(), men_pos.end(), pos) != men_pos.end()
|
|
|
|
find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) {
|
|
|
|
&& find(must_not_neural_tone_words.begin(), must_not_neural_tone_words.end(), word) != must_not_neural_tone_words.end()) {
|
|
|
|
finals.back() =
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
} else if(word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos && find(weizhi_pos.begin(), weizhi_pos.end(), pos) != weizhi_pos.end()) {
|
|
|
|
} else if (word_num > 1 && men.find(wordvec.back()) != men.npos &&
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
find(men_pos.begin(), men_pos.end(), pos) != men_pos.end() &&
|
|
|
|
} else if(word_num > 1 && dong.find(wordvec.back()) != dong.npos && fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) {
|
|
|
|
find(must_not_neural_tone_words.begin(),
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
must_not_neural_tone_words.end(),
|
|
|
|
|
|
|
|
word) != must_not_neural_tone_words.end()) {
|
|
|
|
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
|
|
|
|
} else if (word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos &&
|
|
|
|
|
|
|
|
find(weizhi_pos.begin(), weizhi_pos.end(), pos) !=
|
|
|
|
|
|
|
|
weizhi_pos.end()) {
|
|
|
|
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
|
|
|
|
} else if (word_num > 1 && dong.find(wordvec.back()) != dong.npos &&
|
|
|
|
|
|
|
|
fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) {
|
|
|
|
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// 情况3:对“个”字前面带有修饰词的字词读音处理
|
|
|
|
// 情况3:对“个”字前面带有修饰词的字词读音处理
|
|
|
|
else if((ge_idx != word_wstr.npos && ge_idx >= 1 && xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos)
|
|
|
|
else if ((ge_idx != word_wstr.npos && ge_idx >= 1 &&
|
|
|
|
|| word_wstr == ge) {
|
|
|
|
xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos) ||
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
word_wstr == ge) {
|
|
|
|
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
if(find(must_neural_tone_words.begin(), must_neural_tone_words.end(), word) != must_neural_tone_words.end()
|
|
|
|
if (find(must_neural_tone_words.begin(),
|
|
|
|
|| (word_num >= 2 && find(must_neural_tone_words.begin(), must_neural_tone_words.end(), ppspeech::wstring2utf8string(word_wstr.substr(word_num - 2))) != must_neural_tone_words.end())) {
|
|
|
|
must_neural_tone_words.end(),
|
|
|
|
finals.back() = finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
word) != must_neural_tone_words.end() ||
|
|
|
|
|
|
|
|
(word_num >= 2 &&
|
|
|
|
|
|
|
|
find(must_neural_tone_words.begin(),
|
|
|
|
|
|
|
|
must_neural_tone_words.end(),
|
|
|
|
|
|
|
|
ppspeech::wstring2utf8string(word_wstr.substr(
|
|
|
|
|
|
|
|
word_num - 2))) != must_neural_tone_words.end())) {
|
|
|
|
|
|
|
|
finals.back() =
|
|
|
|
|
|
|
|
finals.back().replace(finals.back().length() - 1, 1, "5");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -782,25 +913,39 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
|
|
|
|
// 创建对应的 韵母列表
|
|
|
|
// 创建对应的 韵母列表
|
|
|
|
std::vector<std::vector<std::string>> finals_list;
|
|
|
|
std::vector<std::vector<std::string>> finals_list;
|
|
|
|
std::vector<std::string> finals_temp;
|
|
|
|
std::vector<std::string> finals_temp;
|
|
|
|
finals_temp.assign(finals.begin(), finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
|
|
|
|
finals_temp.assign(
|
|
|
|
|
|
|
|
finals.begin(),
|
|
|
|
|
|
|
|
finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_temp.assign(finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(), finals.end());
|
|
|
|
finals_temp.assign(
|
|
|
|
|
|
|
|
finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(),
|
|
|
|
|
|
|
|
finals.end());
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
|
|
|
|
|
|
|
|
finals = {};
|
|
|
|
finals = {};
|
|
|
|
for (int i = 0; i < word_list.size(); i++) {
|
|
|
|
for (int i = 0; i < word_list.size(); i++) {
|
|
|
|
std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]);
|
|
|
|
std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]);
|
|
|
|
if((find(must_neural_tone_words.begin(), must_neural_tone_words.end(), word_list[i]) != must_neural_tone_words.end())
|
|
|
|
if ((find(must_neural_tone_words.begin(),
|
|
|
|
|| (temp_wstr.length() >= 2 && find(must_neural_tone_words.begin(), must_neural_tone_words.end(), ppspeech::wstring2utf8string(temp_wstr.substr(temp_wstr.length() - 2))) != must_neural_tone_words.end())) {
|
|
|
|
must_neural_tone_words.end(),
|
|
|
|
finals_list[i].back() = finals_list[i].back().replace(finals_list[i].back().length() - 1, 1, "5");
|
|
|
|
word_list[i]) != must_neural_tone_words.end()) ||
|
|
|
|
}
|
|
|
|
(temp_wstr.length() >= 2 &&
|
|
|
|
finals.insert(finals.end(), finals_list[i].begin(), finals_list[i].end());
|
|
|
|
find(must_neural_tone_words.begin(),
|
|
|
|
|
|
|
|
must_neural_tone_words.end(),
|
|
|
|
|
|
|
|
ppspeech::wstring2utf8string(
|
|
|
|
|
|
|
|
temp_wstr.substr(temp_wstr.length() - 2))) !=
|
|
|
|
|
|
|
|
must_neural_tone_words.end())) {
|
|
|
|
|
|
|
|
finals_list[i].back() = finals_list[i].back().replace(
|
|
|
|
|
|
|
|
finals_list[i].back().length() - 1, 1, "5");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
finals.insert(
|
|
|
|
|
|
|
|
finals.end(), finals_list[i].begin(), finals_list[i].end());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::string> &finals) {
|
|
|
|
int FrontEngineInterface::ThreeSandhi(const std::string &word,
|
|
|
|
|
|
|
|
std::vector<std::string> &finals) {
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
std::wstring word_wstr = ppspeech::utf8string2wstring(word);
|
|
|
|
std::vector<std::vector<std::string>> finals_list;
|
|
|
|
std::vector<std::vector<std::string>> finals_list;
|
|
|
|
std::vector<std::string> finals_temp;
|
|
|
|
std::vector<std::string> finals_temp;
|
|
|
@ -828,31 +973,43 @@ int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::
|
|
|
|
if (temp_wstr.length() == 2) {
|
|
|
|
if (temp_wstr.length() == 2) {
|
|
|
|
finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
|
|
|
|
finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
|
|
|
|
} else if(temp_wstr.length() == 1) { //monosyllabic + disyllabic, e.g. 纸/老虎
|
|
|
|
} else if (temp_wstr.length() ==
|
|
|
|
|
|
|
|
1) { // monosyllabic + disyllabic, e.g. 纸/老虎
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
|
|
|
|
finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
// 创建对应的 韵母列表
|
|
|
|
// 创建对应的 韵母列表
|
|
|
|
finals_temp = {};
|
|
|
|
finals_temp = {};
|
|
|
|
finals_list = {};
|
|
|
|
finals_list = {};
|
|
|
|
finals_temp.assign(finals.begin(), finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
|
|
|
|
finals_temp.assign(
|
|
|
|
|
|
|
|
finals.begin(),
|
|
|
|
|
|
|
|
finals.begin() +
|
|
|
|
|
|
|
|
ppspeech::utf8string2wstring(word_list[0]).length());
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_temp.assign(finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(), finals.end());
|
|
|
|
finals_temp.assign(
|
|
|
|
|
|
|
|
finals.begin() +
|
|
|
|
|
|
|
|
ppspeech::utf8string2wstring(word_list[0]).length(),
|
|
|
|
|
|
|
|
finals.end());
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
finals_list.push_back(finals_temp);
|
|
|
|
|
|
|
|
|
|
|
|
finals = {};
|
|
|
|
finals = {};
|
|
|
|
for (int i = 0; i < finals_list.size(); i++) {
|
|
|
|
for (int i = 0; i < finals_list.size(); i++) {
|
|
|
|
// e.g. 所有/人
|
|
|
|
// e.g. 所有/人
|
|
|
|
if(AllToneThree(finals_list[i]) && finals_list[i].size() == 2) {
|
|
|
|
if (AllToneThree(finals_list[i]) &&
|
|
|
|
finals_list[i][0] = finals_list[i][0].replace(finals_list[i][0].length() - 1, 1, "2");
|
|
|
|
finals_list[i].size() == 2) {
|
|
|
|
} else if(i == 1 && !(AllToneThree(finals_list[i])) && absl::EndsWith(finals_list[i][0], "3") == true
|
|
|
|
finals_list[i][0] = finals_list[i][0].replace(
|
|
|
|
&& absl::EndsWith(finals_list[0].back(), "3") == true) {
|
|
|
|
finals_list[i][0].length() - 1, 1, "2");
|
|
|
|
finals_list[0].back() = finals_list[0].back().replace(finals_list[0].back().length() - 1, 1, "2");
|
|
|
|
} else if (i == 1 && !(AllToneThree(finals_list[i])) &&
|
|
|
|
|
|
|
|
absl::EndsWith(finals_list[i][0], "3") == true &&
|
|
|
|
|
|
|
|
absl::EndsWith(finals_list[0].back(), "3") == true) {
|
|
|
|
|
|
|
|
finals_list[0].back() = finals_list[0].back().replace(
|
|
|
|
|
|
|
|
finals_list[0].back().length() - 1, 1, "2");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
finals.insert(finals.end(), finals_list[0].begin(), finals_list[0].end());
|
|
|
|
finals.insert(
|
|
|
|
finals.insert(finals.end(), finals_list[1].begin(), finals_list[1].end());
|
|
|
|
finals.end(), finals_list[0].begin(), finals_list[0].end());
|
|
|
|
|
|
|
|
finals.insert(
|
|
|
|
|
|
|
|
finals.end(), finals_list[1].begin(), finals_list[1].end());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} else if (word_num == 4) { //将成语拆分为两个长度为 2 的单词
|
|
|
|
} else if (word_num == 4) { //将成语拆分为两个长度为 2 的单词
|
|
|
@ -867,19 +1024,23 @@ int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::
|
|
|
|
finals = {};
|
|
|
|
finals = {};
|
|
|
|
for (int j = 0; j < finals_list.size(); j++) {
|
|
|
|
for (int j = 0; j < finals_list.size(); j++) {
|
|
|
|
if (AllToneThree(finals_list[j])) {
|
|
|
|
if (AllToneThree(finals_list[j])) {
|
|
|
|
finals_list[j][0] = finals_list[j][0].replace(finals_list[j][0].length() - 1, 1, "2");
|
|
|
|
finals_list[j][0] = finals_list[j][0].replace(
|
|
|
|
|
|
|
|
finals_list[j][0].length() - 1, 1, "2");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
finals.insert(finals.end(), finals_list[j].begin(), finals_list[j].end());
|
|
|
|
finals.insert(
|
|
|
|
|
|
|
|
finals.end(), finals_list[j].begin(), finals_list[j].end());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int FrontEngineInterface::ModifyTone(const std::string &word, const std::string &pos, std::vector<std::string> &finals) {
|
|
|
|
int FrontEngineInterface::ModifyTone(const std::string &word,
|
|
|
|
|
|
|
|
const std::string &pos,
|
|
|
|
|
|
|
|
std::vector<std::string> &finals) {
|
|
|
|
if ((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) ||
|
|
|
|
if ((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) ||
|
|
|
|
(0 != NeuralSandhi(word, pos, finals)) || (0 != ThreeSandhi(word,finals))) {
|
|
|
|
(0 != NeuralSandhi(word, pos, finals)) ||
|
|
|
|
|
|
|
|
(0 != ThreeSandhi(word, finals))) {
|
|
|
|
LOG(ERROR) << "Failed to modify tone of the word: " << word;
|
|
|
|
LOG(ERROR) << "Failed to modify tone of the word: " << word;
|
|
|
|
return -1;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -887,7 +1048,11 @@ int FrontEngineInterface::ModifyTone(const std::string &word, const std::string
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std::vector<std::string> &initials, const std::vector<std::string> &finals, const std::string &word, const std::string &pos) {
|
|
|
|
std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(
|
|
|
|
|
|
|
|
const std::vector<std::string> &initials,
|
|
|
|
|
|
|
|
const std::vector<std::string> &finals,
|
|
|
|
|
|
|
|
const std::string &word,
|
|
|
|
|
|
|
|
const std::string &pos) {
|
|
|
|
std::vector<std::string> new_initials = {};
|
|
|
|
std::vector<std::string> new_initials = {};
|
|
|
|
std::vector<std::string> new_finals = {};
|
|
|
|
std::vector<std::string> new_finals = {};
|
|
|
|
std::vector<std::vector<std::string>> new_initials_finals;
|
|
|
|
std::vector<std::vector<std::string>> new_initials_finals;
|
|
|
@ -900,8 +1065,11 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
|
|
|
|
}
|
|
|
|
}
|
|
|
|
int word_num = wordvec.size();
|
|
|
|
int word_num = wordvec.size();
|
|
|
|
|
|
|
|
|
|
|
|
if((find(must_erhua.begin(), must_erhua.end(), word) == must_erhua.end()) &&
|
|
|
|
if ((find(must_erhua.begin(), must_erhua.end(), word) ==
|
|
|
|
((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) || (find(specified_pos.begin(), specified_pos.end(), pos) != specified_pos.end()))) {
|
|
|
|
must_erhua.end()) &&
|
|
|
|
|
|
|
|
((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) ||
|
|
|
|
|
|
|
|
(find(specified_pos.begin(), specified_pos.end(), pos) !=
|
|
|
|
|
|
|
|
specified_pos.end()))) {
|
|
|
|
new_initials_finals.push_back(initials);
|
|
|
|
new_initials_finals.push_back(initials);
|
|
|
|
new_initials_finals.push_back(finals);
|
|
|
|
new_initials_finals.push_back(finals);
|
|
|
|
return new_initials_finals;
|
|
|
|
return new_initials_finals;
|
|
|
@ -914,9 +1082,16 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
|
|
|
|
|
|
|
|
|
|
|
|
assert(finals.size() == word_num);
|
|
|
|
assert(finals.size() == word_num);
|
|
|
|
for (int i = 0; i < finals.size(); i++) {
|
|
|
|
for (int i = 0; i < finals.size(); i++) {
|
|
|
|
if(i == finals.size() - 1 && wordvec[i] == L"儿" && (finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 &&
|
|
|
|
if (i == finals.size() - 1 && wordvec[i] == L"儿" &&
|
|
|
|
find(not_erhua.begin(), not_erhua.end(), ppspeech::wstring2utf8string(word_wstr.substr(word_wstr.length() - 2))) == not_erhua.end() && !new_finals.empty()) {
|
|
|
|
(finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 &&
|
|
|
|
new_finals.back() = new_finals.back().substr(0, new_finals.back().length()-1) + "r" + new_finals.back().substr(new_finals.back().length()-1);
|
|
|
|
find(not_erhua.begin(),
|
|
|
|
|
|
|
|
not_erhua.end(),
|
|
|
|
|
|
|
|
ppspeech::wstring2utf8string(word_wstr.substr(
|
|
|
|
|
|
|
|
word_wstr.length() - 2))) == not_erhua.end() &&
|
|
|
|
|
|
|
|
!new_finals.empty()) {
|
|
|
|
|
|
|
|
new_finals.back() =
|
|
|
|
|
|
|
|
new_finals.back().substr(0, new_finals.back().length() - 1) +
|
|
|
|
|
|
|
|
"r" + new_finals.back().substr(new_finals.back().length() - 1);
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
new_initials.push_back(initials[i]);
|
|
|
|
new_initials.push_back(initials[i]);
|
|
|
|
new_finals.push_back(finals[i]);
|
|
|
|
new_finals.push_back(finals[i]);
|
|
|
@ -926,8 +1101,5 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
|
|
|
|
new_initials_finals.push_back(new_finals);
|
|
|
|
new_initials_finals.push_back(new_finals);
|
|
|
|
|
|
|
|
|
|
|
|
return new_initials_finals;
|
|
|
|
return new_initials_finals;
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ppspeech
|
|
|
|