add license for TTSCppFrontend

Branch: pull/3030/head
Author: TianYuan, 3 years ago
Parent: 34f2995bcf
Commit: c0fb40110a

@@ -1,9 +1,22 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <map>
#include <string>
#include "front/front_interface.h"

DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
@@ -42,10 +55,11 @@ int main(int argc, char** argv) {
    LOG(INFO) << "Segment sentences through punctuation successfully";
    // 分句后获取音素id
    LOG(INFO)
        << "Start to get the phoneme and tone id sequence of each sentence";
    for (int i = 0; i < sentence_part.size(); i++) {
        LOG(INFO) << "Raw sentence is: "
                  << ppspeech::wstring2utf8string(sentence_part[i]);
        front_inst->SentenceNormalize(sentence_part[i]);
        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
        LOG(INFO) << "After normalization sentence is: " << s_sentence;
@@ -54,12 +68,12 @@ int main(int argc, char** argv) {
            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
            return -1;
        }
    }
    LOG(INFO) << "The phoneids of the sentence is: "
              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
    LOG(INFO) << "The toneids of the sentence is: "
              << limonp::Join(toneids.begin(), toneids.end(), " ");
    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
    return EXIT_SUCCESS;
}
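For readers trying the demo, a minimal sketch of how the two gflags defined above are typically parsed and read. This is an illustrative assumption, not the project's actual main(); it omits the front-engine setup shown in the diff and only demonstrates the --sentence and --front_conf flags.

// Hypothetical minimal driver (assumption, not project code): parse the two
// flags declared above and log their values with glog.
#include <gflags/gflags.h>
#include <glog/logging.h>

DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");

int main(int argc, char** argv) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);  // fills FLAGS_*
    google::InitGoogleLogging(argv[0]);
    LOG(INFO) << "sentence to synthesize: " << FLAGS_sentence;
    LOG(INFO) << "front conf file: " << FLAGS_front_conf;
    return 0;
}

// Example invocation (hypothetical binary name):
// ./front_demo --sentence "你好" --front_conf ./front_demo/front.conf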

@@ -1,19 +1,28 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import configparser

from paddlespeech.t2s.frontend.zh_frontend import Frontend


def get_phone(frontend,
              word,
              merge_sentences=True,
              print_info=False,
              robot=False,
              get_tone_ids=False):
    phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot)
    # Some optimizations
    phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids)
@@ -22,7 +31,10 @@ def get_phone(frontend, word, merge_sentences=True, print_info=False, robot=Fals
    return phones, tones


def gen_word2phone_dict(frontend,
                        jieba_words_dict,
                        word2phone_dict,
                        get_tone=False):
    with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2:
        for line in f1.readlines():
            word = line.split(" ")[0]
@@ -45,18 +57,21 @@ def gen_word2phone_dict(frontend, jieba_words_dict, word2phone_dict, get_tone=Fa
def main():
    parser = argparse.ArgumentParser(description="Generate dictionary")
    parser.add_argument(
        "--config", type=str, default="./config.ini", help="config file.")
    parser.add_argument(
        "--am_type",
        type=str,
        default="fastspeech2",
        help="fastspeech2 or speedyspeech")
    args = parser.parse_args()

    # Read config
    cf = configparser.ConfigParser()
    cf.read(args.config)
    jieba_words_dict_file = cf.get("jieba",
                                   "jieba_words_dict")  # get words dict

    am_type = args.am_type
    if (am_type == "fastspeech2"):
@@ -66,18 +81,27 @@ def main():
        frontend = Frontend(phone_vocab_path=phone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=False)

    elif (am_type == "speedyspeech"):
        phone2id_dict_file = cf.get(am_type, "phone2id_dict")
        tone2id_dict_file = cf.get(am_type, "tone2id_dict")
        word2phone_dict_file = cf.get(am_type, "word2phone_dict")

        frontend = Frontend(
            phone_vocab_path=phone2id_dict_file,
            tone_vocab_path=tone2id_dict_file)
        print("frontend done!")

        gen_word2phone_dict(
            frontend,
            jieba_words_dict_file,
            word2phone_dict_file,
            get_tone=True)

    else:
        print("Please set correct am type, fastspeech2 or speedyspeech.")

@@ -1,10 +1,23 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PHONESFILE = "./dict/phones.txt"
PHONES_ID_FILE = "./dict/phonesid.dict"
TONESFILE = "./dict/tones.txt"
TONES_ID_FILE = "./dict/tonesid.dict"


def GenIdFile(file, idfile):
    id = 2
    with open(file, 'r') as f1, open(idfile, "w+") as f2:
@@ -16,7 +29,7 @@ def GenIdFile(file, idfile):
            f2.write(phone + " " + str(id) + "\n")
            id += 1


if __name__ == "__main__":
    GenIdFile(PHONESFILE, PHONES_ID_FILE)
    GenIdFile(TONESFILE, TONES_ID_FILE)

@@ -1,9 +1,25 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from pypinyin import lazy_pinyin
from pypinyin import Style

worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"


def GenPhones(initials, finals, seperate=True):

    phones = []
@@ -14,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
        elif c in ['zh', 'ch', 'sh', 'r']:
            v = re.sub('i', 'iii', v)
        if c:
            if seperate is True:
                phones.append(c + '0')
            elif seperate is False:
                phones.append(c)
            else:
                print("Not sure whether phone and tone need to be separated")
@@ -28,8 +44,10 @@ def GenPhones(initials, finals, seperate=True):
with open(worddict, "r") as f1, open(newdict, "w+") as f2:
    for line in f1.readlines():
        word = line.split(" ")[0]
        initials = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.INITIALS)
        finals = lazy_pinyin(
            word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)

        phones = GenPhones(initials, finals, True)

@@ -1,18 +1,28 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/type_conv.h"

namespace ppspeech {
// wstring to string
std::string wstring2utf8string(const std::wstring& str) {
    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
    return strCnv.to_bytes(str);
}

// string to wstring
std::wstring utf8string2wstring(const std::string& str) {
    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
    return strCnv.from_bytes(str);
}
}  // namespace ppspeech
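A minimal sketch of how these two helpers round-trip text; it assumes the header and definitions above are available to the linker and is only illustrative, not part of the commit.

// Illustrative round-trip check (assumption: base/type_conv.h and the
// definitions above are linked in).
#include <cassert>
#include <string>
#include "base/type_conv.h"

int main() {
    const std::string utf8 = "你好, TTS";                    // UTF-8 input
    std::wstring wide = ppspeech::utf8string2wstring(utf8);  // decode
    std::string back = ppspeech::wstring2utf8string(wide);   // re-encode
    assert(back == utf8);  // conversion is lossless for valid UTF-8
    return 0;
}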

@@ -1,9 +1,23 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BASE_TYPE_CONVC_H
#define BASE_TYPE_CONVC_H

#include <codecvt>
#include <locale>
#include <string>

namespace ppspeech {
@@ -12,7 +26,6 @@ std::string wstring2utf8string(const std::wstring& str);
// string to wstring
std::wstring utf8string2wstring(const std::string& str);
}
#endif  // BASE_TYPE_CONVC_H

@ -1,3 +1,16 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "front/front_interface.h" #include "front/front_interface.h"
namespace ppspeech { namespace ppspeech {
@@ -11,66 +24,93 @@ int FrontEngineInterface::init() {
        return -1;
    }
    _jieba = new cppjieba::Jieba(_jieba_dict_path,
                                 _jieba_hmm_path,
                                 _jieba_user_dict_path,
                                 _jieba_idf_path,
                                 _jieba_stop_word_path);

    _punc = {"", "", "", "", "", "", "~", "",
             ",", ".", "?", "!", ":", ";", "/", "\\"};
    _punc_omit = {"", "", "\"", "\""};

    // 需要儿化音处理的词语
    must_erhua = {
        "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"};
    not_erhua = {"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿",
                 "有儿", "一儿", "我儿", "俺儿", "妻儿", "拐儿",
                 "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿",
                 "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿",
                 "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿",
                 "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿",
                 "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
                 "狗儿"};

    must_not_neural_tone_words = {
        "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"};
    // 需要轻声处理的词语
    must_neural_tone_words = {
        "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", "风筝",
        "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", "钥匙", "里脊",
        "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", "这个", "运气", "过去",
        "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号",
        "认识", "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", "行当",
        "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻",
        "舒服", "舒坦", "舌头", "自在", "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂",
        "胡萝", "胡琴", "胡同", "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆",
        "老头", "老太", "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂",
        "精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
        "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", "砚台",
        "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算",
        "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨",
        "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", "爽快",
        "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", "火候", "漂亮", "滑溜",
        "溜达", "温和", "清楚", "消息", "浪头", "活泼", "比方", "正经", "欺负", "模糊", "槟榔",
        "棺材", "棒槌", "棉花", "核桃", "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事",
        "木头", "木匠", "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾",
        "收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼",
        "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
        "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", "念头",
        "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼",
        "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", "岁数",
        "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", "寡妇", "家伙", "客气",
        "实在", "官司", "学问", "学生", "字号", "嫁妆", "媳妇", "媒人", "婆家", "娘家", "委屈",
        "姑娘", "姐夫", "妯娌", "妥当", "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方",
        "大意", "大夫", "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴",
        "嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦",
        "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝",
        "叫唤", "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
        "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息",
        "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", "使唤",
        "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", "人家", "亲戚", "亲家",
        "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", "两口", "东西", "东家", "世故",
        "不由", "不在", "下水", "下巴", "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨",
        "父亲", "母亲", "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅",
        "幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱",
        "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱",
        "扫把", "惦记"};

    // 生成词典(词到音素的映射)
@@ -137,7 +177,8 @@ int FrontEngineInterface::ReadConfFile() {
    return 0;
}

int FrontEngineInterface::Trand2Simp(const std::wstring &sentence,
                                     std::wstring &sentence_simp) {
    // sentence_simp = sentence;
    for (int i = 0; i < sentence.length(); i++) {
        std::wstring temp(1, sentence[i]);
@@ -146,14 +187,16 @@ int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, std::wstring
        if (trand_simp_map.find(sigle_word) == trand_simp_map.end()) {
            sentence_simp += temp;
        } else {
            sentence_simp +=
                (ppspeech::utf8string2wstring(trand_simp_map[sigle_word]));
        }
    }

    return 0;
}

int FrontEngineInterface::GenDict(const std::string &dict_file,
                                  std::map<std::string, std::string> &map) {
    std::ifstream is(dict_file.c_str(), std::ifstream::in);
    if (!is.good()) {
        LOG(ERROR) << "Cannot open dict file: " << dict_file;
@@ -169,7 +212,8 @@ int FrontEngineInterface::GenDict(const std::string &dict_file, std::map<std::st
    return 0;
}

int FrontEngineInterface::GetSegResult(
    std::vector<std::pair<std::string, std::string>> &seg,
    std::vector<std::string> &seg_words) {
    std::vector<std::pair<std::string, std::string>>::iterator iter;
    for (iter = seg.begin(); iter != seg.end(); iter++) {
@@ -178,8 +222,11 @@ int FrontEngineInterface::GetSegResult(std::vector<std::pair<std::string, std::s
    return 0;
}

int FrontEngineInterface::GetSentenceIds(const std::string &sentence,
                                         std::vector<int> &phoneids,
                                         std::vector<int> &toneids) {
    std::vector<std::pair<std::string, std::string>>
        cut_result;  //分词结果包含词和词性
    if (0 != Cut(sentence, cut_result)) {
        LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed";
        return -1;
@@ -192,7 +239,9 @@ int FrontEngineInterface::GetSentenceIds(const std::string &sentence, std::vecto
    return 0;
}

int FrontEngineInterface::GetWordsIds(
    const std::vector<std::pair<std::string, std::string>> &cut_result,
    std::vector<int> &phoneids,
    std::vector<int> &toneids) {
    std::string word;
    std::string pos;
@@ -202,15 +251,19 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
    for (int i = 0; i < cut_result.size(); i++) {
        word = cut_result[i].first;
        pos = cut_result[i].second;
        if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
            _punc_omit.end()) {  // 非可忽略的标点
            word_initials = {};
            word_finals = {};
            phone = "";
            // 判断是否在标点符号集合中
            if (std::find(_punc.begin(), _punc.end(), word) ==
                _punc.end()) {  // 文字
                // 获取字词的声母韵母列表
                if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
                    LOG(ERROR)
                        << "Genarate the word_initials and word_finals of "
                        << word << " failed";
                    return -1;
                }
@@ -220,7 +273,8 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
            }

            // 对儿化音进行修改
            std::vector<std::vector<std::string>> new_initals_finals =
                MergeErhua(word_initials, word_finals, word, pos);
            word_initials = new_initals_finals[0];
            word_finals = new_initals_finals[1];
@@ -256,10 +310,11 @@ int FrontEngineInterface::GetWordsIds(const std::vector<std::pair<std::string, s
    }

    return 0;
}

int FrontEngineInterface::Cut(
    const std::string &sentence,
    std::vector<std::pair<std::string, std::string>> &cut_result) {
    std::vector<std::pair<std::string, std::string>> cut_result_jieba;

    // 结巴分词
@@ -274,7 +329,8 @@ int FrontEngineInterface::Cut(const std::string &sentence, std::vector<std::pair
    return 0;
}

int FrontEngineInterface::GetPhone(const std::string &word,
                                   std::string &phone) {
    // 判断 word 在不在 词典里如果不在进行CutAll分词
    if (word_phone_map.find(word) == word_phone_map.end()) {
        std::vector<std::string> wordcut;
@@ -290,26 +346,33 @@ int FrontEngineInterface::GetPhone(const std::string &word, std::string &phone)
    return 0;
}

int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
                                        std::vector<int> &phoneid,
                                        std::vector<int> &toneid) {
    std::vector<std::string> phone_vec;
    phone_vec = absl::StrSplit(phone, " ");
    std::string temp_phone;
    for (int i = 0; i < phone_vec.size(); i++) {
        temp_phone = phone_vec[i];
        if (_seperate_tone == "true") {
            phoneid.push_back(atoi(
                (phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
                    .c_str()));
            toneid.push_back(
                atoi((tone_id_map[temp_phone.substr(temp_phone.length() - 1,
                                                    temp_phone.length())])
                         .c_str()));
        } else {
            phoneid.push_back(atoi((phone_id_map[temp_phone]).c_str()));
        }
    }
    return 0;
}

// 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声
bool FrontEngineInterface::AllToneThree(
    const std::vector<std::string> &finals) {
    bool flags = true;
    for (int i = 0; i < finals.size(); i++) {
        if ((int)finals[i].back() != 51) {  //如果读音不为第三声
@@ -317,7 +380,6 @@ bool FrontEngineInterface::AllToneThree(const std::vector<std::string> &finals)
        }
    }
    return flags;
}

// 判断词是否是叠词
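As an aside on the AllToneThree check above: a final such as "ang3" carries its tone as a trailing digit, and 51 is simply the ASCII code of the character '3'. A standalone, hypothetical sketch of the same test (not the class method itself):

#include <string>
#include <vector>

// Hypothetical helper mirroring the comparison used by AllToneThree():
// every final must end in the character '3' (ASCII 51) to count as tone 3.
static bool AllFinalsAreToneThree(const std::vector<std::string>& finals) {
    for (const std::string& f : finals) {
        if (f.empty() || f.back() != '3') {  // same test as (int)back() != 51
            return false;
        }
    }
    return true;
}

// AllFinalsAreToneThree({"uo3", "iou3"}) -> true   (both syllables tone 3)
// AllFinalsAreToneThree({"ang3", "en2"}) -> false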
@@ -329,11 +391,14 @@ bool FrontEngineInterface::IsReduplication(const std::string &word) {
        flags = true;
    }
    return flags;
}

// 获取每个字词的声母和韵母列表, word_initials 为声母列表word_finals
// 为韵母列表
int FrontEngineInterface::GetInitialsFinals(
    const std::string &word,
    std::vector<std::string> &word_initials,
    std::vector<std::string> &word_finals) {
    std::string phone;
    GetPhone(word, phone);  //获取字词对应的音素
    std::vector<std::string> phone_vec = absl::StrSplit(phone, " ");
@@ -344,7 +409,8 @@ int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector
            start += 1;
        }
        // 最后一位不是数字或者最后一位的数字是0均表示声母第二个是韵母
        else if (isdigit(phone_vec[start].back()) == 0 ||
                 (int)phone_vec[start].back() == 48) {
            word_initials.push_back(phone_vec[start]);
            word_finals.push_back(phone_vec[start + 1]);
            start += 2;
@@ -355,13 +421,15 @@ int FrontEngineInterface::GetInitialsFinals(const std::string &word, std::vector
        }
    }

    assert(word_finals.size() == ppspeech::utf8string2wstring(word).length() &&
           word_finals.size() == word_initials.size());

    return 0;
}

// 获取每个字词的韵母列表
int FrontEngineInterface::GetFinals(const std::string &word,
                                    std::vector<std::string> &word_finals) {
    std::vector<std::string> word_initials;
    if (0 != GetInitialsFinals(word, word_initials, word_finals)) {
        LOG(ERROR) << "Failed to get word finals";
@@ -371,23 +439,26 @@ int FrontEngineInterface::GetFinals(const std::string &word, std::vector<std::st
    return 0;
}

int FrontEngineInterface::Word2WordVec(const std::string &word,
                                       std::vector<std::wstring> &wordvec) {
    std::wstring word_wstr = ppspeech::utf8string2wstring(word);
    for (int i = 0; i < word_wstr.length(); i++) {
        std::wstring word_sigle(1, word_wstr[i]);
        wordvec.push_back(word_sigle);
    }
    return 0;
}

// yuantian01解释把一个词再进行分词找到。例子小雨伞 --> 小 雨伞 或者 小雨 伞
int FrontEngineInterface::SplitWord(const std::string &word,
                                    std::vector<std::string> &new_word_vec) {
    std::vector<std::string> word_vec;
    std::string second_subword;
    _jieba->CutForSearch(word, word_vec);
    // 升序
    std::sort(word_vec.begin(),
              word_vec.end(),
              [](std::string a, std::string b) { return a.size() > b.size(); });
    std::string first_subword = word_vec[0];  // 提取长度最短的字符串
    int first_begin_idx = word.find_first_of(first_subword);
    if (first_begin_idx == 0) {
@@ -401,12 +472,12 @@ int FrontEngineInterface::SplitWord(const std::string &word, std::vector<std::st
    }

    return 0;
}

// example: 不 一起 --> 不一起
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result;
    std::string word;
    std::string pos;
@@ -432,7 +503,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeBu(s
    return result;
}

std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result_temp;
    std::string word;
    std::string pos;
@@ -442,10 +514,13 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(s
        word = seg_result[i].first;
        pos = seg_result[i].second;

        if ((i - 1 >= 0) && (word == "一") && (i + 1 < seg_result.size()) &&
            (seg_result[i - 1].first == seg_result[i + 1].first) &&
            seg_result[i - 1].second == "v") {
            result_temp[i - 1].first =
                result_temp[i - 1].first + "一" + result_temp[i - 1].first;
        } else {
            if ((i - 2 >= 0) && (seg_result[i - 1].first == "一") &&
                (seg_result[i - 2].first == word) && (pos == "v")) {
                continue;
            } else {
                result_temp.push_back(make_pair(word, pos));
@@ -463,14 +538,15 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::Mergeyi(s
        } else {
            result.push_back(make_pair(word, pos));
        }
    }

    return result;
}

// example: 你 你 --> 你你
std::vector<std::pair<std::string, std::string>>
FrontEngineInterface::MergeReduplication(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result;
    std::string word;
    std::string pos;
@@ -489,7 +565,9 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeRedu
}

// the first and the second words are all_tone_three
std::vector<std::pair<std::string, std::string>>
FrontEngineInterface::MergeThreeTones(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result;
    std::string word;
    std::string pos;
@@ -499,7 +577,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    // 判断最后一个分词结果是不是标点,不看标点的声母韵母
    int word_num = seg_result.size() - 1;
    if (std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) ==
        _punc.end()) {  // 最后一个分词结果不是标点
        word_num += 1;
    }
@@ -508,7 +587,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
        word_final = {};
        word = seg_result[i].first;
        pos = seg_result[i].second;
        if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
            _punc_omit.end()) {  // 非可忽略的标点,即文字
            if (0 != GetFinals(word, word_final)) {
                LOG(ERROR) << "Failed to get the final of word.";
            }
@@ -522,10 +602,15 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    for (int i = 0; i < word_num; i++) {
        word = seg_result[i].first;
        pos = seg_result[i].second;
        if (i - 1 >= 0 && AllToneThree(finals[i - 1]) &&
            AllToneThree(finals[i]) && !merge_last[i - 1]) {
            // if the last word is reduplication, not merge, because
            // reduplication need to be _neural_sandhi
            if (!IsReduplication(seg_result[i - 1].first) &&
                (ppspeech::utf8string2wstring(seg_result[i - 1].first))
                        .length() +
                        (ppspeech::utf8string2wstring(word)).length() <=
                    3) {
                result.back().first = result.back().first + seg_result[i].first;
                merge_last[i] = true;
            } else {
@@ -538,14 +623,17 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    //把标点的分词结果补上
    if (word_num < seg_result.size()) {
        result.push_back(
            make_pair(seg_result[word_num].first, seg_result[word_num].second));
    }

    return result;
}

// the last char of first word and the first char of second word is tone_three
std::vector<std::pair<std::string, std::string>>
FrontEngineInterface::MergeThreeTones2(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result;
    std::string word;
    std::string pos;
@@ -555,7 +643,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    // 判断最后一个分词结果是不是标点
    int word_num = seg_result.size() - 1;
    if (std::find(_punc.begin(), _punc.end(), seg_result[word_num].first) ==
        _punc.end()) {  // 最后一个分词结果不是标点
        word_num += 1;
    }
@@ -565,7 +654,8 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
        word = seg_result[i].first;
        pos = seg_result[i].second;
        // 如果是文字,则获取韵母,如果是可忽略的标点,例如引号,则跳过
        if (std::find(_punc_omit.begin(), _punc_omit.end(), word) ==
            _punc_omit.end()) {
            if (0 != GetFinals(word, word_final)) {
                LOG(ERROR) << "Failed to get the final of word.";
            }
@@ -579,11 +669,18 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    for (int i = 0; i < word_num; i++) {
        word = seg_result[i].first;
        pos = seg_result[i].second;
        if (i - 1 >= 0 && !finals[i - 1].empty() &&
            absl::EndsWith(finals[i - 1].back(), "3") == true &&
            !finals[i].empty() &&
            absl::EndsWith(finals[i].front(), "3") == true &&
            !merge_last[i - 1]) {
            // if the last word is reduplication, not merge, because
            // reduplication need to be _neural_sandhi
            if (!IsReduplication(seg_result[i - 1].first) &&
                (ppspeech::utf8string2wstring(seg_result[i - 1].first))
                        .length() +
                        ppspeech::utf8string2wstring(word).length() <=
                    3) {
                result.back().first = result.back().first + seg_result[i].first;
                merge_last[i] = true;
            } else {
@@ -596,14 +693,16 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeThre
    //把标点的分词结果补上
    if (word_num < seg_result.size()) {
        result.push_back(
            make_pair(seg_result[word_num].first, seg_result[word_num].second));
    }

    return result;
}

// example: 吃饭 儿 --> 吃饭儿
std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(
    std::vector<std::pair<std::string, std::string>> &seg_result) {
    std::vector<std::pair<std::string, std::string>> result;
    std::string word;
    std::string pos;
@@ -621,12 +720,13 @@ std::vector<std::pair<std::string, std::string>> FrontEngineInterface::MergeEr(s
    return result;
}

int FrontEngineInterface::MergeforModify(
    std::vector<std::pair<std::string, std::string>> &seg_word_type,
    std::vector<std::pair<std::string, std::string>> &modify_seg_word_type) {
    std::vector<std::string> seg_result;
    GetSegResult(seg_word_type, seg_result);
    LOG(INFO) << "Before merge, seg result is: "
              << limonp::Join(seg_result.begin(), seg_result.end(), "/");

    modify_seg_word_type = MergeBu(seg_word_type);
    modify_seg_word_type = Mergeyi(modify_seg_word_type);
@@ -637,13 +737,15 @@ int FrontEngineInterface::MergeforModify(std::vector<std::pair<std::string, std:
    seg_result = {};
    GetSegResult(modify_seg_word_type, seg_result);
    LOG(INFO) << "After merge, seg result is: "
              << limonp::Join(seg_result.begin(), seg_result.end(), "/");

    return 0;
}

int FrontEngineInterface::BuSandi(const std::string &word,
                                  std::vector<std::string> &finals) {
    std::wstring bu = L"不";
    std::vector<std::wstring> wordvec;
    // 一个词转成向量形式
@@ -669,7 +771,8 @@ int FrontEngineInterface::BuSandi(const std::string &word, std::vector<std::stri
}

int FrontEngineInterface::YiSandhi(const std::string &word,
                                   std::vector<std::string> &finals) {
    std::wstring yi = L"一";
    std::vector<std::wstring> wordvec;
    // 一个词转成向量形式
@@ -692,7 +795,8 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
        if (flags == 0) {
            return 0;
        }
    } else if (wordvec.size() == 3 && wordvec[1] == yi &&
               wordvec[0] == wordvec[2]) {
        // "一" between reduplication words should be yi5, e.g. 看一看
        finals[1] = finals[1].replace(finals[1].length() - 1, 1, "5");
    } else if (wordvec[0] == L"第" && wordvec[1] == yi) {  //以第一位开始
@@ -702,10 +806,12 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
        if (wordvec[i] == yi && i + 1 < wordvec.size()) {
            if (absl::EndsWith(finals[i + 1], "4") == true) {
                // "一" before tone4 should be yi2, e.g. 一段
                finals[i] =
                    finals[i].replace(finals[i].length() - 1, 1, "2");
            } else {
                // "一" before non-tone4 should be yi4, e.g. 一天
                finals[i] =
                    finals[i].replace(finals[i].length() - 1, 1, "4");
            }
        }
    }
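The sandhi helpers above all rewrite a tone by overwriting the final's trailing digit. A standalone illustration of that idiom follows; the SetTone helper and the sample finals are hypothetical, not project code.

#include <cassert>
#include <string>

// Hypothetical helper: a final such as "i1" or "u4" stores its tone as the
// last character, so a sandhi rule just replaces that single character.
static std::string SetTone(std::string final_with_tone, const char* tone) {
    return final_with_tone.replace(final_with_tone.length() - 1, 1, tone);
}

int main() {
    assert(SetTone("i1", "2") == "i2");  // "一" before a tone-4 syllable
    assert(SetTone("i1", "4") == "i4");  // "一" before a non-tone-4 syllable
    assert(SetTone("u4", "5") == "u5");  // neutral-tone rewrite used elsewhere
    return 0;
}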
@@ -714,7 +820,9 @@ int FrontEngineInterface::YiSandhi(const std::string &word, std::vector<std::str
    return 0;
}

int FrontEngineInterface::NeuralSandhi(const std::string &word,
                                       const std::string &pos,
                                       std::vector<std::string> &finals) {
    std::wstring word_wstr = ppspeech::utf8string2wstring(word);
    std::vector<std::wstring> wordvec;
    // 一个词转成向量形式
@@ -728,7 +836,8 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
    // 情况1reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
    for (int j = 0; j < wordvec.size(); j++) {
        std::string inits = "nva";
        if (j - 1 >= 0 && wordvec[j] == wordvec[j - 1] &&
            inits.find(pos[0]) != inits.npos) {
            finals[j] = finals[j].replace(finals[j].length() - 1, 1, "5");
        }
    }
@@ -749,27 +858,49 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
    auto ge_idx = word_wstr.find_first_of(ge);  // 出现“个”的第一个位置

    if (word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else if (word_num >= 1 && de.find(wordvec.back()) != de.npos) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else if (word_num == 1 && le.find(wordvec[0]) != le.npos &&
               find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else if (word_num > 1 && men.find(wordvec.back()) != men.npos &&
               find(men_pos.begin(), men_pos.end(), pos) != men_pos.end() &&
               find(must_not_neural_tone_words.begin(),
                    must_not_neural_tone_words.end(),
                    word) != must_not_neural_tone_words.end()) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else if (word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos &&
               find(weizhi_pos.begin(), weizhi_pos.end(), pos) !=
                   weizhi_pos.end()) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else if (word_num > 1 && dong.find(wordvec.back()) != dong.npos &&
               fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    }
    // 情况3对“个”字前面带有修饰词的字词读音处理
    else if ((ge_idx != word_wstr.npos && ge_idx >= 1 &&
              xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos) ||
             word_wstr == ge) {
        finals.back() =
            finals.back().replace(finals.back().length() - 1, 1, "5");
    } else {
        if (find(must_neural_tone_words.begin(),
                 must_neural_tone_words.end(),
                 word) != must_neural_tone_words.end() ||
            (word_num >= 2 &&
             find(must_neural_tone_words.begin(),
                  must_neural_tone_words.end(),
                  ppspeech::wstring2utf8string(word_wstr.substr(
                      word_num - 2))) != must_neural_tone_words.end())) {
            finals.back() =
                finals.back().replace(finals.back().length() - 1, 1, "5");
        }
    }
@@ -782,25 +913,39 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word, const std::strin
    // 创建对应的 韵母列表
    std::vector<std::vector<std::string>> finals_list;
    std::vector<std::string> finals_temp;
    finals_temp.assign(
        finals.begin(),
        finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length());
    finals_list.push_back(finals_temp);
    finals_temp.assign(
        finals.begin() + ppspeech::utf8string2wstring(word_list[0]).length(),
        finals.end());
    finals_list.push_back(finals_temp);

    finals = {};
    for (int i = 0; i < word_list.size(); i++) {
        std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]);
        if ((find(must_neural_tone_words.begin(),
                  must_neural_tone_words.end(),
                  word_list[i]) != must_neural_tone_words.end()) ||
            (temp_wstr.length() >= 2 &&
             find(must_neural_tone_words.begin(),
                  must_neural_tone_words.end(),
                  ppspeech::wstring2utf8string(
                      temp_wstr.substr(temp_wstr.length() - 2))) !=
                 must_neural_tone_words.end())) {
            finals_list[i].back() = finals_list[i].back().replace(
                finals_list[i].back().length() - 1, 1, "5");
        }
        finals.insert(
            finals.end(), finals_list[i].begin(), finals_list[i].end());
    }

    return 0;
}

int FrontEngineInterface::ThreeSandhi(const std::string &word,
                                      std::vector<std::string> &finals) {
    std::wstring word_wstr = ppspeech::utf8string2wstring(word);
    std::vector<std::vector<std::string>> finals_list;
    std::vector<std::string> finals_temp;
@@ -828,31 +973,43 @@ int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::
            if (temp_wstr.length() == 2) {
                finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
                finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
            } else if (temp_wstr.length() ==
                       1) {  // monosyllabic + disyllabic, e.g. 纸/老虎
                finals[1] = finals[1].replace(finals[1].length() - 1, 1, "2");
            }
        } else {
            // 创建对应的 韵母列表
            finals_temp = {};
            finals_list = {};
            finals_temp.assign(
                finals.begin(),
                finals.begin() +
                    ppspeech::utf8string2wstring(word_list[0]).length());
            finals_list.push_back(finals_temp);
            finals_temp.assign(
                finals.begin() +
                    ppspeech::utf8string2wstring(word_list[0]).length(),
                finals.end());
            finals_list.push_back(finals_temp);

            finals = {};
            for (int i = 0; i < finals_list.size(); i++) {
                // e.g. 所有/人
                if (AllToneThree(finals_list[i]) &&
                    finals_list[i].size() == 2) {
                    finals_list[i][0] = finals_list[i][0].replace(
                        finals_list[i][0].length() - 1, 1, "2");
                } else if (i == 1 && !(AllToneThree(finals_list[i])) &&
                           absl::EndsWith(finals_list[i][0], "3") == true &&
                           absl::EndsWith(finals_list[0].back(), "3") == true) {
                    finals_list[0].back() = finals_list[0].back().replace(
                        finals_list[0].back().length() - 1, 1, "2");
                }
            }
            finals.insert(
                finals.end(), finals_list[0].begin(), finals_list[0].end());
            finals.insert(
                finals.end(), finals_list[1].begin(), finals_list[1].end());
        }
    } else if (word_num == 4) {  //将成语拆分为两个长度为 2 的单词
@ -867,19 +1024,23 @@ int FrontEngineInterface::ThreeSandhi(const std::string &word, std::vector<std::
        finals = {};
        for (int j = 0; j < finals_list.size(); j++) {
            if (AllToneThree(finals_list[j])) {
                finals_list[j][0] = finals_list[j][0].replace(
                    finals_list[j][0].length() - 1, 1, "2");
            }
            finals.insert(
                finals.end(), finals_list[j].begin(), finals_list[j].end());
        }
    }
    return 0;
}
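As a quick illustration of the rewrite ThreeSandhi applies, the standalone sketch below shows the basic two-syllable case: a pinyin final such as "ao3" carries its tone in the last character, and when two third tones are adjacent the first one is changed to second tone. The sample word is only an illustration.

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> finals = {"i3", "ao3"};  // e.g. 你好 (ni3 hao3)
    // Two adjacent third tones: rewrite the first tone digit to "2".
    finals[0] = finals[0].replace(finals[0].length() - 1, 1, "2");
    std::cout << finals[0] << " " << finals[1] << std::endl;  // prints: i2 ao3
    return 0;
}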
int FrontEngineInterface::ModifyTone(const std::string &word,
                                     const std::string &pos,
                                     std::vector<std::string> &finals) {
    if ((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) ||
        (0 != NeuralSandhi(word, pos, finals)) ||
        (0 != ThreeSandhi(word, finals))) {
        LOG(ERROR) << "Failed to modify tone of the word: " << word;
        return -1;
    }
@ -887,7 +1048,11 @@ int FrontEngineInterface::ModifyTone(const std::string &word, const std::string
    return 0;
}
std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(
    const std::vector<std::string> &initials,
    const std::vector<std::string> &finals,
    const std::string &word,
    const std::string &pos) {
    std::vector<std::string> new_initials = {};
    std::vector<std::string> new_finals = {};
    std::vector<std::vector<std::string>> new_initials_finals;
@ -900,8 +1065,11 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
    }
    int word_num = wordvec.size();

    if ((find(must_erhua.begin(), must_erhua.end(), word) ==
         must_erhua.end()) &&
        ((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) ||
         (find(specified_pos.begin(), specified_pos.end(), pos) !=
          specified_pos.end()))) {
        new_initials_finals.push_back(initials);
        new_initials_finals.push_back(finals);
        return new_initials_finals;
@ -914,9 +1082,16 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
    assert(finals.size() == word_num);
    for (int i = 0; i < finals.size(); i++) {
        if (i == finals.size() - 1 && wordvec[i] == L"儿" &&
            (finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 &&
            find(not_erhua.begin(),
                 not_erhua.end(),
                 ppspeech::wstring2utf8string(word_wstr.substr(
                     word_wstr.length() - 2))) == not_erhua.end() &&
            !new_finals.empty()) {
            new_finals.back() =
                new_finals.back().substr(0, new_finals.back().length() - 1) +
                "r" + new_finals.back().substr(new_finals.back().length() - 1);
        } else {
            new_initials.push_back(initials[i]);
            new_finals.push_back(finals[i]);
@ -926,8 +1101,5 @@ std::vector<std::vector<std::string>> FrontEngineInterface::MergeErhua(const std
    new_initials_finals.push_back(new_finals);
    return new_initials_finals;
}
}  // namespace ppspeech
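To make the merge step concrete, the standalone sketch below reproduces the string splice used above: when the trailing 儿 syllable ("er2" or "er5") is absorbed, an "r" is inserted into the previous final just before its tone digit. The value is illustrative (roughly what 花儿 would yield).

#include <iostream>
#include <string>

int main() {
    std::string prev_final = "ua1";  // final of 花 (hua1)
    // Splice "r" in before the trailing tone digit: "ua1" -> "uar1".
    prev_final = prev_final.substr(0, prev_final.length() - 1) + "r" +
                 prev_final.substr(prev_final.length() - 1);
    std::cout << prev_final << std::endl;  // prints: uar1
    return 0;
}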

@ -1,15 +1,28 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H
#define PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H

#include <glog/logging.h>
#include <fstream>
#include <map>
#include <memory>
#include <string>
//#include "utils/dir_utils.h"
#include <cppjieba/Jieba.hpp>
#include "absl/strings/str_split.h"
#include "front/text_normalize.h"

namespace ppspeech {
@ -24,9 +37,7 @@ namespace ppspeech {
    }

    int init();
    ~FrontEngineInterface() {}

    // Read the config file
    int ReadConfFile();
@ -35,25 +46,39 @@ namespace ppspeech {
    int Trand2Simp(const std::wstring &sentence, std::wstring &sentence_simp);

    // Build a dictionary from a file
    int GenDict(const std::string &file,
                std::map<std::string, std::string> &map);

    // Convert word + POS segmentation results into a word-only list
    int GetSegResult(std::vector<std::pair<std::string, std::string>> &seg,
                     std::vector<std::string> &seg_words);

    // Generate the phoneme and tone id sequence of a sentence. If phonemes
    // and tones are not separated, toneids is empty (fastspeech2);
    // otherwise it is not (speedyspeech)
    int GetSentenceIds(const std::string &sentence,
                       std::vector<int> &phoneids,
                       std::vector<int> &toneids);

    // Get the phoneme and tone ids of each word from the segmentation result
    // and adjust the pronunciation where needed (ModifyTone). If phonemes and
    // tones are not separated, toneids is empty (fastspeech2); otherwise it
    // is not (speedyspeech)
    int GetWordsIds(
        const std::vector<std::pair<std::string, std::string>> &cut_result,
        std::vector<int> &phoneids,
        std::vector<int> &toneids);

    // Run Jieba segmentation to get word + POS pairs, then post-process the
    // result (MergeforModify)
    int Cut(const std::string &sentence,
            std::vector<std::pair<std::string, std::string>> &cut_result);

    // Map a word to its phonemes via dictionary lookup
    int GetPhone(const std::string &word, std::string &phone);

    // Phonemes to phoneme ids
    int Phone2Phoneid(const std::string &phone,
                      std::vector<int> &phoneid,
                      std::vector<int> &toneids);

    // Determine from the finals whether every character of the word is read
    // with the third tone. true means every character is third tone
@ -63,37 +88,50 @@ namespace ppspeech {
    bool IsReduplication(const std::string &word);

    // Get the list of initials and finals of each character in the word
    int GetInitialsFinals(const std::string &word,
                          std::vector<std::string> &word_initials,
                          std::vector<std::string> &word_finals);

    // Get the list of finals of each character in the word
    int GetFinals(const std::string &word,
                  std::vector<std::string> &word_finals);

    // Convert the whole word into a vector, one element per character
    int Word2WordVec(const std::string &word,
                     std::vector<std::wstring> &wordvec);

    // Re-segment the word with a full cut; every resulting word is in the
    // dictionary
    int SplitWord(const std::string &word,
                  std::vector<std::string> &fullcut_word);

    // Post-process segmentation: tidy up segments containing "不"
    std::vector<std::pair<std::string, std::string>> MergeBu(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Post-process segmentation: tidy up segments containing "一"
    std::vector<std::pair<std::string, std::string>> Mergeyi(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Post-process segmentation: merge two adjacent identical characters
    std::vector<std::pair<std::string, std::string>> MergeReduplication(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Merge two adjacent words whose syllables are all third tone
    std::vector<std::pair<std::string, std::string>> MergeThreeTones(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Merge two adjacent words where the last syllable of the first word and
    // the first syllable of the second word are both third tone
    std::vector<std::pair<std::string, std::string>> MergeThreeTones2(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Post-process segmentation: tidy up segments containing "儿"
    std::vector<std::pair<std::string, std::string>> MergeEr(
        std::vector<std::pair<std::string, std::string>> &seg_result);

    // Post-process and modify the segmentation result
    int MergeforModify(
        std::vector<std::pair<std::string, std::string>> &seg_result,
        std::vector<std::pair<std::string, std::string>> &merge_seg_result);

    // Adjust the tones of words containing "不"
@ -103,18 +141,25 @@ namespace ppspeech {
    int YiSandhi(const std::string &word, std::vector<std::string> &finals);

    // Adjust the tones of some special words (classifiers, particles, etc.)
    int NeuralSandhi(const std::string &word,
                     const std::string &pos,
                     std::vector<std::string> &finals);

    // Adjust the tones of words involving the third tone
    int ThreeSandhi(const std::string &word, std::vector<std::string> &finals);

    // Apply all tone processing and modification to a word
    int ModifyTone(const std::string &word,
                   const std::string &pos,
                   std::vector<std::string> &finals);

    // Handle erhua (儿化音)
    std::vector<std::vector<std::string>> MergeErhua(
        const std::vector<std::string> &initials,
        const std::vector<std::string> &finals,
        const std::string &word,
        const std::string &pos);

  private:
@ -148,9 +193,6 @@ namespace ppspeech {
    std::vector<std::string> must_not_neural_tone_words;
    std::vector<std::string> must_neural_tone_words;
};
}
#endif
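As a small, self-contained illustration of one of the interfaces declared above, the sketch below mirrors what GetSegResult is documented to do: collapse Jieba's (word, POS) pairs into a plain word list. It is an approximation for readers, not the member function itself, and the sample pairs are made up.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Hypothetical Jieba output: (word, POS) pairs.
    std::vector<std::pair<std::string, std::string>> seg = {
        {"今天", "t"}, {"天气", "n"}, {"很", "d"}, {"好", "a"}};
    std::vector<std::string> seg_words;
    for (const auto &item : seg) {
        seg_words.push_back(item.first);  // keep only the word
    }
    for (const auto &w : seg_words) {
        std::cout << w << " ";
    }
    std::cout << std::endl;  // prints: 今天 天气 很 好
    return 0;
}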

@ -1,10 +1,22 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "front/text_normalize.h"

namespace ppspeech {

// Initialize digits_map and units_map
int TextNormalizer::InitMap() {
    digits_map["0"] = "零";
    digits_map["1"] = "一";
    digits_map["2"] = "二";
@ -26,23 +38,27 @@ int TextNormalizer::InitMap() {
}

// Replace a substring of the sentence
int TextNormalizer::Replace(std::wstring &sentence,
                            const int &pos,
                            const int &len,
                            const std::wstring &repstr) {
    // Remove the original text
    sentence.erase(pos, len);
    // Insert the new text
    sentence.insert(pos, repstr);
    return 0;
}

// Split the sentence by punctuation
int TextNormalizer::SplitByPunc(const std::wstring &sentence,
                                std::vector<std::wstring> &sentence_part) {
    std::wstring temp = sentence;
    std::wregex reg(L"[:,;。?!,;?!]");
    std::wsmatch match;
    while (std::regex_search(temp, match, reg)) {
        sentence_part.push_back(
            temp.substr(0, match.position(0) + match.length(0)));
        Replace(temp, 0, match.position(0) + match.length(0), L"");
    }
    // If the sentence does not end with punctuation
@ -53,9 +69,10 @@ int TextNormalizer::SplitByPunc(const std::wstring &sentence, std::vector<std::w
}

// Convert a number to Chinese text, e.g. 10200 -> 一万零二百
std::string TextNormalizer::CreateTextValue(const std::string &num_str,
                                            bool use_zero) {
    std::string num_lstrip =
        std::string(absl::StripPrefix(num_str, "0")).data();
    int len = num_lstrip.length();
    if (len == 0) {
@ -86,12 +103,14 @@ std::string TextNormalizer::CreateTextValue(const std::string &num_str, bool use
        first_part = num_str.substr(0, num_str.length() - largest_unit);
        second_part = num_str.substr(num_str.length() - largest_unit);
        return CreateTextValue(first_part, use_zero) + units_map[largest_unit] +
               CreateTextValue(second_part, use_zero);
    }
}
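To see the recursion concretely, take the 10200 example from the comment above: the string is cut at the largest applicable unit, the two halves are converted recursively, and the unit name goes in between, yielding 一万零二百. The sketch below only shows the split; it assumes the unit at index 4 in units_map is 万, which the 一十八 special case further down (units_map[1] being 十) suggests.

#include <iostream>
#include <string>

int main() {
    std::string num_str = "10200";
    int largest_unit = 4;  // assumed to correspond to 万 in units_map
    std::string first_part = num_str.substr(0, num_str.length() - largest_unit);
    std::string second_part = num_str.substr(num_str.length() - largest_unit);
    // Recursive conversion turns "1" into 一 and "0200" into 零二百.
    std::cout << first_part << " | 万 | " << second_part << std::endl;
    return 0;
}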
// Digit-by-digit mapping; can be used directly for years and phone numbers
std::string TextNormalizer::SingleDigit2Text(const std::string &num_str,
                                             bool alt_one) {
    std::string text = "";
    if (alt_one) {
        digits_map["1"] = "幺";
@ -110,13 +129,16 @@ std::string TextNormalizer::SingleDigit2Text(const std::string &num_str, bool al
    return text;
}

std::string TextNormalizer::SingleDigit2Text(const std::wstring &num,
                                             bool alt_one) {
    std::string num_str = wstring2utf8string(num);
    return SingleDigit2Text(num_str, alt_one);
}

// Map the number as a whole; can be used directly for months, dates, and the
// integer part of a value
std::string TextNormalizer::MultiDigit2Text(const std::string &num_str,
                                            bool alt_one,
                                            bool use_zero) {
    LOG(INFO) << "aaaaaaaaaaaaaaaa: " << alt_one << use_zero;
    if (alt_one) {
        digits_map["1"] = "幺";
@ -124,18 +146,22 @@ std::string TextNormalizer::MultiDigit2Text(const std::string &num_str, bool alt
        digits_map["1"] = "一";
    }
    std::wstring result =
        utf8string2wstring(CreateTextValue(num_str, use_zero));
    std::wstring result_0(1, result[0]);
    std::wstring result_1(1, result[1]);
    // 一十八 --> 十八
    if ((result_0 == utf8string2wstring(digits_map["1"])) &&
        (result_1 == utf8string2wstring(units_map[1]))) {
        return wstring2utf8string(result.substr(1, result.length()));
    } else {
        return wstring2utf8string(result);
    }
}

std::string TextNormalizer::MultiDigit2Text(const std::wstring &num,
                                            bool alt_one,
                                            bool use_zero) {
    std::string num_str = wstring2utf8string(num);
    return MultiDigit2Text(num_str, alt_one, use_zero);
}
@ -150,10 +176,15 @@ std::string TextNormalizer::Digits2Text(const std::string &num_str) {
        text = MultiDigit2Text(integer_decimal[0]);
    } else if (integer_decimal.size() == 2) {  // decimal
        if (integer_decimal[0] == "") {  // no integer part, e.g. .22
            text = "点" +
                   SingleDigit2Text(
                       std::string(absl::StripSuffix(integer_decimal[1], "0"))
                           .data());
        } else {  // regular decimal, e.g. 12.34
            text = MultiDigit2Text(integer_decimal[0]) + "点" +
                   SingleDigit2Text(
                       std::string(absl::StripSuffix(integer_decimal[1], "0"))
                           .data());
        }
    } else {
        return "The value does not conform to the numeric format";
@ -169,7 +200,9 @@ std::string TextNormalizer::Digits2Text(const std::wstring &num) {
// Date, e.g. 2021年8月18日 --> 二零二一年八月十八日
int TextNormalizer::ReData(std::wstring &sentence) {
    std::wregex reg(
        L"(\\d{4}|\\d{2})年((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)"
        L"([日号]))?");
    std::wsmatch match;
    std::string rep;
@ -180,11 +213,14 @@ int TextNormalizer::ReData(std::wstring &sentence) {
            rep += MultiDigit2Text(match[3], false, false) + "月";
        }
        if (match[5] != L"") {
            rep += MultiDigit2Text(match[5], false, false) +
                   wstring2utf8string(match[9]);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -193,7 +229,8 @@ int TextNormalizer::ReData(std::wstring &sentence) {
// XX-XX-XX or XX/XX/XX, e.g. 2021/08/18 --> 二零二一年八月十八日
int TextNormalizer::ReData2(std::wstring &sentence) {
    std::wregex reg(
        L"(\\d{4})([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])");
    std::wsmatch match;
    std::string rep;
@ -202,8 +239,10 @@ int TextNormalizer::ReData2(std::wstring &sentence) {
        rep += (SingleDigit2Text(match[1]) + "年");
        rep += (MultiDigit2Text(match[3], false, false) + "月");
        rep += (MultiDigit2Text(match[4], false, false) + "日");
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -227,7 +266,10 @@ int TextNormalizer::ReTime(std::wstring &sentence) {
        }
        rep += (MultiDigit2Text(match[4]) + "");
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -247,12 +289,13 @@ int TextNormalizer::ReTemperature(std::wstring &sentence) {
        match[4] == L"摄氏度" ? unit = "摄氏度" : unit = "";
        rep = sign + Digits2Text(match[2]) + unit;
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
}

// Fraction, e.g. 1/3 --> 三分之一
@ -263,8 +306,12 @@ int TextNormalizer::ReFrac(std::wstring &sentence) {
    std::string rep;
    while (std::regex_search(sentence, match, reg)) {
        match[1] == L"-" ? sign = "负" : sign = "";
        rep = sign + MultiDigit2Text(match[3]) + "分之" +
              MultiDigit2Text(match[2]);
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -282,7 +329,10 @@ int TextNormalizer::RePercentage(std::wstring &sentence) {
        match[1] == L"-" ? sign = "负" : sign = "";
        rep = sign + "百分之" + Digits2Text(match[2]);
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -290,7 +340,8 @@ int TextNormalizer::RePercentage(std::wstring &sentence) {
// Mobile phone number, e.g. +86 18883862235 --> 八六幺八八八三八六二二三五
int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
    std::wregex reg(
        L"(\\d)?((\\+?86 ?)?1([38]\\d|5[0-35-9]|7[678]|9[89])\\d{8})(\\d)?");
    std::wsmatch match;
    std::string rep;
    std::vector<std::string> country_phonenum;
@ -302,8 +353,10 @@ int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
            LOG(INFO) << country_phonenum[i];
            rep += SingleDigit2Text(country_phonenum[i], true);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -311,7 +364,8 @@ int TextNormalizer::ReMobilePhone(std::wstring &sentence) {
// Landline number, e.g. 010-51093154 --> 零幺零五幺零九三幺五四
int TextNormalizer::RePhone(std::wstring &sentence) {
    std::wregex reg(
        L"(\\d)?((0(10|2[1-3]|[3-9]\\d{2})-?)?[1-9]\\d{6,7})(\\d)?");
    std::wsmatch match;
    std::vector<std::string> zone_phonenum;
    std::string rep;
@ -322,7 +376,10 @@ int TextNormalizer::RePhone(std::wstring &sentence) {
        for (int i = 0; i < zone_phonenum.size(); i++) {
            rep += SingleDigit2Text(zone_phonenum[i], true);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -330,7 +387,9 @@ int TextNormalizer::RePhone(std::wstring &sentence) {
// Range, e.g. 60~90 --> 六十到九十
int TextNormalizer::ReRange(std::wstring &sentence) {
    std::wregex reg(
        L"((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))[-~]((-?)((\\d+)(\\.\\d+)?)|(\\.("
        L"\\d+)))");
    std::wsmatch match;
    std::string rep;
    std::string sign1;
@ -351,7 +410,10 @@ int TextNormalizer::ReRange(std::wstring &sentence) {
            rep += sign2 + Digits2Text(match[10]);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -364,7 +426,10 @@ int TextNormalizer::ReInterger(std::wstring &sentence) {
    std::string rep;
    while (std::regex_search(sentence, match, reg)) {
        rep = "" + MultiDigit2Text(match[2]);
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -385,7 +450,10 @@ int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
            rep = sign + Digits2Text(match[2]);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
@ -393,18 +461,26 @@ int TextNormalizer::ReDecimalNum(std::wstring &sentence) {
// Positive integer + classifier
int TextNormalizer::RePositiveQuantifiers(std::wstring &sentence) {
    std::wstring common_quantifiers =
        L"(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|"
        L"担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|"
        L"溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|"
        L"本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|"
        L"毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|"
        L"合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|"
        L"卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|"
        L"夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|"
        L"元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|"
        L"百万|万|千|百|)块|角|毛|分)";
    std::wregex reg(L"(\\d+)([多余几])?" + common_quantifiers);
    std::wsmatch match;
    std::string rep;
    while (std::regex_search(sentence, match, reg)) {
        rep = MultiDigit2Text(match[1]);
        Replace(sentence,
                match.position(1),
                match.length(1),
                utf8string2wstring(rep));
    }
    return 0;
@ -415,7 +491,10 @@ int TextNormalizer::ReDefalutNum(std::wstring &sentence) {
    std::wregex reg(L"\\d{3}\\d*");
    std::wsmatch match;
    while (std::regex_search(sentence, match, reg)) {
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(SingleDigit2Text(match[0])));
    }
    return 0;
@ -434,7 +513,10 @@ int TextNormalizer::ReNumber(std::wstring &sentence) {
            rep = sign + Digits2Text(match[2]);
        }
        Replace(sentence,
                match.position(0),
                match.length(0),
                utf8string2wstring(rep));
    }
    return 0;
}
@ -457,6 +539,4 @@ int TextNormalizer::SentenceNormalize(std::wstring &sentence) {
    ReNumber(sentence);
    return 0;
}
}  // namespace ppspeech
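A minimal usage sketch of the normalizer defined above, assuming the repository's include paths and the utf8/wstring helpers pulled in via base/type_conv.h; it splits a sentence on punctuation and normalizes each part in place. No particular output is claimed here beyond what the rules above produce.

#include <iostream>
#include <string>
#include <vector>

#include "front/text_normalize.h"

int main() {
    ppspeech::TextNormalizer normalizer;
    std::wstring sentence =
        ppspeech::utf8string2wstring("今天是2021年8月18日,我有3个会议。");
    std::vector<std::wstring> parts;
    normalizer.SplitByPunc(sentence, parts);
    for (auto &part : parts) {
        normalizer.SentenceNormalize(part);  // apply the Re* rules in place
        std::cout << ppspeech::wstring2utf8string(part) << std::endl;
    }
    return 0;
}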

@ -1,11 +1,24 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H
#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H

#include <glog/logging.h>
#include <codecvt>
#include <map>
#include <regex>
#include <string>
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "base/type_conv.h"
@ -14,22 +27,27 @@ namespace ppspeech {
class TextNormalizer {
  public:
    TextNormalizer() { InitMap(); }
    ~TextNormalizer() {}

    int InitMap();
    int Replace(std::wstring &sentence,
                const int &pos,
                const int &len,
                const std::wstring &repstr);
    int SplitByPunc(const std::wstring &sentence,
                    std::vector<std::wstring> &sentence_part);

    std::string CreateTextValue(const std::string &num, bool use_zero = true);
    std::string SingleDigit2Text(const std::string &num_str,
                                 bool alt_one = false);
    std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false);
    std::string MultiDigit2Text(const std::string &num_str,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string MultiDigit2Text(const std::wstring &num,
                                bool alt_one = false,
                                bool use_zero = true);
    std::string Digits2Text(const std::string &num_str);
    std::string Digits2Text(const std::wstring &num);
@ -53,10 +71,7 @@ public:
  private:
    std::map<std::string, std::string> digits_map;
    std::map<int, std::string> units_map;
};
}  // namespace ppspeech
#endif