优化中文文本前端

pull/2221/head
wangcanlong 3 years ago
parent 070a08f2be
commit 2498b9ce66

@ -0,0 +1,5 @@
polyphonic:
  湖泊: ['hu2','po1']
  弹力: ['tan2','li4']
  颤抖: ['chan4','dou3']
  鸭绿江: ['ya1','lu4','jiang1']

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import re import re
import yaml
from typing import Dict from typing import Dict
from typing import List from typing import List
@ -19,6 +20,7 @@ import jieba.posseg as psg
import numpy as np import numpy as np
import paddle import paddle
from g2pM import G2pM from g2pM import G2pM
from g2pw import G2PWConverter
from pypinyin import lazy_pinyin from pypinyin import lazy_pinyin
from pypinyin import load_phrases_dict from pypinyin import load_phrases_dict
from pypinyin import load_single_dict from pypinyin import load_single_dict
@ -53,9 +55,23 @@ def insert_after_character(lst, item):
return result return result
class Polyphonic():
    """Table of fixed pronunciations for polyphonic words.

    The table is read from a YAML file whose top-level ``polyphonic`` key
    maps a word to the list of pinyin syllables that should be used for it,
    one syllable per character.
    """

    def __init__(self,
                 dict_file="./paddlespeech/t2s/frontend/polyphonic.yaml"):
        """Load the pronunciation table.

        Args:
            dict_file: Path to the YAML file. Note that the default is
                relative to the current working directory.

        Raises:
            OSError: If ``dict_file`` cannot be opened.
            KeyError: If the file has no top-level ``polyphonic`` key.
        """
        with open(dict_file, encoding='utf8') as polyphonic_file:
            # The file is a plain mapping of strings to lists, so the safe
            # loader is sufficient and never constructs arbitrary objects.
            polyphonic_dict = yaml.safe_load(polyphonic_file)
        # A bare "polyphonic:" header parses to None; treat it as an empty
        # table so that later lookups do not fail.
        self.polyphonic_words = polyphonic_dict["polyphonic"] or {}

    def correct_pronunciation(self, word, pinyin):
        """Return the table's reading for ``word`` if it has one.

        Args:
            word: The word to look up.
            pinyin: The reading to fall back on.

        Returns:
            A copy of the reading stored for ``word`` when the word is in
            the table, otherwise ``pinyin`` unchanged.
        """
        corrected = self.polyphonic_words.get(word)
        if corrected is None:
            # The word is not listed: keep the original reading.
            return pinyin
        # Hand out a copy so callers cannot alter the shared table.
        return list(corrected)
class Frontend(): class Frontend():
def __init__(self, def __init__(self,
g2p_model="pypinyin", g2p_model="g2pW",
phone_vocab_path=None, phone_vocab_path=None,
tone_vocab_path=None): tone_vocab_path=None):
self.tone_modifier = ToneSandhi() self.tone_modifier = ToneSandhi()
@ -67,6 +83,12 @@ class Frontend():
self.g2pM_model = G2pM() self.g2pM_model = G2pM()
self.pinyin2phone = generate_lexicon( self.pinyin2phone = generate_lexicon(
with_tone=True, with_erhua=False) with_tone=True, with_erhua=False)
elif self.g2p_model == "g2pW":
self.corrector = Polyphonic()
self.g2pW_model = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)
self.pinyin2phone = generate_lexicon(
with_tone=True, with_erhua=False)
else: else:
self.__init__pypinyin() self.__init__pypinyin()
self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"} self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
@ -139,6 +161,24 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required # If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin) initials.append(pinyin)
finals.append(pinyin) finals.append(pinyin)
elif self.g2p_model == "g2pW":
pinyins = self.g2pW_model(word)[0]
if pinyins == [None]:
pinyins = [word]
for pinyin in pinyins:
pinyin = pinyin.replace("u:", "v")
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
initials.append(initial_final_list[0])
finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
initials.append('')
finals.append(initial_final_list[1])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin)
finals.append(pinyin)
return initials, finals return initials, finals
# if merge_sentences, merge all sentences into one phone sequence # if merge_sentences, merge all sentences into one phone sequence
@ -150,12 +190,51 @@ class Frontend():
phones_list = [] phones_list = []
for seg in segments: for seg in segments:
phones = [] phones = []
initials = []
finals = []
# Replace all English words in the sentence # Replace all English words in the sentence
seg = re.sub('[a-zA-Z]+', '', seg) seg = re.sub('[a-zA-Z]+', '', seg)
seg_cut = psg.lcut(seg) seg_cut = psg.lcut(seg)
initials = []
finals = []
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
if self.g2p_model == "g2pW":
pinyins = self.g2pW_model(seg)[0]
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
sub_finals = []
now_word_length = pre_word_length + len(word)
if pos == 'eng':
pre_word_length = now_word_length
continue
word_pinyins = pinyins[pre_word_length:now_word_length]
# 矫正发音
word_pinyins = self.corrector.correct_pronunciation(word,word_pinyins)
for pinyin,char in zip(word_pinyins,word):
if pinyin == None:
pinyin = char
pinyin = pinyin.replace("u:", "v")
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
sub_initials.append(initial_final_list[0])
sub_finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
sub_initials.append('')
sub_finals.append(initial_final_list[1])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
pre_word_length = now_word_length
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
else:
for word, pos in seg_cut: for word, pos in seg_cut:
if pos == 'eng': if pos == 'eng':
continue continue
@ -170,7 +249,6 @@ class Frontend():
# assert len(sub_initials) == len(sub_finals) == len(word) # assert len(sub_initials) == len(sub_finals) == len(word)
initials = sum(initials, []) initials = sum(initials, [])
finals = sum(finals, []) finals = sum(finals, [])
for c, v in zip(initials, finals): for c, v in zip(initials, finals):
# NOTE: post process for pypinyin outputs # NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii # we discriminate i, ii and iii

Loading…
Cancel
Save