Optimize the Chinese text frontend

pull/2221/head
wangcanlong 3 years ago
parent 070a08f2be
commit 2498b9ce66

@@ -0,0 +1,5 @@
polyphonic:
湖泊: ['hu2','po1']
弹力: ['tan2','li4']
颤抖: ['chan4','dou3']
鸭绿江: ['ya1','lu4','jiang1']

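For reference, a minimal sketch (not part of the commit) of what the new polyphonic override table looks like once parsed; the relative path assumes a local copy of the file added above.

import yaml

# assumed local copy of the polyphonic.yaml added in this commit
with open("polyphonic.yaml", encoding="utf8") as f:
    polyphonic_words = yaml.load(f, Loader=yaml.FullLoader)["polyphonic"]

print(polyphonic_words["湖泊"])    # ['hu2', 'po1']
print(polyphonic_words["鸭绿江"])  # ['ya1', 'lu4', 'jiang1']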
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import yaml
from typing import Dict
from typing import List
@@ -19,6 +20,7 @@ import jieba.posseg as psg
import numpy as np
import paddle
from g2pM import G2pM
from g2pw import G2PWConverter
from pypinyin import lazy_pinyin
from pypinyin import load_phrases_dict
from pypinyin import load_single_dict
@@ -53,9 +55,23 @@ def insert_after_character(lst, item):
return result
class Polyphonic():
def __init__(self, dict_file="./paddlespeech/t2s/frontend/polyphonic.yaml"):
with open(dict_file, encoding='utf8') as polyphonic_file:
# parse the yaml dict of polyphonic words
polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
self.polyphonic_words = polyphonic_dict["polyphonic"]
def correct_pronunciation(self, word, pinyin):
# if the word is in the polyphonic dict, return its corrected pronunciation
if word in self.polyphonic_words:
pinyin = self.polyphonic_words[word]
# otherwise return the original pronunciation
return pinyin
class Frontend():
def __init__(self,
g2p_model="pypinyin",
g2p_model="g2pW",
phone_vocab_path=None,
tone_vocab_path=None):
self.tone_modifier = ToneSandhi()
@@ -67,6 +83,12 @@ class Frontend():
self.g2pM_model = G2pM()
self.pinyin2phone = generate_lexicon(
with_tone=True, with_erhua=False)
elif self.g2p_model == "g2pW":
self.corrector = Polyphonic()
self.g2pW_model = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)
self.pinyin2phone = generate_lexicon(
with_tone=True, with_erhua=False)
else:
self.__init__pypinyin()
self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
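A rough sketch of how the two new pieces are meant to interact, assuming the g2pw package is installed; the example string and the printed g2pW output are illustrative, not taken from the commit.

from g2pw import G2PWConverter

conv = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)
# the converter is called on a whole string and returns one pinyin per character
pinyins = conv('鸭绿江')[0]        # e.g. ['ya1', 'lv4', 'jiang1'] (illustrative)

# the default dict_file path assumes the process runs from the repo root
corrector = Polyphonic()
print(corrector.correct_pronunciation('鸭绿江', pinyins))
# ['ya1', 'lu4', 'jiang1'] -- overridden by the entry in polyphonic.yaml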
@@ -139,6 +161,24 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin)
finals.append(pinyin)
elif self.g2p_model == "g2pW":
pinyins = self.g2pW_model(word)[0]
if pinyins == [None]:
pinyins = [word]
for pinyin in pinyins:
pinyin = pinyin.replace("u:", "v")
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
initials.append(initial_final_list[0])
finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
initials.append('')
finals.append(initial_final_list[0])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin)
finals.append(pinyin)
return initials, finals
# if merge_sentences, merge all sentences into one phone sequence
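The splitting step above relies on the lexicon built by generate_lexicon, which maps a toned pinyin syllable to a space-separated initial/final string. Below is a small self-contained sketch of that split; the two dictionary entries are illustrative examples of the format, not the real lexicon.

pinyin2phone = {'lu4': 'l u4', 'an1': 'an1'}   # illustrative entries only

def split_pinyin(pinyin):
    if pinyin in pinyin2phone:
        parts = pinyin2phone[pinyin].split(" ")
        if len(parts) == 2:        # regular syllable: initial + final
            return parts[0], parts[1]
        return '', parts[0]        # zero-initial syllable: final only
    return pinyin, pinyin          # punctuation passes through unchanged

print(split_pinyin('lu4'))   # ('l', 'u4')
print(split_pinyin('an1'))   # ('', 'an1')
print(split_pinyin('，'))    # ('，', '，')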
@@ -150,27 +190,65 @@ class Frontend():
phones_list = []
for seg in segments:
phones = []
initials = []
finals = []
# Remove all English words from the sentence
seg = re.sub('[a-zA-Z]+', '', seg)
seg_cut = psg.lcut(seg)
initials = []
finals = []
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
for word, pos in seg_cut:
if pos == 'eng':
continue
sub_initials, sub_finals = self._get_initials_finals(word)
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
if self.g2p_model == "g2pW":
pinyins = self.g2pW_model(seg)[0]
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
sub_finals = []
now_word_length = pre_word_length + len(word)
if pos == 'eng':
pre_word_length = now_word_length
continue
word_pinyins = pinyins[pre_word_length:now_word_length]
# correct the pronunciation with the polyphonic dict
word_pinyins = self.corrector.correct_pronunciation(word, word_pinyins)
for pinyin, char in zip(word_pinyins, word):
if pinyin is None:
pinyin = char
pinyin = pinyin.replace("u:", "v")
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
sub_initials.append(initial_final_list[0])
sub_finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
sub_initials.append('')
sub_finals.append(initial_final_list[0])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
pre_word_length = now_word_length
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
else:
for word, pos in seg_cut:
if pos == 'eng':
continue
sub_initials, sub_finals = self._get_initials_finals(word)
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
initials = sum(initials, [])
finals = sum(finals, [])
for c, v in zip(initials, finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
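To make the pre_word_length / now_word_length bookkeeping in the g2pW branch easier to follow, here is a standalone sketch of the alignment it performs: g2pW is run once on the whole segment (one pinyin per character), and each jieba word then takes its slice by cumulative character length. The segmentation and pinyin values are illustrative.

seg_cut = [('鸭绿江', 'ns'), ('上', 'f'), ('有', 'v')]    # illustrative jieba output
pinyins = ['ya1', 'lv4', 'jiang1', 'shang4', 'you3']      # one pinyin per character

pre_word_length = 0
for word, pos in seg_cut:
    now_word_length = pre_word_length + len(word)
    word_pinyins = pinyins[pre_word_length:now_word_length]   # this word's slice
    print(word, word_pinyins)
    pre_word_length = now_word_length
# 鸭绿江 ['ya1', 'lv4', 'jiang1']
# 上 ['shang4']
# 有 ['you3']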
