enable chinese words' pinyin specified in text of ssml formats, test=tts

pull/2531/head
david.95 2 years ago
parent b76968e6d9
commit 13a7fa9808

@ -13,6 +13,7 @@
# limitations under the License.
import math
import os
import re
from pathlib import Path
from typing import Any
from typing import Dict
@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import
# remove [W:onnxruntime: xxx] from ort
ort.set_default_logger_severity(3)
@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
sentences = []
with open(text_file, 'rt') as f:
for line in f:
items = line.strip().split()
items = re.split(r"\s+", line.strip(), 1)
utt_id = items[0]
if lang == 'zh':
sentence = "".join(items[1:])
@ -180,7 +182,7 @@ def run_frontend(frontend: object,
to_tensor: bool=True):
outs = dict()
if lang == 'zh':
input_ids = frontend.get_input_ids(
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,

@ -13,6 +13,7 @@
# limitations under the License.
import os
import re
from operator import itemgetter
from typing import Dict
from typing import List
@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
INITIALS = [
'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@ -81,6 +83,7 @@ class Frontend():
g2p_model="g2pW",
phone_vocab_path=None,
tone_vocab_path=None):
self.mix_ssml_processor = MixTextProcessor()
self.tone_modifier = ToneSandhi()
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
@ -143,6 +146,7 @@ class Frontend():
tone_id = [line.strip().split() for line in f.readlines()]
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)
self.mix_ssml_processor.__repr__()
def _init_pypinyin(self):
large_pinyin.load()
@ -281,6 +285,65 @@ class Frontend():
phones_list.append(merge_list)
return phones_list
def _split_word_to_char(self, words):
res = []
for x in words:
res.append(x)
return res
# if using ssml, have pingyin specified, assign pinyin to words
def _g2p_assign(self,
words: List[str],
pinyin_spec: List[str],
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
initials = []
finals = []
words = self._split_word_to_char(words[0])
for pinyin, char in zip(pinyin_spec, words):
sub_initials = []
sub_finals = []
pinyin = pinyin.replace("u:", "v")
#self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
sub_initials.append(initial_final_list[0])
sub_finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
sub_initials.append('')
sub_finals.append(initial_final_list[1])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
initials.append(sub_initials)
finals.append(sub_finals)
initials = sum(initials, [])
finals = sum(finals, [])
phones = []
for c, v in zip(initials, finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c and c not in self.punc:
phones.append(c)
if c and c in self.punc:
phones.append('sp')
if v and v not in self.punc:
phones.append(v)
phones_list.append(phones)
if merge_sentences:
merge_list = sum(phones_list, [])
# rm the last 'sp' to avoid the noise at the end
# cause in the training data, no 'sp' in the end
if merge_list[-1] == 'sp':
merge_list = merge_list[:-1]
phones_list = []
phones_list.append(merge_list)
return phones_list
def _merge_erhua(self,
initials: List[str],
finals: List[str],
@ -396,6 +459,52 @@ class Frontend():
print("----------------------------")
return phonemes
#@an added for ssml pinyin
def get_phonemes_ssml(self,
ssml_inputs: list,
merge_sentences: bool=True,
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
all_phonemes = []
for word_pinyin_item in ssml_inputs:
phonemes = []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
sentences = self.text_normalizer.normalize(sentence)
if len(pinyin_spec) == 0:
phonemes = self._g2p(
sentences,
merge_sentences=merge_sentences,
with_erhua=with_erhua)
else:
# phonemes should be pinyin_spec
phonemes = self._g2p_assign(
sentences, pinyin_spec, merge_sentences=merge_sentences)
all_phonemes = all_phonemes + phonemes
if robot:
new_phonemes = []
for sentence in all_phonemes:
new_sentence = []
for item in sentence:
# `er` only have tone `2`
if item[-1] in "12345" and item != "er2":
item = item[:-1] + "1"
new_sentence.append(item)
new_phonemes.append(new_sentence)
all_phonemes = new_phonemes
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
print(all_phonemes[0])
print("----------------------------")
return [sum(all_phonemes, [])]
def get_input_ids(self,
sentence: str,
merge_sentences: bool=True,
@ -405,6 +514,7 @@ class Frontend():
add_blank: bool=False,
blank_token: str="<pad>",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
phonemes = self.get_phonemes(
sentence,
merge_sentences=merge_sentences,
@ -437,3 +547,49 @@ class Frontend():
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result
# @an added for ssml
def get_input_ids_ssml(
self,
sentence: str,
merge_sentences: bool=True,
get_tone_ids: bool=False,
robot: bool=False,
print_info: bool=False,
add_blank: bool=False,
blank_token: str="<pad>",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
l_inputs = MixTextProcessor.get_pinyin_split(sentence)
phonemes = self.get_phonemes_ssml(
l_inputs,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
result = {}
phones = []
tones = []
temp_phone_ids = []
temp_tone_ids = []
for part_phonemes in phonemes:
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
if add_blank:
phones = insert_after_character(phones, blank_token)
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
if phones:
phone_ids = self._p2id(phones)
# if use paddle.to_tensor() in onnxruntime, the first time will be too low
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result

@ -0,0 +1,163 @@
# -*- coding: utf-8 -*-
import re
import xml.dom.minidom
import xml.parsers.expat
from xml.dom.minidom import Node
from xml.dom.minidom import parseString
'''
Note: xml 有5种特殊字符 &<>"'
其一采用<![CDATA[ ]]>特殊标签将包含特殊字符的字符串封装起来
例如
<TitleName><![CDATA["姓名"]]></TitleName>
其二使用XML转义序列表示这些特殊的字符这5个特殊字符所对应XML转义序列为
& &amp;
< &lt;
> &gt;
" &quot;
' &apos;
例如
<TitleName>&quot;姓名&quot;</TitleName>
'''
class MixTextProcessor():
def __repr__(self):
print("@an MixTextProcessor class")
def get_xml_content(self, mixstr):
'''返回字符串的 xml 内容'''
xmlptn = re.compile(r"<speak>.*?</speak>", re.M | re.S)
ctn = re.search(xmlptn, mixstr)
if ctn:
return ctn.group(0)
else:
return None
def get_content_split(self, mixstr):
''' 文本分解顺序加了列表中按非xml 和 xml 分开,对应的字符串,带标点符号
不能去除空格因为xml 中tag 属性带空格
'''
ctlist = []
# print("Testing:",mixstr[:20])
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
pre_xml = mat.group(1)
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append(pre_xml)
ctlist.append(in_xml)
ctlist.append(after_xml)
return ctlist
else:
ctlist.append(mixstr)
return ctlist
@classmethod
def get_pinyin_split(self, mixstr):
ctlist = []
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
pre_xml = mat.group(1)
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append([pre_xml, []])
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
return ctlist
class DomXml():
def __init__(self, xmlstr):
print("Parse xml str:", xmlstr)
self.tdom = parseString(xmlstr) #Document
# print("tdom:",type(self.tdom))
self.root = self.tdom.documentElement #Element
# print("root:",type(self.root))
self.rnode = self.tdom.childNodes #NodeList
# print("rnode:",type(self.rnode))
pass
def get_text(self):
'''返回xml 内容的所有文本内容的 列表'''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
res.append(x1.value)
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
res.append(x2.data)
else:
for x3 in x2.childNodes:
if isinstance(x3, xml.dom.minidom.Text):
res.append(x3.data)
else:
print("len(nodes of x3):", len(x3.childNodes))
return res
def get_xmlchild_list(self):
'''返回xml 内容的列表, 包括所有文本内容(不带tag)'''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
res.append(x1.value)
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
res.append(x2.data)
else:
for x3 in x2.childNodes:
if isinstance(x3, xml.dom.minidom.Text):
res.append(x3.data)
else:
print("len(nodes of x3):", len(x3.childNodes))
print(res)
return res
def get_pinyins_for_xml(self):
'''返回xml 内容,如果字符串 和 拼音的 list , 如 ['''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
t = re.sub(r"\s+", "", x1.value)
res.append([t, []])
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
t = re.sub(r"\s+", "", x2.data)
res.append([t, []])
else:
# print("x2",x2,x2.tagName)
if x2.hasAttribute('pinyin'):
pinyin_value = x2.getAttribute("pinyin")
pinyins = pinyin_value.split(" ")
for x3 in x2.childNodes:
# print('x3',x3)
if isinstance(x3, xml.dom.minidom.Text):
t = re.sub(r"\s+", "", x3.data)
res.append([t, pinyins])
else:
print("len(nodes of x3):", len(x3.childNodes))
return res
def get_all_tags(self, tag_name):
'''获取所有的tag 及属性值'''
alltags = self.root.getElementsByTagName(tag_name)
for x in alltags:
if x.hasAttribute('pinyin'): # pinyin
print(x.tagName, 'pinyin',
x.getAttribute('pinyin'), x.firstChild.data)
Loading…
Cancel
Save