Merge pull request #2531 from david-95/hongliang1014

[TTS]Add SSML for Chinese Text Frontend
pull/2590/head
TianYuan 2 years ago committed by GitHub
commit f5e80cef18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.
- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.

@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
## 致谢
- 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。
- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。
- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。
- 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。
- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。

@ -18,5 +18,6 @@ from . import exps
from . import frontend
from . import models
from . import modules
from . import ssml
from . import training
from . import utils

@ -13,6 +13,7 @@
# limitations under the License.
import math
import os
import re
from pathlib import Path
from typing import Any
from typing import Dict
@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import
# remove [W:onnxruntime: xxx] from ort
ort.set_default_logger_severity(3)
@ -103,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
sentences = []
with open(text_file, 'rt') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
if lang == 'zh':
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
utt_id = items[0]
if lang == 'zh':
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
sentences.append((utt_id, sentence))
return sentences
@ -180,11 +183,20 @@ def run_frontend(frontend: object,
to_tensor: bool=True):
outs = dict()
if lang == 'zh':
input_ids = frontend.get_input_ids(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
input_ids = {}
if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
re.DOTALL):
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
else:
input_ids = frontend.get_input_ids(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
phone_ids = input_ids["phone_ids"]
if get_tone_ids:
tone_ids = input_ids["tone_ids"]

@ -13,6 +13,7 @@
# limitations under the License.
import os
import re
from operator import itemgetter
from typing import Dict
from typing import List
@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
INITIALS = [
'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@ -81,6 +83,7 @@ class Frontend():
g2p_model="g2pW",
phone_vocab_path=None,
tone_vocab_path=None):
self.mix_ssml_processor = MixTextProcessor()
self.tone_modifier = ToneSandhi()
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
@ -281,6 +284,65 @@ class Frontend():
phones_list.append(merge_list)
return phones_list
def _split_word_to_char(self, words):
res = []
for x in words:
res.append(x)
return res
# if using ssml, have pingyin specified, assign pinyin to words
def _g2p_assign(self,
words: List[str],
pinyin_spec: List[str],
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
initials = []
finals = []
words = self._split_word_to_char(words[0])
for pinyin, char in zip(pinyin_spec, words):
sub_initials = []
sub_finals = []
pinyin = pinyin.replace("u:", "v")
#self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[pinyin].split(" ")
if len(initial_final_list) == 2:
sub_initials.append(initial_final_list[0])
sub_finals.append(initial_final_list[1])
elif len(initial_final_list) == 1:
sub_initials.append('')
sub_finals.append(initial_final_list[1])
else:
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
initials.append(sub_initials)
finals.append(sub_finals)
initials = sum(initials, [])
finals = sum(finals, [])
phones = []
for c, v in zip(initials, finals):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c and c not in self.punc:
phones.append(c)
if c and c in self.punc:
phones.append('sp')
if v and v not in self.punc:
phones.append(v)
phones_list.append(phones)
if merge_sentences:
merge_list = sum(phones_list, [])
# rm the last 'sp' to avoid the noise at the end
# cause in the training data, no 'sp' in the end
if merge_list[-1] == 'sp':
merge_list = merge_list[:-1]
phones_list = []
phones_list.append(merge_list)
return phones_list
def _merge_erhua(self,
initials: List[str],
finals: List[str],
@ -396,6 +458,52 @@ class Frontend():
print("----------------------------")
return phonemes
#@an added for ssml pinyin
def get_phonemes_ssml(self,
ssml_inputs: list,
merge_sentences: bool=True,
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
all_phonemes = []
for word_pinyin_item in ssml_inputs:
phonemes = []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
sentences = self.text_normalizer.normalize(sentence)
if len(pinyin_spec) == 0:
phonemes = self._g2p(
sentences,
merge_sentences=merge_sentences,
with_erhua=with_erhua)
else:
# phonemes should be pinyin_spec
phonemes = self._g2p_assign(
sentences, pinyin_spec, merge_sentences=merge_sentences)
all_phonemes = all_phonemes + phonemes
if robot:
new_phonemes = []
for sentence in all_phonemes:
new_sentence = []
for item in sentence:
# `er` only have tone `2`
if item[-1] in "12345" and item != "er2":
item = item[:-1] + "1"
new_sentence.append(item)
new_phonemes.append(new_sentence)
all_phonemes = new_phonemes
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
print(all_phonemes[0])
print("----------------------------")
return [sum(all_phonemes, [])]
def get_input_ids(self,
sentence: str,
merge_sentences: bool=True,
@ -405,6 +513,7 @@ class Frontend():
add_blank: bool=False,
blank_token: str="<pad>",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
phonemes = self.get_phonemes(
sentence,
merge_sentences=merge_sentences,
@ -437,3 +546,49 @@ class Frontend():
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result
# @an added for ssml
def get_input_ids_ssml(
self,
sentence: str,
merge_sentences: bool=True,
get_tone_ids: bool=False,
robot: bool=False,
print_info: bool=False,
add_blank: bool=False,
blank_token: str="<pad>",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
l_inputs = MixTextProcessor.get_pinyin_split(sentence)
phonemes = self.get_phonemes_ssml(
l_inputs,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
result = {}
phones = []
tones = []
temp_phone_ids = []
temp_tone_ids = []
for part_phonemes in phonemes:
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
if add_blank:
phones = insert_after_character(phones, blank_token)
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
if phones:
phone_ids = self._p2id(phones)
# if use paddle.to_tensor() in onnxruntime, the first time will be too low
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result

@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .xml_processor import *

@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
import re
import xml.dom.minidom
import xml.parsers.expat
from xml.dom.minidom import Node
from xml.dom.minidom import parseString
'''
Note: xml 有5种特殊字符 &<>"'
其一采用<![CDATA[ ]]>特殊标签将包含特殊字符的字符串封装起来
例如
<TitleName><![CDATA["姓名"]]></TitleName>
其二使用XML转义序列表示这些特殊的字符这5个特殊字符所对应XML转义序列为
& &amp;
< &lt;
> &gt;
" &quot;
' &apos;
例如
<TitleName>&quot;姓名&quot;</TitleName>
'''
class MixTextProcessor():
def __repr__(self):
print("@an MixTextProcessor class")
def get_xml_content(self, mixstr):
'''返回字符串的 xml 内容'''
xmlptn = re.compile(r"<speak>.*?</speak>", re.M | re.S)
ctn = re.search(xmlptn, mixstr)
if ctn:
return ctn.group(0)
else:
return None
def get_content_split(self, mixstr):
''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号
不能去除空格因为 xml 中tag 属性带空格
'''
ctlist = []
# print("Testing:",mixstr[:20])
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
pre_xml = mat.group(1)
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append(pre_xml)
ctlist.append(in_xml)
ctlist.append(after_xml)
return ctlist
else:
ctlist.append(mixstr)
return ctlist
@classmethod
def get_pinyin_split(self, mixstr):
ctlist = []
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
pre_xml = mat.group(1)
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append([pre_xml, []])
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
return ctlist
class DomXml():
def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document
self.root = self.tdom.documentElement #Element
self.rnode = self.tdom.childNodes #NodeList
def get_text(self):
'''返回 xml 内容的所有文本内容的列表'''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
res.append(x1.value)
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
res.append(x2.data)
else:
for x3 in x2.childNodes:
if isinstance(x3, xml.dom.minidom.Text):
res.append(x3.data)
else:
print("len(nodes of x3):", len(x3.childNodes))
return res
def get_xmlchild_list(self):
'''返回 xml 内容的列表,包括所有文本内容(不带 tag)'''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
res.append(x1.value)
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
res.append(x2.data)
else:
for x3 in x2.childNodes:
if isinstance(x3, xml.dom.minidom.Text):
res.append(x3.data)
else:
print("len(nodes of x3):", len(x3.childNodes))
print(res)
return res
def get_pinyins_for_xml(self):
'''返回 xml 内容,字符串和拼音的 list '''
res = []
for x1 in self.rnode:
if x1.nodeType == Node.TEXT_NODE:
t = re.sub(r"\s+", "", x1.value)
res.append([t, []])
else:
for x2 in x1.childNodes:
if isinstance(x2, xml.dom.minidom.Text):
t = re.sub(r"\s+", "", x2.data)
res.append([t, []])
else:
# print("x2",x2,x2.tagName)
if x2.hasAttribute('pinyin'):
pinyin_value = x2.getAttribute("pinyin")
pinyins = pinyin_value.split(" ")
for x3 in x2.childNodes:
# print('x3',x3)
if isinstance(x3, xml.dom.minidom.Text):
t = re.sub(r"\s+", "", x3.data)
res.append([t, pinyins])
else:
print("len(nodes of x3):", len(x3.childNodes))
return res
def get_all_tags(self, tag_name):
'''获取所有的 tag 及属性值'''
alltags = self.root.getElementsByTagName(tag_name)
for x in alltags:
if x.hasAttribute('pinyin'): # pinyin
print(x.tagName, 'pinyin',
x.getAttribute('pinyin'), x.firstChild.data)
Loading…
Cancel
Save