diff --git a/README.md b/README.md
index 49e40624..d02ac4c6 100644
--- a/README.md
+++ b/README.md
@@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
-- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.
+- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML support for the Chinese text frontend.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
diff --git a/README_cn.md b/README_cn.md
index bf3ff4df..0c3af5dd 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
## 致谢
- 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。
-- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。
+- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。
- 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。
- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py
index 7d93c026..57fe82a9 100644
--- a/paddlespeech/t2s/__init__.py
+++ b/paddlespeech/t2s/__init__.py
@@ -18,5 +18,6 @@ from . import exps
from . import frontend
from . import models
from . import modules
+from . import ssml
from . import training
from . import utils
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 15d8dfb7..41663891 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -13,6 +13,7 @@
# limitations under the License.
import math
import os
+import re
from pathlib import Path
from typing import Any
from typing import Dict
@@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import
+
# remove [W:onnxruntime: xxx] from ort
ort.set_default_logger_severity(3)
@@ -103,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
sentences = []
with open(text_file, 'rt') as f:
for line in f:
- items = line.strip().split()
- utt_id = items[0]
- if lang == 'zh':
- sentence = "".join(items[1:])
- elif lang == 'en':
- sentence = " ".join(items[1:])
- elif lang == 'mix':
- sentence = " ".join(items[1:])
+ if line.strip() != "":
+                items = re.split(r"\s+", line.strip(), maxsplit=1)
+ utt_id = items[0]
+ if lang == 'zh':
+ sentence = "".join(items[1:])
+ elif lang == 'en':
+ sentence = " ".join(items[1:])
+ elif lang == 'mix':
+ sentence = " ".join(items[1:])
sentences.append((utt_id, sentence))
return sentences
@@ -180,11 +183,20 @@ def run_frontend(frontend: object,
to_tensor: bool=True):
outs = dict()
if lang == 'zh':
- input_ids = frontend.get_input_ids(
- text,
- merge_sentences=merge_sentences,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
+ input_ids = {}
+        if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
+                                           re.DOTALL):
+ input_ids = frontend.get_input_ids_ssml(
+ text,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
+ else:
+ input_ids = frontend.get_input_ids(
+ text,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
phone_ids = input_ids["phone_ids"]
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 722eed60..e3028698 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -13,6 +13,7 @@
# limitations under the License.
import os
import re
+from operator import itemgetter
from typing import Dict
from typing import List
@@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
+from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
INITIALS = [
'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@@ -81,6 +83,7 @@ class Frontend():
g2p_model="g2pW",
phone_vocab_path=None,
tone_vocab_path=None):
+ self.mix_ssml_processor = MixTextProcessor()
self.tone_modifier = ToneSandhi()
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
@@ -281,6 +284,65 @@ class Frontend():
phones_list.append(merge_list)
return phones_list
+ def _split_word_to_char(self, words):
+ res = []
+ for x in words:
+ res.append(x)
+ return res
+
+ # if using ssml, have pingyin specified, assign pinyin to words
+ def _g2p_assign(self,
+ words: List[str],
+ pinyin_spec: List[str],
+ merge_sentences: bool=True) -> List[List[str]]:
+ phones_list = []
+ initials = []
+ finals = []
+
+ words = self._split_word_to_char(words[0])
+ for pinyin, char in zip(pinyin_spec, words):
+ sub_initials = []
+ sub_finals = []
+ pinyin = pinyin.replace("u:", "v")
+ #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
+ if pinyin in self.pinyin2phone:
+ initial_final_list = self.pinyin2phone[pinyin].split(" ")
+ if len(initial_final_list) == 2:
+ sub_initials.append(initial_final_list[0])
+ sub_finals.append(initial_final_list[1])
+ elif len(initial_final_list) == 1:
+ sub_initials.append('')
+                    sub_finals.append(initial_final_list[0])
+ else:
+ # If it's not pinyin (possibly punctuation) or no conversion is required
+ sub_initials.append(pinyin)
+ sub_finals.append(pinyin)
+ initials.append(sub_initials)
+ finals.append(sub_finals)
+
+ initials = sum(initials, [])
+ finals = sum(finals, [])
+ phones = []
+ for c, v in zip(initials, finals):
+ # NOTE: post process for pypinyin outputs
+ # we discriminate i, ii and iii
+ if c and c not in self.punc:
+ phones.append(c)
+ if c and c in self.punc:
+ phones.append('sp')
+ if v and v not in self.punc:
+ phones.append(v)
+ phones_list.append(phones)
+ if merge_sentences:
+ merge_list = sum(phones_list, [])
+ # rm the last 'sp' to avoid the noise at the end
+ # cause in the training data, no 'sp' in the end
+ if merge_list[-1] == 'sp':
+ merge_list = merge_list[:-1]
+ phones_list = []
+ phones_list.append(merge_list)
+ return phones_list
+
def _merge_erhua(self,
initials: List[str],
finals: List[str],
@@ -396,6 +458,52 @@ class Frontend():
print("----------------------------")
return phonemes
+ #@an added for ssml pinyin
+ def get_phonemes_ssml(self,
+ ssml_inputs: list,
+ merge_sentences: bool=True,
+ with_erhua: bool=True,
+ robot: bool=False,
+ print_info: bool=False) -> List[List[str]]:
+ all_phonemes = []
+ for word_pinyin_item in ssml_inputs:
+ phonemes = []
+ sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
+ sentences = self.text_normalizer.normalize(sentence)
+ if len(pinyin_spec) == 0:
+ phonemes = self._g2p(
+ sentences,
+ merge_sentences=merge_sentences,
+ with_erhua=with_erhua)
+ else:
+ # phonemes should be pinyin_spec
+ phonemes = self._g2p_assign(
+ sentences, pinyin_spec, merge_sentences=merge_sentences)
+
+ all_phonemes = all_phonemes + phonemes
+
+ if robot:
+ new_phonemes = []
+ for sentence in all_phonemes:
+ new_sentence = []
+ for item in sentence:
+ # `er` only have tone `2`
+ if item[-1] in "12345" and item != "er2":
+ item = item[:-1] + "1"
+ new_sentence.append(item)
+ new_phonemes.append(new_sentence)
+ all_phonemes = new_phonemes
+
+ if print_info:
+ print("----------------------------")
+ print("text norm results:")
+ print(sentences)
+ print("----------------------------")
+ print("g2p results:")
+ print(all_phonemes[0])
+ print("----------------------------")
+ return [sum(all_phonemes, [])]
+
def get_input_ids(self,
sentence: str,
merge_sentences: bool=True,
@@ -405,6 +513,7 @@ class Frontend():
add_blank: bool=False,
blank_token: str="",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+
phonemes = self.get_phonemes(
sentence,
merge_sentences=merge_sentences,
@@ -437,3 +546,49 @@ class Frontend():
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
return result
+
+ # @an added for ssml
+ def get_input_ids_ssml(
+ self,
+ sentence: str,
+ merge_sentences: bool=True,
+ get_tone_ids: bool=False,
+ robot: bool=False,
+ print_info: bool=False,
+ add_blank: bool=False,
+            blank_token: str="<pad>",
+ to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+
+ l_inputs = MixTextProcessor.get_pinyin_split(sentence)
+ phonemes = self.get_phonemes_ssml(
+ l_inputs,
+ merge_sentences=merge_sentences,
+ print_info=print_info,
+ robot=robot)
+ result = {}
+ phones = []
+ tones = []
+ temp_phone_ids = []
+ temp_tone_ids = []
+
+ for part_phonemes in phonemes:
+ phones, tones = self._get_phone_tone(
+ part_phonemes, get_tone_ids=get_tone_ids)
+ if add_blank:
+ phones = insert_after_character(phones, blank_token)
+ if tones:
+ tone_ids = self._t2id(tones)
+ if to_tensor:
+ tone_ids = paddle.to_tensor(tone_ids)
+ temp_tone_ids.append(tone_ids)
+ if phones:
+ phone_ids = self._p2id(phones)
+ # if use paddle.to_tensor() in onnxruntime, the first time will be too low
+ if to_tensor:
+ phone_ids = paddle.to_tensor(phone_ids)
+ temp_phone_ids.append(phone_ids)
+ if temp_tone_ids:
+ result["tone_ids"] = temp_tone_ids
+ if temp_phone_ids:
+ result["phone_ids"] = temp_phone_ids
+ return result
diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py
new file mode 100644
index 00000000..9b4db053
--- /dev/null
+++ b/paddlespeech/t2s/ssml/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .xml_processor import *
diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py
new file mode 100644
index 00000000..b3912134
--- /dev/null
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+import re
+import xml.dom.minidom
+import xml.parsers.expat
+from xml.dom.minidom import Node
+from xml.dom.minidom import parseString
+'''
+Note: xml 有5种特殊字符, &<>"'
+其一,采用特殊标签,将包含特殊字符的字符串封装起来。
+例如:
+
+其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为:
+& &amp;
+< &lt;
+> &gt;
+" &quot;
+' &apos;
+例如:
+&quot;姓名&quot;
+
+'''
+
+
+class MixTextProcessor():
+ def __repr__(self):
+        return "MixTextProcessor()"
+
+ def get_xml_content(self, mixstr):
+ '''返回字符串的 xml 内容'''
+        xmlptn = re.compile(r"<speak>.*?</speak>", re.M | re.S)
+ ctn = re.search(xmlptn, mixstr)
+ if ctn:
+ return ctn.group(0)
+ else:
+ return None
+
+ def get_content_split(self, mixstr):
+ ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号
+ 不能去除空格,因为 xml 中tag 属性带空格
+ '''
+ ctlist = []
+ # print("Testing:",mixstr[:20])
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+ mat = re.match(patn, mixstr)
+ if mat:
+ pre_xml = mat.group(1)
+ in_xml = mat.group(2)
+ after_xml = mat.group(3)
+
+ ctlist.append(pre_xml)
+ ctlist.append(in_xml)
+ ctlist.append(after_xml)
+ return ctlist
+ else:
+ ctlist.append(mixstr)
+ return ctlist
+
+ @classmethod
+ def get_pinyin_split(self, mixstr):
+ ctlist = []
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+ mat = re.match(patn, mixstr)
+ if mat:
+ pre_xml = mat.group(1)
+ in_xml = mat.group(2)
+ after_xml = mat.group(3)
+
+ ctlist.append([pre_xml, []])
+ dom = DomXml(in_xml)
+ pinyinlist = dom.get_pinyins_for_xml()
+ ctlist = ctlist + pinyinlist
+ ctlist.append([after_xml, []])
+ else:
+ ctlist.append([mixstr, []])
+ return ctlist
+
+
+class DomXml():
+ def __init__(self, xmlstr):
+ self.tdom = parseString(xmlstr) #Document
+ self.root = self.tdom.documentElement #Element
+ self.rnode = self.tdom.childNodes #NodeList
+
+ def get_text(self):
+ '''返回 xml 内容的所有文本内容的列表'''
+ res = []
+
+ for x1 in self.rnode:
+ if x1.nodeType == Node.TEXT_NODE:
+ res.append(x1.value)
+ else:
+ for x2 in x1.childNodes:
+ if isinstance(x2, xml.dom.minidom.Text):
+ res.append(x2.data)
+ else:
+ for x3 in x2.childNodes:
+ if isinstance(x3, xml.dom.minidom.Text):
+ res.append(x3.data)
+ else:
+ print("len(nodes of x3):", len(x3.childNodes))
+
+ return res
+
+ def get_xmlchild_list(self):
+ '''返回 xml 内容的列表,包括所有文本内容(不带 tag)'''
+ res = []
+
+ for x1 in self.rnode:
+ if x1.nodeType == Node.TEXT_NODE:
+ res.append(x1.value)
+ else:
+ for x2 in x1.childNodes:
+ if isinstance(x2, xml.dom.minidom.Text):
+ res.append(x2.data)
+ else:
+ for x3 in x2.childNodes:
+ if isinstance(x3, xml.dom.minidom.Text):
+ res.append(x3.data)
+ else:
+ print("len(nodes of x3):", len(x3.childNodes))
+ print(res)
+ return res
+
+ def get_pinyins_for_xml(self):
+ '''返回 xml 内容,字符串和拼音的 list '''
+ res = []
+
+ for x1 in self.rnode:
+ if x1.nodeType == Node.TEXT_NODE:
+ t = re.sub(r"\s+", "", x1.value)
+ res.append([t, []])
+ else:
+ for x2 in x1.childNodes:
+ if isinstance(x2, xml.dom.minidom.Text):
+ t = re.sub(r"\s+", "", x2.data)
+ res.append([t, []])
+ else:
+ # print("x2",x2,x2.tagName)
+ if x2.hasAttribute('pinyin'):
+ pinyin_value = x2.getAttribute("pinyin")
+ pinyins = pinyin_value.split(" ")
+ for x3 in x2.childNodes:
+ # print('x3',x3)
+ if isinstance(x3, xml.dom.minidom.Text):
+ t = re.sub(r"\s+", "", x3.data)
+ res.append([t, pinyins])
+ else:
+ print("len(nodes of x3):", len(x3.childNodes))
+
+ return res
+
+ def get_all_tags(self, tag_name):
+ '''获取所有的 tag 及属性值'''
+ alltags = self.root.getElementsByTagName(tag_name)
+ for x in alltags:
+ if x.hasAttribute('pinyin'): # pinyin
+ print(x.tagName, 'pinyin',
+ x.getAttribute('pinyin'), x.firstChild.data)