From 13a7fa9808d0faaa1589e0ef0659c537bd4d5dbb Mon Sep 17 00:00:00 2001 From: "david.95" Date: Fri, 14 Oct 2022 15:37:33 +0800 Subject: [PATCH 01/11] enable chinese words' pinyin specified in text of ssml formats, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 6 +- paddlespeech/t2s/frontend/zh_frontend.py | 156 ++++++++++++++++++++++ paddlespeech/t2s/ssml/xml_processor.py | 163 +++++++++++++++++++++++ 3 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 paddlespeech/t2s/ssml/xml_processor.py diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb78..f9d1cd1b5 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -13,6 +13,7 @@ # limitations under the License. import math import os +import re from pathlib import Path from typing import Any from typing import Dict @@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.utils.dynamic_import import dynamic_import + # remove [W:onnxruntime: xxx] from ort ort.set_default_logger_severity(3) @@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = line.strip().split() + items = re.split(r"\s+", line.strip(), 1) utt_id = items[0] if lang == 'zh': sentence = "".join(items[1:]) @@ -180,7 +182,7 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids( + input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 722eed601..25558780b 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -13,6 +13,7 @@ # limitations under the License. import os import re +from operator import itemgetter from typing import Dict from typing import List @@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -81,6 +83,7 @@ class Frontend(): g2p_model="g2pW", phone_vocab_path=None, tone_vocab_path=None): + self.mix_ssml_processor = MixTextProcessor() self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" @@ -143,6 +146,7 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) + self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() @@ -281,6 +285,65 @@ class Frontend(): phones_list.append(merge_list) return phones_list + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + phones_list = [] + initials = [] + finals = [] + + words = self._split_word_to_char(words[0]) + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + phones = [] + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc: + phones.append(v) + phones_list.append(phones) + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + return phones_list + def _merge_erhua(self, initials: List[str], finals: List[str], @@ -396,6 +459,52 @@ class Frontend(): print("----------------------------") return phonemes + #@an added for ssml pinyin + def get_phonemes_ssml(self, + ssml_inputs: list, + merge_sentences: bool=True, + with_erhua: bool=True, + robot: bool=False, + print_info: bool=False) -> List[List[str]]: + all_phonemes = [] + for word_pinyin_item in ssml_inputs: + phonemes = [] + sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + sentences = self.text_normalizer.normalize(sentence) + if len(pinyin_spec) == 0: + phonemes = self._g2p( + sentences, + merge_sentences=merge_sentences, + with_erhua=with_erhua) + else: + # phonemes should be pinyin_spec + phonemes = self._g2p_assign( + sentences, pinyin_spec, merge_sentences=merge_sentences) + + all_phonemes = all_phonemes + phonemes + + if robot: + new_phonemes = [] + for sentence in all_phonemes: + new_sentence = [] + for item in sentence: + # `er` only have tone `2` + if item[-1] in "12345" and item != "er2": + item = item[:-1] + "1" + new_sentence.append(item) + new_phonemes.append(new_sentence) + all_phonemes = new_phonemes + + if print_info: + print("----------------------------") + print("text norm results:") + print(sentences) + print("----------------------------") + print("g2p results:") + print(all_phonemes[0]) + print("----------------------------") + return [sum(all_phonemes, [])] + def get_input_ids(self, sentence: str, merge_sentences: bool=True, @@ -405,6 +514,7 @@ class Frontend(): add_blank: bool=False, blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + phonemes = self.get_phonemes( sentence, merge_sentences=merge_sentences, @@ -437,3 +547,49 @@ class Frontend(): if temp_phone_ids: result["phone_ids"] = temp_phone_ids return result + + # @an added for ssml + def get_input_ids_ssml( + self, + sentence: str, + merge_sentences: bool=True, + get_tone_ids: bool=False, + robot: bool=False, + print_info: bool=False, + add_blank: bool=False, + blank_token: str="", + to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + + l_inputs = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( + l_inputs, + merge_sentences=merge_sentences, + print_info=print_info, + robot=robot) + result = {} + phones = [] + tones = [] + temp_phone_ids = [] + temp_tone_ids = [] + + for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( + part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: + phones = insert_after_character(phones, blank_token) + if tones: + tone_ids = self._t2id(tones) + if to_tensor: + tone_ids = paddle.to_tensor(tone_ids) + temp_tone_ids.append(tone_ids) + if phones: + phone_ids = self._p2id(phones) + # if use paddle.to_tensor() in onnxruntime, the first time will be too low + if to_tensor: + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + if temp_tone_ids: + result["tone_ids"] = temp_tone_ids + if temp_phone_ids: + result["phone_ids"] = temp_phone_ids + return result diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py new file mode 100644 index 000000000..54f24f59f --- /dev/null +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- +import re +import xml.dom.minidom +import xml.parsers.expat +from xml.dom.minidom import Node +from xml.dom.minidom import parseString +''' +Note: xml 有5种特殊字符, &<>"' +其一,采用特殊标签,将包含特殊字符的字符串封装起来。 +例如: + +其二,使用XML转义序列表示这些特殊的字符,这5个特殊字符所对应XML转义序列为: +& & +< < +> > +" " +' ' +例如: +"姓名" + +''' + + +class MixTextProcessor(): + def __repr__(self): + print("@an MixTextProcessor class") + + def get_xml_content(self, mixstr): + '''返回字符串的 xml 内容''' + xmlptn = re.compile(r".*?", re.M | re.S) + ctn = re.search(xmlptn, mixstr) + if ctn: + return ctn.group(0) + else: + return None + + def get_content_split(self, mixstr): + ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为xml 中tag 属性带空格 + ''' + ctlist = [] + # print("Testing:",mixstr[:20]) + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + ctlist.append(in_xml) + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist + + @classmethod + def get_pinyin_split(self, mixstr): + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append([pre_xml, []]) + dom = DomXml(in_xml) + pinyinlist = dom.get_pinyins_for_xml() + ctlist = ctlist + pinyinlist + ctlist.append([after_xml, []]) + else: + ctlist.append([mixstr, []]) + return ctlist + + +class DomXml(): + def __init__(self, xmlstr): + print("Parse xml str:", xmlstr) + self.tdom = parseString(xmlstr) #Document + # print("tdom:",type(self.tdom)) + self.root = self.tdom.documentElement #Element + # print("root:",type(self.root)) + self.rnode = self.tdom.childNodes #NodeList + # print("rnode:",type(self.rnode)) + pass + + def get_text(self): + '''返回xml 内容的所有文本内容的 列表''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_xmlchild_list(self): + '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + res.append(x2.data) + else: + for x3 in x2.childNodes: + if isinstance(x3, xml.dom.minidom.Text): + res.append(x3.data) + else: + print("len(nodes of x3):", len(x3.childNodes)) + print(res) + return res + + def get_pinyins_for_xml(self): + '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + t = re.sub(r"\s+", "", x1.value) + res.append([t, []]) + else: + for x2 in x1.childNodes: + if isinstance(x2, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x2.data) + res.append([t, []]) + else: + # print("x2",x2,x2.tagName) + if x2.hasAttribute('pinyin'): + pinyin_value = x2.getAttribute("pinyin") + pinyins = pinyin_value.split(" ") + for x3 in x2.childNodes: + # print('x3',x3) + if isinstance(x3, xml.dom.minidom.Text): + t = re.sub(r"\s+", "", x3.data) + res.append([t, pinyins]) + else: + print("len(nodes of x3):", len(x3.childNodes)) + + return res + + def get_all_tags(self, tag_name): + '''获取所有的tag 及属性值''' + alltags = self.root.getElementsByTagName(tag_name) + for x in alltags: + if x.hasAttribute('pinyin'): # pinyin + print(x.tagName, 'pinyin', + x.getAttribute('pinyin'), x.firstChild.data) From 278c7a41a83412f02bc4b0b98832c5076f0940cf Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 14:59:23 +0800 Subject: [PATCH 02/11] add module define to fix ci, test=tts --- paddlespeech/t2s/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 7d93c026e..57fe82a9c 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,5 +18,6 @@ from . import exps from . import frontend from . import models from . import modules +from . import ssml from . import training from . import utils From 29508f400b23211c9e7380800e2d02c9a16a426f Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:44:29 +0800 Subject: [PATCH 03/11] to fix CI issue, test=tts --- paddlespeech/t2s/ssml/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 paddlespeech/t2s/ssml/__init__.py diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py new file mode 100644 index 000000000..e69de29bb From f56cc08b18f5fb6fc3254db4dd40ec3597d34f36 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 16:55:07 +0800 Subject: [PATCH 04/11] add license content, test=tts --- paddlespeech/t2s/ssml/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index e69de29bb..abf198b97 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 1067088debd49ba308fc55a8c55d1d04f211ff51 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:18:27 +0800 Subject: [PATCH 05/11] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index abf198b97..f344250d2 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .xml_processor import * From 89e9ea69ebb884d5ba13d02c66c29475a153f2ea Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 17:29:46 +0800 Subject: [PATCH 06/11] modify __init__ --- paddlespeech/t2s/ssml/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py index f344250d2..9b4db053b 100644 --- a/paddlespeech/t2s/ssml/__init__.py +++ b/paddlespeech/t2s/ssml/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .xml_processor import * From f295d2d4450099f2cf8b7e2d417a9c9599230563 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Mon, 17 Oct 2022 18:00:13 +0800 Subject: [PATCH 07/11] remove useless code --- paddlespeech/t2s/frontend/zh_frontend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 25558780b..e30286986 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -146,7 +146,6 @@ class Frontend(): tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) - self.mix_ssml_processor.__repr__() def _init_pypinyin(self): large_pinyin.load() From 3ac7ac253f66c46f01aa11be3de95d6177f47107 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 09:29:11 +0800 Subject: [PATCH 08/11] fix review issue,test=tts --- paddlespeech/t2s/ssml/xml_processor.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index 54f24f59f..b39121347 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -35,8 +35,8 @@ class MixTextProcessor(): return None def get_content_split(self, mixstr): - ''' 文本分解,顺序加了列表中,按非xml 和 xml 分开,对应的字符串,带标点符号 - 不能去除空格,因为xml 中tag 属性带空格 + ''' 文本分解,顺序加了列表中,按非 xml 和 xml 分开,对应的字符串,带标点符号 + 不能去除空格,因为 xml 中tag 属性带空格 ''' ctlist = [] # print("Testing:",mixstr[:20]) @@ -77,17 +77,12 @@ class MixTextProcessor(): class DomXml(): def __init__(self, xmlstr): - print("Parse xml str:", xmlstr) self.tdom = parseString(xmlstr) #Document - # print("tdom:",type(self.tdom)) self.root = self.tdom.documentElement #Element - # print("root:",type(self.root)) self.rnode = self.tdom.childNodes #NodeList - # print("rnode:",type(self.rnode)) - pass def get_text(self): - '''返回xml 内容的所有文本内容的 列表''' + '''返回 xml 内容的所有文本内容的列表''' res = [] for x1 in self.rnode: @@ -107,7 +102,7 @@ class DomXml(): return res def get_xmlchild_list(self): - '''返回xml 内容的列表, 包括所有文本内容(不带tag)''' + '''返回 xml 内容的列表,包括所有文本内容(不带 tag)''' res = [] for x1 in self.rnode: @@ -127,7 +122,7 @@ class DomXml(): return res def get_pinyins_for_xml(self): - '''返回xml 内容,如果字符串 和 拼音的 list , 如 [''' + '''返回 xml 内容,字符串和拼音的 list ''' res = [] for x1 in self.rnode: @@ -155,7 +150,7 @@ class DomXml(): return res def get_all_tags(self, tag_name): - '''获取所有的tag 及属性值''' + '''获取所有的 tag 及属性值''' alltags = self.root.getElementsByTagName(tag_name) for x in alltags: if x.hasAttribute('pinyin'): # pinyin From 7d5ae651ce92d0bd953f0de54b81d00cf951b01d Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:07:21 +0800 Subject: [PATCH 09/11] add readme thanks --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49e40624d..0abb3fd69 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. diff --git a/README_cn.md b/README_cn.md index bf3ff4dfd..0c3af5dd4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。 +- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。新增 SSML 中文文本前端处理。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 From ec1f9edd562275e2d2799c16e36a304bae172e1c Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 10:11:26 +0800 Subject: [PATCH 10/11] add space after punctions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0abb3fd69..d02ac4c6b 100644 --- a/README.md +++ b/README.md @@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples. -- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.Added SSML for Chinese Text Frontend. +- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. Added SSML for Chinese Text Frontend. - Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW). - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. From ed0138c6e324a87e31a23138bafe6f878ed8f4e9 Mon Sep 17 00:00:00 2001 From: "david.95" Date: Thu, 20 Oct 2022 18:09:41 +0800 Subject: [PATCH 11/11] add condition check if a ssml input and filter space line, test=tts --- paddlespeech/t2s/exps/syn_utils.py | 36 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index f9d1cd1b5..41663891e 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -105,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): sentences = [] with open(text_file, 'rt') as f: for line in f: - items = re.split(r"\s+", line.strip(), 1) - utt_id = items[0] - if lang == 'zh': - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + if line.strip() != "": + items = re.split(r"\s+", line.strip(), 1) + utt_id = items[0] + if lang == 'zh': + sentence = "".join(items[1:]) + elif lang == 'en': + sentence = " ".join(items[1:]) + elif lang == 'mix': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) return sentences @@ -182,11 +183,20 @@ def run_frontend(frontend: object, to_tensor: bool=True): outs = dict() if lang == 'zh': - input_ids = frontend.get_input_ids_ssml( - text, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + input_ids = {} + if text.strip() != "" and re.match(r".*?.*?.*", text, + re.DOTALL): + input_ids = frontend.get_input_ids_ssml( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = frontend.get_input_ids( + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) phone_ids = input_ids["phone_ids"] if get_tone_ids: tone_ids = input_ids["tone_ids"]