diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py index e69de29b..595add0a 100644 --- a/paddlespeech/t2s/assets/__init__.py +++ b/paddlespeech/t2s/assets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 93146df0..cafd065a 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -117,7 +117,7 @@ def evaluate(args): sentences = get_sentences(text_file=args.text, lang=args.lang) for utt_id, sentence in sentences: - print(f"{utt_id} {sentence} ...") + print(f"{utt_id} {sentence}") with timer() as t: if am_name == "diffsinger": text = "" @@ -135,7 +135,7 @@ def evaluate(args): lang=args.lang, svs_input=svs_input) phone_ids = frontend_dict['phone_ids'] - # pprint(f"process: {utt_id} {phone_ids}") + # pprint(f"{utt_id} {phone_ids}") with paddle.no_grad(): flags = 0 diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py index dd3484d5..bbb7bcf0 100644 --- a/paddlespeech/t2s/frontend/canton_frontend.py +++ b/paddlespeech/t2s/frontend/canton_frontend.py @@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]): class CantonFrontend(): def __init__(self, phone_vocab_path: str): self.text_normalizer = TextNormalizer() - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" self.vocab_phones = {} if phone_vocab_path: diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py index 81991e0d..c58bed7d 100644 --- a/paddlespeech/t2s/frontend/en_frontend.py +++ b/paddlespeech/t2s/frontend/en_frontend.py @@ -1 +1,14 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .phonectic import English diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 628984cb..2ebfe135 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -106,76 +106,95 @@ class MixFrontend(): get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - ''' 1. 添加SSML支持,先列出 文字 和 标签内容, - 然后添加到tmpSegments数组里 - ''' - d_inputs = MixTextProcessor.get_dom_split(sentence) - tmpSegments = [] - for instr in d_inputs: - ''' 暂时只支持 say-as ''' - if instr.lower().startswith("" segments.append(tuple(currentSeg)) + # en segments.append(seg) + # reset currentSeg = ["", ""] else: + # zh if currentSeg[0] == '': + # first see currentSeg[0] = seg[0] currentSeg[1] = seg[1] else: + # merge zh currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + # last zh currentSeg[0] = "" + currentSeg[0] + "" segments.append(tuple(currentSeg)) phones_list = [] result = {} + # 008 我们要去云南 team building, 非常非常 happy. + # seg ('我们要去云南 ', 'zh') + # seg ('team building, ', 'en') + # seg ('非常非常 ', 'zh') + # seg ('happy.', 'en') + # [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')] for seg in segments: content = seg[0] lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=False, to_tensor=to_tensor) + + if not content: + continue + + if lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=False, to_tensor=to_tensor) + else: + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + # process ssml + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) else: - ''' 3. 把带speak tag的中文和普通文字分开处理 - ''' - if content.strip() != "" and \ - re.match(r".*?.*?.*", content, re.DOTALL): - input_ids = self.zh_frontend.get_input_ids_ssml( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - if add_sp: - if to_tensor: - input_ids["phone_ids"][-1] = paddle.concat( - [input_ids["phone_ids"][-1], self.sp_id_tensor]) - else: - input_ids["phone_ids"][-1] = np.concatenate( - (input_ids["phone_ids"][-1], self.sp_id_numpy)) + # process plain text + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + + if add_sp: + # add sp between zh and en + if to_tensor: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + else: + input_ids["phone_ids"][-1] = np.concatenate( + (input_ids["phone_ids"][-1], self.sp_id_numpy)) - for phones in input_ids["phone_ids"]: - phones_list.append(phones) + phones_list.extend(input_ids["phone_ids"]) if merge_sentences: merge_list = paddle.concat(phones_list) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 2112ff4a..b3f64dc5 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -55,7 +55,7 @@ class English(Phonetics): self.punctuations = get_punctuations("en") self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab_phones = {} - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" self.text_normalizer = TextNormalizer() if phone_vocab_path: with open(phone_vocab_path, 'rt', encoding='utf-8') as f: diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py new file mode 100644 index 00000000..885064a6 --- /dev/null +++ b/paddlespeech/t2s/frontend/polyphonic.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import yaml + + +class Polyphonic(): + def __init__(self): + with open( + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'polyphonic.yaml'), + 'r', + encoding='utf-8') as polyphonic_file: + # 解析yaml + polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) + self.polyphonic_words = polyphonic_dict["polyphonic"] + + def correct_pronunciation(self, word, pinyin): + # 词汇被词典收录则返回纠正后的读音 + print(word, pinyin) + if word in self.polyphonic_words.keys(): + pinyin = self.polyphonic_words[word] + print('new', pinyin) + # 否则返回原读音 + return pinyin diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 50659afb..f52b1cf5 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -48,4 +48,7 @@ polyphonic: 唉: ['ai4'] 扎实: ['zha1','shi2'] 干将: ['gan4','jiang4'] - 陈威行: ['chen2', 'wei1', 'hang2'] \ No newline at end of file + 陈威行: ['chen2', 'wei1', 'hang2'] + 郭晟: ['guo1', 'sheng4'] + 中标: ['zhong4', 'biao1'] + 抗住: ['kang2', 'zhu4'] \ No newline at end of file diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py index c2aecf27..fff72a10 100644 --- a/paddlespeech/t2s/frontend/sing_frontend.py +++ b/paddlespeech/t2s/frontend/sing_frontend.py @@ -29,7 +29,7 @@ class SingFrontend(): pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line. phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line. """ - self.punc = '[:,;。?!“”‘’\':,;.?!]' + self.punc = '[、:,;。?!“”‘’\':,;.?!]' self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'} if pinyin_phone_path: diff --git a/paddlespeech/t2s/frontend/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py index 9b4db053..b1b9d726 100644 --- a/paddlespeech/t2s/frontend/ssml/__init__.py +++ b/paddlespeech/t2s/frontend/ssml/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/t2s/frontend/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py index 3a5177d1..1d216c31 100644 --- a/paddlespeech/t2s/frontend/ssml/xml_processor.py +++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py @@ -1,4 +1,17 @@ # -*- coding: utf-8 -*- +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import re import xml.dom.minidom import xml.parsers.expat @@ -68,7 +81,8 @@ class MixTextProcessor(): after_xml = mat.group(3) # pre with none syllable - ctlist.append([pre_xml, []]) + if pre_xml: + ctlist.append([pre_xml, []]) # between with syllable # [(sub sentence, [syllables]), ...] @@ -77,9 +91,11 @@ class MixTextProcessor(): ctlist = ctlist + pinyinlist # post with none syllable - ctlist.append([after_xml, []]) + if after_xml: + ctlist.append([after_xml, []]) else: ctlist.append([mixstr, []]) + return ctlist @classmethod @@ -94,15 +110,18 @@ class MixTextProcessor(): in_xml = mat.group(2) after_xml = mat.group(3) - ctlist.append(pre_xml) + if pre_xml: + ctlist.append(pre_xml) + dom = DomXml(in_xml) tags = dom.get_text_and_sayas_tags() ctlist.extend(tags) - ctlist.append(after_xml) - return ctlist + if after_xml: + ctlist.append(after_xml) else: ctlist.append(mixstr) + return ctlist diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 5902540c..690f69aa 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -68,9 +68,9 @@ class ToneSandhi(): '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得', '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打', - '考考', '整整', '莘莘', '落地', '算子', '家家户户' + '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青' } - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" def _split_word(self, word: str) -> List[str]: word_list = jieba.cut_for_search(word) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 2e0b2ffd..1431bc6d 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon +from paddlespeech.t2s.frontend.polyphonic import Polyphonic from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi @@ -68,26 +69,6 @@ def insert_after_character(lst, item): return result -class Polyphonic(): - def __init__(self): - with open( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'polyphonic.yaml'), - 'r', - encoding='utf-8') as polyphonic_file: - # 解析yaml - polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader) - self.polyphonic_words = polyphonic_dict["polyphonic"] - - def correct_pronunciation(self, word, pinyin): - # 词汇被词典收录则返回纠正后的读音 - if word in self.polyphonic_words.keys(): - pinyin = self.polyphonic_words[word] - # 否则返回原读音 - return pinyin - - class Frontend(): def __init__(self, g2p_model="g2pW", @@ -95,7 +76,7 @@ class Frontend(): tone_vocab_path=None, use_rhy=False): - self.punc = ":,;。?!“”‘’':,;.?!" + self.punc = "、:,;。?!“”‘’':,;.?!" self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4'] self.phrases_dict = { '开户行': [['ka1i'], ['hu4'], ['hang2']], @@ -567,6 +548,7 @@ class Frontend(): phones = [] for c, v in zip(initials, finals): + # c for consonant, v for vowel # NOTE: post process for pypinyin outputs # we discriminate i, ii and iii if c and c not in self.punc: @@ -633,16 +615,19 @@ class Frontend(): new_phonemes.append(new_sentence) all_phonemes = new_phonemes + if merge_sentences: + all_phonemes = [sum(all_phonemes, [])] + if print_info: print("----------------------------") print("text norm results:") print(sentences) print("----------------------------") print("g2p results:") - print(all_phonemes[0]) + print(all_phonemes) print("----------------------------") - return [sum(all_phonemes, [])] + return all_phonemes def add_sp_if_no(self, phonemes): """ diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py index fdfebf46..24167338 100644 --- a/tests/unit/tts/test_mixfrontend.py +++ b/tests/unit/tts/test_mixfrontend.py @@ -423,7 +423,7 @@ if __name__ == '__main__': segs = frontend.split_by_lang(text) print(segs) - # 对于SSML的xml标记处理不好。 + # 对于SSML的xml标记处理不好。需要先解析SSML,后处理中英的划分。 text = "我们的声学模型使用了 Fast Speech Two。前浪在沙滩上,沙滩上倒了一堆。 想象干干的树干了, 里面有个干尸,不知是被谁死的。" print(text) # [('', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干', 'en'), ('死的。', 'en')]