fix long text oom using ssml; filter comma; update polyphonic

pull/3316/head
Hui Zhang 1 year ago
parent 108e73e1a0
commit d53c499447

@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

@@ -117,7 +117,7 @@ def evaluate(args):
     sentences = get_sentences(text_file=args.text, lang=args.lang)
     for utt_id, sentence in sentences:
-        print(f"{utt_id} {sentence} ...")
+        print(f"{utt_id} {sentence}")
         with timer() as t:
             if am_name == "diffsinger":
                 text = ""
@@ -135,7 +135,7 @@ def evaluate(args):
                     lang=args.lang,
                     svs_input=svs_input)
                 phone_ids = frontend_dict['phone_ids']
-                # pprint(f"process: {utt_id} {phone_ids}")
+                # pprint(f"{utt_id} {phone_ids}")
             with paddle.no_grad():
                 flags = 0

@@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]):
 class CantonFrontend():
     def __init__(self, phone_vocab_path: str):
         self.text_normalizer = TextNormalizer()
         self.punc = ":,;。?!“”‘’':,;.?!"
         self.vocab_phones = {}
         if phone_vocab_path:

@@ -1 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .phonectic import English

@@ -106,76 +106,95 @@ class MixFrontend():
                       get_tone_ids: bool=False,
                       add_sp: bool=True,
                       to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
-        ''' 1. Add SSML support: first list the plain text and the
-            <say-as> tag contents, then append them to the tmpSegments list.
-        '''
-        d_inputs = MixTextProcessor.get_dom_split(sentence)
-        tmpSegments = []
-        for instr in d_inputs:
-            ''' only say-as is supported for now '''
-            if instr.lower().startswith("<say-as"):
-                tmpSegments.append((instr, "zh"))
-            else:
-                tmpSegments.extend(self.split_by_lang(instr))
-        ''' 2. Merge the zh segments together to avoid pauses in the
-            middle of the synthesized result.
-        '''
+        # XML Document Object Model (DOM)
+        doms = MixTextProcessor.get_dom_split(sentence)
+
+        lang_splits = []
+        for dom in doms:
+            if dom.lower().startswith("<say-as pinyin="):
+                # `<say-as pinyin=` for zh lang
+                lang_splits.append((dom, "zh"))
+            else:
+                # process zh, en and zh/en
+                lang_splits.extend(self.split_by_lang(dom))
+
+        # merge adjacent zh segment
         segments = []
         currentSeg = ["", ""]
-        for seg in tmpSegments:
+        for seg in lang_splits:
             if seg[1] == "en" or seg[1] == "other":
                 if currentSeg[0] == '':
+                    # first see
                     segments.append(seg)
                 else:
+                    # zh
                     currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
                     segments.append(tuple(currentSeg))
+                    # en
                     segments.append(seg)
+                    # reset
                     currentSeg = ["", ""]
             else:
+                # zh
                 if currentSeg[0] == '':
+                    # first see
                     currentSeg[0] = seg[0]
                     currentSeg[1] = seg[1]
                 else:
+                    # merge zh
                     currentSeg[0] = currentSeg[0] + seg[0]
         if currentSeg[0] != '':
+            # last zh
             currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
             segments.append(tuple(currentSeg))

         phones_list = []
         result = {}

+        # 008 我们要去云南 team building, 非常非常 happy.
+        # seg ('我们要去云南 ', 'zh')
+        # seg ('team building, ', 'en')
+        # seg ('非常非常 ', 'zh')
+        # seg ('happy.', 'en')
+        # [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
         for seg in segments:
             content = seg[0]
             lang = seg[1]
-            if content != '':
-                if lang == "en":
-                    input_ids = self.en_frontend.get_input_ids(
-                        content, merge_sentences=False, to_tensor=to_tensor)
-                else:
-                    ''' 3. Handle zh text wrapped in <speak> tags and
-                        plain text separately.
-                    '''
-                    if content.strip() != "" and \
-                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
-                        input_ids = self.zh_frontend.get_input_ids_ssml(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                    else:
-                        input_ids = self.zh_frontend.get_input_ids(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                if add_sp:
-                    if to_tensor:
-                        input_ids["phone_ids"][-1] = paddle.concat(
-                            [input_ids["phone_ids"][-1], self.sp_id_tensor])
-                    else:
-                        input_ids["phone_ids"][-1] = np.concatenate(
-                            (input_ids["phone_ids"][-1], self.sp_id_numpy))
-                for phones in input_ids["phone_ids"]:
-                    phones_list.append(phones)
+
+            if not content:
+                continue
+
+            if lang == "en":
+                input_ids = self.en_frontend.get_input_ids(
+                    content, merge_sentences=False, to_tensor=to_tensor)
+            else:
+                if content.strip() != "" and \
+                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
+                    # process ssml
+                    input_ids = self.zh_frontend.get_input_ids_ssml(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
+                else:
+                    # process plain text
+                    input_ids = self.zh_frontend.get_input_ids(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
+
+            if add_sp:
+                # add sp between zh and en
+                if to_tensor:
+                    input_ids["phone_ids"][-1] = paddle.concat(
+                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
+                else:
+                    input_ids["phone_ids"][-1] = np.concatenate(
+                        (input_ids["phone_ids"][-1], self.sp_id_numpy))
+
+            phones_list.extend(input_ids["phone_ids"])

         if merge_sentences:
             merge_list = paddle.concat(phones_list)
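Note: the zh-merge above is the heart of this change. Adjacent zh splits are concatenated and wrapped in <speak> so the zh frontend synthesizes them as one utterance via the SSML path instead of pausing at every split boundary. A standalone sketch (the helper name merge_adjacent_zh is hypothetical, not part of this commit) that reproduces the behaviour on the "008" example from the comments:

def merge_adjacent_zh(lang_splits):
    segments = []
    current = ["", ""]
    for text, lang in lang_splits:
        if lang in ("en", "other"):
            if current[0]:
                # flush the accumulated zh run, wrapped for the SSML path
                segments.append(("<speak>" + current[0] + "</speak>", current[1]))
                current = ["", ""]
            segments.append((text, lang))
        else:
            # zh: keep accumulating so synthesis does not pause mid-run
            current[0] += text
            current[1] = lang
    if current[0]:
        # trailing zh run
        segments.append(("<speak>" + current[0] + "</speak>", current[1]))
    return segments

splits = [('我们要去云南 ', 'zh'), ('team building, ', 'en'),
          ('非常非常 ', 'zh'), ('happy.', 'en')]
print(merge_adjacent_zh(splits))
# [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'),
#  ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]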

@@ -55,7 +55,7 @@ class English(Phonetics):
         self.punctuations = get_punctuations("en")
         self.vocab = Vocab(self.phonemes + self.punctuations)
         self.vocab_phones = {}
         self.punc = ":,;。?!“”‘’':,;.?!"
         self.text_normalizer = TextNormalizer()
         if phone_vocab_path:
             with open(phone_vocab_path, 'rt', encoding='utf-8') as f:

@@ -0,0 +1,38 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import yaml
+
+
+class Polyphonic():
+    def __init__(self):
+        with open(
+                os.path.join(
+                    os.path.dirname(os.path.abspath(__file__)),
+                    'polyphonic.yaml'),
+                'r',
+                encoding='utf-8') as polyphonic_file:
+            # parse the yaml file
+            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
+        self.polyphonic_words = polyphonic_dict["polyphonic"]
+
+    def correct_pronunciation(self, word, pinyin):
+        # if the word is in the dictionary, return the corrected pronunciation
+        print(word, pinyin)
+        if word in self.polyphonic_words.keys():
+            pinyin = self.polyphonic_words[word]
+            print('new', pinyin)
+        # otherwise return the original pronunciation
+        return pinyin
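A quick usage sketch for the relocated class (assuming an installed paddlespeech where polyphonic.yaml ships next to the module): listed words come back corrected, everything else passes through unchanged.

from paddlespeech.t2s.frontend.polyphonic import Polyphonic

poly = Polyphonic()
# '中标' is one of the entries added below, so its pinyin is corrected
print(poly.correct_pronunciation('中标', ['zhong1', 'biao1']))  # ['zhong4', 'biao1']
# words not listed in polyphonic.yaml are returned as-is
print(poly.correct_pronunciation('你好', ['ni3', 'hao3']))      # ['ni3', 'hao3']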

@ -48,4 +48,7 @@ polyphonic:
: ['ai4'] : ['ai4']
扎实: ['zha1','shi2'] 扎实: ['zha1','shi2']
干将: ['gan4','jiang4'] 干将: ['gan4','jiang4']
陈威行: ['chen2', 'wei1', 'hang2'] 陈威行: ['chen2', 'wei1', 'hang2']
郭晟: ['guo1', 'sheng4']
中标: ['zhong4', 'biao1']
抗住: ['kang2', 'zhu4']

@@ -29,7 +29,7 @@ class SingFrontend():
             pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
             phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
         """
         self.punc = '[:,;。?!“”‘’\':,;.?!]'
         self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
         if pinyin_phone_path:
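Note that self.punc here is a regex character class rather than a bare string, so one re.sub call strips every listed mark at once. An illustrative standalone sketch (not the class's actual code path):

import re

punc = '[:,;。?!“”‘’\':,;.?!]'
# removes the fullwidth comma and exclamation mark in one pass
print(re.sub(punc, '', '小酒窝,长睫毛!'))  # -> 小酒窝长睫毛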

@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

@@ -1,4 +1,17 @@
 # -*- coding: utf-8 -*-
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import re
 import xml.dom.minidom
 import xml.parsers.expat
@@ -68,7 +81,8 @@ class MixTextProcessor():
             after_xml = mat.group(3)

             # pre with none syllable
-            ctlist.append([pre_xml, []])
+            if pre_xml:
+                ctlist.append([pre_xml, []])

             # between with syllable
             # [(sub sentence, [syllables]), ...]
@ -77,9 +91,11 @@ class MixTextProcessor():
ctlist = ctlist + pinyinlist ctlist = ctlist + pinyinlist
# post with none syllable # post with none syllable
ctlist.append([after_xml, []]) if after_xml:
ctlist.append([after_xml, []])
else: else:
ctlist.append([mixstr, []]) ctlist.append([mixstr, []])
return ctlist return ctlist
@classmethod @classmethod
@@ -94,15 +110,18 @@ class MixTextProcessor():
             in_xml = mat.group(2)
             after_xml = mat.group(3)

-            ctlist.append(pre_xml)
+            if pre_xml:
+                ctlist.append(pre_xml)

             dom = DomXml(in_xml)
             tags = dom.get_text_and_sayas_tags()
             ctlist.extend(tags)

-            ctlist.append(after_xml)
-            return ctlist
+            if after_xml:
+                ctlist.append(after_xml)
         else:
             ctlist.append(mixstr)
         return ctlist

@@ -68,9 +68,9 @@ class ToneSandhi():
             '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
             '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
             '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
-            '考考', '整整', '莘莘', '落地', '算子', '家家户户'
+            '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
         }
         self.punc = ":,;。?!“”‘’':,;.?!"

     def _split_word(self, word: str) -> List[str]:
         word_list = jieba.cut_for_search(word)

@@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
 from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.polyphonic import Polyphonic
 from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
 from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -68,26 +69,6 @@ def insert_after_character(lst, item):
     return result


-class Polyphonic():
-    def __init__(self):
-        with open(
-                os.path.join(
-                    os.path.dirname(os.path.abspath(__file__)),
-                    'polyphonic.yaml'),
-                'r',
-                encoding='utf-8') as polyphonic_file:
-            # parse the yaml file
-            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
-        self.polyphonic_words = polyphonic_dict["polyphonic"]
-
-    def correct_pronunciation(self, word, pinyin):
-        # if the word is in the dictionary, return the corrected pronunciation
-        if word in self.polyphonic_words.keys():
-            pinyin = self.polyphonic_words[word]
-        # otherwise return the original pronunciation
-        return pinyin
-
-
 class Frontend():
     def __init__(self,
                  g2p_model="g2pW",
@@ -95,7 +76,7 @@ class Frontend():
                  tone_vocab_path=None,
                  use_rhy=False):
         self.punc = ":,;。?!“”‘’':,;.?!"
         self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
         self.phrases_dict = {
             '开户行': [['ka1i'], ['hu4'], ['hang2']],
@@ -567,6 +548,7 @@ class Frontend():
         phones = []
         for c, v in zip(initials, finals):
+            # c for consonant, v for vowel
             # NOTE: post process for pypinyin outputs
             # we discriminate i, ii and iii
             if c and c not in self.punc:
@@ -633,16 +615,19 @@ class Frontend():
                 new_phonemes.append(new_sentence)
             all_phonemes = new_phonemes

+        if merge_sentences:
+            all_phonemes = [sum(all_phonemes, [])]
+
         if print_info:
             print("----------------------------")
             print("text norm results:")
             print(sentences)
             print("----------------------------")
             print("g2p results:")
-            print(all_phonemes[0])
+            print(all_phonemes)
             print("----------------------------")
-        return [sum(all_phonemes, [])]
+        return all_phonemes

     def add_sp_if_no(self, phonemes):
         """

@@ -423,7 +423,7 @@ if __name__ == '__main__':
     segs = frontend.split_by_lang(text)
     print(segs)

-    # SSML xml tags are handled poorly here.
+    # SSML xml tags are handled poorly here. The SSML needs to be parsed first, with the zh/en split done afterwards.
     text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
     print(text)
     # [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]
