fix long text oom using ssml; filter comma; update polyphonic

pull/3316/head
Hui Zhang 1 year ago
parent 108e73e1a0
commit d53c499447

@ -0,0 +1,13 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -117,7 +117,7 @@ def evaluate(args):
sentences = get_sentences(text_file=args.text, lang=args.lang)
for utt_id, sentence in sentences:
print(f"{utt_id} {sentence} ...")
print(f"{utt_id} {sentence}")
with timer() as t:
if am_name == "diffsinger":
text = ""
@ -135,7 +135,7 @@ def evaluate(args):
lang=args.lang,
svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
# pprint(f"process: {utt_id} {phone_ids}")
# pprint(f"{utt_id} {phone_ids}")
with paddle.no_grad():
flags = 0

@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]):
class CantonFrontend():
def __init__(self, phone_vocab_path: str):
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
self.vocab_phones = {}
if phone_vocab_path:

@ -1 +1,14 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .phonectic import English

@ -106,76 +106,95 @@ class MixFrontend():
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
''' 1. 添加SSML支持先列出 文字 和 <say-as>标签内容,
然后添加到tmpSegments数组里
'''
d_inputs = MixTextProcessor.get_dom_split(sentence)
tmpSegments = []
for instr in d_inputs:
''' 暂时只支持 say-as '''
if instr.lower().startswith("<say-as"):
tmpSegments.append((instr, "zh"))
# XML Document Object Model (DOM)
doms = MixTextProcessor.get_dom_split(sentence)
lang_splits = []
for dom in doms:
if dom.lower().startswith("<say-as pinyin="):
# `<say-as pinyin=` for zh lang
lang_splits.append((dom, "zh"))
else:
tmpSegments.extend(self.split_by_lang(instr))
''' 2. 把zh的merge到一起避免合成结果中间停顿
'''
# process zh, en and zh/en
lang_splits.extend(self.split_by_lang(dom))
# merge adjacent zh segment
segments = []
currentSeg = ["", ""]
for seg in tmpSegments:
for seg in lang_splits:
if seg[1] == "en" or seg[1] == "other":
if currentSeg[0] == '':
# first see
segments.append(seg)
else:
# zh
currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
segments.append(tuple(currentSeg))
# en
segments.append(seg)
# reset
currentSeg = ["", ""]
else:
# zh
if currentSeg[0] == '':
# first see
currentSeg[0] = seg[0]
currentSeg[1] = seg[1]
else:
# merge zh
currentSeg[0] = currentSeg[0] + seg[0]
if currentSeg[0] != '':
# last zh
currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
segments.append(tuple(currentSeg))
phones_list = []
result = {}
# 008 我们要去云南 team building, 非常非常 happy.
# seg ('我们要去云南 ', 'zh')
# seg ('team building, ', 'en')
# seg ('非常非常 ', 'zh')
# seg ('happy.', 'en')
# [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
for seg in segments:
content = seg[0]
lang = seg[1]
if content != '':
if lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=False, to_tensor=to_tensor)
if not content:
continue
if lang == "en":
input_ids = self.en_frontend.get_input_ids(
content, merge_sentences=False, to_tensor=to_tensor)
else:
if content.strip() != "" and \
re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
# process ssml
input_ids = self.zh_frontend.get_input_ids_ssml(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
else:
''' 3. 把带speak tag的中文和普通文字分开处理
'''
if content.strip() != "" and \
re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
input_ids = self.zh_frontend.get_input_ids_ssml(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
else:
input_ids = self.zh_frontend.get_input_ids(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
if add_sp:
if to_tensor:
input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
else:
input_ids["phone_ids"][-1] = np.concatenate(
(input_ids["phone_ids"][-1], self.sp_id_numpy))
# process plain text
input_ids = self.zh_frontend.get_input_ids(
content,
merge_sentences=False,
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
if add_sp:
# add sp between zh and en
if to_tensor:
input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
else:
input_ids["phone_ids"][-1] = np.concatenate(
(input_ids["phone_ids"][-1], self.sp_id_numpy))
for phones in input_ids["phone_ids"]:
phones_list.append(phones)
phones_list.extend(input_ids["phone_ids"])
if merge_sentences:
merge_list = paddle.concat(phones_list)

@ -55,7 +55,7 @@ class English(Phonetics):
self.punctuations = get_punctuations("en")
self.vocab = Vocab(self.phonemes + self.punctuations)
self.vocab_phones = {}
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
with open(phone_vocab_path, 'rt', encoding='utf-8') as f:

@ -0,0 +1,38 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import yaml
class Polyphonic():
    """Pinyin corrector for polyphonic Chinese words.

    Loads a word -> pinyin override table from ``polyphonic.yaml`` located
    next to this module, and substitutes the listed pronunciation when a
    word is found in the table.
    """

    def __init__(self):
        # The yaml file ships with the package, next to this source file.
        with open(
                os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'polyphonic.yaml'),
                'r',
                encoding='utf-8') as polyphonic_file:
            # safe_load is sufficient (plain mappings/lists) and avoids
            # the arbitrary-object construction risk of FullLoader.
            polyphonic_dict = yaml.safe_load(polyphonic_file)
        # mapping: word (str) -> list of pinyin syllables (list[str])
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        """Return the dictionary pronunciation for *word* if it is listed,
        otherwise return *pinyin* unchanged.

        Args:
            word: the Chinese word to look up.
            pinyin: the pinyin produced by the default g2p pass.
        """
        # Membership test directly on the dict (no .keys()); the stray
        # debug print statements from the original are removed.
        if word in self.polyphonic_words:
            pinyin = self.polyphonic_words[word]
        return pinyin

@ -48,4 +48,7 @@ polyphonic:
: ['ai4']
扎实: ['zha1','shi2']
干将: ['gan4','jiang4']
陈威行: ['chen2', 'wei1', 'hang2']
陈威行: ['chen2', 'wei1', 'hang2']
郭晟: ['guo1', 'sheng4']
中标: ['zhong4', 'biao1']
抗住: ['kang2', 'zhu4']

@ -29,7 +29,7 @@ class SingFrontend():
pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
"""
self.punc = '[:,;。?!“”‘’\':,;.?!]'
self.punc = '[:,;。?!“”‘’\':,;.?!]'
self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
if pinyin_phone_path:

@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@ -1,4 +1,17 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
@ -68,7 +81,8 @@ class MixTextProcessor():
after_xml = mat.group(3)
# pre with none syllable
ctlist.append([pre_xml, []])
if pre_xml:
ctlist.append([pre_xml, []])
# between with syllable
# [(sub sentence, [syllables]), ...]
@ -77,9 +91,11 @@ class MixTextProcessor():
ctlist = ctlist + pinyinlist
# post with none syllable
ctlist.append([after_xml, []])
if after_xml:
ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
return ctlist
@classmethod
@ -94,15 +110,18 @@ class MixTextProcessor():
in_xml = mat.group(2)
after_xml = mat.group(3)
ctlist.append(pre_xml)
if pre_xml:
ctlist.append(pre_xml)
dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags)
ctlist.append(after_xml)
return ctlist
if after_xml:
ctlist.append(after_xml)
else:
ctlist.append(mixstr)
return ctlist

@ -68,9 +68,9 @@ class ToneSandhi():
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
'考考', '整整', '莘莘', '落地', '算子', '家家户户'
'考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
def _split_word(self, word: str) -> List[str]:
word_list = jieba.cut_for_search(word)

@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.polyphonic import Polyphonic
from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@ -68,26 +69,6 @@ def insert_after_character(lst, item):
return result
class Polyphonic():
def __init__(self):
with open(
os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'polyphonic.yaml'),
'r',
encoding='utf-8') as polyphonic_file:
# 解析yaml
polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
self.polyphonic_words = polyphonic_dict["polyphonic"]
def correct_pronunciation(self, word, pinyin):
# 词汇被词典收录则返回纠正后的读音
if word in self.polyphonic_words.keys():
pinyin = self.polyphonic_words[word]
# 否则返回原读音
return pinyin
class Frontend():
def __init__(self,
g2p_model="g2pW",
@ -95,7 +76,7 @@ class Frontend():
tone_vocab_path=None,
use_rhy=False):
self.punc = ":,;。?!“”‘’':,;.?!"
self.punc = ":,;。?!“”‘’':,;.?!"
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
self.phrases_dict = {
'开户行': [['ka1i'], ['hu4'], ['hang2']],
@ -567,6 +548,7 @@ class Frontend():
phones = []
for c, v in zip(initials, finals):
# c for consonant, v for vowel
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c and c not in self.punc:
@ -633,16 +615,19 @@ class Frontend():
new_phonemes.append(new_sentence)
all_phonemes = new_phonemes
if merge_sentences:
all_phonemes = [sum(all_phonemes, [])]
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
print(all_phonemes[0])
print(all_phonemes)
print("----------------------------")
return [sum(all_phonemes, [])]
return all_phonemes
def add_sp_if_no(self, phonemes):
"""

@ -423,7 +423,7 @@ if __name__ == '__main__':
segs = frontend.split_by_lang(text)
print(segs)
# 对于SSML的xml标记处理不好。
# 对于SSML的xml标记处理不好。需要先解析SSML后处理中英的划分。
text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
print(text)
# [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]

Loading…
Cancel
Save