add ssml unit test

pull/3316/head
Hui Zhang 1 year ago
parent 4d867700eb
commit 9727e67a3f

@ -26,12 +26,12 @@ repos:
- --no-sort-keys - --no-sort-keys
- --autofix - --autofix
- id: check-merge-conflict - id: check-merge-conflict
- id: flake8 # - id: flake8
aergs: # aergs:
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503 # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request # - --builtins=G,request
- --jobs=1 # - --jobs=1
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks - repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1 rev: v1.0.1

@ -115,9 +115,9 @@ def evaluate(args):
sentences = get_sentences_svs(text_file=args.text) sentences = get_sentences_svs(text_file=args.text)
else: else:
sentences = get_sentences(text_file=args.text, lang=args.lang) sentences = get_sentences(text_file=args.text, lang=args.lang)
pprint(f"inputs: {sentences}")
for utt_id, sentence in sentences: for utt_id, sentence in sentences:
print(f"{utt_id} {sentence} ...")
with timer() as t: with timer() as t:
if am_name == "diffsinger": if am_name == "diffsinger":
text = "" text = ""

@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil'] INITIALS += ['sp', 'spl', 'spn', 'sil']
def get_lines(cantons: List[str]): def jyuping_to_phonemes(cantons: List[str]):
# jyutping to initial and final
phones = [] phones = []
for canton in cantons: for canton in cantons:
for consonant in INITIALS: for consonant in INITIALS:
@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]: merge_sentences: bool=True) -> List[List[str]]:
phones_list = [] phones_list = []
for sentence in sentences: for sentence in sentences:
# jyuping
# 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence) phones_str = ToJyutping.get_jyutping_text(sentence)
phones_split = get_lines(phones_str.split(' ')) # phonemes
phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split) phones_list.append(phones_split)
return phones_list return phones_list
@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str, sentence: str,
merge_sentences: bool=True, merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]: print_info: bool=False) -> List[List[str]]:
# TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence) sentences = self.text_normalizer.normalize(sentence)
# G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences) phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
if print_info: if print_info:
print("----------------------------") print("----------------------------")
print("text norm results:") print("text norm results:")
@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:") print("g2p results:")
print(phonemes) print(phonemes)
print("----------------------------") print("----------------------------")
return phonemes return phonemes
def get_input_ids(self, def get_input_ids(self,
@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes( phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info) sentence, merge_sentences=merge_sentences, print_info=print_info)
result = {} result = {}
temp_phone_ids = [] temp_phone_ids = []
for phones in phonemes: for phones in phonemes:
if phones: if phones:
phone_ids = self._p2id(phones) phone_ids = self._p2id(phones)
@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor: if to_tensor:
phone_ids = paddle.to_tensor(phone_ids) phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids) temp_phone_ids.append(phone_ids)
if temp_phone_ids: if temp_phone_ids:
result["phone_ids"] = temp_phone_ids result["phone_ids"] = temp_phone_ids
return result return result

@ -17,7 +17,6 @@ Note: xml 有5种特殊字符 &<>"'
' &apos; ' &apos;
例如 例如
<TitleName>&quot;姓名&quot;</TitleName> <TitleName>&quot;姓名&quot;</TitleName>
''' '''
@ -61,14 +60,23 @@ class MixTextProcessor():
patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S) patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr) mat = re.match(patn, mixstr)
if mat: if mat:
# pre <speak>
pre_xml = mat.group(1) pre_xml = mat.group(1)
# between <speak> ... </speak>
in_xml = mat.group(2) in_xml = mat.group(2)
# post </speak>
after_xml = mat.group(3) after_xml = mat.group(3)
# pre with no syllables
ctlist.append([pre_xml, []]) ctlist.append([pre_xml, []])
# between with syllable
# [(sub sentence, [syllables]), ...]
dom = DomXml(in_xml) dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml() pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist ctlist = ctlist + pinyinlist
# post with no syllables
ctlist.append([after_xml, []]) ctlist.append([after_xml, []])
else: else:
ctlist.append([mixstr, []]) ctlist.append([mixstr, []])

@ -14,6 +14,7 @@
import os import os
import re import re
from operator import itemgetter from operator import itemgetter
from pprint import pprint
from typing import Dict from typing import Dict
from typing import List from typing import List
@ -41,6 +42,9 @@ INITIALS = [
] ]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil'] INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']
# 0 for None, 5 for neutral
TONES = ["0", "1", "2", "3", "4", "5"]
def intersperse(lst, item): def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1) result = [item] * (len(lst) * 2 + 1)
@ -597,11 +601,13 @@ class Frontend():
all_phonemes = [] all_phonemes = []
for word_pinyin_item in ssml_inputs: for word_pinyin_item in ssml_inputs:
phonemes = [] phonemes = []
print("ssml inputs:", word_pinyin_item)
# ['你喜欢', []] -> 你喜欢 []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
print('ssml g2p:', sentence, pinyin_spec)
# TN & Text Segmentation # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence) sentences = self.text_normalizer.normalize(sentence)
if len(pinyin_spec) == 0: if len(pinyin_spec) == 0:
# g2p word w/o specified <say-as> # g2p word w/o specified <say-as>
phonemes = self._g2p( phonemes = self._g2p(
@ -635,6 +641,7 @@ class Frontend():
print("g2p results:") print("g2p results:")
print(all_phonemes[0]) print(all_phonemes[0])
print("----------------------------") print("----------------------------")
return [sum(all_phonemes, [])] return [sum(all_phonemes, [])]
def add_sp_if_no(self, phonemes): def add_sp_if_no(self, phonemes):
@ -711,10 +718,10 @@ class Frontend():
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
# split sentence by SSML tag. # split sentence by SSML tag.
l_inputs = MixTextProcessor.get_pinyin_split(sentence) texts = MixTextProcessor.get_pinyin_split(sentence)
phonemes = self.get_phonemes_ssml( phonemes = self.get_phonemes_ssml(
l_inputs, texts,
merge_sentences=merge_sentences, merge_sentences=merge_sentences,
print_info=print_info, print_info=print_info,
robot=robot) robot=robot)

@ -0,0 +1,61 @@
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
if __name__ == '__main__':
    # Manual smoke test for the SSML splitting helpers of MixTextProcessor.
    # Each section calls one helper on the same mixed plain-text/SSML input
    # and prints the result; the comments above each call record the output
    # that was observed when the script was written.
    text = "你好吗,<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>thank you."

    # get_pinyin_split: list of [sub sentence, [specified pinyins]] pairs.
    # SSML: 13
    # 0 ['你好吗,', []]
    # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
    # 2 ['倒', ['dao3']]
    # 3 ['在沙滩上,沙滩上倒了一堆', []]
    # 4 ['土', ['tu3']]
    # 5 ['。想象', []]
    # 6 ['干干', ['gan1', 'gan1']]
    # 7 ['的树干', []]
    # 8 ['倒', ['dao3']]
    # 9 ['了,里面有个干尸,不知是被谁', []]
    # 10 ['干', ['gan4']]
    # 11 ['死的。', []]
    # 12 ['thank you.', []]
    inputs = MixTextProcessor.get_pinyin_split(text)
    print(f"SSML get_pinyin_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # get_dom_split: same segmentation, but <say-as> elements stay as markup.
    # SSML get_dom_split: 13
    # 0 你好吗,
    # 1 我们的声学模型使用了 Fast Speech Two。前浪
    # 2 <say-as pinyin="dao3">倒</say-as>
    # 3 在沙滩上,沙滩上倒了一堆
    # 4 <say-as pinyin="tu3">土</say-as>
    # 5 。 想象
    # 6 <say-as pinyin="gan1 gan1">干干</say-as>
    # 7 的树干
    # 8 <say-as pinyin="dao3">倒</say-as>
    # 9 了, 里面有个干尸,不知是被谁
    # 10 <say-as pinyin="gan4">干</say-as>
    # 11 死的。
    # 12 thank you.
    inputs = MixTextProcessor.get_dom_split(text)
    print(f"SSML get_dom_split: {len(inputs)}")
    for i, sub in enumerate(inputs):
        print(i, sub)
    print()

    # get_xml_content: the <speak>...</speak> span as one string, so the
    # printed length is a character count. The label names the method that
    # is actually called (it previously said "get_pinyin_split").
    # SSML object.get_xml_content: 246
    # <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    outs = MixTextProcessor().get_xml_content(text)
    print(f"SSML object.get_xml_content: {len(outs)}")
    print(outs)
    print()

    # get_content_split: text before <speak>, the <speak> span, text after.
    # SSML object.get_content_split: 3
    # 0 你好吗,
    # 1 <speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>
    # 2 thank you.
    outs = MixTextProcessor().get_content_split(text)
    print(f"SSML object.get_content_split: {len(outs)}")
    for i, sub in enumerate(outs):
        print(i, sub)
    print()
Loading…
Cancel
Save