diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f72b44ac..44bbd5ca 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,12 +26,12 @@ repos:
- --no-sort-keys
- --autofix
- id: check-merge-conflict
- - id: flake8
- aergs:
- - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- - --builtins=G,request
- - --jobs=1
- exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+ # - id: flake8
+ # aergs:
+ # - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
+ # - --builtins=G,request
+ # - --jobs=1
+ # exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index f7821384..93146df0 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -115,9 +115,9 @@ def evaluate(args):
sentences = get_sentences_svs(text_file=args.text)
else:
sentences = get_sentences(text_file=args.text, lang=args.lang)
- pprint(f"inputs: {sentences}")
for utt_id, sentence in sentences:
+ print(f"{utt_id} {sentence} ...")
with timer() as t:
if am_name == "diffsinger":
text = ""
diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py
index f2c7175f..dd3484d5 100644
--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil']
-def get_lines(cantons: List[str]):
+def jyuping_to_phonemes(cantons: List[str]):
+ # jyuping to inital and final
phones = []
for canton in cantons:
for consonant in INITIALS:
@@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
for sentence in sentences:
+ # jyuping
+ # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence)
- phones_split = get_lines(phones_str.split(' '))
+ # phonemes
+ phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split)
return phones_list
@@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str,
merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]:
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+ # G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
+
if print_info:
print("----------------------------")
print("text norm results:")
@@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:")
print(phonemes)
print("----------------------------")
+
return phonemes
def get_input_ids(self,
@@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info)
+
result = {}
temp_phone_ids = []
-
for phones in phonemes:
if phones:
phone_ids = self._p2id(phones)
@@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
diff --git a/paddlespeech/t2s/frontend/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py
index 3e713d5d..3a5177d1 100644
--- a/paddlespeech/t2s/frontend/ssml/xml_processor.py
+++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py
@@ -17,7 +17,6 @@ Note: xml 有5种特殊字符, &<>"'
' '
例如:
"姓名"
-
'''
@@ -61,14 +60,23 @@ class MixTextProcessor():
patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
+ # pre
pre_xml = mat.group(1)
+ # between ...
in_xml = mat.group(2)
+ # post
after_xml = mat.group(3)
+ # pre with none syllable
ctlist.append([pre_xml, []])
+
+ # between with syllable
+ # [(sub sentence, [syllables]), ...]
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
+
+ # post with none syllable
ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 498a09fa..2e0b2ffd 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -14,6 +14,7 @@
import os
import re
from operator import itemgetter
+from pprint import pprint
from typing import Dict
from typing import List
@@ -41,6 +42,9 @@ INITIALS = [
]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']
+# 0 for None, 5 for neutral
+TONES = ["0", "1", "2", "3", "4", "5"]
+
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
@@ -597,11 +601,13 @@ class Frontend():
all_phonemes = []
for word_pinyin_item in ssml_inputs:
phonemes = []
- print("ssml inputs:", word_pinyin_item)
+
+ # ['你喜欢', []] -> 你喜欢 []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
- print('ssml g2p:', sentence, pinyin_spec)
+
# TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+
if len(pinyin_spec) == 0:
# g2p word w/o specified
phonemes = self._g2p(
@@ -635,6 +641,7 @@ class Frontend():
print("g2p results:")
print(all_phonemes[0])
print("----------------------------")
+
return [sum(all_phonemes, [])]
def add_sp_if_no(self, phonemes):
@@ -711,10 +718,10 @@ class Frontend():
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
# split setence by SSML tag.
- l_inputs = MixTextProcessor.get_pinyin_split(sentence)
+ texts = MixTextProcessor.get_pinyin_split(sentence)
phonemes = self.get_phonemes_ssml(
- l_inputs,
+ texts,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py
new file mode 100644
index 00000000..382558a4
--- /dev/null
+++ b/tests/unit/tts/test_ssml.py
@@ -0,0 +1,61 @@
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
+
+if __name__ == '__main__':
+ text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。thank you."
+
+ # SSML: 13
+ # 0 ['你好吗,', []]
+ # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
+ # 2 ['倒', ['dao3']]
+ # 3 ['在沙滩上,沙滩上倒了一堆', []]
+ # 4 ['土', ['tu3']]
+ # 5 ['。想象', []]
+ # 6 ['干干', ['gan1', 'gan1']]
+ # 7 ['的树干', []]
+ # 8 ['倒', ['dao3']]
+ # 9 ['了,里面有个干尸,不知是被谁', []]
+ # 10 ['干', ['gan4']]
+ # 11 ['死的。', []]
+ # 12 ['thank you.', []]
+ inputs = MixTextProcessor.get_pinyin_split(text)
+ print(f"SSML get_pinyin_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+ # SSML get_dom_split: 13
+ # 0 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪
+ # 2 倒
+ # 3 在沙滩上,沙滩上倒了一堆
+ # 4 土
+ # 5 。 想象
+ # 6 干干
+ # 7 的树干
+ # 8 倒
+ # 9 了, 里面有个干尸,不知是被谁
+ # 10 干
+ # 11 死的。
+ # 12 thank you.
+ inputs = MixTextProcessor.get_dom_split(text)
+ print(f"SSML get_dom_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+ # SSML object.get_pinyin_split: 246
+ # 我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干倒了, 里面有个干尸,不知是被谁干死的。
+ outs = MixTextProcessor().get_xml_content(text)
+ print(f"SSML object.get_pinyin_split: {len(outs)}")
+ print(outs)
+ print()
+
+ # SSML object.get_content_split: 30 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪倒在沙滩上,沙滩上倒了一堆土。 想象干干的树干
+ # 倒了, 里面有个干尸,不知是被谁干死的。
+ # 2 thank you.
+ outs = MixTextProcessor().get_content_split(text)
+ print(f"SSML object.get_content_split: {len(outs)}")
+ for i, sub in enumerate(outs):
+ print(i, sub)
+ print()