Merge pull request #2531 from david-95/hongliang1014

[TTS]Add SSML for Chinese Text Frontend
3 years ago · f5e80cef18
parent 09b45a8b4d ed0138c6e3
commit f5e80cef18
7 changed files with 355 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P

 ## Acknowledgement
 - Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data. 
+- Many thanks to [david-95](https://github.com/david-95) for improving TTS, fixing the multi-punctuation bug, contributing multiple programs and data, and adding SSML support for the Chinese Text Frontend.
 - Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving the TTS Chinese frontend based on [G2PW](https://github.com/GitYCC/g2pW).
 - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
 - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
--- a/README_cn.md
+++ b/README_cn.md
@ -928,7 +928,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块：文本前端、声

 ## 致谢
 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。
- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题，贡献补充多条程序和数据。
+- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题，贡献补充多条程序和数据。新增 SSML 中文文本前端处理。
 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。
 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议，以及在诸多问题上的帮助。
 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
--- a/paddlespeech/t2s/init.py
+++ b/paddlespeech/t2s/init.py
@ -18,5 +18,6 @@ from . import exps
 from . import frontend
 from . import models
 from . import modules
+from . import ssml
 from . import training
 from . import utils
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import math
 import os
+import re
 from pathlib import Path
 from typing import Any
 from typing import Dict
@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.utils.dynamic_import import dynamic_import
+
 # remove [W:onnxruntime: xxx] from ort
 ort.set_default_logger_severity(3)

@ -103,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
    sentences = []
    with open(text_file, 'rt') as f:
        for line in f:
-            items = line.strip().split()
-            utt_id = items[0]
-            if lang == 'zh':
-                sentence = "".join(items[1:])
-            elif lang == 'en':
-                sentence = " ".join(items[1:])
-            elif lang == 'mix':
-                sentence = " ".join(items[1:])
+            if line.strip() != "":
+                items = re.split(r"\s+", line.strip(), 1)
+                utt_id = items[0]
+                if lang == 'zh':
+                    sentence = "".join(items[1:])
+                elif lang == 'en':
+                    sentence = " ".join(items[1:])
+                elif lang == 'mix':
+                    sentence = " ".join(items[1:])
            sentences.append((utt_id, sentence))
    return sentences

@ -180,11 +183,20 @@ def run_frontend(frontend: object,
                 to_tensor: bool=True):
    outs = dict()
    if lang == 'zh':
-        input_ids = frontend.get_input_ids(
-            text,
-            merge_sentences=merge_sentences,
-            get_tone_ids=get_tone_ids,
-            to_tensor=to_tensor)
+        input_ids = {}
+        if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
+                                           re.DOTALL):
+            input_ids = frontend.get_input_ids_ssml(
+                text,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                to_tensor=to_tensor)
+        else:
+            input_ids = frontend.get_input_ids(
+                text,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                to_tensor=to_tensor)
        phone_ids = input_ids["phone_ids"]
        if get_tone_ids:
            tone_ids = input_ids["tone_ids"]
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import os
 import re
+from operator import itemgetter
 from typing import Dict
 from typing import List

@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
 from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
+from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor

 INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@ -81,6 +83,7 @@ class Frontend():
                 g2p_model="g2pW",
                 phone_vocab_path=None,
                 tone_vocab_path=None):
+        self.mix_ssml_processor = MixTextProcessor()
        self.tone_modifier = ToneSandhi()
        self.text_normalizer = TextNormalizer()
        self.punc = "：，；。？！“”‘’':,;.?!"
@ -281,6 +284,65 @@ class Frontend():
            phones_list.append(merge_list)
        return phones_list

+    def _split_word_to_char(self, words):
+        """Return the items of ``words`` as a new list (one entry per character for a str)."""
+        return list(words)
+
+    # when SSML specifies pinyin for a span, assign that pinyin to its characters
+    def _g2p_assign(self,
+                    words: List[str],
+                    pinyin_spec: List[str],
+                    merge_sentences: bool=True) -> List[List[str]]:
+        """Build the phone sequence from explicitly specified pinyin.
+
+        Args:
+            words: Normalized sentences; only ``words[0]`` is used, and it is
+                split into characters that are paired one-to-one with
+                ``pinyin_spec`` (pairing stops at the shorter of the two).
+            pinyin_spec: One pinyin (or punctuation mark) per character.
+            merge_sentences: When True, a trailing ``'sp'`` is removed.
+
+        Returns:
+            A single-element list containing the list of phones.
+        """
+        initials = []
+        finals = []
+
+        # An empty normalization result yields no characters instead of an
+        # IndexError on words[0].
+        chars = self._split_word_to_char(words[0]) if words else []
+        for pinyin, _ in zip(pinyin_spec, chars):
+            pinyin = pinyin.replace("u:", "v")
+            # self.pinyin2phone maps a pinyin to a space-separated
+            # "initial final" string (or a single token).
+            if pinyin in self.pinyin2phone:
+                initial_final_list = self.pinyin2phone[pinyin].split(" ")
+                if len(initial_final_list) == 2:
+                    initials.append(initial_final_list[0])
+                    finals.append(initial_final_list[1])
+                elif len(initial_final_list) == 1:
+                    # Single token: no initial, the only entry is the final.
+                    # (Index 0, not 1 -- index 1 does not exist here.)
+                    initials.append('')
+                    finals.append(initial_final_list[0])
+            else:
+                # If it's not pinyin (possibly punctuation) or no conversion is required
+                initials.append(pinyin)
+                finals.append(pinyin)
+
+        phones = []
+        for c, v in zip(initials, finals):
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            if c and c not in self.punc:
+                phones.append(c)
+            if c and c in self.punc:
+                phones.append('sp')
+            if v and v not in self.punc:
+                phones.append(v)
+
+        # rm the last 'sp' to avoid the noise at the end
+        # cause in the training data, no 'sp' in the end
+        if merge_sentences and phones and phones[-1] == 'sp':
+            phones = phones[:-1]
+        return [phones]
+
    def _merge_erhua(self,
                     initials: List[str],
                     finals: List[str],
@ -396,6 +458,52 @@ class Frontend():
            print("----------------------------")
        return phonemes

+    #@an added for ssml pinyin 
+    def get_phonemes_ssml(self,
+                          ssml_inputs: list,
+                          merge_sentences: bool=True,
+                          with_erhua: bool=True,
+                          robot: bool=False,
+                          print_info: bool=False) -> List[List[str]]:
+        """Convert SSML fragments into one flat phoneme sequence.
+
+        Args:
+            ssml_inputs: Items whose first element is a text fragment and
+                whose second element is its list of specified pinyin (empty
+                when none was specified).
+            merge_sentences: Forwarded to the g2p helpers.
+            with_erhua: Forwarded to ``_g2p`` for fragments without pinyin.
+            robot: When True, every phoneme ending in a tone digit except
+                ``er2`` gets tone ``1``.
+            print_info: When True, print normalization and g2p results.
+
+        Returns:
+            A single-element list holding the phonemes of all fragments
+            concatenated in order.
+        """
+        all_phonemes = []
+        normalized = []  # normalization result of every fragment, for print_info
+        for word_pinyin_item in ssml_inputs:
+            sentence, pinyin_spec = word_pinyin_item[0], word_pinyin_item[1]
+            sentences = self.text_normalizer.normalize(sentence)
+            normalized.append(sentences)
+            if len(pinyin_spec) == 0:
+                phonemes = self._g2p(
+                    sentences,
+                    merge_sentences=merge_sentences,
+                    with_erhua=with_erhua)
+            else:
+                # phonemes come from the pinyin given in the SSML
+                phonemes = self._g2p_assign(
+                    sentences, pinyin_spec, merge_sentences=merge_sentences)
+            all_phonemes.extend(phonemes)
+
+        if robot:
+            new_phonemes = []
+            for sentence in all_phonemes:
+                new_sentence = []
+                for item in sentence:
+                    # `er` only have tone `2`; guard against empty strings
+                    if item and item[-1] in "12345" and item != "er2":
+                        item = item[:-1] + "1"
+                    new_sentence.append(item)
+                new_phonemes.append(new_sentence)
+            all_phonemes = new_phonemes
+
+        if print_info:
+            # Safe for empty input: no unbound name, no index into an empty list.
+            print("----------------------------")
+            print("text norm results:")
+            print(normalized)
+            print("----------------------------")
+            print("g2p results:")
+            print(all_phonemes[0] if all_phonemes else [])
+            print("----------------------------")
+        return [sum(all_phonemes, [])]
+
    def get_input_ids(self,
                      sentence: str,
                      merge_sentences: bool=True,
@ -405,6 +513,7 @@ class Frontend():
                      add_blank: bool=False,
                      blank_token: str="<pad>",
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+
        phonemes = self.get_phonemes(
            sentence,
            merge_sentences=merge_sentences,
@ -437,3 +546,49 @@ class Frontend():
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
+
+    # @an added for ssml
+    def get_input_ids_ssml(
+            self,
+            sentence: str,
+            merge_sentences: bool=True,
+            get_tone_ids: bool=False,
+            robot: bool=False,
+            print_info: bool=False,
+            add_blank: bool=False,
+            blank_token: str="<pad>",
+            to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+        """Turn text containing a ``<speak>`` block into phone (and tone) ids.
+
+        The text is split by ``MixTextProcessor.get_pinyin_split``, converted
+        to phonemes by ``get_phonemes_ssml``, and each phoneme group is then
+        mapped to ids. The result has a ``"tone_ids"`` and/or ``"phone_ids"``
+        key only when at least one id sequence was produced for it.
+        """
+        segments = MixTextProcessor.get_pinyin_split(sentence)
+        phoneme_groups = self.get_phonemes_ssml(
+            segments,
+            merge_sentences=merge_sentences,
+            robot=robot,
+            print_info=print_info)
+
+        collected_tone_ids = []
+        collected_phone_ids = []
+        for group in phoneme_groups:
+            phones, tones = self._get_phone_tone(
+                group, get_tone_ids=get_tone_ids)
+            if add_blank:
+                phones = insert_after_character(phones, blank_token)
+            if tones:
+                ids = self._t2id(tones)
+                collected_tone_ids.append(
+                    paddle.to_tensor(ids) if to_tensor else ids)
+            if phones:
+                ids = self._p2id(phones)
+                # to_tensor can be switched off by callers that want to
+                # skip paddle.to_tensor() (e.g. when running with onnxruntime)
+                collected_phone_ids.append(
+                    paddle.to_tensor(ids) if to_tensor else ids)
+
+        result = {}
+        if collected_tone_ids:
+            result["tone_ids"] = collected_tone_ids
+        if collected_phone_ids:
+            result["phone_ids"] = collected_phone_ids
+        return result
--- a/paddlespeech/t2s/ssml/init.py
+++ b/paddlespeech/t2s/ssml/init.py
@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .xml_processor import *
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+import re
+import xml.dom.minidom
+import xml.parsers.expat
+from xml.dom.minidom import Node
+from xml.dom.minidom import parseString
+'''
+Note:  xml 有5种特殊字符， &<>"'
+其一，采用<![CDATA[ ]]>特殊标签，将包含特殊字符的字符串封装起来。
+例如：
+<TitleName><![CDATA["姓名"]]></TitleName>
+其二，使用XML转义序列表示这些特殊的字符，这5个特殊字符所对应XML转义序列为：
+&  &amp;
+<  &lt;
+>  &gt;
+"  &quot;
+'  &apos;
+例如：
+<TitleName>&quot;姓名&quot;</TitleName>
+
+'''
+
+
+class MixTextProcessor():
+    """Split text that mixes plain content with a ``<speak>...</speak>`` block."""
+
+    # A single <speak> element, matched non-greedily across newlines.
+    _XML_PATTERN = re.compile(r"<speak>.*?</speak>", re.M | re.S)
+    # Groups: text before the block, the block itself, text after the block.
+    _SPLIT_PATTERN = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$',
+                                re.M | re.S)
+
+    def __repr__(self):
+        # __repr__ must return a str; printing and returning None makes
+        # repr() raise TypeError.
+        return type(self).__name__ + "()"
+
+    def get_xml_content(self, mixstr):
+        '''Return the first ``<speak>...</speak>`` substring, or None if absent.'''
+        found = self._XML_PATTERN.search(mixstr)
+        return found.group(0) if found else None
+
+    def get_content_split(self, mixstr):
+        '''Split the text into [before, xml, after] around the <speak> block.
+
+        Whitespace is kept because XML tag attributes contain spaces.
+        When no <speak> block is present, the result is ``[mixstr]``.
+        '''
+        mat = self._SPLIT_PATTERN.match(mixstr)
+        if mat:
+            return [mat.group(1), mat.group(2), mat.group(3)]
+        return [mixstr]
+
+    @classmethod
+    def get_pinyin_split(cls, mixstr):
+        '''Split the text into ``[text, pinyins]`` pairs.
+
+        Text outside the <speak> block gets an empty pinyin list; the block
+        itself is expanded by ``DomXml.get_pinyins_for_xml``. When no <speak>
+        block is present, the result is ``[[mixstr, []]]``.
+        '''
+        mat = cls._SPLIT_PATTERN.match(mixstr)
+        if not mat:
+            return [[mixstr, []]]
+
+        pre_xml, in_xml, after_xml = mat.group(1), mat.group(2), mat.group(3)
+        ctlist = [[pre_xml, []]]
+        ctlist.extend(DomXml(in_xml).get_pinyins_for_xml())
+        ctlist.append([after_xml, []])
+        return ctlist
+
+
+class DomXml():
+    """Wrapper around the minidom document parsed from an XML string."""
+
+    def __init__(self, xmlstr):
+        self.tdom = parseString(xmlstr)  #Document
+        self.root = self.tdom.documentElement  #Element
+        self.rnode = self.tdom.childNodes  #NodeList
+
+    def _collect_text(self):
+        '''Gather text found up to two levels below the document's children.
+
+        Non-text nodes at the deepest visited level are only reported via print.
+        '''
+        res = []
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                # minidom Text nodes expose their content as .data
+                res.append(x1.data)
+                continue
+            for x2 in x1.childNodes:
+                if isinstance(x2, xml.dom.minidom.Text):
+                    res.append(x2.data)
+                    continue
+                for x3 in x2.childNodes:
+                    if isinstance(x3, xml.dom.minidom.Text):
+                        res.append(x3.data)
+                    else:
+                        print("len(nodes of x3):", len(x3.childNodes))
+        return res
+
+    def get_text(self):
+        '''Return a list of all text content in the XML.'''
+        return self._collect_text()
+
+    def get_xmlchild_list(self):
+        '''Return (and print) a list of all text content, without tags.'''
+        res = self._collect_text()
+        print(res)
+        return res
+
+    def get_pinyins_for_xml(self):
+        '''Return a list of ``[text, pinyins]`` pairs for the XML content.
+
+        Whitespace is stripped from every text. ``pinyins`` is the
+        whitespace-split ``pinyin`` attribute of the enclosing element, or an
+        empty list when the text is not inside an element with that attribute.
+        '''
+        res = []
+
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                res.append([re.sub(r"\s+", "", x1.data), []])
+                continue
+            for x2 in x1.childNodes:
+                if isinstance(x2, xml.dom.minidom.Text):
+                    res.append([re.sub(r"\s+", "", x2.data), []])
+                    continue
+                if x2.nodeType != Node.ELEMENT_NODE:
+                    # comments / processing instructions have no attributes
+                    continue
+                # Reset per element so an element without a pinyin attribute
+                # neither fails on an unbound name nor inherits the pinyin of
+                # a previous element.
+                pinyins = []
+                if x2.hasAttribute('pinyin'):
+                    # split() without argument tolerates repeated spaces
+                    pinyins = x2.getAttribute("pinyin").split()
+                for x3 in x2.childNodes:
+                    if isinstance(x3, xml.dom.minidom.Text):
+                        t = re.sub(r"\s+", "", x3.data)
+                        res.append([t, pinyins])
+                    else:
+                        print("len(nodes of x3):", len(x3.childNodes))
+
+        return res
+
+    def get_all_tags(self, tag_name):
+        '''Print every ``tag_name`` element that has a pinyin attribute.'''
+        alltags = self.root.getElementsByTagName(tag_name)
+        for x in alltags:
+            if x.hasAttribute('pinyin'):  # pinyin
+                # an empty element has no first child; print '' for its text
+                text = getattr(x.firstChild, "data", "")
+                print(x.tagName, 'pinyin', x.getAttribute('pinyin'), text)