Merge pull request #2531 from david-95/hongliang1014

[TTS] Add SSML for Chinese Text Frontend
TianYuan 2 years ago committed by GitHub
commit f5e80cef18

@@ -923,7 +923,7 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding the [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) for improving TTS, fixing the multi-punctuation bug, contributing multiple programs and data, and adding SSML support to the Chinese text frontend.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving the TTS Chinese frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR on [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.

@@ -928,7 +928,7 @@ PaddleSpeech's **speech synthesis** mainly consists of three modules: text frontend, acoustic
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding the [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) for fixing the bug with multiple sentence-final punctuation marks, contributing additional programs and data, and adding SSML handling to the Chinese text frontend.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) for optimizing the TTS Chinese text frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention and constructive advice, and for help on many issues.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementations of ASR for [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio using PaddleSpeech.

@@ -18,5 +18,6 @@ from . import exps
from . import frontend
from . import models
from . import modules
from . import ssml
from . import training
from . import utils

@@ -13,6 +13,7 @@
# limitations under the License.
import math
import os
import re
from pathlib import Path
from typing import Any
from typing import Dict
@@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import

# remove [W:onnxruntime: xxx] warnings from onnxruntime
ort.set_default_logger_severity(3)
@@ -103,14 +105,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
    sentences = []
    with open(text_file, 'rt') as f:
        for line in f:
            if line.strip() != "":
                # split only on the first whitespace run, so the sentence
                # itself may contain spaces (e.g. inside SSML tags)
                items = re.split(r"\s+", line.strip(), maxsplit=1)
                utt_id = items[0]
                if lang == 'zh':
                    sentence = "".join(items[1:])
                elif lang == 'en':
                    sentence = " ".join(items[1:])
                elif lang == 'mix':
                    sentence = " ".join(items[1:])
                sentences.append((utt_id, sentence))
    return sentences
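# Illustrative sketch, not part of the patch: the expected input is one
# "utt_id<whitespace>sentence" pair per line; the file name and the
# <say-as pinyin="..."> tag below are hypothetical examples.
with open("sentences.txt", "wt") as f:
    f.write("001 凯莫瑞安联合体的经济崩溃迫在眉睫。\n")
    f.write('002 <speak>有<say-as pinyin="lv4 se4">绿色</say-as>的句子</speak>\n')
sentences = get_sentences("sentences.txt", lang='zh')
# -> [('001', '凯莫瑞安联合体的经济崩溃迫在眉睫。'), ('002', '<speak>...</speak>')]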
@@ -180,11 +183,20 @@ def run_frontend(frontend: object,
                 to_tensor: bool=True):
    outs = dict()
    if lang == 'zh':
        input_ids = {}
        # texts containing a <speak>...</speak> block are routed to the SSML frontend
        if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
                                           re.DOTALL):
            input_ids = frontend.get_input_ids_ssml(
                text,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids,
                to_tensor=to_tensor)
        else:
            input_ids = frontend.get_input_ids(
                text,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids,
                to_tensor=to_tensor)
        phone_ids = input_ids["phone_ids"]
        if get_tone_ids:
            tone_ids = input_ids["tone_ids"]
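# Illustrative call, not part of the patch; `frontend` stands for a zh
# Frontend instance built elsewhere in this file, and the <speak> wrapper
# is what triggers the SSML branch above:
#   text = '<speak>请把<say-as pinyin="lv4">绿</say-as>读对。</speak>'
#   outs = run_frontend(frontend=frontend, text=text, merge_sentences=True,
#                       get_tone_ids=False, lang='zh')
#   phone_ids = outs["phone_ids"]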

@@ -13,6 +13,7 @@
# limitations under the License.
import os
import re
from operator import itemgetter
from typing import Dict
from typing import List
@@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor

INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@@ -81,6 +83,7 @@ class Frontend():
                 g2p_model="g2pW",
                 phone_vocab_path=None,
                 tone_vocab_path=None):
        self.mix_ssml_processor = MixTextProcessor()
        self.tone_modifier = ToneSandhi()
        self.text_normalizer = TextNormalizer()
        self.punc = ":,;。?!“”‘’':,;.?!"
@@ -281,6 +284,65 @@ class Frontend():
            phones_list.append(merge_list)
        return phones_list

    def _split_word_to_char(self, words):
        # split a word string into a list of single characters
        return [x for x in words]

    # if SSML specifies pinyin for a span, assign that pinyin to its characters
    def _g2p_assign(self,
                    words: List[str],
                    pinyin_spec: List[str],
                    merge_sentences: bool=True) -> List[List[str]]:
        phones_list = []
        initials = []
        finals = []
        words = self._split_word_to_char(words[0])
        for pinyin, char in zip(pinyin_spec, words):
            sub_initials = []
            sub_finals = []
            pinyin = pinyin.replace("u:", "v")
            # self.pinyin2phone maps each pinyin syllable to its
            # initial (sheng_mu) and final (yun_mu)
            if pinyin in self.pinyin2phone:
                initial_final_list = self.pinyin2phone[pinyin].split(" ")
                if len(initial_final_list) == 2:
                    sub_initials.append(initial_final_list[0])
                    sub_finals.append(initial_final_list[1])
                elif len(initial_final_list) == 1:
                    sub_initials.append('')
                    # a single entry is the final; index 0, not 1
                    # (index 1 would raise IndexError here)
                    sub_finals.append(initial_final_list[0])
            else:
                # not pinyin (possibly punctuation), or no conversion is required
                sub_initials.append(pinyin)
                sub_finals.append(pinyin)
            initials.append(sub_initials)
            finals.append(sub_finals)

        initials = sum(initials, [])
        finals = sum(finals, [])
        phones = []
        for c, v in zip(initials, finals):
            # NOTE: post-processing of pypinyin outputs;
            # we discriminate i, ii and iii
            if c and c not in self.punc:
                phones.append(c)
            if c and c in self.punc:
                phones.append('sp')
            if v and v not in self.punc:
                phones.append(v)
        phones_list.append(phones)
        if merge_sentences:
            merge_list = sum(phones_list, [])
            # remove the trailing 'sp' to avoid noise at the end:
            # the training data has no 'sp' at sentence end
            if merge_list[-1] == 'sp':
                merge_list = merge_list[:-1]
            phones_list = []
            phones_list.append(merge_list)
        return phones_list
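# Standalone sketch of the pinyin -> phone assignment above (illustrative,
# not part of the patch); the pinyin2phone table contents are hypothetical:
pinyin2phone = {"lv4": "l v4", "se4": "s e4"}

def assign_phones(pinyins):
    phones = []
    for py in pinyins:
        py = py.replace("u:", "v")  # same normalization as _g2p_assign
        parts = pinyin2phone[py].split(" ")
        # a two-element entry is (initial, final); a single element is a bare final
        phones.extend(parts if len(parts) == 2 else ["", parts[0]])
    return [p for p in phones if p]

print(assign_phones(["lv4", "se4"]))  # ['l', 'v4', 's', 'e4']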
    def _merge_erhua(self,
                     initials: List[str],
                     finals: List[str],
@@ -396,6 +458,52 @@ class Frontend():
        print("----------------------------")
        return phonemes
    # added for SSML: handle (text, pinyin) pairs produced by the SSML parser
    def get_phonemes_ssml(self,
                          ssml_inputs: list,
                          merge_sentences: bool=True,
                          with_erhua: bool=True,
                          robot: bool=False,
                          print_info: bool=False) -> List[List[str]]:
        all_phonemes = []
        for word_pinyin_item in ssml_inputs:
            phonemes = []
            sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
            sentences = self.text_normalizer.normalize(sentence)
            if len(pinyin_spec) == 0:
                phonemes = self._g2p(
                    sentences,
                    merge_sentences=merge_sentences,
                    with_erhua=with_erhua)
            else:
                # pinyin was specified in the SSML, so assign it instead of running G2P
                phonemes = self._g2p_assign(
                    sentences, pinyin_spec, merge_sentences=merge_sentences)
            all_phonemes = all_phonemes + phonemes

        if robot:
            new_phonemes = []
            for sentence in all_phonemes:
                new_sentence = []
                for item in sentence:
                    # flatten every tone to `1`, keeping `er2`:
                    # erhua `er` only occurs with tone `2`
                    if item[-1] in "12345" and item != "er2":
                        item = item[:-1] + "1"
                    new_sentence.append(item)
                new_phonemes.append(new_sentence)
            all_phonemes = new_phonemes
        if print_info:
            print("----------------------------")
            print("text norm results:")
            print(sentences)
            print("----------------------------")
            print("g2p results:")
            print(all_phonemes[0])
            print("----------------------------")
        return [sum(all_phonemes, [])]
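# Shape of `ssml_inputs`, as produced by MixTextProcessor.get_pinyin_split
# (illustrative, not part of the patch):
#   [['前面的话。', []],   # empty pinyin list -> run normal G2P
#    ['绿', ['lv4']],      # pinyin specified in SSML -> _g2p_assign
#    ['后面的话。', []]]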
    def get_input_ids(self,
                      sentence: str,
                      merge_sentences: bool=True,
@@ -405,6 +513,7 @@ class Frontend():
                      add_blank: bool=False,
                      blank_token: str="<pad>",
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:

        phonemes = self.get_phonemes(
            sentence,
            merge_sentences=merge_sentences,
@@ -437,3 +546,49 @@ class Frontend():
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
    # added for SSML support
    def get_input_ids_ssml(
            self,
            sentence: str,
            merge_sentences: bool=True,
            get_tone_ids: bool=False,
            robot: bool=False,
            print_info: bool=False,
            add_blank: bool=False,
            blank_token: str="<pad>",
            to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:

        l_inputs = MixTextProcessor.get_pinyin_split(sentence)
        phonemes = self.get_phonemes_ssml(
            l_inputs,
            merge_sentences=merge_sentences,
            print_info=print_info,
            robot=robot)
        result = {}
        phones = []
        tones = []
        temp_phone_ids = []
        temp_tone_ids = []

        for part_phonemes in phonemes:
            phones, tones = self._get_phone_tone(
                part_phonemes, get_tone_ids=get_tone_ids)
            if add_blank:
                phones = insert_after_character(phones, blank_token)
            if tones:
                tone_ids = self._t2id(tones)
                if to_tensor:
                    tone_ids = paddle.to_tensor(tone_ids)
                temp_tone_ids.append(tone_ids)
            if phones:
                phone_ids = self._p2id(phones)
                # paddle.to_tensor() is slow on its first call under onnxruntime
                if to_tensor:
                    phone_ids = paddle.to_tensor(phone_ids)
                temp_phone_ids.append(phone_ids)
        if temp_tone_ids:
            result["tone_ids"] = temp_tone_ids
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
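# Illustrative usage, not part of the patch; the Frontend(...) arguments are
# elided, and <say-as pinyin="..."> is the tag the SSML parser inspects:
#   frontend = Frontend(phone_vocab_path=..., tone_vocab_path=...)
#   ssml = '<speak>这是<say-as pinyin="lv4">绿</say-as>色</speak>'
#   result = frontend.get_input_ids_ssml(ssml, merge_sentences=True)
#   phone_ids = result["phone_ids"]   # list of paddle.Tensor id sequences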

@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .xml_processor import *

@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
import re
import xml.dom.minidom
import xml.parsers.expat
from xml.dom.minidom import Node
from xml.dom.minidom import parseString
'''
Note: XML has five special characters: & < > " '
Option 1: wrap a string containing special characters in a <![CDATA[ ]]> section, e.g.
    <TitleName><![CDATA["name"]]></TitleName>
Option 2: represent the special characters with XML escape sequences:
    &    &amp;
    <    &lt;
    >    &gt;
    "    &quot;
    '    &apos;
e.g.
    <TitleName>&quot;name&quot;</TitleName>
'''
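# Quick check of the escaping rules above (illustrative, not part of the patch):
parseString('<t>A &amp; B</t>')           # OK: escaped ampersand
parseString('<t><![CDATA[A & B]]></t>')   # OK: CDATA shields the raw '&'
try:
    parseString('<t>A & B</t>')           # raw '&' is not well-formed XML
except xml.parsers.expat.ExpatError as e:
    print("ExpatError:", e)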
class MixTextProcessor():
    def __repr__(self):
        # __repr__ must return a string, not print one
        return "MixTextProcessor"

    def get_xml_content(self, mixstr):
        '''Return the <speak>...</speak> XML fragment of the string, or None.'''
        xmlptn = re.compile(r"<speak>.*?</speak>", re.M | re.S)
        ctn = re.search(xmlptn, mixstr)
        if ctn:
            return ctn.group(0)
        else:
            return None
    def get_content_split(self, mixstr):
        '''Split the text, in order, into a list of non-XML and XML parts,
        keeping punctuation. Whitespace is not stripped, because tag
        attributes inside the XML may contain spaces.
        '''
        ctlist = []
        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
        mat = re.match(patn, mixstr)
        if mat:
            pre_xml = mat.group(1)
            in_xml = mat.group(2)
            after_xml = mat.group(3)
            ctlist.append(pre_xml)
            ctlist.append(in_xml)
            ctlist.append(after_xml)
            return ctlist
        else:
            ctlist.append(mixstr)
            return ctlist
    @classmethod
    def get_pinyin_split(cls, mixstr):
        '''Split the text into [text, pinyin_list] pairs; an empty pinyin_list
        means no pinyin was specified for that span.
        '''
        ctlist = []
        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
        mat = re.match(patn, mixstr)
        if mat:
            pre_xml = mat.group(1)
            in_xml = mat.group(2)
            after_xml = mat.group(3)
            ctlist.append([pre_xml, []])
            dom = DomXml(in_xml)
            pinyinlist = dom.get_pinyins_for_xml()
            ctlist = ctlist + pinyinlist
            ctlist.append([after_xml, []])
        else:
            ctlist.append([mixstr, []])
        return ctlist
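# Illustrative round trip, not part of the patch; <say-as> is just a
# conventional tag name -- the parser only looks at the `pinyin` attribute:
text = '先说<speak>这里有<say-as pinyin="lv4 se4">绿色</say-as>文字</speak>再说别的'
print(MixTextProcessor.get_pinyin_split(text))
# -> [['先说', []], ['这里有', []], ['绿色', ['lv4', 'se4']],
#     ['文字', []], ['再说别的', []]]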
class DomXml():
    def __init__(self, xmlstr):
        self.tdom = parseString(xmlstr)  # Document
        self.root = self.tdom.documentElement  # Element
        self.rnode = self.tdom.childNodes  # NodeList

    def get_text(self):
        '''Return a list of all text content in the XML.'''
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                # minidom Text nodes expose their content via .data
                res.append(x1.data)
            else:
                for x2 in x1.childNodes:
                    if isinstance(x2, xml.dom.minidom.Text):
                        res.append(x2.data)
                    else:
                        for x3 in x2.childNodes:
                            if isinstance(x3, xml.dom.minidom.Text):
                                res.append(x3.data)
                            else:
                                print("len(nodes of x3):", len(x3.childNodes))
        return res
    def get_xmlchild_list(self):
        '''Return a list of all text content in the XML, without tags.'''
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                res.append(x1.data)
            else:
                for x2 in x1.childNodes:
                    if isinstance(x2, xml.dom.minidom.Text):
                        res.append(x2.data)
                    else:
                        for x3 in x2.childNodes:
                            if isinstance(x3, xml.dom.minidom.Text):
                                res.append(x3.data)
                            else:
                                print("len(nodes of x3):", len(x3.childNodes))
        print(res)
        return res
    def get_pinyins_for_xml(self):
        '''Return the XML content as a list of [text, pinyin_list] pairs.'''
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                t = re.sub(r"\s+", "", x1.data)
                res.append([t, []])
            else:
                for x2 in x1.childNodes:
                    if isinstance(x2, xml.dom.minidom.Text):
                        t = re.sub(r"\s+", "", x2.data)
                        res.append([t, []])
                    else:
                        if x2.hasAttribute('pinyin'):
                            pinyin_value = x2.getAttribute("pinyin")
                            pinyins = pinyin_value.split(" ")
                        for x3 in x2.childNodes:
                            if isinstance(x3, xml.dom.minidom.Text):
                                t = re.sub(r"\s+", "", x3.data)
                                res.append([t, pinyins])
                            else:
                                print("len(nodes of x3):", len(x3.childNodes))
        return res
    def get_all_tags(self, tag_name):
        '''Print every tag with the given name together with its pinyin attribute.'''
        alltags = self.root.getElementsByTagName(tag_name)
        for x in alltags:
            if x.hasAttribute('pinyin'):
                print(x.tagName, 'pinyin',
                      x.getAttribute('pinyin'), x.firstChild.data)
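# Illustrative DomXml usage (not part of the patch):
dom = DomXml('<speak>你好<say-as pinyin="shi4 jie4">世界</say-as></speak>')
print(dom.get_pinyins_for_xml())
# -> [['你好', []], ['世界', ['shi4', 'jie4']]]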