enable chinese words' pinyin specified in text of ssml formats, test=tts

2 years ago · 13a7fa9808
parent b76968e6d9
commit 13a7fa9808
3 changed files with 323 additions and 2 deletions
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import math
 import os
 import re
 from pathlib import Path
 from typing import Any
 from typing import Dict
@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.utils.dynamic_import import dynamic_import
 # remove [W:onnxruntime: xxx] from ort
 ort.set_default_logger_severity(3)
@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
    sentences = []
    with open(text_file, 'rt') as f:
        for line in f:
-            items = line.strip().split()
+            items = re.split(r"\s+", line.strip(), 1)
            utt_id = items[0]
            if lang == 'zh':
                sentence = "".join(items[1:])
@ -180,7 +182,7 @@ def run_frontend(frontend: object,
                 to_tensor: bool=True):
    outs = dict()
    if lang == 'zh':
-        input_ids = frontend.get_input_ids(
+        input_ids = frontend.get_input_ids_ssml(
            text,
            merge_sentences=merge_sentences,
            get_tone_ids=get_tone_ids,
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@ -13,6 +13,7 @@
 # limitations under the License.
 import os
 import re
 from operator import itemgetter
 from typing import Dict
 from typing import List
@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
 from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
 from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
 INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@ -81,6 +83,7 @@ class Frontend():
                 g2p_model="g2pW",
                 phone_vocab_path=None,
                 tone_vocab_path=None):
        self.mix_ssml_processor = MixTextProcessor()
        self.tone_modifier = ToneSandhi()
        self.text_normalizer = TextNormalizer()
        self.punc = "：，；。？！“”‘’':,;.?!"
@ -143,6 +146,7 @@ class Frontend():
                tone_id = [line.strip().split() for line in f.readlines()]
            for tone, id in tone_id:
                self.vocab_tones[tone] = int(id)
        self.mix_ssml_processor.__repr__()
    def _init_pypinyin(self):
        large_pinyin.load()
@ -281,6 +285,65 @@ class Frontend():
            phones_list.append(merge_list)
        return phones_list
    def _split_word_to_char(self, words):
        res = []
        for x in words:
            res.append(x)
        return res
    # When the SSML input specifies pinyin, assign that pinyin to the characters directly.
    def _g2p_assign(self,
                    words: List[str],
                    pinyin_spec: List[str],
                    merge_sentences: bool=True) -> List[List[str]]:
        phones_list = []
        initials = []
        finals = []
        words = self._split_word_to_char(words[0])
        for pinyin, char in zip(pinyin_spec, words):
            sub_initials = []
            sub_finals = []
            pinyin = pinyin.replace("u:", "v")
            #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
            if pinyin in self.pinyin2phone:
                initial_final_list = self.pinyin2phone[pinyin].split(" ")
                if len(initial_final_list) == 2:
                    sub_initials.append(initial_final_list[0])
                    sub_finals.append(initial_final_list[1])
                elif len(initial_final_list) == 1:
                    sub_initials.append('')
                    sub_finals.append(initial_final_list[1])
            else:
                # If it's not pinyin (possibly punctuation) or no conversion is required
                sub_initials.append(pinyin)
                sub_finals.append(pinyin)
            initials.append(sub_initials)
            finals.append(sub_finals)
        initials = sum(initials, [])
        finals = sum(finals, [])
        phones = []
        for c, v in zip(initials, finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c and c not in self.punc:
                phones.append(c)
            if c and c in self.punc:
                phones.append('sp')
            if v and v not in self.punc:
                phones.append(v)
        phones_list.append(phones)
        if merge_sentences:
            merge_list = sum(phones_list, [])
            # rm the last 'sp' to avoid the noise at the end
            # cause in the training data, no 'sp' in the end
            if merge_list[-1] == 'sp':
                merge_list = merge_list[:-1]
            phones_list = []
            phones_list.append(merge_list)
        return phones_list
    def _merge_erhua(self,
                     initials: List[str],
                     finals: List[str],
@ -396,6 +459,52 @@ class Frontend():
            print("----------------------------")
        return phonemes
    #@an added for ssml pinyin 
    def get_phonemes_ssml(self,
                          ssml_inputs: list,
                          merge_sentences: bool=True,
                          with_erhua: bool=True,
                          robot: bool=False,
                          print_info: bool=False) -> List[List[str]]:
        """Convert SSML segments into one flat phoneme sequence.

        Args:
            ssml_inputs: Sequence of ``[text, pinyin_spec]`` items. A segment
                with an empty ``pinyin_spec`` goes through ``self._g2p``;
                otherwise the given pinyin is applied via ``self._g2p_assign``.
            merge_sentences: Forwarded to the g2p helpers.
            with_erhua: Forwarded to ``self._g2p`` only.
            robot: When true, every phone ending in a tone digit (except
                ``"er2"``) has its tone replaced by ``1``.
            print_info: When true, print the last normalization result and the
                phones of the first segment.

        Returns:
            A list containing a single list with the phones of all segments
            concatenated in order.
        """
        all_phonemes = []
        # Stays an empty list when there are no segments, so the debug print
        # below never touches an unbound name.
        sentences = []
        for word_pinyin_item in ssml_inputs:
            sentence, pinyin_spec = word_pinyin_item[0], word_pinyin_item[1]
            sentences = self.text_normalizer.normalize(sentence)
            if len(pinyin_spec) == 0:
                phonemes = self._g2p(
                    sentences,
                    merge_sentences=merge_sentences,
                    with_erhua=with_erhua)
            else:
                # The phonemes come from the user supplied pinyin.
                phonemes = self._g2p_assign(
                    sentences, pinyin_spec, merge_sentences=merge_sentences)
            all_phonemes.extend(phonemes)

        if robot:
            new_phonemes = []
            for sentence in all_phonemes:
                new_sentence = []
                for item in sentence:
                    # `er` only has tone `2`, so it is left untouched.
                    if item and item[-1] in "12345" and item != "er2":
                        item = item[:-1] + "1"
                    new_sentence.append(item)
                new_phonemes.append(new_sentence)
            all_phonemes = new_phonemes

        if print_info:
            print("----------------------------")
            print("text norm results:")
            print(sentences)
            print("----------------------------")
            print("g2p results:")
            print(all_phonemes[0] if all_phonemes else [])
            print("----------------------------")

        # Flatten in linear time instead of the quadratic sum(lists, []).
        return [[phone for sentence in all_phonemes for phone in sentence]]
    def get_input_ids(self,
                      sentence: str,
                      merge_sentences: bool=True,
@ -405,6 +514,7 @@ class Frontend():
                      add_blank: bool=False,
                      blank_token: str="<pad>",
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
        phonemes = self.get_phonemes(
            sentence,
            merge_sentences=merge_sentences,
@ -437,3 +547,49 @@ class Frontend():
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
    # @an added for ssml
    def get_input_ids_ssml(
            self,
            sentence: str,
            merge_sentences: bool=True,
            get_tone_ids: bool=False,
            robot: bool=False,
            print_info: bool=False,
            add_blank: bool=False,
            blank_token: str="<pad>",
            to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
        """Turn text that may contain an SSML block into phone / tone id sequences.

        The text is split with ``MixTextProcessor.get_pinyin_split``, converted
        to phonemes by ``get_phonemes_ssml`` and then mapped to ids.

        Args:
            sentence: Input text, optionally containing ``<speak>...</speak>``.
            merge_sentences: Forwarded to ``get_phonemes_ssml``.
            get_tone_ids: Forwarded to ``self._get_phone_tone``.
            robot: Forwarded to ``get_phonemes_ssml``.
            print_info: Forwarded to ``get_phonemes_ssml``.
            add_blank: When true, ``blank_token`` is inserted into the phones
                via ``insert_after_character``.
            blank_token: Token used when ``add_blank`` is true.
            to_tensor: When true, each id sequence is wrapped with
                ``paddle.to_tensor``.

        Returns:
            A dict with key ``"phone_ids"`` and/or ``"tone_ids"``; a key is
            present only when at least one sequence was produced for it.
        """
        segments = MixTextProcessor.get_pinyin_split(sentence)
        phoneme_groups = self.get_phonemes_ssml(
            segments,
            merge_sentences=merge_sentences,
            print_info=print_info,
            robot=robot)

        phone_id_seqs = []
        tone_id_seqs = []
        for group in phoneme_groups:
            phones, tones = self._get_phone_tone(
                group, get_tone_ids=get_tone_ids)
            if add_blank:
                phones = insert_after_character(phones, blank_token)
            if tones:
                ids = self._t2id(tones)
                tone_id_seqs.append(paddle.to_tensor(ids) if to_tensor else ids)
            if phones:
                ids = self._p2id(phones)
                # if use paddle.to_tensor() in onnxruntime, the first time will be too low
                phone_id_seqs.append(paddle.to_tensor(ids) if to_tensor else ids)

        result = {}
        if tone_id_seqs:
            result["tone_ids"] = tone_id_seqs
        if phone_id_seqs:
            result["phone_ids"] = phone_id_seqs
        return result
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@ -0,0 +1,163 @@
 # -*- coding: utf-8 -*-
 import re
 import xml.dom.minidom
 import xml.parsers.expat
 from xml.dom.minidom import Node
 from xml.dom.minidom import parseString
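 '''
 Note: XML has five special characters: & < > " '
 Option 1: wrap a string that contains special characters in a <![CDATA[ ]]> section.
 For example:
 <TitleName><![CDATA["姓名"]]></TitleName>
 Option 2: write the special characters as XML escape sequences. The five
 characters and their escape sequences are:
 &  &amp;
 <  &lt;
 >  &gt;
 "  &quot;
 '  &apos;
 For example:
 <TitleName>&quot;姓名&quot;</TitleName>
 '''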
 class MixTextProcessor():
    """Split text that mixes plain content with a ``<speak>...</speak>`` block."""

    # First <speak>...</speak> block found anywhere in the text.
    _XML_PATTERN = re.compile(r"<speak>.*?</speak>", re.M | re.S)
    # Groups: text before the block, the block itself, text after the block.
    # The leading greedy group means the LAST <speak> block is the one isolated.
    _SPLIT_PATTERN = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$',
                                re.M | re.S)

    def __repr__(self):
        """Return a printable description (``repr()`` requires a string)."""
        return type(self).__name__ + "()"

    def get_xml_content(self, mixstr):
        """Return the first ``<speak>...</speak>`` block of ``mixstr``, or None."""
        found = self._XML_PATTERN.search(mixstr)
        return found.group(0) if found else None

    def get_content_split(self, mixstr):
        """Split ``mixstr`` into ``[before, xml, after]`` strings.

        Punctuation and whitespace are kept as-is, because whitespace is
        significant inside XML tag attributes. When no ``<speak>`` block is
        present the result is ``[mixstr]``.
        """
        mat = self._SPLIT_PATTERN.match(mixstr)
        if mat:
            return [mat.group(1), mat.group(2), mat.group(3)]
        return [mixstr]

    @classmethod
    def get_pinyin_split(cls, mixstr):
        """Split ``mixstr`` into ``[text, pinyin_list]`` items.

        Text outside the ``<speak>`` block gets an empty pinyin list; the
        items for the block itself come from ``DomXml.get_pinyins_for_xml``.
        When no ``<speak>`` block is present the result is ``[[mixstr, []]]``.
        """
        mat = cls._SPLIT_PATTERN.match(mixstr)
        if not mat:
            return [[mixstr, []]]
        pre_xml, in_xml, after_xml = mat.group(1), mat.group(2), mat.group(3)
        ctlist = [[pre_xml, []]]
        ctlist.extend(DomXml(in_xml).get_pinyins_for_xml())
        ctlist.append([after_xml, []])
        return ctlist
 class DomXml():
    """Wrapper around an XML string parsed with ``xml.dom.minidom``."""

    def __init__(self, xmlstr):
        """Parse ``xmlstr`` and keep the document, its root and its children.

        Raises:
            xml.parsers.expat.ExpatError: if ``xmlstr`` is not well-formed XML.
        """
        print("Parse xml str:", xmlstr)
        self.tdom = parseString(xmlstr)  # Document
        self.root = self.tdom.documentElement  # Element
        self.rnode = self.tdom.childNodes  # NodeList

    @staticmethod
    def _strip_whitespace(text):
        """Remove every run of whitespace from ``text``."""
        return re.sub(r"\s+", "", text)

    def get_text(self):
        """Return all text content of the XML as a list.

        Text is collected from the document children, their children and
        their grandchildren; deeper nodes are only reported on stdout.
        """
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                res.append(x1.data)
                continue
            for x2 in x1.childNodes:
                if isinstance(x2, xml.dom.minidom.Text):
                    res.append(x2.data)
                    continue
                for x3 in x2.childNodes:
                    if isinstance(x3, xml.dom.minidom.Text):
                        res.append(x3.data)
                    else:
                        print("len(nodes of x3):", len(x3.childNodes))
        return res

    def get_xmlchild_list(self):
        """Return the XML content as a list of text pieces (no tags) and print it."""
        res = self.get_text()
        print(res)
        return res

    def get_pinyins_for_xml(self):
        """Return the XML content as a list of ``[text, pinyin_list]`` items.

        Whitespace is removed from every text piece. Text directly inside the
        root element gets an empty pinyin list; text inside a child element
        gets that element's ``pinyin`` attribute split on whitespace, or an
        empty list when the element has no such attribute.
        """
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                res.append([self._strip_whitespace(x1.data), []])
                continue
            for x2 in x1.childNodes:
                if isinstance(x2, xml.dom.minidom.Text):
                    res.append([self._strip_whitespace(x2.data), []])
                    continue
                # Reset per element so a tag without `pinyin` never reuses
                # the value of a previous tag (or an unbound name).
                pinyins = []
                # Only elements carry attributes; comments etc. do not.
                if x2.nodeType == Node.ELEMENT_NODE and x2.hasAttribute(
                        'pinyin'):
                    # split() ignores repeated / leading / trailing spaces.
                    pinyins = x2.getAttribute("pinyin").split()
                for x3 in x2.childNodes:
                    if isinstance(x3, xml.dom.minidom.Text):
                        res.append(
                            [self._strip_whitespace(x3.data), list(pinyins)])
                    else:
                        print("len(nodes of x3):", len(x3.childNodes))
        return res

    def get_all_tags(self, tag_name):
        """Print every ``tag_name`` element that has a ``pinyin`` attribute.

        For each such element the tag name, the attribute value and the data
        of its first child are printed (an empty string when the first child
        is missing or carries no data).
        """
        for x in self.root.getElementsByTagName(tag_name):
            if x.hasAttribute('pinyin'):  # pinyin
                first_text = getattr(x.firstChild, 'data', '')
                print(x.tagName, 'pinyin',
                      x.getAttribute('pinyin'), first_text)