[tts]中英文语音合成时添加SSML支持

3 years ago · 2660ff425a
parent c54c9506aa
commit 2660ff425a
3 changed files with 83 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@
 *.done
 *.whl
 *.egg-info
+.history
 build
 *output/

--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from typing import Dict
 from typing import List

@@ -18,6 +19,7 @@ import paddle

 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor


 class MixFrontend():
@@ -107,7 +109,38 @@ class MixFrontend():
                      add_sp: bool=True,
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:

-        segments = self.get_segment(sentence)
+        ''' 1. 添加SSML支持，先列出 文字 和 <say-as>标签内容，然后添加到tmpSegments数组里
+        '''
+        d_inputs = MixTextProcessor.get_dom_split(sentence)
+        tmpSegments = []
+        for instr in d_inputs:
+            if instr.lower().startswith("<say-as"):
+                tmpSegments.append((instr, "zh"))
+            else:
+                tmpSegments.extend(self.get_segment(instr))
+
+        ''' 2. 把zh的merge到一起，避免合成结果中间停顿
+        '''
+        segments = []
+        currentSeg = ["", ""]
+        for seg in tmpSegments:
+            if seg[1] == "en" or seg[1] == "other":
+                if currentSeg[0] == '':
+                    segments.append(seg)
+                else:
+                    currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
+                    segments.append(tuple(currentSeg))
+                    segments.append(seg)
+                    currentSeg = ["", ""]
+            else:
+                if currentSeg[0] == '':
+                    currentSeg[0] = seg[0]
+                    currentSeg[1] = seg[1]
+                else:
+                    currentSeg[0] = currentSeg[0] + seg[0]
+        if currentSeg[0] != '':
+            currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
+            segments.append(tuple(currentSeg))

        phones_list = []
        result = {}
@@ -120,11 +153,19 @@ class MixFrontend():
                    input_ids = self.en_frontend.get_input_ids(
                        content, merge_sentences=False, to_tensor=to_tensor)
                else:
-                    input_ids = self.zh_frontend.get_input_ids(
-                        content,
-                        merge_sentences=False,
-                        get_tone_ids=get_tone_ids,
-                        to_tensor=to_tensor)
+                    if content.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", content,
+                                                    re.DOTALL):
+                        input_ids = self.zh_frontend.get_input_ids_ssml(
+                            content,
+                            merge_sentences=False,
+                            get_tone_ids=get_tone_ids,
+                            to_tensor=to_tensor)
+                    else:
+                        input_ids = self.zh_frontend.get_input_ids(
+                            content,
+                            merge_sentences=False,
+                            get_tone_ids=get_tone_ids,
+                            to_tensor=to_tensor)
                if add_sp:
                    input_ids["phone_ids"][-1] = paddle.concat(
                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@@ -73,7 +73,29 @@ class MixTextProcessor():
        else:
            ctlist.append([mixstr, []])
        return ctlist
+    
+    @classmethod
+    def get_dom_split(self, mixstr):
+        ''' 文本分解，顺序加了列表中，返回文本和say-as标签（供mix进一步处理）
+        '''
+        ctlist = []
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+        mat = re.match(patn, mixstr)
+        if mat:
+            pre_xml = mat.group(1)
+            in_xml = mat.group(2)
+            after_xml = mat.group(3)

+            ctlist.append(pre_xml)
+            dom = DomXml(in_xml)
+            tags = dom.get_text_and_sayas_tags()
+            ctlist.extend(tags)
+            
+            ctlist.append(after_xml)
+            return ctlist
+        else:
+            ctlist.append(mixstr)
+        return ctlist

 class DomXml():
    def __init__(self, xmlstr):
@@ -156,3 +178,16 @@ class DomXml():
            if x.hasAttribute('pinyin'):  # pinyin
                print(x.tagName, 'pinyin',
                      x.getAttribute('pinyin'), x.firstChild.data)
+
+    def get_text_and_sayas_tags(self):
+        '''返回 xml 内容的列表，包括所有文本内容和<say-as> tag'''
+        res = []
+
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                res.append(x1.value)
+            else:
+                for x2 in x1.childNodes:
+                    res.append(x2.toxml())
+        print(res)
+        return res