From 9b7bf4bb865dab8d29668bddcafd90ef5c66c3d0 Mon Sep 17 00:00:00 2001
From: lym0302 <lym0302@foxmail.com>
Date: Tue, 16 Aug 2022 06:28:13 +0000
Subject: [PATCH] fix point bug, test=tts

---
 paddlespeech/t2s/frontend/mix_frontend.py | 43 +++++++++++++++++++----
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index 5f145098..8f6822e1 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -62,9 +62,31 @@ class MixFrontend():
 
     def _split(self, text: str) -> List[str]:
+        """Split raw text into a list of stripped sentences.
+
+        A "." is rewritten to "。" for the sentence-splitting pass below
+        only when it is neither the first nor the last character and each
+        of its two neighbours is a space or a letter according to
+        is_alphabet(). Any other ".", such as the decimal point in "3.14",
+        is left unchanged.
+        """
         text = re.sub(r'[《》【】<=>{}()（）#&@“”^_|…\\]', '', text)
+        chars = list(text)
+        # Interior positions only: a leading/trailing "." is never touched.
+        for i in range(1, len(text) - 1):
+            if text[i] != ".":
+                continue
+            before, after = text[i - 1], text[i + 1]
+            # Patch this single position: str.replace() would rewrite every
+            # "." in the text, decimal points included.
+            if ((self.is_alphabet(before) or before == " ") and
+                (self.is_alphabet(after) or after == " ")):
+                chars[i] = "。"
+        text = "".join(chars)
+
         text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
         text = text.strip()
         sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
+
         return sentences
 
     def _distinguish(self, text: str) -> List[str]:
@@ -77,9 +99,11 @@ class MixFrontend():
         temp_seg = ""
         temp_lang = ""
 
-        # Determine the type of each character. type: blank, chinese, alphabet, number, unk.
+        # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
         for ch in text:
-            if self.is_chinese(ch):
+            if ch == ".":
+                types.append("point")
+            elif self.is_chinese(ch):
                 types.append("zh")
             elif self.is_alphabet(ch):
                 types.append("en")
@@ -96,21 +120,26 @@ class MixFrontend():
 
             # find the first char of the seg
             if flag == 0:
-                if types[i] != "unk" and types[i] != "blank":
+                # A segment may only start with a Chinese, English or numeric character.
+                if types[i] == "zh" or types[i] == "en" or types[i] == "num":
                     temp_seg += text[i]
                     temp_lang = types[i]
                     flag = 1
 
             else:
-                if types[i] == temp_lang or types[i] == "num":
+                # Digits and decimal points are merged into the current segment and take its language type.
+                if types[i] == temp_lang or types[i] == "num" or types[
+                        i] == "point":
                     temp_seg += text[i]
 
-                elif temp_lang == "num" and types[i] != "unk":
+                # A numeric segment is concatenated with whatever character follows it.
+                elif temp_lang == "num":
                     temp_seg += text[i]
                     if types[i] == "zh" or types[i] == "en":
                         temp_lang = types[i]
 
-                elif temp_lang == "en" and types[i] == "blank":
+                # A blank is concatenated to the preceding characters.
+                elif types[i] == "blank":
                     temp_seg += text[i]
 
                 elif types[i] == "unk":
@@ -119,7 +148,7 @@ class MixFrontend():
                 else:
                     segments.append((temp_seg, temp_lang))
 
-                    if types[i] != "unk" and types[i] != "blank":
+                    if types[i] == "zh" or types[i] == "en":
                         temp_seg = text[i]
                         temp_lang = types[i]
                         flag = 1