Merge pull request #2273 from lym0302/r1.1

[cherry-pick] [r1.1] fix point bug
3 years ago · 81f293590a
parent 256f13ca50 9b7bf4bb86
commit 81f293590a
1 changed file with 36 additions and 7 deletions
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@ -62,9 +62,31 @@ class MixFrontend():

    def _split(self, text: str) -> List[str]:
+        """Split ``text`` into a list of whitespace-stripped sentences.
+
+        An English "." between letters/spaces is first rewritten as "。".
+        """
        text = re.sub(r'[《》【】<=>{}()（）#&@“”^_|…\\]', '', text)
+        # Rewrite sentence-ending "." as "。" for the sentence split below.
+        # Only a "." whose two neighbours are each a space or alphabetic
+        # (per self.is_alphabet) qualifies; every other "." is kept.
+        chars = list(text)
+        last = len(chars) - 1
+        for i, ch in enumerate(chars):
+            # A "." at the very start or very end of the text is not touched.
+            if ch != "." or i == 0 or i == last:
+                continue
+            prev_ch, next_ch = text[i - 1], text[i + 1]
+            if ((self.is_alphabet(prev_ch) or prev_ch == " ") and
+                    (self.is_alphabet(next_ch) or next_ch == " ")):
+                # Change this position only: str.replace(".", "。") would
+                # rewrite every "." in the text, not just the matched one.
+                chars[i] = "。"
+        text = "".join(chars)
+
        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
        text = text.strip()
        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
+
        return sentences

    def _distinguish(self, text: str) -> List[str]:
@ -77,9 +99,11 @@ class MixFrontend():
        temp_seg = ""
        temp_lang = ""

-        # Determine the type of each character. type: blank, chinese, alphabet, number, unk.
+        # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
        for ch in text:
-            if self.is_chinese(ch):
+            if ch == ".":
+                types.append("point")
+            elif self.is_chinese(ch):
                types.append("zh")
            elif self.is_alphabet(ch):
                types.append("en")
@ -96,21 +120,26 @@ class MixFrontend():

            # find the first char of the seg
            if flag == 0:
-                if types[i] != "unk" and types[i] != "blank":
+                # 首个字符是中文，英文或者数字
+                if types[i] == "zh" or types[i] == "en" or types[i] == "num":
                    temp_seg += text[i]
                    temp_lang = types[i]
                    flag = 1

            else:
-                if types[i] == temp_lang or types[i] == "num":
+                # 数字和小数点均与前面的字符合并，类型属于前面一个字符的类型
+                if types[i] == temp_lang or types[i] == "num" or types[
+                        i] == "point":
                    temp_seg += text[i]

-                elif temp_lang == "num" and types[i] != "unk":
+                # 数字与后面的任意字符都拼接
+                elif temp_lang == "num":
                    temp_seg += text[i]
                    if types[i] == "zh" or types[i] == "en":
                        temp_lang = types[i]

-                elif temp_lang == "en" and types[i] == "blank":
+                # 如果是空格则与前面字符拼接
+                elif types[i] == "blank":
                    temp_seg += text[i]

                elif types[i] == "unk":
@ -119,7 +148,7 @@ class MixFrontend():
                else:
                    segments.append((temp_seg, temp_lang))

-                    if types[i] != "unk" and types[i] != "blank":
+                    if types[i] == "zh" or types[i] == "en":
                        temp_seg = text[i]
                        temp_lang = types[i]
                        flag = 1