def _split(self, text: str) -> List[str]:
    """Split mixed Chinese/English text into a list of sentences.

    Steps:
      1. Remove a fixed set of bracket/symbol characters.
      2. Turn English sentence-ending periods "." into "。" so that the
         sentence splitter can break on them.
      3. Insert a newline after every match of ``self.SENTENCE_SPLITOR``
         and split the text on those newlines.

    A period is converted only when it is neither the first nor the last
    character of the text and both of its neighbours are an alphabet
    character (per ``self.is_alphabet``) or a space. Periods between
    digits, such as the decimal point in "3.5", are left untouched.

    Args:
        text: Raw input text.

    Returns:
        The stripped sentences, in their original order.
    """
    text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)

    def _is_english_context(ch: str) -> bool:
        # A neighbour counts as "English context" when it is a letter
        # or a space.
        return self.is_alphabet(ch) or ch == " "

    # Replace only the qualifying period itself. ``str.replace`` would
    # rewrite every "." in the text (decimal points included) as soon as
    # a single period qualified.
    chars = list(text)
    last_index = len(chars) - 1
    for idx, ch in enumerate(chars):
        # Periods at the very beginning or end of the text are not handled.
        if ch != "." or idx == 0 or idx == last_index:
            continue
        if (_is_english_context(chars[idx - 1]) and
                _is_english_context(chars[idx + 1])):
            chars[idx] = "。"
    text = "".join(chars)

    text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
    text = text.strip()
    return [sentence.strip() for sentence in re.split(r'\n+', text)]
for ch in text: - if self.is_chinese(ch): + if ch == ".": + types.append("point") + elif self.is_chinese(ch): types.append("zh") elif self.is_alphabet(ch): types.append("en") @@ -96,21 +120,26 @@ class MixFrontend(): # find the first char of the seg if flag == 0: - if types[i] != "unk" and types[i] != "blank": + # 首个字符是中文,英文或者数字 + if types[i] == "zh" or types[i] == "en" or types[i] == "num": temp_seg += text[i] temp_lang = types[i] flag = 1 else: - if types[i] == temp_lang or types[i] == "num": + # 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型 + if types[i] == temp_lang or types[i] == "num" or types[ + i] == "point": temp_seg += text[i] - elif temp_lang == "num" and types[i] != "unk": + # 数字与后面的任意字符都拼接 + elif temp_lang == "num": temp_seg += text[i] if types[i] == "zh" or types[i] == "en": temp_lang = types[i] - elif temp_lang == "en" and types[i] == "blank": + # 如果是空格则与前面字符拼接 + elif types[i] == "blank": temp_seg += text[i] elif types[i] == "unk": @@ -119,7 +148,7 @@ class MixFrontend(): else: segments.append((temp_seg, temp_lang)) - if types[i] != "unk" and types[i] != "blank": + if types[i] == "zh" or types[i] == "en": temp_seg = text[i] temp_lang = types[i] flag = 1