Merge pull request #2255 from lym0302/develop

[tts] fix point bug
pull/2272/head
TianYuan 2 years ago committed by GitHub
commit 4a59702d60
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -62,9 +62,31 @@ class MixFrontend():
def _split(self, text: str) -> List[str]:
    """Split mixed Chinese/English text into a list of sentences.

    Steps:
      1. Strip bracket-like and other unsupported symbols.
      2. Turn English sentence-ending periods into the Chinese full stop
         "。" so that ``self.SENTENCE_SPLITOR`` can split on them.
      3. Insert a newline after every sentence delimiter and split on
         newlines.

    A "." is treated as a sentence period only when it is neither the
    first nor the last character of the text and both of its neighbours
    are an alphabet character (per ``self.is_alphabet``) or a space.
    Dots inside numbers such as "3.14" are therefore left untouched.

    Args:
        text: Raw input text.

    Returns:
        The stripped sentences, in their original order.
    """
    text = re.sub(r'[《》【】<=>{}()#&@“”^_|…\\]', '', text)

    # Replace qualifying dots one by one, by position. Using
    # ``str.replace`` here would rewrite *every* "." in the text (including
    # decimal points) as soon as a single dot qualified. The replacement is
    # a single character, so positions computed on the original text stay
    # valid while the list is being edited.
    chars = list(text)
    last_index = len(chars) - 1
    for point_index, ch in enumerate(chars):
        if ch != ".":
            continue
        # A dot at the very beginning or the very end is left as is.
        if point_index == 0 or point_index == last_index:
            continue
        before = text[point_index - 1]
        after = text[point_index + 1]
        if ((self.is_alphabet(before) or before == " ") and
                (self.is_alphabet(after) or after == " ")):
            chars[point_index] = "。"
    text = "".join(chars)

    # Put a line break after each sentence delimiter, then split on them.
    text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
    text = text.strip()
    sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
    return sentences
def _distinguish(self, text: str) -> List[str]: def _distinguish(self, text: str) -> List[str]:
@ -77,9 +99,11 @@ class MixFrontend():
temp_seg = "" temp_seg = ""
temp_lang = "" temp_lang = ""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk. # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
for ch in text: for ch in text:
if self.is_chinese(ch): if ch == ".":
types.append("point")
elif self.is_chinese(ch):
types.append("zh") types.append("zh")
elif self.is_alphabet(ch): elif self.is_alphabet(ch):
types.append("en") types.append("en")
@ -96,21 +120,26 @@ class MixFrontend():
# find the first char of the seg # find the first char of the seg
if flag == 0: if flag == 0:
if types[i] != "unk" and types[i] != "blank": # 首个字符是中文,英文或者数字
if types[i] == "zh" or types[i] == "en" or types[i] == "num":
temp_seg += text[i] temp_seg += text[i]
temp_lang = types[i] temp_lang = types[i]
flag = 1 flag = 1
else: else:
if types[i] == temp_lang or types[i] == "num": # 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型
if types[i] == temp_lang or types[i] == "num" or types[
i] == "point":
temp_seg += text[i] temp_seg += text[i]
elif temp_lang == "num" and types[i] != "unk": # 数字与后面的任意字符都拼接
elif temp_lang == "num":
temp_seg += text[i] temp_seg += text[i]
if types[i] == "zh" or types[i] == "en": if types[i] == "zh" or types[i] == "en":
temp_lang = types[i] temp_lang = types[i]
elif temp_lang == "en" and types[i] == "blank": # 如果是空格则与前面字符拼接
elif types[i] == "blank":
temp_seg += text[i] temp_seg += text[i]
elif types[i] == "unk": elif types[i] == "unk":
@ -119,7 +148,7 @@ class MixFrontend():
else: else:
segments.append((temp_seg, temp_lang)) segments.append((temp_seg, temp_lang))
if types[i] != "unk" and types[i] != "blank": if types[i] == "zh" or types[i] == "en":
temp_seg = text[i] temp_seg = text[i]
temp_lang = types[i] temp_lang = types[i]
flag = 1 flag = 1

Loading…
Cancel
Save