Merge pull request #2273 from lym0302/r1.1

[cherry-pick] [r1.1] fix point bug
pull/2275/head
liangym 2 years ago committed by GitHub
commit 81f293590a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -62,9 +62,31 @@ class MixFrontend():
def _split(self, text: str) -> List[str]:
    """Split mixed Chinese/English text into a list of sentences.

    Steps:
      1. Strip bracket-like and special symbols that carry no speech
         content.
      2. Turn each English sentence-ending period "." into the Chinese full
         stop "。" so the sentence splitter can break on it. A period
         counts as sentence-ending only when it is neither the first nor
         the last character and both neighbours are an alphabet letter
         (per ``self.is_alphabet``) or a space. Periods next to digits
         (e.g. "3.14") are therefore left untouched.
      3. Insert a newline after every ``self.SENTENCE_SPLITOR`` match and
         split on newlines.

    Args:
        text: Raw input text.

    Returns:
        The stripped sentences in order. Empty input yields ``[""]``.

    NOTE(review): ``self.SENTENCE_SPLITOR`` is defined outside this block;
    it is presumed to match "。" -- confirm against the class definition.
    """
    text = re.sub(r'[《》【】<=>{}()#&@“”^_|…\\]', '', text)

    # Rewrite qualifying periods one position at a time. ``str.replace``
    # would rewrite every "." in the text (including decimal points), and
    # any length-changing replacement would invalidate the positions of
    # the periods that have not been examined yet.
    last_index = len(text) - 1
    chars = list(text)
    for index, char in enumerate(chars):
        # A period at the very beginning or very end is left as is.
        if char != "." or index == 0 or index == last_index:
            continue
        before = text[index - 1]
        after = text[index + 1]
        if ((self.is_alphabet(before) or before == " ") and
                (self.is_alphabet(after) or after == " ")):
            chars[index] = "。"
    text = "".join(chars)

    # Put a line break after each sentence terminator, then split on it.
    text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
    text = text.strip()
    return [sentence.strip() for sentence in re.split(r'\n+', text)]
def _distinguish(self, text: str) -> List[str]:
@ -77,9 +99,11 @@ class MixFrontend():
temp_seg = ""
temp_lang = ""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk.
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
for ch in text:
if self.is_chinese(ch):
if ch == ".":
types.append("point")
elif self.is_chinese(ch):
types.append("zh")
elif self.is_alphabet(ch):
types.append("en")
@ -96,21 +120,26 @@ class MixFrontend():
# find the first char of the seg
if flag == 0:
if types[i] != "unk" and types[i] != "blank":
# 首个字符是中文,英文或者数字
if types[i] == "zh" or types[i] == "en" or types[i] == "num":
temp_seg += text[i]
temp_lang = types[i]
flag = 1
else:
if types[i] == temp_lang or types[i] == "num":
# 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型
if types[i] == temp_lang or types[i] == "num" or types[
i] == "point":
temp_seg += text[i]
elif temp_lang == "num" and types[i] != "unk":
# 数字与后面的任意字符都拼接
elif temp_lang == "num":
temp_seg += text[i]
if types[i] == "zh" or types[i] == "en":
temp_lang = types[i]
elif temp_lang == "en" and types[i] == "blank":
# 如果是空格则与前面字符拼接
elif types[i] == "blank":
temp_seg += text[i]
elif types[i] == "unk":
@ -119,7 +148,7 @@ class MixFrontend():
else:
segments.append((temp_seg, temp_lang))
if types[i] != "unk" and types[i] != "blank":
if types[i] == "zh" or types[i] == "en":
temp_seg = text[i]
temp_lang = types[i]
flag = 1

Loading…
Cancel
Save