add double byte char for zh normalization (#2661)

pull/2670/head
David An (An Hongliang) 2 years ago committed by GitHub
parent 94a487bd81
commit 1c3d2cb89e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -19,7 +19,7 @@ from pypinyin.constants import SUPPORT_UCS4
# 全角半角转换
# 英文字符全角 -> 半角映射表 (num: 52)
F2H_ASCII_LETTERS = {
chr(ord(char) + 65248): char
ord(char) + 65248: ord(char)
for char in string.ascii_letters
}
@ -27,12 +27,12 @@ F2H_ASCII_LETTERS = {
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
# 数字字符全角 -> 半角映射表 (num: 10)
F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
# 数字字符半角 -> 全角映射表
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
# 标点符号全角 -> 半角映射表 (num: 32)
F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
# 标点符号半角 -> 全角映射表
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}

@ -74,6 +74,17 @@ class TextNormalizer():
def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace('/', '')
sentence = sentence.replace('~', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
return sentence

Loading…
Cancel
Save