|
|
|
@ -65,7 +65,7 @@ class TextNormalizer():
|
|
|
|
|
if lang == "zh":
|
|
|
|
|
text = text.replace(" ", "")
|
|
|
|
|
# Filter out special characters
|
|
|
|
|
text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)
|
|
|
|
|
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text)
|
|
|
|
|
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
|
|
|
|
text = text.strip()
|
|
|
|
|
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
|
|
|
@ -110,7 +110,8 @@ class TextNormalizer():
|
|
|
|
|
sentence = sentence.replace('χ', '器')
|
|
|
|
|
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
|
|
|
|
|
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
|
|
|
|
|
sentence = sentence.replace("——", "--")
|
|
|
|
|
# Filter special characters again; this pattern also removes "-", unlike the one applied to "zh" text before sentence splitting.
|
|
|
|
|
sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence)
|
|
|
|
|
return sentence
|
|
|
|
|
|
|
|
|
|
def normalize_sentence(self, sentence: str) -> str:
|
|
|
|
@ -149,6 +150,5 @@ class TextNormalizer():
|
|
|
|
|
|
|
|
|
|
def normalize(self, text: str) -> List[str]:
    """Split ``text`` into sentences and normalize each one.

    Args:
        text: Input text; it is handed to ``self._split`` unchanged.

    Returns:
        One entry per piece produced by ``self._split``, each being the
        result of ``self.normalize_sentence`` for that piece, in the
        same order.
    """
    return [self.normalize_sentence(piece) for piece in self._split(text)]
|
|
|
|
|