From bd01bc155de267202588a821ccb0695952059e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20An=20=EF=BC=88An=20Hongliang=EF=BC=89?= Date: Mon, 28 Nov 2022 14:54:22 +0800 Subject: [PATCH] add greek char and fix issue2571 (#2683) Co-authored-by: TianYuan --- .../zh_normalization/text_normlization.py | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 1942e6661..1250e96ca 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -65,7 +65,7 @@ class TextNormalizer(): if lang == "zh": text = text.replace(" ", "") # 过滤掉特殊字符 - text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) + text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] @@ -85,7 +85,33 @@ class TextNormalizer(): sentence = sentence.replace('⑧', '八') sentence = sentence.replace('⑨', '九') sentence = sentence.replace('⑩', '十') - + sentence = sentence.replace('α', '阿尔法') + sentence = sentence.replace('β', '贝塔') + sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛') + sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔') + sentence = sentence.replace('ε', '艾普西龙') + sentence = sentence.replace('ζ', '捷塔') + sentence = sentence.replace('η', '依塔') + sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔') + sentence = sentence.replace('ι', '艾欧塔') + sentence = sentence.replace('κ', '喀帕') + sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达') + sentence = sentence.replace('μ', '缪') + sentence = sentence.replace('ν', '拗') + sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西') + sentence = sentence.replace('ο', '欧米克伦') + sentence = sentence.replace('π', '派').replace('Π', '派') + sentence = sentence.replace('ρ', '肉') + sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace( + 'σ', '西格玛') + sentence = sentence.replace('τ', '套') + sentence = sentence.replace('υ', '宇普西龙') + sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾') + sentence = sentence.replace('χ', '器') + sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛') + sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽') + # re filter special characters, have one more character "-" than line 68 + sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence) return sentence def normalize_sentence(self, sentence: str) -> str: @@ -124,6 +150,5 @@ class TextNormalizer(): def normalize(self, text: str) -> List[str]: sentences = self._split(text) - sentences = [self.normalize_sentence(sent) for sent in sentences] return sentences