From e1a495a8e6ec3c2f931cb962b29d4160355992f3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 16 Aug 2022 04:54:19 +0000 Subject: [PATCH] fix tone sand_hi bugs for Chinese frontend --- examples/other/g2p/README.md | 6 +- paddlespeech/resource/pretrained_models.py | 4 +- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 63 ++++++++++++------- paddlespeech/t2s/frontend/polyphonic.yaml | 20 +++++- paddlespeech/t2s/frontend/tone_sandhi.py | 72 ++++++++++++---------- paddlespeech/t2s/frontend/zh_frontend.py | 47 +++++++++----- 6 files changed, 135 insertions(+), 77 deletions(-) diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index 84f5fe23..a8f8f734 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -7,18 +7,18 @@ We use `WER` as an evaluation criterion. # Start Run the command below to get the results of the test. + ```bash ./run.sh ``` -The `avg WER` of g2p is: 0.028952373312476395 +The `avg WER` of g2p is: 0.024219452438490413 ```text ,--------------------------------------------------------------------. | ./exp/g2p/text.g2p | |--------------------------------------------------------------------| | SPKR | # Snt # Wrd | Corr Sub Del Ins Err S.Err | - |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.2 2.8 0.0 0.1 2.9 53.3 | + | Sum/Avg| 9996 299181 | 97.6 2.4 0.0 0.0 2.4 49.2 | `--------------------------------------------------------------------' ``` diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 9d9be0ac..872d564c 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -1359,9 +1359,9 @@ g2pw_onnx_models = { 'G2PWModel': { '1.0': { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel.tar', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.0.zip', 'md5': - '63bc0894af15a5a591e58b2130a2bcac', + '7e049a55547da840502cf99e8a64f20e', }, }, } diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 3a406ad2..9e708ec8 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -31,8 +31,11 @@ from paddlespeech.t2s.frontend.g2pw.dataset import get_char_phoneme_labels from paddlespeech.t2s.frontend.g2pw.dataset import get_phoneme_labels from paddlespeech.t2s.frontend.g2pw.dataset import prepare_onnx_input from paddlespeech.t2s.frontend.g2pw.utils import load_config +from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified from paddlespeech.utils.env import MODEL_HOME +model_version = '1.0' + def predict(session, onnx_input, labels): all_preds = [] @@ -62,34 +65,38 @@ class G2PWOnnxConverter: style='bopomofo', model_source=None, enable_non_tradional_chinese=False): - if not os.path.exists(os.path.join(model_dir, 'G2PWModel/g2pW.onnx')): - uncompress_path = download_and_decompress( - g2pw_onnx_models['G2PWModel']['1.0'], model_dir) + uncompress_path = download_and_decompress( + g2pw_onnx_models['G2PWModel'][model_version], model_dir) sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL sess_options.intra_op_num_threads = 2 self.session_g2pW = onnxruntime.InferenceSession( - os.path.join(model_dir, 'G2PWModel/g2pW.onnx'), + os.path.join(uncompress_path, 'g2pW.onnx'), sess_options=sess_options) self.config = load_config( - os.path.join(model_dir, 'G2PWModel/config.py'), use_default=True) + os.path.join(uncompress_path, 'config.py'), use_default=True) self.model_source = model_source if model_source else self.config.model_source self.enable_opencc = enable_non_tradional_chinese self.tokenizer = BertTokenizer.from_pretrained(self.config.model_source) - polyphonic_chars_path = os.path.join(model_dir, - 'G2PWModel/POLYPHONIC_CHARS.txt') - monophonic_chars_path = os.path.join(model_dir, - 'G2PWModel/MONOPHONIC_CHARS.txt') + polyphonic_chars_path = os.path.join(uncompress_path, + 'POLYPHONIC_CHARS.txt') + monophonic_chars_path = os.path.join(uncompress_path, + 'MONOPHONIC_CHARS.txt') self.polyphonic_chars = [ line.split('\t') for line in open(polyphonic_chars_path, encoding='utf-8').read() .strip().split('\n') ] + self.non_polyphonic = { + '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', + '肖', '瘙', '誒', '泊' + } + self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ line.split('\t') for line in open(monophonic_chars_path, encoding='utf-8').read() @@ -101,13 +108,27 @@ class G2PWOnnxConverter: self.polyphonic_chars) self.chars = sorted(list(self.char2phonemes.keys())) + + self.polyphonic_chars_new = set(self.chars) + for char in self.non_polyphonic: + if char in self.polyphonic_chars_new: + self.polyphonic_chars_new.remove(char) + + self.monophonic_chars_dict = { + char: phoneme + for char, phoneme in self.monophonic_chars + } + for char in self.non_monophonic: + if char in self.monophonic_chars_dict: + self.monophonic_chars_dict.pop(char) + self.pos_tags = [ 'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI' ] with open( - os.path.join(model_dir, - 'G2PWModel/bopomofo_to_pinyin_wo_tune_dict.json'), + os.path.join(uncompress_path, + 'bopomofo_to_pinyin_wo_tune_dict.json'), 'r', encoding='utf-8') as fr: self.bopomofo_convert_dict = json.load(fr) @@ -117,7 +138,7 @@ class G2PWOnnxConverter: }[style] with open( - os.path.join(model_dir, 'G2PWModel/char_bopomofo_dict.json'), + os.path.join(uncompress_path, 'char_bopomofo_dict.json'), 'r', encoding='utf-8') as fr: self.char_bopomofo_dict = json.load(fr) @@ -175,25 +196,25 @@ class G2PWOnnxConverter: return results def _prepare_data(self, sentences): - polyphonic_chars = set(self.chars) - monophonic_chars_dict = { - char: phoneme - for char, phoneme in self.monophonic_chars - } texts, query_ids, sent_ids, partial_results = [], [], [], [] for sent_id, sent in enumerate(sentences): - pypinyin_result = pinyin(sent, style=Style.TONE3) + # pypinyin works well for Simplified Chinese than Traditional Chinese + sent_s = tranditional_to_simplified(sent) + pypinyin_result = pinyin(sent_s, style=Style.TONE3) partial_result = [None] * len(sent) for i, char in enumerate(sent): - if char in polyphonic_chars: + if char in self.polyphonic_chars_new: texts.append(sent) query_ids.append(i) sent_ids.append(sent_id) - elif char in monophonic_chars_dict: + elif char in self.monophonic_chars_dict: partial_result[i] = self.style_convert_func( - monophonic_chars_dict[char]) + self.monophonic_chars_dict[char]) elif char in self.char_bopomofo_dict: partial_result[i] = pypinyin_result[i][0] # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0]) + else: + partial_result[i] = pypinyin_result[i][0] + partial_results.append(partial_result) return texts, query_ids, sent_ids, partial_results diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 629bcd26..2c7cf33f 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -23,4 +23,22 @@ polyphonic: 鸭绿江: ['ya1','lu4','jiang1'] 撒切尔: ['sa4','qie4','er3'] 比比皆是: ['bi3','bi3','jie1','shi4'] - 身无长物: ['shen1','wu2','chang2','wu4'] \ No newline at end of file + 身无长物: ['shen1','wu2','chang2','wu4'] + 手里: ['shou2','li3'] + 关卡: ['guan1','qia3'] + 怀揣: ['huai2','chuai1'] + 挑剔: ['tiao1','ti4'] + 供称: ['gong4','cheng1'] + 作坊: ['zuo1', 'fang5'] + 中医: ['zhong1','yi1'] + 嚷嚷: ['rang1','rang5'] + 商厦: ['shang1','sha4'] + 大厦: ['da4','sha4'] + 刹车: ['sha1','che1'] + 嘚瑟: ['de4','se5'] + 朝鲜: ['chao2','xian3'] + 阿房宫: ['e1','pang2','gong1'] + 阿胶: ['e1','jiao1'] + 咖喱: ['ga1','li5'] + 时分: ['shi2','fen1'] + 蚌埠: ['beng4','bu4'] diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index e3102b9b..e5ef617a 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -41,30 +41,32 @@ class ToneSandhi(): '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事', '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', - '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实', - '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', - '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', - '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', - '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', - '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', - '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', - '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', - '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', - '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', - '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', - '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', - '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', - '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', - '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', - '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', - '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', - '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', - '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', - '扫把', '惦记' + '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担', + '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨', + '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事', + '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股', + '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在', + '官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫', + '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫', + '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔', + '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚', + '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋', + '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气', + '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快', + '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊', + '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩', + '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水', + '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜', + '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划', + '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜', + '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄', + '将军', '别人' } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎", - "幺幺" + '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', + '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得', + '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打', + '考考', '整整', '莘莘' } self.punc = ":,;。?!“”‘’':,;.?!" @@ -75,27 +77,24 @@ class ToneSandhi(): # finals: ['ia1', 'i3'] def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]: - + if word in self.must_not_neural_tone_words: + return finals # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺 for j, item in enumerate(word): - if j - 1 >= 0 and item == word[j - 1] and pos[0] in { - "n", "v", "a" - } and word not in self.must_not_neural_tone_words: + if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") - if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": + if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": finals[-1] = finals[-1][:-1] + "5" elif len(word) >= 1 and word[-1] in "的地得": finals[-1] = finals[-1][:-1] + "5" # e.g. 走了, 看着, 去过 elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}: finals[-1] = finals[-1][:-1] + "5" - elif len(word) > 1 and word[-1] in "们子" and pos in { - "r", "n" - } and word not in self.must_not_neural_tone_words: + elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"}: finals[-1] = finals[-1][:-1] + "5" - # e.g. 桌上, 地下, 家里 - elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}: + # e.g. 桌上, 地下 + elif len(word) > 1 and word[-1] in "上下" and pos in {"s", "l", "f"}: finals[-1] = finals[-1][:-1] + "5" # e.g. 上来, 下去 elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开": @@ -147,7 +146,7 @@ class ToneSandhi(): for i, char in enumerate(word): if char == "一" and i + 1 < len(word): # "一" before tone4 should be yi2, e.g. 一段 - if finals[i + 1][-1] == "4": + if finals[i + 1][-1] in {'4', '5'}: finals[i] = finals[i][:-1] + "2" # "一" before non-tone4 should be yi4, e.g. 一天 else: @@ -239,7 +238,12 @@ class ToneSandhi(): for i, (word, pos) in enumerate(seg): if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][ 0] == seg[i + 1][0] and seg[i - 1][1] == "v": - new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + if i - 1 < len(new_seg): + new_seg[i - + 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0] + else: + new_seg.append([word, pos]) + new_seg.append([seg[i + 1][0], pos]) else: if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][ 0] == word and pos == "v": diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 9513a459..722eed60 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -84,6 +84,24 @@ class Frontend(): self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" + self.phrases_dict = { + '开户行': [['ka1i'], ['hu4'], ['hang2']], + '发卡行': [['fa4'], ['ka3'], ['hang2']], + '放款行': [['fa4ng'], ['kua3n'], ['hang2']], + '茧行': [['jia3n'], ['hang2']], + '行号': [['hang2'], ['ha4o']], + '各地': [['ge4'], ['di4']], + '借还款': [['jie4'], ['hua2n'], ['kua3n']], + '时间为': [['shi2'], ['jia1n'], ['we2i']], + '为准': [['we2i'], ['zhu3n']], + '色差': [['se4'], ['cha1']], + '嗲': [['dia3']], + '呗': [['bei5']], + '不': [['bu4']], + '咗': [['zuo5']], + '嘞': [['lei5']], + '掺和': [['chan1'], ['huo5']] + } # g2p_model can be pypinyin and g2pM and g2pW self.g2p_model = g2p_model if self.g2p_model == "g2pM": @@ -91,6 +109,8 @@ class Frontend(): self.pinyin2phone = generate_lexicon( with_tone=True, with_erhua=False) elif self.g2p_model == "g2pW": + # use pypinyin as backup for non polyphonic characters in g2pW + self._init_pypinyin() self.corrector = Polyphonic() self.g2pM_model = G2pM() self.g2pW_model = G2PWOnnxConverter( @@ -99,8 +119,10 @@ class Frontend(): with_tone=True, with_erhua=False) else: - self.__init__pypinyin() - self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"} + self._init_pypinyin() + self.must_erhua = { + "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" + } self.not_erhua = { "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿", "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", @@ -108,6 +130,7 @@ class Frontend(): "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", "狗儿" } + self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: @@ -121,20 +144,9 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) - def __init__pypinyin(self): + def _init_pypinyin(self): large_pinyin.load() - - load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]}) - load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]}) - load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]}) - load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]}) - load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]}) - load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]}) - load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]}) - load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]}) - load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]}) - load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]}) - + load_phrases_dict(self.phrases_dict) # 调整字的拼音顺序 load_single_dict({ord(u'地'): u'de,di4'}) @@ -258,7 +270,6 @@ class Frontend(): phones.append('sp') if v and v not in self.punc: phones.append(v) - phones_list.append(phones) if merge_sentences: merge_list = sum(phones_list, []) @@ -275,6 +286,10 @@ class Frontend(): finals: List[str], word: str, pos: str) -> List[List[str]]: + # fix er1 + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': + finals[i] = 'er2' if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): return initials, finals