update g2pW dict

pull/2230/head
BarryKCL 2 years ago
parent 6593c24968
commit a84b40ef79

@@ -1345,7 +1345,7 @@ g2pw_onnx_models = {
'url': 'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel.tar', 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel.tar',
'md5': 'md5':
'86a3dd8db0291c575c46e134111dce23', '63bc0894af15a5a591e58b2130a2bcac',
}, },
}, },
} }

@@ -8,7 +8,7 @@ import onnxruntime
import numpy as np import numpy as np
from opencc import OpenCC from opencc import OpenCC
from pypinyin import pinyin, lazy_pinyin, Style
from paddlenlp.transformers import BertTokenizer from paddlenlp.transformers import BertTokenizer
from paddlespeech.utils.env import MODEL_HOME from paddlespeech.utils.env import MODEL_HOME
from paddlespeech.t2s.frontend.g2pw.dataset import prepare_data,\ from paddlespeech.t2s.frontend.g2pw.dataset import prepare_data,\
@@ -127,6 +127,7 @@ class G2PWOnnxConverter:
} }
texts, query_ids, sent_ids, partial_results = [], [], [], [] texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences): for sent_id, sent in enumerate(sentences):
pypinyin_result = pinyin(sent,style=Style.TONE3)
partial_result = [None] * len(sent) partial_result = [None] * len(sent)
for i, char in enumerate(sent): for i, char in enumerate(sent):
if char in polyphonic_chars: if char in polyphonic_chars:
@@ -136,6 +137,7 @@ class G2PWOnnxConverter:
elif char in monophonic_chars_dict: elif char in monophonic_chars_dict:
partial_result[i] = self.style_convert_func(monophonic_chars_dict[char]) partial_result[i] = self.style_convert_func(monophonic_chars_dict[char])
elif char in self.char_bopomofo_dict: elif char in self.char_bopomofo_dict:
partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0]) partial_result[i] = pypinyin_result[i][0]
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
partial_results.append(partial_result) partial_results.append(partial_result)
return texts, query_ids, sent_ids, partial_results return texts, query_ids, sent_ids, partial_results

@@ -1,5 +1,26 @@
polyphonic: polyphonic:
湖泊: ['hu2','po1'] 湖泊: ['hu2','po1']
地壳: ['di4','qiao4']
柏树: ['bai3','shu4']
曝光: ['bao4','guang1']
弹力: ['tan2','li4'] 弹力: ['tan2','li4']
字帖: ['zi4','tie4']
口吃: ['kou3','chi1']
包扎: ['bao1','za1']
哪吒: ['ne2','zha1']
说服: ['shuo1','fu2']
识字: ['shi2','zi4']
骨头: ['gu3','tou5']
对称: ['dui4','chen4']
口供: ['kou3','gong4']
抹布: ['ma1','bu4']
露背: ['lu4','bei4']
圈养: ['juan4', 'yang3']
眼眶: ['yan3', 'kuang4']
品行: ['pin3','xing2']
颤抖: ['chan4','dou3'] 颤抖: ['chan4','dou3']
差不多: ['cha4','bu5','duo1']
鸭绿江: ['ya1','lu4','jiang1'] 鸭绿江: ['ya1','lu4','jiang1']
撒切尔: ['sa4','qie4','er3']
比比皆是: ['bi3','bi3','jie1','shi4']
身无长物: ['shen1','wu2','chang2','wu4']
Loading…
Cancel
Save