You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/pinyin-data/tools/gen_gb_pua.py

167 lines
5.8 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
3 years ago
# -*- coding: utf-8 -*-
import re
import sys
sys.path.append('.')
from merge_unihan import parse_pinyins
def get_pinyins(file_path):
    """Load the pinyin table stored at ``file_path``.

    The file is opened as text and the open stream is handed to
    ``parse_pinyins`` (imported from ``merge_unihan``); whatever that helper
    returns is returned unchanged.

    :param file_path: path of the pinyin data file to read.
    :return: the result of ``parse_pinyins`` for that file.
    """
    # Decode explicitly as UTF-8: without an ``encoding`` argument ``open``
    # falls back to the locale's preferred encoding, so reading non-ASCII
    # pinyin would succeed or fail depending on the machine.
    with open(file_path, encoding='utf-8') as fp:
        return parse_pinyins(fp)
def get_pua_map():
    """Yield the code-point pairs described by the embedded PUA table.

    Every data row of the table starts with a bare hexadecimal code followed
    by three ``<CODE POINT> (<glyph>)`` cells, which this function names
    ``gbk``, ``gb_18030`` and ``unicode_4_1`` in that order.  Rows starting
    with ``#`` are disabled entries and are skipped, as are blank rows.

    Cells are located by pattern rather than by splitting on a tab, so a row
    parses the same whether its columns are separated by tabs or by spaces.

    :return: a generator of ``(gbk, unicode_4_1)`` tuples, each element being
        the value ``get_han_point`` returns for that cell.
    :raises ValueError: if a data row does not contain exactly three cells.
    """
    text = '''
# A6D9 E78D () FE10 (︐)
# A6DA E78E () FE12 (︒)
# A6DB E78F () FE11 (︑)
# A6DC E790 () FE13 (︓)
# A6DD E791 () FE14 (︔)
# A6DE E792 () FE15 (︕)
# A6DF E793 () FE16 (︖)
# A6EC E794 () FE17 (︗)
# A6ED E795 () FE18 (︘)
# A8BC E7C7 () 1E3F (ḿ) 1E3F (ḿ)
# A8BF E7C8 () 01F9 (ǹ) 01F9 (ǹ)
# A989 E7E7 () 303E (〾) 303E (〾)
# A98A E7E8 () 2FF0 (⿰) 2FF0 (⿰)
# A98B E7E9 () 2FF1 (⿱) 2FF1 (⿱)
# A98C E7EA () 2FF2 (⿲) 2FF2 (⿲)
# A98D E7EB () 2FF3 (⿳) 2FF3 (⿳)
# A98E E7EC () 2FF4 (⿴) 2FF4 (⿴)
# A98F E7ED () 2FF5 (⿵) 2FF5 (⿵)
# A990 E7EE () 2FF6 (⿶) 2FF6 (⿶)
# A991 E7EF () 2FF7 (⿷) 2FF7 (⿷)
# A992 E7F0 () 2FF8 (⿸) 2FF8 (⿸)
# A993 E7F1 () 2FF9 (⿹) 2FF9 (⿹)
# A994 E7F2 () 2FFA (⿺) 2FFA (⿺)
# A995 E7F3 () 2FFB (⿻) 2FFB (⿻)
FE50 E815 () 2E81 () 2E81 ()
FE51 E816 () E816 () 20087 (𠂇)
FE52 E817 () E817 () 20089 (𠂉)
FE53 E818 () E818 () 200CC (𠃌)
FE54 E819 () 2E84 () 2E84 ()
FE55 E81A () 3473 () 3473 ()
FE56 E81B () 3447 () 3447 ()
FE57 E81C () 2E88 () 2E88 ()
FE58 E81D () 2E8B () 2E8B ()
FE59 E81E () E81E () 9FB4 ()
FE5A E81F () 359E () 359E ()
FE5B E820 () 361A () 361A ()
FE5C E821 () 360E () 360E ()
FE5D E822 () 2E8C () 2E8C ()
FE5E E823 () 2E97 () 2E97 ()
FE5F E824 () 396E () 396E ()
FE60 E825 () 3918 () 3918 ()
FE61 E826 () E826 () 9FB5 ()
FE62 E827 () 39CF () 39CF ()
FE63 E828 () 39DF () 39DF ()
FE64 E829 () 3A73 () 3A73 ()
FE65 E82A () 39D0 () 39D0 ()
FE66 E82B () E82B () 9FB6 ()
FE67 E82C () E82C () 9FB7 ()
FE68 E82D () 3B4E () 3B4E ()
FE69 E82E () 3C6E () 3C6E ()
FE6A E82F () 3CE0 () 3CE0 ()
FE6B E830 () 2EA7 () 2EA7 ()
FE6C E831 () E831 () 215D7 (𡗗)
FE6D E832 () E832 () 9FB8 ()
FE6E E833 () 2EAA () 2EAA ()
FE6F E834 () 4056 () 4056 ()
FE70 E835 () 415F () 415F ()
FE71 E836 () 2EAE () 2EAE ()
FE72 E837 () 4337 () 4337 ()
FE73 E838 () 2EB3 () 2EB3 ()
FE74 E839 () 2EB6 () 2EB6 ()
FE75 E83A () 2EB7 () 2EB7 ()
FE76 E83B () E83B () 2298F (𢦏)
FE77 E83C () 43B1 () 43B1 ()
FE78 E83D () 43AC () 43AC ()
FE79 E83E () 2EBB () 2EBB ()
FE7A E83F () 43DD () 43DD ()
FE7B E840 () 44D6 () 44D6 ()
FE7C E841 () 4661 () 4661 ()
FE7D E842 () 464C () 464C ()
FE7E E843 () E843 () 9FB9 ()
FE80 E844 () 4723 () 4723 ()
FE81 E845 () 4729 () 4729 ()
FE82 E846 () 477C () 477C ()
FE83 E847 () 478D () 478D ()
FE84 E848 () 2ECA () 2ECA ()
FE85 E849 () 4947 () 4947 ()
FE86 E84A () 497A () 497A ()
FE87 E84B () 497D () 497D ()
FE88 E84C () 4982 () 4982 ()
FE89 E84D () 4983 () 4983 ()
FE8A E84E () 4985 () 4985 ()
FE8B E84F () 4986 () 4986 ()
FE8C E850 () 499F () 499F ()
FE8D E851 () 499B () 499B ()
FE8E E852 () 49B7 () 49B7 ()
FE8F E853 () 49B6 () 49B6 ()
FE90 E854 () E854 () 9FBA ()
FE91 E855 () E855 () 241FE (𤇾)
FE92 E856 () 4CA3 () 4CA3 ()
FE93 E857 () 4C9F () 4C9F ()
FE94 E858 () 4CA0 () 4CA0 ()
FE95 E859 () 4CA1 () 4CA1 ()
FE96 E85A () 4C77 () 4C77 ()
FE97 E85B () 4CA2 () 4CA2 ()
FE98 E85C () 4D13 () 4D13 ()
FE99 E85D () 4D14 () 4D14 ()
FE9A E85E () 4D15 () 4D15 ()
FE9B E85F () 4D16 () 4D16 ()
FE9C E860 () 4D17 () 4D17 ()
FE9D E861 () 4D18 () 4D18 ()
FE9E E862 () 4D19 () 4D19 ()
FE9F E863 () 4DAE () 4DAE ()
FEA0 E864 () E864 () 9FBB ()
'''.strip()
    # One "<CODE POINT> (<glyph>)" cell.  The leading bare code of a row is
    # never matched because it is not followed by a parenthesised glyph.
    cell_pattern = re.compile(r'[0-9A-F]+ \([^)]*\)')
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        cells = cell_pattern.findall(line)
        if len(cells) != 3:
            raise ValueError(
                'malformed PUA table row: {0!r}'.format(line))
        # Only the first and the last mapping are needed by the caller.
        gbk, _gb_18030, unicode_4_1 = cells
        yield get_han_point(gbk), get_han_point(unicode_4_1)
# Matches one "<CODE POINT> (<glyph>)" cell, e.g. "1E3F (x)".  The glyph part
# may be empty so that a cell whose character is missing still yields its
# code point.  Compiled once instead of on every call.
_HAN_POINT_RE = re.compile(r'(?P<point>[A-Z0-9]+) \((?P<han>[^\)]*)\)')


def get_han_point(text):
    """Split a ``<CODE POINT> (<glyph>)`` cell into its two parts.

    Only the first such cell found in ``text`` is used.

    :param text: the cell text, e.g. ``'20087 (x)'``; may be empty.
    :return: a ``(point, han)`` tuple of strings.  ``('', '')`` is returned
        for empty ``text``; ``han`` is ``''`` when the parentheses are empty.
    :raises ValueError: if ``text`` is non-empty but contains no cell
        (previously this surfaced as a bare ``IndexError``).
    """
    if not text:
        return '', ''
    match = _HAN_POINT_RE.search(text)
    if match is None:
        raise ValueError(
            'no "<CODE POINT> (<glyph>)" cell found in {0!r}'.format(text))
    return match.group('point'), match.group('han')
def point_to_u_point(point):
    """Return ``point`` upper-cased and carrying a ``U+`` prefix.

    :param point: a code point string, with or without a leading ``U+``.
    :return: the upper-cased string, prefixed with ``U+`` unless the
        upper-cased form already starts with it.
    """
    normalized = point.upper()
    if normalized.startswith('U+'):
        return normalized
    return 'U+' + normalized
def gen_pua_data(gbk, unicode_4_1, pinyin_map):
    """Format one output line for a PUA mapping.

    :param gbk: ``(point, han)`` pair for the GBK side of the mapping.
    :param unicode_4_1: ``(point, han)`` pair for the Unicode 4.1 side.
    :param pinyin_map: mapping looked up with the ``U+``-prefixed Unicode 4.1
        code point; the value is joined with ``,`` to form the pinyin field.
    :return: the formatted line.  When no pinyin is found for the Unicode 4.1
        code point the line is prefixed with ``# ``.
    """
    pua_raw, pua_glyph = gbk
    pua_code = point_to_u_point(pua_raw)
    target_raw, target_glyph = unicode_4_1
    target_code = point_to_u_point(target_raw)

    readings = ','.join(pinyin_map.get(target_code, []))
    # Entries without a known reading are emitted commented out.
    marker = '' if readings else '# '

    template = (
        '{prefix}{gbk_point}: {pinyins} # {gbk_han} '
        'Unihan: {unicode_4_1_point} {unicode_4_1_han}'
    )
    return template.format(
        prefix=marker,
        gbk_point=pua_code,
        pinyins=readings,
        gbk_han=pua_glyph,
        unicode_4_1_point=target_code,
        unicode_4_1_han=target_glyph,
    )
if __name__ == '__main__':
    # Script entry point: read ``pinyin.txt`` from the current directory and
    # print a header followed by one formatted line per PUA table entry.
    known_pinyins = get_pinyins('pinyin.txt')
    banner = ('# GBK/GB 18030 PUA 映射\n'
              '# 详见https://zh.wikipedia.org/wiki/GB_18030#PUA')
    print(banner)
    for gbk_pair, unihan_pair in get_pua_map():
        entry = gen_pua_data(gbk_pair, unihan_pair, known_pinyins)
        print(entry)