PaddleSpeech/third_party/python-pinyin/pinyin-data/unihan/parse_pinyin.py

# -*- coding: utf-8 -*-
import functools
import operator
import re


def re_match_pinyin_line(kind):
    # Match one Unihan reading line, e.g. "U+3400\tkMandarin\tqiū",
    # capturing the code point and the raw field value.
    return re.compile(
        r'^U\+(?P<code>[0-9A-Z]+)\t{}\t(?P<pinyin>.+)$'.format(kind)
    )
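
# Illustration (the sample line follows the Unihan_Readings.txt format; the
# value shown for U+3400 is an assumption):
#
#     >>> re_match_pinyin_line('kMandarin').match(
#     ...     'U+3400\tkMandarin\tqiū').groupdict()
#     {'code': '3400', 'pinyin': 'qiū'}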

# A pinyin syllable: any run of characters that is not a digit, dot or comma.
PINYIN = r'[^\d\.,]+'
# kHanyuPinyin values look like "10019.020:tiàn" -- one or more Hànyǔ Dà
# Zìdiǎn locations, a colon, then one or more readings.
re_khanyupinyin = re.compile(r'''
    (?:\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:
    ((?:%(pinyin)s,)*)
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kMandarin values are bare readings; the two empty groups keep findall()
# returning 3-tuples of the same shape as the other fields.
re_kmandarin = re.compile(r'''
    ()()
    ({pinyin})
'''.format(pinyin=PINYIN), re.X)
# kXHC1983 values look like "0229.010:dīng" (Xiàndài Hànyǔ Cídiǎn, 1983).
re_kxhc1983 = re.compile(r'''
    ()()[0-9]{4}\.[0-9]{3}\*?
    (?:,[0-9]{4}\.[0-9]{3}\*?)*:
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kHanyuPinlu values look like "yī(32747)" -- a reading plus its frequency.
re_khanyupinlu = re.compile(r'''
    ()()({pinyin})\([0-9]+\)
'''.format(pinyin=PINYIN), re.X)
# kTGHZ2013 values look like "482.140:zuò" (Tōngyòng Guīfàn Hànzì Zìdiǎn).
re_ktghz2013 = re.compile(r'''
    ()()[0-9]{3}\.[0-9]{3}
    (?:,[0-9]{3}\.[0-9]{3})*:
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
re_kinds_map = {
    'kHanyuPinyin': re_khanyupinyin,
    'kMandarin': re_kmandarin,
    'kXHC1983': re_kxhc1983,
    'kHanyuPinlu': re_khanyupinlu,
    'kTGHZ2013': re_ktghz2013,
}
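
# A quick sanity check of the field regexes (illustrative values that follow
# the Unihan format; they are assumptions, not taken from this repository):
#
#     >>> re_kmandarin.findall('qiū')
#     [('', '', 'qiū')]
#     >>> re_khanyupinlu.findall('yī(32747)')
#     [('', '', 'yī')]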


def remove_dup_items(lst):
    """Remove duplicate items while preserving the original order."""
    new_list = []
    for item in lst:
        if item not in new_list:
            new_list.append(item)
    return new_list
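
# For example (order preserved, later duplicates dropped):
#
#     >>> remove_dup_items(['lǔ', 'xī', 'lǔ'])
#     ['lǔ', 'xī']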


def parse(lines, kind='kHanyuPinyin', ignore_prefix='#'):
    """Yield (code point, comma-joined pinyin) pairs for the given field."""
    re_line = re_match_pinyin_line(kind)
    re_pinyin = re_kinds_map[kind]
    for line in lines:
        line = line.strip()
        if line.startswith(ignore_prefix):
            continue
        match = re_line.match(line)
        if match is None:
            continue

        code = match.group('code')
        raw_pinyin = match.group('pinyin')
        raw_pinyins = re_pinyin.findall(raw_pinyin)
        # Handle values with three or more pinyin readings; raw_pinyins then
        # looks like [(' xī,', 'lǔ '), (' lǔ,', 'xī')] or [('shú,dú,', 'tù')],
        # so split every captured group on commas.
        for n, values in enumerate(raw_pinyins):
            value = []
            for v in values:
                value.extend(v.split(','))
            raw_pinyins[n] = value
        # Flatten, trim whitespace, and drop empty captures and duplicates.
        # The [] initializer keeps reduce() from raising a TypeError on a
        # value the field regex cannot parse at all.
        pinyins = functools.reduce(operator.add, raw_pinyins, [])
        pinyins = [x.strip() for x in pinyins if x.strip()]
        pinyins = remove_dup_items(pinyins)
        pinyin = ','.join(pinyins)
        yield code, pinyin
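
# Example of how several readings are merged and comma-joined (the input
# line is an assumed Unihan-style value, not real data from this file):
#
#     >>> list(parse(['U+4E01\tkXHC1983\t0229.010:dīng 1692.090:zhēng'],
#     ...            kind='kXHC1983'))
#     [('4E01', 'dīng,zhēng')]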


def save_data(pinyins, writer):
    for code, pinyin in pinyins:
        # Turn the hexadecimal code point into the character itself.
        hanzi = chr(int(code, 16))
        line = 'U+{code}: {pinyin} # {hanzi}\n'.format(
            code=code, pinyin=pinyin, hanzi=hanzi
        )
        writer.write(line)


if __name__ == '__main__':
    with open('Unihan_Readings.txt') as fp:
        for kind in ('kHanyuPinyin', 'kMandarin',
                     'kHanyuPinlu', 'kXHC1983', 'kTGHZ2013'):
            fp.seek(0)  # re-read the whole file for every field
            with open('{}.txt'.format(kind), 'w') as writer:
                pinyins = parse(fp.readlines(), kind=kind)
                save_data(pinyins, writer)
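
# Running this script in a directory containing Unihan_Readings.txt writes one
# output file per field; kMandarin.txt, for example, would contain lines like
# (illustrative value): "U+3400: qiū # 㐀".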