PaddleSpeech/third_party/python-pinyin/pinyin-data/unihan/parse_pinyin.py

# -*- coding: utf-8 -*-
import functools
import operator
import re


def re_match_pinyin_line(kind):
    # Match one Unihan reading line, e.g. "U+3400\tkMandarin\tqiū",
    # capturing the code point and the raw field value.
    return re.compile(
        r'^U\+(?P<code>[0-9A-Z]+)\t{}\t(?P<pinyin>.+)$'.format(kind)
    )
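
# Illustration (the sample line follows the Unihan_Readings.txt format; the
# value shown for U+3400 is an assumption):
#
#     >>> re_match_pinyin_line('kMandarin').match(
#     ...     'U+3400\tkMandarin\tqiū').groupdict()
#     {'code': '3400', 'pinyin': 'qiū'}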

# A pinyin syllable: any run of characters that is not a digit, dot or comma.
PINYIN = r'[^\d\.,]+'
# kHanyuPinyin values look like "10019.020:tiàn" -- one or more Hànyǔ Dà
# Zìdiǎn locations, a colon, then one or more readings.
re_khanyupinyin = re.compile(r'''
    (?:\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:
    ((?:%(pinyin)s,)*)
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kMandarin values are bare readings; the two empty groups keep findall()
# returning 3-tuples of the same shape as the other fields.
re_kmandarin = re.compile(r'''
    ()()
    ({pinyin})
'''.format(pinyin=PINYIN), re.X)
# kXHC1983 values look like "0229.010:dīng" (Xiàndài Hànyǔ Cídiǎn, 1983).
re_kxhc1983 = re.compile(r'''
    ()()[0-9]{4}\.[0-9]{3}\*?
    (?:,[0-9]{4}\.[0-9]{3}\*?)*:
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kHanyuPinlu values look like "yī(32747)" -- a reading plus its frequency.
re_khanyupinlu = re.compile(r'''
    ()()({pinyin})\([0-9]+\)
'''.format(pinyin=PINYIN), re.X)
# kTGHZ2013 values look like "482.140:zuò" (Tōngyòng Guīfàn Hànzì Zìdiǎn).
re_ktghz2013 = re.compile(r'''
    ()()[0-9]{3}\.[0-9]{3}
    (?:,[0-9]{3}\.[0-9]{3})*:
    (%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
re_kinds_map = {
    'kHanyuPinyin': re_khanyupinyin,
    'kMandarin': re_kmandarin,
    'kXHC1983': re_kxhc1983,
    'kHanyuPinlu': re_khanyupinlu,
    'kTGHZ2013': re_ktghz2013,
}
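
# A quick sanity check of the field regexes (illustrative values that follow
# the Unihan format; they are assumptions, not taken from this repository):
#
#     >>> re_kmandarin.findall('qiū')
#     [('', '', 'qiū')]
#     >>> re_khanyupinlu.findall('yī(32747)')
#     [('', '', 'yī')]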


def remove_dup_items(lst):
    """Remove duplicate items while preserving the original order."""
    new_list = []
    for item in lst:
        if item not in new_list:
            new_list.append(item)
    return new_list
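
# For example (order preserved, later duplicates dropped):
#
#     >>> remove_dup_items(['lǔ', 'xī', 'lǔ'])
#     ['lǔ', 'xī']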


def parse(lines, kind='kHanyuPinyin', ignore_prefix='#'):
    """Yield (code point, comma-joined pinyin) pairs for the given field."""
    re_line = re_match_pinyin_line(kind)
    re_pinyin = re_kinds_map[kind]
    for line in lines:
        line = line.strip()
        if line.startswith(ignore_prefix):
            continue
        match = re_line.match(line)
        if match is None:
            continue

        code = match.group('code')
        raw_pinyin = match.group('pinyin')
        raw_pinyins = re_pinyin.findall(raw_pinyin)
        # Handle values with three or more pinyin readings; raw_pinyins then
        # looks like [(' xī,', 'lǔ '), (' lǔ,', 'xī')] or [('shú,dú,', 'tù')],
        # so split every captured group on commas.
        for n, values in enumerate(raw_pinyins):
            value = []
            for v in values:
                value.extend(v.split(','))
            raw_pinyins[n] = value
        # Flatten, trim whitespace, and drop empty captures and duplicates.
        # The [] initializer keeps reduce() from raising a TypeError on a
        # value the field regex cannot parse at all.
        pinyins = functools.reduce(operator.add, raw_pinyins, [])
        pinyins = [x.strip() for x in pinyins if x.strip()]
        pinyins = remove_dup_items(pinyins)
        pinyin = ','.join(pinyins)
        yield code, pinyin
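
# Example of how several readings are merged and comma-joined (the input
# line is an assumed Unihan-style value, not real data from this file):
#
#     >>> list(parse(['U+4E01\tkXHC1983\t0229.010:dīng 1692.090:zhēng'],
#     ...            kind='kXHC1983'))
#     [('4E01', 'dīng,zhēng')]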


def save_data(pinyins, writer):
    for code, pinyin in pinyins:
        # Turn the hexadecimal code point into the character itself.
        hanzi = chr(int(code, 16))
        line = 'U+{code}: {pinyin} # {hanzi}\n'.format(
            code=code, pinyin=pinyin, hanzi=hanzi
        )
        writer.write(line)


if __name__ == '__main__':
    with open('Unihan_Readings.txt') as fp:
        for kind in ('kHanyuPinyin', 'kMandarin',
                     'kHanyuPinlu', 'kXHC1983', 'kTGHZ2013'):
            fp.seek(0)  # re-read the whole file for every field
            with open('{}.txt'.format(kind), 'w') as writer:
                pinyins = parse(fp.readlines(), kind=kind)
                save_data(pinyins, writer)
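
# Running this script in a directory containing Unihan_Readings.txt writes one
# output file per field; kMandarin.txt, for example, would contain lines like
# (illustrative value): "U+3400: qiū # 㐀".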