You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/pinyin-data/tools/gen_8105.py

49 lines
1.3 KiB

# -*- coding: utf-8 -*-
"""Generate the initial kMandarin_8105.txt."""
from merge_unihan import parse_pinyins, code_to_hanzi
def parse_china_x():
    """Yield the first field of every data line in the 8105 character list.

    Reads ``tools/china-8105-06062014.txt`` (path relative to the current
    working directory). Each line is stripped of surrounding whitespace;
    empty lines and lines beginning with ``#`` are ignored. For every
    remaining line the first whitespace-separated token is yielded.

    This is a generator, so the file is only opened once iteration starts.
    """
    with open('tools/china-8105-06062014.txt') as source:
        for raw in source:
            entry = raw.strip()
            # Keep only non-empty, non-comment lines.
            if entry and not entry.startswith('#'):
                yield entry.split()[0]
def parse_zdic():
    """Parse ``zdic.txt`` and return the result of ``parse_pinyins``.

    The file is opened relative to the current working directory and is
    closed before this function returns.

    NOTE(review): the return type is whatever ``merge_unihan.parse_pinyins``
    produces; ``diff`` uses it as a mapping whose values are indexable --
    confirm against ``merge_unihan``.
    """
    with open('zdic.txt') as source:
        parsed = parse_pinyins(source)
    return parsed
def parse_kmandain():
    """Parse ``pinyin.txt`` and return the result of ``parse_pinyins``.

    The file is opened relative to the current working directory and is
    closed before this function returns.

    NOTE(review): the return type is whatever ``merge_unihan.parse_pinyins``
    produces; ``diff`` uses it as a mapping whose values are indexable --
    confirm against ``merge_unihan``.
    """
    with open('pinyin.txt') as source:
        parsed = parse_pinyins(source)
    return parsed
def diff(kmandarin, zdic, commons):
    """Yield one output line per key in ``commons``.

    Args:
        kmandarin: mapping from key to an indexable collection; the first
            element is used as the preferred value.
        zdic: mapping of the same shape, used as a fallback and for
            comparison.
        commons: iterable of keys to report on, in output order.

    Yields:
        For each key, with ``hanzi = code_to_hanzi(key)``:

        * key in ``kmandarin`` and in ``zdic`` with a different first value:
          ``'<key>: <kmandarin value> # <hanzi> -> <zdic value>'``
        * key in ``kmandarin`` otherwise:
          ``'<key>: <kmandarin value> # <hanzi>'``
        * key only in ``zdic``: ``'<key>: <zdic value> # <hanzi>'``
        * key in neither: a commented-out placeholder
          ``'# <key>: <- # <hanzi>'``
    """
    for code in commons:
        char = code_to_hanzi(code)

        if code not in kmandarin:
            if code in zdic:
                # Fall back to the zdic value when kmandarin has no entry.
                yield '{0}: {1} # {2}'.format(code, zdic[code][0], char)
            else:
                # No source knows this key: emit a commented placeholder.
                yield '# {0}: {1} # {2}'.format(code, '<-', char)
            continue

        reading = kmandarin[code][0]
        if code in zdic:
            alternative = zdic[code][0]
            if reading != alternative:
                # Both sources have the key but disagree: note the zdic value.
                yield '{0}: {1} # {2} -> {3}'.format(
                    code, reading, char, alternative
                )
                continue
        yield '{0}: {1} # {2}'.format(code, reading, char)
if __name__ == '__main__':
    # Load both parsed sources first, then stream the key list through
    # diff() and print each resulting line to stdout.
    zdic_map = parse_zdic()
    kmandarin_map = parse_kmandain()
    for entry in diff(kmandarin_map, zdic_map, parse_china_x()):
        print(entry)