PaddleSpeech/third_party/python-pinyin/pinyin-data/tools/improve_8105.py

# -*- coding: utf-8 -*-
"""补充 8105 中汉字的拼音数据"""
from collections import namedtuple
import re
import sys

from pyquery import PyQuery
import requests

re_pinyin = re.compile(r'拼音：(?P<pinyin>\S+) ')
re_code = re.compile(r'统一码\w?：(?P<code>\S+) ')
re_alternate = re.compile(r'异体字：\s+?(?P<alternate>\S+)')
HanziInfo = namedtuple('HanziInfo', 'pinyin code alternate')


def fetch_html(url, params):
    response = requests.get(url, params=params)
    return response.content


def fetch_info(hanzi):
    url = 'http://www.guoxuedashi.com/zidian/so.php'
    params = {
        'sokeyzi': hanzi,
        'kz': 1,
        'submit': '',
    }
    html = fetch_html(url, params)
    pq = PyQuery(html)
    pq = PyQuery(pq('table.zui td')[1])
    text = pq('tr').text()
    text_alternate = pq(html)('.info_txt2')('em').text()

    pinyin = ''
    pinyin_match = re_pinyin.search(text)
    if pinyin_match is not None:
        pinyin = pinyin_match.group('pinyin')
    code = re_code.search(text).group('code')
    alternate = ''
    alternate_match = re_alternate.search(text_alternate)
    if alternate_match is not None:
        alternate = alternate_match.group('alternate')

    return HanziInfo(pinyin, code, alternate)


def parse_hanzi(hanzi):
    info = fetch_info(hanzi)
    if (not info.pinyin) and info.alternate:
        alternate = fetch_info(info.alternate)
    else:
        alternate = ''
    return HanziInfo(info.pinyin, info.code, alternate)


def main(lines):
    for line in lines:
        if line.startswith('# U+') and '<-' in line:
            # # U+xxx ... -> U+xxx
            code = line.split(':')[0].strip('# ')
            # U+xxx -> xxx
            code = code[2:]
            info = parse_hanzi(code)
            pinyin = info.pinyin
            extra = ''
            if (not pinyin) and info.alternate:
                alternate = info.alternate
                pinyin = alternate.pinyin
                extra = '  => U+{0}'.format(alternate.code)
                if ',' in pinyin:
                    first_pinyin, extra_pinyin = pinyin.split(',', 1)
                    pinyin = first_pinyin
                    extra += '  ?-> ' + extra_pinyin
            if pinyin:
                line = line.strip()
                # # U+xxx -> U+xxx
                line = line[2:]
                line = line.replace('<-', pinyin)
                if extra:
                    line += extra
        yield line.strip()

if __name__ == '__main__':
    args = sys.argv[1:]
    input_file = args[0]
    with open(input_file) as fp:
        for line in main(fp):
            print(line)
-												E2E/Streaming Transformer/Conformer ASR (#578)

* add cmvn and label smoothing loss layer

* add layer for transformer

* add glu and conformer conv

* add torch compatiable hack, mask funcs

* not hack size since it exists

* add test; attention

* add attention, common utils, hack paddle

* add audio utils

* conformer batch padding mask bug fix #223

* fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2

* fix ci

* fix ci

* add encoder

* refactor egs

* add decoder

* refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils

* refactor docs

* add fix

* fix readme

* fix bugs, refactor collator, add pad_sequence, fix ckpt bugs

* fix docstring

* refactor data feed order

* add u2 model

* refactor cmvn, test

* add utils

* add u2 config

* fix bugs

* fix bugs

* fix autograd maybe has problem when using inplace operation

* refactor data, build vocab; add format data

* fix text featurizer

* refactor build vocab

* add fbank, refactor feature of speech

* refactor audio feat

* refactor data preprare

* refactor data

* model init from config

* add u2 bins

* flake8

* can train

* fix bugs, add coverage, add scripts

* test can run

* fix data

* speed perturb with sox

* add spec aug

* fix for train

* fix train logitc

* fix logger

* log valid loss, time dataset process

* using np for speed perturb, remove some debug log of grad clip

* fix logger

* fix build vocab

* fix logger name

* using module logger as default

* fix

* fix install

* reorder imports

* fix board logger

* fix logger

* kaldi fbank and mfcc

* fix cmvn and print prarams

* fix add_eos_sos and cmvn

* fix cmvn compute

* fix logger and cmvn

* fix subsampling, label smoothing loss, remove useless

* add notebook test

* fix log

* fix tb logger

* multi gpu valid

* fix log

* fix log

* fix config

* fix compute cmvn, need paddle 2.1

* add cmvn notebook

* fix layer tools

* fix compute cmvn

* add rtf

* fix decoding

* fix layer tools

* fix log, add avg script

* more avg and test info

* fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh;

* add vimrc

* refactor tiny script, add transformer and stream conf

* spm demo; librisppech scripts and confs

* fix log

* add librispeech scripts

* refactor data pipe; fix conf; fix u2 default params

* fix bugs

* refactor aishell scripts

* fix test

* fix cmvn

* fix s0 scripts

* fix ds2 scripts and bugs

* fix dev & test dataset filter

* fix dataset filter

* filter dev

* fix ckpt path

* filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test

* add comment

* add syllable doc

* fix ds2 configs

* add doc

* add pypinyin tools

* fix decoder using blank_id=0

* mmseg with pybind11

* format code
											
										
										
											3 years ago
+								# -*- coding: utf-8 -*-
 								"""补充 8105 中汉字的拼音数据"""
 								from collections import namedtuple
 								import re
 								import sys
 								from pyquery import PyQuery
 								import requests
 								re_pinyin = re.compile(r'拼音：(?P<pinyin>\S+) ')
 								re_code = re.compile(r'统一码\w?：(?P<code>\S+) ')
 								re_alternate = re.compile(r'异体字：\s+?(?P<alternate>\S+)')
 								HanziInfo = namedtuple('HanziInfo', 'pinyin code alternate')
 								def fetch_html(url, params):
 								    response = requests.get(url, params=params)
 								    return response.content
 								def fetch_info(hanzi):
 								    url = 'http://www.guoxuedashi.com/zidian/so.php'
 								    params = {
 								        'sokeyzi': hanzi,
 								        'kz': 1,
 								        'submit': '',
 								    }
 								    html = fetch_html(url, params)
 								    pq = PyQuery(html)
 								    pq = PyQuery(pq('table.zui td')[1])
 								    text = pq('tr').text()
 								    text_alternate = pq(html)('.info_txt2')('em').text()
 								    pinyin = ''
 								    pinyin_match = re_pinyin.search(text)
 								    if pinyin_match is not None:
 								        pinyin = pinyin_match.group('pinyin')
 								    code = re_code.search(text).group('code')
 								    alternate = ''
 								    alternate_match = re_alternate.search(text_alternate)
 								    if alternate_match is not None:
 								        alternate = alternate_match.group('alternate')
 								    return HanziInfo(pinyin, code, alternate)
 								def parse_hanzi(hanzi):
 								    info = fetch_info(hanzi)
 								    if (not info.pinyin) and info.alternate:
 								        alternate = fetch_info(info.alternate)
 								    else:
 								        alternate = ''
 								    return HanziInfo(info.pinyin, info.code, alternate)
 								def main(lines):
 								    for line in lines:
 								        if line.startswith('# U+') and '<-' in line:
 								            # # U+xxx ... -> U+xxx
 								            code = line.split(':')[0].strip('# ')
 								            # U+xxx -> xxx
 								            code = code[2:]
 								            info = parse_hanzi(code)
 								            pinyin = info.pinyin
 								            extra = ''
 								            if (not pinyin) and info.alternate:
 								                alternate = info.alternate
 								                pinyin = alternate.pinyin
 								                extra = '  => U+{0}'.format(alternate.code)
 								                if ',' in pinyin:
 								                    first_pinyin, extra_pinyin = pinyin.split(',', 1)
 								                    pinyin = first_pinyin
 								                    extra += '  ?-> ' + extra_pinyin
 								            if pinyin:
 								                line = line.strip()
 								                # # U+xxx -> U+xxx
 								                line = line[2:]
 								                line = line.replace('<-', pinyin)
 								                if extra:
 								                    line += extra
 								        yield line.strip()
 								if __name__ == '__main__':
 								    args = sys.argv[1:]
 								    input_file = args[0]
 								    with open(input_file) as fp:
 								        for line in main(fp):
 								            print(line)