using third party python pinyin

4 years ago · c77241cb0f
parent 6a8d0c3175
commit c77241cb0f
8 changed files with 170 additions and 13 deletions
--- a/third_party/phkit/phkit/__init__.py
+++ b/third_party/phkit/phkit/__init__.py
@ -100,7 +100,7 @@ readme_docs = [__doc__, version_doc,
 from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text
 from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text
-from .pinyinkit import lazy_pinyin, pinyin, slug, initialize
+from .pinyinkit import lazy_pinyin
 # 兼容0.1.0之前的版本，python3.7以上版本支持。
 from .chinese import convert, number, phoneme, sequence, symbol, style
--- a/third_party/phkit/phkit/chinese/convert.py
+++ b/third_party/phkit/phkit/chinese/convert.py
@ -8,9 +8,9 @@
 全角半角转换，简体繁体转换。
 """
-from hanziconv import hanziconv
+from .hanziconv import HanziConv
-hc = hanziconv.HanziConv()
+hc = HanziConv()
 # 繁体转简体
 fan2jian = hc.toSimplified
--- a/third_party/phkit/phkit/chinese/hanziconv.py
+++ b/third_party/phkit/phkit/chinese/hanziconv.py
@ -0,0 +1,99 @@
 # Copyright 2014 Bernard Yue
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 __doc__ = """
 Hanzi Converter 繁簡轉換器 | 繁简转换器
 This module provides functions converting chinese text between simplified and
 traditional characters.  It returns unicode represnetation of the text.
 Class HanziConv is the main entry point of the module, you can import the
 class by doing:
    >>> from hanziconv import HanziConv
 """
 import os
 from zhon import cedict
 class HanziConv():
    """Convert hanzi (漢字) text between the simplified and traditional
    forms."""
    # Character strings used for the position-based lookup in ``__convert``.
    # NOTE(review): the lookup pairs the two strings index by index, so it
    # assumes ``cedict.traditional`` and ``cedict.simplified`` are aligned
    # character for character -- confirm this holds for zhon's data.
    __traditional_charmap = cedict.traditional
    __simplified_charmap = cedict.simplified
    @classmethod
    def __convert(cls, text, toTraditional=True):
        """Map every character of `text` from one character set to the other.
        :param text:           data to convert; ``bytes`` input is decoded
                               as UTF-8 first
        :param toTraditional:  True -- convert to traditional text
                               False -- convert to simplified text
        :returns:              the converted string; characters not found
                               in the source map are kept unchanged
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        if toTraditional:
            source = cls.__simplified_charmap
            target = cls.__traditional_charmap
        else:
            source = cls.__traditional_charmap
            target = cls.__simplified_charmap
        converted = []
        for ch in text:
            # The position of the character in the source map selects its
            # counterpart in the target map.
            pos = source.find(ch)
            converted.append(ch if pos == -1 else target[pos])
        return ''.join(converted)
    @classmethod
    def toSimplified(cls, text):
        """Convert `text`, assumed to be a traditional character string, to
        simplified characters.
        :param text:  text to convert
        :returns:     the converted string
        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toSimplified('繁簡轉換器'))
        繁简转换器
        """
        return cls.__convert(text, toTraditional=False)
    @classmethod
    def toTraditional(cls, text):
        """Convert `text`, assumed to be a simplified character string, to
        traditional characters.
        :param text:  text to convert
        :returns:     the converted string
        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toTraditional('繁简转换器'))
        繁簡轉換器
        """
        return cls.__convert(text, toTraditional=True)
    @classmethod
    def same(cls, text1, text2):
        """Tell whether `text1` and `text2` are equal once both have been
        converted to simplified characters.
        :param text1: string to compare to ``text2``
        :param text2: string to compare to ``text1``
        :returns:     **True**  -- the simplified forms are equal,
                      **False** -- otherwise
        >>> from hanziconv import HanziConv
        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
        True
        """
        return cls.toSimplified(text1) == cls.toSimplified(text2)
--- a/third_party/phkit/phkit/chinese/number.py
+++ b/third_party/phkit/phkit/chinese/number.py
@ -19,6 +19,14 @@ _number_group_re = re.compile(r"([0-9]+)")
 def say_digit(num: str):
    """123 -> 一二三
    Args:
        num (str): [description]
    Returns:
        [type]: [description]
    """
    outs = []
    for zi in num:
        outs.append(_number_cn[int(zi)])
@ -31,6 +39,7 @@ def say_number(num: str):
        return _number_cn[0]
    elif len(x) > 16:
        return num
    length = len(x)
    outs = []
    for num, zi in enumerate(x):
--- a/third_party/phkit/phkit/pinyinkit/__init__.py
+++ b/third_party/phkit/phkit/pinyinkit/__init__.py
@ -3,8 +3,61 @@
 文本转拼音的模块，依赖python-pinyin，jieba，phrase-pinyin-data模块。
 """
 import re
-from .core import lazy_pinyin, pinyin, slug, Style, initialize
+#from .core import lazy_pinyin, Style
-from pypinyin.style import convert
+from .core import lazy_pinyin as lazy_pinyin_local
 from pypinyin import lazy_pinyin, Style, load_phrases_dict, load_phrases_dict
 def parse_pinyin_txt(inpath):
    """Read a single-character pinyin file into a dict keyed by code point.
    Data lines presumably look like ``U+4E2D: zhōng,zhòng  # 中`` (the exact
    pattern lives in ``_ziyin_re``, defined outside this block).  Lines
    starting with ``#`` are skipped; any other non-blank line that does not
    match, or whose character field is not exactly one character long, is
    printed and ignored.
    Args:
        inpath: path of the UTF-8 text file to read.
    Returns:
        dict mapping ``ord(character)`` to its comma-joined readings.
    """
    table = {}
    with open(inpath, encoding="utf8") as fin:
        for line in tqdm(fin, desc='load pinyin', ncols=80, mininterval=1):
            if line.startswith("#"):
                continue
            matched = _ziyin_re.search(line)
            if not matched:
                # Report unparseable content, but stay quiet on blank lines.
                if line.strip():
                    print(line)
                continue
            zi = matched.group(3).strip()
            if len(zi) != 1:
                print(line)
                continue
            readings = matched.group(2).strip().split(",")
            # A later line for the same character overrides an earlier one.
            table[ord(zi)] = ','.join(readings)
    return table
 def parse_phrase_txt(inpath):
    """Read a phrase pinyin file into a phrase -> per-character readings dict.
    Data lines look like ``一一对应: yī yī duì yìng``: a phrase, one colon, then
    one whitespace-separated reading per character.  Lines starting with
    ``#`` are skipped.  A line is accepted only when it has exactly one
    colon, as many readings as characters, and at least two characters;
    every other non-blank line is printed and ignored.  Blank lines and
    lines without a colon no longer raise ``IndexError``.
    Args:
        inpath: path of the UTF-8 text file to read.
    Returns:
        dict mapping each phrase to a list of one-element lists, one per
        character, e.g. ``{'一一对应': [['yī'], ['yī'], ['duì'], ['yìng']]}``.
    """
    outs = []
    with open(inpath, encoding="utf8") as fin:
        for line in tqdm(fin, desc='load phrase', ncols=80, mininterval=1):
            if line.startswith("#"):
                continue
            parts = line.split(":")
            # Check the field count before indexing: a blank or colon-less
            # line yields a single part and has no parts[1].
            if len(parts) == 2:
                zs = parts[0].strip()
                ps = parts[1].strip().split()
                if len(zs) == len(ps) and len(zs) >= 2:
                    outs.append([zs, ps])
                    continue
            # Report rejected content, but stay quiet on blank lines.
            if line.strip():
                print(line)
    return {zs: [[p] for p in ps] for zs, ps in outs}
 def initialize():
    """Load the bundled pinyin data files into pypinyin and initialise jieba.
    Reads ``phrase_pinyin.txt.py`` and ``single_pinyin.txt.py`` from the
    directory containing this module, registers their contents through
    ``load_phrases_dict`` and ``load_single_dict``, then calls
    ``jieba.initialize()``.
    NOTE(review): ``Path``, ``jieba`` and ``tqdm`` are not imported in the
    part of the module visible here -- confirm they are imported elsewhere.
    """
    # The module-level pypinyin import lists ``load_phrases_dict`` twice and
    # never brings in ``load_single_dict``; import it here so the call below
    # cannot fail with NameError.
    from pypinyin import load_single_dict
    # Load the data files that sit next to this module.
    data_dir = Path(__file__).absolute().parent
    inpath = data_dir.joinpath('phrase_pinyin.txt.py')
    _phrases_dict = parse_phrase_txt(inpath)
    load_phrases_dict(_phrases_dict)  # big:398815 small:36776
    inpath = data_dir.joinpath('single_pinyin.txt.py')
    _pinyin_dict = parse_pinyin_txt(inpath)
    load_single_dict(_pinyin_dict)  # 41451
    jieba.initialize()
    # Disabled step kept for reference: registering every phrase with jieba.
    # for word, _ in tqdm(_phrases_dict.items(), desc='jieba add word', ncols=80, mininterval=1):
    #     jieba.add_word(word)
 # 兼容0.1.0之前的版本。
 # 音调：5为轻声
@ -21,6 +74,8 @@ def text2pinyin(text, errors=None, **kwargs):
    if errors is None:
        errors = default_errors
    pin = lazy_pinyin(text, style=Style.TONE3, errors=errors, strict=True, neutral_tone_with_five=True, **kwargs)
    pino = lazy_pinyin_local(text, style=Style.TONE3, errors=errors, strict=True, neutral_tone_with_five=True, **kwargs)
    assert pin == pino
    return pin
--- a/third_party/phkit/phkit/pinyinkit/core.py
+++ b/third_party/phkit/phkit/pinyinkit/core.py
@ -6,11 +6,8 @@
 Base on python-pinyin(pypinyin), phrase-pinyin-data, pinyin-data and jieba.
 """
 from __future__ import unicode_literals
 from itertools import chain
 from pypinyin.compat import text_type
 from pypinyin.constants import (
    PHRASES_DICT, PINYIN_DICT, Style
 )
@ -29,7 +26,6 @@ _true_pin_re = re.compile(r"[^a-zA-Z]+")
 is_initialized = False
 def load_single_dict(pinyin_dict, style='default'):
    """载入用户自定义的单字拼音库
@ -152,7 +148,7 @@ class Pinyin(object):
        """
        # 对字符串进行分词处理
-        if isinstance(hans, text_type):
+        if isinstance(hans, str):
            han_list = self.seg(hans)
        else:
            han_list = chain(*(self.seg(x) for x in hans))
--- a/third_party/phkit/requirements.txt
+++ b/third_party/phkit/requirements.txt
@ -1,5 +1,3 @@
 pypinyin
 hanziconv
 jieba
 inflect
 unidecode
--- a/third_party/phkit/setup.py
+++ b/third_party/phkit/setup.py
@ -28,7 +28,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0])
-install_requires = ['pypinyin>=0.41.0', 'hanziconv', 'jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
+install_requires = ['jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
 requires = install_requires