diff --git a/third_party/phkit/phkit/__init__.py b/third_party/phkit/phkit/__init__.py
index 3c5e94ccd..e4b9dbd97 100644
--- a/third_party/phkit/phkit/__init__.py
+++ b/third_party/phkit/phkit/__init__.py
@@ -100,7 +100,7 @@ readme_docs = [__doc__, version_doc,
 
 from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text
 from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text
-from .pinyinkit import lazy_pinyin, pinyin, slug, initialize
+from .pinyinkit import lazy_pinyin
 
 # 兼容0.1.0之前的版本,python3.7以上版本支持。
 from .chinese import convert, number, phoneme, sequence, symbol, style
diff --git a/third_party/phkit/phkit/chinese/convert.py b/third_party/phkit/phkit/chinese/convert.py
index bc9382220..ba2d90838 100644
--- a/third_party/phkit/phkit/chinese/convert.py
+++ b/third_party/phkit/phkit/chinese/convert.py
@@ -8,9 +8,9 @@
 全角半角转换,简体繁体转换。
 """
 
-from hanziconv import hanziconv
+from .hanziconv import HanziConv
 
-hc = hanziconv.HanziConv()
+hc = HanziConv()
 
 # 繁体转简体
 fan2jian = hc.toSimplified
diff --git a/third_party/phkit/phkit/chinese/hanziconv.py b/third_party/phkit/phkit/chinese/hanziconv.py
new file mode 100644
index 000000000..32a8d22c4
--- /dev/null
+++ b/third_party/phkit/phkit/chinese/hanziconv.py
@@ -0,0 +1,103 @@
+# Copyright 2014 Bernard Yue
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+__doc__ = """
+Hanzi Converter 繁簡轉換器 | 繁简转换器
+This module provides functions converting chinese text between simplified and
+traditional characters. It returns unicode representation of the text.
+Class HanziConv is the main entry point of the module, you can import the
+class by doing:
+    >>> from hanziconv import HanziConv
+"""
+
+import os
+from zhon import cedict
+
+class HanziConv():
+    """This class supports hanzi (漢字) conversion between simplified and
+    traditional format"""
+    # NOTE(review): __convert() maps characters by position (index in fromMap
+    # -> same index in toMap), which needs two index-aligned tables. zhon
+    # documents cedict.traditional / cedict.simplified as character
+    # inventories, not aligned pairs -- confirm before relying on the output.
+    __traditional_charmap = cedict.traditional
+    __simplified_charmap = cedict.simplified
+
+    @classmethod
+    def __convert(cls, text, toTraditional=True):
+        """Convert `text` to Traditional characters if `toTraditional` is
+        True, else convert to simplified characters
+        :param text:           data to convert
+        :param toTraditional:  True -- convert to traditional text
+                               False -- convert to simplified text
+        :returns:              converted `text`
+        """
+        if isinstance(text, bytes):
+            text = text.decode('utf-8')
+
+        fromMap = cls.__simplified_charmap
+        toMap = cls.__traditional_charmap
+        if not toTraditional:
+            fromMap = cls.__traditional_charmap
+            toMap = cls.__simplified_charmap
+
+        final = []
+        for c in text:
+            index = fromMap.find(c)
+            if index != -1:
+                final.append(toMap[index])
+            else:
+                final.append(c)
+        return ''.join(final)
+
+    @classmethod
+    def toSimplified(cls, text):
+        """Convert `text` to simplified character string.  Assuming text is
+        traditional character string
+        :param text:  text to convert
+        :returns:     converted UTF-8 characters
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.toSimplified('繁簡轉換器'))
+        繁简转换器
+        """
+        return cls.__convert(text, toTraditional=False)
+
+    @classmethod
+    def toTraditional(cls, text):
+        """Convert `text` to traditional character string.  Assuming text is
+        simplified character string
+        :param text:  text to convert
+        :returns:     converted UTF-8 characters
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.toTraditional('繁简转换器'))
+        繁簡轉換器
+        """
+        return cls.__convert(text, toTraditional=True)
+
+    @classmethod
+    def same(cls, text1, text2):
+        """Return True if text1 and text2 meant literally the same, False
+        otherwise
+        :param text1:  string to compare to ``text2``
+        :param text2:  string to compare to ``text1``
+        :returns:      **True**  -- ``text1`` and ``text2`` are the same in meaning,
+                       **False** -- otherwise
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
+        True
+        """
+        t1 = cls.toSimplified(text1)
+        t2 = cls.toSimplified(text2)
+        return t1 == t2
diff --git a/third_party/phkit/phkit/chinese/number.py b/third_party/phkit/phkit/chinese/number.py
index 1a3ab827e..f6efa6d32 100644
--- a/third_party/phkit/phkit/chinese/number.py
+++ b/third_party/phkit/phkit/chinese/number.py
@@ -19,6 +19,14 @@ _number_group_re = re.compile(r"([0-9]+)")
 
 
 def say_digit(num: str):
+    """Read a digit string one digit at a time, e.g. 123 -> 一二三.
+
+    Args:
+        num (str): Digit string; every character is passed through int().
+
+    Returns:
+        The readings of the individual digits, in input order.
+    """
     outs = []
     for zi in num:
         outs.append(_number_cn[int(zi)])
@@ -31,6 +39,7 @@ def say_number(num: str):
         return _number_cn[0]
     elif len(x) > 16:
         return num
+
     length = len(x)
     outs = []
     for num, zi in enumerate(x):
diff --git a/third_party/phkit/phkit/pinyinkit/__init__.py b/third_party/phkit/phkit/pinyinkit/__init__.py
index 9b71e0a2b..490ff75dd 100644
--- a/third_party/phkit/phkit/pinyinkit/__init__.py
+++ b/third_party/phkit/phkit/pinyinkit/__init__.py
@@ -3,8 +3,82 @@
 文本转拼音的模块,依赖python-pinyin,jieba,phrase-pinyin-data模块。
 """
 import re
-from .core import lazy_pinyin, pinyin, slug, Style, initialize
-from pypinyin.style import convert
+from pathlib import Path
+
+import jieba
+from pypinyin import lazy_pinyin, Style, load_phrases_dict, load_single_dict
+from tqdm import tqdm
+
+from .core import lazy_pinyin as lazy_pinyin_local
+
+# Data lines of single_pinyin.txt.py look like "U+4E2D: zhōng,zhòng  # 中":
+# group 2 captures the comma-separated readings, group 3 the character.
+# NOTE(review): pattern reconstructed from that sample and the groups used in
+# parse_pinyin_txt(); reconcile it with core.py if a copy already lives there.
+_ziyin_re = re.compile(r"^U\+(\w+?):(.+?)#(.+)$")
+
+
+def parse_pinyin_txt(inpath):
+    """Parse a single-character pinyin table into ``{ord(char): "py1,py2"}``.
+
+    Lines starting with ``#`` are skipped. Any other non-blank line that does
+    not yield exactly one character is printed and left out of the result.
+    """
+    outs = []
+    with open(inpath, encoding="utf8") as fin:
+        for line in tqdm(fin, desc='load pinyin', ncols=80, mininterval=1):
+            if line.startswith("#"):
+                continue
+            res = _ziyin_re.search(line)
+            if res:
+                zi = res.group(3).strip()
+                if len(zi) == 1:
+                    outs.append([zi, res.group(2).strip().split(",")])
+                else:
+                    print(line)
+            elif line.strip():
+                print(line)
+    return {ord(z): ','.join(p) for z, p in outs}
+
+
+def parse_phrase_txt(inpath):
+    """Parse a phrase table into ``{phrase: [[py1], [py2], ...]}``.
+
+    Data lines look like ``一一对应: yī yī duì yìng``. Lines starting with ``#``
+    are skipped; a phrase is kept only when it has at least two characters and
+    one reading per character. Other non-blank lines are printed.
+    """
+    outs = []
+    with open(inpath, encoding="utf8") as fin:
+        for line in tqdm(fin, desc='load phrase', ncols=80, mininterval=1):
+            if line.startswith("#"):
+                continue
+            parts = line.split(":")
+            # Check the field count before indexing so blank or colon-less
+            # lines cannot raise IndexError on parts[1].
+            if len(parts) == 2:
+                zs = parts[0].strip()
+                ps = parts[1].strip().split()
+                if len(zs) == len(ps) and len(zs) >= 2:
+                    outs.append([zs, ps])
+                    continue
+            if line.strip():
+                print(line)
+    return {zs: [[p] for p in ps] for zs, ps in outs}
+
+
+def initialize():
+    """Load the bundled phrase and single-character tables, then init jieba."""
+    inpath = Path(__file__).absolute().parent.joinpath('phrase_pinyin.txt.py')
+    _phrases_dict = parse_phrase_txt(inpath)
+    load_phrases_dict(_phrases_dict)  # big:398815 small:36776
+
+    inpath = Path(__file__).absolute().parent.joinpath('single_pinyin.txt.py')
+    _pinyin_dict = parse_pinyin_txt(inpath)
+    load_single_dict(_pinyin_dict)  # 41451
+
+    jieba.initialize()
+
 
 # 兼容0.1.0之前的版本。
 # 音调:5为轻声
diff --git a/third_party/phkit/phkit/pinyinkit/core.py b/third_party/phkit/phkit/pinyinkit/core.py
index 2a5af83a9..57e9ac74e 100644
--- a/third_party/phkit/phkit/pinyinkit/core.py
+++ b/third_party/phkit/phkit/pinyinkit/core.py
@@ -6,11 +6,8 @@
 Base on python-pinyin(pypinyin), phrase-pinyin-data, pinyin-data and jieba.
 """
 
-from __future__ import unicode_literals
-
 from itertools import chain
 
-from pypinyin.compat import text_type
 from pypinyin.constants import (
     PHRASES_DICT, PINYIN_DICT, Style
 )
@@ -29,7 +26,6 @@ _true_pin_re = re.compile(r"[^a-zA-Z]+")
 is_initialized = False
 
 
-
 def load_single_dict(pinyin_dict, style='default'):
     """载入用户自定义的单字拼音库
 
@@ -152,7 +148,7 @@ class Pinyin(object):
 
         """
         # 对字符串进行分词处理
-        if isinstance(hans, text_type):
+        if isinstance(hans, str):
             han_list = self.seg(hans)
         else:
             han_list = chain(*(self.seg(x) for x in hans))
diff --git a/third_party/phkit/requirements.txt b/third_party/phkit/requirements.txt
index 3ba361403..6e873ee3f 100644
--- a/third_party/phkit/requirements.txt
+++ b/third_party/phkit/requirements.txt
@@ -1,5 +1,5 @@
 pypinyin
-hanziconv
+zhon
 jieba
 inflect
 unidecode
diff --git a/third_party/phkit/setup.py b/third_party/phkit/setup.py
index f35b5aea0..f0a86c09e 100644
--- a/third_party/phkit/setup.py
+++ b/third_party/phkit/setup.py
@@ -28,7 +28,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0])
 
-install_requires = ['pypinyin>=0.41.0', 'hanziconv', 'jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
+install_requires = ['pypinyin>=0.41.0', 'zhon', 'jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
 requires = install_requires
 
 