using third party python pinyin

pull/637/head
Hui Zhang 4 years ago
parent 6a8d0c3175
commit c77241cb0f

@ -100,7 +100,7 @@ readme_docs = [__doc__, version_doc,
from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text
from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text
from .pinyinkit import lazy_pinyin, pinyin, slug, initialize from .pinyinkit import lazy_pinyin
# 兼容0.1.0之前的版本python3.7以上版本支持。 # 兼容0.1.0之前的版本python3.7以上版本支持。
from .chinese import convert, number, phoneme, sequence, symbol, style from .chinese import convert, number, phoneme, sequence, symbol, style

@ -8,9 +8,9 @@
全角半角转换简体繁体转换 全角半角转换简体繁体转换
""" """
from hanziconv import hanziconv from .hanziconv import HanziConv
hc = hanziconv.HanziConv() hc = HanziConv()
# 繁体转简体 # 繁体转简体
fan2jian = hc.toSimplified fan2jian = hc.toSimplified

@ -0,0 +1,99 @@
# Copyright 2014 Bernard Yue
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
__doc__ = """
Hanzi Converter 繁簡轉換器 | 繁简转换器
This module provides functions converting chinese text between simplified and
traditional characters. It returns the unicode representation of the text.
Class HanziConv is the main entry point of the module, you can import the
class by doing:
>>> from hanziconv import HanziConv
"""
import os
from zhon import cedict
class HanziConv():
    """Convert hanzi (漢字) text between simplified and traditional forms.

    Every method is a classmethod, so the class can be used directly without
    creating an instance.
    """
    # NOTE(review): conversion pairs the characters of these two strings by
    # position, so it assumes `cedict.traditional` and `cedict.simplified`
    # are index-aligned -- TODO confirm against the zhon package.
    __traditional_charmap = cedict.traditional
    __simplified_charmap = cedict.simplified
    # Cache of lookup tables, keyed by the (source, target) charmap pair, so
    # each table is built once instead of scanning the charmap per character.
    __tables = {}

    @classmethod
    def __convert(cls, text, toTraditional=True):
        """Convert `text` to traditional or simplified characters.

        :param text: data to convert; ``bytes`` input is decoded as UTF-8
        :param toTraditional: True -- convert to traditional text,
                              False -- convert to simplified text
        :returns: converted `text`; characters that are not present in the
                  source charmap (or that have no positional counterpart in
                  the target charmap) are left unchanged
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        if toTraditional:
            fromMap = cls.__simplified_charmap
            toMap = cls.__traditional_charmap
        else:
            fromMap = cls.__traditional_charmap
            toMap = cls.__simplified_charmap
        key = (fromMap, toMap)
        table = cls.__tables.get(key)
        if table is None:
            table = {}
            for src, dst in zip(fromMap, toMap):
                # setdefault keeps the FIRST occurrence of a duplicated
                # source character, matching the former `str.find` lookup.
                table.setdefault(src, dst)
            cls.__tables[key] = table
        return ''.join(table.get(c, c) for c in text)

    @classmethod
    def toSimplified(cls, text):
        """Convert `text` to a simplified character string.

        The input is assumed to be a traditional character string.

        :param text: text to convert
        :returns: converted text as a unicode string

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toSimplified('繁簡轉換器'))
        繁简转换器
        """
        return cls.__convert(text, toTraditional=False)

    @classmethod
    def toTraditional(cls, text):
        """Convert `text` to a traditional character string.

        The input is assumed to be a simplified character string.

        :param text: text to convert
        :returns: converted text as a unicode string

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toTraditional('繁简转换器'))
        繁簡轉換器
        """
        return cls.__convert(text, toTraditional=True)

    @classmethod
    def same(cls, text1, text2):
        """Return True if `text1` and `text2` are equal once simplified.

        Both arguments are passed through :meth:`toSimplified` and the
        results are compared for equality.

        :param text1: string to compare to ``text2``
        :param text2: string to compare to ``text1``
        :returns: **True** -- the simplified forms are identical,
                  **False** -- otherwise

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
        True
        """
        t1 = cls.toSimplified(text1)
        t2 = cls.toSimplified(text2)
        return t1 == t2

@ -19,6 +19,14 @@ _number_group_re = re.compile(r"([0-9]+)")
def say_digit(num: str): def say_digit(num: str):
"""123 -> 一二三
Args:
num (str): [description]
Returns:
[type]: [description]
"""
outs = [] outs = []
for zi in num: for zi in num:
outs.append(_number_cn[int(zi)]) outs.append(_number_cn[int(zi)])
@ -31,6 +39,7 @@ def say_number(num: str):
return _number_cn[0] return _number_cn[0]
elif len(x) > 16: elif len(x) > 16:
return num return num
length = len(x) length = len(x)
outs = [] outs = []
for num, zi in enumerate(x): for num, zi in enumerate(x):

@ -3,8 +3,61 @@
文本转拼音的模块依赖python-pinyinjiebaphrase-pinyin-data模块 文本转拼音的模块依赖python-pinyinjiebaphrase-pinyin-data模块
""" """
import re import re
from .core import lazy_pinyin, pinyin, slug, Style, initialize #from .core import lazy_pinyin, Style
from pypinyin.style import convert from .core import lazy_pinyin as lazy_pinyin_local
from pypinyin import lazy_pinyin, Style, load_phrases_dict, load_phrases_dict
def parse_pinyin_txt(inpath):
    """Read a single-character pinyin table from the text file `inpath`.

    Example data line: ``U+4E2D: zhōng,zhòng  # 中``.

    Lines starting with ``#`` are skipped. For every line matched by
    `_ziyin_re` whose third group is exactly one character, the
    comma-separated readings in the second group are recorded. Any other
    non-blank line is printed so it can be inspected.

    :param inpath: path of the UTF-8 encoded data file
    :returns: dict mapping the character's code point (``ord``) to its
              readings joined by commas
    """
    table = {}
    with open(inpath, encoding="utf8") as fin:
        for row in tqdm(fin, desc='load pinyin', ncols=80, mininterval=1):
            if row.startswith("#"):
                continue
            found = _ziyin_re.search(row)
            if not found:
                # Unparseable line: report it unless it is just whitespace.
                if row.strip():
                    print(row)
                continue
            char = found.group(3).strip()
            if len(char) != 1:
                print(row)
                continue
            readings = found.group(2).strip().split(",")
            # A later line for the same character overrides an earlier one.
            table[ord(char)] = ','.join(readings)
    return table
def parse_phrase_txt(inpath):
    """Read a phrase-to-pinyin table from the text file `inpath`.

    Example data line: ``一一对应: yī yī duì yìng``.

    Lines starting with ``#`` are skipped. A line is accepted when it has
    exactly one ``:``, the phrase has at least two characters, and there is
    one whitespace-separated reading per character. Any other non-blank
    line is printed so it can be inspected; blank lines are ignored.

    :param inpath: path of the UTF-8 encoded data file
    :returns: dict mapping each phrase to a list of one-element lists, one
              per character, e.g. ``{'一一': [['yī'], ['yī']]}``
    """
    outs = []
    with open(inpath, encoding="utf8") as fin:
        for line in tqdm(fin, desc='load phrase', ncols=80, mininterval=1):
            if line.startswith("#"):
                continue
            parts = line.split(":")
            # Check the field count BEFORE indexing: a line without a colon
            # (for instance a blank line) has no parts[1].
            if len(parts) == 2:
                zs = parts[0].strip()
                ps = parts[1].strip().split()
                if len(zs) == len(ps) and len(zs) >= 2:
                    outs.append([zs, ps])
                    continue
            if line.strip():
                print(line)
    return {zs: [[p] for p in ps] for zs, ps in outs}
def initialize():
    """Load the bundled pinyin data files and initialize jieba.

    Reads ``phrase_pinyin.txt.py`` and ``single_pinyin.txt.py`` from the
    directory containing this module, registers their contents through
    `load_phrases_dict` and `load_single_dict`, then calls
    ``jieba.initialize()``.
    """
    # Imported here because the module-level `from pypinyin import ...` line
    # names `load_phrases_dict` twice and never brings `load_single_dict`
    # into scope, which would otherwise raise NameError below.
    from pypinyin import load_single_dict

    # Load the data files that sit next to this module.
    data_dir = Path(__file__).absolute().parent

    inpath = data_dir.joinpath('phrase_pinyin.txt.py')
    _phrases_dict = parse_phrase_txt(inpath)
    load_phrases_dict(_phrases_dict)  # big:398815 small:36776

    inpath = data_dir.joinpath('single_pinyin.txt.py')
    _pinyin_dict = parse_pinyin_txt(inpath)
    load_single_dict(_pinyin_dict)  # 41451

    jieba.initialize()
    # Registering every phrase with jieba is intentionally left disabled:
    # for word, _ in tqdm(_phrases_dict.items(), desc='jieba add word', ncols=80, mininterval=1):
    #     jieba.add_word(word)
# 兼容0.1.0之前的版本。 # 兼容0.1.0之前的版本。
# 音调5为轻声 # 音调5为轻声
@ -21,6 +74,8 @@ def text2pinyin(text, errors=None, **kwargs):
if errors is None: if errors is None:
errors = default_errors errors = default_errors
pin = lazy_pinyin(text, style=Style.TONE3, errors=errors, strict=True, neutral_tone_with_five=True, **kwargs) pin = lazy_pinyin(text, style=Style.TONE3, errors=errors, strict=True, neutral_tone_with_five=True, **kwargs)
pino = lazy_pinyin_local(text, style=Style.TONE3, errors=errors, strict=True, neutral_tone_with_five=True, **kwargs)
assert pin == pino
return pin return pin

@ -6,11 +6,8 @@
Base on python-pinyin(pypinyin), phrase-pinyin-data, pinyin-data and jieba. Base on python-pinyin(pypinyin), phrase-pinyin-data, pinyin-data and jieba.
""" """
from __future__ import unicode_literals
from itertools import chain from itertools import chain
from pypinyin.compat import text_type
from pypinyin.constants import ( from pypinyin.constants import (
PHRASES_DICT, PINYIN_DICT, Style PHRASES_DICT, PINYIN_DICT, Style
) )
@ -29,7 +26,6 @@ _true_pin_re = re.compile(r"[^a-zA-Z]+")
is_initialized = False is_initialized = False
def load_single_dict(pinyin_dict, style='default'): def load_single_dict(pinyin_dict, style='default'):
"""载入用户自定义的单字拼音库 """载入用户自定义的单字拼音库
@ -152,7 +148,7 @@ class Pinyin(object):
""" """
# 对字符串进行分词处理 # 对字符串进行分词处理
if isinstance(hans, text_type): if isinstance(hans, str):
han_list = self.seg(hans) han_list = self.seg(hans)
else: else:
han_list = chain(*(self.seg(x) for x in hans)) han_list = chain(*(self.seg(x) for x in hans))

@ -1,5 +1,3 @@
pypinyin
hanziconv
jieba jieba
inflect inflect
unidecode unidecode

@ -28,7 +28,7 @@ import logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0]) logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0])
install_requires = ['pypinyin>=0.41.0', 'hanziconv', 'jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode'] install_requires = ['jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
requires = install_requires requires = install_requires

Loading…
Cancel
Save