add Cedict egs & pypinyin using jieba as wordseg & phkit using local pypinyin package (#637)

* down and parse cedict * remove useless * using third party python pinyin * jieba as default wordseg * remove useless * remove pinyin dict * using jieba.lcut * remove doc of cedict egs * add fan2jian test * add description for say_digit
5 years ago · 749a113037
parent f22f681992
commit 749a113037
27 changed files with 259 additions and 79222 deletions
--- a/examples/cc-cedict/.gitignore
+++ b/examples/cc-cedict/.gitignore
@ -0,0 +1,2 @@
+data
+exp
--- a/examples/cc-cedict/README.md
+++ b/examples/cc-cedict/README.md
--- a/examples/cc-cedict/local/parser.py
+++ b/examples/cc-cedict/local/parser.py
@ -0,0 +1,78 @@
+# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
+
+#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
+
+#Pass the path to the CC-CEDICT text file (e.g. cedict_ts.u8) as the first command-line argument and the path of the output file as the second (see the usage line below).
+
+#Lines starting with '#' or '%' are skipped by the parser, so comment lines in the CEDICT text file (such as the copyright information at the top, assuming it is '#'-prefixed) do not need to be deleted by hand.
+
+#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply remove the call to remove_surnames() in main().
+
+#This code was written by Franki Allegra in February 2020.
+
+
+import sys
+import json
+
+# usage: bin ccedict dump.json
+
+# Read the whole dictionary up front so the input file is closed before
+# parsing starts. The encoding is given explicitly because path.sh exports
+# LC_ALL=C, under which the locale default may not be UTF-8.
+with open(sys.argv[1], 'rt', encoding='utf-8') as file:
+    text = file.read()
+lines = text.split('\n')
+dict_lines = list(lines)
+
+# Parsed entries; filled by parse_line() and pruned by remove_surnames().
+list_of_dicts = []
+
+
+def parse_line(line):
+    """Parse one CC-CEDICT line and append the result to ``list_of_dicts``.
+
+    An entry line has the form
+    ``Traditional Simplified [pin1 yin1] /gloss 1/gloss 2/``.
+    Only the first gloss is kept as the ``english`` value.
+
+    Args:
+        line: one line of the dictionary file, without the trailing newline.
+
+    Returns:
+        0 when the line is skipped (empty, a ``#`` or ``%`` comment line, or
+        not a well-formed entry); None after an entry has been appended.
+    """
+    # Skipped lines are simply ignored. Removing them from ``dict_lines``
+    # here would mutate the list while main() iterates over it and make the
+    # loop jump over the following entry.
+    if line == '' or line.startswith(('#', '%')):
+        return 0
+    fields = line.rstrip('/').split('/')
+    if len(fields) <= 1:
+        return 0
+    english = fields[1]
+    char_and_pinyin = fields[0].split('[')
+    characters = char_and_pinyin[0].split()
+    # Skip malformed entries instead of aborting the run with IndexError.
+    if len(char_and_pinyin) < 2 or len(characters) < 2:
+        return 0
+    parsed = {
+        'traditional': characters[0],
+        'simplified': characters[1],
+        'pinyin': char_and_pinyin[1].rstrip().rstrip("]"),
+        'english': english,
+    }
+    list_of_dicts.append(parsed)
+
+
+def remove_surnames():
+    """Drop surname entries that are directly followed by another entry
+    for the same traditional form.
+
+    Walks ``list_of_dicts`` from the end towards the start so that popping
+    an item never shifts the indices that are still to be visited.
+    """
+    for x in range(len(list_of_dicts) - 1, -1, -1):
+        if "surname " not in list_of_dicts[x]['english']:
+            continue
+        # The last entry has no successor to compare against.
+        if x + 1 >= len(list_of_dicts):
+            continue
+        if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1]['traditional']:
+            list_of_dicts.pop(x)
+
+
+def main():
+    """Parse every input line, remove surname duplicates and write the
+    entries to ``sys.argv[2]`` as one JSON object per line.
+
+    Returns:
+        The list of parsed entries (``list_of_dicts``).
+    """
+    #make each line into a dictionary
+    print("Parsing dictionary . . .")
+    for line in dict_lines:
+        parse_line(line)
+
+    #remove entries for surnames from the data (optional):
+    print("Removing Surnames . . .")
+    remove_surnames()
+
+    print("Saving to database (this may take a few minutes) . . .")
+    with open(sys.argv[2], 'wt', encoding='utf-8') as fout:
+        for one_dict in list_of_dicts:
+            fout.write(json.dumps(one_dict) + "\n")
+    print('Done!')
+    return list_of_dicts
+
+
+parsed_dict = main()
--- a/examples/cc-cedict/path.sh
+++ b/examples/cc-cedict/path.sh
@ -0,0 +1,10 @@
+# MAIN_ROOT is two directories above the current working directory.
+export MAIN_ROOT=${PWD}/../../
+
+# Put MAIN_ROOT and its utils/ directory in front of the existing PATH.
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+# Force the C locale for all locale categories.
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Make Python modules under MAIN_ROOT importable.
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+# Search /usr/local/lib/ for shared libraries before the existing entries.
+export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}
--- a/examples/cc-cedict/run.sh
+++ b/examples/cc-cedict/run.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+
+# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
+# The word dictionary of this website is based on CC-CEDICT.
+# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
+# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
+# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
+# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.
+
+# Stages:
+#   -1  download and unpack the CC-CEDICT archive into data/
+#    0  convert data/cedict_ts.u8 into exp/cedict.json with local/parser.py
+
+set -e
+source path.sh
+
+stage=-1
+stop_stage=100
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
+
+
+cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
+cedict=cedict_1_0_ts_utf-8_mdbg.zip
+
+mkdir -p data
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
+    # Download to a temporary name first so that an interrupted transfer is
+    # never mistaken for a complete archive on the next run.
+    if [ ! -f "data/${cedict}" ]; then
+        wget -O "data/${cedict}.tmp" "${cedict_url}"
+        mv "data/${cedict}.tmp" "data/${cedict}"
+    fi
+    pushd data
+    # -o: overwrite already extracted files without prompting, so the stage
+    # can be re-run non-interactively.
+    unzip -o "${cedict}"
+    popd
+
+fi
+
+mkdir -p exp
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
+    cp data/cedict_ts.u8 exp/cedict
+    python3 local/parser.py exp/cedict exp/cedict.json
+fi
+
--- a/third_party/phkit/phkit/init.py
+++ b/third_party/phkit/phkit/init.py
@ -100,7 +100,7 @@ readme_docs = [__doc__, version_doc,

 from .chinese import text_to_sequence as chinese_text_to_sequence, sequence_to_text as chinese_sequence_to_text
 from .english import text_to_sequence as english_text_to_sequence, sequence_to_text as english_sequence_to_text
-from .pinyinkit import lazy_pinyin, pinyin, slug, initialize
+from .pinyinkit import lazy_pinyin

 # 兼容0.1.0之前的版本，python3.7以上版本支持。
 from .chinese import convert, number, phoneme, sequence, symbol, style
--- a/third_party/phkit/phkit/chinese/convert.py
+++ b/third_party/phkit/phkit/chinese/convert.py
@ -8,9 +8,9 @@

 全角半角转换，简体繁体转换。
 """
-from hanziconv import hanziconv
+from .hanziconv import HanziConv

-hc = hanziconv.HanziConv()
+hc = HanziConv()

 # 繁体转简体
 fan2jian = hc.toSimplified
@ -48,4 +48,4 @@ if __name__ == "__main__":
    assert ban2quan("aA1 ,:$。、") == "ａＡ１　，：＄。、"
    assert quan2ban("ａＡ１　，：＄。、") == "aA1 ,:$。、"
    assert jian2fan("中国语言") == "中國語言"
-    assert jian2fan("中國語言") == "中国语言"
+    assert fan2jian("中國語言") == "中国语言"
--- a/third_party/phkit/phkit/chinese/hanziconv.py
+++ b/third_party/phkit/phkit/chinese/hanziconv.py
@ -0,0 +1,99 @@
+# Copyright 2014 Bernard Yue
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+__doc__ = """
+Hanzi Converter 繁簡轉換器 | 繁简转换器
+This module provides functions converting chinese text between simplified and
+traditional characters.  It returns unicode represnetation of the text.
+Class HanziConv is the main entry point of the module, you can import the
+class by doing:
+    >>> from hanziconv import HanziConv
+"""
+
+import os
+from zhon import cedict
+
+class HanziConv():
+    """This class supports hanzi (漢字) conversion between simplified and
+    traditional format"""
+    # Character tables used by __convert. A character found at index i of
+    # one table is replaced by the character at index i of the other.
+    # NOTE(review): this requires the two strings to be position-aligned and
+    # of equal length -- confirm that zhon.cedict actually guarantees this.
+    __traditional_charmap = cedict.traditional
+    __simplified_charmap = cedict.simplified
+
+    @classmethod
+    def __convert(cls, text, toTraditional=True):
+        """Convert `text` to Traditional characters if `toTraditional` is
+        True, else convert to simplified characters
+        :param text:           data to convert (str, or UTF-8 encoded bytes)
+        :param toTraditional:  True -- convert to traditional text
+                               False -- convert to simplified text
+        :returns:              converted `text` as a str
+        """
+        # Bytes input is decoded as UTF-8 so the loop below works on characters.
+        if isinstance(text, bytes):
+            text = text.decode('utf-8')
+
+        # Default direction is simplified -> traditional; swap the tables for
+        # the opposite direction.
+        fromMap = cls.__simplified_charmap
+        toMap = cls.__traditional_charmap
+        if not toTraditional:
+            fromMap = cls.__traditional_charmap
+            toMap = cls.__simplified_charmap
+
+        final = []
+        for c in text:
+            # str.find returns the first position of c in fromMap, or -1 when
+            # c is absent; absent characters are copied through unchanged.
+            index = fromMap.find(c)
+            if index != -1:
+                final.append(toMap[index])
+            else:
+                final.append(c)
+        return ''.join(final)
+
+    @classmethod
+    def toSimplified(cls, text):
+        """Convert `text` to simplified character string.  Assuming text is
+        traditional character string
+        :param text:  text to convert
+        :returns:     converted string
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.toSimplified('繁簡轉換器'))
+        繁简转换器
+        """
+        return cls.__convert(text, toTraditional=False)
+
+    @classmethod
+    def toTraditional(cls, text):
+        """Convert `text` to traditional character string.  Assuming text is
+        simplified character string
+        :param text:  text to convert
+        :returns:     converted string
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.toTraditional('繁简转换器'))
+        繁簡轉換器
+        """
+        return cls.__convert(text, toTraditional=True)
+
+    @classmethod
+    def same(cls, text1, text2):
+        """Return True if text1 and text2 are equal after both have been
+        converted to simplified characters, False otherwise
+        :param text1: string to compare to ``text2``
+        :param text2: string to compare to ``text1``
+        :returns:     **True**  -- ``text1`` and ``text2`` are the same in meaning,
+                      **False** -- otherwise
+        >>> from hanziconv import HanziConv
+        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
+        True
+        """
+        # Both sides are normalised to simplified form before comparing.
+        t1 = cls.toSimplified(text1)
+        t2 = cls.toSimplified(text2)
+        return t1 == t2
--- a/third_party/phkit/phkit/chinese/number.py
+++ b/third_party/phkit/phkit/chinese/number.py
@ -18,7 +18,15 @@ _grade_level = {'万', '亿', '个'}
 _number_group_re = re.compile(r"([0-9]+)")


-def say_digit(num: str):
+def say_digit(num: str) -> str:
+    """123 -> 一二三
+
+    Args:
+        num (str): digit
+
+    Returns:
+        str: hanzi number
+    """
    outs = []
    for zi in num:
        outs.append(_number_cn[int(zi)])
@ -31,6 +39,7 @@ def say_number(num: str):
        return _number_cn[0]
    elif len(x) > 16:
        return num
+    
    length = len(x)
    outs = []
    for num, zi in enumerate(x):
--- a/third_party/phkit/phkit/pinyinkit/README.md
+++ b/third_party/phkit/phkit/pinyinkit/README.md
@ -1,294 +0,0 @@
-汉字拼音转换工具（Python 版）
-=============================
-
-|Build| |appveyor| |Coverage| |Pypi version| |DOI|
-
-
-将汉字转为拼音。可以用于汉字注音、排序、检索(`Russian translation`_) 。
-
-基于 `hotoo/pinyin <https://github.com/hotoo/pinyin>`__ 开发。
-
-* Documentation: http://pypinyin.rtfd.io/
-* GitHub: https://github.com/mozillazg/python-pinyin
-* License: MIT license
-* PyPI: https://pypi.org/project/pypinyin
-* Python version: 2.7, pypy, pypy3, 3.4, 3.5, 3.6, 3.7, 3.8
-
-.. contents::
-
-
-特性
----
-
-* 根据词组智能匹配最正确的拼音。
-* 支持多音字。
-* 简单的繁体支持, 注音支持。
-* 支持多种不同拼音/注音风格。
-
-
-安装
----
-
-.. code-block:: bash
-
-    $ pip install pypinyin
-
-
-使用示例
--------
-
-Python 3(Python 2 下把 ``'中心'`` 替换为 ``u'中心'`` 即可):
-
-.. code-block:: python
-
-    >>> from pypinyin import pinyin, lazy_pinyin, Style
-    >>> pinyin('中心')
-    [['zhōng'], ['xīn']]
-    >>> pinyin('中心', heteronym=True)  # 启用多音字模式
-    [['zhōng', 'zhòng'], ['xīn']]
-    >>> pinyin('中心', style=Style.FIRST_LETTER)  # 设置拼音风格
-    [['z'], ['x']]
-    >>> pinyin('中心', style=Style.TONE2, heteronym=True)
-    [['zho1ng', 'zho4ng'], ['xi1n']]
-    >>> pinyin('中心', style=Style.TONE3, heteronym=True)
-    [['zhong1', 'zhong4'], ['xin1']]
-    >>> pinyin('中心', style=Style.BOPOMOFO)  # 注音风格
-    [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
-    >>> lazy_pinyin('中心')  # 不考虑多音字的情况
-    ['zhong', 'xin']
-
-
-**注意事项** ：
-
-* 拼音结果不会标明哪个韵母是轻声，轻声的韵母没有声调或数字标识（使用 ``5`` 标识轻声的方法见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/contrib.html#neutraltonewith5mixin>`__ ）。
-* 无声调相关拼音风格下的结果会使用 ``v`` 表示 ``ü`` （使用 ``ü`` 代替 ``v`` 的方法见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/contrib.html#v2umixin>`__ ）。
-
-命令行工具：
-
-.. code-block:: console
-
-    $ pypinyin 音乐
-    yīn yuè
-    $ pypinyin -h
-
-
-文档
--------
-
-详细文档请访问：http://pypinyin.rtfd.io/ 。
-
-项目代码开发方面的问题可以看看 `开发文档`_ 。
-
-
-FAQ
---------
-
-词语中的多音字拼音有误？
-+++++++++++++++++++++++++++++
-
-目前是通过词组拼音库的方式来解决多音字问题的。如果出现拼音有误的情况，
-可以自定义词组拼音来调整词语中的拼音：
-
-.. code-block:: python
-
-    >>> from pypinyin import Style, pinyin, load_phrases_dict
-    >>> pinyin('步履蹒跚')
-    [['bù'], ['lǚ'], ['mán'], ['shān']]
-    >>> load_phrases_dict({'步履蹒跚': [['bù'], ['lǚ'], ['pán'], ['shān']]})
-    >>> pinyin('步履蹒跚')
-    [['bù'], ['lǚ'], ['pán'], ['shān']]
-
-详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/usage.html#custom-dict>`__ 。
-
-为什么没有 y, w, yu 几个声母？
-++++++++++++++++++++++++++++++++++++++++++++
-
-.. code-block:: python
-
-    >>> from pypinyin import Style, pinyin
-    >>> pinyin('下雨天', style=Style.INITIALS)
-    [['x'], [''], ['t']]
-
-因为根据 `《汉语拼音方案》 <http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__ ，
-y，w，ü (yu) 都不是声母。
-
-    声母风格（INITIALS）下，“雨”、“我”、“圆”等汉字返回空字符串，因为根据
-    `《汉语拼音方案》 <http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__ ，
-    y，w，ü (yu) 都不是声母，在某些特定韵母无声母时，才加上 y 或 w，而 ü 也有其特定规则。    —— @hotoo
-
-    **如果你觉得这个给你带来了麻烦，那么也请小心一些无声母的汉字（如“啊”、“饿”、“按”、“昂”等）。
-    这时候你也许需要的是首字母风格（FIRST_LETTER）**。    —— @hotoo
-
-    参考: `hotoo/pinyin#57 <https://github.com/hotoo/pinyin/issues/57>`__,
-    `#22 <https://github.com/mozillazg/python-pinyin/pull/22>`__,
-    `#27 <https://github.com/mozillazg/python-pinyin/issues/27>`__,
-    `#44 <https://github.com/mozillazg/python-pinyin/issues/44>`__
-
-如果觉得这个行为不是你想要的，就是想把 y 当成声母的话，可以指定 ``strict=False`` ，
-这个可能会符合你的预期：
-
-.. code-block:: python
-
-    >>> from pypinyin import Style, pinyin
-    >>> pinyin('下雨天', style=Style.INITIALS)
-    [['x'], [''], ['t']]
-    >>> pinyin('下雨天', style=Style.INITIALS, strict=False)
-    [['x'], ['y'], ['t']]
-
-详见 `strict 参数的影响`_ 。
-
-如何减少内存占用
-++++++++++++++++++++
-
-如果对拼音的准确性不是特别在意的话，可以通过设置环境变量 ``PYPINYIN_NO_PHRASES``
-和 ``PYPINYIN_NO_DICT_COPY`` 来节省内存。
-详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/faq.html#no-phrases>`__
-
-
-更多 FAQ 详见文档中的
-`FAQ <https://pypinyin.readthedocs.io/zh_CN/master/faq.html>`__ 部分。
-
-
-.. _#13 : https://github.com/mozillazg/python-pinyin/issues/113
-.. _strict 参数的影响: https://pypinyin.readthedocs.io/zh_CN/master/usage.html#strict
-
-
-拼音数据
---------
-
-* 单个汉字的拼音使用 `pinyin-data`_ 的数据
-* 词组的拼音使用 `phrase-pinyin-data`_ 的数据
-
-
-Related Projects
-----------------
-
-* `hotoo/pinyin`__: 汉字拼音转换工具 Node.js/JavaScript 版。
-* `mozillazg/go-pinyin`__: 汉字拼音转换工具 Go 版。
-* `mozillazg/rust-pinyin`__: 汉字拼音转换工具 Rust 版。
-
-
-__ https://github.com/hotoo/pinyin
-__ https://github.com/mozillazg/go-pinyin
-__ https://github.com/mozillazg/rust-pinyin
-
-
-.. |Build| image:: https://img.shields.io/circleci/project/github/mozillazg/python-pinyin/master.svg
-   :target: https://circleci.com/gh/mozillazg/python-pinyin
-.. |appveyor| image:: https://ci.appveyor.com/api/projects/status/ni8gdyextfa85yqo/branch/master?svg=true
-   :target: https://ci.appveyor.com/project/mozillazg/python-pinyin
-.. |Coverage| image:: https://img.shields.io/codecov/c/github/mozillazg/python-pinyin/master.svg
-   :target: https://codecov.io/gh/mozillazg/python-pinyin
-.. |PyPI version| image:: https://img.shields.io/pypi/v/pypinyin.svg
-   :target: https://pypi.org/project/pypinyin/
-.. |DOI| image:: https://zenodo.org/badge/12830126.svg
-   :target: https://zenodo.org/badge/latestdoi/12830126
-
-
-
-.. _Russian translation: https://github.com/mozillazg/python-pinyin/blob/master/README_ru.rst
-.. _pinyin-data: https://github.com/mozillazg/pinyin-data
-.. _phrase-pinyin-data: https://github.com/mozillazg/phrase-pinyin-data
-.. _开发文档: https://pypinyin.readthedocs.io/zh_CN/develop/develop.html
-
-
-
-# pinyin-data [![Build Status](https://travis-ci.org/mozillazg/pinyin-data.svg?branch=master)](https://travis-ci.org/mozillazg/pinyin-data)
-
-汉字拼音数据。
-
-
-## 数据介绍
-
-拼音数据的格式：
-
-    {code point}: {pinyins}  # {hanzi} {comments}
-
-* 以 `#` 开头的行是注释，行内 `#` 后面的字符也是注释
-* `{pinyins}` 中使用逗号分隔多个拼音
-* 示例：
-
-        # 注释
-        U+4E2D: zhōng,zhòng  # 中
-
-
-[Unihan Database][unihan] 数据版本：
-
-> Date: 2018-11-09 21:36:19 GMT [JHJ]    
-> Unicode version: 12.0.0
-
-* `kHanyuPinyin.txt`: [Unihan Database][unihan] 中 [kHanyuPinyin](http://www.unicode.org/reports/tr38/#kHanyuPinyin) 部分的拼音数据（来源于《漢語大字典》的拼音数据）
-* `kXHC1983.txt`: [Unihan Database][unihan] 中 [kXHC1983](http://www.unicode.org/reports/tr38/#kXHC1983) 部分的拼音数据（来源于《现代汉语词典》的拼音数据）
-* `kHanyuPinlu.txt`: [Unihan Database][unihan] 中 [kHanyuPinlu](http://www.unicode.org/reports/tr38/#kHanyuPinlu) 部分的拼音数据（来源于《現代漢語頻率詞典》的拼音数据）
-* `kMandarin.txt`: [Unihan Database][unihan] 中 [kMandarin](http://www.unicode.org/reports/tr38/#kMandarin) 部分的拼音数据（普通话中最常用的一个读音。zh-CN 为主，如果 zh-CN 中没有则使用 zh-TW 中的拼音）
-* `kMandarin_overwrite.txt`: 手工纠正 `kMandarin.txt` 中有误的拼音数据（**可以修改**）
-* `GBK_PUA.txt`: [Private Use Area](https://en.wikipedia.org/wiki/Private_Use_Areas) 中有拼音的汉字，参考 [GB 18030 - 维基百科，自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA) （**可以修改**）
-* `nonCJKUI.txt`: 不属于 [CJK Unified Ideograph](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs) 但是却有拼音的字符（**可以修改**）
-* `kanji.txt`: [日本自造汉字](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8) 的拼音数据 （**可以修改**）
-* `kMandarin_8105.txt`: [《通用规范汉字表》](https://zh.wikipedia.org/wiki/通用规范汉字表)(2013 年版)里 8105 个汉字最常用的一个读音 (**可以修改**)
-* `overwrite.txt`: 手工纠正的拼音数据（**可以修改**）
-* `pinyin.txt`: 合并上述文件后的拼音数据
-* `zdic.txt`: [汉典网](http://zdic.net) 的拼音数据（**可以修改**）
-
-
-## 参考资料
-
-* [汉语拼音方案](http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html)
-* [Unihan Database Lookup](http://www.unicode.org/charts/unihan.html)
-* [汉典 zdic.net](http://www.zdic.net/)
-* [字海网，叶典网](http://zisea.com/)
-* [国学大师_国学网](http://www.guoxuedashi.com/)
-* [Unicode、GB2312、GBK和GB18030中的汉字](http://www.fmddlmyy.cn/text24.html)
-* [GB 18030 - 维基百科，自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA)
-* [通用规范汉字表 - 维基百科，自由的百科全书](https://zh.wikipedia.org/wiki/%E9%80%9A%E7%94%A8%E8%A7%84%E8%8C%83%E6%B1%89%E5%AD%97%E8%A1%A8)
-* [China’s 通用规范汉字表 (Tōngyòng Guīfàn Hànzìbiǎo)](https://blogs.adobe.com/CCJKType/2014/03/china-8105.html)
-* [日本汉字的汉语读音规范](http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/201001/t20100115_75698.html)
-* [日本汉字的汉语普通话规范读音表- 维基百科](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8)
-
-[unihan]: http://www.unicode.org/charts/unihan.html
-
-
-# phrase-pinyin-data [![Build Status](https://travis-ci.org/mozillazg/phrase-pinyin-data.svg?branch=master)](https://travis-ci.org/mozillazg/phrase-pinyin-data)
-
-词语拼音数据。
-
-
-## 数据介绍
-
-拼音数据的格式：
-
-```
-{phrase}: {pinyin}
-```
-
-* 以 `#` 开头的行是注释
-* 行尾的 `#` 也是注释
-* `{phrase}` 汉字词语
-* `{pinyin}` 词语的拼音，使用空格分隔每个汉字的拼音
-* 一行一个词语的读音，有多个音的词语会出现在多行
-* 示例：
-
-  ```
-  # 注释
-  中国: zhōng guó
-  北京: běi jīng  # 注释
-  ```
-
-文件说明:
-
-* `overwrite.txt`: 手工纠正的拼音数据
-* `pinyin.txt`: `pinyin.txt + overwrite.txt` 后的拼音数据
-* `zdic_cibs.txt`: [汉典网](http://www.zdic.net/) 汉语词典拼音数据
-* `zdic_cybs.txt`: [汉典网](http://www.zdic.net/) 成语词典拼音数据
-* `cc_cedict.txt`: [cc-cedict.org](https://cc-cedict.org/) 拼音数据
-* `large_pinyin.txt`: `zdic_cibs.txt + zdic_cybs.txt + cc_cedict.txt + pinyin.txt + overwrite.txt` 后的拼音数据
-
-
-## 参考资料
-
-* 初始数据基于 [phrases-dict.js](https://github.com/hotoo/pinyin/blob/05f74496c34ccb32db1a0fd0b358a798a22a51e5/data/phrases-dict.js) 和 [phrases_dict.py](https://github.com/mozillazg/python-pinyin/blob/366de0363ff1fb9a718ce668448bea59de09a4bf/pypinyin/phrases_dict.py)
-* [汉典 zdic.net](http://www.zdic.net/)
-* [字海网，叶典网](http://zisea.com/)
-* [国学大师_国学网](http://www.guoxuedashi.com/)
-* [CC-CEDICT download - MDBG English to Chinese dictionary](http://www.mdbg.net/chindict/chindict.php?page=cc-cedict)
-
--- a/third_party/phkit/phkit/pinyinkit/init.py
+++ b/third_party/phkit/phkit/pinyinkit/init.py
@ -3,8 +3,7 @@
 文本转拼音的模块，依赖python-pinyin，jieba，phrase-pinyin-data模块。
 """
 import re
-from .core import lazy_pinyin, pinyin, slug, Style, initialize
-from pypinyin.style import convert
+from pypinyin import lazy_pinyin, Style

 # 兼容0.1.0之前的版本。
 # 音调：5为轻声
--- a/third_party/phkit/phkit/pinyinkit/core.py
+++ b/third_party/phkit/phkit/pinyinkit/core.py
@ -1,457 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# author: kuangdd
-# date: 2020/5/30
-"""
-Base on python-pinyin(pypinyin), phrase-pinyin-data, pinyin-data and jieba.
-"""
-
-from __future__ import unicode_literals
-
-from itertools import chain
-
-from pypinyin.compat import text_type
-from pypinyin.constants import (
-    PHRASES_DICT, PINYIN_DICT, Style
-)
-from pypinyin.converter import DefaultConverter, _mixConverter
-from pypinyin.seg import mmseg
-from pypinyin.seg.simpleseg import seg
-from pypinyin.utils import _replace_tone2_style_dict_to_default
-
-from tqdm import tqdm
-import jieba
-import re
-from pathlib import Path
-
-_ziyin_re = re.compile(r"^U\+(\w+?):(.+?)#(.+)$")
-_true_pin_re = re.compile(r"[^a-zA-Z]+")
-
-is_initialized = False
-
-
-def load_single_dict(pinyin_dict, style='default'):
-    """载入用户自定义的单字拼音库
-
-    :param pinyin_dict: 单字拼音库。比如： ``{0x963F: u"ā,ē"}``
-    :param style: pinyin_dict 参数值的拼音库风格. 支持 'default', 'tone2'
-    :type pinyin_dict: dict
-    """
-    if style == 'tone2':
-        for k, v in pinyin_dict.items():
-            v = _replace_tone2_style_dict_to_default(v)
-            PINYIN_DICT[k] = v
-    else:
-        PINYIN_DICT.update(pinyin_dict)
-
-    mmseg.retrain(mmseg.seg)
-
-
-def load_phrases_dict(phrases_dict, style='default'):
-    """载入用户自定义的词语拼音库
-
-    :param phrases_dict: 词语拼音库。比如： ``{u"阿爸": [[u"ā"], [u"bà"]]}``
-    :param style: phrases_dict 参数值的拼音库风格. 支持 'default', 'tone2'
-    :type phrases_dict: dict
-    """
-    if style == 'tone2':
-        for k, value in phrases_dict.items():
-            v = [
-                list(map(_replace_tone2_style_dict_to_default, pys))
-                for pys in value
-            ]
-            PHRASES_DICT[k] = v
-    else:
-        PHRASES_DICT.update(phrases_dict)
-
-    mmseg.retrain(mmseg.seg)
-
-
-def parse_pinyin_txt(inpath):
-    # U+4E2D: zhōng,zhòng  # 中
-    outs = []
-    with open(inpath, encoding="utf8") as fin:
-        for line in tqdm(fin, desc='load pinyin', ncols=80, mininterval=1):
-            if line.startswith("#"):
-                continue
-            res = _ziyin_re.search(line)
-            if res:
-                zi = res.group(3).strip()
-                if len(zi) == 1:
-                    outs.append([zi, res.group(2).strip().split(",")])
-                else:
-                    print(line)
-            elif line.strip():
-                print(line)
-    return {ord(z): ','.join(p) for z, p in outs}
-
-
-def parse_phrase_txt(inpath):
-    # 一一对应: yī yī duì yìng
-    outs = []
-    with open(inpath, encoding="utf8") as fin:
-        for line in tqdm(fin, desc='load phrase', ncols=80, mininterval=1):
-            if line.startswith("#"):
-                continue
-            parts = line.split(":")
-            zs = parts[0].strip()
-            ps = parts[1].strip().split()
-            if len(parts) == 2 and len(zs) == len(ps) and len(zs) >= 2:
-                outs.append([zs, ps])
-            elif line.strip():
-                print(line)
-    return {zs: [[p] for p in ps] for zs, ps in outs}
-
-
-def initialize():
-    # 导入数据
-    inpath = Path(__file__).absolute().parent.joinpath('phrase_pinyin.txt.py')
-    _phrases_dict = parse_phrase_txt(inpath)
-    load_phrases_dict(_phrases_dict)  # big:398815 small:36776
-
-    inpath = Path(__file__).absolute().parent.joinpath('single_pinyin.txt.py')
-    _pinyin_dict = parse_pinyin_txt(inpath)
-    load_single_dict(_pinyin_dict)  # 41451
-
-    jieba.initialize()
-    # for word, _ in tqdm(_phrases_dict.items(), desc='jieba add word', ncols=80, mininterval=1):
-    #     jieba.add_word(word)
-    global is_initialized
-    is_initialized = True
-
-
-class Pinyin(object):
-
-    def __init__(self, converter=None, **kwargs):
-        self._converter = converter or DefaultConverter()
-
-    def pinyin(self, hans, style=Style.TONE, heteronym=False,
-               errors='default', strict=True, **kwargs):
-        """将汉字转换为拼音，返回汉字的拼音列表。
-
-        :param hans: 汉字字符串( ``'你好吗'`` )或列表( ``['你好', '吗']`` ).
-                     可以使用自己喜爱的分词模块对字符串进行分词处理,
-                     只需将经过分词处理的字符串列表传进来就可以了。
-        :type hans: unicode 字符串或字符串列表
-        :param style: 指定拼音风格，默认是 :py:attr:`~pypinyin.Style.TONE` 风格。
-                      更多拼音风格详见 :class:`~pypinyin.Style`
-        :param errors: 指定如何处理没有拼音的字符。详见 :ref:`handle_no_pinyin`
-
-                       * ``'default'``: 保留原始字符
-                       * ``'ignore'``: 忽略该字符
-                       * ``'replace'``: 替换为去掉 ``\\u`` 的 unicode 编码字符串
-                         (``'\\u90aa'`` => ``'90aa'``)
-                       * callable 对象: 回调函数之类的可调用对象。
-
-        :param heteronym: 是否启用多音字
-        :param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
-                       是否严格遵照《汉语拼音方案》来处理声母和韵母，
-                       详见 :ref:`strict`
-        :return: 拼音列表
-        :rtype: list
-
-        """
-        # 对字符串进行分词处理
-        if isinstance(hans, text_type):
-            han_list = self.seg(hans)
-        else:
-            han_list = chain(*(self.seg(x) for x in hans))
-
-        pys = []
-        for words in han_list:
-            pys.extend(
-                self._converter.convert(
-                    words, style, heteronym, errors, strict=strict))
-        return pys
-
-    def lazy_pinyin(self, hans, style=Style.NORMAL,
-                    errors='default', strict=True, **kwargs):
-        """将汉字转换为拼音，返回不包含多音字结果的拼音列表.
-
-        与 :py:func:`~pypinyin.pinyin` 的区别是每个汉字的拼音是个字符串，
-        并且每个字只包含一个读音.
-
-        :param hans: 汉字
-        :type hans: unicode or list
-        :param style: 指定拼音风格，默认是 :py:attr:`~pypinyin.Style.NORMAL` 风格。
-                      更多拼音风格详见 :class:`~pypinyin.Style`。
-        :param errors: 指定如何处理没有拼音的字符，详情请参考
-                       :py:func:`~pypinyin.pinyin`
-        :param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
-                       是否严格遵照《汉语拼音方案》来处理声母和韵母，
-                       详见 :ref:`strict`
-        :return: 拼音列表(e.g. ``['zhong', 'guo', 'ren']``)
-        :rtype: list
-
-        """
-        return list(
-            chain(
-                *self.pinyin(
-                    hans, style=style, heteronym=False,
-                    errors=errors, strict=strict)))
-
-    def pre_seg(self, hans, **kwargs):
-        """对字符串进行分词前将调用 ``pre_seg`` 方法对未分词的字符串做预处理。
-
-        默认原样返回传入的 ``hans``。
-
-        如果这个方法的返回值类型是 ``list``，表示返回的是一个分词后的结果，此时，
-        ``seg`` 方法中将不再调用 ``seg_function`` 进行分词。
-
-        :param hans: 分词前的字符串
-        :return: ``None`` or ``list``
-        """
-        outs = list(jieba.cut(hans))  # 默认用jieba分词，从语义角度分词。
-        return outs
-
-    def seg(self, hans, **kwargs):
-        """对汉字进行分词。
-
-        分词前会调用 ``pre_seg`` 方法，分词后会调用 ``post_seg`` 方法。
-
-        :param hans:
-        :return:
-        """
-        pre_data = self.pre_seg(hans)
-        if isinstance(pre_data, list):
-            seg_data = pre_data
-        else:
-            seg_data = self.get_seg()(hans)
-
-        post_data = self.post_seg(hans, seg_data)
-        if isinstance(post_data, list):
-            return post_data
-
-        return seg_data
-
-    def get_seg(self, **kwargs):
-        """获取分词函数。
-
-        :return: 分词函数
-        """
-        return seg
-
-    def post_seg(self, hans, seg_data, **kwargs):
-        """对字符串进行分词后将调用 ``post_seg`` 方法对分词后的结果做处理。
-
-        默认原样返回传入的 ``seg_data``。
-
-        如果这个方法的返回值类型是 ``list``，表示对分词结果做了二次处理，此时，
-        ``seg`` 方法将以这个返回的数据作为返回值。
-
-        :param hans: 分词前的字符串
-        :param seg_data: 分词后的结果
-        :type seg_data: list
-        :return: ``None`` or ``list``
-        """
-        pass
-
-
-_default_convert = DefaultConverter()
-_default_pinyin = Pinyin(_default_convert)
-
-
-def to_fixed(pinyin, style, strict=True):
-    # 用于向后兼容，TODO: 废弃
-    return _default_convert.convert_style(
-        '', pinyin, style=style, strict=strict, default=pinyin)
-
-
-_to_fixed = to_fixed
-
-
-def handle_nopinyin(chars, errors='default', heteronym=True):
-    # 用于向后兼容，TODO: 废弃
-    return _default_convert.handle_nopinyin(
-        chars, style=None, errors=errors, heteronym=heteronym, strict=True)
-
-
-def single_pinyin(han, style, heteronym, errors='default', strict=True):
-    # 用于向后兼容，TODO: 废弃
-    return _default_convert._single_pinyin(
-        han, style, heteronym, errors=errors, strict=strict)
-
-
-def phrase_pinyin(phrase, style, heteronym, errors='default', strict=True):
-    # 用于向后兼容，TODO: 废弃
-    return _default_convert._phrase_pinyin(
-        phrase, style, heteronym, errors=errors, strict=strict)
-
-
-def pinyin(hans, style=Style.TONE, heteronym=False,
-           errors='default', strict=True,
-           v_to_u=False, neutral_tone_with_five=False):
-    """将汉字转换为拼音，返回汉字的拼音列表。
-
-    :param hans: 汉字字符串( ``'你好吗'`` )或列表( ``['你好', '吗']`` ).
-                 可以使用自己喜爱的分词模块对字符串进行分词处理,
-                 只需将经过分词处理的字符串列表传进来就可以了。
-    :type hans: unicode 字符串或字符串列表
-    :param style: 指定拼音风格，默认是 :py:attr:`~pypinyin.Style.TONE` 风格。
-                  更多拼音风格详见 :class:`~pypinyin.Style`
-    :param errors: 指定如何处理没有拼音的字符。详见 :ref:`handle_no_pinyin`
-
-                   * ``'default'``: 保留原始字符
-                   * ``'ignore'``: 忽略该字符
-                   * ``'replace'``: 替换为去掉 ``\\u`` 的 unicode 编码字符串
-                     (``'\\u90aa'`` => ``'90aa'``)
-                   * callable 对象: 回调函数之类的可调用对象。
-
-    :param heteronym: 是否启用多音字
-    :param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
-                   是否严格遵照《汉语拼音方案》来处理声母和韵母，
-                   详见 :ref:`strict`
-    :param v_to_u: 无声调相关拼音风格下的结果是否使用 ``ü`` 代替原来的 ``v``
-    :type v_to_u: bool
-    :param neutral_tone_with_five: 声调使用数字表示的相关拼音风格下的结果是否
-                                   使用 5 标识轻声
-    :type neutral_tone_with_five: bool
-    :return: 拼音列表
-    :rtype: list
-
-    :raise AssertionError: 当传入的字符串不是 unicode 字符时会抛出这个异常
-
-    Usage::
-
-      >>> from pypinyin import pinyin, Style
-      >>> import pypinyin
-      >>> pinyin('中心')
-      [['zhōng'], ['xīn']]
-      >>> pinyin('中心', heteronym=True)  # 启用多音字模式
-      [['zhōng', 'zhòng'], ['xīn']]
-      >>> pinyin('中心', style=Style.FIRST_LETTER)  # 设置拼音风格
-      [['z'], ['x']]
-      >>> pinyin('中心', style=Style.TONE2)
-      [['zho1ng'], ['xi1n']]
-      >>> pinyin('中心', style=Style.CYRILLIC)
-      [['чжун1'], ['синь1']]
-      >>> pinyin('战略', v_to_u=True, style=Style.NORMAL)
-      [['zhan'], ['lüe']]
-      >>> pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
-      [['yi1'], ['shang5']]
-    """
-    global is_initialized
-    if not is_initialized:
-        initialize()
-        is_initialized = True
-    _pinyin = Pinyin(_mixConverter(
-        v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five))
-    return _pinyin.pinyin(
-        hans, style=style, heteronym=heteronym, errors=errors, strict=strict)
-
-
-def slug(hans, style=Style.NORMAL, heteronym=False, separator='-',
-         errors='default', strict=True):
-    """Convert Chinese characters to pinyin and build a slug string.
-
-    :param hans: Chinese characters
-    :type hans: unicode or list
-    :param style: pinyin style; defaults to :py:attr:`~pypinyin.Style.NORMAL`.
-                  See :class:`~pypinyin.Style` for the other styles.
-    :param heteronym: whether to enable heteronym (multiple readings) mode
-    :param separator: separator/joiner placed between two pinyin items
-    :param errors: how to handle characters that have no pinyin; see
-                   :py:func:`~pypinyin.pinyin` for details
-    :param strict: for styles that return only the initial or only the final,
-                   whether to strictly follow the Hanyu Pinyin scheme when
-                   handling initials and finals;
-                   see :ref:`strict`
-    :return: the slug string.
-
-    :raise AssertionError: raised when the given string is not unicode
-
-    ::
-
-      >>> import pypinyin
-      >>> from pypinyin import Style
-      >>> pypinyin.slug('中国人')
-      'zhong-guo-ren'
-      >>> pypinyin.slug('中国人', separator=' ')
-      'zhong guo ren'
-      >>> pypinyin.slug('中国人', style=Style.FIRST_LETTER)
-      'z-g-r'
-      >>> pypinyin.slug('中国人', style=Style.CYRILLIC)
-      'чжун1-го2-жэнь2'
-    """
-    global is_initialized
-    # Run initialize() only on first use; the module-level flag makes later
-    # calls skip it.
-    if not is_initialized:
-        initialize()
-        is_initialized = True
-    # chain(*...) flattens the per-item reading lists returned by
-    # _default_pinyin.pinyin() into one sequence before joining.
-    return separator.join(
-        chain(
-            *_default_pinyin.pinyin(
-                hans, style=style, heteronym=heteronym,
-                errors=errors, strict=strict
-            )
-        )
-    )
-
-
-def lazy_pinyin(hans, style=Style.NORMAL, errors='default', strict=True,
-                v_to_u=False, neutral_tone_with_five=False):
-    """Convert Chinese characters to a pinyin list without heteronym results.
-
-    Unlike :py:func:`~pypinyin.pinyin`, each returned pinyin item is a string
-    and every character carries exactly one reading.
-
-    :param hans: Chinese characters
-    :type hans: unicode or list
-    :param style: pinyin style; defaults to :py:attr:`~pypinyin.Style.NORMAL`.
-                  See :class:`~pypinyin.Style` for the other styles.
-    :param errors: how to handle characters that have no pinyin; see
-                   :py:func:`~pypinyin.pinyin` for details
-    :param strict: for styles that return only the initial or only the final,
-                   whether to strictly follow the Hanyu Pinyin scheme when
-                   handling initials and finals;
-                   see :ref:`strict`
-    :param v_to_u: whether results in toneless styles use ``ü`` instead of ``v``
-    :type v_to_u: bool
-    :param neutral_tone_with_five: whether results in styles that write tones
-                                   as digits mark the neutral tone with 5
-    :type neutral_tone_with_five: bool
-    :return: pinyin list (e.g. ``['zhong', 'guo', 'ren']``)
-    :rtype: list
-
-    :raise AssertionError: raised when the given string is not unicode
-
-    Usage::
-
-      >>> from pypinyin import lazy_pinyin, Style
-      >>> import pypinyin
-      >>> lazy_pinyin('中心')
-      ['zhong', 'xin']
-      >>> lazy_pinyin('中心', style=Style.TONE)
-      ['zhōng', 'xīn']
-      >>> lazy_pinyin('中心', style=Style.FIRST_LETTER)
-      ['z', 'x']
-      >>> lazy_pinyin('中心', style=Style.TONE2)
-      ['zho1ng', 'xi1n']
-      >>> lazy_pinyin('中心', style=Style.CYRILLIC)
-      ['чжун1', 'синь1']
-      >>> lazy_pinyin('战略', v_to_u=True)
-      ['zhan', 'lüe']
-      >>> lazy_pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
-      ['yi1', 'shang5']
-    """
-    global is_initialized
-    # Run initialize() only on first use; the module-level flag makes later
-    # calls skip it.
-    if not is_initialized:
-        initialize()
-        is_initialized = True
-    # Build a Pinyin instance whose converter is configured with this call's
-    # v_to_u / neutral_tone_with_five values.
-    _pinyin = Pinyin(_mixConverter(
-        v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five))
-    return _pinyin.lazy_pinyin(
-        hans, style=style, errors=errors, strict=strict)
-
-
-if __name__ == "__main__":
-    # Ad-hoc self-check that runs only when this module is executed as a script.
-    print(__file__)
-    han = '老师很重视这个问题啊，请重说一遍。。。很难说有山难发生，理发师和会计谁会发财？'
-    # Pin the word segmentation of the sample sentence from _default_pinyin.seg().
-    out = _default_pinyin.seg(han)
-    assert out == ['老师', '很', '重视', '这个', '问题', '啊', '，', '请重', '说', '一遍', '。', '。', '。', '很难说', '有山难', '发生', '，',
-                   '理发师', '和', '会计', '谁', '会', '发财', '？']
-
-    # style=8 is expected to give syllables with a trailing tone digit
-    # (presumably Style.TONE3 -- confirm); with neutral_tone_with_five=True the
-    # neutral-tone particle is expected as 'a5'.
-    out = lazy_pinyin(han, style=8, neutral_tone_with_five=True)
-    assert out == ['lao3', 'shi1', 'hen3', 'zhong4', 'shi4', 'zhe4', 'ge4', 'wen4', 'ti2', 'a5', '，', 'qing3', 'zhong4',
-                   'shuo1', 'yi1', 'bian4', '。', '。', '。', 'hen3', 'nan2', 'shuo1', 'you3', 'shan1', 'nan2', 'fa1',
-                   'sheng1', '，', 'li3', 'fa4', 'shi1', 'he2', 'kuai4', 'ji4', 'shui2', 'hui4', 'fa1', 'cai2', '？']
-
-    # slug() is called without a neutral-tone option here, so the same particle
-    # is expected as plain 'a' in the joined string.
-    out = slug(han, style=8, separator=' ')
-    assert out == 'lao3 shi1 hen3 zhong4 shi4 zhe4 ge4 wen4 ti2 a ， qing3 zhong4 shuo1 yi1 bian4 。 。 。 hen3 nan2 shuo1 you3 shan1 nan2 fa1 sheng1 ， li3 fa4 shi1 he2 kuai4 ji4 shui2 hui4 fa1 cai2 ？'
--- a/third_party/phkit/phkit/pinyinkit/phrase_pinyin.txt.py
+++ b/third_party/phkit/phkit/pinyinkit/phrase_pinyin.txt.py
--- a/third_party/phkit/phkit/pinyinkit/single_pinyin.txt.py
+++ b/third_party/phkit/phkit/pinyinkit/single_pinyin.txt.py
--- a/third_party/phkit/requirements.txt
+++ b/third_party/phkit/requirements.txt
@ -1,5 +1,3 @@
-pypinyin
-hanziconv
 jieba
 inflect
 unidecode
--- a/third_party/phkit/setup.py
+++ b/third_party/phkit/setup.py
@ -28,7 +28,7 @@ import logging

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(os.path.splitext(os.path.basename(__name__))[0])
-install_requires = ['pypinyin>=0.41.0', 'hanziconv', 'jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
+install_requires = ['jieba>=0.42.1', 'tqdm', 'inflect', 'unidecode']
 requires = install_requires


--- a/third_party/phkit/test.py
+++ b/third_party/phkit/test.py
@ -46,6 +46,16 @@ def test_phkit():
    assert result == target


+def test_convert():
+    """Check phkit's width and simplified/traditional converters on fixed samples."""
+    from phkit import ban2quan, quan2ban, jian2fan, fan2jian
+    # Half-width -> full-width and back; the CJK punctuation "。、" is expected
+    # to pass through unchanged in both directions.
+    assert ban2quan("aA1 ,:$。、") == "ａＡ１　，：＄。、"
+    assert quan2ban("ａＡ１　，：＄。、") == "aA1 ,:$。、"
+    # Simplified -> traditional and back on the same phrase.
+    assert jian2fan("中国语言") == "中國語言"
+    assert fan2jian("中國語言") == "中国语言"
+    # Also print the converted strings (presumably for eyeballing the output
+    # when the file is run as a script).
+    print(fan2jian("中國語言"))
+    print(jian2fan("中国语言"))
+
 if __name__ == "__main__":
    print(__file__)
    test_phkit()
+    test_convert()
--- a/third_party/python-pinyin/pypinyin/core.py
+++ b/third_party/python-pinyin/pypinyin/core.py
@ -16,6 +16,7 @@ from pypinyin.converter import DefaultConverter
 from pypinyin.seg import mmseg
 from pypinyin.seg import simpleseg
 from pypinyin.utils import (_replace_tone2_style_dict_to_default)
+import jieba

 TStyle = Style
 TErrors = Union[Callable[[Text], Text], Text]
@ -139,7 +140,8 @@ class Pinyin():
        :param hans: 分词前的字符串
        :return: ``None`` or ``list``
        """
-        pass
+        outs = jieba.lcut(hans)  # 默认用jieba分词，从语义角度分词。
+        return outs

    def post_seg(self, hans: Text, seg_data: List[Text],
                 **kwargs: Any) -> Optional[List[Text]]:
--- a/third_party/python-pinyin/requirements.txt
+++ b/third_party/python-pinyin/requirements.txt
@ -10,3 +10,4 @@ Sphinx
 tox
 twine
 wheel>=0.21
+jieba
--- a/third_party/python-pinyin/setup.py
+++ b/third_party/python-pinyin/setup.py
@ -17,7 +17,7 @@ packages = [
    'pypinyin.style',
 ]

-requirements = []
+requirements = ["jieba"]
 if sys.version_info[:2] < (3, 4):
    requirements.append('enum34')
 if sys.version_info[:2] < (3, 5):
--- a/third_party/zhon/.travis.yml
+++ b/third_party/zhon/.travis.yml
@ -5,12 +5,6 @@ script: tox

 matrix:
  include:
-    - python: 2.7
-      env: TOXENV=py27
-    - python: 3.4
-      env: TOXENV=py34
-    - python: 3.5
-      env: TOXENV=py35
    - python: 3.6
      env: TOXENV=py36
    - python: 3.6
--- a/third_party/zhon/AUTHORS.rst
+++ b/third_party/zhon/AUTHORS.rst
@ -1,14 +0,0 @@
-=======
-Credits
-=======
-
-Author and Maintainer
---------------------
-
-* Thomas Roten <https://github.com/tsroten>
-
-Contributors
------------
-
-None yet. Why not be the first?
-
--- a/third_party/zhon/CHANGES.rst
+++ b/third_party/zhon/CHANGES.rst
@ -1,88 +0,0 @@
-Changes
-=======
-
-v0.1.0 (2013-05-05)
-------------------
-
-* Initial release
-
-v0.1.1 (2013-05-05)
-------------------
-
-* Adds zhon.cedict package to setup.py
-
-v0.2.0 (2013-05-07)
-------------------
-
-* Allows for mapping between simplified and traditional.
-* Adds logging to build_string().
-* Adds constants for numbered Pinyin and accented Pinyin.
-
-v0.2.1 (2013-05-07)
-------------------
-
-* Fixes typo in README.rst.
-
-v.1.0.0 (2014-01-25)
--------------------
-
-* Complete rewrite that refactors code, renames constants, and improves Pinyin
-  support.
-
-v.1.1.0 (2014-01-28)
--------------------
-
-* Adds ``zhon.pinyin.punctuation`` constant.
-* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and
-  ``zhon.pinyin.accented_sentence`` constants.
-* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and
-  ``zhon.pinyin.numbered_sentence`` constants.
-* Fixes some README.rst typos.
-* Clarifies information regarding Traditional and Simplified character
-  constants in README.rst.
-* Adds constant short names to README.rst.
-
-v.1.1.1 (2014-01-29)
--------------------
-
-* Adds documentation.
-* Adds ``zhon.cedict.all`` constant.
-* Removes duplicate code ranges from ``zhon.hanzi.characters``.
-* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of
-  a string containing code ranges.
-* Removes duplicate letters in ``zhon.pinyin.consonants``.
-* Refactors Pinyin vowels/consonant code.
-* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16.
-* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``.
-* Fixes various typos.
-* Removes numbers from Pinyin word constants. Fixes #15.
-* Adds lowercase and uppercase constants to ``zhon.pinyin``.
-* Fixes a bug with ``zhon.pinyin.sentence``.
-* Adds ``sent`` alias for ``zhon.pinyin.sentence``.
-
-v.1.1.2 (2014-01-31)
--------------------
-
-* Fixes bug with ``zhon.cedict.all``.
-
-v.1.1.3 (2014-02-12)
--------------------
-
-* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17.
-* Fixes r-suffix bug. Fixes #18.
-
-v.1.1.4 (2015-01-25)
--------------------
-
-* Removes duplicate module declarations in documentation.
-* Moves tests inside zhon package.
-* Adds travis config file.
-* Adds Python 3.4 tests to travis and tox.
-* Fixes flake8 warnings.
-* Adds distutils fallback import statement to setup.py.
-* Adds missing hanzi punctuation. Fixes #19.
-
-v.1.1.5 (2016-05-23)
--------------------
-
-* Add missing Zhuyin characters. Fixes #23.
--- a/third_party/zhon/CONTRIBUTING.rst
+++ b/third_party/zhon/CONTRIBUTING.rst
@ -1,107 +0,0 @@
-============
-Contributing
-============
-
-Contributions are welcome, and they are greatly appreciated! Every
-little bit helps, and credit will always be given. 
-
-You can contribute in many ways:
-
-Types of Contributions
----------------------
-
-Report Bugs
-~~~~~~~~~~~
-
-Report bugs at https://github.com/tsroten/zhon/issues.
-
-If you are reporting a bug, please include:
-
-* Your operating system name and version.
-* Any details about your local setup that might be helpful in troubleshooting.
-* Detailed steps to reproduce the bug.
-
-Fix Bugs
-~~~~~~~~
-
-Look through the GitHub issues for bugs. Anything tagged with "bug"
-is open to whoever wants to implement it.
-
-Implement Features
-~~~~~~~~~~~~~~~~~~
-
-Look through the GitHub issues for features. Anything tagged with "feature"
-is open to whoever wants to implement it.
-
-Write Documentation
-~~~~~~~~~~~~~~~~~~~
-
-Zhon could always use more documentation, whether as part of the 
-official Zhon docs, in docstrings, or even on the web in blog posts,
-articles, and such.
-
-Submit Feedback
-~~~~~~~~~~~~~~~
-
-The best way to send feedback is to file an issue at https://github.com/tsroten/zhon/issues.
-
-If you are proposing a feature:
-
-* Explain in detail how it would work.
-* Keep the scope as narrow as possible, to make it easier to implement.
-* Remember that this is a volunteer-driven project, and that contributions
-  are welcome :)
-
-Get Started!
------------
-
-Ready to contribute? Here's how to set up `zhon` for local development.
-
-1. Fork the `zhon` repo on GitHub.
-2. Clone your fork locally::
-
-    $ git clone git@github.com:your_name_here/zhon.git
-
-3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
-
-    $ mkvirtualenv zhon
-    $ cd zhon/
-    $ python setup.py develop
-
-4. Create a branch for local development::
-
-    $ git checkout -b name-of-your-bugfix-or-feature
-   
-   Now you can make your changes locally.
-
-5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
-
-    $ flake8 zhon
-    $ python setup.py test
-    $ tox
-
-   To get flake8 and tox, just pip install them into your virtualenv.
-
-   You can ignore the flake8 errors regarding `zhon.cedict` files. Rather than include hundreds of newline characters in each file, we are ignoring those errors.
-
-6. Commit your changes and push your branch to GitHub::
-
-    $ git add .
-    $ git commit -m "Your detailed description of your changes."
-    $ git push origin name-of-your-bugfix-or-feature
-
-7. Submit a pull request through the GitHub website.
-
-Pull Request Guidelines
-----------------------
-
-Before you submit a pull request, check that it meets these guidelines:
-
-1. The pull request should include tests.
-2. If the pull request adds functionality, the docs should be updated. Put
-   your new functionality into a function with a docstring, and add the
-   feature to the list in README.rst.
-3. The pull request should work for Python 2.7, 3.3, and 3.4. Check 
-   https://travis-ci.org/tsroten/zhon/pull_requests
-   and make sure that the tests pass for all supported Python versions.
-4. If you want to receive credit, add your name to `AUTHORS.rst`.
--- a/third_party/zhon/LICENSE.txt
+++ b/third_party/zhon/LICENSE.txt
@ -1,7 +0,0 @@
-Copyright (c) 2013-2014 Thomas Roten
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/third_party/zhon/README.rst
+++ b/third_party/zhon/README.rst
@ -61,4 +61,3 @@ Getting Started
 * `Install Zhon <http://zhon.readthedocs.org/en/latest/#installation>`_
 * Read `Zhon's introduction <http://zhon.readthedocs.org/en/latest/#using-zhon>`_
 * Learn from the `API documentation <http://zhon.readthedocs.org/en/latest/#zhon-hanzi>`_
-* `Contribute <https://github.com/tsroten/zhon/blob/develop/CONTRIBUTING.rst>`_ documentation, code, or feedback
--- a/third_party/zhon/setup.py
+++ b/third_party/zhon/setup.py
@ -37,9 +37,6 @@ setup(
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Text Processing :: Linguistic',