add Cedict egs & pypinyin using jieba as wordseg & phkit using local pypinyin package (#637)
* down and parse cedict * remove useless * using third party python pinyin * jieba as default wordseg * remove useless * remove pinyin dict * using jieba.lcut * remove doc of cedict egs * add fan2jian test * add description for say_digit
pull/640/head
parent
f22f681992
commit
749a113037
@ -0,0 +1,2 @@
|
|||||||
|
data
|
||||||
|
exp
|
@ -0,0 +1,78 @@
|
|||||||
|
# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
|
||||||
|
|
||||||
|
#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
|
||||||
|
|
||||||
|
#Pass the path of the CC-CEDICT file (e.g. cedict_ts.u8) as the first command-line argument and the path of the output file as the second.
|
||||||
|
|
||||||
|
#Lines that start with '#' (such as the copyright information at the top of the CEDICT text file) or with '%' are skipped by the parser, so the file does not need to be edited before parsing.
|
||||||
|
|
||||||
|
#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply remove the call to remove_surnames() in main().
|
||||||
|
|
||||||
|
#This code was written by Franki Allegra in February 2020.
|
||||||
|
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
|
# usage: bin ccedict dump.json
|
||||||
|
|
||||||
|
# Read the whole dictionary file named by the first command-line argument.
# The encoding is passed explicitly: the file contains non-ASCII (Chinese)
# text, and relying on the locale's default encoding can raise
# UnicodeDecodeError when the locale is not UTF-8.
with open(sys.argv[1], 'rt', encoding='utf-8') as file:
    text = file.read()

# One list element per line of the file; dict_lines is the copy main() walks.
lines = text.split('\n')
dict_lines = list(lines)
|
||||||
|
|
||||||
|
def parse_line(line):
    """Parse one dictionary line and append the entry to ``list_of_dicts``.

    An entry line is expected to look like
    ``traditional simplified [pinyin] /english 1/english 2/``.  Only the
    first ``/``-separated English field is stored.

    :param line: one line of the dictionary file, without its newline
    :returns: 0 when the line is skipped (empty, starting with ``#`` or
              ``%``, or not shaped like an entry); None after an entry has
              been appended to ``list_of_dicts``
    """
    # Blank lines and '#' / '%' lines carry no entry.  The shared dict_lines
    # list is intentionally NOT modified here: main() is iterating over it,
    # and removing an element mid-iteration makes the loop skip the line
    # that follows.
    if line == '' or line.startswith(('#', '%')):
        return 0

    fields = line.rstrip('/').split('/')
    if len(fields) <= 1:
        return 0
    english = fields[1]

    # fields[0] is "traditional simplified [pinyin] ".
    head = fields[0].split('[')
    if len(head) < 2:
        # No pinyin bracket: not an entry line, skip instead of crashing.
        return 0
    characters = head[0].split()
    if len(characters) < 2:
        # Both the traditional and the simplified form are required.
        return 0

    pinyin = head[1].rstrip().rstrip("]")

    # Key order is kept stable because it is the order json.dumps emits.
    list_of_dicts.append({
        'traditional': characters[0],
        'simplified': characters[1],
        'pinyin': pinyin,
        'english': english,
    })
|
||||||
|
|
||||||
|
def remove_surnames():
    """Drop surname entries that are directly followed by a same-word entry.

    Walks ``list_of_dicts`` from the end towards the start and removes, in
    place, every entry whose ``english`` text contains ``"surname "`` when
    the entry right after it has the same ``traditional`` form.
    """
    # Start at the second-to-last index: the last entry has no successor to
    # compare against, and indexing past the end would raise IndexError.
    for idx in range(len(list_of_dicts) - 2, -1, -1):
        entry = list_of_dicts[idx]
        if "surname " not in entry['english']:
            continue
        if entry['traditional'] == list_of_dicts[idx + 1]['traditional']:
            list_of_dicts.pop(idx)
|
||||||
|
|
||||||
|
def main():
    """Parse every dictionary line, drop surname duplicates, write JSON lines.

    The output file is the second command-line argument; each parsed entry
    is written to it as one JSON object per line.
    """
    # Turn each line of the dictionary into an entry.
    print("Parsing dictionary . . .")
    for raw_line in dict_lines:
        parse_line(raw_line)

    # Optional step: remove the surname entries from the data.
    print("Removing Surnames . . .")
    remove_surnames()

    print("Saving to database (this may take a few minutes) . . .")
    with open(sys.argv[2], 'wt') as out_file:
        out_file.writelines(
            json.dumps(entry) + "\n" for entry in list_of_dicts)
    print('Done!')
|
||||||
|
|
||||||
|
# Module-level accumulator: parse_line() appends to it, remove_surnames()
# prunes it, and main() writes it out.
list_of_dicts = list()

# Run the conversion.  main() has no return statement, so parsed_dict is
# bound to None.
parsed_dict = main()
|
@ -0,0 +1,10 @@
|
|||||||
|
# Project root: two levels above the current working directory.
export MAIN_ROOT="${PWD}/../../"

# Make the project root and its utils directory available on PATH.
export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"

export LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}"
|
@ -0,0 +1,39 @@
|
|||||||
|
#!/bin/bash

# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
# The word dictionary of this website is based on CC-CEDICT.
# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.

set -e
source path.sh

# Stages from $stage to $stop_stage (inclusive) are executed.
stage=-1
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
cedict=cedict_1_0_ts_utf-8_mdbg.zip

mkdir -p data

# Stage -1: download the CC-CEDICT archive (unless it is already in data/)
# and unpack it.
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
    test -f "data/${cedict}" || wget -O "data/${cedict}" "${cedict_url}"
    pushd data
    # -o overwrites already-extracted files without prompting, so the stage
    # can be re-run non-interactively.
    unzip -o "${cedict}"
    popd
fi

mkdir -p exp

# Stage 0: convert the unpacked dictionary to one JSON object per line.
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    cp data/cedict_ts.u8 exp/cedict
    python3 local/parser.py exp/cedict exp/cedict.json
fi
|
||||||
|
|
@ -0,0 +1,99 @@
|
|||||||
|
# Copyright 2014 Bernard Yue
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
__doc__ = """
|
||||||
|
Hanzi Converter 繁簡轉換器 | 繁简转换器
|
||||||
|
This module provides functions converting chinese text between simplified and
|
||||||
|
traditional characters. It returns unicode representation of the text.
|
||||||
|
Class HanziConv is the main entry point of the module, you can import the
|
||||||
|
class by doing:
|
||||||
|
>>> from hanziconv import HanziConv
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from zhon import cedict
|
||||||
|
|
||||||
|
class HanziConv():
    """This class supports hanzi (漢字) conversion between simplified and
    traditional format.

    Conversion is done character by character with the two strings
    ``cedict.traditional`` and ``cedict.simplified``: a character found at
    index *i* of one string is replaced by the character at index *i* of
    the other.  Characters not present in the source string are kept as is.
    """
    __traditional_charmap = cedict.traditional
    __simplified_charmap = cedict.simplified
    # Lookup tables built on first use, keyed by direction
    # (True: simplified -> traditional, False: traditional -> simplified).
    __table_cache = {}

    @classmethod
    def __lookup_table(cls, toTraditional):
        """Return the char -> char mapping for the requested direction.

        The table is built once per direction and cached, so converting a
        text costs one dict lookup per character instead of a linear scan
        of the whole charmap string.

        :param toTraditional: True for simplified -> traditional, False for
                              traditional -> simplified
        :returns: dict mapping a source character to its replacement
        """
        table = cls.__table_cache.get(toTraditional)
        if table is None:
            if toTraditional:
                fromMap = cls.__simplified_charmap
                toMap = cls.__traditional_charmap
            else:
                fromMap = cls.__traditional_charmap
                toMap = cls.__simplified_charmap
            table = {}
            for src, dst in zip(fromMap, toMap):
                # setdefault keeps the FIRST occurrence of a repeated source
                # character, matching what str.find() would have returned.
                table.setdefault(src, dst)
            cls.__table_cache[toTraditional] = table
        return table

    @classmethod
    def __convert(cls, text, toTraditional=True):
        """Convert `text` to Traditional characters if `toTraditional` is
        True, else convert to simplified characters

        :param text:           data to convert; bytes are decoded as UTF-8
        :param toTraditional:  True -- convert to traditional text
                               False -- convert to simplified text
        :returns:              converted `text`
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        table = cls.__lookup_table(bool(toTraditional))
        # Characters without a mapping are passed through unchanged.
        return ''.join(table.get(c, c) for c in text)

    @classmethod
    def toSimplified(cls, text):
        """Convert `text` to simplified character string.  Assuming text is
        traditional character string

        :param text:  text to convert
        :returns:     converted UTF-8 characters

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toSimplified('繁簡轉換器'))
        繁简转换器
        """
        return cls.__convert(text, toTraditional=False)

    @classmethod
    def toTraditional(cls, text):
        """Convert `text` to traditional character string.  Assuming text is
        simplified character string

        :param text:  text to convert
        :returns:     converted UTF-8 characters

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.toTraditional('繁简转换器'))
        繁簡轉換器
        """
        return cls.__convert(text, toTraditional=True)

    @classmethod
    def same(cls, text1, text2):
        """Return True if text1 and text2 meant literally the same, False
        otherwise

        Both texts are converted to simplified characters and the results
        are compared for equality.

        :param text1: string to compare to ``text2``
        :param text2: string to compare to ``text1``
        :returns:     **True**  -- ``text1`` and ``text2`` are the same in meaning,
                      **False** -- otherwise

        >>> from hanziconv import HanziConv
        >>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
        True
        """
        t1 = cls.toSimplified(text1)
        t2 = cls.toSimplified(text2)
        return t1 == t2
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,14 +0,0 @@
|
|||||||
=======
|
|
||||||
Credits
|
|
||||||
=======
|
|
||||||
|
|
||||||
Author and Maintainer
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
* Thomas Roten <https://github.com/tsroten>
|
|
||||||
|
|
||||||
Contributors
|
|
||||||
------------
|
|
||||||
|
|
||||||
None yet. Why not be the first?
|
|
||||||
|
|
@ -1,88 +0,0 @@
|
|||||||
Changes
|
|
||||||
=======
|
|
||||||
|
|
||||||
v0.1.0 (2013-05-05)
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* Initial release
|
|
||||||
|
|
||||||
v0.1.1 (2013-05-05)
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* Adds zhon.cedict package to setup.py
|
|
||||||
|
|
||||||
v0.2.0 (2013-05-07)
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* Allows for mapping between simplified and traditional.
|
|
||||||
* Adds logging to build_string().
|
|
||||||
* Adds constants for numbered Pinyin and accented Pinyin.
|
|
||||||
|
|
||||||
v0.2.1 (2013-05-07)
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
* Fixes typo in README.rst.
|
|
||||||
|
|
||||||
v.1.0.0 (2014-01-25)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Complete rewrite that refactors code, renames constants, and improves Pinyin
|
|
||||||
support.
|
|
||||||
|
|
||||||
v.1.1.0 (2014-01-28)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Adds ``zhon.pinyin.punctuation`` constant.
|
|
||||||
* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and
|
|
||||||
``zhon.pinyin.accented_sentence`` constants.
|
|
||||||
* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and
|
|
||||||
``zhon.pinyin.numbered_sentence`` constants.
|
|
||||||
* Fixes some README.rst typos.
|
|
||||||
* Clarifies information regarding Traditional and Simplified character
|
|
||||||
constants in README.rst.
|
|
||||||
* Adds constant short names to README.rst.
|
|
||||||
|
|
||||||
v.1.1.1 (2014-01-29)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Adds documentation.
|
|
||||||
* Adds ``zhon.cedict.all`` constant.
|
|
||||||
* Removes duplicate code ranges from ``zhon.hanzi.characters``.
|
|
||||||
* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of
|
|
||||||
a string containing code ranges.
|
|
||||||
* Removes duplicate letters in ``zhon.pinyin.consonants``.
|
|
||||||
* Refactors Pinyin vowels/consonant code.
|
|
||||||
* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16.
|
|
||||||
* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``.
|
|
||||||
* Fixes various typos.
|
|
||||||
* Removes numbers from Pinyin word constants. Fixes #15.
|
|
||||||
* Adds lowercase and uppercase constants to ``zhon.pinyin``.
|
|
||||||
* Fixes a bug with ``zhon.pinyin.sentence``.
|
|
||||||
* Adds ``sent`` alias for ``zhon.pinyin.sentence``.
|
|
||||||
|
|
||||||
v.1.1.2 (2014-01-31)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Fixes bug with ``zhon.cedict.all``.
|
|
||||||
|
|
||||||
v.1.1.3 (2014-02-12)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17.
|
|
||||||
* Fixes r-suffix bug. Fixes #18.
|
|
||||||
|
|
||||||
v.1.1.4 (2015-01-25)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Removes duplicate module declarations in documentation.
|
|
||||||
* Moves tests inside zhon package.
|
|
||||||
* Adds travis config file.
|
|
||||||
* Adds Python 3.4 tests to travis and tox.
|
|
||||||
* Fixes flake8 warnings.
|
|
||||||
* Adds distutil fallback import statement to setup.py.
|
|
||||||
* Adds missing hanzi punctuation. Fixes #19.
|
|
||||||
|
|
||||||
v.1.1.5 (2016-05-23)
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
* Add missing Zhuyin characters. Fixes #23.
|
|
@ -1,107 +0,0 @@
|
|||||||
============
|
|
||||||
Contributing
|
|
||||||
============
|
|
||||||
|
|
||||||
Contributions are welcome, and they are greatly appreciated! Every
|
|
||||||
little bit helps, and credit will always be given.
|
|
||||||
|
|
||||||
You can contribute in many ways:
|
|
||||||
|
|
||||||
Types of Contributions
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
Report Bugs
|
|
||||||
~~~~~~~~~~~
|
|
||||||
|
|
||||||
Report bugs at https://github.com/tsroten/zhon/issues.
|
|
||||||
|
|
||||||
If you are reporting a bug, please include:
|
|
||||||
|
|
||||||
* Your operating system name and version.
|
|
||||||
* Any details about your local setup that might be helpful in troubleshooting.
|
|
||||||
* Detailed steps to reproduce the bug.
|
|
||||||
|
|
||||||
Fix Bugs
|
|
||||||
~~~~~~~~
|
|
||||||
|
|
||||||
Look through the GitHub issues for bugs. Anything tagged with "bug"
|
|
||||||
is open to whoever wants to implement it.
|
|
||||||
|
|
||||||
Implement Features
|
|
||||||
~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
Look through the GitHub issues for features. Anything tagged with "feature"
|
|
||||||
is open to whoever wants to implement it.
|
|
||||||
|
|
||||||
Write Documentation
|
|
||||||
~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
Zhon could always use more documentation, whether as part of the
|
|
||||||
official Zhon docs, in docstrings, or even on the web in blog posts,
|
|
||||||
articles, and such.
|
|
||||||
|
|
||||||
Submit Feedback
|
|
||||||
~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
The best way to send feedback is to file an issue at https://github.com/tsroten/zhon/issues.
|
|
||||||
|
|
||||||
If you are proposing a feature:
|
|
||||||
|
|
||||||
* Explain in detail how it would work.
|
|
||||||
* Keep the scope as narrow as possible, to make it easier to implement.
|
|
||||||
* Remember that this is a volunteer-driven project, and that contributions
|
|
||||||
are welcome :)
|
|
||||||
|
|
||||||
Get Started!
|
|
||||||
------------
|
|
||||||
|
|
||||||
Ready to contribute? Here's how to set up `zhon` for local development.
|
|
||||||
|
|
||||||
1. Fork the `zhon` repo on GitHub.
|
|
||||||
2. Clone your fork locally::
|
|
||||||
|
|
||||||
$ git clone git@github.com:your_name_here/zhon.git
|
|
||||||
|
|
||||||
3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
|
|
||||||
|
|
||||||
$ mkvirtualenv zhon
|
|
||||||
$ cd zhon/
|
|
||||||
$ python setup.py develop
|
|
||||||
|
|
||||||
4. Create a branch for local development::
|
|
||||||
|
|
||||||
$ git checkout -b name-of-your-bugfix-or-feature
|
|
||||||
|
|
||||||
Now you can make your changes locally.
|
|
||||||
|
|
||||||
5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox::
|
|
||||||
|
|
||||||
$ flake8 zhon
|
|
||||||
$ python setup.py test
|
|
||||||
$ tox
|
|
||||||
|
|
||||||
To get flake8 and tox, just pip install them into your virtualenv.
|
|
||||||
|
|
||||||
You can ignore the flake8 errors regarding `zhon.cedict` files. Rather than include hundreds of newline characters in each file, we are ignoring those errors.
|
|
||||||
|
|
||||||
6. Commit your changes and push your branch to GitHub::
|
|
||||||
|
|
||||||
$ git add .
|
|
||||||
$ git commit -m "Your detailed description of your changes."
|
|
||||||
$ git push origin name-of-your-bugfix-or-feature
|
|
||||||
|
|
||||||
7. Submit a pull request through the GitHub website.
|
|
||||||
|
|
||||||
Pull Request Guidelines
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
Before you submit a pull request, check that it meets these guidelines:
|
|
||||||
|
|
||||||
1. The pull request should include tests.
|
|
||||||
2. If the pull request adds functionality, the docs should be updated. Put
|
|
||||||
your new functionality into a function with a docstring, and add the
|
|
||||||
feature to the list in README.rst.
|
|
||||||
3. The pull request should work for Python 2.7, 3.3, and 3.4. Check
|
|
||||||
https://travis-ci.org/tsroten/zhon/pull_requests
|
|
||||||
and make sure that the tests pass for all supported Python versions.
|
|
||||||
4. If you want to receive credit, add your name to `AUTHORS.rst`.
|
|
@ -1,7 +0,0 @@
|
|||||||
Copyright (c) 2013-2014 Thomas Roten
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
Loading…
Reference in new issue