add pypinyin tools

pull/578/head
Hui Zhang 4 years ago
parent e94da615a3
commit ab7aa43d15

@ -0,0 +1,6 @@
[bumpversion]
commit = True
tag = True
current_version = 0.41.0
[bumpversion:file:pypinyin/__init__.py]

@ -0,0 +1,153 @@
# Python CircleCI 2.0 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
version: 2
jobs:
python3.8: &DEFAULT
docker:
- image: circleci/python:3.8
environment:
TOX_ENV: py38
RUN_CHECK: 1
working_directory: ~/repo
steps:
- checkout
# Download and cache dependencies
# - restore_cache:
# keys:
# - v1-dependencies-{{ .Environment.TOX_ENV }}-{{ checksum "requirements_dev.txt" }}
- run:
name: install dependencies
command: |
# pip install -U pip virtualenv --user
if ! which virtualenv; then
pip install 'virtualenv<=20.0.21' --user
fi
export PATH="~/.local/bin:$PATH"
virtualenv venv
. venv/bin/activate
pip install codecov
pip install tox
if [[ $RUN_CHECK == 1 ]]; then
pip install -U -r requirements_dev.txt
fi
if [[ $(python -c "import sys; print(sys.stdin.encoding)" |grep None) ]]; then
export PYTHONIOENCODING=utf-8
fi
#
# - save_cache:
# paths:
# - ./venv
# key: v1-dependencies-{{ .Environment.TOX_ENV }}-{{ checksum "requirements_dev.txt" }}
- run:
name: run tests
command: |
. venv/bin/activate
if [[ $RUN_CHECK == 1 ]]; then
pre-commit run --all-files
mypy pypinyin
fi
tox -e $TOX_ENV
python setup.py install
pypinyin hello
echo hello | pypinyin
pypinyin < setup.cfg
codecov
- store_artifacts:
path: test-reports
destination: test-reports
python3.9:
<<: *DEFAULT
docker:
- image: circleci/python:3.9
environment:
TOX_ENV: py39
python3.7:
<<: *DEFAULT
docker:
- image: circleci/python:3.7
environment:
TOX_ENV: py37
python3.6:
<<: *DEFAULT
docker:
- image: circleci/python:3.6
environment:
TOX_ENV: py36
python3.5:
<<: *DEFAULT
docker:
- image: circleci/python:3.5
environment:
TOX_ENV: py35
python3.4:
<<: *DEFAULT
docker:
- image: circleci/python:3.4
environment:
TOX_ENV: py34
python2.7:
<<: *DEFAULT
docker:
- image: circleci/python:2.7
environment:
TOX_ENV: py27
# python2.6:
# <<: *DEFAULT
# docker:
# - image: python:2.6
# environment:
# TOX_ENV: py33
pypy2:
<<: *DEFAULT
docker:
- image: pypy:2
environment:
TOX_ENV: pypy
pypy3:
<<: *DEFAULT
docker:
- image: pypy:3
environment:
TOX_ENV: pypy3
workflows:
version: 2
testing:
jobs:
- python3.9
- python3.8
- python3.7
- python3.6
- python3.5
- python3.4
- python2.7
- pypy2
- pypy3

@ -0,0 +1,15 @@
[run]
branch = True
omit =
# pypinyin/runner.py
pypinyin/__main__.py
[report]
exclude_lines =
pragma: no cover
except NameError
except ImportError
pass
def main
if py3:
if __name__ == .__main__.:

@ -0,0 +1,25 @@
# EditorConfig is awesome: http://EditorConfig.org
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
# Indentation
[*.{py,rst}]
indent_style = space
indent_size = 4
[Makefile]
indent_style = tab
indent_size = 4
[*.{ini,yml}]
indent_style = space
indent_size = 2
[*.md]
trim_trailing_whitespace = false

@ -0,0 +1,50 @@
[flake8]
########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120
################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
# git folder
.git,
# python cache
__pycache__,
third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
*.py
########## RULES ##########
# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator
# Specify a list of codes to ignore.
ignore =
W503
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
W291,W293,W605
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
# Specify the list of error codes you wish Flake8 to report.
select =
E,
W,
F,
C

@ -0,0 +1,16 @@
# Contributing
* 如果是关于单个汉字的拼音有误的问题,麻烦前往 [pinyin-data][pinyin-data] 进行反馈。
* 如果是关于词组的拼音有误的问题,麻烦前往 [phrase-pinyin-data][phrase-pinyin-data] 进行反馈。
* 有任何疑问或建议欢迎创建 [issue][issue] 或提交 [PR][pr] 。
* 项目代码开发方面的问题可以看看 [开发文档][开发文档] 。
Thanks for contributing! :heart:
[pinyin-data]: https://github.com/mozillazg/pinyin-data/issues
[phrase-pinyin-data]: https://github.com/mozillazg/phrase-pinyin-data
[issue]: https://github.com/mozillazg/python-pinyin/issues
[pr]: https://github.com/mozillazg/python-pinyin/pulls
[开发文档]: https://pypinyin.readthedocs.io/zh_CN/develop/develop.html

@ -0,0 +1,21 @@
## 运行环境
* 操作系统(Linux/macOS/Windows):
* Python 版本:
* pypinyin 版本:
<!--
P.S. 可以通过 `python -V` 获取 Python 版本。
P.S. 可以通过 `pypinyin -V` 或者 `pip freeze |grep pypinyin` 或者 `pypinyin.__version__` 获取 pypinyin 版本信息。
-->
## 问题描述
## 问题复现步骤
<!--
感谢反馈!❤️
-->

@ -0,0 +1,15 @@
## PR 描述
## 待办事项
* [ ] 符合代码规范
* [ ] 单元测试
* [ ] 文档
<!--
感谢你的贡献!❤️
P.S. 麻烦选择 `develop` 分支作为 PR 的目标分支,谢谢~
-->

@ -0,0 +1,29 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: CI
on: [push, pull_request]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
# python-version: [3.7, 3.8]
python-version: [3.9]
tox-env: [py37, py38, py39]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install tox
- name: Test with tox
run: tox -e ${{ matrix.tox-env}}

@ -0,0 +1,71 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"
on:
push:
branches: [master, develop]
pull_request:
# The branches below must be a subset of the branches above
branches: [master, develop]
schedule:
- cron: '0 2 * * 6'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Override automatic language detection by changing the below list
# Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
language: ['python']
# Learn more...
# https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection
steps:
- name: Checkout repository
uses: actions/checkout@v2
with:
# We must fetch at least the immediate parents so that if this is
# a pull request then we can checkout the head.
fetch-depth: 2
# If this run was triggered by a pull request event, then checkout
# the head of the pull request instead of the merge commit.
- run: git checkout HEAD^2
if: ${{ github.event_name == 'pull_request' }}
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

@ -0,0 +1,54 @@
*.py[cod]
*.sw[op]
# C extensions
*.so
# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
_build
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
nosetests.xml
htmlcov
# Translations
*.mo
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
tools/words.txt
*~
tools/phrases_dict.txt
venv
.cache/
2.7/
.python-version
venv2.7/
venvPyInstaller/
output.dat
vocab.bin
vocab.large.bin
.mypy_cache/
.pytest_cache/
/pypinyin/phrases_dict_large.py

@ -0,0 +1,6 @@
[submodule "pinyin-data"]
path = pinyin-data
url = https://github.com/mozillazg/pinyin-data.git
[submodule "phrase-pinyin-data"]
path = phrase-pinyin-data
url = https://github.com/mozillazg/phrase-pinyin-data.git

@ -0,0 +1,29 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks.git
rev: v3.4.0
hooks:
- id: check-merge-conflict
- id: debug-statements
exclude: 'tools/|(pypinyin/(phrases_dict.py|pinyin_dict.py|phonetic_symbol.py))'
- id: double-quote-string-fixer
exclude: 'pypinyin/(phrases_dict.py|pinyin_dict.py|phonetic_symbol.py)'
- id: end-of-file-fixer
exclude: '.bumpversion.cfg'
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.4
hooks:
- id: flake8
exclude: 'tools|pypinyin/(phrases_dict.py|pinyin_dict.py|phonetic_symbol.py)|(docs/conf.py)'
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: 'v0.812'
# hooks:
# - id: mypy
# files: 'pypinyin/'
- repo: https://github.com/pre-commit/mirrors-yapf.git
sha: v0.16.0
hooks:
- id: yapf
files: \.py$
exclude: (?=phrase-pinyin-data|pinyin-data).*(\.py)$

@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80

@ -0,0 +1,12 @@
{
"scanSettings": {
"baseBranches": []
},
"checkRunSettings": {
"vulnerableCheckRunConclusionLevel": "failure",
"displayMode": "diff"
},
"issueSettings": {
"minSeverityLevel": "LOW"
}
}

@ -0,0 +1,903 @@
Changelog
---------
`0.41.0`_ (2021-03-13)
++++++++++++++++++++++++
* **[New]** 新增 ``pypinyin.contrib.tone_convert`` 模块,用于
``Style.TONE``、``Style.TONE2``、``Style.TONE3``、``Style.NORMAL`` 风格的拼音之间互相转换。
详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/develop/contrib.html#tone-convert>`__
* **[Improved]** 使用 `pinyin-data`_ v0.10.2 的拼音数据。
`0.40.0`_ (2020-11-22)
++++++++++++++++++++++++
* **[Improved]** 精简 phrases_dict, 删除 phrases_dict 中凡是能通过 pinyin_dict 得到相同结果的数据。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.5 的词语拼音数据。
* **[Improved]** 使用 `pinyin-data`_ v0.10.1 的拼音数据。
`0.39.1`_ (2020-10-08)
++++++++++++++++++++++++
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.4 的词语拼音数据。
* **[Improved]** 使用 `pinyin-data`_ v0.10.0 的拼音数据。
`0.39.0`_ (2020-08-16)
++++++++++++++++++++++++
* **[New]** ``pinyin`` 和 ``lazy_pinyin`` 函数增加参数 ``v_to_u`` 和 ``neutral_tone_with_five``:
* ``v_to_u=True`` 时在无声调相关拼音风格下使用 ``ü`` 代替原来的 ``v``
.. code-block:: python
>>> lazy_pinyin('战略')
['zhan', 'lve']
>>> lazy_pinyin('战略', v_to_u=True)
['zhan', 'lüe']
* ``neutral_tone_with_five=True`` 时在数字标识声调相关风格下使用 ``5`` 标识轻声
.. code-block:: python
>>> lazy_pinyin('衣裳', style=Style.TONE3)
['yi1', 'shang']
>>> lazy_pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
['yi1', 'shang5']
`0.38.1`_ (2020-07-05)
++++++++++++++++++++++++
* **[Improved]** 优化内置分词,处理前缀匹配导致无法正确识别尾部词语的问题。 Fixed `#205`_
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.3 的词语拼音数据。
`0.38.0`_ (2020-06-07)
++++++++++++++++++++++++
* **[Improved]** 优化内置分词,严格按照是否是词语来分词。 Fixed `#139`_
* **[Improved]** 使用 `pinyin-data`_ v0.9.0 的拼音数据。
`0.37.0`_ (2020-02-09)
++++++++++++++++++++++++
* **[Bugfixed]** 修复 ``NeutralToneWith5Mixin`` 在 ``TONE3`` 相关风格下未把 5 标在预期的拼音末尾位置。
* **[New]** 增加 Python 3.8 下的测试,正式支持 Python 3.8 。
`0.36.0`_ (2019-10-27)
+++++++++++++++++++++++
* **[New]** 增加 ``V2UMixin`` 用于支持无声调相关拼音风格下的结果使用 ``ü`` 代替原来的 ``v``
详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/contrib.html#v2umixin>`__
* **[New]** 增加 ``NeutralToneWith5Mixin`` 用于支持使用数字表示声调的拼音风格下使用 5 标识轻声。
详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/contrib.html#neutraltonewith5mixin>`__
* **[New]** 增加 ``Pinyin`` 和 ``DefaultConverter`` 类用于实现自定义处理过程和结果
(实验性功能,绝大部分用户无需关心新增的这两个类)。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.2 的词语拼音数据。
* **[Improved]** 使用 `pinyin-data`_ v0.8.1 的拼音数据。
`0.35.4`_ (2019-07-13)
+++++++++++++++++++++++
* **[Bugfixed]** 修复 ```` ``ê̄`` ``ế`` ``ê̌`` ```` 这几个音无法转换为不含声调结果的问题。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.1 的词语拼音数据。 Fixed `#174`_
* **[Improved]** 使用 `pinyin-data`_ v0.8.0 的拼音数据。
* **[Improved]** 修复一处参数注释错误。(via `#176`_ Thanks `@yangwe1`_)
`0.35.3`_ (2019-05-11)
++++++++++++++++++++++++
* **[Bugfixed]** 修复鼻音 ```` 无法转换为不含声调结果的问题。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.10.0 的词语拼音数据。
Fixed `#166`_ `#167`_ `#169`_ `#170`_
* **[Improved]** Windows CI 增加在 x64 下跑测试 (via `#164`_ Thanks `@hanabi1224`_)
`0.35.2`_ (2019-04-06)
+++++++++++++++++++++++
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.9.2 的词语拼音数据。 Fixed `#159`_ `#160`_
* **[Improved]** 使用 `pinyin-data`_ v0.7.0 的拼音数据。
`0.35.1`_ (2019-03-02)
+++++++++++++++++++++++
* **[Bugfixed]** 修复 ``朝阳`` 在 ``heteronym=False`` 时输出了多个音的情况。
`0.35.0`_ (2019-02-24)
+++++++++++++++++++++++
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.9.0 的词语拼音数据。 Fixed `#154`_ `#149`_
* **[New]** 支持 ``朝阳`` 这种一个词多个音( ``'朝阳': [['zhāo', 'cháo'], ['yáng']]`` )在多音字模式下输出多个音。 Fixed `#154`_
`0.34.1`_ (2018-12-30)
+++++++++++++++++++++++
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.8.5 的词语拼音数据。 Fixed `#151`_
`0.34.0`_ (2018-12-08)
+++++++++++++++++++++++
不兼容旧版的变更
~~~~~~~~~~~~~~~~~~
* **[Changed]** 当 ``errors`` 参数的值是个回调对象并且返回值是个 ``list`` 时,
会使用这个 list 来 extend 结果 list (via `#147`_ . Thanks `@howl-anderson`_ ) ::
# 更新前
>>> pinyin('你好☆☆', errors=lambda x: ['star' for _ in x])
[['nǐ'], ['hǎo'], ['star', 'star']]
# 更新后
>>> pinyin('你好☆☆', errors=lambda x: ['star' for _ in x])
[['nǐ'], ['hǎo'], ['star'], ['star']]
详见文档: https://pypinyin.readthedocs.io/zh_CN/develop/usage.html#handle-no-pinyin
`0.33.2`_ (2018-11-03)
++++++++++++++++++++++++
* **[Bugfixed]** 修复 ``strict=True`` 时韵母相关风格下没有正确处理韵母 ``üan`` 的问题。
`0.33.1`_ (2018-09-23)
++++++++++++++++++++++++
* **[Improved]** 使用 `pinyin-data`_ v0.6.2 的拼音数据。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.8.4 的词语拼音数据。
`0.33.0`_ (2018-08-05)
++++++++++++++++++++++++
* **[Bugfixed]** 修复命令行程序在 ``sys.stdin.encoding`` 为 ``None`` 时无法正常工作的问题。
* **[Improved]** 使用 `pinyin-data`_ v0.6.1 的拼音数据。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.8.3 的词语拼音数据。
* Fixed `#137`_
* **[Changed]** 不再测试 Python 2.6 和 Python 3.3,增加测试 Python 3.7 和 PyPy3
即不保证程序兼容 Python 2.6 和 Python 3.3。
`0.32.0`_ (2018-07-28)
++++++++++++++++++++++++
* **[Improved]** 使用 `pinyin-data`_ v0.6.0 的拼音数据。
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.8.2 的词语拼音数据。
`0.31.0`_ (2018-06-10)
++++++++++++++++++++++++
* **[New]** 增加 py.typed 标记文件,支持 `PEP 561`_ (via `#130`_)
* **[Changed]** 使用 `phrase-pinyin-data`_ v0.7.3 的词语拼音数据。
* fixed `#112`_ `#117`_ `#122`_ `#131`_
* 精简词组拼音,删除部分数据有误的拼音数据
`0.30.1`_ (2018-04-25)
++++++++++++++++++++++++
* **[Improved]** 更新文档和测试。(via `7fa0b87 <https://github.com/mozillazg/python-pinyin/commit/7fa0b879df47e8a7e5af5edb5f243dd4ea645410>`_)
* **[Improved]** 对用户传入的已进行分词处理的数据进行二次分词以便提高准确性。(via `#126`_)
* **[Improved]** 使用 `pinyin-data`_ v0.5.1 的拼音数据。(via `#125`_)
`0.30.0`_ (2018-02-03)
++++++++++++++++++++++++
* **[New]** 支持有拼音的非汉字字符 ```` (via `#119`_)。
* **[Changed]** 修复之前无意中把 ``pinyin`` 函数中的 ``strict`` 参数的默认值修改为了 ``False``
现在把 ``strict`` 参数的默认值恢复为预期的 ``True`` (via `#121`_)。关于 ``strict`` 参数的影响详见文档: `strict 参数的影响`_
`0.29.0`_ (2018-01-14)
++++++++++++++++++++++++
* **[New]** 可以通过环境变量 ``PYPINYIN_NO_DICT_COPY`` 禁用代码内对 dict 的 copy 操作,节省内存(via `#115`_ thanks `@daya0576`_ )。
`0.28.0`_ (2017-12-08)
++++++++++++++++++++++++
* **[New]** 给代码增加类型注解(via `#110`_)。
`0.27.0`_ (2017-10-28)
++++++++++++++++++++++++
* **[New]** 命令行工具支持通过更简便的方式指定参数及拼音风格。
(详见 `#105`_, Thanks `@wdscxsj`_ )
* **[Improved]** 增加说明 ``strict`` 参数对结果有什么影响的文档。
`0.26.1`_ (2017-10-25)
++++++++++++++++++++++++
* **[Improved]** 使用 `phrase-pinyin-data`_ v0.5.1 的词语拼音数据。fixed `#106`_
`0.26.0`_ (2017-10-12)
+++++++++++++++++++++++
* **[Changed]** 不再自动调用 jieba 分词模块,改为自动调用内置的正向最大匹配分词模块来分词。
(via `#102`_)
`0.25.0`_ (2017-10-01)
+++++++++++++++++++++++
* **[New]** 内置一个正向最大匹配分词模块,使用内置的词语拼音库来训练这个分词模块,
解决自定义词语库有时可能不生效的问题(因为这个词语在 jieba 等分词模块中不是可用词)。(via `#81`_)
获取拼音或自定义词库后使用:
.. code-block:: python
>>> from pypinyin import pinyin, load_phrases_dict
>>> load_phrases_dict({'了局': [['liǎo'], ['jú']]})
>>> pinyin('了局啊') # 使用 jieba 分词
Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/s6/z9r_07h53pj_d4x7qjszwmbw0000gn/T/jieba.cache
Loading model cost 1.175 seconds.
Prefix dict has been built succesfully.
[['le'], ['jú'], ['a']]
>>> from pypinyin.contrib.mmseg import seg, retrain
>>> retrain(seg) # 没有使用 load_phrases_dict 时可以不调用这个函数
>>> pinyin(seg.cut('了局啊')) # 使用内置的正向最大匹配分词
[['liǎo'], ['jú'], ['a']]
>>>
单独使用:
.. code-block:: python
>>> from pypinyin.contrib.mmseg import seg
>>> text = '你好,我是中国人,我爱我的祖国'
>>> seg.cut(text)
<generator object Seg.cut at 0x10b2df2b0>
>>> list(seg.cut(text))
['你好', ',', '我', '是', '中国人', ',', '我', '爱',
'我的', '祖', '国']
>>> seg.train(['祖国', '我是'])
>>> list(seg.cut(text))
['你好', ',', '我是', '中国人', ',', '我', '爱',
'我的', '祖国']
>>>
`0.24.0`_ (2017-09-17)
++++++++++++++++++++++++
* **[New]** 支持类似 pyinstaller 的打包工具对使用 pypinyin 的程序进行打包,
不会出现跟打包前不一样的输出(比如: `#92`_ via `#93`_ )。
`0.23.0`_ (2017-07-09)
++++++++++++++++++++++++
* **[New]** 使用 `phrase-pinyin-data`_ v0.5.0 的词语拼音数据。
`0.22.0`_ (2017-06-14)
++++++++++++++++++++++++
* **[New]** 支持 IronPython (via `#86`_). Thanks `@LevyLession`_
`0.21.1`_ (2017-05-29)
++++++++++++++++++++++++
* **[Bugfixed]** 修复在 Python 2 下通过 pip install 安装 wheel 格式的安装包后, 无法正常使用的问题。(Python 2 下没有自动安装依赖包)
`0.21.0`_ (2017-05-14)
++++++++++++++++++++++++
* **[New]** 重构各拼音风格实现,支持自定义拼音风格或覆盖已有拼音风格的实现.
.. code-block:: python
from pypinyin.style import register
@register('style1')
def func(pinyin, **kwargs):
# pinyin = xxx # convert to style1
return pinyin
def func(pinyin, **kwargs):
# pinyin = xxx # convert to style2
return pinyin
register('style2', func=func)
`0.20.0`_ (2017-05-13)
++++++++++++++++++++++++
* **[New]** 增加 ``strict`` 参数来控制处理声母和韵母时是否严格遵循 `《汉语拼音方案》 <http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`_ 标准。
``strict=True`` 时根据 `《汉语拼音方案》 <http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`_ 的如下规则处理声母、在韵母相关风格下还原正确的韵母:
* 21 个声母: ``b p m f d t n l g k h j q x zh ch sh r z c s`` (**y, w 不是声母**)
* i行的韵母,前面没有声母的时候,写成yi(衣),ya(呀),ye(耶),yao(腰),you(忧),yan(烟),yin(因),yang(央),ying(英),yong(雍)。(**y 不是声母**)
* u行的韵母,前面没有声母的时候,写成wu(乌),wa(蛙),wo(窝),wai(歪),wei(威),wan(弯),wen(温),wang(汪),weng(翁)。(**w 不是声母**)
* ü行的韵母,前面没有声母的时候,写成yu(迂),yue(约),yuan(冤),yun(晕);ü上两点省略。(**韵母相关风格下还原正确的韵母 ü**)
* ü行的韵跟声母j,q,x拼的时候,写成ju(居),qu(区),xu(虚),ü上两点也省略;
但是跟声母n,l拼的时候,仍然写成nü(女),lü(吕)。(**韵母相关风格下还原正确的韵母 ü**)
* iou,uei,uen前面加声母的时候,写成iu,ui,un。例如niu(牛),gui(归),lun(论)。(**韵母相关风格下还原正确的韵母 iou,uei,uen**)
具体差异可以查看 tests/test_standard.py 中的对比结果测试用例
* **[Changed]** 改为使用 enum 定义拼音风格(兼容旧版本)
`0.19.0`_ (2017-05-05)
++++++++++++++++++++++++
* **[New]** 韵母风格下根据 `汉语拼音方案`_ 还原原始的 ``iou`` , ``uei`` , ``uen`` 韵母。
iou,uei,uen前面加声母的时候,写成iu,ui,un。
例如niu(牛),gui(归),lun(论)。即:
* niu 的韵母是 iou
* gui 的韵母是 uei
* lun 的韵母是 uen
* **[Fixed]** 修复韵母相关风格下没有正确处理 ``wu`` 的韵母的问题
(比如: ``````FINALS_TONE`` 风格下的结果是 ```` 的问题) 。
* **[Fixed]** 修复漏了 ǖ -> v1 的转换。
`0.18.2`_ (2017-04-25)
++++++++++++++++++++++++
* **[Fixed]** 使用 `phrase-pinyin-data`_ v0.4.1 的词语拼音数据, fixed `#72`_
`0.18.1`_ (2017-03-22)
++++++++++++++++++++++++
* **[Improved]** PyPI 上传过程中出了点问题。
`0.18.0`_ (2017-03-22)
++++++++++++++++++++++++
* **[Changed]** 使用 `phrase-pinyin-data`_ v0.4.0 的词语拼音数据。
`0.17.0`_ (2017-03-13)
++++++++++++++++++++++++
* **[Changed]** 词语拼音数据改为使用来自 `phrase-pinyin-data`_ v0.3.1 的拼音数据。
* **[Fixed]** 修正 ``斯事体大`` 的拼音。
`0.16.1`_ (2017-02-12)
++++++++++++++++++++++++
* **[Improved]** 使用 `pinyin-data`_ v0.4.1 的拼音数据. fixed `#58`_
* **[Improved]** 更新 `厦门` 的拼音. fixed `#59`_
`0.16.0`_ (2016-11-27)
++++++++++++++++++++++++
* **[New]** Added new pinyin styles - ``CYRILLIC`` (汉语拼音与俄语字母对照表) and ``CYRILLIC_FIRST`` (via `#55`_ thanks `@tyrbonit`_)
.. code-block:: python
>>> pypinyin.pinyin('中心', style=pypinyin.CYRILLIC)
[['чжун1'], ['синь1']]
>>> pypinyin.pinyin('中心', style=pypinyin.CYRILLIC_FIRST)
[['ч'], ['с']]
* **[New]** Added Russian translation README (`README_ru.rst`_)
* **[New]** Command-line tool supported the new pinyin styles: ``CYRILLIC, CYRILLIC_FIRST``
`0.15.0`_ (2016-10-18)
++++++++++++++++++++++++
* **[Changed]** 使用 `pinyin-data`_ v0.4.0 的拼音数据
`0.14.0`_ (2016-09-24)
++++++++++++++++++++++++
* **[New]** 新增注音 ``BOPOMOFO`` 及注音首字母 ``BOPOMOFO_FIRST`` 风格(via `#51`_ thanks `@gumblex`_ `@Artoria2e5`_)
.. code-block:: python
>>> pypinyin.pinyin('中心', style=pypinyin.BOPOMOFO)
[['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
>>> pypinyin.pinyin('中心', style=pypinyin.BOPOMOFO_FIRST)
[['ㄓ'], ['ㄒ']]
* **[New]** 新增音调在拼音后的 ``TONE3`` 以及 ``FINALS_TONE3`` 风格(via `#51`_ thanks `@gumblex`_ `@Artoria2e5`_ )
.. code-block:: python
>>> pypinyin.pinyin('中心', style=pypinyin.TONE3)
[['zhong1'], ['xin1']]
>>> pypinyin.pinyin('中心', style=pypinyin.FINALS_TONE3)
[['ong1'], ['in1']]
* **[New]** 命令行程序支持新增的四个风格: ``TONE3, FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST``
* **[Bugfixed]** 修复 TONE2 中 ü 标轻声的问题(像 侵略 -> qi1n lv0e4)以及去除文档中 0 表示轻声(via `#51`_ thanks `@gumblex`_)
* **[Changed]** 不再使用 0 表示轻声,轻声时没有数字(via `#51`_ thanks `@gumblex`_)
`0.13.0`_ (2016-08-19)
++++++++++++++++++++++++
* **[Changed]** 分离词组库中包含中文逗号的词语(via `f097b6a <https://github.com/mozillazg/python-pinyin/commit/f097b6ad7b9e2acbc1ecc214991be510f4f95d72>`_)
* **[Changed]** 使用 `pinyin-data`_ v0.3.0 的拼音数据
`0.12.1`_ (2016-05-11)
++++++++++++++++++++++++
* **[Bugfixed]** 修复一些词语存在拼音粘连在一起的情况. (`#41`_ thanks `@jolly-tao`_ )
`0.12.0`_ (2016-03-12)
++++++++++++++++++++++++
* **[Changed]** 单个汉字的拼音数据改为使用来自 `pinyin-data`_ 的拼音数据。
* **[New]** 命令行程序支持从标准输入读取汉字信息::
$ echo "你好" | pypinyin
nǐ hǎo
$ pypinyin < hello.txt
nǐ hǎo
`0.11.1`_ (2016-02-17)
+++++++++++++++++++++++
* **[Bugfixed]** 更新 phrases_dict 修复类似 `#36`_ 的问题。thanks `@someus`_
`0.11.0`_ (2016-01-16)
+++++++++++++++++++++++
* **[Changed]** 分割 ``__init__.py`` 为 ``compat.py``, ``constants.py``, ``core.py`` 和 ``utils.py`` 。
影响: ``__init__.py`` 中只保留文档中提到过的 api, 如果使用了不在文档中的 api 则需要调整代码。
`0.10.0`_ (2016-01-02)
+++++++++++++++++++++++
* **[New]** Python 3.3+ 以上版本默认支持 ``U+20000 ~ U+2FA1F`` 区间内的汉字(详见 `#33`_)
`0.9.5`_ (2015-12-19)
+++++++++++++++++++++++
* **[Bugfixed]** 修复未正确处理鼻音(详见 `汉语拼音 - 维基百科`_ )的问题(`#31`_ thanks `@xulin97`_ ):
* ``ḿ、ń、ň、ǹ`` 对应 “呒”、“呣”、“唔”、“嗯”等字。
这些字之前在各种风格下都输出原始的汉字而不是拼音。
`0.9.4`_ (2015-11-27)
+++++++++++++++++++++++
* **[Improved]** 细微调整,主要是更新文档
`0.9.3`_ (2015-11-15)
+++++++++++++++++++++++
* **[Bugfixed]** Fixed Python 3 compatibility was broken.
`0.9.2`_ (2015-11-15)
+++++++++++++++++++++++
* **[New]** ``load_single_dict`` 和 ``load_phrases_dict`` 增加 ``style`` 参数支持 TONE2 风格的拼音 ::
load_single_dict({ord(u'啊'): 'a1'}, style='tone2')
load_phrases_dict({u"阿爸": [[u"a1"], [u"ba4"]]}, style='tone2')
* **[Improved]** Improved docs
`0.9.1`_ (2015-10-17)
+++++++++++++++++++++++
* **[Bugfixed][Changed]** 修复 ``ju``, ``qu``, ``xu``, ``yu``, ``yi`` 和 ``wu`` 的韵母( `#26`_ ). Thanks `@MingStar`_ :
* ``ju``, ``qu``, ``xu`` 的韵母应该是 ``v``
* ``yi`` 的韵母是 ``i``
* ``wu`` 的韵母是 ``u``
* 从现在开始 ``y`` 既不是声母也不是韵母,详见 `汉语拼音方案`_
`0.9.0`_ (2015-09-20)
+++++++++++++++++++++++
* **[Changed]** 将拼音词典库里的国际音标字母替换为 ASCII 字母. Thanks `@MingStar`_ :
* ``ɑ -> a``
* ``ɡ -> g``
`0.8.5`_ (2015-08-23)
+++++++++++++++++++++++
* **[Bugfixed]** 修复 zh, ch, sh, z, c, s 顺序问题导致获取声母有误
`0.8.4`_ (2015-08-23)
+++++++++++++++++++++++
* **[Changed]** ``y``, ``w`` 也不是声母. (`hotoo/pinyin#57 <https://github.com/hotoo/pinyin/issues/57>`__):
* 以 ``y``, ``w`` 开头的拼音在声母(``INITIALS``)模式下将返回 ``['']``
`0.8.3`_ (2015-08-20)
+++++++++++++++++++++++
* **[Improved]** 上传到 PyPI 出了点问题,但是又 `没法重新上传 <http://sourceforge.net/p/pypi/support-requests/468/>`__ ,只好新增一个版本
`0.8.2`_ (2015-08-20)
+++++++++++++++++++++++
* **[Bugfixed][Changed]** 修复误把 yu 放入声母列表里的 BUG(`#22`_). Thanks `@MingStar`_
`0.8.1`_ (2015-07-04)
+++++++++++++++++++++++
* **[Bugfixed]** 重构内置的分词功能,修复“无法正确处理包含空格的字符串的问题”
`0.8.0`_ (2015-06-27)
++++++++++++++++++++++++
* **[New]** 内置简单的分词功能,完善处理没有拼音的字符
(如果不需要处理多音字问题, 现在可以不用安装 ``jieba`` 或其他分词模块了)::
# 之前, 安装了结巴分词模块
lazy_pinyin(u'你好abc☆☆')
[u'ni', u'hao', 'a', 'b', 'c', u'\u2606', u'\u2606']
# 现在, 无论是否安装结巴分词模块
lazy_pinyin(u'你好abc☆☆')
[u'ni', u'hao', u'abc\u2606\u2606']
* | **[Changed]** 当 ``errors`` 参数是回调函数时,函数的参数由 ``单个字符`` 变更为 ``单个字符或词组``
| 即: 对于 ``abc`` 字符串, 之前将调用三次 ``errors`` 回调函数: ``func('a') ... func('b') ... func('c')``
| 现在只调用一次: ``func('abc')``
* **[Changed]** 将英文字符也纳入 ``errors`` 参数的处理范围::
# 之前
lazy_pinyin(u'abc', errors='ignore')
[u'abc']
# 现在
lazy_pinyin(u'abc', errors='ignore')
[]
`0.7.0`_ (2015-06-20)
++++++++++++++++++++++++
* **[Bugfixed]** Python 2 下无法使用 ``from pypinyin import *`` 的问题
* **[New]** 支持以下环境变量:
* ``PYPINYIN_NO_JIEBA=true``: 禁用“自动调用结巴分词模块”
* ``PYPINYIN_NO_PHRASES=true``: 禁用内置的“词组拼音库”
`0.6.0`_ (2015-06-10)
++++++++++++++++++++++++
* **[New]** ``errors`` 参数支持回调函数(`#17`_): ::
def foobar(char):
return u'a'
pinyin(u'あ', errors=foobar)
`0.5.7`_ (2015-05-17)
++++++++++++++++++++++
* **[Bugfixed]** 纠正包含 "便宜" 的一些词组的读音
`0.5.6`_ (2015-02-26)
++++++++++++++++++++++
* **[Bugfixed]** "苹果" pinyin error. `#11`__
* **[Bugfixed]** 重复 import jieba 的问题
* **[Improved]** 精简 phrases_dict
* **[Improved]** 更新文档
__ https://github.com/mozillazg/python-pinyin/issues/11
`0.5.5`_ (2015-01-27)
++++++++++++++++++++++
* **[Bugfixed]** phrases_dict error
`0.5.4`_ (2014-12-26)
++++++++++++++++++++++
* **[Bugfixed]** 无法正确处理由分词模块产生的中英文混合词组(比如:B超,维生素C)的问题. `#8`__
__ https://github.com/mozillazg/python-pinyin/issues/8
`0.5.3`_ (2014-12-07)
++++++++++++++++++++++
* **[Improved]** 更新拼音库
`0.5.2`_ (2014-09-21)
+++++++++++++++++++++
* **[Improved]** 载入拼音库时,改为载入其副本。防止内置的拼音库被破坏
* **[Bugfixed]** ``胜败乃兵家常事`` 的音标问题
`0.5.1`_ (2014-03-09)
+++++++++++++++++++++
* **[New]** 参数 ``errors`` 用来控制如何处理没有拼音的字符:
* ``'default'``: 保留原始字符
* ``'ignore'``: 忽略该字符
* ``'replace'``: 替换为去掉 ``\u`` 的 unicode 编码字符串(``u'\u90aa'`` => ``u'90aa'``)
只处理 ``[^a-zA-Z0-9_]`` 字符。
`0.5.0`_ (2014-03-01)
+++++++++++++++++++++
* **[Changed]** **使用新的单字拼音库内容和格式**
| 新的格式:``{0x963F: u"ā,ē"}``
| 旧的格式:``{u'啊': u"ā,ē"}``
`0.4.4`_ (2014-01-16)
+++++++++++++++++++++
* **[Improved]** 清理命令行命令的输出结果,去除无关信息
* **[Bugfixed]** “ImportError: No module named runner”
`0.4.3`_ (2014-01-10)
+++++++++++++++++++++
* **[Bugfixed]** 命令行工具在 Python 3 下的兼容性问题
`0.4.2`_ (2014-01-10)
+++++++++++++++++++++
* **[Changed]** 拼音风格前的 ``STYLE_`` 前缀(兼容包含 ``STYLE_`` 前缀的拼音风格)
* **[New]** 命令行工具,具体用法请见: ``pypinyin -h``
`0.4.1`_ (2014-01-04)
+++++++++++++++++++++
* **[New]** 支持自定义拼音库,方便用户修正程序结果(``load_single_dict``, ``load_phrases_dict``)
`0.4.0`_ (2014-01-03)
+++++++++++++++++++++
* **[Changed]** 将 ``jieba`` 模块改为可选安装,用户可以选择使用自己喜爱的分词模块对汉字进行分词处理
* **[New]** 支持 Python 3
`0.3.1`_ (2013-12-24)
+++++++++++++++++++++
* **[New]** ``lazy_pinyin`` ::
>>> lazy_pinyin(u'中心')
['zhong', 'xin']
`0.3.0`_ (2013-09-26)
+++++++++++++++++++++
* **[Bugfixed]** 首字母风格无法正确处理只有韵母的汉字
* **[New]** 三个拼音风格:
* ``pypinyin.STYLE_FINALS`` : 韵母风格1,只返回各个拼音的韵母部分,不带声调。如: ``ong uo``
* ``pypinyin.STYLE_FINALS_TONE`` : 韵母风格2,带声调,声调在韵母第一个字母上。如: ``ōng uó``
* ``pypinyin.STYLE_FINALS_TONE2`` : 韵母风格3,带声调,声调在各个拼音之后,用数字 [0-4] 进行表示。如: ``o1ng uo2``
`0.2.0`_ (2013-09-22)
+++++++++++++++++++++
* **[Improved]** 完善对中英文混合字符串的支持::
>>> pypinyin.pinyin(u'你好abc')
[[u'n\u01d0'], [u'h\u01ceo'], [u'abc']]
0.1.0 (2013-09-21)
++++++++++++++++++
* **[New]** Initial Release
.. _#17: https://github.com/mozillazg/python-pinyin/pull/17
.. _#22: https://github.com/mozillazg/python-pinyin/pull/22
.. _#26: https://github.com/mozillazg/python-pinyin/pull/26
.. _@MingStar: https://github.com/MingStar
.. _汉语拼音方案: https://zh.wiktionary.org/wiki/%E9%99%84%E5%BD%95:%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E6%96%B9%E6%A1%88
.. _汉语拼音方案.pdf: http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html
.. _汉语拼音 - 维基百科: https://zh.wikipedia.org/wiki/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3#cite_ref-10
.. _@xulin97: https://github.com/xulin97
.. _#31: https://github.com/mozillazg/python-pinyin/issues/31
.. _#33: https://github.com/mozillazg/python-pinyin/pull/33
.. _#36: https://github.com/mozillazg/python-pinyin/issues/36
.. _pinyin-data: https://github.com/mozillazg/pinyin-data
.. _@someus: https://github.com/someus
.. _#34: https://github.com/mozillazg/python-pinyin/issues/34
.. _#41: https://github.com/mozillazg/python-pinyin/issues/41
.. _@jolly-tao: https://github.com/jolly-tao
.. _@gumblex: https://github.com/gumblex
.. _@Artoria2e5: https://github.com/Artoria2e5
.. _#51: https://github.com/mozillazg/python-pinyin/issues/51
.. _#55: https://github.com/mozillazg/python-pinyin/pull/55
.. _@tyrbonit: https://github.com/tyrbonit
.. _README_ru.rst: https://github.com/mozillazg/python-pinyin/blob/master/README_ru.rst
.. _#58: https://github.com/mozillazg/python-pinyin/issues/58
.. _#59: https://github.com/mozillazg/python-pinyin/issues/59
.. _#72: https://github.com/mozillazg/python-pinyin/issues/72
.. _phrase-pinyin-data: https://github.com/mozillazg/phrase-pinyin-data
.. _@LevyLession: https://github.com/LevyLession
.. _#86: https://github.com/mozillazg/python-pinyin/issues/86
.. _#92: https://github.com/mozillazg/python-pinyin/issues/92
.. _#93: https://github.com/mozillazg/python-pinyin/issues/93
.. _#81: https://github.com/mozillazg/python-pinyin/issues/81
.. _#102: https://github.com/mozillazg/python-pinyin/issues/102
.. _#105: https://github.com/mozillazg/python-pinyin/issues/105
.. _#106: https://github.com/mozillazg/python-pinyin/issues/106
.. _@wdscxsj: https://github.com/wdscxsj
.. _#110: https://github.com/mozillazg/python-pinyin/pull/110
.. _#115: https://github.com/mozillazg/python-pinyin/pull/115
.. _#119: https://github.com/mozillazg/python-pinyin/pull/119
.. _@daya0576: https://github.com/daya0576
.. _#121: https://github.com/mozillazg/python-pinyin/pull/121
.. _#125: https://github.com/mozillazg/python-pinyin/pull/125
.. _#126: https://github.com/mozillazg/python-pinyin/pull/126
.. _#112: https://github.com/mozillazg/python-pinyin/issues/112
.. _#117: https://github.com/mozillazg/python-pinyin/issues/117
.. _#122: https://github.com/mozillazg/python-pinyin/issues/122
.. _#131: https://github.com/mozillazg/python-pinyin/issues/131
.. _#130: https://github.com/mozillazg/python-pinyin/pull/130
.. _PEP 561: https://www.python.org/dev/peps/pep-0561/
.. _#137: https://github.com/mozillazg/python-pinyin/issues/137
.. _#147: https://github.com/mozillazg/python-pinyin/pull/147
.. _@howl-anderson: https://github.com/howl-anderson
.. _#151: https://github.com/mozillazg/python-pinyin/issues/151
.. _#154: https://github.com/mozillazg/python-pinyin/issues/154
.. _#149: https://github.com/mozillazg/python-pinyin/issues/149
.. _#159: https://github.com/mozillazg/python-pinyin/issues/159
.. _#160: https://github.com/mozillazg/python-pinyin/issues/160
.. _strict 参数的影响: https://pypinyin.readthedocs.io/zh_CN/master/usage.html#strict
.. _#166: https://github.com/mozillazg/python-pinyin/issues/166
.. _#167: https://github.com/mozillazg/python-pinyin/issues/167
.. _#169: https://github.com/mozillazg/python-pinyin/issues/169
.. _#170: https://github.com/mozillazg/python-pinyin/issues/170
.. _#174: https://github.com/mozillazg/python-pinyin/issues/174
.. _#139: https://github.com/mozillazg/python-pinyin/issues/139
.. _#205: https://github.com/mozillazg/python-pinyin/issues/205
.. _#164: https://github.com/mozillazg/python-pinyin/pull/164
.. _#176: https://github.com/mozillazg/python-pinyin/pull/176
.. _@hanabi1224: https://github.com/hanabi1224
.. _@yangwe1: https://github.com/yangwe1
.. _0.2.0: https://github.com/mozillazg/python-pinyin/compare/v0.1.0...v0.2.0
.. _0.3.0: https://github.com/mozillazg/python-pinyin/compare/v0.2.0...v0.3.0
.. _0.3.1: https://github.com/mozillazg/python-pinyin/compare/v0.3.0...v0.3.1
.. _0.4.0: https://github.com/mozillazg/python-pinyin/compare/v0.3.1...v0.4.0
.. _0.4.1: https://github.com/mozillazg/python-pinyin/compare/v0.4.0...v0.4.1
.. _0.4.2: https://github.com/mozillazg/python-pinyin/compare/v0.4.1...v0.4.2
.. _0.4.3: https://github.com/mozillazg/python-pinyin/compare/v0.4.2...v0.4.3
.. _0.4.4: https://github.com/mozillazg/python-pinyin/compare/v0.4.3...v0.4.4
.. _0.5.0: https://github.com/mozillazg/python-pinyin/compare/v0.4.4...v0.5.0
.. _0.5.1: https://github.com/mozillazg/python-pinyin/compare/v0.5.0...v0.5.1
.. _0.5.2: https://github.com/mozillazg/python-pinyin/compare/v0.5.1...v0.5.2
.. _0.5.3: https://github.com/mozillazg/python-pinyin/compare/v0.5.2...v0.5.3
.. _0.5.4: https://github.com/mozillazg/python-pinyin/compare/v0.5.3...v0.5.4
.. _0.5.5: https://github.com/mozillazg/python-pinyin/compare/v0.5.4...v0.5.5
.. _0.5.6: https://github.com/mozillazg/python-pinyin/compare/v0.5.5...v0.5.6
.. _0.5.7: https://github.com/mozillazg/python-pinyin/compare/v0.5.6...v0.5.7
.. _0.6.0: https://github.com/mozillazg/python-pinyin/compare/v0.5.7...v0.6.0
.. _0.7.0: https://github.com/mozillazg/python-pinyin/compare/v0.6.0...v0.7.0
.. _0.8.0: https://github.com/mozillazg/python-pinyin/compare/v0.7.0...v0.8.0
.. _0.8.1: https://github.com/mozillazg/python-pinyin/compare/v0.8.0...v0.8.1
.. _0.8.2: https://github.com/mozillazg/python-pinyin/compare/v0.8.1...v0.8.2
.. _0.8.3: https://github.com/mozillazg/python-pinyin/compare/v0.8.2...v0.8.3
.. _0.8.4: https://github.com/mozillazg/python-pinyin/compare/v0.8.3...v0.8.4
.. _0.8.5: https://github.com/mozillazg/python-pinyin/compare/v0.8.4...v0.8.5
.. _0.9.0: https://github.com/mozillazg/python-pinyin/compare/v0.8.5...v0.9.0
.. _0.9.1: https://github.com/mozillazg/python-pinyin/compare/v0.9.0...v0.9.1
.. _0.9.2: https://github.com/mozillazg/python-pinyin/compare/v0.9.1...v0.9.2
.. _0.9.3: https://github.com/mozillazg/python-pinyin/compare/v0.9.2...v0.9.3
.. _0.9.4: https://github.com/mozillazg/python-pinyin/compare/v0.9.3...v0.9.4
.. _0.9.5: https://github.com/mozillazg/python-pinyin/compare/v0.9.4...v0.9.5
.. _0.10.0: https://github.com/mozillazg/python-pinyin/compare/v0.9.5...v0.10.0
.. _0.11.0: https://github.com/mozillazg/python-pinyin/compare/v0.10.0...v0.11.0
.. _0.11.1: https://github.com/mozillazg/python-pinyin/compare/v0.11.0...v0.11.1
.. _0.12.0: https://github.com/mozillazg/python-pinyin/compare/v0.11.1...v0.12.0
.. _0.12.1: https://github.com/mozillazg/python-pinyin/compare/v0.12.0...v0.12.1
.. _0.13.0: https://github.com/mozillazg/python-pinyin/compare/v0.12.1...v0.13.0
.. _0.14.0: https://github.com/mozillazg/python-pinyin/compare/v0.13.0...v0.14.0
.. _0.15.0: https://github.com/mozillazg/python-pinyin/compare/v0.14.0...v0.15.0
.. _0.16.0: https://github.com/mozillazg/python-pinyin/compare/v0.15.0...v0.16.0
.. _0.16.1: https://github.com/mozillazg/python-pinyin/compare/v0.16.0...v0.16.1
.. _0.17.0: https://github.com/mozillazg/python-pinyin/compare/v0.16.1...v0.17.0
.. _0.18.0: https://github.com/mozillazg/python-pinyin/compare/v0.17.0...v0.18.0
.. _0.18.1: https://github.com/mozillazg/python-pinyin/compare/v0.18.0...v0.18.1
.. _0.18.2: https://github.com/mozillazg/python-pinyin/compare/v0.18.1...v0.18.2
.. _0.19.0: https://github.com/mozillazg/python-pinyin/compare/v0.18.2...v0.19.0
.. _0.20.0: https://github.com/mozillazg/python-pinyin/compare/v0.19.0...v0.20.0
.. _0.21.0: https://github.com/mozillazg/python-pinyin/compare/v0.20.0...v0.21.0
.. _0.21.1: https://github.com/mozillazg/python-pinyin/compare/v0.21.0...v0.21.1
.. _0.22.0: https://github.com/mozillazg/python-pinyin/compare/v0.21.1...v0.22.0
.. _0.23.0: https://github.com/mozillazg/python-pinyin/compare/v0.22.0...v0.23.0
.. _0.24.0: https://github.com/mozillazg/python-pinyin/compare/v0.23.0...v0.24.0
.. _0.25.0: https://github.com/mozillazg/python-pinyin/compare/v0.24.0...v0.25.0
.. _0.26.0: https://github.com/mozillazg/python-pinyin/compare/v0.25.0...v0.26.0
.. _0.26.1: https://github.com/mozillazg/python-pinyin/compare/v0.26.0...v0.26.1
.. _0.27.0: https://github.com/mozillazg/python-pinyin/compare/v0.26.1...v0.27.0
.. _0.28.0: https://github.com/mozillazg/python-pinyin/compare/v0.27.0...v0.28.0
.. _0.29.0: https://github.com/mozillazg/python-pinyin/compare/v0.28.0...v0.29.0
.. _0.30.0: https://github.com/mozillazg/python-pinyin/compare/v0.29.0...v0.30.0
.. _0.30.1: https://github.com/mozillazg/python-pinyin/compare/v0.30.0...v0.30.1
.. _0.31.0: https://github.com/mozillazg/python-pinyin/compare/v0.30.1...v0.31.0
.. _0.32.0: https://github.com/mozillazg/python-pinyin/compare/v0.31.0...v0.32.0
.. _0.33.0: https://github.com/mozillazg/python-pinyin/compare/v0.32.0...v0.33.0
.. _0.33.1: https://github.com/mozillazg/python-pinyin/compare/v0.33.0...v0.33.1
.. _0.33.2: https://github.com/mozillazg/python-pinyin/compare/v0.33.1...v0.33.2
.. _0.34.0: https://github.com/mozillazg/python-pinyin/compare/v0.33.2...v0.34.0
.. _0.34.1: https://github.com/mozillazg/python-pinyin/compare/v0.34.0...v0.34.1
.. _0.35.0: https://github.com/mozillazg/python-pinyin/compare/v0.34.1...v0.35.0
.. _0.35.1: https://github.com/mozillazg/python-pinyin/compare/v0.35.0...v0.35.1
.. _0.35.2: https://github.com/mozillazg/python-pinyin/compare/v0.35.1...v0.35.2
.. _0.35.3: https://github.com/mozillazg/python-pinyin/compare/v0.35.2...v0.35.3
.. _0.35.4: https://github.com/mozillazg/python-pinyin/compare/v0.35.3...v0.35.4
.. _0.36.0: https://github.com/mozillazg/python-pinyin/compare/v0.35.4...v0.36.0
.. _0.37.0: https://github.com/mozillazg/python-pinyin/compare/v0.36.0...v0.37.0
.. _0.38.0: https://github.com/mozillazg/python-pinyin/compare/v0.37.0...v0.38.0
.. _0.38.1: https://github.com/mozillazg/python-pinyin/compare/v0.38.0...v0.38.1
.. _0.39.0: https://github.com/mozillazg/python-pinyin/compare/v0.38.1...v0.39.0
.. _0.39.1: https://github.com/mozillazg/python-pinyin/compare/v0.39.0...v0.39.1
.. _0.40.0: https://github.com/mozillazg/python-pinyin/compare/v0.39.1...v0.40.0
.. _0.41.0: https://github.com/mozillazg/python-pinyin/compare/v0.40.0...v0.41.0

@ -0,0 +1,46 @@
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at mozillazg101@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/

@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2016 mozillazg, 闲耘 <hotoo.cn@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -0,0 +1,2 @@
include README.rst LICENSE.txt CHANGELOG.rst
recursive-include pypinyin *.pyi py.typed

@ -0,0 +1,97 @@
# Developer task runner for the pypinyin package.
# Run `make help` to list the documented targets.
#
# Every target below is a command name rather than a file to build, so each
# one is declared .PHONY; otherwise a stray file or directory with the same
# name (e.g. `clean` or `test`) would make the target silently do nothing.

.PHONY: help
help:
	@echo "test run test"
	@echo "publish publish to PyPI"
	@echo "publish_test publish to TestPyPI"
	@echo "docs_html make html docs"
	@echo "docs_serve serve docs"
	@echo "gen_data gen pinyin data"
	@echo "gen_pinyin_dict gen single hanzi pinyin dict"
	@echo "gen_phrases_dict gen phrase hanzi pinyin dict"
	@echo "lint run lint"
	@echo "clean - remove all build, test, coverage and Python artifacts"
	@echo "clean-build - remove build artifacts"
	@echo "clean-pyc - remove Python file artifacts"
	@echo "clean-test - remove test and coverage artifacts"

# Lint first, then run the test suite.
.PHONY: test
test: lint
	@echo "run test"
	# Use $(MAKE) (not a literal `make`) so command-line flags, `-n` dry runs
	# and the jobserver are passed on to the sub-make.
	$(MAKE) testonly

# Run the test suite without linting.
.PHONY: testonly
testonly:
	py.test --random-order --cov pypinyin tests/ pypinyin/

# Build sdist + wheel from a clean tree and upload to PyPI.
.PHONY: publish
publish: clean
	@echo "publish to pypi"
	python setup.py sdist
	python setup.py bdist_wheel
	twine upload dist/*

# Same as `publish`, but uploads to the `test` repository configured for twine.
.PHONY: publish_test
publish_test: clean
	@echo "publish to test pypi"
	python setup.py sdist
	python setup.py bdist_wheel
	twine upload --repository test dist/*

.PHONY: docs_html
docs_html:
	cd docs && make html

# Build the HTML docs and serve them with Python's built-in HTTP server.
.PHONY: docs_serve
docs_serve: docs_html
	cd docs/_build/html && python -m http.server

# Regenerate both pinyin data modules.
.PHONY: gen_data
gen_data: gen_pinyin_dict gen_phrases_dict

.PHONY: gen_pinyin_dict
gen_pinyin_dict:
	python gen_pinyin_dict.py pinyin-data/pinyin.txt pypinyin/pinyin_dict.py

.PHONY: gen_phrases_dict
gen_phrases_dict:
	python gen_phrases_dict.py phrase-pinyin-data/pinyin.txt pypinyin/phrases_dict_large.py
	python tidy_phrases_dict.py

.PHONY: lint
lint:
	pre-commit run --all-files
	mypy --strict pypinyin

.PHONY: clean clean-build clean-pyc clean-test
clean: clean-build clean-pyc clean-test

clean-build:
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc:
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test:
	rm -fr .tox/
	rm -f .coverage
	rm -fr htmlcov/

# Release / branch-maintenance helpers.
.PHONY: rebase_master merge_dev bump_patch bump_minor start_next
rebase_master:
	git fetch origin && git rebase origin/master

merge_dev:
	git merge --no-ff origin/develop

bump_patch:
	bumpversion --verbose patch

bump_minor:
	bumpversion --verbose minor

start_next:
	git push && git push --tags && git checkout develop && git rebase master && git push

@ -0,0 +1,8 @@
# Modified from
* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
licence: MIT
## Features
* Supports Python 3 only
* `.pyi` stub files removed

@ -0,0 +1,210 @@
汉字拼音转换工具(Python 版)
=============================
|Build| |GitHubAction| |Coverage| |PyPI version| |DOI|
将汉字转为拼音。可以用于汉字注音、排序、检索(`Russian translation`_) 。
基于 `hotoo/pinyin <https://github.com/hotoo/pinyin>`__ 开发。
* Documentation: http://pypinyin.rtfd.io/
* GitHub: https://github.com/mozillazg/python-pinyin
* License: MIT license
* PyPI: https://pypi.org/project/pypinyin
* Python version: 2.7, pypy, pypy3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9
.. contents::
特性
----
* 根据词组智能匹配最正确的拼音。
* 支持多音字。
* 简单的繁体支持, 注音支持。
* 支持多种不同拼音/注音风格。
安装
----
.. code-block:: bash
$ pip install pypinyin
使用示例
--------
Python 3(Python 2 下把 ``'中心'`` 替换为 ``u'中心'`` 即可):
.. code-block:: python
>>> from pypinyin import pinyin, lazy_pinyin, Style
>>> pinyin('中心')
[['zhōng'], ['xīn']]
>>> pinyin('中心', heteronym=True) # 启用多音字模式
[['zhōng', 'zhòng'], ['xīn']]
>>> pinyin('中心', style=Style.FIRST_LETTER) # 设置拼音风格
[['z'], ['x']]
>>> pinyin('中心', style=Style.TONE2, heteronym=True)
[['zho1ng', 'zho4ng'], ['xi1n']]
>>> pinyin('中心', style=Style.TONE3, heteronym=True)
[['zhong1', 'zhong4'], ['xin1']]
>>> pinyin('中心', style=Style.BOPOMOFO) # 注音风格
[['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
>>> lazy_pinyin('中心') # 不考虑多音字的情况
['zhong', 'xin']
>>> lazy_pinyin('战略', v_to_u=True) # 不使用 v 表示 ü
['zhan', 'lüe']
# 使用 5 标识轻声
>>> lazy_pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
['yi1', 'shang5']
**注意事项**
* 默认情况下拼音结果不会标明哪个韵母是轻声,轻声的韵母没有声调或数字标识(可以通过参数 ``neutral_tone_with_five=True`` 开启使用 ``5`` 标识轻声 )。
* 默认情况下无声调相关拼音风格下的结果会使用 ``v`` 表示 ``ü`` (可以通过参数 ``v_to_u=True`` 开启使用 ``ü`` 代替 ``v`` )。
* 默认情况下会原样输出没有拼音的字符(自定义处理没有拼音的字符的方法见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/usage.html#handle-no-pinyin>`__ )。
命令行工具:
.. code-block:: console
$ pypinyin 音乐
yīn yuè
$ pypinyin -h
文档
--------
详细文档请访问:http://pypinyin.rtfd.io/ 。
项目代码开发方面的问题可以看看 `开发文档`_
FAQ
---------
词语中的多音字拼音有误?
+++++++++++++++++++++++++++++
目前是通过词组拼音库的方式来解决多音字问题的。如果出现拼音有误的情况,
可以自定义词组拼音来调整词语中的拼音:
.. code-block:: python
>>> from pypinyin import Style, pinyin, load_phrases_dict
>>> pinyin('步履蹒跚')
[['bù'], ['lǚ'], ['mán'], ['shān']]
>>> load_phrases_dict({'步履蹒跚': [['bù'], ['lǚ'], ['pán'], ['shān']]})
>>> pinyin('步履蹒跚')
[['bù'], ['lǚ'], ['pán'], ['shān']]
详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/usage.html#custom-dict>`__
如果是分词导致的拼音有误的话,可以先使用其他的分词模块对数据进行分词处理,
然后将分词后的词组结果列表作为函数的参数即可:
.. code-block:: python
>>> # 使用其他分词模块分词,比如 jieba 之类,
>>> #或者基于 phrases_dict.py 里的词语数据使用其他分词算法分词
>>> words = list(jieba.cut('每股24.67美元的确定性协议'))
>>> pinyin(words)
为什么没有 y, w, yu 几个声母?
++++++++++++++++++++++++++++++++++++++++++++
.. code-block:: python
>>> from pypinyin import Style, pinyin
>>> pinyin('下雨天', style=Style.INITIALS)
[['x'], [''], ['t']]
因为根据 `《汉语拼音方案》 <http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__
y,w,ü (yu) 都不是声母。
声母风格(INITIALS)下,“雨”、“我”、“圆”等汉字返回空字符串,因为根据
`《汉语拼音方案》 <http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__ ,
y,w,ü (yu) 都不是声母,在某些特定韵母无声母时,才加上 y 或 w,而 ü 也有其特定规则。 —— @hotoo
**如果你觉得这个给你带来了麻烦,那么也请小心一些无声母的汉字(如“啊”、“饿”、“按”、“昂”等)。
这时候你也许需要的是首字母风格(FIRST_LETTER)**。 —— @hotoo
参考: `hotoo/pinyin#57 <https://github.com/hotoo/pinyin/issues/57>`__,
`#22 <https://github.com/mozillazg/python-pinyin/pull/22>`__,
`#27 <https://github.com/mozillazg/python-pinyin/issues/27>`__,
`#44 <https://github.com/mozillazg/python-pinyin/issues/44>`__
如果觉得这个行为不是你想要的,就是想把 y 当成声母的话,可以指定 ``strict=False``
这个可能会符合你的预期:
.. code-block:: python
>>> from pypinyin import Style, pinyin
>>> pinyin('下雨天', style=Style.INITIALS)
[['x'], [''], ['t']]
>>> pinyin('下雨天', style=Style.INITIALS, strict=False)
[['x'], ['y'], ['t']]
详见 `strict 参数的影响`_
如何减少内存占用
++++++++++++++++++++
如果对拼音的准确性不是特别在意的话,可以通过设置环境变量 ``PYPINYIN_NO_PHRASES``
和 ``PYPINYIN_NO_DICT_COPY`` 来节省内存。
详见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/faq.html#no-phrases>`__
更多 FAQ 详见文档中的
`FAQ <https://pypinyin.readthedocs.io/zh_CN/master/faq.html>`__ 部分。
.. _#113: https://github.com/mozillazg/python-pinyin/issues/113
.. _strict 参数的影响: https://pypinyin.readthedocs.io/zh_CN/master/usage.html#strict
拼音数据
---------
* 单个汉字的拼音使用 `pinyin-data`_ 的数据
* 词组的拼音使用 `phrase-pinyin-data`_ 的数据
* 声母和韵母使用 `《汉语拼音方案》 <http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__ 的数据
Related Projects
-----------------
* `hotoo/pinyin`__: 汉字拼音转换工具 Node.js/JavaScript 版。
* `mozillazg/go-pinyin`__: 汉字拼音转换工具 Go 版。
* `mozillazg/rust-pinyin`__: 汉字拼音转换工具 Rust 版。
__ https://github.com/hotoo/pinyin
__ https://github.com/mozillazg/go-pinyin
__ https://github.com/mozillazg/rust-pinyin
.. |Build| image:: https://img.shields.io/circleci/project/github/mozillazg/python-pinyin/master.svg
:target: https://circleci.com/gh/mozillazg/python-pinyin
.. |GitHubAction| image:: https://github.com/mozillazg/python-pinyin/workflows/CI/badge.svg
:target: https://github.com/mozillazg/python-pinyin/actions
.. |Coverage| image:: https://img.shields.io/codecov/c/github/mozillazg/python-pinyin/master.svg
:target: https://codecov.io/gh/mozillazg/python-pinyin
.. |PyPI version| image:: https://img.shields.io/pypi/v/pypinyin.svg
:target: https://pypi.org/project/pypinyin/
.. |DOI| image:: https://zenodo.org/badge/12830126.svg
:target: https://zenodo.org/badge/latestdoi/12830126
.. _Russian translation: https://github.com/mozillazg/python-pinyin/blob/master/README_ru.rst
.. _pinyin-data: https://github.com/mozillazg/pinyin-data
.. _phrase-pinyin-data: https://github.com/mozillazg/phrase-pinyin-data
.. _开发文档: https://pypinyin.readthedocs.io/zh_CN/develop/develop.html

@ -0,0 +1,177 @@
# Makefile for Sphinx documentation
#
# Each target runs sphinx-build with one builder (-b <name>) and writes its
# output to $(BUILDDIR)/<name>. Run `make help` for the list of builders.

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target in this file is a command, not a file to build. The list
# includes latexpdfja, texinfo, info, xml and pseudoxml so that a file or
# directory with one of those names cannot shadow the target.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest xml pseudoxml

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " texinfo to make Texinfo files"
	@echo " info to make Texinfo files and run them through makeinfo"
	@echo " gettext to make PO message catalogs"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " xml to make Docutils-native XML files"
	@echo " pseudoxml to make pseudoxml-XML files for display purposes"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

# The hint below names the collection files after this project (pypinyin);
# the previous text referred to an unrelated project's file names.
qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pypinyin.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pypinyin.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/pypinyin"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pypinyin"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	# $(MAKE) instead of a literal `make`, matching latexpdf/latexpdfja, so
	# flags and the jobserver reach the sub-make.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

@ -0,0 +1,43 @@
API
====
.. _style:
拼音风格
-----------
.. autoclass:: pypinyin.Style
:members:
:undoc-members:
:member-order: bysource
.. _core_api:
核心 API
-------------
.. autofunction:: pypinyin.pinyin
.. autofunction:: pypinyin.lazy_pinyin
.. autofunction:: pypinyin.load_single_dict
.. autofunction:: pypinyin.load_phrases_dict
.. autofunction:: pypinyin.slug
.. _convert_style:
注册新的拼音风格
-----------------
.. autofunction:: pypinyin.style.register
.. _seg:
.. _#27: https://github.com/mozillazg/python-pinyin/issues/27

@ -0,0 +1,309 @@
# -*- coding: utf-8 -*-
#
# pypinyin documentation build configuration file, created by
# sphinx-quickstart on Fri Sep 06 22:22:13 2013.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))
# sys.path.insert(0, os.path.abspath('../pypinyin'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.extlinks',
'sphinx.ext.todo',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
import pypinyin
# General information about the project.
project = pypinyin.__title__
copyright = pypinyin.__copyright__
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = pypinyin.__version__
# The full version, including alpha/beta/rc tags.
release = pypinyin.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'zh_CN'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'nature'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'pypinyindoc'
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'pypinyin.tex', 'pypinyin Documentation', 'mozillazg', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [('index', 'pypinyin', 'pypinyin Documentation', ['mozillazg'], 1)]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'pypinyin', 'pypinyin Documentation', 'mozillazg', 'pypinyin',
'One line description of project.', 'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# -- Options for Epub output ---------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = 'pypinyin'
epub_author = 'mozillazg'
epub_publisher = 'mozillazg'
epub_copyright = '2016 mozillazg'
# The language of the text. It defaults to the language option
# or en if the language is not set.
#epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
#epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#epub_identifier = ''
# A unique identification for the text.
#epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
#epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
#epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_pre_files = []
# HTML files that should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
#epub_post_files = []
# A list of files that should not be packed into the epub file.
#epub_exclude_files = []
# The depth of the table of contents in toc.ncx.
#epub_tocdepth = 3
# Allow duplicate toc entries.
#epub_tocdup = True
# Fix unsupported image types using the PIL.
#epub_fix_images = False
# Scale large images.
#epub_max_image_width = 0
# If 'no', URL addresses will not be shown.
#epub_show_urls = 'inline'
# If false, no index is generated.
#epub_use_index = True
# Example configuration for intersphinx: refer to the Python standard library.
# Uses the named form ``{name: (base URL, inventory)}``; the legacy
# ``{url: None}`` form is deprecated and rejected by recent Sphinx releases.
intersphinx_mapping = {'python': ('https://docs.python.org/3', None)}

@ -0,0 +1,38 @@
.. _contrib:
contrib
========
.. _tone_convert:
拼音转换
--------
.. autofunction:: pypinyin.contrib.tone_convert.to_normal
.. autofunction:: pypinyin.contrib.tone_convert.to_tone
.. autofunction:: pypinyin.contrib.tone_convert.to_tone2
.. autofunction:: pypinyin.contrib.tone_convert.to_tone3
.. autofunction:: pypinyin.contrib.tone_convert.tone_to_normal
.. autofunction:: pypinyin.contrib.tone_convert.tone_to_tone2
.. autofunction:: pypinyin.contrib.tone_convert.tone_to_tone3
.. autofunction:: pypinyin.contrib.tone_convert.tone2_to_normal
.. autofunction:: pypinyin.contrib.tone_convert.tone2_to_tone
.. autofunction:: pypinyin.contrib.tone_convert.tone2_to_tone3
.. autofunction:: pypinyin.contrib.tone_convert.tone3_to_normal
.. autofunction:: pypinyin.contrib.tone_convert.tone3_to_tone
.. autofunction:: pypinyin.contrib.tone_convert.tone3_to_tone2
V2UMixin
---------
.. autoclass:: pypinyin.contrib.uv.V2UMixin
NeutralToneWith5Mixin
-----------------------
.. autoclass:: pypinyin.contrib.neutral_tone.NeutralToneWith5Mixin

@ -0,0 +1,127 @@
.. _develop:
开发文档
========
准备开发环境
-------------
::
$ virtualenv venv
$ . venv/bin/activate
(venv) $ pip install -U -r requirements_dev.txt
(venv) $ pip install -e .
(venv) $ pre-commit install
TODO: 把这个步骤放到一个 make 命令中。
.. note::
推荐在 Python 3.6+ 环境下进行开发。
测试
------
可以通过 ``make test`` 命令在当前 Python 版本下运行单元测试: ::
(venv) $ make test
可以通过 ``tox`` 测试程序在多个 Python 版本下的单元测试结果(这一步也可以在提 PR 的时候通过 CI 来运行): ::
(venv) $ tox
.. note::
如果对测试有疑问或者有些测试实在无法通过,可以先提交 PR 大家一起来看看。
目录结构
--------
关键文件和目录 ::
$ tree -L 2
.
├── CHANGELOG.rst # 更新日志
├── Makefile
├── README.rst
├── docs # 文档
├── gen_phrases_dict.py # 生成 phrases_dict.py 的脚本
├── gen_pinyin_dict.py # 生成 pinyin_dict.py 的脚本
├── phrase-pinyin-data # gen_phrases_dict.py 使用的数据源
├── pinyin-data # gen_pinyin_dict.py 使用的数据源
├── pypinyin # pypinyin 模块源代码
│   ├── __init__.py
│   ├── __main__.py # 命令行程序的入口
│   ├── compat.py
│   ├── constants.py
│   ├── contrib # 目前包含了一个分词模块
│   ├── core.py # pypinyin 模块的核心逻辑
│   ├── phonetic_symbol.py
│   ├── phrases_dict.py # 词组的拼音数据,由 gen_phrases_dict.py 生成
│   ├── pinyin_dict.py # 单个汉字的拼音数据,由 gen_pinyin_dict.py 生成
│   ├── runner.py # 命令行程序的主逻辑
│   ├── standard.py # strict=True 时的拼音转换逻辑
│   ├── style # 各种拼音风格在 style 目录下实现
│   ├── utils.py
├── pytest.ini
├── requirements_dev.txt
├── setup.cfg
├── setup.py
├── tests
├── tox.ini
实现思路/主逻辑
----------------
主逻辑:
1. 对输入的字符串按是否是汉字进行分词(``seg``)
2. 对分词结果的每个词条进行获取词条拼音的逻辑
1. 检查词条是否是汉字,不是汉字则走处理没有拼音数据的逻辑(``handle_nopinyin``)
2. 检查词条是否在 ``PHRASES_DICT`` 中,如果在直接取 ``PHRASES_DICT`` 中这个词条的拼音数据
3. 如果词条不在 ``PHRASES_DICT`` 中,遍历词条包含的字符,每个字符进行 ``single_pinyin`` 逻辑处理
3. ``single_pinyin`` 的逻辑:
1. 检查字符是否在 ``PINYIN_DICT`` 中,如果在的话,取 ``PINYIN_DICT`` 中这个字符的拼音数据
2. 如果不在的话,走 ``handle_nopinyin`` 逻辑
4. ``handle_nopinyin`` 逻辑: 根据 ``errors`` 参数的值返回不同的结果。
5. 对上面的步骤获得的拼音数据按指定的拼音风格进行转换。
* ``PHRASES_DICT``:词组拼音数据
* ``PINYIN_DICT``: 单个汉字的拼音数据
TODO: 画流程图
发布新版本
----------
1. 切分到 develop 分支
2. rebase master 分支的代码: ``make rebase_master``
3. 通过 ``make gen_data`` 生成最新的数据文件
4. 通过 ``make test`` 跑测试
5. 更新 CHANGELOG
6. 提交代码
7. 检查 develop 分支的 CI 结果
8. 切换到 master 分支
9. 合并 develop 分支代码: ``make merge_dev``
10. 更新版本号:
* 大改动(1.1.x -> 1.2.x): ``make bump_minor``
* 小改动(1.1.1 -> 1.1.2): ``make bump_patch``
11. 发布到 test pypi: ``make publish_test``
12. 安装和测试发布到 test pypi 上的版本
13. 发布到 pypi: ``make publish``
14. 安装和测试发布到 pypi 上的版本
15. 提交 master 分支代码,更新 develop 分支代码,进入下一个开发阶段:``make start_next``

@ -0,0 +1,61 @@
FAQ
-----
.. _no_phrases:
如何禁用内置的“词组拼音库”
++++++++++++++++++++++++++++++++
设置环境变量 ``PYPINYIN_NO_PHRASES=true`` 即可
.. _no_dict_copy:
如何禁用默认的“拼音库”copy 操作
+++++++++++++++++++++++++++++++++++++++++++
设置环境变量 ``PYPINYIN_NO_DICT_COPY=true`` 即可.
副作用: 用户的自定义拼音库出现问题时, 无法回退到自带的拼音库.
.. _limit_memory:
如何减少内存占用
+++++++++++++++++++++
如果对拼音正确性不在意的话,可以按照上面所说的设置环境变量 ``PYPINYIN_NO_PHRASES``
和 ``PYPINYIN_NO_DICT_COPY`` 。详见 `#13`_
.. _initials_problem:
``INITIALS`` 声母风格下,以 ``y``, ``w``, ``yu`` 开头的汉字返回空字符串
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
比如:
.. code:: python
pinyin('火影忍者', style=Style.INITIALS)
[['h'], [''], ['r'], ['zh']]
因为 ``y``, ``w``, ``yu`` 都不是声母。参考:
`hotoo/pinyin#57 <https://github.com/hotoo/pinyin/issues/57>`__,
`#22 <https://github.com/mozillazg/python-pinyin/pull/22>`__,
`#27 <https://github.com/mozillazg/python-pinyin/issues/27>`__,
`#44 <https://github.com/mozillazg/python-pinyin/issues/44>`__
声母风格(INITIALS)下,“雨”、“我”、“圆”等汉字返回空字符串,因为根据
`《汉语拼音方案》 <http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html>`__ ,
y,w,ü (yu) 都不是声母,在某些特定韵母无声母时,才加上 y 或 w,而 ü 也有其特定规则。
如果你觉得这个给你带来了麻烦,那么也请小心一些无声母的汉字(如“啊”、“饿”、“按”、“昂”等)。
这时候你也许需要的是首字母风格(FIRST_LETTER)。 —— @hotoo
如果觉得这个行为不是你想要的,就是想把 y 当成声母的话,可以指定 ``strict=False`` , 这个可能会符合你的预期。详见 `strict 参数的影响`_
.. _#13: https://github.com/mozillazg/python-pinyin/issues/113
.. _strict 参数的影响: https://pypinyin.readthedocs.io/zh_CN/master/usage.html#strict

@ -0,0 +1,64 @@
.. pypinyin documentation master file, created by
sphinx-quickstart on Fri Sep 06 22:22:13 2013.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
汉字拼音转换工具(Python 版)
=============================
|Build| |Coverage| |Pypi version|
将汉字转为拼音。可以用于汉字注音、排序、检索(`Russian translation`_) 。
基于 `hotoo/pinyin <https://github.com/hotoo/pinyin>`__ 开发。
* Documentation: http://pypinyin.rtfd.io
* GitHub: https://github.com/mozillazg/python-pinyin
* License: MIT license
* PyPI: https://pypi.org/project/pypinyin
* Python version: 2.7, pypy, pypy3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9
特性
----
* 根据词组智能匹配最正确的拼音。
* 支持多音字。
* 简单的繁体支持, 注音支持。
* 支持多种不同拼音风格。
.. |Build| image:: https://img.shields.io/circleci/project/github/mozillazg/python-pinyin/master.svg
:target: https://circleci.com/gh/mozillazg/python-pinyin
.. |Coverage| image:: https://img.shields.io/codecov/c/github/mozillazg/python-pinyin/master.svg
:target: https://codecov.io/gh/mozillazg/python-pinyin
.. |PyPI version| image:: https://img.shields.io/pypi/v/pypinyin.svg
:target: https://pypi.org/project/pypinyin/
.. |PyPI downloads| image:: https://img.shields.io/pypi/dm/pypinyin.svg
:target: https://pypi.org/project/pypinyin/
.. _Russian translation: https://github.com/mozillazg/python-pinyin/blob/master/README_ru.rst
Contents
--------
.. toctree::
:maxdepth: 4
installation
usage
api
contrib
develop
faq
related
CHANGELOG
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

@ -0,0 +1,20 @@
安装
======
可以使用 pip 进行安装:
.. code-block:: bash
$ pip install pypinyin
easy_install 安装:
.. code-block:: bash
$ easy_install pypinyin
源码安装:
.. code-block:: bash
$ python setup.py install

@ -0,0 +1,242 @@
@ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=_build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
set I18NSPHINXOPTS=%SPHINXOPTS% .
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
REM Build the Qt help project; the generated files are named after this
REM project (pypinyin), and the compiled collection file extension is .qhc.
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pypinyin.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pypinyin.qhc
	goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %BUILDDIR%/..
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
:end

@ -0,0 +1,10 @@
Related Projects
===================
* `hotoo/pinyin`__: 汉字拼音转换工具 Node.js/JavaScript 版。
* `mozillazg/go-pinyin`__: 汉字拼音转换工具 Go 版。
* `mozillazg/rust-pinyin`__: 汉字拼音转换工具 Rust 版。
__ https://github.com/hotoo/pinyin
__ https://github.com/mozillazg/go-pinyin
__ https://github.com/mozillazg/rust-pinyin

@ -0,0 +1,247 @@
使用
======
.. _example:
示例
-------
.. code-block:: python
>>> from pypinyin import pinyin, lazy_pinyin, Style
>>> pinyin('中心')
[['zhōng'], ['xīn']]
>>> pinyin('中心', heteronym=True) # 启用多音字模式
[['zhōng', 'zhòng'], ['xīn']]
>>> pinyin('中心', style=Style.FIRST_LETTER) # 设置拼音风格
[['z'], ['x']]
>>> pinyin('中心', style=Style.TONE2, heteronym=True)
[['zho1ng', 'zho4ng'], ['xi1n']]
>>> lazy_pinyin('中心') # 不考虑多音字的情况
['zhong', 'xin']
**注意事项**
* 默认情况下拼音结果不会标明哪个韵母是轻声,轻声的韵母没有声调或数字标识(可以通过参数 ``neutral_tone_with_five=True`` 开启使用 ``5`` 标识轻声 )。
* 默认情况下无声调相关拼音风格下的结果会使用 ``v`` 表示 ``ü`` (可以通过参数 ``v_to_u=True`` 开启使用 ``ü`` 代替 ``v`` )。
* 默认情况下会原样输出没有拼音的字符(自定义处理没有拼音的字符的方法见 `文档 <https://pypinyin.readthedocs.io/zh_CN/master/usage.html#handle-no-pinyin>`__ )。
.. _handle_no_pinyin:
处理不包含拼音的字符
---------------------
当程序遇到不包含拼音的字符(串)时,会根据 ``errors`` 参数的值做相应的处理:
* ``default`` (默认行为): 不做任何处理,原样返回::
pinyin('你好☆☆')
[['nǐ'], ['hǎo'], ['☆☆']]
* ``ignore`` : 忽略该字符 ::
pinyin('你好☆☆', errors='ignore')
[['nǐ'], ['hǎo']]
* ``replace`` : 替换为去掉 ``\u`` 的 unicode 编码::
pinyin('你好☆☆', errors='replace')
[['nǐ'], ['hǎo'], ['26062606']]
* callable 对象 : 提供一个回调函数,接受无拼音字符(串)作为参数,
支持的返回值类型: ``unicode`` 、 ``list`` 、 ``None`` 。::
pinyin('你好☆☆', errors=lambda x: 'star')
[['nǐ'], ['hǎo'], ['star']]
pinyin('你好☆☆', errors=lambda x: None)
[['nǐ'], ['hǎo']]
返回值类型为 ``list`` 时,会自动 expand list ::
pinyin('你好☆☆', errors=lambda x: ['star' for _ in x])
[['nǐ'], ['hǎo'], ['star'], ['star']]
# 指定多音字
pinyin('你好☆☆', heteronym=True, errors=lambda x: [['star', '☆'] for _ in x])
[['nǐ'], ['hǎo'], ['star', '☆'], ['star', '☆']]
.. _custom_dict:
自定义拼音库
------------
如果对结果不满意,可以通过
:py:func:`~pypinyin.load_single_dict` 或
:py:func:`~pypinyin.load_phrases_dict`
以自定义拼音库的方式修正结果:
.. code-block:: python
>> from pypinyin import lazy_pinyin, load_phrases_dict, Style, load_single_dict
>> hans = '桔子'
>> lazy_pinyin(hans, style=Style.TONE2)
['jie2', 'zi3']
>> load_phrases_dict({'桔子': [['jú'], ['zǐ']]}) # 增加 "桔子" 词组
>> lazy_pinyin(hans, style=Style.TONE2)
['ju2', 'zi3']
>>
>> hans = '还没'
>> lazy_pinyin(hans, style=Style.TONE2)
['hua2n', 'me2i']
>> load_single_dict({ord('还'): 'hái,huán'}) # 调整 "还" 字的拼音顺序
>>> lazy_pinyin('还没', style=Style.TONE2)
['ha2i', 'me2i']
.. _custom_style:
自定义拼音风格
----------------
可以通过 :py:func:`~pypinyin.style.register` 来实现自定义拼音风格的需求:
.. code-block:: python
In [1]: from pypinyin import lazy_pinyin
In [2]: from pypinyin.style import register
In [3]: @register('kiss')
...: def kiss(pinyin, **kwargs):
...: return '😘 {0}'.format(pinyin)
...:
In [4]: lazy_pinyin('么么', style='kiss')
Out[4]: ['😘 me', '😘 me']
.. _strict:
``strict`` 参数的影响
-------------------------------
``strict`` 参数用于控制处理声母和韵母时是否严格遵循 `《汉语拼音方案》`_ 标准:
.. code-block:: python
In [1]: from pypinyin import Style, lazy_pinyin
In [2]: lazy_pinyin('乌', style=Style.TONE)
Out[2]: ['wū']
In [3]: lazy_pinyin('乌', style=Style.INITIALS)
Out[3]: ['']
In [4]: lazy_pinyin('乌', style=Style.INITIALS, strict=False)
Out[4]: ['w']
In [5]: lazy_pinyin('迂', style=Style.TONE)
Out[5]: ['yū']
In [6]: lazy_pinyin('迂', style=Style.FINALS_TONE)
Out[6]: ['ǖ']
In [7]: lazy_pinyin('迂', style=Style.FINALS_TONE, strict=False)
Out[7]: ['ū']
``strict=True`` 时根据 `《汉语拼音方案》`_ 的如下规则处理声母、在韵母相关风格下还原正确的韵母
(只对只获取声母或只获取韵母相关拼音风格有效,不影响其他获取完整拼音信息的拼音风格的结果):
* 21 个声母: ``b p m f d t n l g k h j q x zh ch sh r z c s`` (**y, w 不是声母**)
* i行的韵母,前面没有声母的时候,写成yi(衣),ya(呀),ye(耶),yao(腰),you(忧),yan(烟),
yin(因),yang(央),ying(英),yong(雍)。(**y 不是声母**)
* u行的韵母,前面没有声母的时候,写成wu(乌),wa(蛙),wo(窝),wai(歪),wei(威),wan(弯),
wen(温),wang(汪),weng(翁)。(**w 不是声母**)
* ü行的韵母,前面没有声母的时候,写成yu(迂),yue(约),yuan(冤),yun(晕);ü上两点省略。
(**韵母相关风格下还原正确的韵母 ü**)
* ü行的韵跟声母j,q,x拼的时候,写成ju(居),qu(区),xu(虚),ü上两点也省略;
但是跟声母n,l拼的时候,仍然写成nü(女),lü(吕)。(**韵母相关风格下还原正确的韵母 ü**)
* iou,uei,uen前面加声母的时候,写成iu,ui,un。例如niu(牛),gui(归),lun(论)。
(**韵母相关风格下还原正确的韵母 iou,uei,uen**)
``strict=False`` 时就是不遵守上面的规则来处理声母和韵母,
比如:``y``, ``w`` 会被当做声母,yu(迂) 的韵母就是一般认为的 ``u`` 等。
具体差异可以查看 `tests/test_standard.py <https://github.com/mozillazg/python-pinyin/blob/master/tests/test_standard.py>`_ 中的对比结果测试用例
.. _cli:
命令行工具
------------
程序内置了一个命令行工具 ``pypinyin`` :
.. code-block:: console
$ pypinyin 音乐
yīn yuè
$ pypinyin -h
命令行工具支持如下参数:
.. code-block:: console
$ pypinyin -h
usage: pypinyin [-h] [-V] [-f {pinyin,slug}]
[-s {NORMAL,zhao,TONE,zh4ao,TONE2,zha4o,TONE3,zhao4,INITIALS,zh,FIRST_LETTER,z,FINALS,ao,FINALS_TONE,4ao,FINALS_TONE2,a4o,FINALS_TONE3,ao4,BOPOMOFO,BOPOMOFO_FIRST,CYRILLIC,CYRILLIC_FIRST}]
[-p SEPARATOR] [-e {default,ignore,replace}] [-m]
hans
convert chinese to pinyin.
positional arguments:
hans chinese string
optional arguments:
-h, --help show this help message and exit
-V, --version show program's version number and exit
-f {pinyin,slug}, --func {pinyin,slug}
function name (default: "pinyin")
-s {NORMAL,zhao,TONE,zh4ao,TONE2,zha4o,TONE3,zhao4,INITIALS,zh,FIRST_LETTER,z,FINALS,ao,FINALS_TONE,4ao,FINALS_TONE2,a4o,FINALS_TONE3,ao4,BOPOMOFO,BOPOMOFO_FIRST,CYRILLIC,CYRILLIC_FIRST}, --style {NORMAL,zhao,TONE,zh4ao,TONE2,zha4o,TONE3,zhao4,INITIALS,zh,FIRST_LETTER,z,FINALS,ao,FINALS_TONE,4ao,FINALS_TONE2,a4o,FINALS_TONE3,ao4,BOPOMOFO,BOPOMOFO_FIRST,CYRILLIC,CYRILLIC_FIRST}
pinyin style (default: "zh4ao")
-p SEPARATOR, --separator SEPARATOR
slug separator (default: "-")
-e {default,ignore,replace}, --errors {default,ignore,replace}
how to handle none-pinyin string (default: "default")
-m, --heteronym enable heteronym
``-s``, ``--style`` 参数可以选值的含义如下:
================== =========================================
-s 或 --style 的值 对应的拼音风格
================== =========================================
zhao :py:attr:`~pypinyin.Style.NORMAL`
zh4ao :py:attr:`~pypinyin.Style.TONE`
zha4o :py:attr:`~pypinyin.Style.TONE2`
zhao4 :py:attr:`~pypinyin.Style.TONE3`
zh :py:attr:`~pypinyin.Style.INITIALS`
z :py:attr:`~pypinyin.Style.FIRST_LETTER`
ao :py:attr:`~pypinyin.Style.FINALS`
4ao :py:attr:`~pypinyin.Style.FINALS_TONE`
a4o :py:attr:`~pypinyin.Style.FINALS_TONE2`
ao4 :py:attr:`~pypinyin.Style.FINALS_TONE3`
NORMAL :py:attr:`~pypinyin.Style.NORMAL`
TONE :py:attr:`~pypinyin.Style.TONE`
TONE2 :py:attr:`~pypinyin.Style.TONE2`
TONE3 :py:attr:`~pypinyin.Style.TONE3`
INITIALS :py:attr:`~pypinyin.Style.INITIALS`
FIRST_LETTER :py:attr:`~pypinyin.Style.FIRST_LETTER`
FINALS :py:attr:`~pypinyin.Style.FINALS`
FINALS_TONE :py:attr:`~pypinyin.Style.FINALS_TONE`
FINALS_TONE2 :py:attr:`~pypinyin.Style.FINALS_TONE2`
FINALS_TONE3 :py:attr:`~pypinyin.Style.FINALS_TONE3`
BOPOMOFO :py:attr:`~pypinyin.Style.BOPOMOFO`
BOPOMOFO_FIRST :py:attr:`~pypinyin.Style.BOPOMOFO_FIRST`
CYRILLIC :py:attr:`~pypinyin.Style.CYRILLIC`
CYRILLIC_FIRST :py:attr:`~pypinyin.Style.CYRILLIC_FIRST`
================== =========================================
.. _《汉语拼音方案》: http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html

@ -0,0 +1,60 @@
import sys
def remove_dup_items(lst):
    """Return a new list with duplicates dropped, keeping first-seen order.

    Membership is checked with ``in`` against the result list, so items only
    need to support equality comparison; they do not have to be hashable.
    """
    unique = []
    for candidate in lst:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique
def parse(fp):
    """Parse phrase pinyin data read from *fp* into a dict.

    Each data line looks like ``中国: zhōng guó``. Blank lines and lines
    starting with ``#`` are skipped, and anything after a ``#`` on a data
    line is ignored.

    :param fp: iterable of text lines (e.g. an open text file).
    :return: dict mapping each phrase to a list with one list of readings
        per syllable, e.g. ``{'中国': [['zhōng'], ['guó']]}``. When a phrase
        occurs more than once, the readings are merged position by position
        with duplicates removed.
    :raises ValueError: if a data line does not contain exactly one ``:``.
    """
    phrases_dict = {}
    # Read from the argument rather than a module-level name so the function
    # also works when it is imported and called with any file-like object.
    for line in fp:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # 中国: zhōng guó
        data = line.split('#')[0]
        hanzi, pinyin = data.strip().split(':')
        hanzi = hanzi.strip()
        # [[zhōng], [guó]]
        pinyin_list = [[s] for s in pinyin.split()]

        if hanzi not in phrases_dict:
            phrases_dict[hanzi] = pinyin_list
        else:
            # Merge the new readings into the ones already recorded.
            for index, value in enumerate(phrases_dict[hanzi]):
                value.extend(pinyin_list[index])
                phrases_dict[hanzi][index] = remove_dup_items(value)

    return phrases_dict
def main(in_fp, out_fp):
    """Write a ``phrases_dict`` Python module built from *in_fp* to *out_fp*.

    The phrases parsed from *in_fp* are emitted in sorted order, one
    ``'phrase': [[reading], ...],`` entry per line.
    """
    header = '''# Warning: Auto-generated file, don't edit.
phrases_dict = {
'''
    out_fp.write(header)

    entries = parse(in_fp)
    # Keys are unique, so sorting the keys gives the same order as sorting
    # the (key, value) pairs by key.
    for phrase in sorted(entries):
        # 中国: [[zhōng], [guó]]
        row = " '{hanzi}': {pinyin_list},\n".format(
            hanzi=phrase.strip(), pinyin_list=entries[phrase])
        out_fp.write(row)

    out_fp.write('}\n')
if __name__ == '__main__':
    # Both INPUT and OUTPUT are required; print the usage line instead of
    # failing with an IndexError when only one of them is given.
    if len(sys.argv) < 3:
        print('python gen_phrases_dict.py INPUT OUTPUT')
        sys.exit(1)
    in_f = sys.argv[1]
    out_f = sys.argv[2]

    with open(in_f) as in_fp, open(out_f, 'w') as out_fp:
        main(in_fp, out_fp)

@ -0,0 +1,37 @@
import sys
def main(in_fp, out_fp):
    """Write a ``pinyin_dict`` Python module built from *in_fp* to *out_fp*.

    Every data line of the form ``U+4E2D: zhōng,zhòng  # 中`` becomes the
    entry ``0x4E2D: 'zhōng,zhòng',``. Blank lines and lines starting with
    ``#`` are skipped.
    """
    out_fp.write('''# Warning: Auto-generated file, don't edit.
pinyin_dict = {
''')

    for raw in in_fp.readlines():
        text = raw.strip()
        if not text or text.startswith('#'):
            continue

        # U+4E2D: zhōng,zhòng  # 中   ->   U+4E2D: zhōng,zhòng
        entry = text.split('#')[0].strip()
        # U+4E2D: zhōng,zhòng   ->   0x4E2D: 'zhōng,zhòng
        entry = entry.replace('U+', '0x').replace(': ', ": '")
        # The closing quote and trailing comma come from the template.
        out_fp.write(" {new_line}',\n".format(new_line=entry))

    out_fp.write('}\n')
if __name__ == '__main__':
    # Both INPUT and OUTPUT are required; print the usage line instead of
    # failing with an IndexError when only one of them is given.
    if len(sys.argv) < 3:
        print('python gen_pinyin_dict.py INPUT OUTPUT')
        sys.exit(1)
    in_f = sys.argv[1]
    out_f = sys.argv[2]

    with open(in_f) as in_fp, open(out_f, 'w') as out_fp:
        main(in_fp, out_fp)

@ -0,0 +1,11 @@
[bumpversion]
commit = True
tag = True
current_version = 0.10.5
[bumpversion:file:merge.py]
[bumpversion:file:pinyin.txt]
[bumpversion:file:large_pinyin.txt]

@ -0,0 +1,92 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# IPython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
new.txt
cc-cedict.zip
cedict_ts.u8

@ -0,0 +1,6 @@
language: python
python:
- 3.6
script:
- make merge

@ -0,0 +1,213 @@
# ChangeLog
## [0.10.5] (2020-11-22)
* 增加 `还君明珠` 的拼音。
## [0.10.4] (2020-10-08)
* 纠正一些词语的拼音。
## [0.10.3] (2020-07-05)
* 增加 `还珠` 的拼音。
## [0.10.2] (2019-10-26)
* 纠正一些词语的拼音。
## [0.10.1] (2019-07-06)
* 修正部分拼音数据。
## [0.10.0] (2019-05-10)
* 新增 `cc_cedict.txt`: [cc-cedict.org](https://cc-cedict.org/) 拼音数据。Thanks [@hanabi1224]
* 纠正一些词语的拼音
## [0.9.2] (2019-04-06)
* 修复部分词语的拼音声调标错了位置的问题
## [0.9.1] (2019-03-31)
* 纠正一批词语的拼音:
* `鸟事`
* `虮虱相吊`
* `别鹤离鸾`
* `年华垂暮`
* `本枝百世`
* `操戈同室`
* 部分词语中 `丢` 的拼音
## [0.9.0] (2019-02-23)
* 新增 `腌臢: ā zā`
* `朝阳` 增加 `cháo yáng` 这个音
* 新增 `土地`、`领地`、`基地`
## [0.8.5] (2018-12-26)
* 纠正 `油炸`、`洗发` 的拼音
## [0.8.4] (2018-09-16)
* 纠正 `步履蹒跚` 的拼音
* 纠正部分词语中 `长` 的拼音
## [0.8.3] (2018-08-04)
* 纠正部分 `查`、`大` 的读音 (via [ee1ded4])
## [0.8.2] (2018-07-28)
* 纠正 `有一只` 的读音 (via [330b348])
## [0.8.1] (2018-07-28)
* 纠正几个 `一` 的读音 (via [6e3b9eb])
* 修复部分拼音包含 `xh` 的问题 (via [ae12df98])
## [0.8.0] (2018-07-08)
* 纠正 `称雨道晴` 的拼音 (via [67412ab])
* 纠正部分词语中 `干` 的拼音 (via [38474cb])
* 增加 `时长` 的拼音 (via [c40b965])
## [0.7.3] (2018-06-10)
* 纠正 `一语中的`, `一语中人` 的拼音 (via [3b62ed3])
## [0.7.2] (2018-06-10)
* 纠正部分拼音数据 (via [af5d783])
## [0.7.1] (2018-06-04)
* 纠正 `负债累累` `经纶济世` 的拼音 (via [#16])
## [0.7.0] (2018-05-27)
* 新增 zdic_cibs.txt 和 zdic_cybs.txt (via [#13])
* `zdic_cibs.txt`: [汉典网](http://www.zdic.net) 汉语词典拼音数据
* `zdic_cybs.txt`: [汉典网](http://www.zdic.net) 成语词典拼音数据
* 增加基于 zdic_cibs.txt 和 zdic_cybs.txt 的 large_pinyin.txt (via [#13])
* 纠正部分读音(via [#10],[#11], [#15])
## [0.6.0] (2018-03-11)
* Revert [#3](https://github.com/mozillazg/phrase-pinyin-data/pull/3) 增加的拼音数据(错误有点多)
## [0.5.1] (2017-10-25)
* 修正一批缺少 ā 和 dī 不对的词语(via [#7][#7])
## [0.5.0] (2017-07-09)
* 增加 `还贷` 的拼音(Thanks [@zhuangh](https://github.com/zhuangh))
## [0.4.1] (2017-04-10)
* 纠正 `朝阳`, `昂昂自若` 的拼音(via [e6d6d27][e6d6d27], [6e7ea16][6e7ea16])
## [0.4.0] (2017-03-22)
* 新增2万多个词组拼音数据(via [fc50fcd][fc50fcd], 感谢 [@onsunsl][@onsunsl] 分享他/她收集的43400个拼音数据: [#3][#3] ).
## [0.3.1] (2017-03-13)
* 纠正 `斯事体大` 的拼音
## [0.3.0] (2017-03-12)
* 增加 overwrite.txt 用于新增/纠正拼音数据
* 纠正 `便宜`, `所长`, `打开天窗说亮话` 的拼音数据
* 增加 `朝阳区`
## [0.2.0] (2017-03-04)
* 添加一批拼音(via [04de9f7][04de9f7])。
## 0.1.0 (2017-03-04)
* Initial Release
[0.10.4]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.10.3...v0.10.4
[0.10.3]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.10.2...v0.10.3
[0.10.2]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.10.1...v0.10.2
[0.10.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.10.0...v0.10.1
[0.10.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.9.2...v0.10.0
[0.9.2]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.9.1...v0.9.2
[0.9.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.9.0...v0.9.1
[0.9.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.5...v0.9.0
[0.8.5]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.4...v0.8.5
[0.8.4]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.3...v0.8.4
[0.8.3]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.2...v0.8.3
[0.8.2]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.1...v0.8.2
[0.8.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.8.0...v0.8.1
[0.8.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.7.3...v0.8.0
[0.7.3]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.7.2...v0.7.3
[0.7.2]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.7.1...v0.7.2
[0.7.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.7.0...v0.7.1
[0.7.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.6.0...v0.7.0
[0.6.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.5.1...v0.6.0
[0.5.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.5.0...v0.5.1
[0.5.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.4.1...v0.5.0
[0.4.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.4.0...v0.4.1
[0.4.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.3.1...v0.4.0
[0.3.1]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.3.0...v0.3.1
[0.3.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.2.0...v0.3.0
[0.2.0]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.1.0...v0.2.0
[04de9f7]: https://github.com/mozillazg/phrase-pinyin-data/commit/04de9f7f520e2f2188cb4c468c30d6fb811a20ba
[fc50fcd]: https://github.com/mozillazg/phrase-pinyin-data/commit/fc50fcd7faa94205096d582fc7a1b31265943a85
[@onsunsl]: https://github.com/onsunsl
[#3]: https://github.com/mozillazg/phrase-pinyin-data/pull/3
[e6d6d27]: https://github.com/mozillazg/phrase-pinyin-data/commit/e6d6d270900fdca32ccbe9a414ea4642e537e522
[6e7ea16]: https://github.com/mozillazg/phrase-pinyin-data/commit/6e7ea167dee0c812514f0bf9701ff5c103a566af
[#7]: https://github.com/mozillazg/phrase-pinyin-data/pull/7
[#10]: https://github.com/mozillazg/phrase-pinyin-data/pull/10
[#11]: https://github.com/mozillazg/phrase-pinyin-data/pull/11
[#13]: https://github.com/mozillazg/phrase-pinyin-data/pull/13
[#15]: https://github.com/mozillazg/phrase-pinyin-data/pull/15
[#16]: https://github.com/mozillazg/phrase-pinyin-data/pull/16
[af5d783]: https://github.com/mozillazg/phrase-pinyin-data/commit/af5d7831b0e84e4a5306e304b3b2da3268e35f17
[3b62ed3]: https://github.com/mozillazg/phrase-pinyin-data/commit/3b62ed303f129868c7ccee4f2d5e44dcea7d30d4
[67412ab]: https://github.com/mozillazg/phrase-pinyin-data/commit/67412abbf8570ac80a41dc012f228c0864823a62
[38474cb]: https://github.com/mozillazg/phrase-pinyin-data/commit/38474cb91dedd27b3d51b39811704f3d045837b1
[c40b965]: https://github.com/mozillazg/phrase-pinyin-data/commit/c40b9653ea2ab066d1c0606e9e07dd4225ff2485
[6e3b9eb]: https://github.com/mozillazg/phrase-pinyin-data/commit/6e3b9eb805ed3e3a5955c179e752ec5e1293216f
[ae12df98]: https://github.com/mozillazg/phrase-pinyin-data/commit/ae12df98438a508249bdf591334b6415bb5ccf8d
[330b348]: https://github.com/mozillazg/phrase-pinyin-data/commit/330b3481ba350de07b580991a5a8b7a83aaefde9
[ee1ded4]: https://github.com/mozillazg/phrase-pinyin-data/commit/ee1ded4938624ac4ce3dc7991ab370e09dbd745c
[@hanabi1224]: https://github.com/hanabi1224
[0.10.5]: https://github.com/mozillazg/phrase-pinyin-data/compare/v0.10.4...v0.10.5

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 mozillazg
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,42 @@
.PHONY: help
help:
@echo "merge update pinyin.txt and large_pinyin.txt"
@echo "er find r"
@echo "check check unexpected char"
@echo "cedict_get get latest cedict data"
@echo "cedict parse latest cedict data"
.PHONY: merge
merge:
python merge.py pinyin.txt overwrite.txt > new.txt && mv new.txt pinyin.txt
python merge.py zdic_cibs.txt zdic_cybs.txt cc_cedict.txt pinyin.txt overwrite.txt > new.txt && mv new.txt large_pinyin.txt
.PHONY: er
er:
cat overwrite.txt|grep 儿|grep -v ér|grep -v er
.PHONY: tone_mark
tone_mark:
ls *.txt | xargs -L 1 sed -i 's/ùo/uò/g'
ls *.txt | xargs -L 1 sed -i 's/oǔ/ǒu/g'
ls *.txt | xargs -L 1 sed -i 's/ùi/uì/g'
ls *.txt | xargs -L 1 sed -i 's/íe/ié/g'
ls *.txt | xargs -L 1 sed -i 's/ùi/uì/g'
ls *.txt | xargs -L 1 sed -i 's/ǐe/iě/g'
ls *.txt | xargs -L 1 sed -i 's/aō/āo/g'
ls *.txt | xargs -L 1 sed -i 's/ìan/iàn/g'
ls *.txt | xargs -L 1 sed -i 's/īan/iān/g'
.PHONY: check
check: tone_mark
-rg 'ɡ|ɑ'
.PHONY: cedict_get
cedict_get:
python -m pip install -U -r requirements_dev.txt
python get_latest_cc_cedict.py
.PHONY: cedict
cedict:
python -m pip install -U -r requirements_dev.txt
python parse_latest_cc_cedict.py

@ -0,0 +1,54 @@
# phrase-pinyin-data [![Build Status](https://travis-ci.org/mozillazg/phrase-pinyin-data.svg?branch=master)](https://travis-ci.org/mozillazg/phrase-pinyin-data)
词语拼音数据。
## 数据介绍
拼音数据的格式:
```
{phrase}: {pinyin}
```
* 以 `#` 开头的行是注释
* 行尾的 `#` 也是注释
* `{phrase}` 汉字词语
* `{pinyin}` 词语的拼音,使用空格分隔每个汉字的拼音
* 一行一个词语的读音,有多个音的词语会出现在多行
* 示例:
```
# 注释
中国: zhōng guó
北京: běi jīng # 注释
```
文件说明:
* `overwrite.txt`: 手工纠正的拼音数据
* `pinyin.txt`: `pinyin.txt + overwrite.txt` 后的拼音数据
* `zdic_cibs.txt`: [汉典网](http://www.zdic.net/) 汉语词典拼音数据
* `zdic_cybs.txt`: [汉典网](http://www.zdic.net/) 成语词典拼音数据
* `cc_cedict.txt`: [cc-cedict.org](https://cc-cedict.org/) 拼音数据
* `large_pinyin.txt`: `zdic_cibs.txt + zdic_cybs.txt + cc_cedict.txt + pinyin.txt + overwrite.txt` 后的拼音数据
## 修改数据
* 修改 `pinyin.txt` 或 `overwrite.txt` 都可以了
* 执行 `make merge` 命令可以按照合并规则生成最新的 `pinyin.txt`
## 参考资料
* 初始数据基于 [phrases-dict.js](https://github.com/hotoo/pinyin/blob/05f74496c34ccb32db1a0fd0b358a798a22a51e5/data/phrases-dict.js) 和 [phrases_dict.py](https://github.com/mozillazg/python-pinyin/blob/366de0363ff1fb9a718ce668448bea59de09a4bf/pypinyin/phrases_dict.py)
* [汉典 zdic.net](http://www.zdic.net/)
* [字海网,叶典网](http://zisea.com/)
* [国学大师_国学网](http://www.guoxuedashi.com/)
* [CC-CEDICT download - MDBG English to Chinese dictionary](http://www.mdbg.net/chindict/chindict.php?page=cc-cedict)
* [漢語大詞典](http://www.ivantsoi.com/hydcd/search.html)
## 相关项目
* [mozillazg/pinyin-data](https://github.com/mozillazg/pinyin-data): 汉字拼音数据

File diff suppressed because it is too large Load Diff

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
import os
import io
import shutil
import codecs
import zipfile
import requests
ROOT = os.path.dirname(os.path.realpath(__file__))
if __name__ == '__main__':
    # Download the latest CC-CEDICT export and unpack it next to this script.
    DOWNLOAD_URL = 'https://cc-cedict.org/editor/editor_export_cedict.php?c=zip'
    zip_file_path = os.path.join(ROOT, 'cc-cedict.zip')

    response = requests.get(DOWNLOAD_URL, stream=True)
    # Fail loudly on an HTTP error instead of saving an error page as the
    # archive and crashing later inside zipfile.
    response.raise_for_status()
    # The raw urllib3 stream only undoes a Content-Encoding (gzip/deflate)
    # when explicitly asked to.
    response.raw.decode_content = True
    with open(zip_file_path, 'wb') as f:
        shutil.copyfileobj(response.raw, f)

    # ZipFile is a context manager: the archive handle is closed even if
    # extraction fails.
    with zipfile.ZipFile(zip_file_path) as z:
        z.extractall(ROOT)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
import sys
import codecs
def parse(lines):
    """Yield ``(hanzi, others)`` pairs parsed from an iterable of lines.

    Blank lines and lines starting with ``#`` (after stripping) are skipped.
    Each remaining line is split at its first ``:``; both halves are
    stripped. Anything after the colon, including a trailing ``#`` comment,
    is kept in ``others``. A data line without ``:`` raises ``ValueError``.
    """
    for raw in lines:
        entry = raw.strip()
        if not entry or entry.startswith('#'):
            continue
        hanzi, others = entry.split(':', 1)
        yield hanzi.strip(), others.strip()
def merge(pinyin_d_list):
    """Collapse a list of dicts into one new dict.

    Dicts are applied in list order, so a key present in several of them
    ends up with the value from the last one.

    :rtype: dict
    """
    combined = {}
    for layer in pinyin_d_list:
        combined.update(layer)
    return combined
def sort(pinyin_d):
    """Return the ``(key, value)`` pairs of *pinyin_d* ordered by key.

    :rtype: list
    """
    pairs = list(pinyin_d.items())
    pairs.sort(key=lambda pair: pair[0])
    return pairs
def output(pinyin_s):
    """Print the header lines followed by one ``phrase: pinyin`` line per pair.

    Only the part of each key before its first ``_`` is printed, so keys
    such as ``X_2`` come out as ``X``.
    """
    print('# version: 0.10.5')
    print('# source: https://github.com/mozillazg/phrase-pinyin-data')
    for key, pinyin in pinyin_s:
        phrase = key.split('_')[0]
        print('{hanzi}: {pinyin}'.format(hanzi=phrase, pinyin=pinyin))
def main(files):
    """Merge the given pinyin files and print the sorted result.

    Each file is read as UTF-8 (a leading BOM is tolerated). Within one
    file the first reading of a phrase wins; across files, later files
    override earlier ones.
    """
    per_file = []
    for path in files:
        entries = {}
        with codecs.open(path, 'r', 'utf-8-sig') as fp:
            for hanzi, pinyin in parse(fp):
                if hanzi not in entries:
                    entries[hanzi] = pinyin
        per_file.append(entries)
    output(sort(merge(per_file)))


if __name__ == '__main__':
    main(sys.argv[1:])

@ -0,0 +1,7 @@
# 新增或纠正的拼音数据
# 升级版本的时候会合并回 pinyin.txt
# 示例
斯事体大: sī shì tǐ dà
朝阳: zhāo yáng
朝阳_2: cháo yáng
还君明珠: huán jūn míng zhū

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
import os
import io
import re
import codecs
from pypinyin.phonetic_symbol import phonetic_symbol
from pypinyin.pinyin_dict import pinyin_dict
from pypinyin.style.tone import ToneConverter
ROOT = os.path.dirname(os.path.realpath(__file__))
tone_converter = ToneConverter()
# Lookup table: lower-cased tone3 spelling (as produced by
# ToneConverter.to_tone3) -> the pinyin string it was derived from, built
# from every comma-separated reading in pypinyin's pinyin_dict.
tone3_2_tone_dict = {}
for readings in pinyin_dict.values():
    for reading in readings.split(','):
        reading = reading.strip()
        if not reading:
            continue
        key = tone_converter.to_tone3(reading).strip().lower()
        if key:
            tone3_2_tone_dict[key] = reading
def tone3_to_tone1(tone3):
    """Convert one CC-CEDICT numbered syllable to its dictionary pinyin.

    :param tone3: a syllable such as ``zhong1``, ``de5`` or ``lu:4``
        (surrounding whitespace and letter case are ignored).
    :return: ``'er'`` for the erhua marker ``r5``; the bare syllable for
        other neutral-tone (``5``) syllables; otherwise the entry found in
        ``tone3_2_tone_dict``.
    :raises KeyError: if the syllable is not in ``tone3_2_tone_dict``.
    """
    tone3 = tone3.strip().lower()
    # Erhua suffix.
    if tone3 == 'r5':
        return 'er'
    # Neutral tone: drop the digit.
    if '5' in tone3:
        new = tone3.replace('5', '')
        if new:
            # CC-CEDICT writes "ü" as "u:". Without this, a neutral-tone
            # syllable such as "nu:5" would leak the raw "nu:" spelling
            # (colon included) into the output.
            return new.replace('u:', 'ü')
    # "u:" is spelled "v" in the lookup keys (e.g. "lu:4" -> "lv4").
    if 'u:' in tone3:
        tone3 = tone3.replace('u:', 'v')
    return tone3_2_tone_dict[tone3]
if __name__ == '__main__':
    # Convert cedict_ts.u8 (CC-CEDICT) into cc_cedict.txt, one
    # "<simplified phrase>: <pinyin>" line per usable entry.
    # Entry shape: "<traditional> <simplified> [<numbered pinyin>] ..."
    LINE_PARTS_RE = re.compile(
        r'(?P<zht>\w+)\s+(?P<zhs>\w+)\s+\[(?P<py>.+?)\]')
    LETTER_DIGIT_RE = re.compile(r'[a-zA-Z0-9]')
    cnt = 0
    with codecs.open(os.path.join(ROOT, 'cc_cedict.txt'), 'w', 'utf-8-sig') as fpw:
        with codecs.open(os.path.join(ROOT, 'cedict_ts.u8'), 'r', 'utf-8-sig') as fpr:
            for line in fpr:
                line_stripped = line.strip()
                # Test the *stripped* text: a blank line still contains its
                # newline, so `not line` never fired and `line_stripped[0]`
                # raised IndexError on empty lines.
                if not line_stripped or line_stripped[0] in '#%':
                    continue
                parts = LINE_PARTS_RE.match(line_stripped)
                if not parts:
                    continue
                zhs = parts.group('zhs')
                py = parts.group('py').split()
                try:
                    tone1 = [tone3_to_tone1(i) for i in py]
                except KeyError as e:
                    # Syllable missing from the lookup table: report it and
                    # skip the entry (best effort).
                    print(e)
                    continue
                # Skip phrases containing Latin letters or digits.
                if LETTER_DIGIT_RE.search(zhs):
                    continue
                # Single characters are not phrases.
                if len(zhs) < 2:
                    continue
                fpw.write(f'{zhs}: {" ".join(tone1)}\n')
                cnt += 1
                if cnt % 10000 == 0:
                    print(f'{cnt} lines processed...')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,9 @@
[bumpversion]
commit = True
tag = True
current_version = 0.10.2
[bumpversion:file:merge_unihan.py]
[bumpversion:file:pinyin.txt]

@ -0,0 +1,31 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Python application
on:
push:
branches: [ ]
pull_request:
branches: [ ]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: try merge_unihan
run: |
make merge_unihan

@ -0,0 +1,62 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
#Ipython Notebook
.ipynb_checkpoints

@ -0,0 +1,6 @@
language: python
python:
- "3.5"
script:
- make merge_unihan

@ -0,0 +1,134 @@
# ChangeLog
## [0.10.2] (2021-03-13)
* 修改 `帧` 的最常用读音为 `zhēn`
* 修复 `zdic.txt` 中两个拼音字母 `è í` 使用不当的问题. Thanks [@Ace-Who](https://github.com/Ace-Who)
## [0.10.1] (2020-11-22)
* 调整 `地`、`謦` 的拼音顺序
## [0.10.0] (2020-10-07)
* 新增 `kTGHZ2013.txt`: [Unihan Database][unihan] 中 [kTGHZ2013](http://www.unicode.org/reports/tr38/#kTGHZ2013) 部分的拼音数据(来源于《通用规范汉字字典》的拼音数据)
* 修正部分拼音的读音
* 生成 `pinyin.txt` 时合并来自 `kTGHZ2013.txt` 的拼音数据
## [0.9.0] (2020-06-06)
* 更新 Unihan 数据版本为 13.0.0
## [0.8.1] (2019-10-26)
* 修正 `迹`、`分` 的读音。
## [0.8.0] (2019-06-01)
* 增加 `kanji.txt` 日本自造汉字的拼音数据 via [#32]. Thanks [@LuoZijun](https://github.com/LuoZijun)
* 去掉几个有误的轻声数据
## [0.7.0] (2019-03-31)
* 更新 Unihan 数据版本为 12.0.0
## [0.6.2] (2018-09-16)
* 修改 `蹒` 的最常用读音为 `pán`
## [0.6.1] (2018-08-04)
* 修改 `著` 的默认读音为 `zhù` via [8802f31]
## [0.6.0] (2018-07-08)
* 更新 Unihan 数据版本为 11.0.0 via [68dc169]
## [0.5.1] (2018-04-19)
* 更正 `卓`、`啥` 的拼音数据 via [#26] 。Thanks [@shibingli](https://github.com/shibingli)
* 更新 `` 的拼音数据 via [#27]
## [0.5.0] (2018-03-18)
* 更新 Unihan 数据版本为 10.0.0 via [#19][#19]
* 新增 kMandarin_overwrite.txt 用于手工纠正 kMandarin.txt 中有误的拼音数据 via [#21][#21]
* 更正 `讽`、`识` 的最常用读音 via [#20][#20]
* 更正 埔,彷,珖,U+275C8 的常用发音 [635b238c4](https://github.com/mozillazg/pinyin-data/commit/635b238c4d21e55d8fd66299c8da3ae555253b3a)
## [0.4.1] (2017-02-12)
* `妳` 的最常用拼音调整为 `nǐ` via [eb08200](https://github.com/mozillazg/pinyin-data/commit/eb08200d0a203c57ecc62ec7a118765518430238)
* `钭` 的拼音更新为 `tǒu,dǒu` via [fb9e64e](https://github.com/mozillazg/pinyin-data/commit/fb9e64e6c0a20eb0e792e8a402dffbf8cc2dfa57)
## [0.4.0] (2016-10-17)
* Update PUA.txt 详见 [#7](https://github.com/mozillazg/pinyin-data/issues/7) thanks [@Artoria2e5][@Artoria2e5]
* Rename PUA.txt to GBK_PUA.txt 详见 [#7](https://github.com/mozillazg/pinyin-data/issues/7)
* Add kMandarin_8105.txt (《通用规范汉字表》里 8105 个汉字最常用的一个读音) [#9][#9] [#11][#11]
* Update pinyin.txt with latest data
## [0.3.0] (2016-08-19)
* Fixed format of zdic.txt via [b8e4394](https://github.com/mozillazg/pinyin-data/commit/b8e439490d2c6e8c711652983db52fb69136919b).
* Fixed some pinyin: 罗 via [468ffaa](https://github.com/mozillazg/pinyin-data/commit/468ffaa8eb678637c7565a02e6836255bd0df06c).
* Support Chinese characters in the PUA ([Private Use Area](https://en.wikipedia.org/wiki/Private_Use_Areas)) via [#2](https://github.com/mozillazg/pinyin-data/pull/2).
* pinyin.txt add line comments that startswith `#` via [9944f79](https://github.com/mozillazg/pinyin-data/commit/9944f795e191fb3606d65ada84b6fad5665f8776).
## [0.2.0] (2016-07-19)
* Update to the latest version of [Unihan Database](http://www.unicode.org/charts/unihan.html):
> Date: 2016-06-01 07:01:48 GMT [JHJ]
> Unicode version: 9.0.0
## 0.1.0 (2016-03-11)
* Initial Release
[@Artoria2e5]: https://github.com/Artoria2e5
[#9]: https://github.com/mozillazg/pinyin-data/pull/9
[#11]: https://github.com/mozillazg/pinyin-data/pull/11
[#19]: https://github.com/mozillazg/pinyin-data/pull/19
[#20]: https://github.com/mozillazg/pinyin-data/pull/20
[#21]: https://github.com/mozillazg/pinyin-data/pull/21
[#26]: https://github.com/mozillazg/pinyin-data/pull/26
[#27]: https://github.com/mozillazg/pinyin-data/pull/27
[68dc169]: https://github.com/mozillazg/pinyin-data/commit/68dc169c3f0f02cb9bf53290edab2d2d2463e0c5
[8802f31]: https://github.com/mozillazg/pinyin-data/commit/8802f31e0e65c6e34a497adb55993425741a9d41
[#32]: https://github.com/mozillazg/pinyin-data/pull/32
[unihan]: http://www.unicode.org/charts/unihan.html
[0.2.0]: https://github.com/mozillazg/pinyin-data/compare/v0.1.0...v0.2.0
[0.3.0]: https://github.com/mozillazg/pinyin-data/compare/v0.2.0...v0.3.0
[0.4.0]: https://github.com/mozillazg/pinyin-data/compare/v0.3.0...v0.4.0
[0.4.1]: https://github.com/mozillazg/pinyin-data/compare/v0.4.0...v0.4.1
[0.5.0]: https://github.com/mozillazg/pinyin-data/compare/v0.4.1...v0.5.0
[0.5.1]: https://github.com/mozillazg/pinyin-data/compare/v0.5.0...v0.5.1
[0.6.0]: https://github.com/mozillazg/pinyin-data/compare/v0.5.1...v0.6.0
[0.6.1]: https://github.com/mozillazg/pinyin-data/compare/v0.6.0...v0.6.1
[0.6.2]: https://github.com/mozillazg/pinyin-data/compare/v0.6.1...v0.6.2
[0.7.0]: https://github.com/mozillazg/pinyin-data/compare/v0.6.2...v0.7.0
[0.8.0]: https://github.com/mozillazg/pinyin-data/compare/v0.7.0...v0.8.0
[0.8.1]: https://github.com/mozillazg/pinyin-data/compare/v0.8.0...v0.8.1
[0.9.0]: https://github.com/mozillazg/pinyin-data/compare/v0.8.1...v0.9.0
[0.10.0]: https://github.com/mozillazg/pinyin-data/compare/v0.9.0...v0.10.0
[0.10.1]: https://github.com/mozillazg/pinyin-data/compare/v0.10.0...v0.10.1
[0.10.2]: https://github.com/mozillazg/pinyin-data/compare/v0.10.1...v0.10.2

@ -0,0 +1,82 @@
# GBK/GB 18030 PUA 映射
# 详见https://zh.wikipedia.org/wiki/GB_18030#PUA
# U+E815: #  Unihan: U+2E81 ⺁
U+E816: zuǒ #  Unihan: U+20087 𠂇
# U+E817: #  Unihan: U+20089 𠂉
U+E818: gǔn #  Unihan: U+200CC 𠃌
# U+E819: #  Unihan: U+2E84 ⺄
U+E81A: zhòu,zhū #  Unihan: U+3473 㑳
U+E81B: zhòu #  Unihan: U+3447 㑇
# U+E81C: #  Unihan: U+2E88 ⺈
# U+E81D: #  Unihan: U+2E8B ⺋
# U+E81E: #  Unihan: U+9FB4 龴
U+E81F: wāi #  Unihan: U+359E 㖞
U+E820: hǎn #  Unihan: U+361A 㘚
U+E821: hǎn #  Unihan: U+360E 㘎
# U+E822: #  Unihan: U+2E8C ⺌
# U+E823: #  Unihan: U+2E97 ⺗
U+E824: zhòu,chǎo #  Unihan: U+396E 㥮
U+E825: zhòu #  Unihan: U+3918 㤘
# U+E826: #  Unihan: U+9FB5 龵
U+E827: gāng #  Unihan: U+39CF 㧏
U+E828: kuǎi #  Unihan: U+39DF 㧟
U+E829: sǒng #  Unihan: U+3A73 㩳
U+E82A: sǒng #  Unihan: U+39D0 㧐
# U+E82B: #  Unihan: U+9FB6 龶
# U+E82C: #  Unihan: U+9FB7 龷
U+E82D: gāng #  Unihan: U+3B4E 㭎
U+E82E: kuài #  Unihan: U+3C6E 㱮
U+E82F: tà #  Unihan: U+3CE0 㳠
# U+E830: #  Unihan: U+2EA7 ⺧
U+E831: pěng #  Unihan: U+215D7 𡗗
# U+E832: #  Unihan: U+9FB8 龸
# U+E833: #  Unihan: U+2EAA ⺪
U+E834: lōu #  Unihan: U+4056 䁖
U+E835: cǎn #  Unihan: U+415F 䅟
# U+E836: #  Unihan: U+2EAE ⺮
U+E837: chōu,chóu #  Unihan: U+4337 䌷
# U+E838: #  Unihan: U+2EB3 ⺳
# U+E839: #  Unihan: U+2EB6 ⺶
# U+E83A: #  Unihan: U+2EB7 ⺷
U+E83B: zāi #  Unihan: U+2298F 𢦏
U+E83C: bà,bēi #  Unihan: U+43B1 䎱
U+E83D: bà #  Unihan: U+43AC 䎬
# U+E83E: #  Unihan: U+2EBB ⺻
U+E83F: zhuān #  Unihan: U+43DD 䏝
U+E840: qióng #  Unihan: U+44D6 䓖
U+E841: kuì,huì #  Unihan: U+4661 䙡
U+E842: kuì #  Unihan: U+464C 䙌
# U+E843: #  Unihan: U+9FB9 龹
U+E844: xīn #  Unihan: U+4723 䜣
U+E845: yàn #  Unihan: U+4729 䜩
U+E846: jìng,qíng #  Unihan: U+477C 䝼
U+E847: qíng #  Unihan: U+478D 䞍
# U+E848: #  Unihan: U+2ECA ⻊
U+E849: shàn #  Unihan: U+4947 䥇
U+E84A: yé #  Unihan: U+497A 䥺
U+E84B: pō #  Unihan: U+497D 䥽
U+E84C: shàn #  Unihan: U+4982 䦂
U+E84D: zhuō #  Unihan: U+4983 䦃
U+E84E: shàn #  Unihan: U+4985 䦅
U+E84F: jué #  Unihan: U+4986 䦆
U+E850: wěn,chuài #  Unihan: U+499F 䦟
U+E851: zhèng #  Unihan: U+499B 䦛
U+E852: chuài #  Unihan: U+49B7 䦷
U+E853: zhèng #  Unihan: U+49B6 䦶
# U+E854: #  Unihan: U+9FBA 龺
U+E855: yíng #  Unihan: U+241FE 𤇾
U+E856: yú #  Unihan: U+4CA3 䲣
U+E857: yìn #  Unihan: U+4C9F 䲟
U+E858: chūn #  Unihan: U+4CA0 䲠
U+E859: qiū #  Unihan: U+4CA1 䲡
U+E85A: yú #  Unihan: U+4C77 䱷
U+E85B: téng #  Unihan: U+4CA2 䲢
U+E85C: shī #  Unihan: U+4D13 䴓
U+E85D: jiāo #  Unihan: U+4D14 䴔
U+E85E: liè #  Unihan: U+4D15 䴕
U+E85F: jīng #  Unihan: U+4D16 䴖
U+E860: jú #  Unihan: U+4D17 䴗
U+E861: tī #  Unihan: U+4D18 䴘
U+E862: pì #  Unihan: U+4D19 䴙
U+E863: yǎn #  Unihan: U+4DAE 䶮
# U+E864: #  Unihan: U+9FBB 龻

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 mozillazg
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,17 @@
.PHONY: help
help:
@echo "merge_unihan merge Unihan data"
@echo "pua generate PUA"
@echo "check check unexpected char"
.PHONY: merge_unihan
merge_unihan: check
python merge_unihan.py
.PHONY: pua
pua:
python tools/gen_gb_pua.py > GBK_PUA.txt
.PHONY: check
check:
-rg 'ɡ|ɑ|í|è'

@ -0,0 +1,68 @@
# pinyin-data [![Build Status](https://travis-ci.org/mozillazg/pinyin-data.svg?branch=master)](https://travis-ci.org/mozillazg/pinyin-data)
汉字拼音数据。
## 数据介绍
拼音数据的格式:
{code point}: {pinyins} # {hanzi} {comments}
* 以 `#` 开头的行是注释,行内 `#` 后面的字符也是注释
* `{pinyins}` 中使用逗号分隔多个拼音
* 示例:
# 注释
U+4E2D: zhōng,zhòng # 中
[Unihan Database][unihan] 数据版本:
> Date: 2020-02-18 18:27:33 GMT [JHJ]
> Unicode version: 13.0.0
* `kTGHZ2013.txt`: [Unihan Database][unihan] 中 [kTGHZ2013](http://www.unicode.org/reports/tr38/#kTGHZ2013) 部分的拼音数据(来源于《通用规范汉字字典》的拼音数据)
* `kHanyuPinyin.txt`: [Unihan Database][unihan] 中 [kHanyuPinyin](http://www.unicode.org/reports/tr38/#kHanyuPinyin) 部分的拼音数据(来源于《漢語大字典》的拼音数据)
* `kXHC1983.txt`: [Unihan Database][unihan] 中 [kXHC1983](http://www.unicode.org/reports/tr38/#kXHC1983) 部分的拼音数据(来源于《现代汉语词典》的拼音数据)
* `kHanyuPinlu.txt`: [Unihan Database][unihan] 中 [kHanyuPinlu](http://www.unicode.org/reports/tr38/#kHanyuPinlu) 部分的拼音数据(来源于《現代漢語頻率詞典》的拼音数据)
* `kMandarin.txt`: [Unihan Database][unihan] 中 [kMandarin](http://www.unicode.org/reports/tr38/#kMandarin) 部分的拼音数据(普通话中最常用的一个读音。zh-CN 为主,如果 zh-CN 中没有则使用 zh-TW 中的拼音)
* `kMandarin_overwrite.txt`: 手工纠正 `kMandarin.txt` 中有误的拼音数据(**可以修改**)
* `GBK_PUA.txt`: [Private Use Area](https://en.wikipedia.org/wiki/Private_Use_Areas) 中有拼音的汉字,参考 [GB 18030 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA) **可以修改**
* `nonCJKUI.txt`: 不属于 [CJK Unified Ideograph](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs) 但是却有拼音的字符(**可以修改**)
* `kanji.txt`: [日本自造汉字](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8) 的拼音数据 **可以修改**
* `kMandarin_8105.txt`: [《通用规范汉字表》](https://zh.wikipedia.org/wiki/通用规范汉字表)(2013 年版)里 8105 个汉字最常用的一个读音 (**可以修改**)
* `overwrite.txt`: 手工纠正的拼音数据(**可以修改**)
* `pinyin.txt`: 合并上述文件后的拼音数据
* `zdic.txt`: [汉典网](http://zdic.net) 的拼音数据(**可以修改**)
## 修改数据
* 上面标注了 **可以修改** 字样的文件都可以直接修改
* 如果汉字的拼音不需要修改,只是调整第一个读音的话,可以直接修改 `kMandarin_8105.txt` 这个文件
* 执行 `make merge_unihan` 命令可以按照合并规则生成最新的 `pinyin.txt` 文件
* 进入 unihan 目录,执行 `make update` 命令可以更新最新的 Unihan 数据
## 参考资料
* [汉语拼音方案](http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html)
* [Unihan Database Lookup](http://www.unicode.org/charts/unihan.html)
* [汉典 zdic.net](http://www.zdic.net/)
* [字海网,叶典网](http://zisea.com/)
* [国学大师_国学网](http://www.guoxuedashi.com/)
* [Unicode、GB2312、GBK和GB18030中的汉字](http://www.fmddlmyy.cn/text24.html)
* [GB 18030 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/GB_18030#PUA)
* [通用规范汉字表 - 维基百科,自由的百科全书](https://zh.wikipedia.org/wiki/%E9%80%9A%E7%94%A8%E8%A7%84%E8%8C%83%E6%B1%89%E5%AD%97%E8%A1%A8)
* [China's 通用规范汉字表 (Tōngyòng Guīfàn Hànzìbiǎo)](https://blogs.adobe.com/CCJKType/2014/03/china-8105.html)
* [日本汉字的汉语读音规范](http://www.moe.gov.cn/s78/A19/yxs_left/moe_810/s230/201001/t20100115_75698.html)
* [日本汉字的汉语普通话规范读音表- 维基百科](https://zh.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97#7_%E6%97%A5%E6%9C%AC%E6%B1%89%E5%AD%97%E7%9A%84%E6%B1%89%E8%AF%AD%E6%99%AE%E9%80%9A%E8%AF%9D%E8%A7%84%E8%8C%83%E8%AF%BB%E9%9F%B3%E8%A1%A8)
* [漢語大字典(第二版)](http://www.ivantsoi.com/hydzd/index.html)
[unihan]: http://www.unicode.org/charts/unihan.html
## 相关项目
* [mozillazg/phrase-pinyin-data](https://github.com/mozillazg/phrase-pinyin-data): 词语拼音数据

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,67 @@
U+389C: kāng # 㢜
U+60B7: lì # 悷
U+417F: huá # 䅿
U+46BE: rén # 䚾
U+4B78: fù # 䭸
U+4B7B: fēn # 䭻
U+4CC9: dōng # 䳉
U+4D7B: huì # 䵻
U+57D4: pǔ # 埔
U+5A47: cǎi # 婇
U+5F6F: piāo # 彯
U+5F77: páng # 彷
U+60B7: lì # 悷
U+65FD: tūn # 旽
U+6A0B: tōng # 樋
U+6ADA: lǘ # 櫚
U+6E5E: zhēn # 湞
U+73D6: guāng # 珖
U+77A1: guī # 瞡
U+7BC9: zhù # 築
U+815C: méi # 腜
U+816C: róu # 腬
U+8192: ōu # 膒
U+8491: yīn # 蒑
U+8A09: fàn # 訉
U+90D8: lǚ # 郘
U+9D24: zhōng # 鴤
U+2031A: nòng # 𠌚
U+2141D: fú # 𡐝
U+21594: nuó # 𡖔
U+2199D: xiāo # 𡦝
U+21B0D: mí # 𡬍
U+21B10: yí # 𡬐
U+21B15: lóng # 𡬕
U+2243F: rǎng # 𢐿
U+2273D: kuí # 𢜽
U+22741: hōng # 𢝁
U+22892: sū # 𢢒
U+22A10: jí # 𢨐
U+245ED: xià # 𤗭
U+24704: huái # 𤜄
U+247AE: zhài # 𤞮
U+24856: yán # 𤡖
U+248B5: lài # 𤢵
U+249EB: jīn # 𤧫
U+2546B: kān # 𥑫
U+2588D: hù # 𥢍
U+2588F: diàn # 𥢏
U+25C1F: yuán # 𥰟
U+272D5: kùn # 𧋕
U+2757A: shuāng # 𧕺
U+275C8: nú # 𧗈
U+27956: lí # 𧥖
U+280A2: jí # 𨂢
U+2824B: tuō # 𨉋
U+284A8: hài # 𨒨
U+28ABF: liú # 𨪿
U+28DED: chán # 𨷭
U+28E30: jú # 𨸰
U+293CF: wéi # 𩏏
U+295F5: zhēng # 𩗵
U+29B5D: wǒ # 𩭝
U+2A048: zhuāng # 𪁈
U+2A2A2: shí # 𪊢
U+8B9D: zhán # 讝
U+3D14: jí # 㴔
U+8B26: qǐng # 謦

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,32 @@
U+5302: yún # 匂 yún 为日本汉字读音; xiōng 为现代汉语读音;
U+4E3C: dǎn # 丼 dǎn 为日本汉字读音; jǐng 为现代汉语读音;
U+8FBB: shí # 辻
U+8FBC: rù # 込
U+51E7: jīn # 凧
U+6763: shān # 杣
U+67A0: zá # 枠
U+7551: tián # 畑
U+6803: lì # 栃
U+6802: méi # 栂
U+5CE0: kǎ # 峠
U+4FE3: yǔ # 俣
U+7C7E: rèn # 籾
U+7560: tián # 畠
U+96EB: xià # 雫
U+7B39: shì # 笹
U+5840: píng # 塀
U+6919: chāng # 椙
U+7872: yù # 硲
U+86EF: lǎo # 蛯
U+55B0: cān # 喰
U+643E: zhà # 搾
U+698A: shén # 榊
U+50CD: dòng # 働
U+7CC0: huā # 糀
U+9786: bǐng # 鞆
U+69C7: zhēn # 槇
U+6A2B: jiān # 樫
U+9D2B: tián # 鴫
U+567A: xīn # 噺
U+7C17: liáng # 簗
U+9EBF: mó # 麿

@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
import collections
def code_to_hanzi(code):
    """Return the character for a ``U+XXXX`` code point string."""
    codepoint = int(code.replace('U+', '0x'), 16)
    return chr(codepoint)
def sort_pinyin_dict(pinyin_dict):
    """Return an ``OrderedDict`` of *pinyin_dict* ordered by code point.

    Keys are ``U+XXXX`` strings and are compared by their numeric value,
    not alphabetically.
    """
    def codepoint(item):
        return int(item[0].replace('U+', '0x'), 16)

    ordered = collections.OrderedDict()
    for code, pinyins in sorted(pinyin_dict.items(), key=codepoint):
        ordered[code] = pinyins
    return ordered
def remove_dup_items(lst):
    """Return a new list with duplicates removed, keeping first occurrences.

    Membership is tested with ``in`` on a list, so items only need to
    support equality (they do not have to be hashable).
    """
    unique = []
    for candidate in lst:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique
def parse_pinyins(fp):
    """Parse ``{code}: {pinyins} # comment`` lines into ``{code: [pinyin, ...]}``.

    Blank lines and lines starting with ``#`` are skipped. Text from the
    first ``#`` onward is dropped. Whitespace-separated pieces of the
    pinyin field are joined with ``,`` before the field is split on ``,``.
    A later line for the same code replaces the earlier one.
    """
    pinyin_map = {}
    for raw in fp:
        entry = raw.strip()
        if not entry or entry.startswith('#'):
            continue
        data = entry.partition('#')[0]
        code, pinyin = data.split(':')
        joined = ','.join(pinyin.split())
        pinyin_map[code.strip()] = joined.split(',')
    return pinyin_map
def merge(raw_pinyin_map, adjust_pinyin_map, overwrite_pinyin_map):
    """Build the final reading list for every code in *raw_pinyin_map*.

    An entry in *overwrite_pinyin_map* replaces the raw readings outright.
    Otherwise an entry in *adjust_pinyin_map* is placed in front of the raw
    readings. Duplicates are then removed, keeping first occurrences.
    """
    merged = {}
    for code in raw_pinyin_map:
        if code in overwrite_pinyin_map:
            candidates = overwrite_pinyin_map[code]
        elif code in adjust_pinyin_map:
            candidates = adjust_pinyin_map[code] + raw_pinyin_map[code]
        else:
            candidates = raw_pinyin_map[code]
        merged[code] = remove_dup_items(candidates)
    return merged
def save_data(pinyin_map, writer):
    """Write one ``{code}: {pinyins} # {hanzi}`` line per entry to *writer*.

    Readings are joined with ``,``; the trailing comment holds the
    character the code point stands for.
    """
    for code, pinyins in pinyin_map.items():
        hanzi = code_to_hanzi(code)
        joined = ','.join(pinyins)
        writer.write('{code}: {pinyin} # {hanzi}\n'.format(
            code=code, pinyin=joined, hanzi=hanzi
        ))
def extend_pinyins(old_map, new_map, only_no_exists=False):
    """Merge the readings of *new_map* into *old_map* in place.

    With ``only_no_exists=True`` a code is copied over only when *old_map*
    does not have it yet (the list object itself is shared). Otherwise the
    readings are appended to the existing list, which is created on demand.
    """
    if only_no_exists:
        # Only fill in codes that are missing; never touch existing ones.
        for code, pinyins in new_map.items():
            if code not in old_map:
                old_map[code] = pinyins
        return
    for code, pinyins in new_map.items():
        old_map.setdefault(code, []).extend(pinyins)
if __name__ == '__main__':
    # Merge all source files into pinyin.txt.
    #
    # raw_pinyin_map collects every known reading per code point.
    # adjust_pinyin_map collects readings that merge() places in front of
    # the raw ones. overwrite_pinyin_map replaces an entry outright.
    #
    # The data files are UTF-8, so the encoding is passed explicitly rather
    # than relying on the locale default.
    raw_pinyin_map = {}
    with open('kHanyuPinyin.txt', encoding='utf-8') as fp:
        khanyupinyin = parse_pinyins(fp)
        raw_pinyin_map.update(khanyupinyin)
    with open('kXHC1983.txt', encoding='utf-8') as fp:
        kxhc1983 = parse_pinyins(fp)
        extend_pinyins(raw_pinyin_map, kxhc1983)
    with open('nonCJKUI.txt', encoding='utf-8') as fp:
        noncjkui = parse_pinyins(fp)
        extend_pinyins(raw_pinyin_map, noncjkui)
    with open('kMandarin_8105.txt', encoding='utf-8') as fp:
        adjust_pinyin_map = parse_pinyins(fp)
        extend_pinyins(raw_pinyin_map, adjust_pinyin_map)
    with open('kMandarin_overwrite.txt', encoding='utf-8') as fp:
        _map = parse_pinyins(fp)
        extend_pinyins(adjust_pinyin_map, _map)
        extend_pinyins(raw_pinyin_map, adjust_pinyin_map)
    with open('kMandarin.txt', encoding='utf-8') as fp:
        _map = parse_pinyins(fp)
        extend_pinyins(adjust_pinyin_map, _map)
        extend_pinyins(raw_pinyin_map, adjust_pinyin_map)
    with open('kTGHZ2013.txt', encoding='utf-8') as fp:
        _map = parse_pinyins(fp)
        extend_pinyins(adjust_pinyin_map, _map)
        extend_pinyins(raw_pinyin_map, adjust_pinyin_map)
    with open('kHanyuPinlu.txt', encoding='utf-8') as fp:
        khanyupinyinlu = parse_pinyins(fp)
        # Merge the data just parsed. The previous code passed the stale
        # `_map` (still the kTGHZ2013 data) here, so kHanyuPinlu readings
        # were parsed but never merged.
        extend_pinyins(adjust_pinyin_map, khanyupinyinlu)
        extend_pinyins(raw_pinyin_map, adjust_pinyin_map)
    with open('GBK_PUA.txt', encoding='utf-8') as fp:
        pua_pinyin_map = parse_pinyins(fp)
        extend_pinyins(raw_pinyin_map, pua_pinyin_map)
    with open('kanji.txt', encoding='utf-8') as fp:
        _map = parse_pinyins(fp)
        # Only add codes that no other source provides.
        extend_pinyins(raw_pinyin_map, _map, only_no_exists=True)
    with open('overwrite.txt', encoding='utf-8') as fp:
        overwrite_pinyin_map = parse_pinyins(fp)
        extend_pinyins(raw_pinyin_map, overwrite_pinyin_map)

    new_pinyin_map = merge(raw_pinyin_map, adjust_pinyin_map,
                           overwrite_pinyin_map)
    new_pinyin_map = sort_pinyin_dict(new_pinyin_map)

    # Sanity checks: no code point from any source may be lost.
    assert len(new_pinyin_map) == len(raw_pinyin_map)
    code_set = set(new_pinyin_map.keys())
    assert set(khanyupinyin.keys()) - code_set == set()
    assert set(khanyupinyinlu.keys()) - code_set == set()
    assert set(kxhc1983.keys()) - code_set == set()
    assert set(adjust_pinyin_map.keys()) - code_set == set()
    assert set(overwrite_pinyin_map.keys()) - code_set == set()
    assert set(pua_pinyin_map.keys()) - code_set == set()

    with open('pinyin.txt', 'w', encoding='utf-8') as fp:
        fp.write('# version: 0.10.2\n')
        fp.write('# source: https://github.com/mozillazg/pinyin-data\n')
        save_data(new_pinyin_map, fp)

@ -0,0 +1 @@
U+3007: líng,yuán,xīng #

@ -0,0 +1,63 @@
# 手工纠正错误的拼音数据
# 井号开头的行将会被忽略,可以用作注释
# 数据格式:{code point}: {pinyins} # {hanzi}
# 示例:
# U+4E2D: zhōng,zhòng # 中
U+5353: zhuó,zhuō # 卓
U+5565: shá,shà # 啥
U+5666: yuě,huì # 噦
U+59B3: nǐ,nǎi # 妳
U+8BB8: xǔ,hǔ # 许
U+94AD: tǒu,dǒu # 钭
U+9E00: chǔ,zhú,chù # 鸀
U+E815: yè # 
U+E816: zuǒ,yǒu # 
U+E81B: zhòu,zhū # 
U+E81D: jié,jiē # 
U+E824: zhòu # 
U+E826: shǒu # 
U+E82B: fēng # 
U+E82C: gòng # 
U+E82E: huì,kuì # 
U+E830: jiān # 
U+E831: ēn # 
U+E832: xiǎo # 
U+E834: lóu,lǘ # 
U+E835: cǎn,shān,cēn # 
U+E836: zhú # 
U+E838: wǎng # 
U+E83A: yáng,xiáng # 
U+E83D: bà,bēi # 
U+E83F: zhuān,zhuán,chuǎn,chún # 
U+E842: kuì,huì # 
U+E843: juǎn # 
U+E846: qíng # 
U+E84A: yé,yá # 
U+E850: chuài # 
U+E854: zhuó # 
U+E864: luán # 
U+241FE: yíng # 𤇾
U+275C8: nú # 𧗈
U+47C1: xiāo,chāo # 䟁
U+9EBF: mí # 麿
U+7C17: zhù # 簗
U+8279: cǎo # 艹
U+88CF: lǐ # 裏
U+88E1: lǐ # 裡
U+5206: fēn,fèn,fén # 分
U+208E1: fèng # 𠣡
U+2589F: hù # 𥢟
U+258F9: ràn # 𥣹
U+287B3: qú # 𨞳
U+2A008: yuān # 𪀈
U+9EFE: mǐn,miǎn,měng # 黾
U+55A3: xǔ # 喣
U+529A: zhú # 劚
U+532E: kuì,guì # 匮
U+9400: kuì,guì # 鐀
U+87AB: shì,zhē # 螫
U+5C82: qǐ,kǎi # 岂
U+534E: huá,huà,huā # 华
U+5455: ǒu,ōu,òu # 呕
U+4ECE: cóng,zòng # 从
U+513F: ér,er,rén # 儿

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
"""生成初始的 kMandarin_8105.txt"""
from merge_unihan import parse_pinyins, code_to_hanzi
def parse_china_x():
    """Yield the first whitespace-separated field of every data line in
    ``tools/china-8105-06062014.txt``.

    Blank lines and lines starting with ``#`` are skipped. The file is read
    as UTF-8 explicitly so the result does not depend on the locale.
    """
    with open('tools/china-8105-06062014.txt', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            yield line.split()[0]
def parse_zdic():
    """Return the pinyin map parsed from ``zdic.txt``.

    The file is read as UTF-8 explicitly so the result does not depend on
    the locale.
    """
    with open('zdic.txt', encoding='utf-8') as fp:
        return parse_pinyins(fp)
def parse_kmandain():
    """Return the pinyin map parsed from ``pinyin.txt``.

    The file is read as UTF-8 explicitly so the result does not depend on
    the locale.
    """
    with open('pinyin.txt', encoding='utf-8') as fp:
        return parse_pinyins(fp)
def diff(kmandarin, zdic, commons):
    """Yield one output line per code in *commons*.

    The first reading in *kmandarin* is preferred; when *zdic* has a
    different first reading it is appended after ``->``. When only *zdic*
    knows the code, its first reading is used. A code known to neither map
    is emitted as a commented-out placeholder line.
    """
    for key in commons:
        hanzi = code_to_hanzi(key)
        if key not in kmandarin:
            if key in zdic:
                yield '{0}: {1} # {2}'.format(key, zdic[key][0], hanzi)
            else:
                yield '# {0}: {1} # {2}'.format(key, '<-', hanzi)
            continue
        value = kmandarin[key][0]
        if key in zdic and value != zdic[key][0]:
            yield '{0}: {1} # {2} -> {3}'.format(
                key, value, hanzi, zdic[key][0]
            )
        else:
            yield '{0}: {1} # {2}'.format(key, value, hanzi)
if __name__ == '__main__':
    # Load both pinyin maps, then print one line per code listed in the
    # 8105 table.
    zdic = parse_zdic()
    kmandarin = parse_kmandain()
    for entry in diff(kmandarin, zdic, parse_china_x()):
        print(entry)

@ -0,0 +1,166 @@
# -*- coding: utf-8 -*-
import re
import sys
sys.path.append('.')
from merge_unihan import parse_pinyins
def get_pinyins(file_path):
    """Return the pinyin map parsed from the file at *file_path*.

    The file is read as UTF-8 explicitly so the result does not depend on
    the locale.
    """
    with open(file_path, encoding='utf-8') as fp:
        return parse_pinyins(fp)
def get_pua_map():
# Generator over the embedded mapping table below.
# Each non-comment row is split on tab characters into four fields
# (gb, gbk, gb_18030, unicode_4_1). For every such row this yields the pair
# (get_han_point(gbk), get_han_point(unicode_4_1)).
# Rows starting with '#' are skipped.
# NOTE(review): the table is runtime data; the tab separators and the
# characters inside the parentheses must be preserved exactly.
text = '''
# A6D9 E78D () FE10 (︐)
# A6DA E78E () FE12 (︒)
# A6DB E78F () FE11 (︑)
# A6DC E790 () FE13 (︓)
# A6DD E791 () FE14 (︔)
# A6DE E792 () FE15 (︕)
# A6DF E793 () FE16 (︖)
# A6EC E794 () FE17 (︗)
# A6ED E795 () FE18 (︘)
# A8BC E7C7 () 1E3F (ḿ) 1E3F (ḿ)
# A8BF E7C8 () 01F9 (ǹ) 01F9 (ǹ)
# A989 E7E7 () 303E (〾) 303E (〾)
# A98A E7E8 () 2FF0 (⿰) 2FF0 (⿰)
# A98B E7E9 () 2FF1 (⿱) 2FF1 (⿱)
# A98C E7EA () 2FF2 (⿲) 2FF2 (⿲)
# A98D E7EB () 2FF3 (⿳) 2FF3 (⿳)
# A98E E7EC () 2FF4 (⿴) 2FF4 (⿴)
# A98F E7ED () 2FF5 (⿵) 2FF5 (⿵)
# A990 E7EE () 2FF6 (⿶) 2FF6 (⿶)
# A991 E7EF () 2FF7 (⿷) 2FF7 (⿷)
# A992 E7F0 () 2FF8 (⿸) 2FF8 (⿸)
# A993 E7F1 () 2FF9 (⿹) 2FF9 (⿹)
# A994 E7F2 () 2FFA (⿺) 2FFA (⿺)
# A995 E7F3 () 2FFB (⿻) 2FFB (⿻)
FE50 E815 () 2E81 () 2E81 ()
FE51 E816 () E816 () 20087 (𠂇)
FE52 E817 () E817 () 20089 (𠂉)
FE53 E818 () E818 () 200CC (𠃌)
FE54 E819 () 2E84 () 2E84 ()
FE55 E81A () 3473 () 3473 ()
FE56 E81B () 3447 () 3447 ()
FE57 E81C () 2E88 () 2E88 ()
FE58 E81D () 2E8B () 2E8B ()
FE59 E81E () E81E () 9FB4 ()
FE5A E81F () 359E () 359E ()
FE5B E820 () 361A () 361A ()
FE5C E821 () 360E () 360E ()
FE5D E822 () 2E8C () 2E8C ()
FE5E E823 () 2E97 () 2E97 ()
FE5F E824 () 396E () 396E ()
FE60 E825 () 3918 () 3918 ()
FE61 E826 () E826 () 9FB5 ()
FE62 E827 () 39CF () 39CF ()
FE63 E828 () 39DF () 39DF ()
FE64 E829 () 3A73 () 3A73 ()
FE65 E82A () 39D0 () 39D0 ()
FE66 E82B () E82B () 9FB6 ()
FE67 E82C () E82C () 9FB7 ()
FE68 E82D () 3B4E () 3B4E ()
FE69 E82E () 3C6E () 3C6E ()
FE6A E82F () 3CE0 () 3CE0 ()
FE6B E830 () 2EA7 () 2EA7 ()
FE6C E831 () E831 () 215D7 (𡗗)
FE6D E832 () E832 () 9FB8 ()
FE6E E833 () 2EAA () 2EAA ()
FE6F E834 () 4056 () 4056 ()
FE70 E835 () 415F () 415F ()
FE71 E836 () 2EAE () 2EAE ()
FE72 E837 () 4337 () 4337 ()
FE73 E838 () 2EB3 () 2EB3 ()
FE74 E839 () 2EB6 () 2EB6 ()
FE75 E83A () 2EB7 () 2EB7 ()
FE76 E83B () E83B () 2298F (𢦏)
FE77 E83C () 43B1 () 43B1 ()
FE78 E83D () 43AC () 43AC ()
FE79 E83E () 2EBB () 2EBB ()
FE7A E83F () 43DD () 43DD ()
FE7B E840 () 44D6 () 44D6 ()
FE7C E841 () 4661 () 4661 ()
FE7D E842 () 464C () 464C ()
FE7E E843 () E843 () 9FB9 ()
FE80 E844 () 4723 () 4723 ()
FE81 E845 () 4729 () 4729 ()
FE82 E846 () 477C () 477C ()
FE83 E847 () 478D () 478D ()
FE84 E848 () 2ECA () 2ECA ()
FE85 E849 () 4947 () 4947 ()
FE86 E84A () 497A () 497A ()
FE87 E84B () 497D () 497D ()
FE88 E84C () 4982 () 4982 ()
FE89 E84D () 4983 () 4983 ()
FE8A E84E () 4985 () 4985 ()
FE8B E84F () 4986 () 4986 ()
FE8C E850 () 499F () 499F ()
FE8D E851 () 499B () 499B ()
FE8E E852 () 49B7 () 49B7 ()
FE8F E853 () 49B6 () 49B6 ()
FE90 E854 () E854 () 9FBA ()
FE91 E855 () E855 () 241FE (𤇾)
FE92 E856 () 4CA3 () 4CA3 ()
FE93 E857 () 4C9F () 4C9F ()
FE94 E858 () 4CA0 () 4CA0 ()
FE95 E859 () 4CA1 () 4CA1 ()
FE96 E85A () 4C77 () 4C77 ()
FE97 E85B () 4CA2 () 4CA2 ()
FE98 E85C () 4D13 () 4D13 ()
FE99 E85D () 4D14 () 4D14 ()
FE9A E85E () 4D15 () 4D15 ()
FE9B E85F () 4D16 () 4D16 ()
FE9C E860 () 4D17 () 4D17 ()
FE9D E861 () 4D18 () 4D18 ()
FE9E E862 () 4D19 () 4D19 ()
FE9F E863 () 4DAE () 4DAE ()
FEA0 E864 () E864 () 9FBB ()
'''.strip()
# Walk the table row by row; rows starting with '#' are not yielded.
for line in text.split('\n'):
if line.startswith('#'):
continue
# A row must contain exactly four tab-separated fields, otherwise this
# unpacking raises ValueError.
gb, gbk, gb_18030, unicode_4_1 = line.split('\t')
# print(gb, gbk, gb_18030, unicode_4_1)
# print(get_han_point(gbk), get_han_point(unicode_4_1))
# Only the GBK and Unicode 4.1 columns are used.
yield get_han_point(gbk), get_han_point(unicode_4_1)
def get_han_point(text):
    """Split a ``"<CODE> (<char>)"`` table cell into code point and character.

    :param text: one column of the PUA mapping table
    :return: ``(point, han)`` taken from the first match, or ``('', '')``
             when ``text`` is empty
    :raises IndexError: if ``text`` is non-empty but contains no match
    """
    if text:
        pattern = re.compile(r'(?P<point>[A-Z0-9]+) \((?P<han>[^\)]+)\)')
        return pattern.findall(text)[0]
    return '', ''
def point_to_u_point(point):
    """Return ``point`` upper-cased and prefixed with ``U+`` if it is not already."""
    upper = point.upper()
    return upper if upper.startswith('U+') else 'U+' + upper
def gen_pua_data(gbk, unicode_4_1, pinyin_map):
    """Format one output line that maps a GBK PUA code point to its readings.

    :param gbk: ``(point, han)`` pair for the GBK private-use code point
    :param unicode_4_1: ``(point, han)`` pair for the Unicode 4.1 code point
    :param pinyin_map: mapping from ``U+XXXX`` keys to lists of readings
    :return: the formatted line; it is prefixed with ``'# '`` when no reading
             is known for the Unicode 4.1 code point
    """
    gbk_point, gbk_han = gbk
    gbk_point = point_to_u_point(gbk_point)
    unihan_point, unihan_han = unicode_4_1
    unihan_point = point_to_u_point(unihan_point)

    pinyins = ','.join(pinyin_map.get(unihan_point, []))
    # Lines without any reading are emitted commented out.
    prefix = '' if pinyins else '# '
    template = (
        '{prefix}{gbk_point}: {pinyins} # {gbk_han} '
        'Unihan: {unicode_4_1_point} {unicode_4_1_han}'
    )
    return template.format(
        prefix=prefix,
        gbk_point=gbk_point,
        pinyins=pinyins,
        gbk_han=gbk_han,
        unicode_4_1_point=unihan_point,
        unicode_4_1_han=unihan_han,
    )
if __name__ == '__main__':
    # Load the readings first, then print the header followed by one line
    # per PUA mapping entry.
    pinyin_map = get_pinyins('pinyin.txt')
    header = ('# GBK/GB 18030 PUA 映射\n'
              '# 详见https://zh.wikipedia.org/wiki/GB_18030#PUA')
    print(header)
    for pua_entry, unihan_entry in get_pua_map():
        print(gen_pua_data(pua_entry, unihan_entry, pinyin_map))

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""补充 8105 中汉字的拼音数据"""
from collections import namedtuple
import re
import sys
from pyquery import PyQuery
import requests
# Captures, as ``pinyin``, the non-space run that follows the pinyin label
# and is terminated by a space.
re_pinyin = re.compile(r'拼音:(?P<pinyin>\S+) ')
# Captures, as ``code``, the non-space run after the code-point label; one
# optional word character directly after the label is skipped first.
re_code = re.compile(r'统一码\w?(?P<code>\S+) ')
# Captures, as ``alternate``, the first non-space run after the variant label.
re_alternate = re.compile(r'异体字:\s+?(?P<alternate>\S+)')
# Record returned by the lookup helpers: reading, code and alternate form.
HanziInfo = namedtuple('HanziInfo', 'pinyin code alternate')
def fetch_html(url, params, timeout=30):
    """Fetch ``url`` with the given query parameters and return the raw body.

    :param url: address to request
    :param params: query-string parameters passed to ``requests.get``
    :param timeout: seconds to wait for the server before giving up; without
                    a timeout ``requests`` can block indefinitely
    :return: the response body as bytes
    """
    response = requests.get(url, params=params, timeout=timeout)
    return response.content
def fetch_info(hanzi):
    """Look up one entry on guoxuedashi.com and extract its details.

    :param hanzi: value sent as the ``sokeyzi`` query parameter
    :return: ``HanziInfo(pinyin, code, alternate)``; ``pinyin`` and
             ``alternate`` are ``''`` when their patterns do not match
    :raises AttributeError: if the code pattern is not found in the page text
    """
    query = {
        'sokeyzi': hanzi,
        'kz': 1,
        'submit': '',
    }
    page = fetch_html('http://www.guoxuedashi.com/zidian/so.php', query)

    # Narrow the document to the second ``table.zui td`` cell.
    cell = PyQuery(PyQuery(page)('table.zui td')[1])
    text = cell('tr').text()
    # NOTE(review): this lookup is issued through the narrowed cell object,
    # exactly as before - confirm it was not meant to query the whole page.
    text_alternate = cell(page)('.info_txt2')('em').text()

    pinyin_match = re_pinyin.search(text)
    pinyin = '' if pinyin_match is None else pinyin_match.group('pinyin')

    code = re_code.search(text).group('code')

    alternate_match = re_alternate.search(text_alternate)
    if alternate_match is None:
        alternate = ''
    else:
        alternate = alternate_match.group('alternate')
    return HanziInfo(pinyin, code, alternate)
def parse_hanzi(hanzi):
    """Fetch the details for ``hanzi``, following its alternate form if needed.

    The alternate form is looked up only when the entry itself has no reading
    but names an alternate; otherwise the ``alternate`` field is ``''``.

    :return: ``HanziInfo`` whose ``alternate`` is either ``''`` or another
             ``HanziInfo`` describing the alternate form
    """
    info = fetch_info(hanzi)
    needs_alternate = (not info.pinyin) and info.alternate
    alternate = fetch_info(info.alternate) if needs_alternate else ''
    return HanziInfo(info.pinyin, info.code, alternate)
def main(lines):
# Generator over the input lines. Lines that start with '# U+' and contain
# the '<-' placeholder are looked up via parse_hanzi(); when a reading is
# found, the leading '# ' is dropped and '<-' is replaced by the reading.
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the statements below (in particular of the final yield) must be confirmed
# against the real file before restructuring.
for line in lines:
if line.startswith('# U+') and '<-' in line:
# "# U+xxx: ..." -> "U+xxx" (text before the first ':', stripped of '# ')
code = line.split(':')[0].strip('# ')
# "U+xxx" -> "xxx" (drop the two-character prefix)
code = code[2:]
info = parse_hanzi(code)
pinyin = info.pinyin
extra = ''
# No reading of its own: fall back to the alternate form and record its code.
if (not pinyin) and info.alternate:
alternate = info.alternate
pinyin = alternate.pinyin
extra = ' => U+{0}'.format(alternate.code)
# Several readings: keep the first, append the rest as a '?->' note.
if ',' in pinyin:
first_pinyin, extra_pinyin = pinyin.split(',', 1)
pinyin = first_pinyin
extra += ' ?-> ' + extra_pinyin
if pinyin:
line = line.strip()
# Drop the first two characters (the leading "# ") to un-comment the line.
line = line[2:]
line = line.replace('<-', pinyin)
if extra:
line += extra
yield line.strip()
if __name__ == '__main__':
    # The first command-line argument names the file to process; every
    # resulting line is written to standard output.
    input_file = sys.argv[1:][0]
    with open(input_file) as fp:
        for fixed_line in main(fp):
            print(fixed_line)

@ -0,0 +1,2 @@
pyquery==1.2.13
requests==2.20.0

@ -0,0 +1,20 @@
# help: list the available targets.
.PHONY: help
help:
@echo "parse parse Unihan database "
@echo "update update Unihan database"
@echo "diff diff between Unihan data and parsed data"
# parse: run parse_pinyin.py in the current directory.
.PHONY:parse
parse:
@python parse_pinyin.py
# update: remove the local Unihan* files, download and unpack Unihan.zip,
# then run the parser. The leading '-' lets make continue if rm fails.
.PHONY:update
update:
-rm Unihan*
wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip -O Unihan.zip
unzip Unihan.zip
python parse_pinyin.py
# diff: run diff.sh.
.PHONY:diff
diff:
@bash diff.sh

@ -0,0 +1,9 @@
# Unihan Database
http://www.unicode.org/charts/unihan.html
Update the Unihan database:
```
make update
```

@ -0,0 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'
# Print a table that compares, for each reading field, the number of lines in
# the parsed "<kind>.txt" file with the number of non-comment lines in
# Unihan_Readings.txt that mention the field.
function main() {
    local kind file unihanCount parsedCount
    printf '%-14s %-8s %-8s\n' '' 'parsed' 'Unihan'
    for kind in 'kHanyuPinyin' 'kMandarin' 'kHanyuPinlu' 'kXHC1983'
    do
        # Fail early on missing input so that the exit-status handling below
        # cannot hide an unreadable file.
        for file in 'Unihan_Readings.txt' "$kind".txt
        do
            if [[ ! -r "$file" ]]; then
                echo "cannot read $file" >&2
                return 1
            fi
        done
        # grep -c prints 0 but exits with status 1 when nothing matches; under
        # `set -euo pipefail` that would abort the script, so status 1 is
        # accepted while any other failure still propagates.
        unihanCount=$(grep -v '^#' Unihan_Readings.txt | grep -c "$kind" || [[ $? -eq 1 ]])
        parsedCount=$(grep -c "" "$kind".txt || [[ $? -eq 1 ]])
        printf '%-14s %-8s %-8s\n' "$kind" "$parsedCount" "$unihanCount"
    done
}
main

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
import functools
import operator
import re
from typing import Iterable
from typing import Iterator
from typing import Tuple
def re_match_pinyin_line(kind):
    """Compile the regex for one tab-separated data line of field ``kind``.

    The pattern matches ``U+<code>``, the field name and the raw value, and
    exposes the named groups ``code`` and ``pinyin``.
    """
    template = r'^U\+(?P<code>[0-9A-Z]+)\t{}\t(?P<pinyin>.+)$'
    return re.compile(template.format(kind))
# One reading: a run of characters that are not digits, dots or commas.
PINYIN = r'[^\d\.,]+'
# kHanyuPinyin value: comma-separated "ddddd.dd0" locations, a colon, then
# comma-separated readings. Group 1 holds all readings but the last (with
# their trailing commas), group 2 the last reading.
re_khanyupinyin = re.compile(r'''
(?:\d{5}\.\d{2}0,)*\d{5}\.\d{2}0:
((?:%(pinyin)s,)*)
(%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kMandarin value: a bare reading. The two empty groups make findall() return
# tuples, like the multi-group pattern above, instead of plain strings.
re_kmandarin = re.compile(r'''
()()
({pinyin})
'''.format(pinyin=PINYIN), re.X)
# kXHC1983 value: comma-separated "dddd.ddd" locations (each optionally
# followed by '*'), a colon, then one reading.
re_kxhc1983 = re.compile(r'''
()()[0-9]{4}\.[0-9]{3}\*?
(?:,[0-9]{4}\.[0-9]{3}\*?)*:
(%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# kHanyuPinlu value: a reading followed by a number in parentheses.
re_khanyupinlu = re.compile(r'''
()()({pinyin})\([0-9]+\)
'''.format(pinyin=PINYIN), re.X)
# kTGHZ2013 value: comma-separated "ddd.ddd" locations, a colon, one reading.
re_ktghz2013 = re.compile(r'''
()()[0-9]{3}\.[0-9]{3}
(?:,[0-9]{3}\.[0-9]{3})*:
(%(pinyin)s)
''' % ({'pinyin': PINYIN}), re.X)
# Field name -> pattern used to pull the readings out of that field's value.
re_kinds_map = {
'kHanyuPinyin': re_khanyupinyin,
'kMandarin': re_kmandarin,
'kXHC1983': re_kxhc1983,
'kHanyuPinlu': re_khanyupinlu,
'kTGHZ2013': re_ktghz2013,
}
def remove_dup_items(lst):
    """Return a new list with later duplicates of ``lst`` removed, order kept.

    Items are compared with ``==``/``in``, so they need not be hashable.
    """
    unique = []
    for candidate in lst:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique
def parse(lines: Iterable[str], kind: str = 'kHanyuPinyin',
          ignore_prefix: str = '#') -> Iterator[Tuple[str, str]]:
    """Yield ``(code, pinyin)`` pairs for every data line of field ``kind``.

    :param lines: lines of the readings file
    :param kind: field name; must be a key of ``re_kinds_map``
    :param ignore_prefix: lines starting with this prefix are skipped
    :return: generator of ``(code, pinyin)`` where ``code`` is the hex code
             point without ``U+`` and ``pinyin`` is the comma-joined list of
             distinct readings in first-seen order
    :raises KeyError: if ``kind`` is not a key of ``re_kinds_map``

    Lines that do not belong to ``kind`` are skipped, and so are lines whose
    value yields no reading (previously these raised ``TypeError`` from
    ``functools.reduce`` on an empty sequence).
    """
    re_line = re_match_pinyin_line(kind)
    re_pinyin = re_kinds_map[kind]
    for line in lines:
        line = line.strip()
        if line.startswith(ignore_prefix):
            continue
        match = re_line.match(line)
        if match is None:
            continue

        code = match.group('code')
        raw_pinyin = match.group('pinyin')
        # findall() returns one tuple of groups per match; a group may itself
        # hold several comma-separated readings, e.g.
        # [(' xī,', 'lǔ '), (' lǔ,', 'xī')] or [('shú,dú,', 'tù')].
        pinyins = []
        for groups in re_pinyin.findall(raw_pinyin):
            for group in groups:
                pinyins.extend(part.strip() for part in group.split(','))
        pinyins = remove_dup_items([p for p in pinyins if p])
        if not pinyins:
            continue

        yield code, ','.join(pinyins)
def save_data(pinyins, writer):
    """Write one ``U+<code>: <pinyin> # <char>`` line per parsed entry.

    :param pinyins: iterable of ``(code, pinyin)`` pairs, ``code`` being the
                    hexadecimal code point without the ``U+`` prefix
    :param writer: object with a ``write(str)`` method
    :raises ValueError: if ``code`` is not a valid hexadecimal code point
    """
    for code, pinyin in pinyins:
        # Convert the hex code point directly instead of exec()-ing generated
        # source text.
        hanzi = chr(int(code, 16))
        line = 'U+{code}: {pinyin} # {hanzi}\n'.format(
            code=code, pinyin=pinyin, hanzi=hanzi
        )
        writer.write(line)
if __name__ == '__main__':
    # The readings and the generated files contain non-ASCII text, so the
    # encoding is fixed to UTF-8 instead of depending on the locale default.
    with open('Unihan_Readings.txt', encoding='utf-8') as fp:
        # Read the input once and reuse it for every field.
        lines = fp.readlines()
    for kind in ('kHanyuPinyin', 'kMandarin',
                 'kHanyuPinlu', 'kXHC1983', 'kTGHZ2013'):
        with open('{}.txt'.format(kind), 'w', encoding='utf-8') as writer:
            save_data(parse(lines, kind=kind), writer)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,52 @@
"""汉字拼音转换工具."""
from pypinyin.constants import BOPOMOFO
from pypinyin.constants import BOPOMOFO_FIRST
from pypinyin.constants import CYRILLIC
from pypinyin.constants import CYRILLIC_FIRST
from pypinyin.constants import FINALS
from pypinyin.constants import FINALS_TONE
from pypinyin.constants import FINALS_TONE2
from pypinyin.constants import FINALS_TONE3
from pypinyin.constants import FIRST_LETTER
from pypinyin.constants import INITIALS
from pypinyin.constants import NORMAL
from pypinyin.constants import Style
from pypinyin.constants import STYLE_BOPOMOFO
from pypinyin.constants import STYLE_BOPOMOFO_FIRST
from pypinyin.constants import STYLE_CYRILLIC
from pypinyin.constants import STYLE_CYRILLIC_FIRST
from pypinyin.constants import STYLE_FINALS
from pypinyin.constants import STYLE_FINALS_TONE
from pypinyin.constants import STYLE_FINALS_TONE2
from pypinyin.constants import STYLE_FINALS_TONE3
from pypinyin.constants import STYLE_FIRST_LETTER
from pypinyin.constants import STYLE_INITIALS
from pypinyin.constants import STYLE_NORMAL
from pypinyin.constants import STYLE_TONE
from pypinyin.constants import STYLE_TONE2
from pypinyin.constants import STYLE_TONE3
from pypinyin.constants import TONE
from pypinyin.constants import TONE2
from pypinyin.constants import TONE3
from pypinyin.core import lazy_pinyin
from pypinyin.core import load_phrases_dict
from pypinyin.core import load_single_dict
from pypinyin.core import pinyin
from pypinyin.core import slug
__all__ = [
'pinyin', 'lazy_pinyin', 'slug', 'load_single_dict', 'load_phrases_dict',
'Style', 'STYLE_NORMAL', 'NORMAL', 'STYLE_TONE', 'TONE', 'STYLE_TONE2',
'TONE2', 'STYLE_TONE3', 'TONE3', 'STYLE_INITIALS', 'INITIALS',
'STYLE_FINALS', 'FINALS', 'STYLE_FINALS_TONE', 'FINALS_TONE',
'STYLE_FINALS_TONE2', 'FINALS_TONE2', 'STYLE_FINALS_TONE3', 'FINALS_TONE3',
'STYLE_FIRST_LETTER', 'FIRST_LETTER', 'STYLE_BOPOMOFO', 'BOPOMOFO',
'STYLE_BOPOMOFO_FIRST', 'BOPOMOFO_FIRST', 'STYLE_CYRILLIC', 'CYRILLIC',
'STYLE_CYRILLIC_FIRST', 'CYRILLIC_FIRST'
]
__title__ = 'pypinyin'
__version__ = '0.41.0'
__license__ = 'MIT'
__author__ = 'Hui Zhang'
__copyright__ = 'Copyright (c) 2021 Hui Zhang'

@ -0,0 +1,5 @@
#!/usr/bin/env python3
from pypinyin.runner import main
if __name__ == '__main__':
main()

@ -0,0 +1,99 @@
import os
import re
from enum import IntEnum
from enum import unique
from pypinyin import pinyin_dict
# True when a character outside the BMP counts as a single code unit.
SUPPORT_UCS4 = len('\U00020000') == 1
# Phrase pinyin dictionary; left empty when PYPINYIN_NO_PHRASES is set.
if os.environ.get('PYPINYIN_NO_PHRASES'):
PHRASES_DICT = {}
else:
from pypinyin import phrases_dict
PHRASES_DICT = phrases_dict.phrases_dict # type: Dict[Text, List[List[Text]]]
# Single-character pinyin dictionary.
PINYIN_DICT = pinyin_dict.pinyin_dict # type: Dict[int, Text]
# Setting PYPINYIN_NO_DICT_COPY skips the copies below (useful when no custom
# dictionaries are loaded) to reduce memory usage.
if not os.environ.get('PYPINYIN_NO_DICT_COPY'):
PINYIN_DICT = PINYIN_DICT.copy()
PHRASES_DICT = PHRASES_DICT.copy()
# Matches a trailing letter + digit pair, i.e. a tone written as a number.
RE_TONE2 = re.compile(r'([aeoiuvnm])([1-4])$')
# Characters that have pinyin.
# https://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php
# https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Guide/Regular_Expressions
# NOTE(review): some ranges start later than the block named in the trailing
# comment (2A703 vs 2A700, 2F80A vs 2F800) - presumably intentional, confirm.
if SUPPORT_UCS4:
RE_HANS = re.compile(r'^(?:['
r'\u3007' # U+3007 on its own
r'\u3400-\u4dbf' # CJK Extension A: [3400-4DBF]
r'\u4e00-\u9fff' # CJK Unified Ideographs: [4E00-9FFF]
r'\uf900-\ufaff' # CJK Compatibility Ideographs: [F900-FAFF]
r'\U00020000-\U0002A6DF' # CJK Extension B: [20000-2A6DF]
r'\U0002A703-\U0002B73F' # CJK Extension C: [2A700-2B73F]
r'\U0002B740-\U0002B81D' # CJK Extension D: [2B740-2B81D]
r'\U0002F80A-\U0002FA1F' # CJK Compatibility Supplement: [2F800-2FA1F]
r'])+$')
else:
RE_HANS = re.compile( # pragma: no cover
r'^(?:['
r'\u3007' # U+3007 on its own
r'\u3400-\u4dbf' # CJK Extension A: [3400-4DBF]
r'\u4e00-\u9fff' # CJK Unified Ideographs: [4E00-9FFF]
r'\uf900-\ufaff' # CJK Compatibility Ideographs: [F900-FAFF]
r'])+$')
@unique
class Style(IntEnum):
"""Pinyin output styles."""
#: Plain style without tones. E.g. 中国 -> ``zhong guo``
NORMAL = 0
#: Standard tone style, tone mark on the first letter of the final (the default style). E.g. 中国 -> ``zhōng guó``
TONE = 1
#: Tone style 2: the tone is written as a digit [1-4] after the final's letter. E.g. 中国 -> ``zho1ng guo2``
TONE2 = 2
#: Tone style 3: the tone is written as a digit [1-4] after each syllable. E.g. 中国 -> ``zhong1 guo2``
TONE3 = 8
#: Initials style: only the initial of each syllable (some syllables have no initial, see `#27`_). E.g. 中国 -> ``zh g``
INITIALS = 3
#: First-letter style: only the first letter of each syllable. E.g. 中国 -> ``z g``
FIRST_LETTER = 4
#: Finals style: only the final of each syllable, without tones. E.g. 中国 -> ``ong uo``
FINALS = 5
#: Standard finals style with tone marks on the first letter of the final. E.g. 中国 -> ``ōng uó``
FINALS_TONE = 6
#: Finals style 2: tone written as a digit [1-4] after the final's letter. E.g. 中国 -> ``o1ng uo2``
FINALS_TONE2 = 7
#: Finals style 3: tone written as a digit [1-4] after each syllable. E.g. 中国 -> ``ong1 uo2``
FINALS_TONE3 = 9
#: Bopomofo style with tones; the first tone is not marked. E.g. 中国 -> ``ㄓㄨㄥ ㄍㄨㄛˊ``
BOPOMOFO = 10
#: Bopomofo style, first symbol only. E.g. 中国 -> ``ㄓ ㄍ``
BOPOMOFO_FIRST = 11
#: Pinyin-to-Cyrillic style, tone written as a digit [1-4] after each syllable. E.g. 中国 -> ``чжун1 го2``
CYRILLIC = 12
#: Pinyin-to-Cyrillic style, first letter only. E.g. 中国 -> ``ч г``
CYRILLIC_FIRST = 13
# Module-level aliases: every Style member is also available under its bare
# name and under a STYLE_-prefixed name.
NORMAL = STYLE_NORMAL = Style.NORMAL
TONE = STYLE_TONE = Style.TONE
TONE2 = STYLE_TONE2 = Style.TONE2
TONE3 = STYLE_TONE3 = Style.TONE3
INITIALS = STYLE_INITIALS = Style.INITIALS
FIRST_LETTER = STYLE_FIRST_LETTER = Style.FIRST_LETTER
FINALS = STYLE_FINALS = Style.FINALS
FINALS_TONE = STYLE_FINALS_TONE = Style.FINALS_TONE
FINALS_TONE2 = STYLE_FINALS_TONE2 = Style.FINALS_TONE2
FINALS_TONE3 = STYLE_FINALS_TONE3 = Style.FINALS_TONE3
BOPOMOFO = STYLE_BOPOMOFO = Style.BOPOMOFO
BOPOMOFO_FIRST = STYLE_BOPOMOFO_FIRST = Style.BOPOMOFO_FIRST
CYRILLIC = STYLE_CYRILLIC = Style.CYRILLIC
CYRILLIC_FIRST = STYLE_CYRILLIC_FIRST = Style.CYRILLIC_FIRST

@ -0,0 +1,44 @@
from typing import Optional
from typing import Text
def right_mark_index(pinyin_no_tone: Text) -> Optional[int]:
    """Return the index of the letter that carries the tone mark.

    Placement rules, checked in this order:

    1. ``a`` always takes the mark; without ``a`` look for ``o``, then ``e``.
    2. For ``iu`` / ``ui`` the mark goes on the second letter.
    3. Otherwise the first of ``i``, ``u``, ``v``, ``ü`` that occurs.
    4. Otherwise the first of ``n``, ``m``, ``ê`` that occurs.

    http://www.hwjyw.com/resource/content/2010/06/04/8183.shtml
    https://www.zhihu.com/question/23655297
    https://github.com/mozillazg/python-pinyin/issues/160
    http://www.pinyin.info/rules/where.html

    :param pinyin_no_tone: a syllable without any tone information
    :return: index into ``pinyin_no_tone``, or ``None`` if no rule applies
    """
    # Each rule is (candidates in priority order, offset added to the position
    # of the first candidate found).
    rules = (
        (('a', 'o', 'e'), 0),
        (('iu', 'ui'), 1),
        (('i', 'u', 'v', 'ü'), 0),
        (('n', 'm', 'ê'), 0),
    )
    for candidates, offset in rules:
        for candidate in candidates:
            position = pinyin_no_tone.find(candidate)
            if position != -1:
                return position + offset
    return None

@ -0,0 +1,68 @@
import re
from typing import Any
from typing import Optional
from typing import Text
from typing import Tuple
from pypinyin import Style
from pypinyin.contrib._tone_rule import right_mark_index
_re_number = re.compile(r'\d')
class NeutralToneWith5Mixin():
    """Mark the neutral tone with ``5`` in the styles that write tones as digits.

    Usage::

        from pypinyin import lazy_pinyin, Style
        from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
        from pypinyin.converter import DefaultConverter
        from pypinyin.core import Pinyin

        # By default the neutral tone is not marked
        print(lazy_pinyin('好了', style=Style.TONE2))
        # Output: ['ha3o', 'le']

        class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
            pass

        my_pinyin = Pinyin(MyConverter())
        pinyin = my_pinyin.pinyin
        lazy_pinyin = my_pinyin.lazy_pinyin

        # Now the neutral tone is marked with ``5``
        print(lazy_pinyin('好了', style=Style.TONE2))
        # Output: ['ha3o', 'le5']
        print(pinyin('好了', style=Style.TONE2))
        # Output: [['ha3o'], ['le5']]
    """
    # Styles in which the tone is written as a digit.
    NUMBER_TONE = (Style.TONE2, Style.TONE3, Style.FINALS_TONE2,
                   Style.FINALS_TONE3)  # type: Tuple[Style, ...]
    # Subset of NUMBER_TONE that puts the digit at the end of the syllable.
    NUMBER_AT_END = (Style.TONE3, Style.FINALS_TONE3)  # type: Tuple[Style, ...]

    def post_convert_style(self,
                           han: Text,
                           orig_pinyin: Text,
                           converted_pinyin: Text,
                           style: Style,
                           strict: bool,
                           **kwargs: Any) -> Optional[Text]:
        """Append or insert ``5`` when the converted pinyin carries no tone digit.

        The next ``post_convert_style`` in the MRO runs first; its result, when
        not ``None``, replaces ``converted_pinyin``. For styles outside
        ``NUMBER_TONE`` that result is returned untouched.
        """
        pre_data = super().post_convert_style(
            han, orig_pinyin, converted_pinyin, style, strict, **kwargs)
        if style not in self.NUMBER_TONE:
            return pre_data

        if pre_data is not None:
            converted_pinyin = pre_data
        # A digit is already present: the syllable has a tone, nothing to do.
        if _re_number.search(converted_pinyin):
            return converted_pinyin

        if style in self.NUMBER_AT_END:
            return '{}5'.format(converted_pinyin)

        # Find the letter the tone belongs to and put the digit right after it.
        mark_index = right_mark_index(converted_pinyin)
        if mark_index is None:
            # No letter can carry the tone (right_mark_index found none);
            # put the digit at the end instead of failing on ``None + 1``.
            return '{}5'.format(converted_pinyin)
        before = converted_pinyin[:mark_index + 1]
        after = converted_pinyin[mark_index + 1:]
        return '{}5{}'.format(before, after)

@ -0,0 +1,341 @@
import re
from typing import Optional
from typing import Text
from pypinyin.contrib._tone_rule import right_mark_index
from pypinyin.style._constants import RE_TONE3
from pypinyin.style.tone import converter
from pypinyin.utils import _replace_tone2_style_dict_to_default
_re_number = re.compile(r'\d')
def _v_to_u(pinyin: Text, replace: bool=False) -> Text:
"""replace v to u
Args:
pinyin (Text): pinyin
replace (bool, optional): True, v to u; False, v as it is. Defaults to False.
Returns:
Text: new pinyin
"""
if not replace:
return pinyin
return pinyin.replace('v', 'ü')
def _fix_v_u(origin_py: Text, new_py: Text, v_to_u: bool) -> Text:
    """Settle whether the converted pinyin spells the vowel as ``v`` or ``ü``.

    Args:
        origin_py (Text): pinyin as passed in by the caller
        new_py (Text): pinyin after conversion
        v_to_u (bool): True to use ``ü``; False to keep the caller's spelling

    Returns:
        Text: ``new_py`` with ``ü`` turned back into ``v`` when ``v_to_u`` is
        False and the ``ü`` was introduced by the conversion; otherwise
        ``new_py`` with every ``v`` replaced by ``ü``.
    """
    umlaut_was_introduced = 'ü' in new_py and 'ü' not in origin_py
    if not v_to_u and umlaut_was_introduced:
        return new_py.replace('ü', 'v')
    return _v_to_u(new_py, replace=True)
def _get_number_from_pinyin(pinyin: Text) -> Optional[Text]:
    """Return the first digit found in ``pinyin``.

    Args:
        pinyin (Text): pinyin that may contain a tone digit

    Returns:
        Optional[Text]: the digit as a one-character string (not an ``int``),
        or None when ``pinyin`` contains no digit
    """
    match = _re_number.search(pinyin)
    return match.group() if match else None
def _improve_tone3(tone3: Text, neutral_tone_with_5: bool=False) -> Text:
    """Append ``5`` to a pinyin without tone digit, if requested.

    Args:
        tone3 (Text): pinyin with the tone digit (if any) already in place
        neutral_tone_with_5 (bool, optional): True to mark a missing tone
            with ``5``. Defaults to False.

    Returns:
        Text: ``tone3``, with ``5`` appended when it had no digit and
        ``neutral_tone_with_5`` is True
    """
    if neutral_tone_with_5 and _get_number_from_pinyin(tone3) is None:
        return '{}5'.format(tone3)
    return tone3
def tone_to_tone3(tone: Text,
                  v_to_u: bool=False,
                  neutral_tone_with_5: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE` pinyin to
    :py:attr:`~pypinyin.Style.TONE3` pinyin.

    :param tone: pinyin in :py:attr:`~pypinyin.Style.TONE` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :param neutral_tone_with_5: whether to mark the neutral tone with ``5``
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE3` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone_to_tone3
      >>> tone_to_tone3('zhōng')
      'zhong1'
      >>> tone_to_tone3('shang', neutral_tone_with_5=True)
      'shang5'
      >>> tone_to_tone3('lüè', v_to_u=True)
      'lüe4'
    """
    with_number = _improve_tone3(
        converter.to_tone3(tone), neutral_tone_with_5=neutral_tone_with_5)
    return _v_to_u(with_number, v_to_u)
def tone_to_tone2(tone: Text,
                  v_to_u: bool=False,
                  neutral_tone_with_5: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE` pinyin to
    :py:attr:`~pypinyin.Style.TONE2` pinyin.

    :param tone: pinyin in :py:attr:`~pypinyin.Style.TONE` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :param neutral_tone_with_5: whether to mark the neutral tone with ``5``
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE2` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone_to_tone2
      >>> tone_to_tone2('zhōng')
      'zho1ng'
      >>> tone_to_tone2('shang', neutral_tone_with_5=True)
      'sha5ng'
      >>> tone_to_tone2('lüè', v_to_u=True)
      'lüe4'
    """
    # Go through TONE3 first, then move the digit next to the marked letter.
    as_tone3 = tone_to_tone3(
        tone, v_to_u=v_to_u, neutral_tone_with_5=neutral_tone_with_5)
    return _v_to_u(tone3_to_tone2(as_tone3), v_to_u)
def tone_to_normal(tone: Text, v_to_u: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE` pinyin to
    :py:attr:`~pypinyin.Style.NORMAL` pinyin.

    :param tone: pinyin in :py:attr:`~pypinyin.Style.TONE` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :return: pinyin in :py:attr:`~pypinyin.Style.NORMAL` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone_to_normal
      >>> tone_to_normal('zhōng')
      'zhong'
      >>> tone_to_normal('lüè', v_to_u=True)
      'lüe'
    """
    numbered = tone_to_tone2(tone, v_to_u=v_to_u)
    without_digits = _re_number.sub('', numbered)
    return _v_to_u(without_digits, v_to_u)
def tone2_to_normal(tone2: Text, v_to_u: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE2` pinyin to
    :py:attr:`~pypinyin.Style.NORMAL` pinyin.

    :param tone2: pinyin in :py:attr:`~pypinyin.Style.TONE2` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :return: pinyin in Style.NORMAL style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone2_to_normal
      >>> tone2_to_normal('zho1ng')
      'zhong'
      >>> tone2_to_normal('lüe4', v_to_u=True)
      'lüe'
    """
    return _v_to_u(_re_number.sub('', tone2), v_to_u)
def tone2_to_tone(tone2: Text) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE2` pinyin to
    :py:attr:`~pypinyin.Style.TONE` pinyin.

    :param tone2: pinyin in :py:attr:`~pypinyin.Style.TONE2` style
    :return: pinyin in Style.TONE style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone2_to_tone
      >>> tone2_to_tone('zho1ng')
      'zhōng'
    """
    marked = _replace_tone2_style_dict_to_default(tone2)
    return marked
def tone2_to_tone3(tone2: Text) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE2` pinyin to
    :py:attr:`~pypinyin.Style.TONE3` pinyin.

    :param tone2: pinyin in :py:attr:`~pypinyin.Style.TONE2` style
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE3` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone2_to_tone3
      >>> tone2_to_tone3('zho1ng')
      'zhong1'
    """
    # Swap the second and third captured groups of RE_TONE3.
    return RE_TONE3.sub(r'\1\3\2', tone2)
def tone3_to_normal(tone3: Text, v_to_u: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.NORMAL` pinyin.

    :param tone3: pinyin in :py:attr:`~pypinyin.Style.TONE3` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :return: pinyin in :py:attr:`~pypinyin.Style.NORMAL` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone3_to_normal
      >>> tone3_to_normal('zhong1')
      'zhong'
      >>> tone3_to_normal('lüe4', v_to_u=True)
      'lüe'
    """
    return _v_to_u(_re_number.sub('', tone3), v_to_u)
def tone3_to_tone(tone3: Text) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.TONE` pinyin.

    :param tone3: pinyin in :py:attr:`~pypinyin.Style.TONE3` style
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone3_to_tone
      >>> tone3_to_tone('zhong1')
      'zhōng'
    """
    return tone2_to_tone(tone3_to_tone2(tone3))
def tone3_to_tone2(tone3: Text) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.TONE2` pinyin.

    :param tone3: pinyin in :py:attr:`~pypinyin.Style.TONE3` style
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE2` style; ``tone3`` is
             returned unchanged when it contains no digit

    Usage::

      >>> from pypinyin.contrib.tone_convert import tone3_to_tone2
      >>> tone3_to_tone2('zhong1')
      'zho1ng'
    """
    number = _get_number_from_pinyin(tone3)
    if number is None:
        return tone3

    bare = tone3_to_normal(tone3)
    mark_index = right_mark_index(bare)
    if mark_index is None:
        # No letter qualifies for the mark: put the digit after the last one.
        mark_index = len(bare) - 1
    split_at = mark_index + 1
    return '{}{}{}'.format(bare[:split_at], number, bare[split_at:])
def to_normal(pinyin: Text, v_to_u: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE`,
    :py:attr:`~pypinyin.Style.TONE2` or
    :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.NORMAL` pinyin.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE`,
                   :py:attr:`~pypinyin.Style.TONE2` or
                   :py:attr:`~pypinyin.Style.TONE3` style
    :param v_to_u: whether to use ``ü`` in place of ``v``.
                   True: ``v`` becomes ``ü``; False: ``v`` is kept as is.
    :return: pinyin in :py:attr:`~pypinyin.Style.NORMAL` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_normal
      >>> to_normal('zhōng')
      'zhong'
      >>> to_normal('zho1ng')
      'zhong'
      >>> to_normal('zhong1')
      'zhong'
      >>> to_normal('lüè', v_to_u=True)
      'lüe'
    """
    # Convert with ``ü`` throughout; _fix_v_u then settles the final spelling.
    numbered = tone_to_tone2(pinyin, v_to_u=True)
    return _fix_v_u(pinyin, tone2_to_normal(numbered), v_to_u)
def to_tone(pinyin: Text) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE2` or
    :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.TONE` pinyin.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE2` or
                   :py:attr:`~pypinyin.Style.TONE3` style
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE` style; input without
             any digit is returned unchanged

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_tone
      >>> to_tone('zho1ng')
      'zhōng'
      >>> to_tone('zhong1')
      'zhōng'
    """
    if _re_number.search(pinyin):
        return tone2_to_tone(tone_to_tone2(pinyin))
    return pinyin
def to_tone2(pinyin: Text, v_to_u: bool=False,
             neutral_tone_with_5: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE` or
    :py:attr:`~pypinyin.Style.TONE3` pinyin to
    :py:attr:`~pypinyin.Style.TONE2` pinyin.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE` or
                   :py:attr:`~pypinyin.Style.TONE3` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :param neutral_tone_with_5: whether to mark the neutral tone with ``5``
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE2` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_tone2
      >>> to_tone2('zhōng')
      'zho1ng'
      >>> to_tone2('zhong1')
      'zho1ng'
      >>> to_tone2('shang', neutral_tone_with_5=True)
      'sha5ng'
      >>> to_tone2('lüè', v_to_u=True)
      'lüe4'
    """
    # Convert with ``ü`` throughout; _fix_v_u then settles the final spelling.
    as_tone3 = tone_to_tone3(
        pinyin, v_to_u=True, neutral_tone_with_5=neutral_tone_with_5)
    return _fix_v_u(pinyin, tone3_to_tone2(as_tone3), v_to_u)
def to_tone3(pinyin: Text, v_to_u: bool=False,
             neutral_tone_with_5: bool=False) -> Text:
    """Convert :py:attr:`~pypinyin.Style.TONE` or
    :py:attr:`~pypinyin.Style.TONE2` pinyin to
    :py:attr:`~pypinyin.Style.TONE3` pinyin.

    :param pinyin: pinyin in :py:attr:`~pypinyin.Style.TONE` or
                   :py:attr:`~pypinyin.Style.TONE2` style
    :param v_to_u: whether to use ``ü`` in place of ``v``
    :param neutral_tone_with_5: whether to mark the neutral tone with ``5``
    :return: pinyin in :py:attr:`~pypinyin.Style.TONE3` style

    Usage::

      >>> from pypinyin.contrib.tone_convert import to_tone3
      >>> to_tone3('zhōng')
      'zhong1'
      >>> to_tone3('zho1ng')
      'zhong1'
      >>> to_tone3('shang', neutral_tone_with_5=True)
      'shang5'
      >>> to_tone3('lüè', v_to_u=True)
      'lüe4'
    """
    # Convert with ``ü`` throughout; _fix_v_u then settles the final spelling.
    s = tone_to_tone2(
        pinyin, v_to_u=True, neutral_tone_with_5=neutral_tone_with_5)
    s = tone2_to_tone3(s)
    return _fix_v_u(pinyin, s, v_to_u)

@ -0,0 +1,44 @@
from typing import Any
from typing import Optional
from typing import Text
from pypinyin.constants import Style
class V2UMixin():
    """Use ``ü`` instead of ``v`` in the converted pinyin.

    Usage::

        from pypinyin import lazy_pinyin, Style
        from pypinyin.contrib.uv import V2UMixin
        from pypinyin.converter import DefaultConverter
        from pypinyin.core import Pinyin

        # By default ``v`` stands for ``ü``
        print(lazy_pinyin('战略'))
        # Output: ['zhan', 'lve']

        class MyConverter(V2UMixin, DefaultConverter):
            pass

        my_pinyin = Pinyin(MyConverter())
        pinyin = my_pinyin.pinyin
        lazy_pinyin = my_pinyin.lazy_pinyin

        # Now ``ü`` is used in place of ``v``
        print(lazy_pinyin('战略'))
        # Output: ['zhan', 'lüe']
        print(pinyin('战略', style=Style.NORMAL))
        # Output: [['zhan'], ['lüe']]
    """

    def post_convert_style(self,
                           han: Text,
                           orig_pinyin: Text,
                           converted_pinyin: Text,
                           style: Style,
                           strict: bool,
                           **kwargs: Any) -> Optional[Text]:
        """Replace ``v`` with ``ü`` in the result of the style conversion.

        The next ``post_convert_style`` in the MRO runs first; its result is
        used in place of ``converted_pinyin`` unless it is ``None``.
        """
        pre_data = super().post_convert_style(
            han, orig_pinyin, converted_pinyin, style, strict, **kwargs)
        result = converted_pinyin if pre_data is None else pre_data
        return result.replace('v', 'ü')

@ -0,0 +1,459 @@
from copy import deepcopy
from typing import Any
from typing import Callable
from typing import List
from typing import Optional
from typing import Text
from typing import Union
from pypinyin.constants import PHRASES_DICT
from pypinyin.constants import PINYIN_DICT
from pypinyin.constants import RE_HANS
from pypinyin.constants import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.contrib.uv import V2UMixin
from pypinyin.style import auto_discover
from pypinyin.style import convert as convert_style
from pypinyin.utils import _remove_dup_items
# Type aliases used by the annotations in this module.
TStyle = Style
# ``errors`` argument: either a callable taking the text or a string.
TErrors = Union[Callable[[Text], Text], Text]
# Conversion result: a list with one list of readings per position.
TPinyinResult = List[List[Text]]
TErrorResult = Union[Text, List[Text], None]
# What the no-pinyin handling helpers may return before normalisation.
TNoPinyinResult = Union[TPinyinResult, List[Text], Text, None]
# Runs at import time. NOTE(review): presumably registers the built-in style
# converters - confirm in pypinyin.style.
auto_discover()
class Converter():
    """Interface of the objects that turn text into pinyin."""

    def convert(self,
                words: Text,
                style: TStyle,
                heteronym: bool,
                errors: TErrors,
                strict: bool=...,
                **kwargs: Any) -> TPinyinResult:
        """Convert ``words`` to pinyin; must be provided by subclasses.

        :raises NotImplementedError: always, in this base class
        """
        # TODO: use ``abc`` module
        raise NotImplementedError  # pragma: no cover
class DefaultConverter(Converter):
def __init__(self, **kwargs: Any) -> None:
pass
def post_pinyin(self,
                han: Text,
                heteronym: bool,
                pinyin: TPinyinResult,
                **kwargs: Any) -> Union[TPinyinResult, None]:
    """Hook called once the pinyin data of ``han`` has been looked up.

    A return value other than ``None`` is used as the pinyin data of ``han``.
    This default implementation returns ``None``.

    :param han: a single character or a phrase
    :param heteronym: whether heteronyms are to be handled
    :param pinyin: pinyin data of the character, or the list of pinyin data
                   of the phrase
    :type pinyin: list
    :param kwargs: other keyword arguments, currently unused, kept for
                   future extension
    :return: ``None``, or a list replacing ``pinyin`` as the data of ``han``
    """
    return None
def _single_pinyin(self,
                   han: Text,
                   style: TStyle,
                   heteronym: bool,
                   errors: TErrors,
                   strict: bool) -> TPinyinResult:
    """Convert a single character.

    :param han: a single character
    :param errors: how to handle characters without pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :return: a list holding the list of readings; heteronyms yield several
             readings
    :rtype: list
    """
    code_point = ord(han)
    # Characters without an entry go through the no-pinyin handling.
    if code_point not in PINYIN_DICT:
        return self.handle_nopinyin(
            han,
            style=style,
            errors=errors,
            heteronym=heteronym,
            strict=strict)

    readings = PINYIN_DICT[code_point].split(',')  # readings of the character
    hooked = self.post_pinyin(han, heteronym, [readings])
    if hooked is not None:
        readings = hooked[0]

    if not heteronym:
        first = self.convert_style(
            han, readings[0], style=style, strict=strict)
        return [[first]]

    # Emit every reading once: converting to a style without tones can make
    # different readings collapse into the same text.
    # TODO: add test for cache
    seen = set()
    converted = []
    for orig_pinyin in readings:
        py = self.convert_style(
            han, orig_pinyin, style=style, strict=strict)
        if py not in seen:
            seen.add(py)
            converted.append(py)
    return [converted]
def _phrase_pinyin(self,
                   phrase: Text,
                   style: TStyle,
                   heteronym: bool,
                   errors: TErrors,
                   strict: bool) -> TPinyinResult:
    """Convert a phrase.

    :param phrase: the phrase
    :param errors: how to handle characters without pinyin
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :return: list of pinyin
    :rtype: list
    """
    if phrase not in PHRASES_DICT:
        # Unknown phrase: convert it character by character.
        collected = []
        for char in phrase:
            single = self._single_pinyin(
                char,
                style=style,
                heteronym=heteronym,
                errors=errors,
                strict=strict)
            if single:
                collected.extend(single)
        return collected

    # Work on a copy so the shared dictionary entry is never modified.
    result = deepcopy(PHRASES_DICT[phrase])
    hooked = self.post_pinyin(phrase, heteronym, result)
    if hooked is not None:
        result = hooked

    for idx, item in enumerate(result):
        han = phrase[idx]
        if heteronym:
            result[idx] = _remove_dup_items([
                self.convert_style(
                    han, orig_pinyin=x, style=style, strict=strict)
                for x in item
            ])
        else:
            result[idx] = [
                self.convert_style(
                    han,
                    orig_pinyin=item[0],
                    style=style,
                    strict=strict)
            ]
    return result
def convert(self,
            words: Text,
            style: TStyle,
            heteronym: bool,
            errors: TErrors,
            strict: bool=...,
            **kwargs: Any) -> TPinyinResult:
    """Convert ``words`` to pinyin in the requested style.

    NOTE(review): the default of ``strict`` is the literal ``...``, which is
    truthy; it looks like a leftover from a stub file - confirm.

    :param words: text to convert
    :type words: unicode
    :param style: pinyin style
    :param heteronym: whether heteronyms are enabled
    :type heteronym: bool
    :param errors: how to handle characters without pinyin
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :type strict: bool
    :return: the pinyin converted to ``style``
    :rtype: list
    """
    # First filter: text made up only of characters RE_HANS accepts is
    # converted as a phrase.
    if RE_HANS.match(words):
        return self._phrase_pinyin(
            words,
            style=style,
            heteronym=heteronym,
            errors=errors,
            strict=strict)

    converted = []
    fallback = self.handle_nopinyin(
        words,
        style=style,
        errors=errors,
        heteronym=heteronym,
        strict=strict)
    if fallback:
        converted.extend(fallback)
    return converted
def pre_convert_style(self,
                      han: Text,
                      orig_pinyin: Text,
                      style: TStyle,
                      strict: bool,
                      **kwargs: Any) -> Optional[Text]:
    """Hook called before the original toned pinyin is converted to ``style``.

    A return value other than ``None`` replaces ``orig_pinyin`` in the
    conversion that follows. This default implementation returns ``None``.

    :param han: the character being processed
    :param orig_pinyin: original pinyin of the character, with tone
    :param style: target pinyin style
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :param kwargs: other keyword arguments, currently unused, kept for
                   future extension
    :return: ``None``, or the pinyin to convert instead of ``orig_pinyin``
    """
    return None
def post_convert_style(self,
                       han: Text,
                       orig_pinyin: Text,
                       converted_pinyin: Text,
                       style: TStyle,
                       strict: bool,
                       **kwargs: Any) -> Optional[Text]:
    """Hook called after the original toned pinyin was converted to ``style``.

    A return value other than ``None`` replaces ``converted_pinyin`` as the
    final result of the style conversion. This default implementation
    returns ``None``.

    :param han: the character being processed
    :param orig_pinyin: original pinyin of the character, with tone
    :param converted_pinyin: the pinyin after conversion to ``style``
    :param style: target pinyin style
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :param kwargs: other keyword arguments, currently unused, kept for
                   future extension
    :return: ``None``, or the pinyin to use instead of ``converted_pinyin``
    """
    pass
def _convert_style(self,
                   han: Text,
                   pinyin: Text,
                   style: TStyle,
                   strict: bool,
                   default: Text,
                   **kwargs: Any) -> Text:
    """Delegate to the module-level style conversion; ``han`` is not used."""
    converted = convert_style(
        pinyin, style, strict, default=default, **kwargs)
    return converted
def convert_style(self,
                  han: Text,
                  orig_pinyin: Text,
                  style: TStyle,
                  strict: bool,
                  **kwargs: Any) -> Text:
    """Convert ``orig_pinyin`` to ``style`` and return the result.

    ``pre_convert_style`` is called before the conversion and
    ``post_convert_style`` after it.

    :param han: the character being processed
    :param orig_pinyin: original pinyin of the character, with tone
    :param style: pinyin style
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :param kwargs: other keyword arguments, currently unused, kept for
                   future extension
    :return: the pinyin converted to ``style``
    """
    pre_data = self.pre_convert_style(
        han, orig_pinyin, style=style, strict=strict)
    pinyin = orig_pinyin if pre_data is None else pre_data

    converted_pinyin = self._convert_style(
        han, pinyin, style=style, strict=strict, default=pinyin)

    post_data = self.post_convert_style(
        han, pinyin, converted_pinyin, style=style, strict=strict)
    return converted_pinyin if post_data is None else post_data
def pre_handle_nopinyin(self,
                        chars: Text,
                        style: TStyle,
                        heteronym: bool,
                        errors: TErrors,
                        strict: bool) -> TNoPinyinResult:
    """Hook called before text without pinyin is handled.

    A return value other than ``None`` is used as the result for that text
    and the built-in handling is skipped. This default implementation
    returns ``None``.

    :param chars: the text without pinyin
    :param errors: how to handle it
    :param heteronym: whether heteronyms are to be handled
    :return: ``None``, or a pinyin string / list of pinyin results used in
             place of the built-in handling of ``chars``
    """
    return None
def post_handle_nopinyin(self,
                         chars: Text,
                         style: Style,
                         heteronym: bool,
                         errors: TErrors,
                         strict: bool,
                         pinyin: TNoPinyinResult,
                         **kwargs: Any) -> TNoPinyinResult:
    """Hook called after text without pinyin has been handled.

    A return value other than ``None`` is used as the result for that text.
    This default implementation returns ``None``.

    :param chars: the text without pinyin
    :param errors: how to handle it
    :param heteronym: whether heteronyms are to be handled
    :param strict: whether the initials/finals related styles follow the
                   Hanyu Pinyin scheme strictly, see :ref:`strict`
    :param pinyin: the result of the handling: an empty list or a list
                   holding pinyin information
    :param kwargs: other keyword arguments, currently unused, kept for
                   future extension
    :return: ``None``, or the value to use instead of ``pinyin``
    """
    return None
def _convert_nopinyin_chars(self,
chars: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool) -> TNoPinyinResult:
"""转换没有拼音的字符。
"""
if callable(errors):
return errors(chars)
if errors == 'default':
return chars
elif errors == 'ignore':
return None
elif errors == 'replace':
if len(chars) > 1:
return ''.join(str('%x' % ord(x)) for x in chars)
else:
return str('%x' % ord(chars))
def handle_nopinyin(self,
                    chars: Text,
                    style: TStyle,
                    heteronym: bool,
                    errors: TErrors,
                    strict: bool,
                    **kwargs: Any) -> TPinyinResult:
    """Process a string that has no pinyin.

    ``pre_handle_nopinyin`` is called first; when it returns ``None`` the
    built-in ``_convert_nopinyin_chars`` produces the result instead.
    ``post_handle_nopinyin`` is then called and may replace that result.
    The final value is normalised into a list of lists.

    :param chars: the string without pinyin
    :param style: pinyin style
    :param heteronym: whether heteronym (multiple reading) results are wanted
    :param errors: how characters without pinyin should be handled
    :param strict: whether initials and finals strictly follow the Hanyu
                   Pinyin scheme, see :ref:`strict`
    :return: the normalised result; an empty list means the string is
             ignored
    :rtype: list
    """
    hook_options = dict(errors=errors, heteronym=heteronym, strict=strict)

    result = self.pre_handle_nopinyin(chars, style, **hook_options)
    if result is None:
        result = self._convert_nopinyin_chars(chars, style, **hook_options)

    replacement = self.post_handle_nopinyin(
        chars, style, **hook_options, pinyin=result)
    if replacement is not None:
        result = replacement

    # ``None``, '' and [] all mean "nothing to emit".
    if not result:
        return []
    # A bare value becomes a single entry with a single reading.
    if not isinstance(result, list):
        return [[result]]
    # A flat list holds one reading per entry.
    if not isinstance(result[0], list):
        return [[item] for item in result]
    # A nested list already carries heteronym information:
    # [[a, b], [c, d]] is kept as is, or reduced to [[a], [c]].
    if heteronym:
        return result
    return [[readings[0]] for readings in result]
class _v2UConverter(V2UMixin, DefaultConverter):
    """``DefaultConverter`` combined with ``V2UMixin``; adds nothing itself."""
class _neutralToneWith5Converter(NeutralToneWith5Mixin, DefaultConverter):
    """``DefaultConverter`` combined with ``NeutralToneWith5Mixin`` only."""
class _neutralToneWith5AndV2UConverter(NeutralToneWith5Mixin, V2UMixin,
                                       DefaultConverter):
    """``DefaultConverter`` with both mixins applied.

    ``NeutralToneWith5Mixin`` precedes ``V2UMixin`` in the base list.
    """
class _mixConverter(DefaultConverter):
    """Converter that picks its post-conversion behaviour from two flags.

    Depending on ``v_to_u`` and ``neutral_tone_with_five`` the
    ``post_convert_style`` step is delegated to one of three helper
    converters, or to ``DefaultConverter`` when neither flag is set.
    """

    def __init__(self, v_to_u=False, neutral_tone_with_five=False, **kwargs):
        """Store the two flags and build the helper converters.

        :param v_to_u: selects the ``V2UMixin`` based helper
        :param neutral_tone_with_five: selects the
            ``NeutralToneWith5Mixin`` based helper
        :param kwargs: forwarded to ``DefaultConverter.__init__``
        """
        super().__init__(**kwargs)
        self._v_to_u = v_to_u
        self._neutral_tone_with_five = neutral_tone_with_five
        # All three helpers are created up front, whatever the flags are.
        self._v2uconverter = _v2UConverter()
        self._neutraltonewith5converter = _neutralToneWith5Converter()
        self._neutraltonewith5andv2uconverter = (
            _neutralToneWith5AndV2UConverter())

    def post_convert_style(self, han, orig_pinyin, converted_pinyin, style,
                           strict, **kwargs):
        """Run the ``post_convert_style`` step matching the stored flags.

        :return: whatever the selected ``post_convert_style`` returns
        """
        wants_u = self._v_to_u
        wants_five = self._neutral_tone_with_five

        if wants_u and wants_five:
            delegate = self._neutraltonewith5andv2uconverter
        elif wants_u:
            delegate = self._v2uconverter
        elif wants_five:
            delegate = self._neutraltonewith5converter
        else:
            # Neither flag set: plain DefaultConverter behaviour.
            return super().post_convert_style(
                han, orig_pinyin, converted_pinyin, style, strict, **kwargs)

        return delegate.post_convert_style(
            han, orig_pinyin, converted_pinyin, style, strict, **kwargs)

@ -0,0 +1,333 @@
from itertools import chain
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Text
from typing import Union
from pypinyin.constants import PHRASES_DICT
from pypinyin.constants import PINYIN_DICT
from pypinyin.constants import Style
from pypinyin.converter import _mixConverter
from pypinyin.converter import Converter
from pypinyin.converter import DefaultConverter
from pypinyin.seg import mmseg
from pypinyin.seg import simpleseg
from pypinyin.utils import (_replace_tone2_style_dict_to_default)
# Type of the ``style`` arguments; an alias of ``Style``.
TStyle = Style
# Type of the ``errors`` arguments: either a callable taking text and
# returning text, or a policy name given as text.
TErrors = Union[Callable[[Text], Text], Text]
# Type of a conversion result: a list of lists of text.
TPinyinResult = List[List[Text]]
def load_single_dict(pinyin_dict: Dict[int, Text], style: str='default'):
    """Load a user-defined single-character pinyin dictionary.

    The entries are merged into ``PINYIN_DICT`` and ``mmseg`` is retrained
    afterwards.

    :param pinyin_dict: single-character pinyin dictionary,
                        e.g. ``{0x963F: u"ā,ē"}``
    :param style: style of the values in ``pinyin_dict``. With ``'tone2'``
                  every value is passed through
                  ``_replace_tone2_style_dict_to_default`` before being
                  stored; any other value stores the entries unchanged.
    :type pinyin_dict: dict
    """
    if style != 'tone2':
        PINYIN_DICT.update(pinyin_dict)
    else:
        for code_point, tone2_readings in pinyin_dict.items():
            PINYIN_DICT[code_point] = _replace_tone2_style_dict_to_default(
                tone2_readings)

    mmseg.retrain(mmseg.seg)
def load_phrases_dict(phrases_dict: Dict[Text, List[List[Text]]],
                      style: str='default'):
    """Load a user-defined phrase pinyin dictionary.

    The entries are merged into ``PHRASES_DICT`` and ``mmseg`` is retrained
    afterwards.

    :param phrases_dict: phrase pinyin dictionary,
                         e.g. ``{u"阿爸": [[u"ā"], [u"bà"]]}``
    :param style: style of the values in ``phrases_dict``. With ``'tone2'``
                  every pinyin string is passed through
                  ``_replace_tone2_style_dict_to_default`` before being
                  stored; any other value stores the entries unchanged.
    :type phrases_dict: dict
    """
    if style != 'tone2':
        PHRASES_DICT.update(phrases_dict)
    else:
        for phrase, syllables in phrases_dict.items():
            PHRASES_DICT[phrase] = [
                [_replace_tone2_style_dict_to_default(py) for py in readings]
                for readings in syllables
            ]

    mmseg.retrain(mmseg.seg)
class Pinyin:
    """Turn Chinese text into pinyin with a pluggable converter.

    Segmentation can be customised by overriding ``pre_seg``, ``post_seg``
    or ``get_seg``.
    """

    def __init__(self, converter: Converter=None, **kwargs: Any):
        """Remember the converter to use.

        :param converter: converter whose ``convert`` method produces the
                          pinyin; a ``DefaultConverter`` is created when
                          the argument is ``None`` (or otherwise falsy)
        :param kwargs: accepted but not used here
        """
        self._converter = converter if converter else DefaultConverter()

    def pinyin(self,
               hans: Union[List[Text], Text],
               style: TStyle=Style.TONE,
               heteronym: bool=False,
               errors: TErrors='default',
               strict: bool=True,
               **kwargs: Any) -> TPinyinResult:
        """Convert Chinese text to pinyin and return the list of results.

        :param hans: a string (``'你好吗'``) or a list of strings
                     (``['你好', '吗']``). A list lets callers pass text
                     that was already segmented by a tool of their choice.
        :type hans: unicode string or list of strings
        :param style: pinyin style, :py:attr:`~pypinyin.Style.TONE` by
                      default. See :class:`~pypinyin.Style`.
        :param errors: how to handle characters without pinyin, see
                       :ref:`handle_no_pinyin`

                       * ``'default'``: keep the original characters
                       * ``'ignore'``: drop the characters
                       * ``'replace'``: replace with the unicode code point
                         in hex, without the ``\\u`` prefix
                         (``'\\u90aa'`` => ``'90aa'``)
                       * callable: a callback or other callable object

        :param heteronym: whether to enable heteronyms
        :param strict: whether initials and finals strictly follow the
                       Hanyu Pinyin scheme, see :ref:`strict`
        :return: list of pinyin results
        :rtype: list
        """
        if isinstance(hans, str):
            words = self.seg(hans)
        else:
            # Every item is segmented first; the pieces are then walked in
            # their original order.
            words = chain.from_iterable([self.seg(item) for item in hans])

        return [
            py
            for word in words
            for py in self._converter.convert(
                word, style, heteronym, errors, strict=strict)
        ]

    def lazy_pinyin(self,
                    hans: Union[List[Text], Text],
                    style: TStyle=Style.NORMAL,
                    errors: TErrors='default',
                    strict: bool=True,
                    **kwargs: Any) -> List[Text]:
        """Convert Chinese text to a flat list of pinyin strings.

        Unlike :py:func:`~pypinyin.pinyin`, heteronyms are disabled and the
        nested result is flattened by one level.

        :param hans: Chinese text
        :type hans: unicode or list
        :param style: pinyin style, :py:attr:`~pypinyin.Style.NORMAL` by
                      default. See :class:`~pypinyin.Style`.
        :param errors: how to handle characters without pinyin, see
                       :py:func:`~pypinyin.pinyin`
        :param strict: whether initials and finals strictly follow the
                       Hanyu Pinyin scheme, see :ref:`strict`
        :return: list of pinyin (e.g. ``['zhong', 'guo', 'ren']``)
        :rtype: list
        """
        nested = self.pinyin(
            hans, style=style, heteronym=False, errors=errors, strict=strict)
        return list(chain.from_iterable(nested))

    def pre_seg(self, hans: Text, **kwargs: Any) -> Optional[List[Text]]:
        """Hook called by ``seg`` before the text is segmented.

        The default implementation returns ``None``. When an override
        returns a ``list``, ``seg`` treats it as the segmentation result
        and does not call the segmentation function.

        :param hans: the text before segmentation
        :return: ``None`` or ``list``
        """
        return None

    def post_seg(self, hans: Text, seg_data: List[Text],
                 **kwargs: Any) -> Optional[List[Text]]:
        """Hook called by ``seg`` after the text has been segmented.

        The default implementation returns ``None``. When an override
        returns a ``list``, ``seg`` returns that list instead of
        ``seg_data``.

        :param hans: the text before segmentation
        :param seg_data: the segmentation result
        :type seg_data: list
        :return: ``None`` or ``list``
        """
        return None

    def seg(self, hans: Text, **kwargs: Any) -> List[Text]:
        """Segment the text.

        ``pre_seg`` is called before segmentation and ``post_seg`` after.

        :param hans: the text to segment
        :return: the segmentation result
        """
        pieces = self.pre_seg(hans)
        if not isinstance(pieces, list):
            pieces = self.get_seg()(hans)

        adjusted = self.post_seg(hans, pieces)
        return adjusted if isinstance(adjusted, list) else pieces

    def get_seg(self, **kwargs: Any) -> Callable[[Text], List[Text]]:
        """Return the segmentation function.

        :return: the segmentation function (``simpleseg.seg``)
        """
        segmenter = simpleseg.seg
        return segmenter
# Shared module-level converter and ``Pinyin`` instance, created once at
# import time; ``slug`` uses ``_default_pinyin``.
_default_convert = DefaultConverter()
_default_pinyin = Pinyin(_default_convert)
def pinyin(hans: Union[List[Text], Text],
           style: TStyle=Style.TONE,
           heteronym: bool=False,
           errors: TErrors='default',
           strict: bool=True,
           v_to_u: bool=False,
           neutral_tone_with_five=False) -> List[List[Text]]:
    """Convert Chinese text to pinyin and return the list of results.

    A fresh ``Pinyin`` backed by a ``_mixConverter`` is built on every call
    from ``v_to_u`` and ``neutral_tone_with_five``.

    :param hans: a string (``'你好吗'``) or a list of strings
                 (``['你好', '吗']``). A list lets callers pass text that
                 was already segmented by a tool of their choice.
    :type hans: unicode string or list of strings
    :param style: pinyin style, :py:attr:`~pypinyin.Style.TONE` by default.
                  See :class:`~pypinyin.Style`.
    :param errors: how to handle characters without pinyin, see
                   :ref:`handle_no_pinyin`

                   * ``'default'``: keep the original characters
                   * ``'ignore'``: drop the characters
                   * ``'replace'``: replace with the unicode code point in
                     hex, without the ``\\u`` prefix
                     (``'\\u90aa'`` => ``'90aa'``)
                   * callable: a callback or other callable object

    :param heteronym: whether to enable heteronyms
    :param strict: whether initials and finals strictly follow the Hanyu
                   Pinyin scheme, see :ref:`strict`
    :param v_to_u: whether styles without tone marks use ``ü`` instead of
                   ``v``
    :type v_to_u: bool
    :param neutral_tone_with_five: whether styles that express tones as
                                   digits mark the neutral tone with 5
    :type neutral_tone_with_five: bool
    :return: list of pinyin results
    :rtype: list

    NOTE(review): earlier documentation mentions an ``AssertionError`` for
    non-unicode input; no such check is visible in this function, confirm
    against the converter.

    Usage::

      >>> from pypinyin import pinyin, Style
      >>> import pypinyin
      >>> pinyin('中心')
      [['zhōng'], ['xīn']]
      >>> pinyin('中心', heteronym=True)  # enable heteronym mode
      [['zhōng', 'zhòng'], ['xīn']]
      >>> pinyin('中心', style=Style.FIRST_LETTER)  # choose a pinyin style
      [['z'], ['x']]
      >>> pinyin('中心', style=Style.TONE2)
      [['zho1ng'], ['xi1n']]
      >>> pinyin('中心', style=Style.CYRILLIC)
      [['чжун1'], ['синь1']]
      >>> pinyin('战略', v_to_u=True, style=Style.NORMAL)
      [['zhan'], ['lüe']]
      >>> pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
      [['yi1'], ['shang5']]
    """
    converter = _mixConverter(
        v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five)
    return Pinyin(converter).pinyin(
        hans, style=style, heteronym=heteronym, errors=errors, strict=strict)
def slug(hans: Union[List[Text], Text],
         style: TStyle=Style.NORMAL,
         heteronym: bool=False,
         separator: Text='-',
         errors: TErrors='default',
         strict: bool=True) -> Text:
    """Convert Chinese text to pinyin and build a slug string from it.

    The conversion uses the shared module-level ``_default_pinyin``; every
    string in the nested result is joined with ``separator``.

    :param hans: Chinese text
    :type hans: unicode or list
    :param style: pinyin style, :py:attr:`~pypinyin.Style.NORMAL` by
                  default. See :class:`~pypinyin.Style`.
    :param heteronym: whether to enable heteronyms
    :param separator: separator placed between two pinyin strings
    :param errors: how to handle characters without pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether initials and finals strictly follow the Hanyu
                   Pinyin scheme, see :ref:`strict`
    :return: the slug string

    NOTE(review): earlier documentation mentions an ``AssertionError`` for
    non-unicode input; no such check is visible in this function, confirm
    against the converter.

    ::

      >>> import pypinyin
      >>> from pypinyin import Style
      >>> pypinyin.slug('中国人')
      'zhong-guo-ren'
      >>> pypinyin.slug('中国人', separator=' ')
      'zhong guo ren'
      >>> pypinyin.slug('中国人', style=Style.FIRST_LETTER)
      'z-g-r'
      >>> pypinyin.slug('中国人', style=Style.CYRILLIC)
      'чжун1-го2-жэнь2'
    """
    nested = _default_pinyin.pinyin(
        hans, style=style, heteronym=heteronym, errors=errors, strict=strict)
    return separator.join(chain.from_iterable(nested))
def lazy_pinyin(hans: Union[List[Text], Text],
                style: TStyle=Style.NORMAL,
                errors: TErrors='default',
                strict: bool=True,
                v_to_u: bool=False,
                neutral_tone_with_five: bool=False) -> List[Text]:
    """Convert Chinese text to a flat list of pinyin strings.

    Unlike :py:func:`~pypinyin.pinyin`, the result is a flat list of
    strings with a single reading per character. A fresh ``Pinyin`` backed
    by a ``_mixConverter`` is built on every call from ``v_to_u`` and
    ``neutral_tone_with_five``.

    :param hans: Chinese text
    :type hans: unicode or list
    :param style: pinyin style, :py:attr:`~pypinyin.Style.NORMAL` by
                  default. See :class:`~pypinyin.Style`.
    :param errors: how to handle characters without pinyin, see
                   :py:func:`~pypinyin.pinyin`
    :param strict: whether initials and finals strictly follow the Hanyu
                   Pinyin scheme, see :ref:`strict`
    :param v_to_u: whether styles without tone marks use ``ü`` instead of
                   ``v``
    :type v_to_u: bool
    :param neutral_tone_with_five: whether styles that express tones as
                                   digits mark the neutral tone with 5
    :type neutral_tone_with_five: bool
    :return: list of pinyin (e.g. ``['zhong', 'guo', 'ren']``)
    :rtype: list

    NOTE(review): earlier documentation mentions an ``AssertionError`` for
    non-unicode input; no such check is visible in this function, confirm
    against the converter.

    Usage::

      >>> from pypinyin import lazy_pinyin, Style
      >>> import pypinyin
      >>> lazy_pinyin('中心')
      ['zhong', 'xin']
      >>> lazy_pinyin('中心', style=Style.TONE)
      ['zhōng', 'xīn']
      >>> lazy_pinyin('中心', style=Style.FIRST_LETTER)
      ['z', 'x']
      >>> lazy_pinyin('中心', style=Style.TONE2)
      ['zho1ng', 'xi1n']
      >>> lazy_pinyin('中心', style=Style.CYRILLIC)
      ['чжун1', 'синь1']
      >>> lazy_pinyin('战略', v_to_u=True)
      ['zhan', 'lüe']
      >>> lazy_pinyin('衣裳', style=Style.TONE3, neutral_tone_with_five=True)
      ['yi1', 'shang5']
    """
    converter = _mixConverter(
        v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five)
    return Pinyin(converter).lazy_pinyin(
        hans, style=style, errors=errors, strict=strict)

@ -0,0 +1,41 @@
# Characters carrying a tone mark, mapped to the plain letter followed by the
# tone number.
#
# Keys built from a base letter plus a combining mark are written with
# explicit escapes so that the two code points cannot be lost or merged
# when the file is edited or copied.
phonetic_symbol = {
    "ā": "a1",
    "á": "a2",
    "ǎ": "a3",
    "à": "a4",
    "ē": "e1",
    "é": "e2",
    "ě": "e3",
    "è": "e4",
    "ō": "o1",
    "ó": "o2",
    "ǒ": "o3",
    "ò": "o4",
    "ī": "i1",
    "í": "i2",
    "ǐ": "i3",
    "ì": "i4",
    "ū": "u1",
    "ú": "u2",
    "ǔ": "u3",
    "ù": "u4",
    # üe
    "ü": "v",
    "ǖ": "v1",
    "ǘ": "v2",
    "ǚ": "v3",
    "ǜ": "v4",
    "ń": "n2",
    "ň": "n3",
    "ǹ": "n4",
    "m\u0304": "m1",  # m + combining macron, two code points
    "ḿ": "m2",
    "m\u0300": "m4",  # m + combining grave accent, two code points
    "\u00ea\u0304": "ê1",  # ê + combining macron, two code points
    "ế": "ê2",
    "\u00ea\u030c": "ê3",  # ê + combining caron, two code points
    "\u1ec1": "ê4",  # ê with grave accent, a single code point
}
# Reverse lookup: plain letter plus tone number -> tone-marked character.
phonetic_symbol_reverse = {
    plain: marked for marked, plain in phonetic_symbol.items()
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save