You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/tests/test_pinyin.py

562 lines
24 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
import pytest
from pypinyin import (pinyin, slug, lazy_pinyin, load_single_dict,
load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2,
FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC,
CYRILLIC_FIRST, Style)
from pypinyin.constants import SUPPORT_UCS4
from pypinyin.seg import simpleseg
def test_pinyin_initials():
"""包含声明和韵母的词语"""
hans = '中心'
# 默认风格,带声调
assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]
assert pinyin(hans, strict=False) == [['zh\u014dng'], ['x\u012bn']]
# 普通风格,不带声调
assert pinyin(hans, NORMAL) == [['zhong'], ['xin']]
assert pinyin(hans, NORMAL, strict=False) == [['zhong'], ['xin']]
# 声调风格,拼音声调在韵母第一个字母上
assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']]
assert pinyin(hans, TONE, strict=False) == [['zh\u014dng'], ['x\u012bn']]
# 声调风格2即拼音声调在各个声母之后用数字 [1-4] 进行表示
assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']]
assert pinyin(hans, TONE2, strict=False) == [['zho1ng'], ['xi1n']]
# 声调风格3即拼音声调在各个拼音之后用数字 [1-4] 进行表示
assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']]
assert pinyin(hans, TONE3, strict=False) == [['zhong1'], ['xin1']]
# 声母风格,只返回各个拼音的声母部分
assert pinyin(hans, INITIALS) == [['zh'], ['x']]
assert pinyin(hans, INITIALS, strict=False) == [['zh'], ['x']]
# 首字母风格,只返回拼音的首字母部分
assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']]
assert pinyin(hans, FIRST_LETTER, strict=False) == [['z'], ['x']]
# 注音风格,带声调
assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
assert pinyin(hans, BOPOMOFO, strict=False) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
# 注音风格,首字母
assert pinyin(hans, BOPOMOFO_FIRST) == [[''], ['']]
assert pinyin(hans, BOPOMOFO_FIRST, strict=False) == [[''], ['']]
# test CYRILLIC style
assert pinyin(hans, CYRILLIC) == [['чжун1'], ['синь1']]
assert pinyin(hans, CYRILLIC, strict=False) == [['чжун1'], ['синь1']]
# CYRILLIC_FIRST style return only first letters
assert pinyin(hans, CYRILLIC_FIRST) == [['ч'], ['с']]
assert pinyin(hans, CYRILLIC_FIRST, strict=False) == [['ч'], ['с']]
# 启用多音字模式
assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
['x\u012bn']]
assert pinyin(hans, heteronym=True, strict=False) == \
[['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]
# 韵母风格1只返回各个拼音的韵母部分不带声调
assert pinyin(hans, style=FINALS) == [['ong'], ['in']]
assert pinyin(hans, style=FINALS, strict=False) == [['ong'], ['in']]
# 韵母风格2带声调声调在韵母第一个字母上
assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']]
assert pinyin(hans, style=FINALS_TONE, strict=False) == \
[['\u014dng'], ['\u012bn']]
# 韵母风格2带声调声调在各个声母之后用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']]
assert pinyin(hans, style=FINALS_TONE2, strict=False) == \
[['o1ng'], ['i1n']]
# 韵母风格3带声调声调在各个拼音之后用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']]
assert pinyin(hans, style=FINALS_TONE3, strict=False) == \
[['ong1'], ['in1']]
def test_pinyin_finals():
"""只包含韵母的词语"""
hans = '嗷嗷'
assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, TONE3) == [['ao2'], ['ao2']]
assert pinyin(hans, INITIALS) == [[''], ['']]
assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']]
assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']]
assert pinyin(hans, BOPOMOFO_FIRST) == [[''], ['']]
assert pinyin(hans, CYRILLIC) == [['ао2'], ['ао2']]
assert pinyin(hans, CYRILLIC_FIRST) == [['а'], ['а']]
assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']]
assert pinyin('', heteronym=True) == \
[['a', 'ā', 'á', 'ǎ', 'à', 'è']]
assert pinyin(hans, style=FINALS) == [['ao'], ['ao']]
assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']]
def test_slug():
hans = '中心'
assert slug(hans) == 'zhong-xin'
assert slug(hans, heteronym=True) == 'zhong-xin'
def test_zh_and_en():
"""中英文混合的情况"""
# 中英文
hans = '中心'
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
# 中英文混合的固定词组
assert pinyin('黄山B股', style=TONE2) == \
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == \
[['we2i'], ['she1ng'], ['su4'], ['C']]
def test_others():
# 空字符串
assert pinyin('') == []
# 单个汉字
assert pinyin('') == [['y\xedng']]
# 中国 人
assert pinyin('中国人') == [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]
# 日文
assert pinyin('') == [['\u306e']]
# 没有读音的汉字,还不存在的汉字
assert pinyin('\u9fff') == [['\u9fff']]
def test_lazy_pinyin():
assert lazy_pinyin('中国人') == ['zhong', 'guo', 'ren']
assert lazy_pinyin('中心') == ['zhong', 'xin']
assert lazy_pinyin('中心', style=TONE) == ['zh\u014dng', 'x\u012bn']
assert lazy_pinyin('中心', style=INITIALS) == ['zh', 'x']
assert lazy_pinyin('中心', style=BOPOMOFO) == ['ㄓㄨㄥ', 'ㄒㄧㄣ']
assert lazy_pinyin('中心', style=CYRILLIC) == ['чжун1', 'синь1']
def test_seg():
hans = '音乐'
hans_seg = list(simpleseg(hans))
assert pinyin(hans_seg, style=TONE2) == [['yi1n'], ['yue4']]
# 中英文混合的固定词组
assert pinyin('黄山B股', style=TONE2) == \
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == \
[['we2i'], ['she1ng'], ['su4'], ['C']]
def test_custom_pinyin_dict():
hans = ''
try:
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
except AssertionError:
pass
load_single_dict({ord(''): 'jú,jié'})
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
def test_custom_pinyin_dict2():
hans = ['同行']
try:
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'ha2ng']
except AssertionError:
pass
load_phrases_dict({'同行': [['tóng'], ['xíng']]})
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']
def test_custom_pinyin_dict_tone2():
load_single_dict({ord(''): 'ce4,si4'}, style='tone2')
assert lazy_pinyin('', style=TONE2) == ['ce4']
assert pinyin('') == [['']]
def test_custom_pinyin_dict2_tone2():
load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
assert lazy_pinyin(['同行'], style=TONE2) == ['to4ng', 'ku1']
assert pinyin('同行') == [['tòng'], ['']]
# yapf: disable
def test_errors():
hans = (
('', {'style': TONE2}, [['a']]),
('啊a', {'style': TONE2}, [['a'], ['a']]),
# 非中文字符,没有拼音
('', {'style': TONE2}, [['\u2e81']]),
('', {'style': TONE2, 'errors': 'ignore'}, []),
('', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a'], ['b']]),
('⺁⺁', {'style': TONE2, 'heteronym': True,
'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a', 'b'], ['b', 'c']]),
# 中文字符,没有拼音
('', {'style': TONE2}, [['\u9fc5']]),
('', {'style': TONE2, 'errors': 'ignore'}, []),
('', {'style': TONE2, 'errors': '233'}, []),
('', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
('', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
('', {'style': TONE2, 'errors': lambda x: None}, []),
('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
[['a'], ['a']]),
('鿅鿅', {'style': TONE2, 'heteronym': True,
'errors': lambda x: [['a', 'b']]},
[['a', 'b'], ['a', 'b']]),
)
for han in hans:
assert pinyin(han[0], **han[1]) == han[2]
def test_errors_callable():
def foobar(chars):
return 'a' * len(chars)
class Foobar(object):
def __call__(self, chars):
return 'a' * len(chars)
n = 5
assert pinyin('' * n, errors=foobar) == [['a' * n]]
assert pinyin('' * n, errors=Foobar()) == [['a' * n]]
def test_simple_seg():
data = {
'北京abcc': 'be3i ji1ng abcc',
'你好にほんごРусский язык': 'ni3 ha3o にほんごРусский язык',
}
for h, p in data.items():
assert slug([h], style=TONE2, separator=' ') == p
hans = '你好にほんごРусский язык'
ret = 'ni3 ha3o'
assert slug(hans, style=TONE2, separator=' ', errors=lambda x: None) == ret
data_for_update = [
# 便宜的发音
[
['便宜'], {'style': TONE2}, ['pia2n', 'yi2']
],
[
['便宜从事'], {'style': TONE2}, ['bia4n', 'yi2', 'co2ng', 'shi4']
],
[
['便宜施行'], {'style': TONE2}, ['bia4n', 'yi2', 'shi1', 'xi2ng']
],
[
['便宜货'], {'style': TONE2}, ['pia2n', 'yi2', 'huo4']
],
[
['贪便宜'], {'style': TONE2}, ['ta1n', 'pia2n', 'yi2']
],
[
['讨便宜'], {'style': TONE2}, ['ta3o', 'pia2n', 'yi2']
],
[
['小便宜'], {'style': TONE2}, ['xia3o', 'pia2n', 'yi2']
],
[
['占便宜'], {'style': TONE2}, ['zha4n', 'pia2n', 'yi2']
],
#
[
'\u3400', {'style': TONE2}, ['qiu1'], # CJK 扩展 A:[3400-4DBF]
],
[
'\u4E00', {'style': TONE2}, ['yi1'], # CJK 基本:[4E00-9FFF]
],
# [
# '\uFA29', {'style': TONE2}, ['da3o'], # CJK 兼容:[F900-FAFF]
# ],
# 误把 yu 放到声母列表了
['', {'style': TONE2}, ['yu2']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄩˊ']],
['', {'style': CYRILLIC}, ['юй']],
['', {'style': TONE2}, ['yu3']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄩˇ']],
['', {'style': CYRILLIC}, ['юй']],
['', {'style': TONE2}, ['yua2n']],
['', {'style': FINALS}, ['van']],
['', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
['', {'style': CYRILLIC}, ['юань2']],
# y, w 也不是拼音, yu的韵母是v, yi的韵母是i, wu的韵母是u
['', {'style': INITIALS}, ['']],
['', {'style': TONE2}, ['ya']],
['', {'style': FINALS}, ['ia']],
['', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
['', {'style': CYRILLIC}, ['я']],
['', {'style': INITIALS}, ['']],
['', {'style': TONE2}, ['wu2']],
['', {'style': FINALS}, ['u']],
['', {'style': FINALS_TONE}, ['ú']],
['', {'style': BOPOMOFO}, ['ㄨˊ']],
['', {'style': CYRILLIC}, ['у2']],
['', {'style': TONE2}, ['yi1']],
['', {'style': FINALS}, ['i']],
['', {'style': BOPOMOFO}, ['']],
['', {'style': CYRILLIC}, ['и1']],
['', {'style': TONE2}, ['wa4n']],
['', {'style': FINALS}, ['uan']],
['', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
['', {'style': CYRILLIC}, ['вань4']],
# ju, qu, xu 的韵母应该是 v
['', {'style': FINALS_TONE}, ['ǜ']],
['', {'style': FINALS_TONE2}, ['v4']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
['', {'style': CYRILLIC}, ['цзюй4']],
['', {'style': FINALS_TONE}, ['ǚ']],
['', {'style': FINALS_TONE2}, ['v3']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
['', {'style': CYRILLIC}, ['цюй3']],
['', {'style': FINALS_TONE}, ['ǘ']],
['', {'style': FINALS_TONE2}, ['v2']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
['', {'style': CYRILLIC}, ['сюй2']],
# ń
['', {'style': NORMAL}, ['n']],
['', {'style': TONE}, ['ń']],
['', {'style': TONE2}, ['n2']],
['', {'style': INITIALS}, ['']],
['', {'style': FIRST_LETTER}, ['n']],
['', {'style': FINALS}, ['n']],
['', {'style': FINALS_TONE}, ['ń']],
['', {'style': FINALS_TONE2}, ['n2']],
['', {'style': BOPOMOFO}, ['ㄣˊ']],
['', {'style': CYRILLIC}, ['н2']],
# ḿ \u1e3f U+1E3F
['', {'style': NORMAL}, ['m']],
['', {'style': TONE}, ['ḿ']],
['', {'style': TONE2}, ['m2']],
['', {'style': INITIALS}, ['']],
['', {'style': FIRST_LETTER}, ['m']],
['', {'style': FINALS}, ['m']],
['', {'style': FINALS_TONE}, ['ḿ']],
['', {'style': FINALS_TONE2}, ['m2']],
['', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
['', {'style': CYRILLIC}, ['м2']],
# 41
['彷徨', {}, ['pang', 'huang']],
['彷徨', {'style': CYRILLIC}, ['пан2', 'хуан2']],
# 注音
['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤˋ']],
['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
['打量', {'style': CYRILLIC}, ['да3', 'лян4']],
['黄山b股', {'style': CYRILLIC}, ['хуан2', 'шань1', 'b', 'гу3']],
# 50
['打量', {'style': TONE2}, ['da3', 'lia4ng']],
['打量', {'style': TONE3}, ['da3', 'liang4']],
['侵略', {'style': TONE2}, ['qi1n', 'lve4']],
['侵略', {'style': TONE3}, ['qin1', 'lve4']],
['侵略', {'style': FINALS_TONE2}, ['i1n', 've4']],
['侵略', {'style': FINALS_TONE3}, ['in1', 've4']],
['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
['', {'style': TONE}, ['líng']],
# 二次分词
[['你要', '重新考虑OK'], {'style': TONE}, [
'', 'yào', 'chóng', 'xīn', 'kǎo', '', 'OK']],
]
@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
def test_update(hans, kwargs, result):
assert lazy_pinyin(hans, **kwargs) == result
@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
@pytest.mark.parametrize(
'han, result', [
['\U00020000', ['he']], # CJK 扩展 B:[20000-2A6DF]
['\U0002A79D', ['duo']], # CJK 扩展 C:[2A700-2B73F]
# ['\U0002B740', ['wu']], # CJK 扩展 D:[2B740-2B81D]
# ['\U0002F80A', ['seng']], # CJK 兼容扩展:[2F800-2FA1F]
]
)
def test_support_ucs4(han, result):
assert lazy_pinyin(han) == result
@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
@pytest.mark.parametrize(
'han', [
'\U00020000', # CJK 扩展 B:[20000-2A6DF]
'\U0002A79D', # CJK 扩展 C:[2A700-2B73F]
# '\U0002B740', # CJK 扩展 D:[2B740-2B81D]
# '\U0002F80A', # CJK 兼容扩展:[2F800-2FA1F]
]
)
def test_dont_support_ucs4(han):
assert pinyin(han) == [[han]]
def test_36():
hans = '两年前七斤喝醉了酒'
pys = ['liang', 'nian', 'qian', 'qi', 'jin', 'he', 'zui', 'le', 'jiu']
assert lazy_pinyin(hans) == pys
def test_with_unknown_style():
assert lazy_pinyin('中国') == ['zhong', 'guo']
assert lazy_pinyin('中国', style='unknown') == ['zhōng', 'guó']
assert pinyin('中国') == [['zhōng'], ['guó']]
assert pinyin('中国', style='unknown') == [['zhōng'], ['guó']]
@pytest.mark.parametrize('kwargs,result', [
[{}, [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(strict=False), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=NORMAL), [['zhong'], ['xin']]],
[dict(style=NORMAL, strict=False), [['zhong'], ['xin']]],
[dict(style=TONE), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=TONE, strict=False), [
['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=TONE2), [['zho1ng', 'zho4ng'], ['xi1n']]],
[dict(style=TONE2, strict=False), [['zho1ng', 'zho4ng'], ['xi1n']]],
[dict(style=TONE3), [['zhong1', 'zhong4'], ['xin1']]],
[dict(style=TONE3, strict=False), [['zhong1', 'zhong4'], ['xin1']]],
[dict(style=INITIALS), [['zh'], ['x']]],
[dict(style=INITIALS, strict=False), [['zh'], ['x']]],
[dict(style=FIRST_LETTER), [['z'], ['x']]],
[dict(style=FIRST_LETTER, strict=False), [['z'], ['x']]],
[dict(style=FINALS), [['ong'], ['in']]],
[dict(style=FINALS, strict=False), [['ong'], ['in']]],
[dict(style=FINALS_TONE), [['\u014dng', '\xf2ng'], ['\u012bn']]],
[dict(style=FINALS_TONE, strict=False), [
['\u014dng', '\xf2ng'], ['\u012bn']]],
[dict(style=FINALS_TONE2), [['o1ng', 'o4ng'], ['i1n']]],
[dict(style=FINALS_TONE2, strict=False), [['o1ng', 'o4ng'], ['i1n']]],
[dict(style=FINALS_TONE3), [['ong1', 'ong4'], ['in1']]],
[dict(style=FINALS_TONE3, strict=False), [['ong1', 'ong4'], ['in1']]],
])
def test_heteronym_and_style(kwargs, result):
hans = '中心'
kwargs['heteronym'] = True
assert pinyin(hans, **kwargs) == result
@pytest.mark.parametrize('kwargs,result', [
[{}, [['zhāo'], ['yáng']]],
[dict(heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(strict=False), [['zhāo'], ['yáng']]],
[dict(strict=False, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(style=NORMAL), [['zhao'], ['yang']]],
[dict(style=NORMAL, heteronym=True), [['zhao', 'chao'], ['yang']]],
[dict(style=NORMAL, strict=False), [['zhao'], ['yang']]],
[dict(style=NORMAL, strict=False, heteronym=True), [['zhao', 'chao'],
['yang']]],
[dict(style=TONE), [['zhāo'], ['yáng']]],
[dict(style=TONE, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(style=TONE, strict=False), [['zhāo'], ['yáng']]],
[dict(style=TONE, strict=False, heteronym=True), [['zhāo', 'cháo'],
['yáng']]],
[dict(style=TONE2), [['zha1o'], ['ya2ng']]],
[dict(style=TONE2, heteronym=True), [['zha1o', 'cha2o'], ['ya2ng']]],
[dict(style=TONE2, strict=False), [['zha1o'], ['ya2ng']]],
[dict(style=TONE2, strict=False, heteronym=True), [['zha1o', 'cha2o'],
['ya2ng']]],
[dict(style=TONE3), [['zhao1'], ['yang2']]],
[dict(style=TONE3, heteronym=True), [['zhao1', 'chao2'], ['yang2']]],
[dict(style=TONE3, strict=False), [['zhao1'], ['yang2']]],
[dict(style=TONE3, strict=False, heteronym=True), [['zhao1', 'chao2'],
['yang2']]],
[dict(style=INITIALS), [['zh'], ['']]],
[dict(style=INITIALS, heteronym=True), [['zh', 'ch'], ['']]],
[dict(style=INITIALS, strict=False), [['zh'], ['y']]],
[dict(style=INITIALS, strict=False, heteronym=True), [['zh', 'ch'],
['y']]],
[dict(style=FIRST_LETTER), [['z'], ['y']]],
[dict(style=FIRST_LETTER, heteronym=True), [['z', 'c'], ['y']]],
[dict(style=FIRST_LETTER, strict=False), [['z'], ['y']]],
[dict(style=FIRST_LETTER, strict=False, heteronym=True), [['z', 'c'],
['y']]],
[dict(style=FINALS), [['ao'], ['iang']]],
[dict(style=FINALS, heteronym=True), [['ao'], ['iang']]],
[dict(style=FINALS, strict=False), [['ao'], ['ang']]],
[dict(style=FINALS, strict=False, heteronym=True), [['ao'], ['ang']]],
[dict(style=FINALS_TONE), [['āo'], ['iáng']]],
[dict(style=FINALS_TONE, heteronym=True), [['āo', 'áo'], ['iáng']]],
[dict(style=FINALS_TONE, strict=False), [['āo'], ['áng']]],
[dict(style=FINALS_TONE, strict=False, heteronym=True), [['āo', 'áo'],
['áng']]],
[dict(style=FINALS_TONE2), [['a1o'], ['ia2ng']]],
[dict(style=FINALS_TONE2, heteronym=True), [['a1o', 'a2o'], ['ia2ng']]],
[dict(style=FINALS_TONE2, strict=False), [['a1o'], ['a2ng']]],
[dict(style=FINALS_TONE2, strict=False, heteronym=True), [['a1o', 'a2o'],
['a2ng']]],
[dict(style=FINALS_TONE3), [['ao1'], ['iang2']]],
[dict(style=FINALS_TONE3, heteronym=True), [['ao1', 'ao2'], ['iang2']]],
[dict(style=FINALS_TONE3, strict=False), [['ao1'], ['ang2']]],
[dict(style=FINALS_TONE3, strict=False, heteronym=True), [['ao1', 'ao2'],
['ang2']]],
])
def test_heteronym_and_style_phrase(kwargs, result):
hans = '朝阳'
assert pinyin(hans, **kwargs) == result
def test_m4():
# U+5463: ḿ,móu,m̀ # 呣
han = ''
assert pinyin(han) == [['ḿ']]
assert pinyin(han, heteronym=True) == [['ḿ', '', 'móu']]
assert pinyin(
han, heteronym=True, style=NORMAL) == [['m', 'mou']]
assert pinyin(
han, heteronym=True, style=TONE) == [['ḿ', '', 'móu']]
assert pinyin(
han, heteronym=True, style=TONE2) == [['m2', 'm4', 'mo2u']]
assert pinyin(
han, heteronym=True, style=TONE3) == [['m2', 'm4', 'mou2']]
assert pinyin(
han, heteronym=True, style=INITIALS) == [['', 'm']] # TODO: fix ''
assert pinyin(
han, heteronym=True, style=FIRST_LETTER) == [['m']]
assert pinyin(
han, heteronym=True, style=FINALS) == [['m', 'ou']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE) == [['ḿ', '', 'óu']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE2) == [['m2', 'm4', 'o2u']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE3) == [['m2', 'm4', 'ou2']]
@pytest.mark.parametrize('han,style,expect', [
['', Style.TONE, ['ḿ', '']],
['', Style.TONE2, ['m2', 'm4']],
['', Style.TONE, ['', 'ḿ']],
['', Style.TONE2, ['m1', 'm2']],
['', Style.TONE, ['ê̄', 'ế', 'ê̌', '']],
['', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']],
])
def test_m_e(han, style, expect):
result = pinyin(han, style=style, heteronym=True)
assert len(result) == 1
assert (set(result[0]) & set(expect)) == set(expect)
if __name__ == '__main__':
import pytest
pytest.cmdline.main()