You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/tests/test_pinyin.py

562 lines
24 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
4 years ago
#!/usr/bin/env python3
import pytest
from pypinyin import (pinyin, slug, lazy_pinyin, load_single_dict,
load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2,
FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC,
CYRILLIC_FIRST, Style)
from pypinyin.constants import SUPPORT_UCS4
from pypinyin.seg import simpleseg
def test_pinyin_initials():
"""包含声明和韵母的词语"""
hans = '中心'
# 默认风格,带声调
assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]
assert pinyin(hans, strict=False) == [['zh\u014dng'], ['x\u012bn']]
# 普通风格,不带声调
assert pinyin(hans, NORMAL) == [['zhong'], ['xin']]
assert pinyin(hans, NORMAL, strict=False) == [['zhong'], ['xin']]
# 声调风格,拼音声调在韵母第一个字母上
assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']]
assert pinyin(hans, TONE, strict=False) == [['zh\u014dng'], ['x\u012bn']]
# 声调风格2即拼音声调在各个声母之后用数字 [1-4] 进行表示
assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']]
assert pinyin(hans, TONE2, strict=False) == [['zho1ng'], ['xi1n']]
# 声调风格3即拼音声调在各个拼音之后用数字 [1-4] 进行表示
assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']]
assert pinyin(hans, TONE3, strict=False) == [['zhong1'], ['xin1']]
# 声母风格,只返回各个拼音的声母部分
assert pinyin(hans, INITIALS) == [['zh'], ['x']]
assert pinyin(hans, INITIALS, strict=False) == [['zh'], ['x']]
# 首字母风格,只返回拼音的首字母部分
assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']]
assert pinyin(hans, FIRST_LETTER, strict=False) == [['z'], ['x']]
# 注音风格,带声调
assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
assert pinyin(hans, BOPOMOFO, strict=False) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
# 注音风格,首字母
assert pinyin(hans, BOPOMOFO_FIRST) == [[''], ['']]
assert pinyin(hans, BOPOMOFO_FIRST, strict=False) == [[''], ['']]
# test CYRILLIC style
assert pinyin(hans, CYRILLIC) == [['чжун1'], ['синь1']]
assert pinyin(hans, CYRILLIC, strict=False) == [['чжун1'], ['синь1']]
# CYRILLIC_FIRST style return only first letters
assert pinyin(hans, CYRILLIC_FIRST) == [['ч'], ['с']]
assert pinyin(hans, CYRILLIC_FIRST, strict=False) == [['ч'], ['с']]
# 启用多音字模式
assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
['x\u012bn']]
assert pinyin(hans, heteronym=True, strict=False) == \
[['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]
# 韵母风格1只返回各个拼音的韵母部分不带声调
assert pinyin(hans, style=FINALS) == [['ong'], ['in']]
assert pinyin(hans, style=FINALS, strict=False) == [['ong'], ['in']]
# 韵母风格2带声调声调在韵母第一个字母上
assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']]
assert pinyin(hans, style=FINALS_TONE, strict=False) == \
[['\u014dng'], ['\u012bn']]
# 韵母风格2带声调声调在各个声母之后用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']]
assert pinyin(hans, style=FINALS_TONE2, strict=False) == \
[['o1ng'], ['i1n']]
# 韵母风格3带声调声调在各个拼音之后用数字 [1-4] 进行表示
assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']]
assert pinyin(hans, style=FINALS_TONE3, strict=False) == \
[['ong1'], ['in1']]
def test_pinyin_finals():
"""只包含韵母的词语"""
hans = '嗷嗷'
assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, TONE3) == [['ao2'], ['ao2']]
assert pinyin(hans, INITIALS) == [[''], ['']]
assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']]
assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']]
assert pinyin(hans, BOPOMOFO_FIRST) == [[''], ['']]
assert pinyin(hans, CYRILLIC) == [['ао2'], ['ао2']]
assert pinyin(hans, CYRILLIC_FIRST) == [['а'], ['а']]
assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']]
assert pinyin('', heteronym=True) == \
[['a', 'ā', 'á', 'ǎ', 'à', 'è']]
assert pinyin(hans, style=FINALS) == [['ao'], ['ao']]
assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']]
assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']]
assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']]
def test_slug():
hans = '中心'
assert slug(hans) == 'zhong-xin'
assert slug(hans, heteronym=True) == 'zhong-xin'
def test_zh_and_en():
"""中英文混合的情况"""
# 中英文
hans = '中心'
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
# 中英文混合的固定词组
assert pinyin('黄山B股', style=TONE2) == \
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == \
[['we2i'], ['she1ng'], ['su4'], ['C']]
def test_others():
# 空字符串
assert pinyin('') == []
# 单个汉字
assert pinyin('') == [['y\xedng']]
# 中国 人
assert pinyin('中国人') == [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]
# 日文
assert pinyin('') == [['\u306e']]
# 没有读音的汉字,还不存在的汉字
assert pinyin('\u9fff') == [['\u9fff']]
def test_lazy_pinyin():
assert lazy_pinyin('中国人') == ['zhong', 'guo', 'ren']
assert lazy_pinyin('中心') == ['zhong', 'xin']
assert lazy_pinyin('中心', style=TONE) == ['zh\u014dng', 'x\u012bn']
assert lazy_pinyin('中心', style=INITIALS) == ['zh', 'x']
assert lazy_pinyin('中心', style=BOPOMOFO) == ['ㄓㄨㄥ', 'ㄒㄧㄣ']
assert lazy_pinyin('中心', style=CYRILLIC) == ['чжун1', 'синь1']
def test_seg():
hans = '音乐'
hans_seg = list(simpleseg(hans))
assert pinyin(hans_seg, style=TONE2) == [['yi1n'], ['yue4']]
# 中英文混合的固定词组
assert pinyin('黄山B股', style=TONE2) == \
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
assert pinyin('维生素C', style=TONE2) == \
[['we2i'], ['she1ng'], ['su4'], ['C']]
def test_custom_pinyin_dict():
hans = ''
try:
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
except AssertionError:
pass
load_single_dict({ord(''): 'jú,jié'})
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
def test_custom_pinyin_dict2():
hans = ['同行']
try:
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'ha2ng']
except AssertionError:
pass
load_phrases_dict({'同行': [['tóng'], ['xíng']]})
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']
def test_custom_pinyin_dict_tone2():
load_single_dict({ord(''): 'ce4,si4'}, style='tone2')
assert lazy_pinyin('', style=TONE2) == ['ce4']
assert pinyin('') == [['']]
def test_custom_pinyin_dict2_tone2():
load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
assert lazy_pinyin(['同行'], style=TONE2) == ['to4ng', 'ku1']
assert pinyin('同行') == [['tòng'], ['']]
# yapf: disable
def test_errors():
hans = (
('', {'style': TONE2}, [['a']]),
('啊a', {'style': TONE2}, [['a'], ['a']]),
# 非中文字符,没有拼音
('', {'style': TONE2}, [['\u2e81']]),
('', {'style': TONE2, 'errors': 'ignore'}, []),
('', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a'], ['b']]),
('⺁⺁', {'style': TONE2, 'heteronym': True,
'errors': lambda x: [['a', 'b'], ['b', 'c']]},
[['a', 'b'], ['b', 'c']]),
# 中文字符,没有拼音
('', {'style': TONE2}, [['\u9fc5']]),
('', {'style': TONE2, 'errors': 'ignore'}, []),
('', {'style': TONE2, 'errors': '233'}, []),
('', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
('', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
('', {'style': TONE2, 'errors': lambda x: None}, []),
('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
[['a'], ['a']]),
('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
[['a'], ['a']]),
('鿅鿅', {'style': TONE2, 'heteronym': True,
'errors': lambda x: [['a', 'b']]},
[['a', 'b'], ['a', 'b']]),
)
for han in hans:
assert pinyin(han[0], **han[1]) == han[2]
def test_errors_callable():
def foobar(chars):
return 'a' * len(chars)
class Foobar(object):
def __call__(self, chars):
return 'a' * len(chars)
n = 5
assert pinyin('' * n, errors=foobar) == [['a' * n]]
assert pinyin('' * n, errors=Foobar()) == [['a' * n]]
def test_simple_seg():
data = {
'北京abcc': 'be3i ji1ng abcc',
'你好にほんごРусский язык': 'ni3 ha3o にほんごРусский язык',
}
for h, p in data.items():
assert slug([h], style=TONE2, separator=' ') == p
hans = '你好にほんごРусский язык'
ret = 'ni3 ha3o'
assert slug(hans, style=TONE2, separator=' ', errors=lambda x: None) == ret
data_for_update = [
# 便宜的发音
[
['便宜'], {'style': TONE2}, ['pia2n', 'yi2']
],
[
['便宜从事'], {'style': TONE2}, ['bia4n', 'yi2', 'co2ng', 'shi4']
],
[
['便宜施行'], {'style': TONE2}, ['bia4n', 'yi2', 'shi1', 'xi2ng']
],
[
['便宜货'], {'style': TONE2}, ['pia2n', 'yi2', 'huo4']
],
[
['贪便宜'], {'style': TONE2}, ['ta1n', 'pia2n', 'yi2']
],
[
['讨便宜'], {'style': TONE2}, ['ta3o', 'pia2n', 'yi2']
],
[
['小便宜'], {'style': TONE2}, ['xia3o', 'pia2n', 'yi2']
],
[
['占便宜'], {'style': TONE2}, ['zha4n', 'pia2n', 'yi2']
],
#
[
'\u3400', {'style': TONE2}, ['qiu1'], # CJK 扩展 A:[3400-4DBF]
],
[
'\u4E00', {'style': TONE2}, ['yi1'], # CJK 基本:[4E00-9FFF]
],
# [
# '\uFA29', {'style': TONE2}, ['da3o'], # CJK 兼容:[F900-FAFF]
# ],
# 误把 yu 放到声母列表了
['', {'style': TONE2}, ['yu2']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄩˊ']],
['', {'style': CYRILLIC}, ['юй']],
['', {'style': TONE2}, ['yu3']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄩˇ']],
['', {'style': CYRILLIC}, ['юй']],
['', {'style': TONE2}, ['yua2n']],
['', {'style': FINALS}, ['van']],
['', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
['', {'style': CYRILLIC}, ['юань2']],
# y, w 也不是拼音, yu的韵母是v, yi的韵母是i, wu的韵母是u
['', {'style': INITIALS}, ['']],
['', {'style': TONE2}, ['ya']],
['', {'style': FINALS}, ['ia']],
['', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
['', {'style': CYRILLIC}, ['я']],
['', {'style': INITIALS}, ['']],
['', {'style': TONE2}, ['wu2']],
['', {'style': FINALS}, ['u']],
['', {'style': FINALS_TONE}, ['ú']],
['', {'style': BOPOMOFO}, ['ㄨˊ']],
['', {'style': CYRILLIC}, ['у2']],
['', {'style': TONE2}, ['yi1']],
['', {'style': FINALS}, ['i']],
['', {'style': BOPOMOFO}, ['']],
['', {'style': CYRILLIC}, ['и1']],
['', {'style': TONE2}, ['wa4n']],
['', {'style': FINALS}, ['uan']],
['', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
['', {'style': CYRILLIC}, ['вань4']],
# ju, qu, xu 的韵母应该是 v
['', {'style': FINALS_TONE}, ['ǜ']],
['', {'style': FINALS_TONE2}, ['v4']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
['', {'style': CYRILLIC}, ['цзюй4']],
['', {'style': FINALS_TONE}, ['ǚ']],
['', {'style': FINALS_TONE2}, ['v3']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
['', {'style': CYRILLIC}, ['цюй3']],
['', {'style': FINALS_TONE}, ['ǘ']],
['', {'style': FINALS_TONE2}, ['v2']],
['', {'style': FINALS}, ['v']],
['', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
['', {'style': CYRILLIC}, ['сюй2']],
# ń
['', {'style': NORMAL}, ['n']],
['', {'style': TONE}, ['ń']],
['', {'style': TONE2}, ['n2']],
['', {'style': INITIALS}, ['']],
['', {'style': FIRST_LETTER}, ['n']],
['', {'style': FINALS}, ['n']],
['', {'style': FINALS_TONE}, ['ń']],
['', {'style': FINALS_TONE2}, ['n2']],
['', {'style': BOPOMOFO}, ['ㄣˊ']],
['', {'style': CYRILLIC}, ['н2']],
# ḿ \u1e3f U+1E3F
['', {'style': NORMAL}, ['m']],
['', {'style': TONE}, ['ḿ']],
['', {'style': TONE2}, ['m2']],
['', {'style': INITIALS}, ['']],
['', {'style': FIRST_LETTER}, ['m']],
['', {'style': FINALS}, ['m']],
['', {'style': FINALS_TONE}, ['ḿ']],
['', {'style': FINALS_TONE2}, ['m2']],
['', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
['', {'style': CYRILLIC}, ['м2']],
# 41
['彷徨', {}, ['pang', 'huang']],
['彷徨', {'style': CYRILLIC}, ['пан2', 'хуан2']],
# 注音
['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤˋ']],
['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
['打量', {'style': CYRILLIC}, ['да3', 'лян4']],
['黄山b股', {'style': CYRILLIC}, ['хуан2', 'шань1', 'b', 'гу3']],
# 50
['打量', {'style': TONE2}, ['da3', 'lia4ng']],
['打量', {'style': TONE3}, ['da3', 'liang4']],
['侵略', {'style': TONE2}, ['qi1n', 'lve4']],
['侵略', {'style': TONE3}, ['qin1', 'lve4']],
['侵略', {'style': FINALS_TONE2}, ['i1n', 've4']],
['侵略', {'style': FINALS_TONE3}, ['in1', 've4']],
['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
['', {'style': TONE}, ['líng']],
# 二次分词
[['你要', '重新考虑OK'], {'style': TONE}, [
'', 'yào', 'chóng', 'xīn', 'kǎo', '', 'OK']],
]
@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
def test_update(hans, kwargs, result):
assert lazy_pinyin(hans, **kwargs) == result
@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
@pytest.mark.parametrize(
'han, result', [
['\U00020000', ['he']], # CJK 扩展 B:[20000-2A6DF]
['\U0002A79D', ['duo']], # CJK 扩展 C:[2A700-2B73F]
# ['\U0002B740', ['wu']], # CJK 扩展 D:[2B740-2B81D]
# ['\U0002F80A', ['seng']], # CJK 兼容扩展:[2F800-2FA1F]
]
)
def test_support_ucs4(han, result):
assert lazy_pinyin(han) == result
@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
@pytest.mark.parametrize(
'han', [
'\U00020000', # CJK 扩展 B:[20000-2A6DF]
'\U0002A79D', # CJK 扩展 C:[2A700-2B73F]
# '\U0002B740', # CJK 扩展 D:[2B740-2B81D]
# '\U0002F80A', # CJK 兼容扩展:[2F800-2FA1F]
]
)
def test_dont_support_ucs4(han):
assert pinyin(han) == [[han]]
def test_36():
hans = '两年前七斤喝醉了酒'
pys = ['liang', 'nian', 'qian', 'qi', 'jin', 'he', 'zui', 'le', 'jiu']
assert lazy_pinyin(hans) == pys
def test_with_unknown_style():
assert lazy_pinyin('中国') == ['zhong', 'guo']
assert lazy_pinyin('中国', style='unknown') == ['zhōng', 'guó']
assert pinyin('中国') == [['zhōng'], ['guó']]
assert pinyin('中国', style='unknown') == [['zhōng'], ['guó']]
@pytest.mark.parametrize('kwargs,result', [
[{}, [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(strict=False), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=NORMAL), [['zhong'], ['xin']]],
[dict(style=NORMAL, strict=False), [['zhong'], ['xin']]],
[dict(style=TONE), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=TONE, strict=False), [
['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
[dict(style=TONE2), [['zho1ng', 'zho4ng'], ['xi1n']]],
[dict(style=TONE2, strict=False), [['zho1ng', 'zho4ng'], ['xi1n']]],
[dict(style=TONE3), [['zhong1', 'zhong4'], ['xin1']]],
[dict(style=TONE3, strict=False), [['zhong1', 'zhong4'], ['xin1']]],
[dict(style=INITIALS), [['zh'], ['x']]],
[dict(style=INITIALS, strict=False), [['zh'], ['x']]],
[dict(style=FIRST_LETTER), [['z'], ['x']]],
[dict(style=FIRST_LETTER, strict=False), [['z'], ['x']]],
[dict(style=FINALS), [['ong'], ['in']]],
[dict(style=FINALS, strict=False), [['ong'], ['in']]],
[dict(style=FINALS_TONE), [['\u014dng', '\xf2ng'], ['\u012bn']]],
[dict(style=FINALS_TONE, strict=False), [
['\u014dng', '\xf2ng'], ['\u012bn']]],
[dict(style=FINALS_TONE2), [['o1ng', 'o4ng'], ['i1n']]],
[dict(style=FINALS_TONE2, strict=False), [['o1ng', 'o4ng'], ['i1n']]],
[dict(style=FINALS_TONE3), [['ong1', 'ong4'], ['in1']]],
[dict(style=FINALS_TONE3, strict=False), [['ong1', 'ong4'], ['in1']]],
])
def test_heteronym_and_style(kwargs, result):
hans = '中心'
kwargs['heteronym'] = True
assert pinyin(hans, **kwargs) == result
@pytest.mark.parametrize('kwargs,result', [
[{}, [['zhāo'], ['yáng']]],
[dict(heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(strict=False), [['zhāo'], ['yáng']]],
[dict(strict=False, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(style=NORMAL), [['zhao'], ['yang']]],
[dict(style=NORMAL, heteronym=True), [['zhao', 'chao'], ['yang']]],
[dict(style=NORMAL, strict=False), [['zhao'], ['yang']]],
[dict(style=NORMAL, strict=False, heteronym=True), [['zhao', 'chao'],
['yang']]],
[dict(style=TONE), [['zhāo'], ['yáng']]],
[dict(style=TONE, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
[dict(style=TONE, strict=False), [['zhāo'], ['yáng']]],
[dict(style=TONE, strict=False, heteronym=True), [['zhāo', 'cháo'],
['yáng']]],
[dict(style=TONE2), [['zha1o'], ['ya2ng']]],
[dict(style=TONE2, heteronym=True), [['zha1o', 'cha2o'], ['ya2ng']]],
[dict(style=TONE2, strict=False), [['zha1o'], ['ya2ng']]],
[dict(style=TONE2, strict=False, heteronym=True), [['zha1o', 'cha2o'],
['ya2ng']]],
[dict(style=TONE3), [['zhao1'], ['yang2']]],
[dict(style=TONE3, heteronym=True), [['zhao1', 'chao2'], ['yang2']]],
[dict(style=TONE3, strict=False), [['zhao1'], ['yang2']]],
[dict(style=TONE3, strict=False, heteronym=True), [['zhao1', 'chao2'],
['yang2']]],
[dict(style=INITIALS), [['zh'], ['']]],
[dict(style=INITIALS, heteronym=True), [['zh', 'ch'], ['']]],
[dict(style=INITIALS, strict=False), [['zh'], ['y']]],
[dict(style=INITIALS, strict=False, heteronym=True), [['zh', 'ch'],
['y']]],
[dict(style=FIRST_LETTER), [['z'], ['y']]],
[dict(style=FIRST_LETTER, heteronym=True), [['z', 'c'], ['y']]],
[dict(style=FIRST_LETTER, strict=False), [['z'], ['y']]],
[dict(style=FIRST_LETTER, strict=False, heteronym=True), [['z', 'c'],
['y']]],
[dict(style=FINALS), [['ao'], ['iang']]],
[dict(style=FINALS, heteronym=True), [['ao'], ['iang']]],
[dict(style=FINALS, strict=False), [['ao'], ['ang']]],
[dict(style=FINALS, strict=False, heteronym=True), [['ao'], ['ang']]],
[dict(style=FINALS_TONE), [['āo'], ['iáng']]],
[dict(style=FINALS_TONE, heteronym=True), [['āo', 'áo'], ['iáng']]],
[dict(style=FINALS_TONE, strict=False), [['āo'], ['áng']]],
[dict(style=FINALS_TONE, strict=False, heteronym=True), [['āo', 'áo'],
['áng']]],
[dict(style=FINALS_TONE2), [['a1o'], ['ia2ng']]],
[dict(style=FINALS_TONE2, heteronym=True), [['a1o', 'a2o'], ['ia2ng']]],
[dict(style=FINALS_TONE2, strict=False), [['a1o'], ['a2ng']]],
[dict(style=FINALS_TONE2, strict=False, heteronym=True), [['a1o', 'a2o'],
['a2ng']]],
[dict(style=FINALS_TONE3), [['ao1'], ['iang2']]],
[dict(style=FINALS_TONE3, heteronym=True), [['ao1', 'ao2'], ['iang2']]],
[dict(style=FINALS_TONE3, strict=False), [['ao1'], ['ang2']]],
[dict(style=FINALS_TONE3, strict=False, heteronym=True), [['ao1', 'ao2'],
['ang2']]],
])
def test_heteronym_and_style_phrase(kwargs, result):
hans = '朝阳'
assert pinyin(hans, **kwargs) == result
def test_m4():
# U+5463: ḿ,móu,m̀ # 呣
han = ''
assert pinyin(han) == [['ḿ']]
assert pinyin(han, heteronym=True) == [['ḿ', '', 'móu']]
assert pinyin(
han, heteronym=True, style=NORMAL) == [['m', 'mou']]
assert pinyin(
han, heteronym=True, style=TONE) == [['ḿ', '', 'móu']]
assert pinyin(
han, heteronym=True, style=TONE2) == [['m2', 'm4', 'mo2u']]
assert pinyin(
han, heteronym=True, style=TONE3) == [['m2', 'm4', 'mou2']]
assert pinyin(
han, heteronym=True, style=INITIALS) == [['', 'm']] # TODO: fix ''
assert pinyin(
han, heteronym=True, style=FIRST_LETTER) == [['m']]
assert pinyin(
han, heteronym=True, style=FINALS) == [['m', 'ou']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE) == [['ḿ', '', 'óu']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE2) == [['m2', 'm4', 'o2u']]
assert pinyin(
han, heteronym=True, style=FINALS_TONE3) == [['m2', 'm4', 'ou2']]
@pytest.mark.parametrize('han,style,expect', [
['', Style.TONE, ['ḿ', '']],
['', Style.TONE2, ['m2', 'm4']],
['', Style.TONE, ['', 'ḿ']],
['', Style.TONE2, ['m1', 'm2']],
['', Style.TONE, ['ê̄', 'ế', 'ê̌', '']],
['', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']],
])
def test_m_e(han, style, expect):
result = pinyin(han, style=style, heteronym=True)
assert len(result) == 1
assert (set(result[0]) & set(expect)) == set(expect)
if __name__ == '__main__':
import pytest
pytest.cmdline.main()