PaddleSpeech/third_party/python-pinyin/tests/test_pinyin.py

#!/usr/bin/env python3

import pytest

from pypinyin import (pinyin, slug, lazy_pinyin, load_single_dict,
                      load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
                      FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2,
                      FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC,
                      CYRILLIC_FIRST, Style)
from pypinyin.constants import SUPPORT_UCS4
from pypinyin.seg import simpleseg


def test_pinyin_initials():
    """Check each output style on a word whose syllables have an initial
    and a final, once with the default ``strict`` and once with
    ``strict=False``; both calls must give the same result."""
    hans = '中心'
    tone_marks = [['zh\u014dng'], ['x\u012bn']]
    # (positional args, keyword args, expected), checked in this order
    cases = [
        # default style: tone marks
        ((), {}, tone_marks),
        # NORMAL: no tones
        ((NORMAL,), {}, [['zhong'], ['xin']]),
        # TONE: tone marks on the vowel
        ((TONE,), {}, tone_marks),
        # TONE2: tone written as a digit inside the syllable
        ((TONE2,), {}, [['zho1ng'], ['xi1n']]),
        # TONE3: tone written as a digit at the end of the syllable
        ((TONE3,), {}, [['zhong1'], ['xin1']]),
        # INITIALS: only the initial of each syllable
        ((INITIALS,), {}, [['zh'], ['x']]),
        # FIRST_LETTER: only the first letter of each syllable
        ((FIRST_LETTER,), {}, [['z'], ['x']]),
        # BOPOMOFO: zhuyin symbols
        ((BOPOMOFO,), {}, [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]),
        # BOPOMOFO_FIRST: first zhuyin symbol only
        ((BOPOMOFO_FIRST,), {}, [['ㄓ'], ['ㄒ']]),
        # CYRILLIC: Cyrillic transcription with a tone digit
        ((CYRILLIC,), {}, [['чжун1'], ['синь1']]),
        # CYRILLIC_FIRST: first Cyrillic letter only
        ((CYRILLIC_FIRST,), {}, [['ч'], ['с']]),
        # heteronym mode: all readings of each character
        ((), {'heteronym': True},
         [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]),
        # FINALS: only the final, no tone
        ((), {'style': FINALS}, [['ong'], ['in']]),
        # FINALS_TONE: final with tone mark
        ((), {'style': FINALS_TONE}, [['\u014dng'], ['\u012bn']]),
        # FINALS_TONE2: final with the tone digit inside
        ((), {'style': FINALS_TONE2}, [['o1ng'], ['i1n']]),
        # FINALS_TONE3: final with the tone digit at the end
        ((), {'style': FINALS_TONE3}, [['ong1'], ['in1']]),
    ]
    for args, kwargs, expected in cases:
        assert pinyin(hans, *args, **kwargs) == expected
        assert pinyin(hans, *args, strict=False, **kwargs) == expected


def test_pinyin_finals():
    """Check each output style on syllables that consist of a final only."""
    hans = '嗷嗷'
    toned = [['\xe1o'], ['\xe1o']]
    numbered_inside = [['a2o'], ['a2o']]
    numbered_end = [['ao2'], ['ao2']]
    # (text, positional args, keyword args, expected), checked in this order
    cases = [
        (hans, (), {}, toned),
        (hans + 'abc', (), {}, [['\xe1o'], ['\xe1o'], ['abc']]),
        (hans, (NORMAL,), {}, [['ao'], ['ao']]),
        (hans, (TONE,), {}, toned),
        (hans, (TONE2,), {}, numbered_inside),
        (hans, (TONE3,), {}, numbered_end),
        # there is no initial, so INITIALS gives empty strings
        (hans, (INITIALS,), {}, [[''], ['']]),
        (hans, (FIRST_LETTER,), {}, [['a'], ['a']]),
        (hans, (BOPOMOFO,), {}, [['ㄠˊ'], ['ㄠˊ']]),
        (hans, (BOPOMOFO_FIRST,), {}, [['ㄠ'], ['ㄠ']]),
        (hans, (CYRILLIC,), {}, [['ао2'], ['ао2']]),
        (hans, (CYRILLIC_FIRST,), {}, [['а'], ['а']]),
        (hans, (), {'heteronym': True}, toned),
        ('啊', (), {'heteronym': True}, [['a', 'ā', 'á', 'ǎ', 'à', 'è']]),
        (hans, (), {'style': FINALS}, [['ao'], ['ao']]),
        (hans, (), {'style': FINALS_TONE}, toned),
        (hans, (), {'style': FINALS_TONE2}, numbered_inside),
        (hans, (), {'style': FINALS_TONE3}, numbered_end),
    ]
    for text, args, kwargs, expected in cases:
        assert pinyin(text, *args, **kwargs) == expected


def test_slug():
    """``slug('中心')`` is ``'zhong-xin'`` with and without ``heteronym``."""
    expected = 'zhong-xin'
    for options in ({}, {'heteronym': True}):
        assert slug('中心', **options) == expected


def test_zh_and_en():
    """Mixed Chinese/Latin input: Latin runs come back unchanged as their
    own items."""
    # Chinese followed by Latin letters
    mixed = '中心' + 'abc'
    assert pinyin(mixed) == [['zh\u014dng'], ['x\u012bn'], ['abc']]
    # fixed phrases that mix Chinese and Latin letters
    phrase_cases = [
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['AB'], ['cha1o'], ['C']]),
        ('AB阿C', [['AB'], ['a1'], ['C']]),
        ('维生素C', [['we2i'], ['she1ng'], ['su4'], ['C']]),
    ]
    for text, expected in phrase_cases:
        assert pinyin(text, style=TONE2) == expected


def test_others():
    """Edge cases: empty input, single characters, and characters that have
    no reading (returned as-is)."""
    cases = (
        # empty string
        ('', []),
        # single Chinese character
        ('營', [['y\xedng']]),
        # three-character phrase
        ('中国人', [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]),
        # Japanese kana
        ('の', [['\u306e']]),
        # Chinese code point without a reading / not assigned yet
        ('\u9fff', [['\u9fff']]),
    )
    for text, expected in cases:
        assert pinyin(text) == expected


def test_lazy_pinyin():
    """``lazy_pinyin`` returns a flat list; without ``style`` the result has
    no tones."""
    assert lazy_pinyin('中国人') == ['zhong', 'guo', 'ren']
    word = '中心'
    assert lazy_pinyin(word) == ['zhong', 'xin']
    styled = (
        (TONE, ['zh\u014dng', 'x\u012bn']),
        (INITIALS, ['zh', 'x']),
        (BOPOMOFO, ['ㄓㄨㄥ', 'ㄒㄧㄣ']),
        (CYRILLIC, ['чжун1', 'синь1']),
    )
    for style, expected in styled:
        assert lazy_pinyin(word, style=style) == expected


def test_seg():
    """``pinyin`` accepts the output of ``simpleseg`` and handles fixed
    phrases that mix Chinese and Latin letters."""
    segments = list(simpleseg('音乐'))
    assert pinyin(segments, style=TONE2) == [['yi1n'], ['yue4']]
    # fixed phrases that mix Chinese and Latin letters
    phrase_cases = [
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['AB'], ['cha1o'], ['C']]),
        ('AB阿C', [['AB'], ['a1'], ['C']]),
        ('维生素C', [['we2i'], ['she1ng'], ['su4'], ['C']]),
    ]
    for text, expected in phrase_cases:
        assert pinyin(text, style=TONE2) == expected


def test_custom_pinyin_dict():
    """A reading registered with ``load_single_dict`` is what
    ``lazy_pinyin`` returns for that character."""
    hans = '桔'
    # The reading before the custom entry is loaded is deliberately not
    # asserted; the call only has to complete without raising.
    lazy_pinyin(hans, style=TONE2)
    load_single_dict({ord('桔'): 'jú,jié'})
    assert lazy_pinyin(hans, style=TONE2) == ['ju2']


def test_custom_pinyin_dict2():
    """A phrase reading registered with ``load_phrases_dict`` is what
    ``lazy_pinyin`` returns for that phrase."""
    hans = ['同行']
    # The reading before the custom phrase is loaded is deliberately not
    # asserted; the call only has to complete without raising.
    lazy_pinyin(hans, style=TONE2)
    load_phrases_dict({'同行': [['tóng'], ['xíng']]})
    assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']


def test_custom_pinyin_dict_tone2():
    """Single-character readings may be loaded in ``tone2`` notation; the
    first one is returned in both TONE2 and the default style."""
    han = '桔'
    custom_readings = {ord(han): 'ce4,si4'}
    load_single_dict(custom_readings, style='tone2')
    assert lazy_pinyin(han, style=TONE2) == ['ce4']
    assert pinyin(han) == [['cè']]


def test_custom_pinyin_dict2_tone2():
    """Phrase readings may be loaded in ``tone2`` notation and are returned
    in both TONE2 and the default style."""
    phrase = '同行'
    custom_readings = {phrase: [['to4ng'], ['ku1']]}
    load_phrases_dict(custom_readings, style='tone2')
    assert lazy_pinyin([phrase], style=TONE2) == ['to4ng', 'ku1']
    assert pinyin(phrase) == [['tòng'], ['kū']]


# yapf: disable
def test_errors():
    """Table-driven check of the ``errors`` option for characters that have
    no pinyin (string modes and callables)."""
    # each case: (input text, keyword arguments for pinyin(), expected)
    cases = (
        ('啊', {'style': TONE2}, [['a']]),
        ('啊a', {'style': TONE2}, [['a'], ['a']]),
        # non-Chinese characters without pinyin
        ('⺁', {'style': TONE2}, [['\u2e81']]),
        ('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
        ('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a'], ['b']]),
        ('⺁⺁', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a', 'b'], ['b', 'c']]),
        # Chinese characters without pinyin
        ('鿅', {'style': TONE2}, [['\u9fc5']]),
        ('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
        ('鿅', {'style': TONE2, 'errors': '233'}, []),
        ('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b']]},
         [['a', 'b'], ['a', 'b']]),
    )
    for text, options, expected in cases:
        assert pinyin(text, **options) == expected


def test_errors_callable():
    """``errors`` accepts both a plain function and an object that defines
    ``__call__``; a returned string becomes a single item."""
    def repeat_a(chars):
        return 'a' * len(chars)

    class RepeatA(object):
        def __call__(self, chars):
            return 'a' * len(chars)

    count = 5
    text = 'あ' * count
    expected = [['a' * count]]
    for handler in (repeat_a, RepeatA()):
        assert pinyin(text, errors=handler) == expected


def test_simple_seg():
    """Non-Chinese runs are kept by ``slug`` unless the ``errors`` handler
    returns ``None`` for them."""
    mixed = '你好にほんごРусский язык'
    expectations = [
        ('北京abcc', 'be3i ji1ng abcc'),
        (mixed, 'ni3 ha3o にほんごРусский язык'),
    ]
    for text, expected in expectations:
        assert slug([text], style=TONE2, separator=' ') == expected

    # with a handler that returns None only the Chinese part remains
    chinese_only = slug(
        mixed, style=TONE2, separator=' ', errors=lambda x: None)
    assert chinese_only == 'ni3 ha3o'


# Parametrize table for test_update: each row is
# [input, keyword arguments for lazy_pinyin(), expected flat result].
data_for_update = [
    # readings of 便宜 (pian2yi2 vs. bian4yi2, depending on the phrase)
    [
        ['便宜'], {'style': TONE2}, ['pia2n', 'yi2']
    ],
    [
        ['便宜从事'], {'style': TONE2}, ['bia4n', 'yi2', 'co2ng', 'shi4']
    ],
    [
        ['便宜施行'], {'style': TONE2}, ['bia4n', 'yi2', 'shi1', 'xi2ng']
    ],
    [
        ['便宜货'], {'style': TONE2}, ['pia2n', 'yi2', 'huo4']
    ],
    [
        ['贪便宜'], {'style': TONE2}, ['ta1n', 'pia2n', 'yi2']
    ],
    [
        ['讨便宜'], {'style': TONE2}, ['ta3o', 'pia2n', 'yi2']
    ],
    [
        ['小便宜'], {'style': TONE2}, ['xia3o', 'pia2n', 'yi2']
    ],
    [
        ['占便宜'], {'style': TONE2}, ['zha4n', 'pia2n', 'yi2']
    ],
    # single code points taken from individual CJK blocks
    [
        '\u3400', {'style': TONE2}, ['qiu1'],  # CJK Extension A: [3400-4DBF]
    ],
    [
        '\u4E00', {'style': TONE2}, ['yi1'],   # CJK basic block: [4E00-9FFF]
    ],
    # [
    #     '\uFA29', {'style': TONE2}, ['da3o'],  # CJK Compatibility: [F900-FAFF]
    # ],
    # regression: "yu" had mistakenly been put into the list of initials
    ['鱼', {'style': TONE2}, ['yu2']],
    ['鱼', {'style': FINALS}, ['v']],
    ['鱼', {'style': BOPOMOFO}, ['ㄩˊ']],
    ['鱼', {'style': CYRILLIC}, ['юй']],
    ['雨', {'style': TONE2}, ['yu3']],
    ['雨', {'style': FINALS}, ['v']],
    ['雨', {'style': BOPOMOFO}, ['ㄩˇ']],
    ['雨', {'style': CYRILLIC}, ['юй']],
    ['元', {'style': TONE2}, ['yua2n']],
    ['元', {'style': FINALS}, ['van']],
    ['元', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
    ['元', {'style': CYRILLIC}, ['юань2']],
    # y and w are not initials either; the final of yu is v, of yi is i,
    # and of wu is u
    ['呀', {'style': INITIALS}, ['']],
    ['呀', {'style': TONE2}, ['ya']],
    ['呀', {'style': FINALS}, ['ia']],
    ['呀', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
    ['呀', {'style': CYRILLIC}, ['я']],
    ['无', {'style': INITIALS}, ['']],
    ['无', {'style': TONE2}, ['wu2']],
    ['无', {'style': FINALS}, ['u']],
    ['无', {'style': FINALS_TONE}, ['ú']],
    ['无', {'style': BOPOMOFO}, ['ㄨˊ']],
    ['无', {'style': CYRILLIC}, ['у2']],
    ['衣', {'style': TONE2}, ['yi1']],
    ['衣', {'style': FINALS}, ['i']],
    ['衣', {'style': BOPOMOFO}, ['ㄧ']],
    ['衣', {'style': CYRILLIC}, ['и1']],
    ['万', {'style': TONE2}, ['wa4n']],
    ['万', {'style': FINALS}, ['uan']],
    ['万', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
    ['万', {'style': CYRILLIC}, ['вань4']],
    # the final of ju, qu and xu should be v
    ['具', {'style': FINALS_TONE}, ['ǜ']],
    ['具', {'style': FINALS_TONE2}, ['v4']],
    ['具', {'style': FINALS}, ['v']],
    ['具', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
    ['具', {'style': CYRILLIC}, ['цзюй4']],
    ['取', {'style': FINALS_TONE}, ['ǚ']],
    ['取', {'style': FINALS_TONE2}, ['v3']],
    ['取', {'style': FINALS}, ['v']],
    ['取', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
    ['取', {'style': CYRILLIC}, ['цюй3']],
    ['徐', {'style': FINALS_TONE}, ['ǘ']],
    ['徐', {'style': FINALS_TONE2}, ['v2']],
    ['徐', {'style': FINALS}, ['v']],
    ['徐', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
    ['徐', {'style': CYRILLIC}, ['сюй2']],
    # ń: INITIALS is empty and the whole syllable is reported as the final
    ['嗯', {'style': NORMAL}, ['n']],
    ['嗯', {'style': TONE}, ['ń']],
    ['嗯', {'style': TONE2}, ['n2']],
    ['嗯', {'style': INITIALS}, ['']],
    ['嗯', {'style': FIRST_LETTER}, ['n']],
    ['嗯', {'style': FINALS}, ['n']],
    ['嗯', {'style': FINALS_TONE}, ['ń']],
    ['嗯', {'style': FINALS_TONE2}, ['n2']],
    ['嗯', {'style': BOPOMOFO}, ['ㄣˊ']],
    ['嗯', {'style': CYRILLIC}, ['н2']],
    # ḿ  \u1e3f  U+1E3F
    ['呣', {'style': NORMAL}, ['m']],
    ['呣', {'style': TONE}, ['ḿ']],
    ['呣', {'style': TONE2}, ['m2']],
    ['呣', {'style': INITIALS}, ['']],
    ['呣', {'style': FIRST_LETTER}, ['m']],
    ['呣', {'style': FINALS}, ['m']],
    ['呣', {'style': FINALS_TONE}, ['ḿ']],
    ['呣', {'style': FINALS_TONE2}, ['m2']],
    ['呣', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
    ['呣', {'style': CYRILLIC}, ['м2']],
    # 41 (presumably an upstream issue number -- TODO confirm)
    ['彷徨', {}, ['pang', 'huang']],
    ['彷徨', {'style': CYRILLIC}, ['пан2', 'хуан2']],
    # Bopomofo (zhuyin) and Cyrillic output for phrases
    ['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤˋ']],
    ['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
    ['打量', {'style': CYRILLIC}, ['да3', 'лян4']],
    ['黄山b股', {'style': CYRILLIC}, ['хуан2', 'шань1', 'b', 'гу3']],
    # 50 (presumably an upstream issue number -- TODO confirm)
    ['打量', {'style': TONE2}, ['da3', 'lia4ng']],
    ['打量', {'style': TONE3}, ['da3', 'liang4']],
    ['侵略', {'style': TONE2}, ['qi1n', 'lve4']],
    ['侵略', {'style': TONE3}, ['qin1', 'lve4']],
    ['侵略', {'style': FINALS_TONE2}, ['i1n', 've4']],
    ['侵略', {'style': FINALS_TONE3}, ['in1', 've4']],
    ['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
    ['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
    ['〇', {'style': TONE}, ['líng']],
    # input that is already split into segments gets segmented a second time
    [['你要', '重新考虑OK'], {'style': TONE}, [
        'nǐ', 'yào', 'chóng', 'xīn', 'kǎo', 'lǜ', 'OK']],
]


@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
def test_update(hans, kwargs, result):
    """Run ``lazy_pinyin`` on one row of ``data_for_update`` and compare the
    output with the expected flat list."""
    actual = lazy_pinyin(hans, **kwargs)
    assert actual == result


@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
@pytest.mark.parametrize(
    'han, result', [
        ('\U00020000', ['he']),       # CJK Extension B: [20000-2A6DF]
        ('\U0002A79D', ['duo']),      # CJK Extension C: [2A700-2B73F]
        # ('\U0002B740', ['wu']),      # CJK Extension D: [2B740-2B81D]
        # ('\U0002F80A', ['seng']),    # CJK Compat. Supplement: [2F800-2FA1F]
    ]
)
def test_support_ucs4(han, result):
    """When ``SUPPORT_UCS4`` is true, characters beyond the BMP have
    readings."""
    readings = lazy_pinyin(han)
    assert readings == result


@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
@pytest.mark.parametrize(
    'han', (
        '\U00020000',      # CJK Extension B: [20000-2A6DF]
        '\U0002A79D',      # CJK Extension C: [2A700-2B73F]
        # '\U0002B740',      # CJK Extension D: [2B740-2B81D]
        # '\U0002F80A',      # CJK Compat. Supplement: [2F800-2FA1F]
    )
)
def test_dont_support_ucs4(han):
    """When ``SUPPORT_UCS4`` is false, characters beyond the BMP are
    returned unchanged."""
    unchanged = [[han]]
    assert pinyin(han) == unchanged


def test_36():
    """Toneless conversion of a whole sentence (the name presumably refers
    to an upstream issue number -- TODO confirm)."""
    expected = [
        'liang', 'nian', 'qian', 'qi', 'jin', 'he', 'zui', 'le', 'jiu',
    ]
    sentence = '两年前七斤喝醉了酒'
    assert lazy_pinyin(sentence) == expected


def test_with_unknown_style():
    """An unrecognised ``style`` value gives tone-marked output from both
    ``lazy_pinyin`` and ``pinyin``."""
    word = '中国'
    toned_flat = ['zhōng', 'guó']
    toned_nested = [['zhōng'], ['guó']]
    assert lazy_pinyin(word) == ['zhong', 'guo']
    assert lazy_pinyin(word, style='unknown') == toned_flat
    assert pinyin(word) == toned_nested
    assert pinyin(word, style='unknown') == toned_nested


@pytest.mark.parametrize('kwargs,result', [
    [{}, [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
    [dict(strict=False), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
    [dict(style=NORMAL), [['zhong'], ['xin']]],
    [dict(style=NORMAL, strict=False), [['zhong'], ['xin']]],
    [dict(style=TONE), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
    [dict(style=TONE, strict=False), [
        ['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
    [dict(style=TONE2), [['zho1ng', 'zho4ng'], ['xi1n']]],
    [dict(style=TONE2, strict=False), [['zho1ng', 'zho4ng'], ['xi1n']]],
    [dict(style=TONE3), [['zhong1', 'zhong4'], ['xin1']]],
    [dict(style=TONE3, strict=False), [['zhong1', 'zhong4'], ['xin1']]],
    [dict(style=INITIALS), [['zh'], ['x']]],
    [dict(style=INITIALS, strict=False), [['zh'], ['x']]],
    [dict(style=FIRST_LETTER), [['z'], ['x']]],
    [dict(style=FIRST_LETTER, strict=False), [['z'], ['x']]],
    [dict(style=FINALS), [['ong'], ['in']]],
    [dict(style=FINALS, strict=False), [['ong'], ['in']]],
    [dict(style=FINALS_TONE), [['\u014dng', '\xf2ng'], ['\u012bn']]],
    [dict(style=FINALS_TONE, strict=False),  [
        ['\u014dng', '\xf2ng'], ['\u012bn']]],
    [dict(style=FINALS_TONE2), [['o1ng', 'o4ng'], ['i1n']]],
    [dict(style=FINALS_TONE2, strict=False),  [['o1ng', 'o4ng'], ['i1n']]],
    [dict(style=FINALS_TONE3), [['ong1', 'ong4'], ['in1']]],
    [dict(style=FINALS_TONE3, strict=False), [['ong1', 'ong4'], ['in1']]],
])
def test_heteronym_and_style(kwargs, result):
    """Heteronym mode combined with each style / ``strict`` setting.

    ``kwargs`` holds the extra keyword arguments for ``pinyin`` and
    ``result`` the expected nested list for the word '中心'.
    """
    hans = '中心'
    # Pass heteronym as a keyword instead of writing it into ``kwargs``:
    # the dict objects belong to the parametrize table above and should
    # not be mutated by the test.
    assert pinyin(hans, heteronym=True, **kwargs) == result


@pytest.mark.parametrize('kwargs,result', [
    # default style
    ({}, [['zhāo'], ['yáng']]),
    ({'heteronym': True}, [['zhāo', 'cháo'], ['yáng']]),
    ({'strict': False}, [['zhāo'], ['yáng']]),
    ({'strict': False, 'heteronym': True}, [['zhāo', 'cháo'], ['yáng']]),
    # NORMAL
    ({'style': NORMAL}, [['zhao'], ['yang']]),
    ({'style': NORMAL, 'heteronym': True}, [['zhao', 'chao'], ['yang']]),
    ({'style': NORMAL, 'strict': False}, [['zhao'], ['yang']]),
    ({'style': NORMAL, 'strict': False, 'heteronym': True},
     [['zhao', 'chao'], ['yang']]),
    # TONE
    ({'style': TONE}, [['zhāo'], ['yáng']]),
    ({'style': TONE, 'heteronym': True}, [['zhāo', 'cháo'], ['yáng']]),
    ({'style': TONE, 'strict': False}, [['zhāo'], ['yáng']]),
    ({'style': TONE, 'strict': False, 'heteronym': True},
     [['zhāo', 'cháo'], ['yáng']]),
    # TONE2
    ({'style': TONE2}, [['zha1o'], ['ya2ng']]),
    ({'style': TONE2, 'heteronym': True}, [['zha1o', 'cha2o'], ['ya2ng']]),
    ({'style': TONE2, 'strict': False}, [['zha1o'], ['ya2ng']]),
    ({'style': TONE2, 'strict': False, 'heteronym': True},
     [['zha1o', 'cha2o'], ['ya2ng']]),
    # TONE3
    ({'style': TONE3}, [['zhao1'], ['yang2']]),
    ({'style': TONE3, 'heteronym': True}, [['zhao1', 'chao2'], ['yang2']]),
    ({'style': TONE3, 'strict': False}, [['zhao1'], ['yang2']]),
    ({'style': TONE3, 'strict': False, 'heteronym': True},
     [['zhao1', 'chao2'], ['yang2']]),
    # INITIALS: "y" only counts as an initial when strict=False
    ({'style': INITIALS}, [['zh'], ['']]),
    ({'style': INITIALS, 'heteronym': True}, [['zh', 'ch'], ['']]),
    ({'style': INITIALS, 'strict': False}, [['zh'], ['y']]),
    ({'style': INITIALS, 'strict': False, 'heteronym': True},
     [['zh', 'ch'], ['y']]),
    # FIRST_LETTER
    ({'style': FIRST_LETTER}, [['z'], ['y']]),
    ({'style': FIRST_LETTER, 'heteronym': True}, [['z', 'c'], ['y']]),
    ({'style': FIRST_LETTER, 'strict': False}, [['z'], ['y']]),
    ({'style': FIRST_LETTER, 'strict': False, 'heteronym': True},
     [['z', 'c'], ['y']]),
    # FINALS
    ({'style': FINALS}, [['ao'], ['iang']]),
    ({'style': FINALS, 'heteronym': True}, [['ao'], ['iang']]),
    ({'style': FINALS, 'strict': False}, [['ao'], ['ang']]),
    ({'style': FINALS, 'strict': False, 'heteronym': True},
     [['ao'], ['ang']]),
    # FINALS_TONE
    ({'style': FINALS_TONE}, [['āo'], ['iáng']]),
    ({'style': FINALS_TONE, 'heteronym': True}, [['āo', 'áo'], ['iáng']]),
    ({'style': FINALS_TONE, 'strict': False}, [['āo'], ['áng']]),
    ({'style': FINALS_TONE, 'strict': False, 'heteronym': True},
     [['āo', 'áo'], ['áng']]),
    # FINALS_TONE2
    ({'style': FINALS_TONE2}, [['a1o'], ['ia2ng']]),
    ({'style': FINALS_TONE2, 'heteronym': True},
     [['a1o', 'a2o'], ['ia2ng']]),
    ({'style': FINALS_TONE2, 'strict': False}, [['a1o'], ['a2ng']]),
    ({'style': FINALS_TONE2, 'strict': False, 'heteronym': True},
     [['a1o', 'a2o'], ['a2ng']]),
    # FINALS_TONE3
    ({'style': FINALS_TONE3}, [['ao1'], ['iang2']]),
    ({'style': FINALS_TONE3, 'heteronym': True},
     [['ao1', 'ao2'], ['iang2']]),
    ({'style': FINALS_TONE3, 'strict': False}, [['ao1'], ['ang2']]),
    ({'style': FINALS_TONE3, 'strict': False, 'heteronym': True},
     [['ao1', 'ao2'], ['ang2']]),
])
def test_heteronym_and_style_phrase(kwargs, result):
    """Each style / ``strict`` / ``heteronym`` combination applied to the
    phrase '朝阳'."""
    phrase = '朝阳'
    actual = pinyin(phrase, **kwargs)
    assert actual == result


def test_m4():
    """Readings of '呣' (U+5463: ḿ, móu, m̀) in each style with heteronym
    mode enabled."""
    han = '呣'
    assert pinyin(han) == [['ḿ']]
    all_toned = [['ḿ', 'm̀', 'móu']]
    # (extra keyword arguments, expected), checked in this order
    heteronym_cases = [
        ({}, all_toned),
        ({'style': NORMAL}, [['m', 'mou']]),
        ({'style': TONE}, all_toned),
        ({'style': TONE2}, [['m2', 'm4', 'mo2u']]),
        ({'style': TONE3}, [['m2', 'm4', 'mou2']]),
        ({'style': INITIALS}, [['', 'm']]),  # TODO: fix ''
        ({'style': FIRST_LETTER}, [['m']]),
        ({'style': FINALS}, [['m', 'ou']]),
        ({'style': FINALS_TONE}, [['ḿ', 'm̀', 'óu']]),
        ({'style': FINALS_TONE2}, [['m2', 'm4', 'o2u']]),
        ({'style': FINALS_TONE3}, [['m2', 'm4', 'ou2']]),
    ]
    for options, expected in heteronym_cases:
        assert pinyin(han, heteronym=True, **options) == expected


@pytest.mark.parametrize('han,style,expect', [
    ('呣', Style.TONE, ['ḿ', 'm̀']),
    ('呣', Style.TONE2, ['m2', 'm4']),
    ('嘸', Style.TONE, ['m̄', 'ḿ']),
    ('嘸', Style.TONE2, ['m1', 'm2']),
    ('誒', Style.TONE, ['ê̄', 'ế', 'ê̌', 'ề']),
    ('誒', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']),
])
def test_m_e(han, style, expect):
    """Every reading in ``expect`` must be among the heteronym readings of
    the single character ``han``; other readings are allowed."""
    readings = pinyin(han, style=style, heteronym=True)
    assert len(readings) == 1
    assert set(expect).issubset(readings[0])


if __name__ == '__main__':
    # Allow running this file directly. Use pytest's public entry point
    # (``pytest.cmdline`` is only a compatibility namespace) and hand the
    # exit status back to the caller so failures are not silently dropped.
    import pytest
    raise SystemExit(pytest.main())
-												E2E/Streaming Transformer/Conformer ASR (#578)

* add cmvn and label smoothing loss layer

* add layer for transformer

* add glu and conformer conv

* add torch compatible hack, mask funcs

* not hack size since it exists

* add test; attention

* add attention, common utils, hack paddle

* add audio utils

* conformer batch padding mask bug fix #223

* fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2

* fix ci

* fix ci

* add encoder

* refactor egs

* add decoder

* refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils

* refactor docs

* add fix

* fix readme

* fix bugs, refactor collator, add pad_sequence, fix ckpt bugs

* fix docstring

* refactor data feed order

* add u2 model

* refactor cmvn, test

* add utils

* add u2 config

* fix bugs

* fix bugs

* fix autograd maybe has problem when using inplace operation

* refactor data, build vocab; add format data

* fix text featurizer

* refactor build vocab

* add fbank, refactor feature of speech

* refactor audio feat

* refactor data prepare

* refactor data

* model init from config

* add u2 bins

* flake8

* can train

* fix bugs, add coverage, add scripts

* test can run

* fix data

* speed perturb with sox

* add spec aug

* fix for train

* fix train logic

* fix logger

* log valid loss, time dataset process

* using np for speed perturb, remove some debug log of grad clip

* fix logger

* fix build vocab

* fix logger name

* using module logger as default

* fix

* fix install

* reorder imports

* fix board logger

* fix logger

* kaldi fbank and mfcc

* fix cmvn and print params

* fix add_eos_sos and cmvn

* fix cmvn compute

* fix logger and cmvn

* fix subsampling, label smoothing loss, remove useless

* add notebook test

* fix log

* fix tb logger

* multi gpu valid

* fix log

* fix log

* fix config

* fix compute cmvn, need paddle 2.1

* add cmvn notebook

* fix layer tools

* fix compute cmvn

* add rtf

* fix decoding

* fix layer tools

* fix log, add avg script

* more avg and test info

* fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh;

* add vimrc

* refactor tiny script, add transformer and stream conf

* spm demo; librispeech scripts and confs

* fix log

* add librispeech scripts

* refactor data pipe; fix conf; fix u2 default params

* fix bugs

* refactor aishell scripts

* fix test

* fix cmvn

* fix s0 scripts

* fix ds2 scripts and bugs

* fix dev & test dataset filter

* fix dataset filter

* filter dev

* fix ckpt path

* filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test

* add comment

* add syllable doc

* fix ds2 configs

* add doc

* add pypinyin tools

* fix decoder using blank_id=0

* mmseg with pybind11

* format code
											
										
										
											4 years ago
+								#!/usr/bin/env python3
 								import pytest
 								from pypinyin import (pinyin, slug, lazy_pinyin, load_single_dict,
 								                      load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
 								                      FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2,
 								                      FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC,
 								                      CYRILLIC_FIRST, Style)
 								from pypinyin.constants import SUPPORT_UCS4
 								from pypinyin.seg import simpleseg
 								def test_pinyin_initials():
 								    """包含声明和韵母的词语"""
 								    hans = '中心'
 								    # 默认风格，带声调
 								    assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]
 								    assert pinyin(hans, strict=False) == [['zh\u014dng'], ['x\u012bn']]
 								    # 普通风格，不带声调
 								    assert pinyin(hans, NORMAL) == [['zhong'], ['xin']]
 								    assert pinyin(hans, NORMAL, strict=False) == [['zhong'], ['xin']]
 								    # 声调风格，拼音声调在韵母第一个字母上
 								    assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']]
 								    assert pinyin(hans, TONE, strict=False) == [['zh\u014dng'], ['x\u012bn']]
 								    # 声调风格2，即拼音声调在各个声母之后，用数字 [1-4] 进行表示
 								    assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']]
 								    assert pinyin(hans, TONE2, strict=False) == [['zho1ng'], ['xi1n']]
 								    # 声调风格3，即拼音声调在各个拼音之后，用数字 [1-4] 进行表示
 								    assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']]
 								    assert pinyin(hans, TONE3, strict=False) == [['zhong1'], ['xin1']]
 								    # 声母风格，只返回各个拼音的声母部分
 								    assert pinyin(hans, INITIALS) == [['zh'], ['x']]
 								    assert pinyin(hans, INITIALS, strict=False) == [['zh'], ['x']]
 								    # 首字母风格，只返回拼音的首字母部分
 								    assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']]
 								    assert pinyin(hans, FIRST_LETTER, strict=False) == [['z'], ['x']]
 								    # 注音风格，带声调
 								    assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
 								    assert pinyin(hans, BOPOMOFO, strict=False) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
 								    # 注音风格，首字母
 								    assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄓ'], ['ㄒ']]
 								    assert pinyin(hans, BOPOMOFO_FIRST, strict=False) == [['ㄓ'], ['ㄒ']]
 								    # test CYRILLIC style
 								    assert pinyin(hans, CYRILLIC) == [['чжун1'], ['синь1']]
 								    assert pinyin(hans, CYRILLIC, strict=False) == [['чжун1'], ['синь1']]
 								    # CYRILLIC_FIRST style return only first letters
 								    assert pinyin(hans, CYRILLIC_FIRST) == [['ч'], ['с']]
 								    assert pinyin(hans, CYRILLIC_FIRST, strict=False) == [['ч'], ['с']]
 								    # 启用多音字模式
 								    assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
 								                                            ['x\u012bn']]
 								    assert pinyin(hans, heteronym=True, strict=False) == \
 								        [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]
 								    # 韵母风格1，只返回各个拼音的韵母部分，不带声调
 								    assert pinyin(hans, style=FINALS) == [['ong'], ['in']]
 								    assert pinyin(hans, style=FINALS, strict=False) == [['ong'], ['in']]
 								    # 韵母风格2，带声调，声调在韵母第一个字母上
 								    assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']]
 								    assert pinyin(hans, style=FINALS_TONE, strict=False) == \
 								        [['\u014dng'], ['\u012bn']]
 								    # 韵母风格2，带声调，声调在各个声母之后，用数字 [1-4] 进行表示
 								    assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']]
 								    assert pinyin(hans, style=FINALS_TONE2, strict=False) == \
 								        [['o1ng'], ['i1n']]
 								    # 韵母风格3，带声调，声调在各个拼音之后，用数字 [1-4] 进行表示
 								    assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']]
 								    assert pinyin(hans, style=FINALS_TONE3, strict=False) == \
 								        [['ong1'], ['in1']]
 								def test_pinyin_finals():
 								    """只包含韵母的词语"""
 								    hans = '嗷嗷'
 								    assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
 								    assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
 								    assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
 								    assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
 								    assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
 								    assert pinyin(hans, TONE3) == [['ao2'], ['ao2']]
 								    assert pinyin(hans, INITIALS) == [[''], ['']]
 								    assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']]
 								    assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']]
 								    assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄠ'], ['ㄠ']]
 								    assert pinyin(hans, CYRILLIC) == [['ао2'], ['ао2']]
 								    assert pinyin(hans, CYRILLIC_FIRST) == [['а'], ['а']]
 								    assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']]
 								    assert pinyin('啊', heteronym=True) == \
 								        [['a', 'ā', 'á', 'ǎ', 'à', 'è']]
 								    assert pinyin(hans, style=FINALS) == [['ao'], ['ao']]
 								    assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']]
 								    assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']]
 								    assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']]
 								def test_slug():
 								    hans = '中心'
 								    assert slug(hans) == 'zhong-xin'
 								    assert slug(hans, heteronym=True) == 'zhong-xin'
 								def test_zh_and_en():
 								    """中英文混合的情况"""
 								    # 中英文
 								    hans = '中心'
 								    assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
 								    # 中英文混合的固定词组
 								    assert pinyin('黄山B股', style=TONE2) == \
 								        [['hua2ng'], ['sha1n'], ['B'], ['gu3']]
 								    assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
 								    assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
 								    assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
 								    assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
 								    assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
 								    assert pinyin('维生素C', style=TONE2) == \
 								        [['we2i'], ['she1ng'], ['su4'], ['C']]
 								def test_others():
 								    # 空字符串
 								    assert pinyin('') == []
 								    # 单个汉字
 								    assert pinyin('營') == [['y\xedng']]
 								    # 中国 人
 								    assert pinyin('中国人') == [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]
 								    # 日文
 								    assert pinyin('の') == [['\u306e']]
 								    # 没有读音的汉字，还不存在的汉字
 								    assert pinyin('\u9fff') == [['\u9fff']]
 								def test_lazy_pinyin():
 								    """lazy_pinyin() returns a flat list of readings for several styles."""
 								    assert lazy_pinyin('中国人') == ['zhong', 'guo', 'ren']
 								    word = '中心'
 								    assert lazy_pinyin(word) == ['zhong', 'xin']
 								    expected_by_style = (
 								        (TONE, ['zh\u014dng', 'x\u012bn']),
 								        (INITIALS, ['zh', 'x']),
 								        (BOPOMOFO, ['ㄓㄨㄥ', 'ㄒㄧㄣ']),
 								        (CYRILLIC, ['чжун1', 'синь1']),
 								    )
 								    for style, expected in expected_by_style:
 								        assert lazy_pinyin(word, style=style) == expected
 								def test_seg():
 								    """Convert pre-segmented input from simpleseg, then mixed fixed phrases."""
 								    segments = list(simpleseg('音乐'))
 								    assert pinyin(segments, style=TONE2) == [['yi1n'], ['yue4']]
 								    # Fixed phrases that mix Chinese characters and Latin letters
 								    expectations = [
 								        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
 								        ('A股', [['A'], ['gu3']]),
 								        ('阿Q', [['a1'], ['Q']]),
 								        ('B超', [['B'], ['cha1o']]),
 								        ('AB超C', [['AB'], ['cha1o'], ['C']]),
 								        ('AB阿C', [['AB'], ['a1'], ['C']]),
 								        ('维生素C', [['we2i'], ['she1ng'], ['su4'], ['C']]),
 								    ]
 								    for phrase, wanted in expectations:
 								        assert pinyin(phrase, style=TONE2) == wanted
 								def test_custom_pinyin_dict():
 								    """A reading registered through load_single_dict() is used afterwards."""
 								    char = '桔'
 								    wanted = ['ju2']
 								    # Probe the reading before loading; a mismatch here is tolerated.
 								    try:
 								        assert lazy_pinyin(char, style=TONE2) == wanted
 								    except AssertionError:
 								        pass
 								    load_single_dict({ord(char): 'jú,jié'})
 								    assert lazy_pinyin(char, style=TONE2) == wanted
 								def test_custom_pinyin_dict2():
 								    """A phrase reading registered through load_phrases_dict() is used."""
 								    phrase = '同行'
 								    segmented = [phrase]
 								    # Probe the reading before loading; a mismatch here is tolerated.
 								    try:
 								        assert lazy_pinyin(segmented, style=TONE2) == ['to2ng', 'ha2ng']
 								    except AssertionError:
 								        pass
 								    load_phrases_dict({phrase: [['tóng'], ['xíng']]})
 								    assert lazy_pinyin(segmented, style=TONE2) == ['to2ng', 'xi2ng']
 								def test_custom_pinyin_dict_tone2():
 								    """A single-character entry loaded with style='tone2' is applied."""
 								    char = '桔'
 								    load_single_dict({ord(char): 'ce4,si4'}, style='tone2')
 								    tone2_reading = lazy_pinyin(char, style=TONE2)
 								    assert tone2_reading == ['ce4']
 								    default_reading = pinyin(char)
 								    assert default_reading == [['cè']]
 								def test_custom_pinyin_dict2_tone2():
 								    """A phrase entry loaded with style='tone2' is applied."""
 								    phrase = '同行'
 								    load_phrases_dict({phrase: [['to4ng'], ['ku1']]}, style='tone2')
 								    tone2_reading = lazy_pinyin([phrase], style=TONE2)
 								    assert tone2_reading == ['to4ng', 'ku1']
 								    default_reading = pinyin(phrase)
 								    assert default_reading == [['tòng'], ['kū']]
 								# yapf: disable
 								def test_errors():
 								    """Exercise the ``errors`` option on characters that have no pinyin.
 								
 								    Each case is ``(text, keyword arguments for pinyin(), expected result)``.
 								    """
 								    cases = (
 								        ('啊', {'style': TONE2}, [['a']]),
 								        ('啊a', {'style': TONE2}, [['a'], ['a']]),
 								        # non-Chinese character without pinyin
 								        ('⺁', {'style': TONE2}, [['\u2e81']]),
 								        ('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
 								        ('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
 								        ('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
 								        (
 								            '⺁⺁',
 								            {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
 								            [['a'], ['a']],
 								        ),
 								        (
 								            '⺁⺁',
 								            {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
 								            [['a'], ['b']],
 								        ),
 								        (
 								            '⺁⺁',
 								            {'style': TONE2, 'heteronym': True,
 								             'errors': lambda x: [['a', 'b'], ['b', 'c']]},
 								            [['a', 'b'], ['b', 'c']],
 								        ),
 								        # Chinese character without pinyin
 								        ('鿅', {'style': TONE2}, [['\u9fc5']]),
 								        ('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
 								        ('鿅', {'style': TONE2, 'errors': '233'}, []),
 								        ('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
 								        ('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
 								        ('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
 								        (
 								            '鿅鿅',
 								            {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
 								            [['a'], ['a']],
 								        ),
 								        (
 								            '鿅鿅',
 								            {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
 								            [['a'], ['a']],
 								        ),
 								        (
 								            '鿅鿅',
 								            {'style': TONE2, 'heteronym': True,
 								             'errors': lambda x: [['a', 'b']]},
 								            [['a', 'b'], ['a', 'b']],
 								        ),
 								    )
 								    for text, options, expected in cases:
 								        assert pinyin(text, **options) == expected
 								def test_errors_callable():
 								    """``errors`` accepts a plain function and an object defining __call__."""
 								    def repeat_a(chars):
 								        return len(chars) * 'a'
 								
 								    class RepeatA(object):
 								        def __call__(self, chars):
 								            return len(chars) * 'a'
 								
 								    count = 5
 								    text = 'あ' * count
 								    expected = [['a' * count]]
 								    assert pinyin(text, errors=repeat_a) == expected
 								    assert pinyin(text, errors=RepeatA()) == expected
 								def test_simple_seg():
 								    """slug() on text mixing Chinese with Latin, Japanese and Cyrillic runs."""
 								    expected_slugs = {
 								        '北京abcc': 'be3i ji1ng abcc',
 								        '你好にほんごРусский язык': 'ni3 ha3o にほんごРусский язык',
 								    }
 								    for text in expected_slugs:
 								        produced = slug([text], style=TONE2, separator=' ')
 								        assert produced == expected_slugs[text]
 								    # An errors handler returning None leaves only the Chinese part.
 								    mixed = '你好にほんごРусский язык'
 								    produced = slug(
 								        mixed, style=TONE2, separator=' ', errors=lambda x: None)
 								    assert produced == 'ni3 ha3o'
 								# Parametrize table consumed by test_update: each entry is
 								# [input, keyword arguments for lazy_pinyin(), expected result].
 								data_for_update = [
 								    # readings of 便宜
 								    [
 								        ['便宜'], {'style': TONE2}, ['pia2n', 'yi2']
 								    ],
 								    [
 								        ['便宜从事'], {'style': TONE2}, ['bia4n', 'yi2', 'co2ng', 'shi4']
 								    ],
 								    [
 								        ['便宜施行'], {'style': TONE2}, ['bia4n', 'yi2', 'shi1', 'xi2ng']
 								    ],
 								    [
 								        ['便宜货'], {'style': TONE2}, ['pia2n', 'yi2', 'huo4']
 								    ],
 								    [
 								        ['贪便宜'], {'style': TONE2}, ['ta1n', 'pia2n', 'yi2']
 								    ],
 								    [
 								        ['讨便宜'], {'style': TONE2}, ['ta3o', 'pia2n', 'yi2']
 								    ],
 								    [
 								        ['小便宜'], {'style': TONE2}, ['xia3o', 'pia2n', 'yi2']
 								    ],
 								    [
 								        ['占便宜'], {'style': TONE2}, ['zha4n', 'pia2n', 'yi2']
 								    ],
 								    # one character from each CJK block listed below
 								    [
 								        '\u3400', {'style': TONE2}, ['qiu1'],  # CJK Extension A: [3400-4DBF]
 								    ],
 								    [
 								        '\u4E00', {'style': TONE2}, ['yi1'],   # CJK basic block: [4E00-9FFF]
 								    ],
 								    # [
 								    #     '\uFA29', {'style': TONE2}, ['da3o'],  # CJK Compatibility: [F900-FAFF]
 								    # ],
 								    # yu was mistakenly placed in the initials list
 								    ['鱼', {'style': TONE2}, ['yu2']],
 								    ['鱼', {'style': FINALS}, ['v']],
 								    ['鱼', {'style': BOPOMOFO}, ['ㄩˊ']],
 								    ['鱼', {'style': CYRILLIC}, ['юй']],
 								    ['雨', {'style': TONE2}, ['yu3']],
 								    ['雨', {'style': FINALS}, ['v']],
 								    ['雨', {'style': BOPOMOFO}, ['ㄩˇ']],
 								    ['雨', {'style': CYRILLIC}, ['юй']],
 								    ['元', {'style': TONE2}, ['yua2n']],
 								    ['元', {'style': FINALS}, ['van']],
 								    ['元', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
 								    ['元', {'style': CYRILLIC}, ['юань2']],
 								    # y and w are not pinyin either; the final of yu is v, of yi is i,
 								    # and of wu is u
 								    ['呀', {'style': INITIALS}, ['']],
 								    ['呀', {'style': TONE2}, ['ya']],
 								    ['呀', {'style': FINALS}, ['ia']],
 								    ['呀', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
 								    ['呀', {'style': CYRILLIC}, ['я']],
 								    ['无', {'style': INITIALS}, ['']],
 								    ['无', {'style': TONE2}, ['wu2']],
 								    ['无', {'style': FINALS}, ['u']],
 								    ['无', {'style': FINALS_TONE}, ['ú']],
 								    ['无', {'style': BOPOMOFO}, ['ㄨˊ']],
 								    ['无', {'style': CYRILLIC}, ['у2']],
 								    ['衣', {'style': TONE2}, ['yi1']],
 								    ['衣', {'style': FINALS}, ['i']],
 								    ['衣', {'style': BOPOMOFO}, ['ㄧ']],
 								    ['衣', {'style': CYRILLIC}, ['и1']],
 								    ['万', {'style': TONE2}, ['wa4n']],
 								    ['万', {'style': FINALS}, ['uan']],
 								    ['万', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
 								    ['万', {'style': CYRILLIC}, ['вань4']],
 								    # the final of ju, qu and xu should be v
 								    ['具', {'style': FINALS_TONE}, ['ǜ']],
 								    ['具', {'style': FINALS_TONE2}, ['v4']],
 								    ['具', {'style': FINALS}, ['v']],
 								    ['具', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
 								    ['具', {'style': CYRILLIC}, ['цзюй4']],
 								    ['取', {'style': FINALS_TONE}, ['ǚ']],
 								    ['取', {'style': FINALS_TONE2}, ['v3']],
 								    ['取', {'style': FINALS}, ['v']],
 								    ['取', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
 								    ['取', {'style': CYRILLIC}, ['цюй3']],
 								    ['徐', {'style': FINALS_TONE}, ['ǘ']],
 								    ['徐', {'style': FINALS_TONE2}, ['v2']],
 								    ['徐', {'style': FINALS}, ['v']],
 								    ['徐', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
 								    ['徐', {'style': CYRILLIC}, ['сюй2']],
 								    # ń
 								    ['嗯', {'style': NORMAL}, ['n']],
 								    ['嗯', {'style': TONE}, ['ń']],
 								    ['嗯', {'style': TONE2}, ['n2']],
 								    ['嗯', {'style': INITIALS}, ['']],
 								    ['嗯', {'style': FIRST_LETTER}, ['n']],
 								    ['嗯', {'style': FINALS}, ['n']],
 								    ['嗯', {'style': FINALS_TONE}, ['ń']],
 								    ['嗯', {'style': FINALS_TONE2}, ['n2']],
 								    ['嗯', {'style': BOPOMOFO}, ['ㄣˊ']],
 								    ['嗯', {'style': CYRILLIC}, ['н2']],
 								    # ḿ  \u1e3f  U+1E3F
 								    ['呣', {'style': NORMAL}, ['m']],
 								    ['呣', {'style': TONE}, ['ḿ']],
 								    ['呣', {'style': TONE2}, ['m2']],
 								    ['呣', {'style': INITIALS}, ['']],
 								    ['呣', {'style': FIRST_LETTER}, ['m']],
 								    ['呣', {'style': FINALS}, ['m']],
 								    ['呣', {'style': FINALS_TONE}, ['ḿ']],
 								    ['呣', {'style': FINALS_TONE2}, ['m2']],
 								    ['呣', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
 								    ['呣', {'style': CYRILLIC}, ['м2']],
 								    # 41
 								    ['彷徨', {}, ['pang', 'huang']],
 								    ['彷徨', {'style': CYRILLIC}, ['пан2', 'хуан2']],
 								    # Bopomofo (zhuyin)
 								    ['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤˋ']],
 								    ['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
 								    ['打量', {'style': CYRILLIC}, ['да3', 'лян4']],
 								    ['黄山b股', {'style': CYRILLIC}, ['хуан2', 'шань1', 'b', 'гу3']],
 								    # 50
 								    ['打量', {'style': TONE2}, ['da3', 'lia4ng']],
 								    ['打量', {'style': TONE3}, ['da3', 'liang4']],
 								    ['侵略', {'style': TONE2}, ['qi1n', 'lve4']],
 								    ['侵略', {'style': TONE3}, ['qin1', 'lve4']],
 								    ['侵略', {'style': FINALS_TONE2}, ['i1n', 've4']],
 								    ['侵略', {'style': FINALS_TONE3}, ['in1', 've4']],
 								    ['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
 								    ['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
 								    ['〇', {'style': TONE}, ['líng']],
 								    # second-pass segmentation
 								    [['你要', '重新考虑OK'], {'style': TONE}, [
 								        'nǐ', 'yào', 'chóng', 'xīn', 'kǎo', 'lǜ', 'OK']],
 								]
 								@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
 								def test_update(hans, kwargs, result):
 								    """Run lazy_pinyin() on every case listed in data_for_update."""
 								    actual = lazy_pinyin(hans, **kwargs)
 								    assert actual == result
 								@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
 								@pytest.mark.parametrize('han, result', [
 								    # CJK Extension B: [20000-2A6DF]
 								    ['\U00020000', ['he']],
 								    # CJK Extension C: [2A700-2B73F]
 								    ['\U0002A79D', ['duo']],
 								    # CJK Extension D: [2B740-2B81D]
 								    # ['\U0002B740', ['wu']],
 								    # CJK Compatibility Supplement: [2F800-2FA1F]
 								    # ['\U0002F80A', ['seng']],
 								])
 								def test_support_ucs4(han, result):
 								    """Characters beyond the BMP are converted when SUPPORT_UCS4 is true."""
 								    converted = lazy_pinyin(han)
 								    assert converted == result
 								@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
 								@pytest.mark.parametrize('han', [
 								    # CJK Extension B: [20000-2A6DF]
 								    '\U00020000',
 								    # CJK Extension C: [2A700-2B73F]
 								    '\U0002A79D',
 								    # CJK Extension D: [2B740-2B81D]
 								    # '\U0002B740',
 								    # CJK Compatibility Supplement: [2F800-2FA1F]
 								    # '\U0002F80A',
 								])
 								def test_dont_support_ucs4(han):
 								    """Without UCS4 support the character is returned unchanged."""
 								    untouched = [[han]]
 								    assert pinyin(han) == untouched
 								def test_36():
 								    """Convert a whole sentence with lazy_pinyin() in its default style."""
 								    sentence = '两年前七斤喝醉了酒'
 								    expected = [
 								        'liang', 'nian', 'qian', 'qi', 'jin', 'he', 'zui', 'le', 'jiu',
 								    ]
 								    assert lazy_pinyin(sentence) == expected
 								def test_with_unknown_style():
 								    """An unrecognised style value produces tone-marked output."""
 								    word = '中国'
 								    toned = ['zhōng', 'guó']
 								    assert lazy_pinyin(word) == ['zhong', 'guo']
 								    assert lazy_pinyin(word, style='unknown') == toned
 								    # pinyin() wraps each reading in its own list.
 								    nested = [[reading] for reading in toned]
 								    assert pinyin(word) == nested
 								    assert pinyin(word, style='unknown') == nested
 								@pytest.mark.parametrize('kwargs,result', [
 								    [{}, [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
 								    [dict(strict=False), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
 								    [dict(style=NORMAL), [['zhong'], ['xin']]],
 								    [dict(style=NORMAL, strict=False), [['zhong'], ['xin']]],
 								    [dict(style=TONE), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
 								    [dict(style=TONE, strict=False), [
 								        ['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
 								    [dict(style=TONE2), [['zho1ng', 'zho4ng'], ['xi1n']]],
 								    [dict(style=TONE2, strict=False), [['zho1ng', 'zho4ng'], ['xi1n']]],
 								    [dict(style=TONE3), [['zhong1', 'zhong4'], ['xin1']]],
 								    [dict(style=TONE3, strict=False), [['zhong1', 'zhong4'], ['xin1']]],
 								    [dict(style=INITIALS), [['zh'], ['x']]],
 								    [dict(style=INITIALS, strict=False), [['zh'], ['x']]],
 								    [dict(style=FIRST_LETTER), [['z'], ['x']]],
 								    [dict(style=FIRST_LETTER, strict=False), [['z'], ['x']]],
 								    [dict(style=FINALS), [['ong'], ['in']]],
 								    [dict(style=FINALS, strict=False), [['ong'], ['in']]],
 								    [dict(style=FINALS_TONE), [['\u014dng', '\xf2ng'], ['\u012bn']]],
 								    [dict(style=FINALS_TONE, strict=False),  [
 								        ['\u014dng', '\xf2ng'], ['\u012bn']]],
 								    [dict(style=FINALS_TONE2), [['o1ng', 'o4ng'], ['i1n']]],
 								    [dict(style=FINALS_TONE2, strict=False),  [['o1ng', 'o4ng'], ['i1n']]],
 								    [dict(style=FINALS_TONE3), [['ong1', 'ong4'], ['in1']]],
 								    [dict(style=FINALS_TONE3, strict=False), [['ong1', 'ong4'], ['in1']]],
 								])
 								def test_heteronym_and_style(kwargs, result):
 								    """Check heteronym output of pinyin() for '中心' under each style.
 								
 								    ``kwargs`` holds the style/strict options of one case; ``heteronym=True``
 								    is added for every call. ``result`` is the expected return value.
 								    """
 								    hans = '中心'
 								    # Build a fresh dict instead of writing into ``kwargs``: the dicts in the
 								    # parametrize table are created once at import time and should not be
 								    # modified by the test that receives them.
 								    call_kwargs = dict(kwargs, heteronym=True)
 								    assert pinyin(hans, **call_kwargs) == result
 								@pytest.mark.parametrize('kwargs,result', [
 								    # default style
 								    [{}, [['zhāo'], ['yáng']]],
 								    [dict(heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
 								    [dict(strict=False), [['zhāo'], ['yáng']]],
 								    [dict(strict=False, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
 								    # NORMAL
 								    [dict(style=NORMAL), [['zhao'], ['yang']]],
 								    [dict(style=NORMAL, heteronym=True), [['zhao', 'chao'], ['yang']]],
 								    [dict(style=NORMAL, strict=False), [['zhao'], ['yang']]],
 								    [dict(style=NORMAL, strict=False, heteronym=True),
 								     [['zhao', 'chao'], ['yang']]],
 								    # TONE
 								    [dict(style=TONE), [['zhāo'], ['yáng']]],
 								    [dict(style=TONE, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
 								    [dict(style=TONE, strict=False), [['zhāo'], ['yáng']]],
 								    [dict(style=TONE, strict=False, heteronym=True),
 								     [['zhāo', 'cháo'], ['yáng']]],
 								    # TONE2
 								    [dict(style=TONE2), [['zha1o'], ['ya2ng']]],
 								    [dict(style=TONE2, heteronym=True), [['zha1o', 'cha2o'], ['ya2ng']]],
 								    [dict(style=TONE2, strict=False), [['zha1o'], ['ya2ng']]],
 								    [dict(style=TONE2, strict=False, heteronym=True),
 								     [['zha1o', 'cha2o'], ['ya2ng']]],
 								    # TONE3
 								    [dict(style=TONE3), [['zhao1'], ['yang2']]],
 								    [dict(style=TONE3, heteronym=True), [['zhao1', 'chao2'], ['yang2']]],
 								    [dict(style=TONE3, strict=False), [['zhao1'], ['yang2']]],
 								    [dict(style=TONE3, strict=False, heteronym=True),
 								     [['zhao1', 'chao2'], ['yang2']]],
 								    # INITIALS
 								    [dict(style=INITIALS), [['zh'], ['']]],
 								    [dict(style=INITIALS, heteronym=True), [['zh', 'ch'], ['']]],
 								    [dict(style=INITIALS, strict=False), [['zh'], ['y']]],
 								    [dict(style=INITIALS, strict=False, heteronym=True),
 								     [['zh', 'ch'], ['y']]],
 								    # FIRST_LETTER
 								    [dict(style=FIRST_LETTER), [['z'], ['y']]],
 								    [dict(style=FIRST_LETTER, heteronym=True), [['z', 'c'], ['y']]],
 								    [dict(style=FIRST_LETTER, strict=False), [['z'], ['y']]],
 								    [dict(style=FIRST_LETTER, strict=False, heteronym=True),
 								     [['z', 'c'], ['y']]],
 								    # FINALS
 								    [dict(style=FINALS), [['ao'], ['iang']]],
 								    [dict(style=FINALS, heteronym=True), [['ao'], ['iang']]],
 								    [dict(style=FINALS, strict=False), [['ao'], ['ang']]],
 								    [dict(style=FINALS, strict=False, heteronym=True), [['ao'], ['ang']]],
 								    # FINALS_TONE
 								    [dict(style=FINALS_TONE), [['āo'], ['iáng']]],
 								    [dict(style=FINALS_TONE, heteronym=True), [['āo', 'áo'], ['iáng']]],
 								    [dict(style=FINALS_TONE, strict=False), [['āo'], ['áng']]],
 								    [dict(style=FINALS_TONE, strict=False, heteronym=True),
 								     [['āo', 'áo'], ['áng']]],
 								    # FINALS_TONE2
 								    [dict(style=FINALS_TONE2), [['a1o'], ['ia2ng']]],
 								    [dict(style=FINALS_TONE2, heteronym=True), [['a1o', 'a2o'], ['ia2ng']]],
 								    [dict(style=FINALS_TONE2, strict=False), [['a1o'], ['a2ng']]],
 								    [dict(style=FINALS_TONE2, strict=False, heteronym=True),
 								     [['a1o', 'a2o'], ['a2ng']]],
 								    # FINALS_TONE3
 								    [dict(style=FINALS_TONE3), [['ao1'], ['iang2']]],
 								    [dict(style=FINALS_TONE3, heteronym=True), [['ao1', 'ao2'], ['iang2']]],
 								    [dict(style=FINALS_TONE3, strict=False), [['ao1'], ['ang2']]],
 								    [dict(style=FINALS_TONE3, strict=False, heteronym=True),
 								     [['ao1', 'ao2'], ['ang2']]],
 								])
 								def test_heteronym_and_style_phrase(kwargs, result):
 								    """Check pinyin() on the phrase '朝阳' for each option combination."""
 								    phrase = '朝阳'
 								    converted = pinyin(phrase, **kwargs)
 								    assert converted == result
 								def test_m4():
 								    """Readings of 呣 (U+5463: ḿ, móu, m̀) in every style."""
 								    han = '呣'
 								    assert pinyin(han) == [['ḿ']]
 								    assert pinyin(han, heteronym=True) == [['ḿ', 'm̀', 'móu']]
 								    # Expected heteronym output per style, checked in this order.
 								    expected_by_style = (
 								        (NORMAL, [['m', 'mou']]),
 								        (TONE, [['ḿ', 'm̀', 'móu']]),
 								        (TONE2, [['m2', 'm4', 'mo2u']]),
 								        (TONE3, [['m2', 'm4', 'mou2']]),
 								        (INITIALS, [['', 'm']]),  # TODO: fix ''
 								        (FIRST_LETTER, [['m']]),
 								        (FINALS, [['m', 'ou']]),
 								        (FINALS_TONE, [['ḿ', 'm̀', 'óu']]),
 								        (FINALS_TONE2, [['m2', 'm4', 'o2u']]),
 								        (FINALS_TONE3, [['m2', 'm4', 'ou2']]),
 								    )
 								    for style, expected in expected_by_style:
 								        assert pinyin(han, heteronym=True, style=style) == expected
 								@pytest.mark.parametrize('han,style,expect', [
 								    ['呣', Style.TONE, ['ḿ', 'm̀']],
 								    ['呣', Style.TONE2, ['m2', 'm4']],
 								    ['嘸', Style.TONE, ['m̄', 'ḿ']],
 								    ['嘸', Style.TONE2, ['m1', 'm2']],
 								    ['誒', Style.TONE, ['ê̄', 'ế', 'ê̌', 'ề']],
 								    ['誒', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']],
 								])
 								def test_m_e(han, style, expect):
 								    """Every expected m/ê reading appears among the heteronym readings."""
 								    readings = pinyin(han, style=style, heteronym=True)
 								    assert len(readings) == 1
 								    # Subset check: extra readings are allowed, missing ones are not.
 								    assert set(expect).issubset(readings[0])
 								if __name__ == '__main__':
 								    # Run pytest when this file is executed as a script and hand pytest's
 								    # return code to the shell, so a failing run exits non-zero.
 								    # ``pytest`` is already imported at the top of the module.
 								    raise SystemExit(pytest.main())