E2E/Streaming Transformer/Conformer ASR (#578)
* add cmvn and label smoothing loss layer
* add layer for transformer
* add glu and conformer conv
* add torch compatiable hack, mask funcs
* not hack size since it exists
* add test; attention
* add attention, common utils, hack paddle
* add audio utils
* conformer batch padding mask bug fix #223
* fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2
* fix ci
* fix ci
* add encoder
* refactor egs
* add decoder
* refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils
* refactor docs
* add fix
* fix readme
* fix bugs, refactor collator, add pad_sequence, fix ckpt bugs
* fix docstring
* refactor data feed order
* add u2 model
* refactor cmvn, test
* add utils
* add u2 config
* fix bugs
* fix bugs
* fix autograd maybe has problem when using inplace operation
* refactor data, build vocab; add format data
* fix text featurizer
* refactor build vocab
* add fbank, refactor feature of speech
* refactor audio feat
* refactor data preprare
* refactor data
* model init from config
* add u2 bins
* flake8
* can train
* fix bugs, add coverage, add scripts
* test can run
* fix data
* speed perturb with sox
* add spec aug
* fix for train
* fix train logitc
* fix logger
* log valid loss, time dataset process
* using np for speed perturb, remove some debug log of grad clip
* fix logger
* fix build vocab
* fix logger name
* using module logger as default
* fix
* fix install
* reorder imports
* fix board logger
* fix logger
* kaldi fbank and mfcc
* fix cmvn and print prarams
* fix add_eos_sos and cmvn
* fix cmvn compute
* fix logger and cmvn
* fix subsampling, label smoothing loss, remove useless
* add notebook test
* fix log
* fix tb logger
* multi gpu valid
* fix log
* fix log
* fix config
* fix compute cmvn, need paddle 2.1
* add cmvn notebook
* fix layer tools
* fix compute cmvn
* add rtf
* fix decoding
* fix layer tools
* fix log, add avg script
* more avg and test info
* fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh;
* add vimrc
* refactor tiny script, add transformer and stream conf
* spm demo; librisppech scripts and confs
* fix log
* add librispeech scripts
* refactor data pipe; fix conf; fix u2 default params
* fix bugs
* refactor aishell scripts
* fix test
* fix cmvn
* fix s0 scripts
* fix ds2 scripts and bugs
* fix dev & test dataset filter
* fix dataset filter
* filter dev
* fix ckpt path
* filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test
* add comment
* add syllable doc
* fix ds2 configs
* add doc
* add pypinyin tools
* fix decoder using blank_id=0
* mmseg with pybind11
* format code
4 years ago
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
from pypinyin import (pinyin, slug, lazy_pinyin, load_single_dict,
|
|
|
|
|
load_phrases_dict, NORMAL, TONE, TONE2, TONE3, INITIALS,
|
|
|
|
|
FIRST_LETTER, FINALS, FINALS_TONE, FINALS_TONE2,
|
|
|
|
|
FINALS_TONE3, BOPOMOFO, BOPOMOFO_FIRST, CYRILLIC,
|
|
|
|
|
CYRILLIC_FIRST, Style)
|
|
|
|
|
from pypinyin.constants import SUPPORT_UCS4
|
|
|
|
|
from pypinyin.seg import simpleseg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_pinyin_initials():
|
|
|
|
|
"""包含声明和韵母的词语"""
|
|
|
|
|
hans = '中心'
|
|
|
|
|
# 默认风格,带声调
|
|
|
|
|
assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]
|
|
|
|
|
assert pinyin(hans, strict=False) == [['zh\u014dng'], ['x\u012bn']]
|
|
|
|
|
# 普通风格,不带声调
|
|
|
|
|
assert pinyin(hans, NORMAL) == [['zhong'], ['xin']]
|
|
|
|
|
assert pinyin(hans, NORMAL, strict=False) == [['zhong'], ['xin']]
|
|
|
|
|
# 声调风格,拼音声调在韵母第一个字母上
|
|
|
|
|
assert pinyin(hans, TONE) == [['zh\u014dng'], ['x\u012bn']]
|
|
|
|
|
assert pinyin(hans, TONE, strict=False) == [['zh\u014dng'], ['x\u012bn']]
|
|
|
|
|
# 声调风格2,即拼音声调在各个声母之后,用数字 [1-4] 进行表示
|
|
|
|
|
assert pinyin(hans, TONE2) == [['zho1ng'], ['xi1n']]
|
|
|
|
|
assert pinyin(hans, TONE2, strict=False) == [['zho1ng'], ['xi1n']]
|
|
|
|
|
# 声调风格3,即拼音声调在各个拼音之后,用数字 [1-4] 进行表示
|
|
|
|
|
assert pinyin(hans, TONE3) == [['zhong1'], ['xin1']]
|
|
|
|
|
assert pinyin(hans, TONE3, strict=False) == [['zhong1'], ['xin1']]
|
|
|
|
|
# 声母风格,只返回各个拼音的声母部分
|
|
|
|
|
assert pinyin(hans, INITIALS) == [['zh'], ['x']]
|
|
|
|
|
assert pinyin(hans, INITIALS, strict=False) == [['zh'], ['x']]
|
|
|
|
|
# 首字母风格,只返回拼音的首字母部分
|
|
|
|
|
assert pinyin(hans, FIRST_LETTER) == [['z'], ['x']]
|
|
|
|
|
assert pinyin(hans, FIRST_LETTER, strict=False) == [['z'], ['x']]
|
|
|
|
|
# 注音风格,带声调
|
|
|
|
|
assert pinyin(hans, BOPOMOFO) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
|
|
|
|
|
assert pinyin(hans, BOPOMOFO, strict=False) == [['ㄓㄨㄥ'], ['ㄒㄧㄣ']]
|
|
|
|
|
# 注音风格,首字母
|
|
|
|
|
assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄓ'], ['ㄒ']]
|
|
|
|
|
assert pinyin(hans, BOPOMOFO_FIRST, strict=False) == [['ㄓ'], ['ㄒ']]
|
|
|
|
|
# test CYRILLIC style
|
|
|
|
|
assert pinyin(hans, CYRILLIC) == [['чжун1'], ['синь1']]
|
|
|
|
|
assert pinyin(hans, CYRILLIC, strict=False) == [['чжун1'], ['синь1']]
|
|
|
|
|
# CYRILLIC_FIRST style return only first letters
|
|
|
|
|
assert pinyin(hans, CYRILLIC_FIRST) == [['ч'], ['с']]
|
|
|
|
|
assert pinyin(hans, CYRILLIC_FIRST, strict=False) == [['ч'], ['с']]
|
|
|
|
|
# 启用多音字模式
|
|
|
|
|
assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
|
|
|
|
|
['x\u012bn']]
|
|
|
|
|
assert pinyin(hans, heteronym=True, strict=False) == \
|
|
|
|
|
[['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]
|
|
|
|
|
# 韵母风格1,只返回各个拼音的韵母部分,不带声调
|
|
|
|
|
assert pinyin(hans, style=FINALS) == [['ong'], ['in']]
|
|
|
|
|
assert pinyin(hans, style=FINALS, strict=False) == [['ong'], ['in']]
|
|
|
|
|
# 韵母风格2,带声调,声调在韵母第一个字母上
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE) == [['\u014dng'], ['\u012bn']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE, strict=False) == \
|
|
|
|
|
[['\u014dng'], ['\u012bn']]
|
|
|
|
|
# 韵母风格2,带声调,声调在各个声母之后,用数字 [1-4] 进行表示
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE2) == [['o1ng'], ['i1n']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE2, strict=False) == \
|
|
|
|
|
[['o1ng'], ['i1n']]
|
|
|
|
|
# 韵母风格3,带声调,声调在各个拼音之后,用数字 [1-4] 进行表示
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE3) == [['ong1'], ['in1']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE3, strict=False) == \
|
|
|
|
|
[['ong1'], ['in1']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_pinyin_finals():
|
|
|
|
|
"""只包含韵母的词语"""
|
|
|
|
|
hans = '嗷嗷'
|
|
|
|
|
assert pinyin(hans) == [['\xe1o'], ['\xe1o']]
|
|
|
|
|
assert pinyin(hans + 'abc') == [['\xe1o'], ['\xe1o'], ['abc']]
|
|
|
|
|
assert pinyin(hans, NORMAL) == [['ao'], ['ao']]
|
|
|
|
|
assert pinyin(hans, TONE) == [['\xe1o'], ['\xe1o']]
|
|
|
|
|
assert pinyin(hans, TONE2) == [['a2o'], ['a2o']]
|
|
|
|
|
assert pinyin(hans, TONE3) == [['ao2'], ['ao2']]
|
|
|
|
|
assert pinyin(hans, INITIALS) == [[''], ['']]
|
|
|
|
|
assert pinyin(hans, FIRST_LETTER) == [['a'], ['a']]
|
|
|
|
|
assert pinyin(hans, BOPOMOFO) == [['ㄠˊ'], ['ㄠˊ']]
|
|
|
|
|
assert pinyin(hans, BOPOMOFO_FIRST) == [['ㄠ'], ['ㄠ']]
|
|
|
|
|
assert pinyin(hans, CYRILLIC) == [['ао2'], ['ао2']]
|
|
|
|
|
assert pinyin(hans, CYRILLIC_FIRST) == [['а'], ['а']]
|
|
|
|
|
assert pinyin(hans, heteronym=True) == [['\xe1o'], ['\xe1o']]
|
|
|
|
|
assert pinyin('啊', heteronym=True) == \
|
|
|
|
|
[['a', 'ā', 'á', 'ǎ', 'à', 'è']]
|
|
|
|
|
assert pinyin(hans, style=FINALS) == [['ao'], ['ao']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE) == [['\xe1o'], ['\xe1o']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE2) == [['a2o'], ['a2o']]
|
|
|
|
|
assert pinyin(hans, style=FINALS_TONE3) == [['ao2'], ['ao2']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_slug():
|
|
|
|
|
hans = '中心'
|
|
|
|
|
assert slug(hans) == 'zhong-xin'
|
|
|
|
|
assert slug(hans, heteronym=True) == 'zhong-xin'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_zh_and_en():
|
|
|
|
|
"""中英文混合的情况"""
|
|
|
|
|
# 中英文
|
|
|
|
|
hans = '中心'
|
|
|
|
|
assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn'], ['abc']]
|
|
|
|
|
# 中英文混合的固定词组
|
|
|
|
|
assert pinyin('黄山B股', style=TONE2) == \
|
|
|
|
|
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
|
|
|
|
|
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
|
|
|
|
|
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
|
|
|
|
|
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
|
|
|
|
|
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
|
|
|
|
|
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
|
|
|
|
|
assert pinyin('维生素C', style=TONE2) == \
|
|
|
|
|
[['we2i'], ['she1ng'], ['su4'], ['C']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_others():
|
|
|
|
|
# 空字符串
|
|
|
|
|
assert pinyin('') == []
|
|
|
|
|
# 单个汉字
|
|
|
|
|
assert pinyin('營') == [['y\xedng']]
|
|
|
|
|
# 中国 人
|
|
|
|
|
assert pinyin('中国人') == [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]
|
|
|
|
|
# 日文
|
|
|
|
|
assert pinyin('の') == [['\u306e']]
|
|
|
|
|
# 没有读音的汉字,还不存在的汉字
|
|
|
|
|
assert pinyin('\u9fff') == [['\u9fff']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_lazy_pinyin():
|
|
|
|
|
assert lazy_pinyin('中国人') == ['zhong', 'guo', 'ren']
|
|
|
|
|
assert lazy_pinyin('中心') == ['zhong', 'xin']
|
|
|
|
|
assert lazy_pinyin('中心', style=TONE) == ['zh\u014dng', 'x\u012bn']
|
|
|
|
|
assert lazy_pinyin('中心', style=INITIALS) == ['zh', 'x']
|
|
|
|
|
assert lazy_pinyin('中心', style=BOPOMOFO) == ['ㄓㄨㄥ', 'ㄒㄧㄣ']
|
|
|
|
|
assert lazy_pinyin('中心', style=CYRILLIC) == ['чжун1', 'синь1']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_seg():
|
|
|
|
|
hans = '音乐'
|
|
|
|
|
hans_seg = list(simpleseg(hans))
|
|
|
|
|
assert pinyin(hans_seg, style=TONE2) == [['yi1n'], ['yue4']]
|
|
|
|
|
# 中英文混合的固定词组
|
|
|
|
|
assert pinyin('黄山B股', style=TONE2) == \
|
|
|
|
|
[['hua2ng'], ['sha1n'], ['B'], ['gu3']]
|
|
|
|
|
assert pinyin('A股', style=TONE2) == [['A'], ['gu3']]
|
|
|
|
|
assert pinyin('阿Q', style=TONE2) == [['a1'], ['Q']]
|
|
|
|
|
assert pinyin('B超', style=TONE2) == [['B'], ['cha1o']]
|
|
|
|
|
assert pinyin('AB超C', style=TONE2) == [['AB'], ['cha1o'], ['C']]
|
|
|
|
|
assert pinyin('AB阿C', style=TONE2) == [['AB'], ['a1'], ['C']]
|
|
|
|
|
assert pinyin('维生素C', style=TONE2) == \
|
|
|
|
|
[['we2i'], ['she1ng'], ['su4'], ['C']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_custom_pinyin_dict():
|
|
|
|
|
hans = '桔'
|
|
|
|
|
try:
|
|
|
|
|
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
|
|
|
|
|
except AssertionError:
|
|
|
|
|
pass
|
|
|
|
|
load_single_dict({ord('桔'): 'jú,jié'})
|
|
|
|
|
assert lazy_pinyin(hans, style=TONE2) == ['ju2']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_custom_pinyin_dict2():
|
|
|
|
|
hans = ['同行']
|
|
|
|
|
try:
|
|
|
|
|
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'ha2ng']
|
|
|
|
|
except AssertionError:
|
|
|
|
|
pass
|
|
|
|
|
load_phrases_dict({'同行': [['tóng'], ['xíng']]})
|
|
|
|
|
assert lazy_pinyin(hans, style=TONE2) == ['to2ng', 'xi2ng']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_custom_pinyin_dict_tone2():
|
|
|
|
|
load_single_dict({ord('桔'): 'ce4,si4'}, style='tone2')
|
|
|
|
|
assert lazy_pinyin('桔', style=TONE2) == ['ce4']
|
|
|
|
|
assert pinyin('桔') == [['cè']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_custom_pinyin_dict2_tone2():
|
|
|
|
|
load_phrases_dict({'同行': [['to4ng'], ['ku1']]}, style='tone2')
|
|
|
|
|
assert lazy_pinyin(['同行'], style=TONE2) == ['to4ng', 'ku1']
|
|
|
|
|
assert pinyin('同行') == [['tòng'], ['kū']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# yapf: disable
|
|
|
|
|
def test_errors():
|
|
|
|
|
hans = (
|
|
|
|
|
('啊', {'style': TONE2}, [['a']]),
|
|
|
|
|
('啊a', {'style': TONE2}, [['a'], ['a']]),
|
|
|
|
|
# 非中文字符,没有拼音
|
|
|
|
|
('⺁', {'style': TONE2}, [['\u2e81']]),
|
|
|
|
|
('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
|
|
|
|
|
('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
|
|
|
|
|
('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
|
|
|
|
|
('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
|
|
|
|
|
[['a'], ['a']]),
|
|
|
|
|
('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
|
|
|
|
|
[['a'], ['b']]),
|
|
|
|
|
('⺁⺁', {'style': TONE2, 'heteronym': True,
|
|
|
|
|
'errors': lambda x: [['a', 'b'], ['b', 'c']]},
|
|
|
|
|
[['a', 'b'], ['b', 'c']]),
|
|
|
|
|
# 中文字符,没有拼音
|
|
|
|
|
('鿅', {'style': TONE2}, [['\u9fc5']]),
|
|
|
|
|
('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
|
|
|
|
|
('鿅', {'style': TONE2, 'errors': '233'}, []),
|
|
|
|
|
('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
|
|
|
|
|
('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
|
|
|
|
|
('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
|
|
|
|
|
('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
|
|
|
|
|
[['a'], ['a']]),
|
|
|
|
|
('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
|
|
|
|
|
[['a'], ['a']]),
|
|
|
|
|
('鿅鿅', {'style': TONE2, 'heteronym': True,
|
|
|
|
|
'errors': lambda x: [['a', 'b']]},
|
|
|
|
|
[['a', 'b'], ['a', 'b']]),
|
|
|
|
|
)
|
|
|
|
|
for han in hans:
|
|
|
|
|
assert pinyin(han[0], **han[1]) == han[2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_errors_callable():
|
|
|
|
|
def foobar(chars):
|
|
|
|
|
return 'a' * len(chars)
|
|
|
|
|
|
|
|
|
|
class Foobar(object):
|
|
|
|
|
def __call__(self, chars):
|
|
|
|
|
return 'a' * len(chars)
|
|
|
|
|
|
|
|
|
|
n = 5
|
|
|
|
|
assert pinyin('あ' * n, errors=foobar) == [['a' * n]]
|
|
|
|
|
assert pinyin('あ' * n, errors=Foobar()) == [['a' * n]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_simple_seg():
|
|
|
|
|
data = {
|
|
|
|
|
'北京abcc': 'be3i ji1ng abcc',
|
|
|
|
|
'你好にほんごРусский язык': 'ni3 ha3o にほんごРусский язык',
|
|
|
|
|
}
|
|
|
|
|
for h, p in data.items():
|
|
|
|
|
assert slug([h], style=TONE2, separator=' ') == p
|
|
|
|
|
|
|
|
|
|
hans = '你好にほんごРусский язык'
|
|
|
|
|
ret = 'ni3 ha3o'
|
|
|
|
|
assert slug(hans, style=TONE2, separator=' ', errors=lambda x: None) == ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_for_update = [
|
|
|
|
|
# 便宜的发音
|
|
|
|
|
[
|
|
|
|
|
['便宜'], {'style': TONE2}, ['pia2n', 'yi2']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['便宜从事'], {'style': TONE2}, ['bia4n', 'yi2', 'co2ng', 'shi4']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['便宜施行'], {'style': TONE2}, ['bia4n', 'yi2', 'shi1', 'xi2ng']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['便宜货'], {'style': TONE2}, ['pia2n', 'yi2', 'huo4']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['贪便宜'], {'style': TONE2}, ['ta1n', 'pia2n', 'yi2']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['讨便宜'], {'style': TONE2}, ['ta3o', 'pia2n', 'yi2']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['小便宜'], {'style': TONE2}, ['xia3o', 'pia2n', 'yi2']
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
['占便宜'], {'style': TONE2}, ['zha4n', 'pia2n', 'yi2']
|
|
|
|
|
],
|
|
|
|
|
#
|
|
|
|
|
[
|
|
|
|
|
'\u3400', {'style': TONE2}, ['qiu1'], # CJK 扩展 A:[3400-4DBF]
|
|
|
|
|
],
|
|
|
|
|
[
|
|
|
|
|
'\u4E00', {'style': TONE2}, ['yi1'], # CJK 基本:[4E00-9FFF]
|
|
|
|
|
],
|
|
|
|
|
# [
|
|
|
|
|
# '\uFA29', {'style': TONE2}, ['da3o'], # CJK 兼容:[F900-FAFF]
|
|
|
|
|
# ],
|
|
|
|
|
# 误把 yu 放到声母列表了
|
|
|
|
|
['鱼', {'style': TONE2}, ['yu2']],
|
|
|
|
|
['鱼', {'style': FINALS}, ['v']],
|
|
|
|
|
['鱼', {'style': BOPOMOFO}, ['ㄩˊ']],
|
|
|
|
|
['鱼', {'style': CYRILLIC}, ['юй']],
|
|
|
|
|
['雨', {'style': TONE2}, ['yu3']],
|
|
|
|
|
['雨', {'style': FINALS}, ['v']],
|
|
|
|
|
['雨', {'style': BOPOMOFO}, ['ㄩˇ']],
|
|
|
|
|
['雨', {'style': CYRILLIC}, ['юй']],
|
|
|
|
|
['元', {'style': TONE2}, ['yua2n']],
|
|
|
|
|
['元', {'style': FINALS}, ['van']],
|
|
|
|
|
['元', {'style': BOPOMOFO}, ['ㄩㄢˊ']],
|
|
|
|
|
['元', {'style': CYRILLIC}, ['юань2']],
|
|
|
|
|
# y, w 也不是拼音, yu的韵母是v, yi的韵母是i, wu的韵母是u
|
|
|
|
|
['呀', {'style': INITIALS}, ['']],
|
|
|
|
|
['呀', {'style': TONE2}, ['ya']],
|
|
|
|
|
['呀', {'style': FINALS}, ['ia']],
|
|
|
|
|
['呀', {'style': BOPOMOFO}, ['ㄧㄚ˙']],
|
|
|
|
|
['呀', {'style': CYRILLIC}, ['я']],
|
|
|
|
|
['无', {'style': INITIALS}, ['']],
|
|
|
|
|
['无', {'style': TONE2}, ['wu2']],
|
|
|
|
|
['无', {'style': FINALS}, ['u']],
|
|
|
|
|
['无', {'style': FINALS_TONE}, ['ú']],
|
|
|
|
|
['无', {'style': BOPOMOFO}, ['ㄨˊ']],
|
|
|
|
|
['无', {'style': CYRILLIC}, ['у2']],
|
|
|
|
|
['衣', {'style': TONE2}, ['yi1']],
|
|
|
|
|
['衣', {'style': FINALS}, ['i']],
|
|
|
|
|
['衣', {'style': BOPOMOFO}, ['ㄧ']],
|
|
|
|
|
['衣', {'style': CYRILLIC}, ['и1']],
|
|
|
|
|
['万', {'style': TONE2}, ['wa4n']],
|
|
|
|
|
['万', {'style': FINALS}, ['uan']],
|
|
|
|
|
['万', {'style': BOPOMOFO}, ['ㄨㄢˋ']],
|
|
|
|
|
['万', {'style': CYRILLIC}, ['вань4']],
|
|
|
|
|
# ju, qu, xu 的韵母应该是 v
|
|
|
|
|
['具', {'style': FINALS_TONE}, ['ǜ']],
|
|
|
|
|
['具', {'style': FINALS_TONE2}, ['v4']],
|
|
|
|
|
['具', {'style': FINALS}, ['v']],
|
|
|
|
|
['具', {'style': BOPOMOFO}, ['ㄐㄩˋ']],
|
|
|
|
|
['具', {'style': CYRILLIC}, ['цзюй4']],
|
|
|
|
|
['取', {'style': FINALS_TONE}, ['ǚ']],
|
|
|
|
|
['取', {'style': FINALS_TONE2}, ['v3']],
|
|
|
|
|
['取', {'style': FINALS}, ['v']],
|
|
|
|
|
['取', {'style': BOPOMOFO}, ['ㄑㄩˇ']],
|
|
|
|
|
['取', {'style': CYRILLIC}, ['цюй3']],
|
|
|
|
|
['徐', {'style': FINALS_TONE}, ['ǘ']],
|
|
|
|
|
['徐', {'style': FINALS_TONE2}, ['v2']],
|
|
|
|
|
['徐', {'style': FINALS}, ['v']],
|
|
|
|
|
['徐', {'style': BOPOMOFO}, ['ㄒㄩˊ']],
|
|
|
|
|
['徐', {'style': CYRILLIC}, ['сюй2']],
|
|
|
|
|
# ń
|
|
|
|
|
['嗯', {'style': NORMAL}, ['n']],
|
|
|
|
|
['嗯', {'style': TONE}, ['ń']],
|
|
|
|
|
['嗯', {'style': TONE2}, ['n2']],
|
|
|
|
|
['嗯', {'style': INITIALS}, ['']],
|
|
|
|
|
['嗯', {'style': FIRST_LETTER}, ['n']],
|
|
|
|
|
['嗯', {'style': FINALS}, ['n']],
|
|
|
|
|
['嗯', {'style': FINALS_TONE}, ['ń']],
|
|
|
|
|
['嗯', {'style': FINALS_TONE2}, ['n2']],
|
|
|
|
|
['嗯', {'style': BOPOMOFO}, ['ㄣˊ']],
|
|
|
|
|
['嗯', {'style': CYRILLIC}, ['н2']],
|
|
|
|
|
# ḿ \u1e3f U+1E3F
|
|
|
|
|
['呣', {'style': NORMAL}, ['m']],
|
|
|
|
|
['呣', {'style': TONE}, ['ḿ']],
|
|
|
|
|
['呣', {'style': TONE2}, ['m2']],
|
|
|
|
|
['呣', {'style': INITIALS}, ['']],
|
|
|
|
|
['呣', {'style': FIRST_LETTER}, ['m']],
|
|
|
|
|
['呣', {'style': FINALS}, ['m']],
|
|
|
|
|
['呣', {'style': FINALS_TONE}, ['ḿ']],
|
|
|
|
|
['呣', {'style': FINALS_TONE2}, ['m2']],
|
|
|
|
|
['呣', {'style': BOPOMOFO}, ['ㄇㄨˊ']],
|
|
|
|
|
['呣', {'style': CYRILLIC}, ['м2']],
|
|
|
|
|
# 41
|
|
|
|
|
['彷徨', {}, ['pang', 'huang']],
|
|
|
|
|
['彷徨', {'style': CYRILLIC}, ['пан2', 'хуан2']],
|
|
|
|
|
# 注音
|
|
|
|
|
['打量', {'style': BOPOMOFO}, ['ㄉㄚˇ', 'ㄌㄧㄤˋ']],
|
|
|
|
|
['黄山b股', {'style': BOPOMOFO}, ['ㄏㄨㄤˊ', 'ㄕㄢ', 'b', 'ㄍㄨˇ']],
|
|
|
|
|
['打量', {'style': CYRILLIC}, ['да3', 'лян4']],
|
|
|
|
|
['黄山b股', {'style': CYRILLIC}, ['хуан2', 'шань1', 'b', 'гу3']],
|
|
|
|
|
# 50
|
|
|
|
|
['打量', {'style': TONE2}, ['da3', 'lia4ng']],
|
|
|
|
|
['打量', {'style': TONE3}, ['da3', 'liang4']],
|
|
|
|
|
['侵略', {'style': TONE2}, ['qi1n', 'lve4']],
|
|
|
|
|
['侵略', {'style': TONE3}, ['qin1', 'lve4']],
|
|
|
|
|
['侵略', {'style': FINALS_TONE2}, ['i1n', 've4']],
|
|
|
|
|
['侵略', {'style': FINALS_TONE3}, ['in1', 've4']],
|
|
|
|
|
['侵略', {'style': BOPOMOFO}, ['ㄑㄧㄣ', 'ㄌㄩㄝˋ']],
|
|
|
|
|
['侵略', {'style': CYRILLIC}, ['цинь1', 'люэ4']],
|
|
|
|
|
['〇', {'style': TONE}, ['líng']],
|
|
|
|
|
# 二次分词
|
|
|
|
|
[['你要', '重新考虑OK'], {'style': TONE}, [
|
|
|
|
|
'nǐ', 'yào', 'chóng', 'xīn', 'kǎo', 'lǜ', 'OK']],
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('hans, kwargs, result', data_for_update)
|
|
|
|
|
def test_update(hans, kwargs, result):
|
|
|
|
|
assert lazy_pinyin(hans, **kwargs) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(not SUPPORT_UCS4, reason='dont support ucs4')
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
|
'han, result', [
|
|
|
|
|
['\U00020000', ['he']], # CJK 扩展 B:[20000-2A6DF]
|
|
|
|
|
['\U0002A79D', ['duo']], # CJK 扩展 C:[2A700-2B73F]
|
|
|
|
|
# ['\U0002B740', ['wu']], # CJK 扩展 D:[2B740-2B81D]
|
|
|
|
|
# ['\U0002F80A', ['seng']], # CJK 兼容扩展:[2F800-2FA1F]
|
|
|
|
|
]
|
|
|
|
|
)
|
|
|
|
|
def test_support_ucs4(han, result):
|
|
|
|
|
assert lazy_pinyin(han) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skipif(SUPPORT_UCS4, reason='support ucs4')
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
|
'han', [
|
|
|
|
|
'\U00020000', # CJK 扩展 B:[20000-2A6DF]
|
|
|
|
|
'\U0002A79D', # CJK 扩展 C:[2A700-2B73F]
|
|
|
|
|
# '\U0002B740', # CJK 扩展 D:[2B740-2B81D]
|
|
|
|
|
# '\U0002F80A', # CJK 兼容扩展:[2F800-2FA1F]
|
|
|
|
|
]
|
|
|
|
|
)
|
|
|
|
|
def test_dont_support_ucs4(han):
|
|
|
|
|
assert pinyin(han) == [[han]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_36():
|
|
|
|
|
hans = '两年前七斤喝醉了酒'
|
|
|
|
|
pys = ['liang', 'nian', 'qian', 'qi', 'jin', 'he', 'zui', 'le', 'jiu']
|
|
|
|
|
assert lazy_pinyin(hans) == pys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_with_unknown_style():
|
|
|
|
|
assert lazy_pinyin('中国') == ['zhong', 'guo']
|
|
|
|
|
assert lazy_pinyin('中国', style='unknown') == ['zhōng', 'guó']
|
|
|
|
|
assert pinyin('中国') == [['zhōng'], ['guó']]
|
|
|
|
|
assert pinyin('中国', style='unknown') == [['zhōng'], ['guó']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('kwargs,result', [
|
|
|
|
|
[{}, [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
|
|
|
|
|
[dict(strict=False), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
|
|
|
|
|
[dict(style=NORMAL), [['zhong'], ['xin']]],
|
|
|
|
|
[dict(style=NORMAL, strict=False), [['zhong'], ['xin']]],
|
|
|
|
|
[dict(style=TONE), [['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
|
|
|
|
|
[dict(style=TONE, strict=False), [
|
|
|
|
|
['zh\u014dng', 'zh\xf2ng'], ['x\u012bn']]],
|
|
|
|
|
[dict(style=TONE2), [['zho1ng', 'zho4ng'], ['xi1n']]],
|
|
|
|
|
[dict(style=TONE2, strict=False), [['zho1ng', 'zho4ng'], ['xi1n']]],
|
|
|
|
|
[dict(style=TONE3), [['zhong1', 'zhong4'], ['xin1']]],
|
|
|
|
|
[dict(style=TONE3, strict=False), [['zhong1', 'zhong4'], ['xin1']]],
|
|
|
|
|
[dict(style=INITIALS), [['zh'], ['x']]],
|
|
|
|
|
[dict(style=INITIALS, strict=False), [['zh'], ['x']]],
|
|
|
|
|
[dict(style=FIRST_LETTER), [['z'], ['x']]],
|
|
|
|
|
[dict(style=FIRST_LETTER, strict=False), [['z'], ['x']]],
|
|
|
|
|
[dict(style=FINALS), [['ong'], ['in']]],
|
|
|
|
|
[dict(style=FINALS, strict=False), [['ong'], ['in']]],
|
|
|
|
|
[dict(style=FINALS_TONE), [['\u014dng', '\xf2ng'], ['\u012bn']]],
|
|
|
|
|
[dict(style=FINALS_TONE, strict=False), [
|
|
|
|
|
['\u014dng', '\xf2ng'], ['\u012bn']]],
|
|
|
|
|
[dict(style=FINALS_TONE2), [['o1ng', 'o4ng'], ['i1n']]],
|
|
|
|
|
[dict(style=FINALS_TONE2, strict=False), [['o1ng', 'o4ng'], ['i1n']]],
|
|
|
|
|
[dict(style=FINALS_TONE3), [['ong1', 'ong4'], ['in1']]],
|
|
|
|
|
[dict(style=FINALS_TONE3, strict=False), [['ong1', 'ong4'], ['in1']]],
|
|
|
|
|
])
|
|
|
|
|
def test_heteronym_and_style(kwargs, result):
|
|
|
|
|
hans = '中心'
|
|
|
|
|
kwargs['heteronym'] = True
|
|
|
|
|
assert pinyin(hans, **kwargs) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('kwargs,result', [
|
|
|
|
|
[{}, [['zhāo'], ['yáng']]],
|
|
|
|
|
[dict(heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
|
|
|
|
|
[dict(strict=False), [['zhāo'], ['yáng']]],
|
|
|
|
|
[dict(strict=False, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
|
|
|
|
|
[dict(style=NORMAL), [['zhao'], ['yang']]],
|
|
|
|
|
[dict(style=NORMAL, heteronym=True), [['zhao', 'chao'], ['yang']]],
|
|
|
|
|
[dict(style=NORMAL, strict=False), [['zhao'], ['yang']]],
|
|
|
|
|
[dict(style=NORMAL, strict=False, heteronym=True), [['zhao', 'chao'],
|
|
|
|
|
['yang']]],
|
|
|
|
|
[dict(style=TONE), [['zhāo'], ['yáng']]],
|
|
|
|
|
[dict(style=TONE, heteronym=True), [['zhāo', 'cháo'], ['yáng']]],
|
|
|
|
|
[dict(style=TONE, strict=False), [['zhāo'], ['yáng']]],
|
|
|
|
|
[dict(style=TONE, strict=False, heteronym=True), [['zhāo', 'cháo'],
|
|
|
|
|
['yáng']]],
|
|
|
|
|
[dict(style=TONE2), [['zha1o'], ['ya2ng']]],
|
|
|
|
|
[dict(style=TONE2, heteronym=True), [['zha1o', 'cha2o'], ['ya2ng']]],
|
|
|
|
|
[dict(style=TONE2, strict=False), [['zha1o'], ['ya2ng']]],
|
|
|
|
|
[dict(style=TONE2, strict=False, heteronym=True), [['zha1o', 'cha2o'],
|
|
|
|
|
['ya2ng']]],
|
|
|
|
|
[dict(style=TONE3), [['zhao1'], ['yang2']]],
|
|
|
|
|
[dict(style=TONE3, heteronym=True), [['zhao1', 'chao2'], ['yang2']]],
|
|
|
|
|
[dict(style=TONE3, strict=False), [['zhao1'], ['yang2']]],
|
|
|
|
|
[dict(style=TONE3, strict=False, heteronym=True), [['zhao1', 'chao2'],
|
|
|
|
|
['yang2']]],
|
|
|
|
|
[dict(style=INITIALS), [['zh'], ['']]],
|
|
|
|
|
[dict(style=INITIALS, heteronym=True), [['zh', 'ch'], ['']]],
|
|
|
|
|
[dict(style=INITIALS, strict=False), [['zh'], ['y']]],
|
|
|
|
|
[dict(style=INITIALS, strict=False, heteronym=True), [['zh', 'ch'],
|
|
|
|
|
['y']]],
|
|
|
|
|
[dict(style=FIRST_LETTER), [['z'], ['y']]],
|
|
|
|
|
[dict(style=FIRST_LETTER, heteronym=True), [['z', 'c'], ['y']]],
|
|
|
|
|
[dict(style=FIRST_LETTER, strict=False), [['z'], ['y']]],
|
|
|
|
|
[dict(style=FIRST_LETTER, strict=False, heteronym=True), [['z', 'c'],
|
|
|
|
|
['y']]],
|
|
|
|
|
[dict(style=FINALS), [['ao'], ['iang']]],
|
|
|
|
|
[dict(style=FINALS, heteronym=True), [['ao'], ['iang']]],
|
|
|
|
|
[dict(style=FINALS, strict=False), [['ao'], ['ang']]],
|
|
|
|
|
[dict(style=FINALS, strict=False, heteronym=True), [['ao'], ['ang']]],
|
|
|
|
|
[dict(style=FINALS_TONE), [['āo'], ['iáng']]],
|
|
|
|
|
[dict(style=FINALS_TONE, heteronym=True), [['āo', 'áo'], ['iáng']]],
|
|
|
|
|
[dict(style=FINALS_TONE, strict=False), [['āo'], ['áng']]],
|
|
|
|
|
[dict(style=FINALS_TONE, strict=False, heteronym=True), [['āo', 'áo'],
|
|
|
|
|
['áng']]],
|
|
|
|
|
[dict(style=FINALS_TONE2), [['a1o'], ['ia2ng']]],
|
|
|
|
|
[dict(style=FINALS_TONE2, heteronym=True), [['a1o', 'a2o'], ['ia2ng']]],
|
|
|
|
|
[dict(style=FINALS_TONE2, strict=False), [['a1o'], ['a2ng']]],
|
|
|
|
|
[dict(style=FINALS_TONE2, strict=False, heteronym=True), [['a1o', 'a2o'],
|
|
|
|
|
['a2ng']]],
|
|
|
|
|
[dict(style=FINALS_TONE3), [['ao1'], ['iang2']]],
|
|
|
|
|
[dict(style=FINALS_TONE3, heteronym=True), [['ao1', 'ao2'], ['iang2']]],
|
|
|
|
|
[dict(style=FINALS_TONE3, strict=False), [['ao1'], ['ang2']]],
|
|
|
|
|
[dict(style=FINALS_TONE3, strict=False, heteronym=True), [['ao1', 'ao2'],
|
|
|
|
|
['ang2']]],
|
|
|
|
|
])
|
|
|
|
|
def test_heteronym_and_style_phrase(kwargs, result):
|
|
|
|
|
hans = '朝阳'
|
|
|
|
|
assert pinyin(hans, **kwargs) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_m4():
|
|
|
|
|
# U+5463: ḿ,móu,m̀ # 呣
|
|
|
|
|
han = '呣'
|
|
|
|
|
assert pinyin(han) == [['ḿ']]
|
|
|
|
|
assert pinyin(han, heteronym=True) == [['ḿ', 'm̀', 'móu']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=NORMAL) == [['m', 'mou']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=TONE) == [['ḿ', 'm̀', 'móu']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=TONE2) == [['m2', 'm4', 'mo2u']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=TONE3) == [['m2', 'm4', 'mou2']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=INITIALS) == [['', 'm']] # TODO: fix ''
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=FIRST_LETTER) == [['m']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=FINALS) == [['m', 'ou']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=FINALS_TONE) == [['ḿ', 'm̀', 'óu']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=FINALS_TONE2) == [['m2', 'm4', 'o2u']]
|
|
|
|
|
assert pinyin(
|
|
|
|
|
han, heteronym=True, style=FINALS_TONE3) == [['m2', 'm4', 'ou2']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('han,style,expect', [
|
|
|
|
|
['呣', Style.TONE, ['ḿ', 'm̀']],
|
|
|
|
|
['呣', Style.TONE2, ['m2', 'm4']],
|
|
|
|
|
['嘸', Style.TONE, ['m̄', 'ḿ']],
|
|
|
|
|
['嘸', Style.TONE2, ['m1', 'm2']],
|
|
|
|
|
['誒', Style.TONE, ['ê̄', 'ế', 'ê̌', 'ề']],
|
|
|
|
|
['誒', Style.TONE2, ['ê1', 'ê2', 'ê3', 'ê4']],
|
|
|
|
|
])
|
|
|
|
|
def test_m_e(han, style, expect):
|
|
|
|
|
result = pinyin(han, style=style, heteronym=True)
|
|
|
|
|
assert len(result) == 1
|
|
|
|
|
assert (set(result[0]) & set(expect)) == set(expect)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
import pytest
|
|
|
|
|
pytest.cmdline.main()
|