You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
205 lines
8.4 KiB
205 lines
8.4 KiB
4 years ago
|
|
||
|
"""Tests for the zhon.pinyin module."""
|
||
|
|
||
|
import random
|
||
|
import re
|
||
|
import unittest
|
||
|
|
||
|
from zhon import pinyin
|
||
|
|
||
|
|
||
|
# Sizes of the randomly generated fixtures used by the tests in this module.
# Words: how many random words to test, and how many syllables each one has.
NUM_WORDS, WORD_LENGTH = 50, 4
# Sentences: how many random sentences to test, and how many words each has.
NUM_SENT, SENT_LENGTH = 10, 5
|
||
|
|
||
|
# Toneless pinyin syllables used as test fixtures, including the bare 'r'.
# The order of the entries is significant to nothing but readability; it is
# kept stable, with one paragraph per rhyme group.
VALID_SYLS = (
    'ba', 'pa', 'ma', 'fa', 'da', 'ta', 'na', 'la', 'ga', 'ka', 'ha', 'za',
    'ca', 'sa', 'zha', 'cha', 'sha', 'a',

    'bo', 'po', 'mo', 'fo', 'yo', 'lo', 'o',

    'me', 'de', 'te', 'ne', 'le', 'ge', 'ke', 'he', 'ze', 'ce', 'se',
    'zhe', 'che', 'she', 're', 'e',

    'bai', 'pai', 'mai', 'dai', 'tai', 'nai', 'lai', 'gai', 'kai', 'hai',
    'zai', 'cai', 'sai', 'zhai', 'chai', 'shai', 'ai',

    'bei', 'pei', 'mei', 'fei', 'dei', 'tei', 'nei', 'lei', 'gei', 'kei',
    'hei', 'zei', 'zhei', 'shei', 'ei',

    'bao', 'pao', 'mao', 'dao', 'tao', 'nao', 'lao', 'gao', 'kao', 'hao',
    'zao', 'cao', 'sao', 'zhao', 'chao', 'shao', 'rao', 'ao',

    'pou', 'mou', 'fou', 'dou', 'tou', 'nou', 'lou', 'gou', 'kou', 'hou',
    'zou', 'cou', 'sou', 'zhou', 'chou', 'shou', 'rou', 'ou',

    'ban', 'pan', 'man', 'fan', 'dan', 'tan', 'nan', 'lan', 'gan', 'kan',
    'han', 'zan', 'can', 'san', 'zhan', 'chan', 'shan', 'ran', 'an',

    'bang', 'pang', 'mang', 'fang', 'dang', 'tang', 'nang', 'lang', 'gang',
    'kang', 'hang', 'zang', 'cang', 'sang', 'zhang', 'chang', 'shang',
    'rang', 'ang',

    'ben', 'pen', 'men', 'fen', 'den', 'nen', 'gen', 'ken', 'hen', 'zen',
    'cen', 'sen', 'zhen', 'chen', 'shen', 'ren', 'en',

    'beng', 'peng', 'meng', 'feng', 'deng', 'teng', 'neng', 'leng', 'geng',
    'keng', 'heng', 'zeng', 'ceng', 'seng', 'zheng', 'cheng', 'sheng',
    'reng', 'eng',

    'dong', 'tong', 'nong', 'long', 'gong', 'kong', 'hong', 'zong', 'cong',
    'song', 'zhong', 'chong', 'rong',

    'bu', 'pu', 'mu', 'fu', 'du', 'tu', 'nu', 'lu', 'gu', 'ku', 'hu', 'zu',
    'cu', 'su', 'zhu', 'chu', 'shu', 'ru', 'wu',

    'gua', 'kua', 'hua', 'zhua', 'chua', 'shua', 'rua', 'wa',

    'duo', 'tuo', 'nuo', 'luo', 'guo', 'kuo', 'huo', 'zuo', 'cuo', 'suo',
    'zhuo', 'chuo', 'shuo', 'ruo', 'wo',

    'guai', 'kuai', 'huai', 'zhuai', 'chuai', 'shuai', 'wai',

    'dui', 'tui', 'gui', 'kui', 'hui', 'zui', 'cui', 'sui', 'zhui', 'chui',
    'shui', 'rui', 'wei',

    'duan', 'tuan', 'nuan', 'luan', 'guan', 'kuan', 'huan', 'zuan', 'cuan',
    'suan', 'zhuan', 'chuan', 'shuan', 'ruan', 'wan',

    'guang', 'kuang', 'huang', 'zhuang', 'chuang', 'shuang', 'wang',

    'dun', 'tun', 'nun', 'lun', 'gun', 'kun', 'hun', 'zun', 'cun', 'sun',
    'zhun', 'chun', 'shun', 'run', 'wen',

    'weng',

    'bi', 'pi', 'mi', 'di', 'ti', 'ni', 'li', 'zi', 'ci', 'si', 'zhi', 'chi',
    'shi', 'ri', 'ji', 'qi', 'xi', 'yi',

    'dia', 'lia', 'jia', 'qia', 'xia', 'ya',

    'bie', 'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'jie', 'qie', 'xie',
    'ye',

    'biao', 'piao', 'miao', 'diao', 'tiao', 'niao', 'liao', 'jiao', 'qiao',
    'xiao', 'yao',

    'miu', 'diu', 'niu', 'liu', 'jiu', 'qiu', 'xiu', 'you',

    'bian', 'pian', 'mian', 'dian', 'tian', 'nian', 'lian', 'jian', 'qian',
    'xian', 'yan',

    'niang', 'liang', 'jiang', 'qiang', 'xiang', 'yang',

    'bin', 'pin', 'min', 'nin', 'lin', 'jin', 'qin', 'xin', 'yin',

    'bing', 'ping', 'ming', 'ding', 'ting', 'ning', 'ling', 'jing', 'qing',
    'xing', 'ying',

    'jiong', 'qiong', 'xiong', 'yong',

    'nü', 'lü', 'ju', 'qu', 'xu', 'yu',

    'nüe', 'lüe', 'jue', 'que', 'xue', 'yue',

    'juan', 'quan', 'xuan', 'yuan',

    'jun', 'qun', 'xun', 'yun',

    'er', 'r',
)
|
||
|
|
||
|
# Compile every pattern string taken from zhon.pinyin once, at import time.
# In this module the N_* patterns are exercised with tone-numbered input and
# the A_* patterns with tone-marked (accented) input; the unprefixed ones are
# exercised with both.
SYL, A_SYL, N_SYL = (
    re.compile(pattern)
    for pattern in (pinyin.syllable, pinyin.a_syl, pinyin.n_syl)
)
WORD, N_WORD, A_WORD = (
    re.compile(pattern)
    for pattern in (pinyin.word, pinyin.n_word, pinyin.a_word)
)
SENT, N_SENT, A_SENT = (
    re.compile(pattern)
    for pattern in (pinyin.sentence, pinyin.n_sent, pinyin.a_sent)
)
|
||
|
|
||
|
|
||
|
VOWELS = 'aeiou\u00FC'
|
||
|
VOWEL_MAP = {
|
||
|
'a1': '\u0101', 'a2': '\xe1', 'a3': '\u01ce', 'a4': '\xe0', 'a5': 'a',
|
||
|
'e1': '\u0113', 'e2': '\xe9', 'e3': '\u011b', 'e4': '\xe8', 'e5': 'e',
|
||
|
'i1': '\u012b', 'i2': '\xed', 'i3': '\u01d0', 'i4': '\xec', 'i5': 'i',
|
||
|
'o1': '\u014d', 'o2': '\xf3', 'o3': '\u01d2', 'o4': '\xf2', 'o5': 'o',
|
||
|
'u1': '\u016b', 'u2': '\xfa', 'u3': '\u01d4', 'u4': '\xf9', 'u5': 'u',
|
||
|
'\u00fc1': '\u01d6', '\u00fc2': '\u01d8', '\u00fc3': '\u01da',
|
||
|
'\u00fc4': '\u01dc', '\u00fc5': '\u00fc'
|
||
|
}
|
||
|
|
||
|
|
||
|
def _num_vowel_to_acc(vowel, tone):
|
||
|
"""Convert a numbered vowel to an accented vowel."""
|
||
|
try:
|
||
|
return VOWEL_MAP[vowel + str(tone)]
|
||
|
except IndexError:
|
||
|
raise ValueError("Vowel must be one of '{}' and tone must be an int"
|
||
|
"1-5.".format(VOWELS))
|
||
|
|
||
|
|
||
|
def num_syl_to_acc(syllable):
|
||
|
"""Convert a numbered pinyin syllable to an accented pinyin syllable.
|
||
|
|
||
|
Implements the following algorithm:
|
||
|
1. If the syllable has an 'a' or 'e', put the tone over that vowel.
|
||
|
2. If the syllable has 'ou', place the tone over the 'o'.
|
||
|
3. Otherwise, put the tone on the last vowel.
|
||
|
|
||
|
"""
|
||
|
if syllable.startswith('r') and len(syllable) <= 2:
|
||
|
return 'r' # Special case for 'r' syllable.
|
||
|
if re.search('[{}]'.format(VOWELS), syllable) is None:
|
||
|
return syllable
|
||
|
syl, tone = syllable[:-1], syllable[-1]
|
||
|
if tone not in '12345':
|
||
|
# We did not find a tone number. Abort conversion.
|
||
|
return syl
|
||
|
syl = re.sub('u:|v', '\u00fc', syl)
|
||
|
if 'a' in syl:
|
||
|
return syl.replace('a', _num_vowel_to_acc('a', tone))
|
||
|
elif 'e' in syl:
|
||
|
return syl.replace('e', _num_vowel_to_acc('e', tone))
|
||
|
elif 'ou' in syl:
|
||
|
return syl.replace('o', _num_vowel_to_acc('o', tone))
|
||
|
last_vowel = syl[max(map(syl.rfind, VOWELS))] # Find last vowel index.
|
||
|
return syl.replace(last_vowel, _num_vowel_to_acc(last_vowel, tone))
|
||
|
|
||
|
|
||
|
class TestPinyinSyllables(unittest.TestCase):
    """Run the syllable patterns over every entry of VALID_SYLS."""

    # Do not truncate assertion diffs; the compared lists are long.
    maxDiff = None

    def test_number_syllables(self):
        """SYL and N_SYL find every tone-numbered syllable in a joined run."""
        expected = [syl + str(random.randint(1, 5)) for syl in VALID_SYLS]
        # Syllables that start with a, e or o are written with a leading
        # apostrophe when they are joined onto the preceding text.
        text = ''.join(
            "'" + syl if syl[0] in 'aeo' else syl for syl in expected
        )
        self.assertEqual(SYL.findall(text), expected)
        self.assertEqual(N_SYL.findall(text), expected)

    def test_accent_syllables(self):
        """SYL and A_SYL find every tone-marked syllable in a joined run."""
        expected = []
        pieces = []
        for plain in VALID_SYLS:
            marked = num_syl_to_acc(plain + str(random.randint(1, 5)))
            expected.append(marked)
            # Decide on the apostrophe from the unmarked spelling: the first
            # letter of the marked form may already carry a diacritic.
            pieces.append("'" + marked if plain[0] in 'aeo' else marked)
        text = ''.join(pieces)
        self.assertEqual(SYL.findall(text), expected)
        self.assertEqual(A_SYL.findall(text), expected)
|
||
|
|
||
|
|
||
|
def create_word(accented=False):
    """Build a random pinyin word of WORD_LENGTH syllables.

    Every syllable in VALID_SYLS is first given a random tone (1-5); when
    *accented* is true the numbered forms are converted with num_syl_to_acc.
    Syllables are then drawn at random and joined either with a hyphen or
    directly; a directly joined syllable that starts with a, e or o is
    preceded by an apostrophe.

    """
    numbered = [syl + str(random.randint(1, 5)) for syl in VALID_SYLS]
    if accented:
        pool = [num_syl_to_acc(syl) for syl in numbered]
    else:
        pool = numbered
    top = len(pool) - 1

    parts = [pool[random.randint(0, top)]]
    for _ in range(WORD_LENGTH - 1):
        index = random.randint(0, top)
        use_hyphen = random.randint(0, 1) == 0
        if use_hyphen:
            parts.append('-')
        elif VALID_SYLS[index][0] in 'aeo':
            # No hyphen separates the syllables, so mark the boundary.
            parts.append("'")
        parts.append(pool[index])
    return ''.join(parts)
|
||
|
|
||
|
|
||
|
class TestPinyinWords(unittest.TestCase):
    """Match randomly generated pinyin words against the word patterns."""

    def _assert_whole_match(self, pattern, text):
        """Assert that *pattern* matches from the start of *text* and spans it.

        ``pattern.match`` returns None when there is no match; checking for
        that first turns what would be an AttributeError on ``.group`` into
        an ordinary test failure that names the pattern and the input.

        """
        match = pattern.match(text)
        self.assertIsNotNone(
            match,
            '{!r} did not match {!r}'.format(pattern.pattern, text))
        self.assertEqual(match.group(0), text)

    def test_number_words(self):
        """WORD and N_WORD match whole tone-numbered words."""
        for _ in range(NUM_WORDS):
            word = create_word()
            self._assert_whole_match(WORD, word)
            self._assert_whole_match(N_WORD, word)

    def test_accent_words(self):
        """WORD and A_WORD match whole tone-marked words."""
        for _ in range(NUM_WORDS):
            word = create_word(accented=True)
            self._assert_whole_match(WORD, word)
            self._assert_whole_match(A_WORD, word)
|
||
|
|
||
|
|
||
|
def create_sentence(accented=False):
    """Build a random pinyin sentence of SENT_LENGTH words.

    The words come from create_word (called with the same *accented* flag).
    Each word after the first is preceded by a randomly chosen separator
    (' ', ', ' or '; ') and the sentence is closed with a full stop.

    """
    words = [create_word(accented=accented) for _ in range(SENT_LENGTH)]
    pieces = [words[0]]
    for word in words[1:]:
        pieces.append(random.choice([' ', ', ', '; ']) + word)
    pieces.append('.')
    return ''.join(pieces)
|
||
|
|
||
|
|
||
|
class TestPinyinSentences(unittest.TestCase):
    """Match randomly generated pinyin sentences against the sentence patterns."""

    def _assert_whole_match(self, pattern, text):
        """Assert that *pattern* matches from the start of *text* and spans it.

        ``pattern.match`` returns None when there is no match; checking for
        that first turns what would be an AttributeError on ``.group`` into
        an ordinary test failure that names the pattern and the input.

        """
        match = pattern.match(text)
        self.assertIsNotNone(
            match,
            '{!r} did not match {!r}'.format(pattern.pattern, text))
        self.assertEqual(match.group(0), text)

    def test_number_sentences(self):
        """SENT and N_SENT match whole tone-numbered sentences."""
        for _ in range(NUM_SENT):
            sentence = create_sentence()
            self._assert_whole_match(SENT, sentence)
            self._assert_whole_match(N_SENT, sentence)

    def test_accent_sentences(self):
        """SENT and A_SENT match whole tone-marked sentences."""
        for _ in range(NUM_SENT):
            sentence = create_sentence(accented=True)
            self._assert_whole_match(SENT, sentence)
            self._assert_whole_match(A_SENT, sentence)
|