You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/zhon/tests/test-pinyin.py

205 lines
8.4 KiB

"""Tests for the zhon.pinyin module."""
import random
import re
import unittest
from zhon import pinyin
NUM_WORDS = 50 # Number of random words to test
WORD_LENGTH = 4 # Length of random words (number of syllables)
NUM_SENT = 10 # Number of random sentences to test
SENT_LENGTH = 5 # Length of random sentences (number of words)
VALID_SYLS = ( # 411 total syllables, including 'r'
'ba', 'pa', 'ma', 'fa', 'da', 'ta', 'na', 'la', 'ga', 'ka', 'ha', 'za',
'ca', 'sa', 'zha', 'cha', 'sha', 'a', 'bo', 'po', 'mo', 'fo', 'yo', 'lo',
'o', 'me', 'de', 'te', 'ne', 'le', 'ge', 'ke', 'he', 'ze', 'ce', 'se',
'zhe', 'che', 'she', 're', 'e', 'bai', 'pai', 'mai', 'dai', 'tai',
'nai', 'lai', 'gai', 'kai', 'hai', 'zai', 'cai', 'sai', 'zhai', 'chai',
'shai', 'ai', 'bei', 'pei', 'mei', 'fei', 'dei', 'tei', 'nei', 'lei',
'gei', 'kei', 'hei', 'zei', 'zhei', 'shei', 'ei', 'bao', 'pao', 'mao',
'dao', 'tao', 'nao', 'lao', 'gao', 'kao', 'hao', 'zao', 'cao', 'sao',
'zhao', 'chao', 'shao', 'rao', 'ao', 'pou', 'mou', 'fou', 'dou', 'tou',
'nou', 'lou', 'gou', 'kou', 'hou', 'zou', 'cou', 'sou', 'zhou', 'chou',
'shou', 'rou', 'ou', 'ban', 'pan', 'man', 'fan', 'dan', 'tan', 'nan',
'lan', 'gan', 'kan', 'han', 'zan', 'can', 'san', 'zhan', 'chan',
'shan', 'ran', 'an', 'bang', 'pang', 'mang', 'fang', 'dang', 'tang',
'nang', 'lang', 'gang', 'kang', 'hang', 'zang', 'cang', 'sang',
'zhang', 'chang', 'shang', 'rang', 'ang', 'ben', 'pen', 'men', 'fen',
'den', 'nen', 'gen', 'ken', 'hen', 'zen', 'cen', 'sen', 'zhen', 'chen',
'shen', 'ren', 'en', 'beng', 'peng', 'meng', 'feng', 'deng', 'teng',
'neng', 'leng', 'geng', 'keng', 'heng', 'zeng', 'ceng', 'seng',
'zheng', 'cheng', 'sheng', 'reng', 'eng', 'dong', 'tong', 'nong',
'long', 'gong', 'kong', 'hong', 'zong', 'cong', 'song', 'zhong',
'chong', 'rong', 'bu', 'pu', 'mu', 'fu', 'du', 'tu', 'nu', 'lu',
'gu', 'ku', 'hu', 'zu', 'cu', 'su', 'zhu', 'chu', 'shu', 'ru', 'wu',
'gua', 'kua', 'hua', 'zhua', 'chua', 'shua', 'rua', 'wa', 'duo', 'tuo',
'nuo', 'luo', 'guo', 'kuo', 'huo', 'zuo', 'cuo', 'suo', 'zhuo', 'chuo',
'shuo', 'ruo', 'wo', 'guai', 'kuai', 'huai', 'zhuai', 'chuai', 'shuai',
'wai', 'dui', 'tui', 'gui', 'kui', 'hui', 'zui', 'cui', 'sui', 'zhui',
'chui', 'shui', 'rui', 'wei', 'duan', 'tuan', 'nuan', 'luan', 'guan',
'kuan', 'huan', 'zuan', 'cuan', 'suan', 'zhuan', 'chuan', 'shuan',
'ruan', 'wan', 'guang', 'kuang', 'huang', 'zhuang', 'chuang', 'shuang',
'wang', 'dun', 'tun', 'nun', 'lun', 'gun', 'kun', 'hun', 'zun', 'cun',
'sun', 'zhun', 'chun', 'shun', 'run', 'wen', 'weng', 'bi', 'pi', 'mi',
'di', 'ti', 'ni', 'li', 'zi', 'ci', 'si', 'zhi', 'chi', 'shi', 'ri',
'ji', 'qi', 'xi', 'yi', 'dia', 'lia', 'jia', 'qia', 'xia', 'ya', 'bie',
'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'jie', 'qie', 'xie', 'ye',
'biao', 'piao', 'miao', 'diao', 'tiao', 'niao', 'liao', 'jiao', 'qiao',
'xiao', 'yao', 'miu', 'diu', 'niu', 'liu', 'jiu', 'qiu', 'xiu', 'you',
'bian', 'pian', 'mian', 'dian', 'tian', 'nian', 'lian', 'jian', 'qian',
'xian', 'yan', 'niang', 'liang', 'jiang', 'qiang', 'xiang', 'yang',
'bin', 'pin', 'min', 'nin', 'lin', 'jin', 'qin', 'xin', 'yin', 'bing',
'ping', 'ming', 'ding', 'ting', 'ning', 'ling', 'jing', 'qing', 'xing',
'ying', 'jiong', 'qiong', 'xiong', 'yong', '', '', 'ju', 'qu',
'xu', 'yu', 'nüe', 'lüe', 'jue', 'que', 'xue', 'yue', 'juan', 'quan',
'xuan', 'yuan', 'jun', 'qun', 'xun', 'yun', 'er', 'r'
)
SYL = re.compile(pinyin.syllable)
A_SYL = re.compile(pinyin.a_syl)
N_SYL = re.compile(pinyin.n_syl)
WORD = re.compile(pinyin.word)
N_WORD = re.compile(pinyin.n_word)
A_WORD = re.compile(pinyin.a_word)
SENT = re.compile(pinyin.sentence)
N_SENT = re.compile(pinyin.n_sent)
A_SENT = re.compile(pinyin.a_sent)
VOWELS = 'aeiou\u00FC'
VOWEL_MAP = {
'a1': '\u0101', 'a2': '\xe1', 'a3': '\u01ce', 'a4': '\xe0', 'a5': 'a',
'e1': '\u0113', 'e2': '\xe9', 'e3': '\u011b', 'e4': '\xe8', 'e5': 'e',
'i1': '\u012b', 'i2': '\xed', 'i3': '\u01d0', 'i4': '\xec', 'i5': 'i',
'o1': '\u014d', 'o2': '\xf3', 'o3': '\u01d2', 'o4': '\xf2', 'o5': 'o',
'u1': '\u016b', 'u2': '\xfa', 'u3': '\u01d4', 'u4': '\xf9', 'u5': 'u',
'\u00fc1': '\u01d6', '\u00fc2': '\u01d8', '\u00fc3': '\u01da',
'\u00fc4': '\u01dc', '\u00fc5': '\u00fc'
}
def _num_vowel_to_acc(vowel, tone):
"""Convert a numbered vowel to an accented vowel."""
try:
return VOWEL_MAP[vowel + str(tone)]
except IndexError:
raise ValueError("Vowel must be one of '{}' and tone must be an int"
"1-5.".format(VOWELS))
def num_syl_to_acc(syllable):
"""Convert a numbered pinyin syllable to an accented pinyin syllable.
Implements the following algorithm:
1. If the syllable has an 'a' or 'e', put the tone over that vowel.
2. If the syllable has 'ou', place the tone over the 'o'.
3. Otherwise, put the tone on the last vowel.
"""
if syllable.startswith('r') and len(syllable) <= 2:
return 'r' # Special case for 'r' syllable.
if re.search('[{}]'.format(VOWELS), syllable) is None:
return syllable
syl, tone = syllable[:-1], syllable[-1]
if tone not in '12345':
# We did not find a tone number. Abort conversion.
return syl
syl = re.sub('u:|v', '\u00fc', syl)
if 'a' in syl:
return syl.replace('a', _num_vowel_to_acc('a', tone))
elif 'e' in syl:
return syl.replace('e', _num_vowel_to_acc('e', tone))
elif 'ou' in syl:
return syl.replace('o', _num_vowel_to_acc('o', tone))
last_vowel = syl[max(map(syl.rfind, VOWELS))] # Find last vowel index.
return syl.replace(last_vowel, _num_vowel_to_acc(last_vowel, tone))
class TestPinyinSyllables(unittest.TestCase):
maxDiff = None
def test_number_syllables(self):
vs = list(VALID_SYLS)
_vs = []
for n in range(0, len(vs)):
vs[n] = vs[n] + str(random.randint(1, 5))
_vs.append(vs[n])
if _vs[n][0] in 'aeo':
_vs[n] = "'{}".format(_vs[n])
s = ''.join(_vs)
self.assertEqual(SYL.findall(s), vs)
self.assertEqual(N_SYL.findall(s), vs)
def test_accent_syllables(self):
vs = list(VALID_SYLS)
_vs = []
for n in range(0, len(vs)):
syl = vs[n]
vs[n] = num_syl_to_acc(vs[n] + str(random.randint(1, 5)))
_vs.append(vs[n])
if syl[0] in 'aeo':
_vs[n] = "'{}".format(_vs[n])
s = ''.join(_vs)
self.assertEqual(SYL.findall(s), vs)
self.assertEqual(A_SYL.findall(s), vs)
def create_word(accented=False):
if accented:
tone = lambda: str(random.randint(1, 5))
vs = [num_syl_to_acc(s + tone()) for s in VALID_SYLS]
else:
vs = [s + str(random.randint(1, 5)) for s in VALID_SYLS]
word = vs[random.randint(0, len(vs) - 1)]
for n in range(1, WORD_LENGTH):
num = random.randint(0, len(vs) - 1)
word += ['-', ''][random.randint(0, 1)]
if VALID_SYLS[num][0] in 'aeo' and word[-1] != '-':
word += "'"
word += vs[num]
return word
class TestPinyinWords(unittest.TestCase):
def test_number_words(self):
for n in range(0, NUM_WORDS):
word = create_word()
self.assertEqual(WORD.match(word).group(0), word)
self.assertEqual(N_WORD.match(word).group(0), word)
def test_accent_words(self):
for n in range(0, NUM_WORDS):
word = create_word(accented=True)
self.assertEqual(WORD.match(word).group(0), word)
self.assertEqual(A_WORD.match(word).group(0), word)
def create_sentence(accented=False):
_sent = []
for n in range(0, SENT_LENGTH):
_sent.append(create_word(accented=accented))
sentence = [_sent.pop(0)]
sentence.extend([random.choice([' ', ', ', '; ']) + w for w in _sent])
return ''.join(sentence) + '.'
class TestPinyinSentences(unittest.TestCase):
def test_number_sentences(self):
for n in range(0, NUM_SENT):
sentence = create_sentence()
self.assertEqual(SENT.match(sentence).group(0), sentence)
self.assertEqual(N_SENT.match(sentence).group(0), sentence)
def test_accent_sentences(self):
for n in range(0, NUM_SENT):
sentence = create_sentence(accented=True)
self.assertEqual(SENT.match(sentence).group(0), sentence)
self.assertEqual(A_SENT.match(sentence).group(0), sentence)