PaddleSpeech/third_party/zhon/tests/test-pinyin.py


"""Tests for the zhon.pinyin module."""

import random
import re
import unittest

from zhon import pinyin


NUM_WORDS = 50   # Number of random words to test
WORD_LENGTH = 4    # Length of random words (number of syllables)
NUM_SENT = 10     # Number of random sentences to test
SENT_LENGTH = 5   # Length of random sentences (number of words)

VALID_SYLS = (  # 411 total syllables, including 'r'
    'ba', 'pa', 'ma', 'fa', 'da', 'ta', 'na', 'la', 'ga', 'ka', 'ha', 'za',
    'ca', 'sa', 'zha', 'cha', 'sha', 'a', 'bo', 'po', 'mo', 'fo', 'yo', 'lo',
    'o', 'me', 'de', 'te', 'ne', 'le', 'ge', 'ke', 'he', 'ze', 'ce', 'se',
    'zhe', 'che', 'she', 're', 'e', 'bai', 'pai', 'mai', 'dai', 'tai',
    'nai', 'lai', 'gai', 'kai', 'hai', 'zai', 'cai', 'sai', 'zhai', 'chai',
    'shai', 'ai', 'bei', 'pei', 'mei', 'fei', 'dei', 'tei', 'nei', 'lei',
    'gei', 'kei', 'hei', 'zei', 'zhei', 'shei', 'ei', 'bao', 'pao', 'mao',
    'dao', 'tao', 'nao', 'lao', 'gao', 'kao', 'hao', 'zao', 'cao', 'sao',
    'zhao', 'chao', 'shao', 'rao', 'ao', 'pou', 'mou', 'fou', 'dou', 'tou',
    'nou', 'lou', 'gou', 'kou', 'hou', 'zou', 'cou', 'sou', 'zhou', 'chou',
    'shou', 'rou', 'ou', 'ban', 'pan', 'man', 'fan', 'dan', 'tan', 'nan',
    'lan', 'gan', 'kan', 'han', 'zan', 'can', 'san', 'zhan', 'chan',
    'shan', 'ran', 'an', 'bang', 'pang', 'mang', 'fang', 'dang', 'tang',
    'nang', 'lang', 'gang', 'kang', 'hang', 'zang', 'cang', 'sang',
    'zhang', 'chang', 'shang', 'rang', 'ang', 'ben', 'pen', 'men', 'fen',
    'den', 'nen', 'gen', 'ken', 'hen', 'zen', 'cen', 'sen', 'zhen', 'chen',
    'shen', 'ren', 'en', 'beng', 'peng', 'meng', 'feng', 'deng', 'teng',
    'neng', 'leng', 'geng', 'keng', 'heng', 'zeng', 'ceng', 'seng',
    'zheng', 'cheng', 'sheng', 'reng', 'eng', 'dong', 'tong', 'nong',
    'long', 'gong', 'kong', 'hong', 'zong', 'cong', 'song', 'zhong',
    'chong', 'rong', 'bu', 'pu', 'mu', 'fu', 'du', 'tu', 'nu', 'lu',
    'gu', 'ku', 'hu', 'zu', 'cu', 'su', 'zhu', 'chu', 'shu', 'ru', 'wu',
    'gua', 'kua', 'hua', 'zhua', 'chua', 'shua', 'rua', 'wa', 'duo', 'tuo',
    'nuo', 'luo', 'guo', 'kuo', 'huo', 'zuo', 'cuo', 'suo', 'zhuo', 'chuo',
    'shuo', 'ruo', 'wo', 'guai', 'kuai', 'huai', 'zhuai', 'chuai', 'shuai',
    'wai', 'dui', 'tui', 'gui', 'kui', 'hui', 'zui', 'cui', 'sui', 'zhui',
    'chui', 'shui', 'rui', 'wei', 'duan', 'tuan', 'nuan', 'luan', 'guan',
    'kuan', 'huan', 'zuan', 'cuan', 'suan', 'zhuan', 'chuan', 'shuan',
    'ruan', 'wan', 'guang', 'kuang', 'huang', 'zhuang', 'chuang', 'shuang',
    'wang', 'dun', 'tun', 'nun', 'lun', 'gun', 'kun', 'hun', 'zun', 'cun',
    'sun', 'zhun', 'chun', 'shun', 'run', 'wen', 'weng', 'bi', 'pi', 'mi',
    'di', 'ti', 'ni', 'li', 'zi', 'ci', 'si', 'zhi', 'chi', 'shi', 'ri',
    'ji', 'qi', 'xi', 'yi', 'dia', 'lia', 'jia', 'qia', 'xia', 'ya', 'bie',
    'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'jie', 'qie', 'xie', 'ye',
    'biao', 'piao', 'miao', 'diao', 'tiao', 'niao', 'liao', 'jiao', 'qiao',
    'xiao', 'yao', 'miu', 'diu', 'niu', 'liu', 'jiu', 'qiu', 'xiu', 'you',
    'bian', 'pian', 'mian', 'dian', 'tian', 'nian', 'lian', 'jian', 'qian',
    'xian', 'yan', 'niang', 'liang', 'jiang', 'qiang', 'xiang', 'yang',
    'bin', 'pin', 'min', 'nin', 'lin', 'jin', 'qin', 'xin', 'yin', 'bing',
    'ping', 'ming', 'ding', 'ting', 'ning', 'ling', 'jing', 'qing', 'xing',
    'ying', 'jiong', 'qiong', 'xiong', 'yong', 'nü', 'lü', 'ju', 'qu',
    'xu', 'yu', 'nüe', 'lüe', 'jue', 'que', 'xue', 'yue', 'juan', 'quan',
    'xuan', 'yuan', 'jun', 'qun', 'xun', 'yun', 'er', 'r'
)

SYL = re.compile(pinyin.syllable)
A_SYL = re.compile(pinyin.a_syl)
N_SYL = re.compile(pinyin.n_syl)
WORD = re.compile(pinyin.word)
N_WORD = re.compile(pinyin.n_word)
A_WORD = re.compile(pinyin.a_word)
SENT = re.compile(pinyin.sentence)
N_SENT = re.compile(pinyin.n_sent)
A_SENT = re.compile(pinyin.a_sent)


VOWELS = 'aeiou\u00FC'
VOWEL_MAP = {
    'a1': '\u0101', 'a2': '\xe1', 'a3': '\u01ce', 'a4': '\xe0', 'a5': 'a',
    'e1': '\u0113', 'e2': '\xe9', 'e3': '\u011b', 'e4': '\xe8', 'e5': 'e',
    'i1': '\u012b', 'i2': '\xed', 'i3': '\u01d0', 'i4': '\xec', 'i5': 'i',
    'o1': '\u014d', 'o2': '\xf3', 'o3': '\u01d2', 'o4': '\xf2', 'o5': 'o',
    'u1': '\u016b', 'u2': '\xfa', 'u3': '\u01d4', 'u4': '\xf9', 'u5': 'u',
    '\u00fc1': '\u01d6', '\u00fc2': '\u01d8', '\u00fc3': '\u01da',
    '\u00fc4': '\u01dc', '\u00fc5': '\u00fc'
}


def _num_vowel_to_acc(vowel, tone):
    """Convert a numbered vowel to an accented vowel."""
    try:
        return VOWEL_MAP[vowel + str(tone)]
    except IndexError:
        raise ValueError("Vowel must be one of '{}' and tone must be an int"
                         "1-5.".format(VOWELS))


def num_syl_to_acc(syllable):
    """Convert a numbered pinyin syllable to an accented pinyin syllable.

    Implements the following algorithm:
        1. If the syllable has an 'a' or 'e', put the tone over that vowel.
        2. If the syllable has 'ou', place the tone over the 'o'.
        3. Otherwise, put the tone on the last vowel.

    """
    if syllable.startswith('r') and len(syllable) <= 2:
        return 'r'  # Special case for 'r' syllable.
    if re.search('[{}]'.format(VOWELS), syllable) is None:
        return syllable
    syl, tone = syllable[:-1], syllable[-1]
    if tone not in '12345':
        # We did not find a tone number. Abort conversion.
        return syl
    syl = re.sub('u:|v', '\u00fc', syl)
    if 'a' in syl:
        return syl.replace('a', _num_vowel_to_acc('a', tone))
    elif 'e' in syl:
        return syl.replace('e', _num_vowel_to_acc('e', tone))
    elif 'ou' in syl:
        return syl.replace('o', _num_vowel_to_acc('o', tone))
    last_vowel = syl[max(map(syl.rfind, VOWELS))]  # Find last vowel index.
    return syl.replace(last_vowel, _num_vowel_to_acc(last_vowel, tone))


class TestPinyinSyllables(unittest.TestCase):

    maxDiff = None

    def test_number_syllables(self):
        vs = list(VALID_SYLS)
        _vs = []
        for n in range(0, len(vs)):
            vs[n] = vs[n] + str(random.randint(1, 5))
            _vs.append(vs[n])
            if _vs[n][0] in 'aeo':
                _vs[n] = "'{}".format(_vs[n])
        s = ''.join(_vs)
        self.assertEqual(SYL.findall(s), vs)
        self.assertEqual(N_SYL.findall(s), vs)

    def test_accent_syllables(self):
        vs = list(VALID_SYLS)
        _vs = []
        for n in range(0, len(vs)):
            syl = vs[n]
            vs[n] = num_syl_to_acc(vs[n] + str(random.randint(1, 5)))
            _vs.append(vs[n])
            if syl[0] in 'aeo':
                _vs[n] = "'{}".format(_vs[n])
        s = ''.join(_vs)
        self.assertEqual(SYL.findall(s), vs)
        self.assertEqual(A_SYL.findall(s), vs)


def create_word(accented=False):
    if accented:
        tone = lambda: str(random.randint(1, 5))
        vs = [num_syl_to_acc(s + tone()) for s in VALID_SYLS]
    else:
        vs = [s + str(random.randint(1, 5)) for s in VALID_SYLS]
    word = vs[random.randint(0, len(vs) - 1)]
    for n in range(1, WORD_LENGTH):
        num = random.randint(0, len(vs) - 1)
        word += ['-', ''][random.randint(0, 1)]
        if VALID_SYLS[num][0] in 'aeo' and word[-1] != '-':
            word += "'"
        word += vs[num]
    return word


class TestPinyinWords(unittest.TestCase):

    def test_number_words(self):
        for n in range(0, NUM_WORDS):
            word = create_word()
            self.assertEqual(WORD.match(word).group(0), word)
            self.assertEqual(N_WORD.match(word).group(0), word)

    def test_accent_words(self):
        for n in range(0, NUM_WORDS):
            word = create_word(accented=True)
            self.assertEqual(WORD.match(word).group(0), word)
            self.assertEqual(A_WORD.match(word).group(0), word)


def create_sentence(accented=False):
    _sent = []
    for n in range(0, SENT_LENGTH):
        _sent.append(create_word(accented=accented))
    sentence = [_sent.pop(0)]
    sentence.extend([random.choice([' ', ', ', '; ']) + w for w in _sent])
    return ''.join(sentence) + '.'


class TestPinyinSentences(unittest.TestCase):

    def test_number_sentences(self):
        for n in range(0, NUM_SENT):
            sentence = create_sentence()
            self.assertEqual(SENT.match(sentence).group(0), sentence)
            self.assertEqual(N_SENT.match(sentence).group(0), sentence)

    def test_accent_sentences(self):
        for n in range(0, NUM_SENT):
            sentence = create_sentence(accented=True)
            self.assertEqual(SENT.match(sentence).group(0), sentence)
            self.assertEqual(A_SENT.match(sentence).group(0), sentence)
chinese char/word ngram lm (#613) * add ngram lm egs * add zhon repo * install kenlm, zhon * format * add chinese_text_normalization repo * add ngram lm egs 4 years ago
			`"""Tests for the zhon.pinyin module."""`

			`import random`
			`import re`
			`import unittest`

			`from zhon import pinyin`


			`NUM_WORDS = 50 # Number of random words to test`
			`WORD_LENGTH = 4 # Length of random words (number of syllables)`
			`NUM_SENT = 10 # Number of random sentences to test`
			`SENT_LENGTH = 5 # Length of random sentences (number of words)`

			`VALID_SYLS = ( # 411 total syllables, including 'r'`
			`'ba', 'pa', 'ma', 'fa', 'da', 'ta', 'na', 'la', 'ga', 'ka', 'ha', 'za',`
			`'ca', 'sa', 'zha', 'cha', 'sha', 'a', 'bo', 'po', 'mo', 'fo', 'yo', 'lo',`
			`'o', 'me', 'de', 'te', 'ne', 'le', 'ge', 'ke', 'he', 'ze', 'ce', 'se',`
			`'zhe', 'che', 'she', 're', 'e', 'bai', 'pai', 'mai', 'dai', 'tai',`
			`'nai', 'lai', 'gai', 'kai', 'hai', 'zai', 'cai', 'sai', 'zhai', 'chai',`
			`'shai', 'ai', 'bei', 'pei', 'mei', 'fei', 'dei', 'tei', 'nei', 'lei',`
			`'gei', 'kei', 'hei', 'zei', 'zhei', 'shei', 'ei', 'bao', 'pao', 'mao',`
			`'dao', 'tao', 'nao', 'lao', 'gao', 'kao', 'hao', 'zao', 'cao', 'sao',`
			`'zhao', 'chao', 'shao', 'rao', 'ao', 'pou', 'mou', 'fou', 'dou', 'tou',`
			`'nou', 'lou', 'gou', 'kou', 'hou', 'zou', 'cou', 'sou', 'zhou', 'chou',`
			`'shou', 'rou', 'ou', 'ban', 'pan', 'man', 'fan', 'dan', 'tan', 'nan',`
			`'lan', 'gan', 'kan', 'han', 'zan', 'can', 'san', 'zhan', 'chan',`
			`'shan', 'ran', 'an', 'bang', 'pang', 'mang', 'fang', 'dang', 'tang',`
			`'nang', 'lang', 'gang', 'kang', 'hang', 'zang', 'cang', 'sang',`
			`'zhang', 'chang', 'shang', 'rang', 'ang', 'ben', 'pen', 'men', 'fen',`
			`'den', 'nen', 'gen', 'ken', 'hen', 'zen', 'cen', 'sen', 'zhen', 'chen',`
			`'shen', 'ren', 'en', 'beng', 'peng', 'meng', 'feng', 'deng', 'teng',`
			`'neng', 'leng', 'geng', 'keng', 'heng', 'zeng', 'ceng', 'seng',`
			`'zheng', 'cheng', 'sheng', 'reng', 'eng', 'dong', 'tong', 'nong',`
			`'long', 'gong', 'kong', 'hong', 'zong', 'cong', 'song', 'zhong',`
			`'chong', 'rong', 'bu', 'pu', 'mu', 'fu', 'du', 'tu', 'nu', 'lu',`
			`'gu', 'ku', 'hu', 'zu', 'cu', 'su', 'zhu', 'chu', 'shu', 'ru', 'wu',`
			`'gua', 'kua', 'hua', 'zhua', 'chua', 'shua', 'rua', 'wa', 'duo', 'tuo',`
			`'nuo', 'luo', 'guo', 'kuo', 'huo', 'zuo', 'cuo', 'suo', 'zhuo', 'chuo',`
			`'shuo', 'ruo', 'wo', 'guai', 'kuai', 'huai', 'zhuai', 'chuai', 'shuai',`
			`'wai', 'dui', 'tui', 'gui', 'kui', 'hui', 'zui', 'cui', 'sui', 'zhui',`
			`'chui', 'shui', 'rui', 'wei', 'duan', 'tuan', 'nuan', 'luan', 'guan',`
			`'kuan', 'huan', 'zuan', 'cuan', 'suan', 'zhuan', 'chuan', 'shuan',`
			`'ruan', 'wan', 'guang', 'kuang', 'huang', 'zhuang', 'chuang', 'shuang',`
			`'wang', 'dun', 'tun', 'nun', 'lun', 'gun', 'kun', 'hun', 'zun', 'cun',`
			`'sun', 'zhun', 'chun', 'shun', 'run', 'wen', 'weng', 'bi', 'pi', 'mi',`
			`'di', 'ti', 'ni', 'li', 'zi', 'ci', 'si', 'zhi', 'chi', 'shi', 'ri',`
			`'ji', 'qi', 'xi', 'yi', 'dia', 'lia', 'jia', 'qia', 'xia', 'ya', 'bie',`
			`'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'jie', 'qie', 'xie', 'ye',`
			`'biao', 'piao', 'miao', 'diao', 'tiao', 'niao', 'liao', 'jiao', 'qiao',`
			`'xiao', 'yao', 'miu', 'diu', 'niu', 'liu', 'jiu', 'qiu', 'xiu', 'you',`
			`'bian', 'pian', 'mian', 'dian', 'tian', 'nian', 'lian', 'jian', 'qian',`
			`'xian', 'yan', 'niang', 'liang', 'jiang', 'qiang', 'xiang', 'yang',`
			`'bin', 'pin', 'min', 'nin', 'lin', 'jin', 'qin', 'xin', 'yin', 'bing',`
			`'ping', 'ming', 'ding', 'ting', 'ning', 'ling', 'jing', 'qing', 'xing',`
			`'ying', 'jiong', 'qiong', 'xiong', 'yong', 'nü', 'lü', 'ju', 'qu',`
			`'xu', 'yu', 'nüe', 'lüe', 'jue', 'que', 'xue', 'yue', 'juan', 'quan',`
			`'xuan', 'yuan', 'jun', 'qun', 'xun', 'yun', 'er', 'r'`
			`)`

			`SYL = re.compile(pinyin.syllable)`
			`A_SYL = re.compile(pinyin.a_syl)`
			`N_SYL = re.compile(pinyin.n_syl)`
			`WORD = re.compile(pinyin.word)`
			`N_WORD = re.compile(pinyin.n_word)`
			`A_WORD = re.compile(pinyin.a_word)`
			`SENT = re.compile(pinyin.sentence)`
			`N_SENT = re.compile(pinyin.n_sent)`
			`A_SENT = re.compile(pinyin.a_sent)`


			`VOWELS = 'aeiou\u00FC'`
			`VOWEL_MAP = {`
			`'a1': '\u0101', 'a2': '\xe1', 'a3': '\u01ce', 'a4': '\xe0', 'a5': 'a',`
			`'e1': '\u0113', 'e2': '\xe9', 'e3': '\u011b', 'e4': '\xe8', 'e5': 'e',`
			`'i1': '\u012b', 'i2': '\xed', 'i3': '\u01d0', 'i4': '\xec', 'i5': 'i',`
			`'o1': '\u014d', 'o2': '\xf3', 'o3': '\u01d2', 'o4': '\xf2', 'o5': 'o',`
			`'u1': '\u016b', 'u2': '\xfa', 'u3': '\u01d4', 'u4': '\xf9', 'u5': 'u',`
			`'\u00fc1': '\u01d6', '\u00fc2': '\u01d8', '\u00fc3': '\u01da',`
			`'\u00fc4': '\u01dc', '\u00fc5': '\u00fc'`
			`}`


			`def _num_vowel_to_acc(vowel, tone):`
			`"""Convert a numbered vowel to an accented vowel."""`
			`try:`
			`return VOWEL_MAP[vowel + str(tone)]`
			`except IndexError:`
			`raise ValueError("Vowel must be one of '{}' and tone must be an int"`
			`"1-5.".format(VOWELS))`


			`def num_syl_to_acc(syllable):`
			`"""Convert a numbered pinyin syllable to an accented pinyin syllable.`

			`Implements the following algorithm:`
			`1. If the syllable has an 'a' or 'e', put the tone over that vowel.`
			`2. If the syllable has 'ou', place the tone over the 'o'.`
			`3. Otherwise, put the tone on the last vowel.`

			`"""`
			`if syllable.startswith('r') and len(syllable) <= 2:`
			`return 'r' # Special case for 'r' syllable.`
			`if re.search('[{}]'.format(VOWELS), syllable) is None:`
			`return syllable`
			`syl, tone = syllable[:-1], syllable[-1]`
			`if tone not in '12345':`
			`# We did not find a tone number. Abort conversion.`
			`return syl`
			`syl = re.sub('u:\|v', '\u00fc', syl)`
			`if 'a' in syl:`
			`return syl.replace('a', _num_vowel_to_acc('a', tone))`
			`elif 'e' in syl:`
			`return syl.replace('e', _num_vowel_to_acc('e', tone))`
			`elif 'ou' in syl:`
			`return syl.replace('o', _num_vowel_to_acc('o', tone))`
			`last_vowel = syl[max(map(syl.rfind, VOWELS))] # Find last vowel index.`
			`return syl.replace(last_vowel, _num_vowel_to_acc(last_vowel, tone))`


			`class TestPinyinSyllables(unittest.TestCase):`

			`maxDiff = None`

			`def test_number_syllables(self):`
			`vs = list(VALID_SYLS)`
			`_vs = []`
			`for n in range(0, len(vs)):`
			`vs[n] = vs[n] + str(random.randint(1, 5))`
			`_vs.append(vs[n])`
			`if _vs[n][0] in 'aeo':`
			`_vs[n] = "'{}".format(_vs[n])`
			`s = ''.join(_vs)`
			`self.assertEqual(SYL.findall(s), vs)`
			`self.assertEqual(N_SYL.findall(s), vs)`

			`def test_accent_syllables(self):`
			`vs = list(VALID_SYLS)`
			`_vs = []`
			`for n in range(0, len(vs)):`
			`syl = vs[n]`
			`vs[n] = num_syl_to_acc(vs[n] + str(random.randint(1, 5)))`
			`_vs.append(vs[n])`
			`if syl[0] in 'aeo':`
			`_vs[n] = "'{}".format(_vs[n])`
			`s = ''.join(_vs)`
			`self.assertEqual(SYL.findall(s), vs)`
			`self.assertEqual(A_SYL.findall(s), vs)`


			`def create_word(accented=False):`
			`if accented:`
			`tone = lambda: str(random.randint(1, 5))`
			`vs = [num_syl_to_acc(s + tone()) for s in VALID_SYLS]`
			`else:`
			`vs = [s + str(random.randint(1, 5)) for s in VALID_SYLS]`
			`word = vs[random.randint(0, len(vs) - 1)]`
			`for n in range(1, WORD_LENGTH):`
			`num = random.randint(0, len(vs) - 1)`
			`word += ['-', ''][random.randint(0, 1)]`
			`if VALID_SYLS[num][0] in 'aeo' and word[-1] != '-':`
			`word += "'"`
			`word += vs[num]`
			`return word`


			`class TestPinyinWords(unittest.TestCase):`

			`def test_number_words(self):`
			`for n in range(0, NUM_WORDS):`
			`word = create_word()`
			`self.assertEqual(WORD.match(word).group(0), word)`
			`self.assertEqual(N_WORD.match(word).group(0), word)`

			`def test_accent_words(self):`
			`for n in range(0, NUM_WORDS):`
			`word = create_word(accented=True)`
			`self.assertEqual(WORD.match(word).group(0), word)`
			`self.assertEqual(A_WORD.match(word).group(0), word)`


			`def create_sentence(accented=False):`
			`_sent = []`
			`for n in range(0, SENT_LENGTH):`
			`_sent.append(create_word(accented=accented))`
			`sentence = [_sent.pop(0)]`
			`sentence.extend([random.choice([' ', ', ', '; ']) + w for w in _sent])`
			`return ''.join(sentence) + '.'`


			`class TestPinyinSentences(unittest.TestCase):`

			`def test_number_sentences(self):`
			`for n in range(0, NUM_SENT):`
			`sentence = create_sentence()`
			`self.assertEqual(SENT.match(sentence).group(0), sentence)`
			`self.assertEqual(N_SENT.match(sentence).group(0), sentence)`

			`def test_accent_sentences(self):`
			`for n in range(0, NUM_SENT):`
			`sentence = create_sentence(accented=True)`
			`self.assertEqual(SENT.match(sentence).group(0), sentence)`
			`self.assertEqual(A_SENT.match(sentence).group(0), sentence)`