You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/pypinyin/standard.py

155 lines
5.4 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
3 years ago
"""
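Handle the special spelling rules of the Hanyu Pinyin scheme.

The Hanyu Pinyin scheme:

* https://zh.wiktionary.org/wiki/%E9%99%84%E5%BD%95:%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E6%96%B9%E6%A1%88
* http://www.moe.edu.cn/s78/A19/yxs_left/moe_810/s230/195802/t19580201_186000.html

Rules from the scheme that this module deals with:

* The finals of the seven syllables zhi, chi, shi, ri, zi, ci and si are
  written with ``i``.
* The final ㄦ is written ``er``; when it is used as a syllable ending it is
  written ``r``. For example 儿童 is spelled ``ertong`` and 花儿 is spelled
  ``huar``.
* Finals of the ``i`` row that have no initial in front of them are written
  yi, ya, ye, yao, you, yan, yin, yang, ying, yong.
* Finals of the ``u`` row that have no initial in front of them are written
  wu, wa, wo, wai, wei, wan, wen, wang, weng.
* Finals of the ``ü`` row that have no initial in front of them are written
  yu, yue, yuan, yun; the two dots on ``ü`` are dropped.
* Finals of the ``ü`` row combined with the initials j, q, x are written
  ju, qu, xu, again without the two dots; combined with the initials n, l
  they are still written nü, lü.
* ``iou``, ``uei`` and ``uen`` are written ``iu``, ``ui`` and ``un`` when an
  initial precedes them, for example niu, gui, lun.
* When annotating Chinese characters, ``ng`` may be shortened to ``ŋ`` to
  keep the spelling short.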
"""
import re
from typing import Text
# Maps a plain or tone-marked "u" to the "ü" carrying the same tone mark.
UV_MAP = {
    'u': 'ü',
    'ū': 'ǖ',
    'ú': 'ǘ',
    'ǔ': 'ǚ',
    'ù': 'ǜ',
}
# Every spelling of "u" that may stand in for "ü".
U_TONES = set(UV_MAP)
# After the initials j, q and x the scheme writes "ü" without its dots
# (ju, qu, xu).  Groups: (initial)(the "u" to restore)(rest of the syllable).
UV_RE = re.compile(r'^(j|q|x)(' + '|'.join(UV_MAP) + r')(.*)$')
# Plain and tone-marked spellings of "i".
I_TONES = {'i', 'ī', 'í', 'ǐ', 'ì'}
# iu -> iou: the abbreviated final "iu" (plain or tone-marked) mapped back to
# its full form "iou".  Every key must be a non-empty spelling: an empty key
# would add an empty alternative to IU_RE and make it match any syllable.
IU_MAP = {
    'iu': 'iou',
    'iū': 'ioū',
    'iú': 'ioú',
    'iǔ': 'ioǔ',
    'iù': 'ioù',
}
IU_TONES = set(IU_MAP)
# Groups: (one or more ASCII lowercase letters)(abbreviated final).  The
# alternation is joined from the dict rather than the set so the compiled
# pattern does not depend on set iteration order.
IU_RE = re.compile(r'^([a-z]+)({tones})$'.format(tones='|'.join(IU_MAP)))
# ui -> uei: the abbreviated final "ui" (plain or tone-marked) mapped back to
# its full form "uei".  Every key must be a non-empty spelling: an empty key
# would add an empty alternative to UI_RE and make it match any syllable.
UI_MAP = {
    'ui': 'uei',
    'uī': 'ueī',
    'uí': 'ueí',
    'uǐ': 'ueǐ',
    'uì': 'ueì',
}
UI_TONES = set(UI_MAP)
# Groups: (one or more ASCII lowercase letters)(abbreviated final).  The
# alternation is joined from the dict rather than the set so the compiled
# pattern does not depend on set iteration order.
UI_RE = re.compile(r'([a-z]+)({tones})$'.format(tones='|'.join(UI_MAP)))
# un -> uen: the abbreviated final "un" mapped back to its full form "uen".
# The tone mark, if any, stays on the "u".
UN_MAP = {
    vowel + 'n': vowel + 'en'
    for vowel in ('u', 'ū', 'ú', 'ǔ', 'ù')
}
UN_TONES = set(UN_MAP)
# Groups: (one or more ASCII lowercase letters)(abbreviated final).
UN_RE = re.compile(r'([a-z]+)(' + '|'.join(UN_TONES) + r')$')
def convert_zero_consonant(pinyin: Text) -> Text:
    """Undo the zero-initial ``y``/``w`` spelling and restore the raw final.

    Syllables that have no initial are spelled with a leading ``y`` or ``w``.
    This function reverses that spelling:

    * ``y`` followed by a (possibly tone-marked) ``u``: the ``y`` is dropped
      and the ``u`` becomes the matching ``ü`` (``yue`` -> ``üe``).
    * ``y`` followed by a (possibly tone-marked) ``i``: the ``y`` is dropped
      (``yi`` -> ``i``).
    * any other ``y``: the ``y`` is replaced by ``i`` (``ya`` -> ``ia``).
    * ``w`` followed by a (possibly tone-marked) ``u``: the ``w`` is dropped
      (``wu`` -> ``u``).
    * any other ``w``: the ``w`` is replaced by ``u`` (``wa`` -> ``ua``).

    Strings that start with neither ``y`` nor ``w`` are returned unchanged.
    """
    if not pinyin.startswith(('y', 'w')):
        return pinyin

    rest = pinyin[1:]
    # None (never a member of the tone sets) when nothing follows the y/w.
    head = rest[:1] or None

    if pinyin[0] == 'y':
        if head in U_TONES:
            # yu -> ü, keeping whatever follows the vowel.
            return UV_MAP[head] + rest[1:]
        if head in I_TONES:
            # yi -> i
            return rest
        # y -> i
        return 'i' + rest

    # Leading "w".
    if head in U_TONES:
        # wu -> u
        return rest
    # w -> u
    return 'u' + rest
def convert_uv(pinyin: Text) -> Text:
    """Restore ``ü`` after the initials ``j``, ``q`` and ``x``.

    With these initials the scheme writes ``ü`` without its two dots
    (``ju``, ``qu``, ``xu``).  When ``pinyin`` matches ``UV_RE``, the ``u``
    right after the initial is replaced by its ``UV_MAP`` counterpart and the
    rest of the syllable is kept.  Any other string is returned unchanged.
    """
    def _restore(match):
        initial, vowel, tail = match.groups()
        return initial + UV_MAP[vowel] + tail

    return UV_RE.sub(_restore, pinyin)
def convert_iou(pinyin: Text) -> Text:
    """Expand the abbreviated final ``iu`` back to ``iou``.

    ``iou`` is written ``iu`` when an initial precedes it (for example
    ``niu``).  When ``pinyin`` matches ``IU_RE``, the abbreviated final is
    replaced by its ``IU_MAP`` entry and the leading letters are kept.  Any
    other string is returned unchanged.
    """
    def _expand(match):
        initial, final = match.group(1, 2)
        return initial + IU_MAP[final]

    return IU_RE.sub(_expand, pinyin)
def convert_uei(pinyin: Text) -> Text:
    """Expand the abbreviated final ``ui`` back to ``uei``.

    ``uei`` is written ``ui`` when an initial precedes it (for example
    ``gui``).  Where ``UI_RE`` matches in ``pinyin``, the abbreviated final is
    replaced by its ``UI_MAP`` entry and the letters before it are kept.  A
    string without a match is returned unchanged.
    """
    def _expand(match):
        initial, final = match.groups()
        return initial + UI_MAP[final]

    return UI_RE.sub(_expand, pinyin)
def convert_uen(pinyin: Text) -> Text:
    """Expand the abbreviated final ``un`` back to ``uen``.

    ``uen`` is written ``un`` when an initial precedes it (for example
    ``lun``).  Where ``UN_RE`` matches in ``pinyin``, the abbreviated final is
    replaced by its ``UN_MAP`` entry and the letters before it are kept.  A
    string without a match is returned unchanged.
    """
    def _expand(match):
        return '{}{}'.format(match.group(1), UN_MAP[match.group(2)])

    return UN_RE.sub(_expand, pinyin)
def convert_finals(pinyin: Text) -> Text:
    """Restore the original, unabbreviated final of a pinyin syllable.

    Applies the individual conversions one after another, each on the result
    of the previous one, and returns the final result.
    """
    pipeline = (
        # Finals of the i, u and ü rows written without an initial (y/w).
        convert_zero_consonant,
        # ü written as u after the initials j, q, x.
        convert_uv,
        # iou, uei, uen abbreviated to iu, ui, un after an initial.
        convert_iou,
        convert_uei,
        convert_uen,
    )
    for step in pipeline:
        pinyin = step(pinyin)
    return pinyin