You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/pypinyin/runner.py

151 lines
4.1 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
3 years ago
#!/usr/bin/env python3
from typing import Union, Text, ByteString
import logging
import sys
from argparse import ArgumentParser
import pypinyin
# Lookup table from every value accepted by the ``--style`` option to the
# matching ``pypinyin.Style`` constant.  The first ten styles are reachable
# under two keys: the upper-case constant name and a short lower-case alias
# (the aliases look like sketches of the output for the syllable "zhao" --
# presumably mnemonic; confirm against the pypinyin style documentation).
style_map = {
    # name / alias pairs
    'NORMAL': pypinyin.Style.NORMAL,
    'zhao': pypinyin.Style.NORMAL,

    'TONE': pypinyin.Style.TONE,
    'zh4ao': pypinyin.Style.TONE,

    'TONE2': pypinyin.Style.TONE2,
    'zha4o': pypinyin.Style.TONE2,

    'TONE3': pypinyin.Style.TONE3,
    'zhao4': pypinyin.Style.TONE3,

    'INITIALS': pypinyin.Style.INITIALS,
    'zh': pypinyin.Style.INITIALS,

    'FIRST_LETTER': pypinyin.Style.FIRST_LETTER,
    'z': pypinyin.Style.FIRST_LETTER,

    'FINALS': pypinyin.Style.FINALS,
    'ao': pypinyin.Style.FINALS,

    'FINALS_TONE': pypinyin.Style.FINALS_TONE,
    '4ao': pypinyin.Style.FINALS_TONE,

    'FINALS_TONE2': pypinyin.Style.FINALS_TONE2,
    'a4o': pypinyin.Style.FINALS_TONE2,

    'FINALS_TONE3': pypinyin.Style.FINALS_TONE3,
    'ao4': pypinyin.Style.FINALS_TONE3,

    # styles that have no alias
    'BOPOMOFO': pypinyin.Style.BOPOMOFO,
    'BOPOMOFO_FIRST': pypinyin.Style.BOPOMOFO_FIRST,
    'CYRILLIC': pypinyin.Style.CYRILLIC,
    'CYRILLIC_FIRST': pypinyin.Style.CYRILLIC_FIRST,
}

# Conversion functions of the pypinyin package, keyed by function name.
func_map = dict(
    pinyin=pypinyin.pinyin,
    slug=pypinyin.slug,
)

# Key of ``style_map`` used when no ``--style`` option is given.
default_style = 'zh4ao'
class NullWriter:
    """Write-only stream that discards everything, similar to ``/dev/null``.

    Intended as a temporary stand-in for ``sys.stdout`` / ``sys.stderr`` so
    that code which prints while it runs produces no visible output.
    """

    # The byte types are spelled out because ``typing.ByteString`` is
    # deprecated; together they cover what that alias used to mean.
    def write(self, string: Union[str, bytes, bytearray, memoryview]) -> None:
        """Accept ``string`` and drop it without storing or forwarding it."""

    def flush(self) -> None:
        """Do nothing; no data is ever buffered.

        Provided so that callers which treat this object as a regular
        stream, e.g. ``print(..., flush=True)`` or ``sys.stdout.flush()``,
        do not fail with ``AttributeError``.
        """
def get_parser() -> ArgumentParser:
    """Build the command-line parser of the pinyin conversion tool.

    Returns:
        An ``ArgumentParser`` that understands the options ``--version``,
        ``--func``, ``--style``, ``--separator``, ``--errors`` and
        ``--heteronym`` and one required positional argument ``hans``
        (the Chinese text to convert).
    """
    parser = ArgumentParser(description='convert chinese to pinyin.')
    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version='{0} {1}'.format(pypinyin.__title__, pypinyin.__version__))
    # Name of the conversion function to run.  The choices come from
    # ``func_map`` so the option cannot drift away from the dispatch table.
    parser.add_argument(
        '-f',
        '--func',
        help='function name (default: "pinyin")',
        choices=list(func_map),
        default='pinyin')
    # Pinyin output style; every key of ``style_map`` is a valid value.
    parser.add_argument(
        '-s',
        '--style',
        help='pinyin style (default: "{0}")'.format(default_style),
        choices=list(style_map),
        default=default_style)
    parser.add_argument(
        '-p', '--separator', help='slug separator (default: "-")', default='-')
    parser.add_argument(
        '-e',
        '--errors',
        help=('how to handle non-pinyin string'
              ' (default: "default")'),
        choices=['default', 'ignore', 'replace'],
        default='default')
    # Also output the alternative readings of heteronyms.
    parser.add_argument(
        '-m', '--heteronym', help='enable heteronym', action='store_true')
    # The Chinese characters to convert.
    parser.add_argument('hans', help='chinese string')
    return parser
def main():
    """Run the command-line interface: parse arguments, convert, print.

    When standard input is not a terminal, its stripped content (if any) is
    appended to the command-line arguments, so piped text fills the ``hans``
    positional argument.  The selected conversion function is then called
    with the requested style and options, and the result is printed to
    standard output: nested list/tuple results are printed as
    space-separated groups whose items are joined by commas, an empty
    result as an empty line, anything else unchanged.
    """
    # Suppress all logging calls of severity CRITICAL and below.
    logging.disable(logging.CRITICAL)

    # Read the text to convert from stdin when data is piped in.
    # ``sys.stdin`` can be None (no console attached), hence the guard.
    stdin = sys.stdin
    if stdin is not None and not stdin.isatty():
        pipe_data = stdin.read().strip()
    else:
        pipe_data = ''
    args = sys.argv[1:]
    if pipe_data:
        args.append(pipe_data)

    # Parse command-line options and arguments.
    options = get_parser().parse_args(args)
    func = func_map[options.func]
    style = style_map[options.style]

    # Both functions take ``heteronym`` and ``errors``; only ``slug``
    # additionally takes ``separator``.
    kwargs = {'heteronym': options.heteronym, 'errors': options.errors}
    if options.func == 'slug':
        kwargs['separator'] = options.separator

    # Replace stdout and stderr while converting so that nothing pollutes
    # the command's output (mainly to get rid of print statements inside
    # jieba).  The previous streams are restored in ``finally`` so that an
    # exception raised by the conversion is still reported, and a caller's
    # own redirection of the streams is not lost.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    sys.stdout = sys.stderr = NullWriter()
    try:
        result = func(options.hans, style=style, **kwargs)
    finally:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr

    if not result:
        print('')
    elif (isinstance(result, (list, tuple)) and
          isinstance(result[0], (list, tuple))):
        print(' '.join(','.join(group) for group in result))
    else:
        print(result)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()