You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/python-pinyin/pypinyin/converter.py

460 lines
18 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
3 years ago
from copy import deepcopy
from typing import Any
from typing import Callable
from typing import List
from typing import Optional
from typing import Text
from typing import Union
from pypinyin.constants import PHRASES_DICT
from pypinyin.constants import PINYIN_DICT
from pypinyin.constants import RE_HANS
from pypinyin.constants import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.contrib.uv import V2UMixin
from pypinyin.style import auto_discover
from pypinyin.style import convert as convert_style
from pypinyin.utils import _remove_dup_items
TStyle = Style
TErrors = Union[Callable[[Text], Text], Text]
TPinyinResult = List[List[Text]]
TErrorResult = Union[Text, List[Text], None]
TNoPinyinResult = Union[TPinyinResult, List[Text], Text, None]
auto_discover()
class Converter():
def convert(self,
words: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool=...,
**kwargs: Any) -> TPinyinResult:
# TODO: use ``abc`` module
raise NotImplementedError # pragma: no cover
class DefaultConverter(Converter):
def __init__(self, **kwargs: Any) -> None:
pass
def post_pinyin(self,
han: Text,
heteronym: bool,
pinyin: TPinyinResult,
**kwargs: Any) -> Union[TPinyinResult, None]:
"""找到汉字对应的拼音后,会调用 ``post_pinyin`` 方法。
如果返回值不为 ``None`` 会使用返回的结果作为 han 的拼音数据
:param han: 单个汉字或者词语
:param heteronym: 是否需要处理多音字
:param pinyin: 单个汉字的拼音数据或词语的拼音数据 list
:type pinyin: list
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: ``None`` 或代替 ``pinyin`` 作为 han 的拼音 list
"""
pass
def _single_pinyin(self,
han: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool) -> TPinyinResult:
"""单字拼音转换.
:param han: 单个汉字
:param errors: 指定如何处理没有拼音的字符详情请参考
:py:func:`~pypinyin.pinyin`
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:return: 返回拼音列表多音字会有多个拼音项
:rtype: list
"""
num = ord(han)
# 处理没有拼音的字符
if num not in PINYIN_DICT:
return self.handle_nopinyin(
han,
style=style,
errors=errors,
heteronym=heteronym,
strict=strict)
pys = PINYIN_DICT[num].split(',') # 字的拼音列表
post_data = self.post_pinyin(han, heteronym, [pys])
if post_data is not None:
pys = post_data[0]
if not heteronym:
orig_pinyin = pys[0]
return [[
self.convert_style(
han, orig_pinyin, style=style, strict=strict)
]]
# 输出多音字的多个读音
# 临时存储已存在的拼音,避免多音字拼音转换为非声调风格出现重复。
# TODO: change to use set
# TODO: add test for cache
py_cached = {}
pinyins = []
for orig_pinyin in pys:
py = self.convert_style(
han, orig_pinyin, style=style, strict=strict)
if py in py_cached:
continue
py_cached[py] = py
pinyins.append(py)
return [pinyins]
def _phrase_pinyin(self,
phrase: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool) -> TPinyinResult:
"""词语拼音转换.
:param phrase: 词语
:param errors: 指定如何处理没有拼音的字符
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:return: 拼音列表
:rtype: list
"""
py = []
if phrase in PHRASES_DICT:
py = deepcopy(PHRASES_DICT[phrase])
post_data = self.post_pinyin(phrase, heteronym, py)
if post_data is not None:
py = post_data
for idx, item in enumerate(py):
han = phrase[idx]
if heteronym:
py[idx] = _remove_dup_items([
self.convert_style(
han, orig_pinyin=x, style=style, strict=strict)
for x in item
])
else:
orig_pinyin = item[0]
py[idx] = [
self.convert_style(
han,
orig_pinyin=orig_pinyin,
style=style,
strict=strict)
]
else:
for i in phrase:
single = self._single_pinyin(
i,
style=style,
heteronym=heteronym,
errors=errors,
strict=strict)
if single:
py.extend(single)
return py
def convert(self,
words: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool=...,
**kwargs: Any) -> TPinyinResult:
"""根据参数把汉字转成相应风格的拼音结果。
:param words: 汉字字符串
:type words: unicode
:param style: 拼音风格
:param heteronym: 是否启用多音字
:type heteronym: bool
:param errors: 如果处理没有拼音的字符
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:type strict: bool
:return: 按风格转换后的拼音结果
:rtype: list
"""
pys = []
# 初步过滤没有拼音的字符
if RE_HANS.match(words):
pys = self._phrase_pinyin(
words,
style=style,
heteronym=heteronym,
errors=errors,
strict=strict)
return pys
py = self.handle_nopinyin(
words,
style=style,
errors=errors,
heteronym=heteronym,
strict=strict)
if py:
pys.extend(py)
return pys
def pre_convert_style(self,
han: Text,
orig_pinyin: Text,
style: TStyle,
strict: bool,
**kwargs: Any) -> Optional[Text]:
"""在把原始带声调的拼音按拼音风格转换前会调用 ``pre_convert_style`` 方法。
如果返回值不为 ``None`` 会使用返回的结果代替 ``orig_pinyin``
来进行后面的风格转换
:param han: 要处理的汉字
:param orig_pinyin: 汉字对应的原始带声调拼音
:param style: 要转换的拼音风格
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: ``None`` 或代替 ``orig_pinyin`` 参与拼音风格转换的拼音字符串
"""
pass
def post_convert_style(self,
han: Text,
orig_pinyin: Text,
converted_pinyin: Text,
style: TStyle,
strict: bool,
**kwargs: Any) -> Optional[Text]:
"""在把原始带声调的拼音按拼音风格转换前会调用 ``pre_convert_style`` 方法。
如果返回值不为 ``None`` 会使用返回的结果代替 ``converted_pinyin``
作为拼音风格转换后的最终拼音结果
:param han: 要处理的汉字
:param orig_pinyin: 汉字对应的原始带声调拼音
:param converted_pinyin: 按拼音风格转换处理后的拼音
:param style: 要转换的拼音风格
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: ``None`` 或代替 ``converted_pinyin`` 作为拼音风格转换后的拼音结果
"""
pass
def _convert_style(self,
han: Text,
pinyin: Text,
style: TStyle,
strict: bool,
default: Text,
**kwargs: Any) -> Text:
return convert_style(pinyin, style, strict, default=default, **kwargs)
def convert_style(self,
han: Text,
orig_pinyin: Text,
style: TStyle,
strict: bool,
**kwargs: Any) -> Text:
"""按 ``style`` 的值对 ``orig_pinyin`` 进行处理,返回处理后的拼音
转换风格前会调用 ``pre_convert_style`` 方法
转换后会调用 ``post_convert_style`` 方法
:param han: 要处理的汉字
:param orig_pinyin: 汉字对应的原始带声调拼音
:param style: 拼音风格
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: 按拼音风格转换处理后的拼音
"""
pre_data = self.pre_convert_style(
han, orig_pinyin, style=style, strict=strict)
if pre_data is not None:
pinyin = pre_data
else:
pinyin = orig_pinyin
converted_pinyin = self._convert_style(
han, pinyin, style=style, strict=strict, default=pinyin)
post_data = self.post_convert_style(
han, pinyin, converted_pinyin, style=style, strict=strict)
if post_data is None:
post_data = converted_pinyin
return post_data
def pre_handle_nopinyin(self,
chars: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool) -> TNoPinyinResult:
"""处理没有拼音的字符串前会调用 ``pre_handle_nopinyin`` 方法。
如果返回值不为 ``None`` 会使用返回的结果作为处理没有拼音字符串的结果
不再使用内置方法进行处理
:param chars: 待处理的没有拼音的字符串
:param errors: 如何处理
:param heteronym: 是否需要处理多音字
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: ``None`` 或代替 ``chars`` 参与拼音风格转换的拼音字符串
或拼音结果 list
"""
pass
def post_handle_nopinyin(self,
chars: Text,
style: Style,
heteronym: bool,
errors: TErrors,
strict: bool,
pinyin: TNoPinyinResult,
**kwargs: Any) -> TNoPinyinResult:
"""处理完没有拼音的字符串后会调用 ``post_handle_nopinyin`` 方法。
如果返回值不为 ``None`` 会使用返回的结果作为处理没有拼音的字符串的结果
:param chars: 待处理的没有拼音的字符串
:param errors: 如何处理
:param heteronym: 是否需要处理多音字
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:param pinyin: 处理后的拼音信息值为空 list 或包含拼音信息的 list
:param kwargs: 其他关键字参数暂时无用用于以后扩展新的参数
:return: ``None`` 或代替 ``pinyin`` 做为处理结果
"""
pass
def _convert_nopinyin_chars(self,
chars: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool) -> TNoPinyinResult:
"""转换没有拼音的字符。
"""
if callable(errors):
return errors(chars)
if errors == 'default':
return chars
elif errors == 'ignore':
return None
elif errors == 'replace':
if len(chars) > 1:
return ''.join(str('%x' % ord(x)) for x in chars)
else:
return str('%x' % ord(chars))
def handle_nopinyin(self,
chars: Text,
style: TStyle,
heteronym: bool,
errors: TErrors,
strict: bool,
**kwargs: Any) -> TPinyinResult:
"""处理没有拼音的字符串。
处理前会调用 ``pre_handle_nopinyin`` 方法
处理后会调用 ``post_handle_nopinyin`` 方法
:param chars: 待处理的没有拼音的字符串
:param style: 拼音风格
:param errors: 如何处理
:param heteronym: 是否需要处理多音字
:param strict: 只获取声母或只获取韵母相关拼音风格的返回结果
是否严格遵照汉语拼音方案来处理声母和韵母
详见 :ref:`strict`
:return: 处理后的拼音结果如果为 ``None`` 或空 list 表示忽略这个字符串.
:rtype: list
"""
pre_data = self.pre_handle_nopinyin(
chars, style, errors=errors, heteronym=heteronym, strict=strict)
if pre_data is not None:
py = pre_data
else:
pre_data = chars
py = self._convert_nopinyin_chars(
pre_data,
style,
errors=errors,
heteronym=heteronym,
strict=strict)
post_data = self.post_handle_nopinyin(
chars,
style,
errors=errors,
heteronym=heteronym,
strict=strict,
pinyin=py)
if post_data is not None:
py = post_data
if not py:
return []
if isinstance(py, list):
# 包含多音字信息
if isinstance(py[0], list):
if heteronym:
return py
# [[a, b], [c, d]]
# [[a], [c]]
return [[x[0]] for x in py]
return [[i] for i in py]
else:
return [[py]]
class _v2UConverter(V2UMixin, DefaultConverter):
pass
class _neutralToneWith5Converter(NeutralToneWith5Mixin, DefaultConverter):
pass
class _neutralToneWith5AndV2UConverter(NeutralToneWith5Mixin, V2UMixin,
DefaultConverter):
pass
class _mixConverter(DefaultConverter):
def __init__(self, v_to_u=False, neutral_tone_with_five=False, **kwargs):
super().__init__(**kwargs)
self._v_to_u = v_to_u
self._neutral_tone_with_five = neutral_tone_with_five
self._v2uconverter = _v2UConverter()
self._neutraltonewith5converter = _neutralToneWith5Converter()
self._neutraltonewith5andv2uconverter = \
_neutralToneWith5AndV2UConverter()
def post_convert_style(self, han, orig_pinyin, converted_pinyin, style,
strict, **kwargs):
if self._v_to_u and not self._neutral_tone_with_five:
return self._v2uconverter.post_convert_style(
han, orig_pinyin, converted_pinyin, style, strict, **kwargs)
if self._neutral_tone_with_five and not self._v_to_u:
return self._neutraltonewith5converter.post_convert_style(
han, orig_pinyin, converted_pinyin, style, strict, **kwargs)
if self._neutral_tone_with_five and self._v_to_u:
return self._neutraltonewith5andv2uconverter.post_convert_style(
han, orig_pinyin, converted_pinyin, style, strict, **kwargs)
return super().post_convert_style(han, orig_pinyin, converted_pinyin,
style, strict, **kwargs)