|
|
import re
|
|
|
|
|
|
from typing import List
|
|
|
from typing import Text
|
|
|
|
|
|
from pypinyin import phonetic_symbol
|
|
|
from pypinyin.constants import RE_TONE2
|
|
|
from pypinyin.seg.simpleseg import simple_seg # noqa
|
|
|
|
|
|
# For backward compatibility; TODO: deprecate
|
|
|
|
|
|
|
|
|
def is_chinese_char(cp) -> bool:
    """Tell whether the code point ``cp`` falls inside a CJK ideograph range.

    "Chinese character" here means any code point in the CJK Unified
    Ideographs block, the extension ranges listed below, or the CJK
    compatibility ideograph ranges:
    https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

    Despite the name, these ranges do not cover all Japanese and Korean
    text: Hangul, Hiragana and Katakana live in separate blocks and are
    therefore reported as ``False``. Those scripts write space-separated
    words, so they need no special handling.

    Args:
        cp: Integer Unicode code point (e.g. the result of ``ord(ch)``).

    Returns:
        ``True`` if ``cp`` is inside any of the ranges, ``False`` otherwise.
    """
    # Inclusive (first, last) code point pairs, tested in this order.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )
    for first, last in cjk_ranges:
        if first <= cp <= last:
            return True
    return False
|
|
|
|
|
|
|
|
|
def _replace_tone2_style_dict_to_default(string: Text) -> Text:
    """Rewrite matches of the tone-2 pattern in ``string`` using the reverse symbol table.

    The input is first normalised: every 'ü' becomes 'v', and the digits
    '5' and '0' are removed. Each match of the relaxed ``RE_TONE2`` pattern
    is then looked up in ``phonetic_symbol.phonetic_symbol_reverse``; when
    the lookup yields a truthy value the match is replaced by it, otherwise
    the matched text is left unchanged.

    Args:
        string: Text to convert.

    Returns:
        The converted text.
    """
    normalized = string
    for old, new in (('ü', 'v'), ('5', ''), ('0', '')):
        normalized = normalized.replace(old, new)

    symbol_table = phonetic_symbol.phonetic_symbol_reverse

    # Recompile RE_TONE2 with every '$' stripped from its pattern text
    # (presumably so matches are not restricted to the end of the string).
    tone_pattern = re.compile(RE_TONE2.pattern.replace('$', ''))

    def _lookup(match):
        token = match.group(0)
        replacement = symbol_table.get(token)
        # Missing or falsy table entries fall back to the original text.
        return replacement if replacement else token

    return tone_pattern.sub(_lookup, normalized)
|
|
|
|
|
|
|
|
|
def _remove_dup_items(lst: List[Text]) -> List[Text]:
|
|
|
new_lst = []
|
|
|
for item in lst:
|
|
|
if item not in new_lst:
|
|
|
new_lst.append(item)
|
|
|
return new_lst
|