PaddleSpeech/third_party/python-pinyin/pypinyin/utils.py

import re

from typing import List
from typing import Text

from pypinyin import phonetic_symbol
from pypinyin.constants import RE_TONE2
from pypinyin.seg.simpleseg import simple_seg  # noqa

# Kept for backward compatibility (presumably the `simple_seg` re-export
# imported above). TODO: deprecate.


def is_chinese_char(cp) -> bool:
    """Report whether ``cp`` is the code point of a CJK ideograph.

    ``cp`` is a numeric Unicode code point (for example ``ord(ch)``), not a
    character. It counts as Chinese when it lies inside any of the
    inclusive ranges listed below.

    Despite the "CJK" name, these ranges do not contain every Japanese or
    Korean character: Hangul, Hiragana and Katakana live in other blocks and
    are therefore reported as ``False`` here.

    See https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    """
    # Inclusive (first, last) code point pairs.
    cjk_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
        (0x2B820, 0x2CEAF),  # CJK Unified Ideographs Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )
    return any(first <= cp <= last for first, last in cjk_ranges)


def _replace_tone2_style_dict_to_default(string: Text) -> Text:
    """Rewrite pattern matches in ``string`` via ``phonetic_symbol_reverse``.

    The input is first normalised: every 'ü' becomes 'v' and every '5' and
    '0' is removed. Each substring matched by the ``RE_TONE2`` pattern (with
    all '$' characters stripped from it) is then replaced by its entry in
    ``phonetic_symbol.phonetic_symbol_reverse``; a match with no entry, or
    with a falsy entry, is left as it was.

    NOTE(review): judging by the name, this converts tone-number ("tone2")
    spellings to the default tone-mark style -- confirm against
    ``pypinyin.constants`` and ``pypinyin.phonetic_symbol``.

    :param string: text to convert.
    :return: the converted text.
    """
    # Stripping '$' presumably lifts an end-of-string anchor so the pattern
    # can match anywhere inside the text -- TODO confirm against RE_TONE2.
    pattern = re.compile(RE_TONE2.pattern.replace('$', ''))
    reverse_table = phonetic_symbol.phonetic_symbol_reverse

    normalized = string.replace('ü', 'v')
    normalized = normalized.replace('5', '')
    normalized = normalized.replace('0', '')

    def _lookup(match):
        token = match.group(0)
        mapped = reverse_table.get(token)
        # Keep the matched text when the table has nothing usable for it.
        return mapped if mapped else token

    return pattern.sub(_lookup, normalized)


def _remove_dup_items(lst: List[Text]) -> List[Text]:
    new_lst = []
    for item in lst:
        if item not in new_lst:
            new_lst.append(item)
    return new_lst