PaddleSpeech/third_party/python-pinyin/pypinyin/utils.py

import re

from typing import List
from typing import Text

from pypinyin import phonetic_symbol
from pypinyin.constants import RE_TONE2
from pypinyin.seg.simpleseg import simple_seg  # noqa

# Kept for backward compatibility. TODO: deprecate


def is_chinese_char(cp) -> bool:
    """Report whether codepoint ``cp`` lies in one of the CJK ranges below.

    A "chinese character" here means anything in the CJK Unicode block:
      https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

    Despite its name, the CJK Unicode block does NOT hold all Japanese and
    Korean characters. The modern Korean Hangul alphabet is a different
    block, as are Japanese Hiragana and Katakana. Those alphabets are used
    to write space-separated words, so they are not treated specially and
    are handled like all of the other languages.
    See also: https://www.cnblogs.com/jacen789/p/10825350.html

    Args:
        cp: The codepoint to classify; it is compared numerically against
            each range bound.

    Returns:
        True if ``cp`` is inside any of the ranges, False otherwise.
    """
    # Each pair is an inclusive (first, last) codepoint range.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    return any(first <= cp <= last for first, last in cjk_ranges)


def _replace_tone2_style_dict_to_default(string: Text) -> Text:
    """Rewrite ``string`` by mapping regex matches through the reverse table.

    The input is first normalised: every ``ü`` becomes ``v`` and the digits
    ``5`` and ``0`` are removed. Each match of the ``RE_TONE2`` pattern
    (with all ``$`` characters stripped from it) is then replaced by its
    entry in ``phonetic_symbol.phonetic_symbol_reverse``; a match with no
    entry, or with a falsy entry, is left as it was.

    NOTE(review): judging by the name, this converts TONE2-style pinyin to
    the default (tone-mark) style -- confirm against the lookup table.

    Args:
        string: Text to convert.

    Returns:
        The converted text.
    """
    # Presumably dropping '$' removes an end anchor so the pattern can match
    # anywhere in the string -- verify against RE_TONE2's definition.
    tone2_pattern = re.compile(RE_TONE2.pattern.replace('$', ''))
    reverse_table = phonetic_symbol.phonetic_symbol_reverse

    normalized = string.replace('ü', 'v')
    for digit in ('5', '0'):
        normalized = normalized.replace(digit, '')

    def _lookup(match):
        token = match.group(0)
        mapped = reverse_table.get(token)
        # Keep the matched text when the table has no usable replacement.
        return mapped if mapped else token

    return tone2_pattern.sub(_lookup, normalized)


def _remove_dup_items(lst: List[Text]) -> List[Text]:
    new_lst = []
    for item in lst:
        if item not in new_lst:
            new_lst.append(item)
    return new_lst
-												E2E/Streaming Transformer/Conformer ASR (#578)

* add cmvn and label smoothing loss layer

* add layer for transformer

* add glu and conformer conv

* add torch compatible hack, mask funcs

* not hack size since it exists

* add test; attention

* add attention, common utils, hack paddle

* add audio utils

* conformer batch padding mask bug fix #223

* fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2

* fix ci

* fix ci

* add encoder

* refactor egs

* add decoder

* refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils

* refactor docs

* add fix

* fix readme

* fix bugs, refactor collator, add pad_sequence, fix ckpt bugs

* fix docstring

* refactor data feed order

* add u2 model

* refactor cmvn, test

* add utils

* add u2 config

* fix bugs

* fix bugs

* fix autograd maybe has problem when using inplace operation

* refactor data, build vocab; add format data

* fix text featurizer

* refactor build vocab

* add fbank, refactor feature of speech

* refactor audio feat

* refactor data prepare

* refactor data

* model init from config

* add u2 bins

* flake8

* can train

* fix bugs, add coverage, add scripts

* test can run

* fix data

* speed perturb with sox

* add spec aug

* fix for train

* fix train logic

* fix logger

* log valid loss, time dataset process

* using np for speed perturb, remove some debug log of grad clip

* fix logger

* fix build vocab

* fix logger name

* using module logger as default

* fix

* fix install

* reorder imports

* fix board logger

* fix logger

* kaldi fbank and mfcc

* fix cmvn and print params

* fix add_eos_sos and cmvn

* fix cmvn compute

* fix logger and cmvn

* fix subsampling, label smoothing loss, remove useless

* add notebook test

* fix log

* fix tb logger

* multi gpu valid

* fix log

* fix log

* fix config

* fix compute cmvn, need paddle 2.1

* add cmvn notebook

* fix layer tools

* fix compute cmvn

* add rtf

* fix decoding

* fix layer tools

* fix log, add avg script

* more avg and test info

* fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh;

* add vimrc

* refactor tiny script, add transformer and stream conf

* spm demo; librispeech scripts and confs

* fix log

* add librispeech scripts

* refactor data pipe; fix conf; fix u2 default params

* fix bugs

* refactor aishell scripts

* fix test

* fix cmvn

* fix s0 scripts

* fix ds2 scripts and bugs

* fix dev & test dataset filter

* fix dataset filter

* filter dev

* fix ckpt path

* filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test

* add comment

* add syllable doc

* fix ds2 configs

* add doc

* add pypinyin tools

* fix decoder using blank_id=0

* mmseg with pybind11

* format code
											
										
										
											3 years ago
+								import re
 								from typing import List
 								from typing import Text
 								from pypinyin import phonetic_symbol
 								from pypinyin.constants import RE_TONE2
 								from pypinyin.seg.simpleseg import simple_seg  # noqa
 								# 用于向后兼容，TODO: 废弃
 								def is_chinese_char(cp) -> bool:
 								    """Checks whether CP is the codepoint of a CJK character."""
 								    # This defines a "chinese character" as anything in the CJK Unicode block:
 								    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
 								    #
 								    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
 								    # despite its name. The modern Korean Hangul alphabet is a different block,
 								    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
 								    # space-separated words, so they are not treated specially and handled
 								    # like the all of the other languages.
 								    # https://www.cnblogs.com/jacen789/p/10825350.html
 								    if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) or
 								        (cp >= 0x20000 and cp <= 0x2A6DF) or
 								        (cp >= 0x2A700 and cp <= 0x2B73F) or
 								        (cp >= 0x2B740 and cp <= 0x2B81F) or
 								        (cp >= 0x2B820 and cp <= 0x2CEAF) or (cp >= 0xF900 and cp <= 0xFAFF) or
 								        (cp >= 0x2F800 and cp <= 0x2FA1F)):
 								        return True  # yapf: disable
 								    return False
 								def _replace_tone2_style_dict_to_default(string: Text) -> Text:
 								    regex = re.compile(RE_TONE2.pattern.replace('$', ''))
 								    d = phonetic_symbol.phonetic_symbol_reverse
 								    string = string.replace('ü', 'v').replace('5', '').replace('0', '')
 								    def _replace(m):
 								        s = m.group(0)
 								        return d.get(s) or s
 								    return regex.sub(_replace, string)
 								def _remove_dup_items(lst: List[Text]) -> List[Text]:
 								    new_lst = []
 								    for item in lst:
 								        if item not in new_lst:
 								            new_lst.append(item)
 								    return new_lst