You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/phkit/phkit/chinese/hanziconv.py

99 lines
3.6 KiB

# Copyright 2014 Bernard Yue
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
__doc__ = """
Hanzi Converter 繁簡轉換器 | 繁简转换器
This module provides functions converting chinese text between simplified and
traditional characters. It returns unicode represnetation of the text.
Class HanziConv is the main entry point of the module, you can import the
class by doing:
>>> from hanziconv import HanziConv
"""
import os
from zhon import cedict
class HanziConv():
"""This class supports hanzi (漢字) convention between simplified and
traditional format"""
__traditional_charmap = cedict.traditional
__simplified_charmap = cedict.simplified
@classmethod
def __convert(cls, text, toTraditional=True):
"""Convert `text` to Traditional characters if `toTraditional` is
True, else convert to simplified characters
:param text: data to convert
:param toTraditional: True -- convert to traditional text
False -- covert to simplified text
:returns: converted 'text`
"""
if isinstance(text, bytes):
text = text.decode('utf-8')
fromMap = cls.__simplified_charmap
toMap = cls.__traditional_charmap
if not toTraditional:
fromMap = cls.__traditional_charmap
toMap = cls.__simplified_charmap
final = []
for c in text:
index = fromMap.find(c)
if index != -1:
final.append(toMap[index])
else:
final.append(c)
return ''.join(final)
@classmethod
def toSimplified(cls, text):
"""Convert `text` to simplified character string. Assuming text is
traditional character string
:param text: text to convert
:returns: converted UTF-8 characters
>>> from hanziconv import HanziConv
>>> print(HanziConv.toSimplified('繁簡轉換器'))
繁简转换器
"""
return cls.__convert(text, toTraditional=False)
@classmethod
def toTraditional(cls, text):
"""Convert `text` to traditional character string. Assuming text is
simplified character string
:param text: text to convert
:returns: converted UTF-8 characters
>>> from hanziconv import HanziConv
>>> print(HanziConv.toTraditional('繁简转换器'))
繁簡轉換器
"""
return cls.__convert(text, toTraditional=True)
@classmethod
def same(cls, text1, text2):
"""Return True if text1 and text2 meant literally the same, False
otherwise
:param text1: string to compare to ``text2``
:param text2: string to compare to ``text1``
:returns: **True** -- ``text1`` and ``text2`` are the same in meaning,
**False** -- otherwise
>>> from hanziconv import HanziConv
>>> print(HanziConv.same('繁简转换器', '繁簡轉換器'))
True
"""
t1 = cls.toSimplified(text1)
t2 = cls.toSimplified(text2)
return t1 == t2