diff --git a/examples/text_normalization/local/test_normalization.py b/examples/text_normalization/local/test_normalization.py index 38a38460..bcf7ee0d 100644 --- a/examples/text_normalization/local/test_normalization.py +++ b/examples/text_normalization/local/test_normalization.py @@ -1,7 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + from text_processing import normalization -parser = argparse.ArgumentParser(description="Normalize text in Chinese with some rules.") +parser = argparse.ArgumentParser( + description="Normalize text in Chinese with some rules.") parser.add_argument("input", type=str, help="the input sentences") parser.add_argument("output", type=str, help="path to save the output file.") args = parser.parse_args() diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh index c8b1f1c2..7cec3a24 100644 --- a/examples/text_normalization/path.sh +++ b/examples/text_normalization/path.sh @@ -1,5 +1,4 @@ export MAIN_ROOT=${PWD}/../../ - export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/text_normalization/run.sh b/examples/text_normalization/run.sh index b39de2a2..c4043a31 100755 --- a/examples/text_normalization/run.sh +++ b/examples/text_normalization/run.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - source path.sh stage=-1 diff --git a/third_party/text_processing/__ini__.py b/third_party/text_processing/__ini__.py new file mode 100644 index 00000000..8d1c8b69 --- /dev/null +++ b/third_party/text_processing/__ini__.py @@ -0,0 +1 @@ + diff --git a/third_party/text_processing/normalization/char_convert.py b/third_party/text_processing/normalization/char_convert.py index 1c035a80..bd328f69 100644 --- a/third_party/text_processing/normalization/char_convert.py +++ b/third_party/text_processing/normalization/char_convert.py @@ -2,6 +2,7 @@ `opencc `_. """ + import opencc _t2s_converter = opencc.OpenCC("t2s.json") @@ -11,4 +12,4 @@ def tranditional_to_simplified(text: str) -> str: return _t2s_converter.convert(text) def simplified_to_traditional(text: str) -> str: - return _s2t_converter.convert(text) \ No newline at end of file + return _s2t_converter.convert(text) diff --git a/third_party/text_processing/normalization/chronology.py b/third_party/text_processing/normalization/chronology.py index 727bbd65..7143eb58 100644 --- a/third_party/text_processing/normalization/chronology.py +++ b/third_party/text_processing/normalization/chronology.py @@ -1,6 +1,7 @@ import re from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS + def _time_num2str(num_string: str) -> str: """A special case for verbalizing number in time.""" result = num2str(num_string.lstrip('0')) @@ -60,4 +61,4 @@ def replace_date2(match: re.Match) -> str: result += f"{verbalize_cardinal(month)}月" if day: result += f"{verbalize_cardinal(day)}日" - return result \ No newline at end of file + return result diff --git a/third_party/text_processing/normalization/constants.py b/third_party/text_processing/normalization/constants.py index bbfccb67..d5c04a76 100644 --- a/third_party/text_processing/normalization/constants.py +++ b/third_party/text_processing/normalization/constants.py @@ -2,6 +2,7 @@ import string import re from pypinyin.constants import SUPPORT_UCS4 + # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) F2H_ASCII_LETTERS = { diff --git a/third_party/text_processing/normalization/num.py b/third_party/text_processing/normalization/num.py index 9b8b0ab3..60fc1686 100644 --- a/third_party/text_processing/normalization/num.py +++ b/third_party/text_processing/normalization/num.py @@ -2,6 +2,7 @@ Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文 """ + import re from typing import List from collections import OrderedDict diff --git a/third_party/text_processing/normalization/phone.py b/third_party/text_processing/normalization/phone.py index e8bdecd7..1acc1836 100644 --- a/third_party/text_processing/normalization/phone.py +++ b/third_party/text_processing/normalization/phone.py @@ -1,6 +1,7 @@ import re from .num import verbalize_digit + # 规范化固话/手机号码 # 手机 # http://www.jihaoba.com/news/show/13680 @@ -27,4 +28,4 @@ def phone2str(phone_string: str, mobile=True) -> str: def replace_phone(match: re.Match) -> str: - return phone2str(match.group(0)) \ No newline at end of file + return phone2str(match.group(0)) diff --git a/third_party/text_processing/normalization/quantifier.py b/third_party/text_processing/normalization/quantifier.py index 836fc88c..024eb6e0 100644 --- a/third_party/text_processing/normalization/quantifier.py +++ b/third_party/text_processing/normalization/quantifier.py @@ -1,6 +1,7 @@ import re from .num import num2str + # 温度表达式,温度会影响负号的读法 # -3°C 零下三度 RE_TEMPERATURE = re.compile( @@ -14,4 +15,4 @@ def replace_temperature(match: re.Match) -> str: temperature: str = num2str(temperature) unit: str = "摄氏度" if unit == "摄氏度" else "度" result = f"{sign}{temperature}{unit}" - return result \ No newline at end of file + return result diff --git a/third_party/text_processing/normalization/sentence_split.py b/third_party/text_processing/normalization/sentence_split.py index 451371da..5867342b 100644 --- a/third_party/text_processing/normalization/sentence_split.py +++ b/third_party/text_processing/normalization/sentence_split.py @@ -1,6 +1,7 @@ import re from typing import List + SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)') def split(text: str) -> List[str]: