diff --git a/examples/text_normalization/README.md b/examples/text_normalization/README.md new file mode 100644 index 00000000..108bbf10 --- /dev/null +++ b/examples/text_normalization/README.md @@ -0,0 +1,3 @@ +# Regular expression based text normalization for Chinese + +For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example. \ No newline at end of file diff --git a/examples/text_normalization/data/sentences.txt b/examples/text_normalization/data/sentences.txt new file mode 100644 index 00000000..d15bfe46 --- /dev/null +++ b/examples/text_normalization/data/sentences.txt @@ -0,0 +1,26 @@ +今天的最低气温达到-10°C. +只要有33/4的人同意,就可以通过决议。 +1945年5月2日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。 +4月16日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。 +如果剩下的30.6%是过去,那么还有69.4%. +事情发生在2020/03/31的上午8:00. +警方正在找一支.22口径的手枪。 +欢迎致电中国联通,北京2022年冬奥会官方合作伙伴为您服务 +充值缴费请按1,查询话费及余量请按2,跳过本次提醒请按井号键。 +快速解除流量封顶请按星号键,腾讯王卡产品介绍、使用说明、特权及活动请按9,查询话费、套餐余量、积分及活动返款请按1,手机上网流量开通及取消请按2,查询本机号码及本号所使用套餐请按4,密码修改及重置请按5,紧急开机请按6,挂失请按7,查询充值记录请按8,其它自助服务及人工服务请按0 +智能客服助理快速查话费、查流量请按9,了解北京联通业务请按1,宽带IPTV新装、查询请按2,障碍报修请按3,充值缴费请按4,投诉建议请按5,政企业务请按7,人工服务请按0,for english severice press star key +您的帐户当前可用余额为63.89元,本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。 +您的帐户当前可用余额为负15.5元,本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。 +尊敬的客户,您目前的话费余额为负14.60元,已低于10元,为保证您的通信畅通,请及时缴纳费用。 +您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。 +您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您? +您的账户当前可用余额为负36.00元,本月消费36.00元。 +请问你是电话13985608526的机主吗? 
"""Normalize every line of a text file with the rule-based Chinese normalizer.

Usage: python3 test_normalization.py INPUT OUTPUT
"""
import argparse

from text_processing import normalization


def main():
    """Read sentences from the input file and write their normalized forms."""
    parser = argparse.ArgumentParser(
        description="Normalize text in Chinese with some rules.")
    parser.add_argument("input", type=str, help="the input sentences")
    parser.add_argument(
        "output", type=str, help="path to save the output file.")
    args = parser.parse_args()

    # The encoding is given explicitly: without it open() falls back to the
    # locale's preferred encoding, and PYTHONIOENCODING only covers
    # stdin/stdout/stderr, not files opened with open().
    with open(args.input, 'rt', encoding='utf-8') as fin, \
            open(args.output, 'wt', encoding='utf-8') as fout:
        for sent in fin:
            # One sentence per input line; surrounding whitespace is dropped.
            fout.write(normalization.normalize_sentence(sent.strip()))
            fout.write('\n')


if __name__ == "__main__":
    main()
# str.translate looks characters up by Unicode ordinal, so a dict keyed by
# one-character strings is silently ignored. str.maketrans accepts either key
# form and yields an ordinal-keyed table. The three tables have disjoint keys,
# so merging them gives the same result as three passes in a single pass.
_F2H_TABLE = str.maketrans({**F2H_ASCII_LETTERS, **F2H_DIGITS, **F2H_SPACE})


def normalize_sentence(sentence):
    """Normalize one sentence into plain Chinese characters.

    Parameters
    ----------
    sentence : str
        A single sentence.

    Returns
    -------
    str
        The sentence after traditional -> simplified conversion, full-width
        -> half-width conversion of letters, digits and the ideographic
        space, and verbalization of number-like expressions.
    """
    # basic character conversions
    sentence = tranditional_to_simplified(sentence)
    sentence = sentence.translate(_F2H_TABLE)

    # Number-related verbalization. Order matters: the specific patterns
    # (dates, times, temperatures, ranges, fractions, percentages, phone
    # numbers) must run before RE_DEFAULT_NUM and RE_NUMBER, which would
    # otherwise consume their digits as plain numbers.
    sentence = RE_DATE.sub(replace_date, sentence)
    sentence = RE_DATE2.sub(replace_date2, sentence)
    sentence = RE_TIME.sub(replace_time, sentence)
    sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
    sentence = RE_RANGE.sub(replace_range, sentence)
    sentence = RE_FRAC.sub(replace_frac, sentence)
    sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
    sentence = RE_MOBILE_PHONE.sub(replace_phone, sentence)
    sentence = RE_TELEPHONE.sub(replace_phone, sentence)
    sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
    sentence = RE_NUMBER.sub(replace_number, sentence)

    return sentence


def normalize(text):
    """Split ``text`` into sentences and normalize each one.

    Parameters
    ----------
    text : str
        Text that may contain several sentences.

    Returns
    -------
    list
        The normalized sentences, in their original order.
    """
    return [normalize_sentence(sent) for sent in split(text)]
# Full-width <-> half-width conversion tables.
#
# The full-width form of a printable ASCII character sits exactly 65248
# (0xFEE0) code points above the half-width one. Every table is keyed by
# Unicode ordinal (int) because that is what ``str.translate`` looks up; a
# table keyed by one-character strings is silently ignored by it.

# Full-width -> half-width ASCII letters (52 entries).
F2H_ASCII_LETTERS = {
    ord(char) + 65248: char
    for char in string.ascii_letters
}
# Half-width -> full-width ASCII letters.
H2F_ASCII_LETTERS = {
    ord(half): chr(full)
    for full, half in F2H_ASCII_LETTERS.items()
}

# Full-width -> half-width digits (10 entries).
F2H_DIGITS = {
    ord(char) + 65248: char
    for char in string.digits
}
# Half-width -> full-width digits.
H2F_DIGITS = {
    ord(half): chr(full)
    for full, half in F2H_DIGITS.items()
}

# Full-width -> half-width punctuation (32 entries).
F2H_PUNCTUATIONS = {
    ord(char) + 65248: char
    for char in string.punctuation
}
# Half-width -> full-width punctuation.
H2F_PUNCTUATIONS = {
    ord(half): chr(full)
    for full, half in F2H_PUNCTUATIONS.items()
}

# Space (1 entry): the ideographic space U+3000 is not at the 65248 offset,
# so it is mapped explicitly.
F2H_SPACE = {ord('\u3000'): ' '}
H2F_SPACE = {ord(' '): '\u3000'}
# Arabic digit character -> Chinese numeral character.
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
# Power of ten -> Chinese unit character, in ascending order. ``_get_value``
# scans it in reverse to find the largest unit that fits a digit string.
UNITS = OrderedDict({
    1: '十',
    2: '百',
    3: '千',
    4: '万',
    8: '亿',
})

# Fractions, e.g. 3/4 or -1/2.
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')


def replace_frac(match: re.Match) -> str:
    """Verbalize a fraction matched by ``RE_FRAC``.

    The denominator is read first: "3/4" becomes "四分之三". A leading minus
    sign is read as "负".
    """
    sign = "负" if match.group(1) else ""
    numerator = num2str(match.group(2))
    denominator = num2str(match.group(3))
    return f"{sign}{denominator}分之{numerator}"


# Percentages, e.g. 30.6% or -5%.
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')


def replace_percentage(match: re.Match) -> str:
    """Verbalize a percentage matched by ``RE_PERCENTAGE``.

    "30.6%" becomes "百分之三十点六"; a leading minus sign is read as "负".
    """
    sign = "负" if match.group(1) else ""
    percent = num2str(match.group(2))
    return f"{sign}百分之{percent}"


# Integers with or without a minus sign: 12, -10.
RE_INTEGER = re.compile(
    r'(-?)'
    r'(\d+)'
)

# Identifier-like unsigned integers of four or more digits, e.g. 00078.
RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')


def replace_default_num(match: re.Match):
    """Read a number matched by ``RE_DEFAULT_NUM`` digit by digit."""
    return verbalize_digit(match.group(0))


# General numbers:
# 1. integers: -10, 10
# 2. floating point numbers: 10.2, -0.3
# 3. pure decimals without sign and integer part: .22, .38
RE_NUMBER = re.compile(
    r'(-?)((\d+)(\.\d+)?)'
    r'|(\.(\d+))'
)


def replace_number(match: re.Match) -> str:
    """Verbalize a number matched by ``RE_NUMBER``.

    Group 5 is only set when the pure-decimal alternative matched (".22");
    otherwise group 1 holds the optional sign and group 2 the number.
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        return num2str(pure_decimal)
    sign = "负" if match.group(1) else ""
    return f"{sign}{num2str(match.group(2))}"


# Ranges: 12-23, 12~23.
RE_RANGE = re.compile(
    r'(\d+)[-~](\d+)'
)


def replace_range(match: re.Match) -> str:
    """Verbalize a range matched by ``RE_RANGE``: "12~23" -> "十二到二十三"."""
    first = num2str(match.group(1))
    second = num2str(match.group(2))
    return f"{first}到{second}"


def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
    """Return the Chinese symbols for a string of digits.

    Parameters
    ----------
    value_string : str
        Digits only; may carry leading zeros when it is the lower part of a
        larger number.
    use_zero : bool
        Whether a single non-zero digit preceded by zeros is read with a
        leading "零" (e.g. "01" -> ["零", "一"]).

    Returns
    -------
    List[str]
        Numeral and unit characters in reading order; empty when the value
        is zero.
    """
    stripped = value_string.lstrip('0')
    if len(stripped) == 0:
        return []
    if len(stripped) == 1:
        if use_zero and len(stripped) < len(value_string):
            return [DIGITS['0'], DIGITS[stripped]]
        return [DIGITS[stripped]]

    # Split at the largest unit that fits. The unit is chosen from the
    # stripped length, but the unstripped string is sliced so that leading
    # zeros survive into the recursion and produce the "零" filler.
    largest_unit = next(
        power for power in reversed(UNITS.keys()) if power < len(stripped))
    first_part = value_string[:-largest_unit]
    second_part = value_string[-largest_unit:]
    return (_get_value(first_part) + [UNITS[largest_unit]] +
            _get_value(second_part))


def verbalize_cardinal(value_string: str) -> str:
    """Read a string of digits as a cardinal number.

    An empty string yields an empty string; any all-zero string yields "零".
    A leading "一十" is abbreviated to "十" (15 -> "十五").
    """
    if not value_string:
        return ''

    # 000 -> '零' , 0 -> '零'
    value_string = value_string.lstrip('0')
    if len(value_string) == 0:
        return DIGITS['0']

    result_symbols = _get_value(value_string)
    # verbalized number starting with '一十*' is abbreviated as `十*`
    if (len(result_symbols) >= 2 and result_symbols[0] == DIGITS['1'] and
            result_symbols[1] == UNITS[1]):
        result_symbols = result_symbols[1:]
    return ''.join(result_symbols)


def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a string of digits one digit at a time.

    Parameters
    ----------
    value_string : str
        Digits only.
    alt_one : bool
        When true, "一" is replaced by the alternative reading "幺".

    Returns
    -------
    str
        One Chinese numeral per input digit.
    """
    result = ''.join(DIGITS[digit] for digit in value_string)
    if alt_one:
        # str.replace returns a new string; it must be assigned back.
        result = result.replace("一", "幺")
    return result


def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal number.

    Parameters
    ----------
    value_string : str
        Digits with at most one decimal point, e.g. "12", "3.20" or ".22".

    Returns
    -------
    str
        The integer part read as a cardinal, followed by "点" and the
        fractional digits when the fraction is non-zero.

    Raises
    ------
    ValueError
        If ``value_string`` contains more than one decimal point.
    """
    integer_decimal = value_string.split('.')
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ''
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    # Trailing zeros of the fraction are not read.
    decimal = decimal.rstrip('0')
    if decimal:
        # '.22' is verbalized as '点二二'
        # '3.20' is verbalized as '三点二'
        result += '点' + verbalize_digit(decimal)
    return result
# Temperature expressions. A leading minus sign is read as "零下" rather than
# "负", e.g. -3°C -> 零下三度.
RE_TEMPERATURE = re.compile(
    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
)


def replace_temperature(match: re.Match) -> str:
    """Verbalize a temperature matched by ``RE_TEMPERATURE``.

    Parameters
    ----------
    match : re.Match
        Group 1 is the optional minus sign, group 2 the number, group 3 the
        optional fractional part and group 4 the unit.

    Returns
    -------
    str
        The temperature read out; the unit is kept as "摄氏度" when written
        that way and read as "度" otherwise.
    """
    sign = "零下" if match.group(1) else ""
    temperature = num2str(match.group(2))
    # The unit is group 4; group 3 is the fractional part of the number.
    unit = "摄氏度" if match.group(4) == "摄氏度" else "度"
    return f"{sign}{temperature}{unit}"