add text normalization example

pull/658/head
Feiyu Chan 3 years ago committed by chenfeiyu
parent c65ea55bb1
commit 7779f33e74

@ -0,0 +1,3 @@
# Regular expression based text normalization for Chinese
For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.

@ -0,0 +1,26 @@
今天的最低气温达到-10°C.
只要有33/4的人同意就可以通过决议。
1945年5月2日苏联士兵在德国国会大厦上升起了胜利旗象征着攻占柏林并战胜了纳粹德国。
4月16日清晨的战斗以炮击揭幕数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地炮击持续了数天之久。
如果剩下的30.6%是过去那么还有69.4%.
事情发生在2020/03/31的上午8:00.
警方正在找一支.22口径的手枪。
欢迎致电中国联通北京2022年冬奥会官方合作伙伴为您服务
充值缴费请按1查询话费及余量请按2跳过本次提醒请按井号键。
快速解除流量封顶请按星号键腾讯王卡产品介绍、使用说明、特权及活动请按9查询话费、套餐余量、积分及活动返款请按1手机上网流量开通及取消请按2查询本机号码及本号所使用套餐请按4密码修改及重置请按5紧急开机请按6挂失请按7查询充值记录请按8其它自助服务及人工服务请按0
智能客服助理快速查话费、查流量请按9了解北京联通业务请按1宽带IPTV新装、查询请按2障碍报修请按3充值缴费请按4投诉建议请按5政企业务请按7人工服务请按0for english severice press star key
您的帐户当前可用余额为63.89元本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
您的帐户当前可用余额为负15.5元本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
尊敬的客户您目前的话费余额为负14.60元已低于10元为保证您的通信畅通请及时缴纳费用。
您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。
您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您?
您的账户当前可用余额为负36.00元本月消费36.00元。
请问你是电话13985608526的机主吗
如您对处理结果不满意可拨打中国联通集团投诉电话10015进行投诉按本地通话费收费返回自助服务请按井号键
“26314”号VIP客服代表为您服务。
尊敬的5G用户欢迎您致电中国联通
首先是应用了M1芯片的iPad Pro新款的iPad Pro支持5G这也是苹果的第二款5G产品线。
除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。
屏幕方面iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏支持HDR10、杜比视界还支持杜比全景声。
iPad Pro的秒控键盘这次也推出白色版本。
售价方面11英寸版本售价799美元起12.9英寸售价1099美元起。

@ -0,0 +1,14 @@
"""Normalize every line of a text file and write the results to another file.

Usage: python3 test_normalization.py INPUT OUTPUT
"""
import argparse

from text_processing import normalization

parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

# The sentences are Chinese text stored as UTF-8. Pass the encoding explicitly
# so reading/writing does not depend on the process locale (the accompanying
# path.sh exports LC_ALL=C).
with open(args.input, 'rt', encoding='utf-8') as fin, \
        open(args.output, 'wt', encoding='utf-8') as fout:
    for sent in fin:
        # One input line is treated as one sentence; surrounding whitespace
        # (including the trailing newline) is removed before normalization.
        sent = normalization.normalize_sentence(sent.strip())
        fout.write(sent)
        fout.write('\n')

@ -0,0 +1,8 @@
# Environment setup, meant to be sourced from the example directory.
# MAIN_ROOT is resolved relative to the current working directory.
export MAIN_ROOT=${PWD}/../../
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Note: a '#' that directly follows a word is not a comment in bash; it would
# become part of the value, so nothing may trail the assignment here.
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}

@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Run the text normalization example: normalize ${data_dir}/${filename}
# and store the result in ${exp_dir}/normalized.txt.

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

# NOTE(review): parse_options.sh presumably lets the variables above be
# overridden from the command line -- confirm against utils/parse_options.sh.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

# All expansions are quoted so that directories containing spaces still work.
mkdir -p "${exp_dir}"

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Processing "
    python3 local/test_normalization.py "${data_dir}/${filename}" "${exp_dir}/normalized.txt"
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text save at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0

@ -0,0 +1,42 @@
from .sentence_split import split
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num
from .chronology import RE_TIME, RE_DATE, RE_DATE2
from .chronology import replace_time, replace_date, replace_date2
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_temperature
from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone
from .char_convert import tranditional_to_simplified
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
# ``str.translate`` looks characters up by integer code point, so a mapping
# keyed by one-character strings never matches anything. ``str.maketrans``
# accepts either key form and produces a table ``translate`` can use.
_F2H_TABLE = str.maketrans({**F2H_ASCII_LETTERS, **F2H_DIGITS, **F2H_SPACE})

# (pattern, replacer) pairs applied in this order. The specific patterns
# (dates, times, temperatures, ...) come before the generic number patterns,
# which would otherwise consume their digits first.
_NSW_RULES = (
    (RE_DATE, replace_date),
    (RE_DATE2, replace_date2),
    (RE_TIME, replace_time),
    (RE_TEMPERATURE, replace_temperature),
    (RE_RANGE, replace_range),
    (RE_FRAC, replace_frac),
    (RE_PERCENTAGE, replace_percentage),
    (RE_MOBILE_PHONE, replace_phone),
    (RE_TELEPHONE, replace_phone),
    (RE_DEFAULT_NUM, replace_default_num),
    (RE_NUMBER, replace_number),
)


def normalize_sentence(sentence):
    """Normalize one sentence with character conversions and regex rules.

    Parameters
    ----------
    sentence : str
        The sentence to normalize.

    Returns
    -------
    str
        The sentence converted to simplified Chinese, with full-width ASCII
        letters, digits and spaces mapped to half-width, and with every
        pattern in ``_NSW_RULES`` rewritten by its replacer.
    """
    # basic character conversions
    sentence = tranditional_to_simplified(sentence)
    sentence = sentence.translate(_F2H_TABLE)

    # number related NSW verbalization
    for pattern, replacer in _NSW_RULES:
        sentence = pattern.sub(replacer, sentence)
    return sentence
def normalize(text):
    """Split ``text`` into sentences and normalize each of them.

    Parameters
    ----------
    text : str
        The input text, possibly containing several sentences.

    Returns
    -------
    list
        One normalized string per sentence produced by ``split``.
    """
    return [normalize_sentence(piece) for piece in split(text)]

@ -0,0 +1,14 @@
"""Traditional and simplified Chinese conversion with
`opencc <https://github.com/BYVoid/OpenCC>`_.
"""
import opencc
# OpenCC converters, created once when the module is imported and reused by
# the conversion functions in this module. The argument names the OpenCC
# configuration to load.
# NOTE(review): "t2s"/"s2t" presumably stand for traditional->simplified and
# simplified->traditional, as the function names suggest -- confirm against
# the installed opencc package.
_t2s_converter = opencc.OpenCC("t2s.json")
_s2t_converter = opencc.OpenCC('s2t.json')
def tranditional_to_simplified(text: str) -> str:
    """Convert ``text`` with the module-level ``t2s.json`` OpenCC converter.

    The misspelled function name is kept as is because other modules import
    it under this name.
    """
    simplified = _t2s_converter.convert(text)
    return simplified
def simplified_to_traditional(text: str) -> str:
    """Convert ``text`` with the module-level ``s2t.json`` OpenCC converter."""
    traditional = _s2t_converter.convert(text)
    return traditional

@ -0,0 +1,63 @@
import re
from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
def _time_num2str(num_string: str) -> str:
    """Verbalize a minute/second field, keeping one spoken leading zero.

    The leading zeros are removed before calling ``num2str``; if the field
    started with a zero, a single ``DIGITS['0']`` is put back in front of the
    verbalized remainder.
    """
    spoken = num2str(num_string.lstrip('0'))
    if not num_string.startswith('0'):
        return spoken
    return DIGITS['0'] + spoken
# Clock time expression: H:MM or H:MM:SS
RE_TIME = re.compile(
    r'([0-1]?[0-9]|2[0-3])'  # group 1: hour, 0-23, leading zero optional
    r':([0-5][0-9])'  # group 2: minute, two digits
    r'(:([0-5][0-9]))?'  # group 4: optional second (group 3 includes ':')
)


def replace_time(match: re.Match) -> str:
    """Verbalize a clock time matched by ``RE_TIME``.

    The colons are consumed by the pattern, so the spoken unit markers are
    emitted here: 点 after the hour, 分 after the minute and 秒 after the
    second. A minute or second field consisting only of zeros is omitted.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_TIME``.

    Returns
    -------
    str
        The verbalized time, e.g. ``8:00`` gives ``八点`` and ``8:05`` gives
        ``八点零五分``.
    """
    hour = match.group(1)
    minute = match.group(2)
    second = match.group(4)

    result = f"{num2str(hour)}点"
    if minute.lstrip('0'):
        result += f"{_time_num2str(minute)}分"
    if second and second.lstrip('0'):
        result += f"{_time_num2str(second)}秒"
    return result
# Date written with Chinese unit characters: a year followed by an optional
# month and an optional day.
RE_DATE = re.compile(
    r'(\d{4}|\d{2})年'  # group 1: 4- or 2-digit year
    r'((0?[1-9]|1[0-2])月)?'  # group 3: month 1-12
    r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?'  # group 5: day, group 9: 日/号
)


def replace_date(match: re.Match) -> str:
    """Verbalize a date matched by ``RE_DATE``.

    The year is read digit by digit, month and day are read as cardinals.
    The 年 and 月 characters are consumed by the pattern, so they are put back
    here; the day keeps whichever of 日/号 was written in the input.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_DATE``.

    Returns
    -------
    str
        The verbalized date.
    """
    year = match.group(1)
    month = match.group(3)
    day = match.group(5)

    result = ""
    if year:
        result += f"{verbalize_digit(year)}年"
    if month:
        result += f"{verbalize_cardinal(month)}月"
    if day:
        result += f"{verbalize_cardinal(day)}{match.group(9)}"
    return result
# Dates written as YYYY/MM/DD or YYYY-MM-DD. A space or '.' is accepted as
# separator as well; the back-reference \2 requires both separators to be the
# same character. Month and day must be written with two digits.
RE_DATE2 = re.compile(
    r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])'
)


def replace_date2(match: re.Match) -> str:
    """Verbalize a separator-delimited date matched by ``RE_DATE2``.

    The separators are consumed by the pattern, so the spoken unit characters
    年/月/日 are appended here; without them the parts would run together into
    one undelimited string of numerals.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_DATE2``.

    Returns
    -------
    str
        The year read digit by digit followed by 年, then the month and the
        day read as cardinals followed by 月 and 日.
    """
    year = match.group(1)
    month = match.group(3)
    day = match.group(4)

    result = ""
    if year:
        result += f"{verbalize_digit(year)}年"
    if month:
        result += f"{verbalize_cardinal(month)}月"
    if day:
        result += f"{verbalize_cardinal(day)}日"
    return result

@ -0,0 +1,57 @@
import string
import re
from pypinyin.constants import SUPPORT_UCS4
# Full-width <-> half-width conversion tables.
# A full-width character is found 65248 code points above its half-width
# ASCII counterpart.
# NOTE: the keys are one-character strings. ``str.translate`` looks characters
# up by integer code point, so pass a table through ``str.maketrans`` before
# handing it to ``str.translate``.

# ASCII letters, full-width -> half-width (52 entries)
F2H_ASCII_LETTERS = dict(
    (chr(ord(half) + 65248), half) for half in string.ascii_letters)

# ASCII letters, half-width -> full-width
H2F_ASCII_LETTERS = {half: full for full, half in F2H_ASCII_LETTERS.items()}

# Digits, full-width -> half-width (10 entries)
F2H_DIGITS = dict((chr(ord(half) + 65248), half) for half in string.digits)

# Digits, half-width -> full-width
H2F_DIGITS = {half: full for full, half in F2H_DIGITS.items()}

# Punctuation, full-width -> half-width (32 entries)
F2H_PUNCTUATIONS = dict(
    (chr(ord(half) + 65248), half) for half in string.punctuation)

# Punctuation, half-width -> full-width
H2F_PUNCTUATIONS = {half: full for full, half in F2H_PUNCTUATIONS.items()}

# Space (1 entry): the ideographic space U+3000 <-> the ASCII space
F2H_SPACE = {'\u3000': ' '}
H2F_SPACE = {' ': '\u3000'}
# Matches a run of characters that are NOT "Chinese characters with a pinyin
# reading"; usable for extracting NSW (presumably "non-standard words").
# NOTE(review): SUPPORT_UCS4 comes from pypinyin and presumably tells whether
# code points above U+FFFF can be used in a pattern -- confirm in pypinyin.
if SUPPORT_UCS4:
    RE_NSW = re.compile(
        r'(?:[^'
        r'\u3007'  # U+3007, the ideographic number zero
        r'\u3400-\u4dbf'  # CJK Extension A: [3400-4DBF]
        r'\u4e00-\u9fff'  # CJK Unified Ideographs (basic): [4E00-9FFF]
        r'\uf900-\ufaff'  # CJK Compatibility Ideographs: [F900-FAFF]
        r'\U00020000-\U0002A6DF'  # CJK Extension B: [20000-2A6DF]
        r'\U0002A703-\U0002B73F'  # CJK Extension C: [2A700-2B73F]
        r'\U0002B740-\U0002B81D'  # CJK Extension D: [2B740-2B81D]
        r'\U0002F80A-\U0002FA1F'  # CJK Compatibility Supplement: [2F800-2FA1F]
        r'])+'
    )
else:
    # Same pattern restricted to the ranges below U+10000.
    RE_NSW = re.compile(  # pragma: no cover
        r'(?:[^'
        r'\u3007'  # U+3007, the ideographic number zero
        r'\u3400-\u4dbf'  # CJK Extension A: [3400-4DBF]
        r'\u4e00-\u9fff'  # CJK Unified Ideographs (basic): [4E00-9FFF]
        r'\uf900-\ufaff'  # CJK Compatibility Ideographs: [F900-FAFF]
        r'])+'
    )

@ -0,0 +1,154 @@
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from typing import List
from collections import OrderedDict
# Maps each decimal digit character to its Chinese numeral.
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}

# Maps a power of ten to the name of that unit: 10 -> 十, 100 -> 百,
# 1000 -> 千, 10**4 -> 万, 10**8 -> 亿. The keys are kept in ascending order
# because the verbalization code scans them in reverse to find the largest
# unit that fits a number. Every unit needs a non-empty name, otherwise
# numbers such as 30 and 300 are both verbalized as a bare '三'.
UNITS = OrderedDict({
    1: '十',
    2: '百',
    3: '千',
    4: '万',
    8: '亿',
})
# Fraction expression: optional minus sign, numerator, '/', denominator
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')


def replace_frac(match: re.Match) -> str:
    """Verbalize a fraction matched by ``RE_FRAC``.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_FRAC``.

    Returns
    -------
    str
        ``<denominator>分之<numerator>``, prefixed with 负 when the fraction
        was written with a leading minus sign.
    """
    # The minus sign is consumed by the pattern, so it has to be spoken here.
    sign = "负" if match.group(1) else ""
    nominator = num2str(match.group(2))
    denominator = num2str(match.group(3))
    return f"{sign}{denominator}分之{nominator}"
# Percentage expression: optional minus sign, integer or decimal number, '%'
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')


def replace_percentage(match: re.Match) -> str:
    """Verbalize a percentage matched by ``RE_PERCENTAGE``.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_PERCENTAGE``.

    Returns
    -------
    str
        ``百分之<number>``, prefixed with 负 when the percentage was written
        with a leading minus sign.
    """
    # The minus sign is consumed by the pattern, so it has to be spoken here.
    sign = "负" if match.group(1) else ""
    percent = num2str(match.group(2))
    return f"{sign}百分之{percent}"
# Integer expression
# An integer with or without a leading minus sign, e.g. 12, -10
RE_INTEGER = re.compile(
    r'(-?)'  # group 1: optional minus sign
    r'(\d+)'  # group 2: the digits
)
# Identifier-like unsigned integers: any run of four or more digits,
# e.g. 00078
RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')


def replace_default_num(match: re.Match):
    """Read the digit run matched by ``RE_DEFAULT_NUM`` digit by digit."""
    return verbalize_digit(match.group(0))
# Number expression
# 1. integers: -10, 10
# 2. floating point numbers: 10.2, -0.3
# 3. pure decimals without sign and integer part: .22, .38
RE_NUMBER = re.compile(
    r'(-?)((\d+)(\.\d+)?)'  # group 1: sign, group 2: integer or float
    r'|(\.(\d+))'  # group 5: pure decimal including the leading '.'
)


def replace_number(match: re.Match) -> str:
    """Verbalize a number matched by ``RE_NUMBER``.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_NUMBER``.

    Returns
    -------
    str
        The verbalized number, prefixed with 负 when it was written with a
        leading minus sign. A pure decimal such as ``.22`` carries no sign.
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        return num2str(pure_decimal)

    # The minus sign is consumed by the pattern, so it has to be spoken here.
    sign = "负" if match.group(1) else ""
    number = num2str(match.group(2))
    return f"{sign}{number}"
# Range expression: two integers joined by '-' or '~', e.g. 12-23, 12~23
RE_RANGE = re.compile(
    r'(\d+)[-~](\d+)'
)


def replace_range(match: re.Match) -> str:
    """Verbalize a numeric range matched by ``RE_RANGE``.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_RANGE``.

    Returns
    -------
    str
        ``<first>到<second>``. The separator is consumed by the pattern, so
        到 is inserted to keep the two numbers apart.
    """
    first = num2str(match.group(1))
    second = num2str(match.group(2))
    return f"{first}到{second}"
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
    """Break a digit string into the symbols of its cardinal reading.

    Parameters
    ----------
    value_string : str
        A string of decimal digits, possibly with leading zeros.
    use_zero : bool
        Whether a single digit that was preceded by zeros gets a
        ``DIGITS['0']`` symbol in front of it. The recursive calls do not
        forward this argument and therefore always use the default.

    Returns
    -------
    List[str]
        The digit and unit symbols in reading order; empty when the string
        consists only of zeros (or is empty).
    """
    stripped = value_string.lstrip('0')
    if not stripped:
        return []

    if len(stripped) == 1:
        digit = [DIGITS[stripped]]
        had_leading_zero = len(stripped) < len(value_string)
        if use_zero and had_leading_zero:
            return [DIGITS['0']] + digit
        return digit

    # Largest unit whose power of ten is below the number of significant
    # digits; the string is cut at that position and both halves are
    # verbalized recursively. The cut is made on the unstripped string.
    largest_unit = next(
        power for power in reversed(UNITS.keys()) if power < len(stripped))
    high_part = value_string[:-largest_unit]
    low_part = value_string[-largest_unit:]
    return _get_value(high_part) + [UNITS[largest_unit]] + _get_value(low_part)
def verbalize_cardinal(value_string: str) -> str:
    """Read a digit string as a cardinal number.

    Parameters
    ----------
    value_string : str
        A string of decimal digits.

    Returns
    -------
    str
        An empty string for empty input, ``DIGITS['0']`` when the input
        consists only of zeros, otherwise the joined symbols returned by
        ``_get_value`` for the input without its leading zeros.
    """
    if not value_string:
        return ''

    # 000 -> '零' , 0 -> '零'
    significant = value_string.lstrip('0')
    if not significant:
        return DIGITS['0']

    symbols = _get_value(significant)
    # A reading that starts with the digit one followed by the tens unit
    # drops that leading one.
    starts_with_one_ten = (len(symbols) >= 2 and
                           symbols[0] == DIGITS['1'] and
                           symbols[1] == UNITS[1])
    if starts_with_one_ten:
        symbols = symbols[1:]
    return ''.join(symbols)
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one digit at a time.

    Parameters
    ----------
    value_string : str
        A string of decimal digits; any other character raises ``KeyError``.
    alt_one : bool
        When true, every 一 in the result is replaced by the alternative
        reading 幺 (the phone-number verbalization passes ``alt_one=True``).

    Returns
    -------
    str
        The verbalized digits.
    """
    result = ''.join(DIGITS[digit] for digit in value_string)
    if alt_one:
        # str.replace returns a new string, so the result must be rebound.
        result = result.replace("一", "幺")
    return result
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal number string.

    Parameters
    ----------
    value_string : str
        Digits with at most one ``.`` in them, e.g. ``12``, ``3.20``, ``.22``.

    Returns
    -------
    str
        The integer part read as a cardinal, followed by 点 and the decimal
        digits read one by one. Trailing zeros of the decimal part are
        dropped, and a decimal part made up only of zeros is omitted.

    Raises
    ------
    ValueError
        If ``value_string`` contains more than one ``.``.
    """
    integer_decimal = value_string.split('.')
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ''
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip('0')
    if decimal:
        # '.22' is verbalized as '点二二'
        # '3.20' is verbalized as '三点二'
        result += '点' + verbalize_digit(decimal)
    return result

@ -0,0 +1,30 @@
import re
from .num import verbalize_digit
# Normalization of landline / mobile phone numbers.
# Mobile number prefixes by carrier, as listed at
# http://www.jihaoba.com/news/show/13680
# China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152,
#               188, 187, 182, 183, 184, 178, 198
# China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
# China Telecom: 133, 153, 189, 180, 181, 177
#
# Mobile: an optional "+86"/"86" prefix (optionally followed by a space) and
# an 11-digit number starting with 1. The look-arounds reject matches that
# are directly adjacent to further digits.
RE_MOBILE_PHONE= re.compile(
    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
# Landline: an optional area code (0 followed by 10, 21-23 or three digits
# starting with 3-9) with an optional '-', then a 7- or 8-digit number that
# does not start with 0.
RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
def phone2str(phone_string: str, mobile=True) -> str:
    """Read a phone number digit by digit.

    Parameters
    ----------
    phone_string : str
        The phone number as written in the text.
    mobile : bool
        When true, a surrounding ``+`` is removed and the number is split on
        whitespace; otherwise it is split on ``-``.

    Returns
    -------
    str
        The parts verbalized with ``verbalize_digit(..., alt_one=True)`` and
        concatenated without a separator.
    """
    if mobile:
        parts = phone_string.strip('+').split()
    else:
        parts = phone_string.split('-')
    spoken_parts = [verbalize_digit(part, alt_one=True) for part in parts]
    return ''.join(spoken_parts)
def replace_phone(match: re.Match) -> str:
    """Verbalize a phone number matched by one of the phone patterns.

    This replacer serves both the mobile and the landline pattern. Only a
    landline match can contain ``-`` (between area code and number), and the
    mobile branch of ``phone2str`` cannot verbalize that character, so the
    branch is chosen from the matched text.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_MOBILE_PHONE`` or ``RE_TELEPHONE``.

    Returns
    -------
    str
        The phone number read digit by digit.
    """
    phone = match.group(0)
    return phone2str(phone, mobile='-' not in phone)

@ -0,0 +1,17 @@
import re
from .num import num2str
# Temperature expression. A temperature changes how the minus sign is read:
# -3°C is verbalized as 零下三度
RE_TEMPERATURE = re.compile(
    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
)


def replace_temperature(match: re.Match) -> str:
    """Verbalize a temperature matched by ``RE_TEMPERATURE``.

    Parameters
    ----------
    match : re.Match
        A match object produced by ``RE_TEMPERATURE``.

    Returns
    -------
    str
        ``<number><unit>``, prefixed with 零下 when the temperature was
        written with a minus sign. The unit is 摄氏度 when the input spelled
        it out that way and 度 for every other accepted unit.
    """
    sign = "零下" if match.group(1) else ""
    temperature = num2str(match.group(2))
    # Group 3 is the optional fractional part of the number; the unit is
    # captured by group 4.
    unit = "摄氏度" if match.group(4) == "摄氏度" else "度"
    return f"{sign}{temperature}{unit}"

@ -0,0 +1,22 @@
import re
from typing import List
# A sentence-ending punctuation mark, optionally followed by a closing quote.
SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)')


def split(text: str) -> List[str]:
    """Cut a text into sentences at sentence-ending punctuation.

    A line break is inserted after every match of ``SENTENCE_SPLITOR``; the
    text is then split on runs of line breaks, so line breaks already present
    in the input separate sentences as well.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    List[str]
        The sentences with surrounding whitespace removed. An empty input
        yields a list holding one empty string.
    """
    marked = SENTENCE_SPLITOR.sub(r'\1\n', text).strip()
    return [piece.strip() for piece in re.split(r'\n+', marked)]
Loading…
Cancel
Save