add text normalization example

5 years ago · 7779f33e74
parent c65ea55bb1
commit 7779f33e74
14 changed files with 477 additions and 0 deletions
--- a/examples/text_normalization/README.md
+++ b/examples/text_normalization/README.md
@ -0,0 +1,3 @@
+# Regular expression based text normalization for Chinese
+
+For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
--- a/examples/text_normalization/data/sentences.txt
+++ b/examples/text_normalization/data/sentences.txt
@ -0,0 +1,26 @@
+今天的最低气温达到-10°C.
+只要有33/4的人同意，就可以通过决议。
+1945年5月2日，苏联士兵在德国国会大厦上升起了胜利旗，象征着攻占柏林并战胜了纳粹德国。
+4月16日，清晨的战斗以炮击揭幕，数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地，炮击持续了数天之久。
+如果剩下的30.6%是过去，那么还有69.4%.
+事情发生在2020/03/31的上午8:00.
+警方正在找一支.22口径的手枪。
+欢迎致电中国联通，北京2022年冬奥会官方合作伙伴为您服务
+充值缴费请按1，查询话费及余量请按2，跳过本次提醒请按井号键。
+快速解除流量封顶请按星号键，腾讯王卡产品介绍、使用说明、特权及活动请按9，查询话费、套餐余量、积分及活动返款请按1，手机上网流量开通及取消请按2，查询本机号码及本号所使用套餐请按4，密码修改及重置请按5，紧急开机请按6，挂失请按7，查询充值记录请按8，其它自助服务及人工服务请按0
+智能客服助理快速查话费、查流量请按9，了解北京联通业务请按1，宽带IPTV新装、查询请按2，障碍报修请按3，充值缴费请按4，投诉建议请按5，政企业务请按7，人工服务请按0，for english severice press star key
+您的帐户当前可用余额为63.89元，本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发，请您注意查收。谢谢使用，再见！。
+您的帐户当前可用余额为负15.5元，本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发，请您注意查收。谢谢使用，再见！。
+尊敬的客户，您目前的话费余额为负14.60元，已低于10元，为保证您的通信畅通，请及时缴纳费用。
+您的流量已用完，为避免您产生额外费用，建议您根据需求开通一个流量包以作补充。
+您可以直接说，查询话费及余量、开通流量包、缴费，您也可以说出其它需求，请问有什么可以帮您？
+您的账户当前可用余额为负36.00元，本月消费36.00元。
+请问你是电话13985608526的机主吗？
+如您对处理结果不满意，可拨打中国联通集团投诉电话10015进行投诉，按本地通话费收费，返回自助服务请按井号键
+“26314”号VIP客服代表为您服务。
+尊敬的5G用户，欢迎您致电中国联通
+首先是应用了M1芯片的iPad Pro，新款的iPad Pro支持5G，这也是苹果的第二款5G产品线。
+除此之外，摄像头方面再次升级，增加了前摄全新超广角摄像头，支持人物居中功能，搭配超广角可实现视频中始终让人物居中效果。
+屏幕方面，iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏，支持HDR10、杜比视界，还支持杜比全景声。
+iPad Pro的秒控键盘这次也推出白色版本。
+售价方面，11英寸版本售价799美元起，12.9英寸售价1099美元起。
--- a/examples/text_normalization/local/test_normalization.py
+++ b/examples/text_normalization/local/test_normalization.py
@ -0,0 +1,14 @@
+import argparse
+from text_processing import normalization
+
+parser = argparse.ArgumentParser(description="Normalize text in Chinese with some rules.")
+parser.add_argument("input", type=str, help="the input sentences")
+parser.add_argument("output", type=str, help="path to save the output file.")
+args = parser.parse_args()
+
+with open(args.input, 'rt') as fin:
+    with open(args.output, 'wt') as fout:
+        for sent in fin:
+            sent = normalization.normalize_sentence(sent.strip())
+            fout.write(sent)
+            fout.write('\n')
--- a/examples/text_normalization/path.sh
+++ b/examples/text_normalization/path.sh
@ -0,0 +1,8 @@
+# Repository root, resolved as two levels above the current working directory.
+export MAIN_ROOT=${PWD}/../../
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Make the repository root and third_party importable. No trailing '#': a '#'
+# glued to a word is not a comment in bash and would corrupt the last entry.
+export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
--- a/examples/text_normalization/run.sh
+++ b/examples/text_normalization/run.sh
@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Runs the text normalization example: normalizes ${data_dir}/${filename} and
+# writes the result to ${exp_dir}/normalized.txt.
+
+source path.sh
+
+# Range of stages to run; only stage 1 is defined below.
+stage=-1
+stop_stage=100
+
+exp_dir=exp
+data_dir=data
+filename="sentences.txt"
+
+# NOTE(review): presumably lets the variables above be overridden from the
+# command line -- confirm against utils/parse_options.sh.
+source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
+
+mkdir -p ${exp_dir}
+
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    echo "stage 1: Processing "
+    python3 local/test_normalization.py  ${data_dir}/${filename} ${exp_dir}/normalized.txt
+    # Report the output location only if the output file exists.
+    if [ -f "${exp_dir}/normalized.txt" ]; then
+	echo "Normalized text save at ${exp_dir}/normalized.txt"
+    fi
+    # TODO(chenfeiyu): compute edit distance against ground-truth
+fi
+
+echo "done"
+exit 0
--- a/third_party/text_processing/init.py
+++ b/third_party/text_processing/init.py
--- a/third_party/text_processing/normalization/init.py
+++ b/third_party/text_processing/normalization/init.py
@ -0,0 +1,42 @@
+from .sentence_split import split
+from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM
+from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num
+
+from .chronology import RE_TIME, RE_DATE, RE_DATE2
+from .chronology import replace_time, replace_date, replace_date2
+
+from .quantifier import RE_TEMPERATURE
+from .quantifier import replace_temperature
+
+from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone
+
+from .char_convert import tranditional_to_simplified
+from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
+
+
+def normalize_sentence(sentence):
+    # basic character conversions
+    sentence = tranditional_to_simplified(sentence)
+    sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
+        F2H_DIGITS).translate(F2H_SPACE)
+
+    # number related NSW verbalization
+    sentence = RE_DATE.sub(replace_date, sentence)
+    sentence = RE_DATE2.sub(replace_date2, sentence)
+    sentence = RE_TIME.sub(replace_time, sentence)
+    sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
+    sentence = RE_RANGE.sub(replace_range, sentence)
+    sentence = RE_FRAC.sub(replace_frac, sentence)
+    sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
+    sentence = RE_MOBILE_PHONE.sub(replace_phone, sentence)
+    sentence = RE_TELEPHONE.sub(replace_phone, sentence)
+    sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
+    sentence = RE_NUMBER.sub(replace_number, sentence)
+
+    return sentence
+
+
+def normalize(text):
+    sentences = split(text)
+    sentences = [normalize_sentence(sent) for sent in sentences]
+    return sentences
--- a/third_party/text_processing/normalization/char_convert.py
+++ b/third_party/text_processing/normalization/char_convert.py
@ -0,0 +1,14 @@
+"""Traditional and simplified Chinese conversion with 
+`opencc <https://github.com/BYVoid/OpenCC>`_.
+"""
+
+import opencc
+
+_t2s_converter = opencc.OpenCC("t2s.json")
+_s2t_converter = opencc.OpenCC('s2t.json')
+
+def tranditional_to_simplified(text: str) -> str:
+    return _t2s_converter.convert(text)
+
+def simplified_to_traditional(text: str) -> str:
+    return _s2t_converter.convert(text)
--- a/third_party/text_processing/normalization/chronology.py
+++ b/third_party/text_processing/normalization/chronology.py
@ -0,0 +1,63 @@
+import re
+from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
+
+def _time_num2str(num_string: str) -> str:
+    """A special case for verbalizing number in time."""
+    result = num2str(num_string.lstrip('0'))
+    if num_string.startswith('0'):
+        result = DIGITS['0'] + result
+    return result
+
+# 时刻表达式
+RE_TIME = re.compile(
+    r'([0-1]?[0-9]|2[0-3])'
+    r':([0-5][0-9])'
+    r'(:([0-5][0-9]))?'
+)
+def replace_time(match: re.Match) -> str:
+    hour = match.group(1)
+    minute = match.group(2)
+    second = match.group(4)
+    
+    result = f"{num2str(hour)}点"
+    if minute.lstrip('0'):
+        result += f"{_time_num2str(minute)}分"
+    if second and second.lstrip('0'):
+        result += f"{_time_num2str(second)}秒"
+    return result
+
+
+RE_DATE = re.compile(
+    r'(\d{4}|\d{2})年'
+    r'((0?[1-9]|1[0-2])月)?'
+    r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?'
+)
+def replace_date(match: re.Match) -> str:
+    year = match.group(1)
+    month = match.group(3)
+    day = match.group(5)
+    result = ""
+    if year:
+        result += f"{verbalize_digit(year)}年"
+    if month:
+        result += f"{verbalize_cardinal(month)}月"
+    if day:
+        result += f"{verbalize_cardinal(day)}{match.group(9)}"
+    return result
+
+# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
+RE_DATE2 = re.compile(
+    r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])'
+)
+def replace_date2(match: re.Match) -> str:
+    year = match.group(1)
+    month = match.group(3)
+    day = match.group(4)
+    result = ""
+    if year:
+        result += f"{verbalize_digit(year)}年"
+    if month:
+        result += f"{verbalize_cardinal(month)}月"
+    if day:
+        result += f"{verbalize_cardinal(day)}日"
+    return result
--- a/third_party/text_processing/normalization/constants.py
+++ b/third_party/text_processing/normalization/constants.py
@ -0,0 +1,57 @@
+import string
+import re
+from pypinyin.constants import SUPPORT_UCS4
+
+# 全角半角转换
+# 英文字符全角 -> 半角映射表 (num: 52)
+F2H_ASCII_LETTERS = {
+    chr(ord(char) + 65248): char
+    for char in string.ascii_letters
+}
+
+# 英文字符半角 -> 全角映射表
+H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
+
+# 数字字符全角 -> 半角映射表 (num: 10)
+F2H_DIGITS = {
+    chr(ord(char) + 65248): char
+    for char in string.digits
+}
+# 数字字符半角 -> 全角映射表
+H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
+
+# 标点符号全角 -> 半角映射表 (num: 32)
+F2H_PUNCTUATIONS = {
+    chr(ord(char) + 65248): char
+    for char in string.punctuation
+}
+# 标点符号半角 -> 全角映射表
+H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
+
+# 空格 (num: 1)
+F2H_SPACE = {'\u3000': ' '}
+H2F_SPACE = {' ': '\u3000'}
+
+# 非"有拼音的汉字"的字符串，可用于NSW提取
+if SUPPORT_UCS4:
+    RE_NSW = re.compile(
+        r'(?:[^'
+        r'\u3007'                  # 〇
+        r'\u3400-\u4dbf'           # CJK扩展A:[3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK基本:[4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK兼容:[F900-FAFF]
+        r'\U00020000-\U0002A6DF'   # CJK扩展B:[20000-2A6DF]
+        r'\U0002A703-\U0002B73F'   # CJK扩展C:[2A700-2B73F]
+        r'\U0002B740-\U0002B81D'   # CJK扩展D:[2B740-2B81D]
+        r'\U0002F80A-\U0002FA1F'   # CJK兼容扩展:[2F800-2FA1F]
+        r'])+'
+    )
+else:
+    RE_NSW = re.compile(  # pragma: no cover
+        r'(?:[^'
+        r'\u3007'                  # 〇
+        r'\u3400-\u4dbf'           # CJK扩展A:[3400-4DBF]
+        r'\u4e00-\u9fff'           # CJK基本:[4E00-9FFF]
+        r'\uf900-\ufaff'           # CJK兼容:[F900-FAFF]
+        r'])+'
+    )
--- a/third_party/text_processing/normalization/num.py
+++ b/third_party/text_processing/normalization/num.py
@ -0,0 +1,154 @@
+"""
+Rules to verbalize numbers into Chinese characters.
+https://zh.wikipedia.org/wiki/中文数字#現代中文
+"""
+import re
+from typing import List
+from collections import OrderedDict
+
+DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
+UNITS = OrderedDict({
+    1: '十',
+    2: '百',
+    3: '千',
+    4: '万',
+    8: '亿',
+})
+
+# 分数表达式
+RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
+def replace_frac(match: re.Match) -> str:
+    sign = match.group(1)
+    nominator = match.group(2)
+    denominator = match.group(3)
+    sign: str = "负" if sign else ""
+    nominator: str = num2str(nominator)
+    denominator: str = num2str(denominator)
+    result = f"{sign}{denominator}分之{nominator}"
+    return result
+    
+
+# 百分数表达式
+RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
+def replace_percentage(match: re.Match) -> str:
+    sign = match.group(1)
+    percent = match.group(2)
+    sign: str = "负" if sign else ""
+    percent: str = num2str(percent)
+    result = f"{sign}百分之{percent}"
+    return result
+
+# 整数表达式
+# 带负号或者不带负号的整数 12, -10
+RE_INTEGER = re.compile(
+    r'(-?)'
+    r'(\d+)'
+)
+
+# 编号-无符号整形
+# 00078
+RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')
+def replace_default_num(match: re.Match):
+    number = match.group(0)
+    return verbalize_digit(number)
+
+# 数字表达式
+# 1. 整数: -10, 10;
+# 2. 浮点数: 10.2, -0.3
+# 3. 不带符号和整数部分的纯浮点数: .22, .38   
+RE_NUMBER = re.compile(
+    r'(-?)((\d+)(\.\d+)?)'
+    r'|(\.(\d+))'
+)
+def replace_number(match: re.Match) -> str:
+    sign = match.group(1)
+    number = match.group(2)
+    pure_decimal = match.group(5)
+    if pure_decimal:
+        result = num2str(pure_decimal)
+    else:
+        sign: str = "负" if sign else ""
+        number: str = num2str(number)
+        result = f"{sign}{number}"
+    return result
+
+# 范围表达式
+# 12-23, 12~23
+RE_RANGE = re.compile(
+    r'(\d+)[-~](\d+)'
+)
+def replace_range(match: re.Match) -> str:
+    first, second = match.group(1), match.group(2)
+    first: str = num2str(first)
+    second: str = num2str(second)
+    result = f"{first}到{second}"
+    return result
+
+
+def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
+    """Recursively turn a digit string into a list of digit and unit symbols.
+
+    Parameters
+    ----------
+    value_string : str
+        Digit string; it may carry leading zeros.
+    use_zero : bool
+        When true, a single non-zero digit that had leading zeros is preceded
+        by the symbol for zero. The recursive calls below do not forward this
+        argument, so they always use the default.
+
+    Returns
+    -------
+    List[str]
+        The symbols in reading order; empty when the string is all zeros
+        (or empty).
+    """
+    stripped = value_string.lstrip('0')
+    if len(stripped) == 0:
+        return []
+    elif len(stripped) == 1:
+        if use_zero and len(stripped) < len(value_string):
+            return [DIGITS['0'], DIGITS[stripped]]
+        else:
+            return [DIGITS[stripped]]
+    else:
+        # Largest unit whose power of ten is smaller than the number of
+        # significant digits.
+        largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped))
+        # The split is taken from the unstripped string, so leading zeros of
+        # the high part are passed on to the recursive call.
+        first_part = value_string[:-largest_unit]
+        second_part = value_string[-largest_unit:]
+        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)
+
+def verbalize_cardinal(value_string: str) -> str:
+    if not value_string:
+        return ''
+    
+    # 000 -> '零' , 0 -> '零'
+    value_string = value_string.lstrip('0')
+    if len(value_string) == 0:
+        return DIGITS['0']
+    
+    result_symbols = _get_value(value_string)
+    # verbalized number starting with '一十*' is abbreviated as `十*`
+    if len(result_symbols) >= 2 and result_symbols[0] == DIGITS['1'] and result_symbols[1] == UNITS[1]:
+        result_symbols = result_symbols[1:]
+    return ''.join(result_symbols)
+
+def verbalize_digit(value_string: str, alt_one=False) -> str:
+    result_symbols = [DIGITS[digit] for digit in value_string]
+    result = ''.join(result_symbols)
+    if alt_one:
+        result.replace("一", "幺")
+    return result
+
+def num2str(value_string: str) -> str:
+    integer_decimal = value_string.split('.')
+    if len(integer_decimal) == 1:
+        integer = integer_decimal[0]
+        decimal = ''
+    elif len(integer_decimal) == 2:
+        integer, decimal = integer_decimal
+    else:
+        raise ValueError(f"The value string: '${value_string}' has more than one point in it.")
+    
+    result = verbalize_cardinal(integer)
+
+    decimal = decimal.rstrip('0')
+    if decimal:
+        # '.22' is verbalized as '点二二'
+        # '3.20' is verbalized as '三点二
+        result += '点' + verbalize_digit(decimal)
+    return result
+
+
+
+
+
+
+
+
+
+
+
+
--- a/third_party/text_processing/normalization/phone.py
+++ b/third_party/text_processing/normalization/phone.py
@ -0,0 +1,30 @@
+import re
+from .num import verbalize_digit
+
+# 规范化固话/手机号码
+# 手机
+# http://www.jihaoba.com/news/show/13680
+# 移动：139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
+# 联通：130、131、132、156、155、186、185、176
+# 电信：133、153、189、180、181、177
+RE_MOBILE_PHONE= re.compile(
+    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
+RE_TELEPHONE = re.compile(
+    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
+
+
+def phone2str(phone_string: str, mobile=True) -> str:
+    if mobile:
+        sp_parts = phone_string.strip('+').split()
+        result = ''.join(
+            [verbalize_digit(part, alt_one=True) for part in sp_parts])
+        return result
+    else:
+        sil_parts = phone_string.split('-')
+        result = ''.join(
+            [verbalize_digit(part, alt_one=True) for part in sil_parts])
+        return result
+
+
+def replace_phone(match: re.Match) -> str:
+    return phone2str(match.group(0))
--- a/third_party/text_processing/normalization/quantifier.py
+++ b/third_party/text_processing/normalization/quantifier.py
@ -0,0 +1,17 @@
+import re
+from .num import num2str
+
+# 温度表达式，温度会影响负号的读法
+# -3°C 零下三度
+RE_TEMPERATURE = re.compile(
+    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
+)
+def replace_temperature(match: re.Match) -> str:
+    sign = match.group(1)
+    temperature = match.group(2)
+    unit = match.group(3)
+    sign: str = "零下" if sign else ""
+    temperature: str = num2str(temperature)
+    unit: str = "摄氏度" if unit == "摄氏度" else "度"
+    result = f"{sign}{temperature}{unit}"
+    return result
--- a/third_party/text_processing/normalization/sentence_split.py
+++ b/third_party/text_processing/normalization/sentence_split.py
@ -0,0 +1,22 @@
+import re
+from typing import List
+
+SENTENCE_SPLITOR = re.compile(r'([。！？][”’]?)')
+
+def split(text: str) -> List[str]:
+    """Split long text into sentences with sentence-splitting punctuations.
+
+    Parameters
+    ----------
+    text : str
+        The input text.
+
+    Returns
+    -------
+    List[str]
+        Sentences.
+    """
+    text = SENTENCE_SPLITOR.sub(r'\1\n', text)
+    text = text.strip()
+    sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
+    return sentences