parent
c65ea55bb1
commit
7779f33e74
@ -0,0 +1,3 @@
|
|||||||
|
# Regular expression based text normalization for Chinese
|
||||||
|
|
||||||
|
For simplicity and ease of implementation, text normalization is basically done by rules and dictionaries. Here's an example.
|
@ -0,0 +1,14 @@
|
|||||||
|
import argparse
|
||||||
|
from text_processing import normalization
|
||||||
|
|
||||||
|
# Command-line driver: read sentences line by line from the input file,
# normalize each one, and write the results one per line to the output file.
parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

# The default encoding of open() depends on the locale. State UTF-8 explicitly
# so Chinese text is read and written correctly even under a C/POSIX locale.
with open(args.input, 'rt', encoding='utf-8') as fin, \
        open(args.output, 'wt', encoding='utf-8') as fout:
    for sent in fin:
        # Strip the trailing newline (and surrounding whitespace) before
        # normalizing, then terminate every output line with a single '\n'.
        sent = normalization.normalize_sentence(sent.strip())
        fout.write(sent)
        fout.write('\n')
|
@ -0,0 +1,8 @@
|
|||||||
|
# Project root: two directories above the current working directory.
export MAIN_ROOT=${PWD}/../../

# Make the project root and its utils directory available on PATH.
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# NOTE: no trailing '#' here. A '#' glued to the end of a word is not a
# comment in the shell; it would become part of the last PYTHONPATH entry.
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
|
@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash

# Load MAIN_ROOT, PATH and PYTHONPATH settings from the current directory.
source path.sh

# A numbered stage runs only when stage <= N <= stop_stage.
stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

# NOTE(review): parse_options.sh is a project utility, presumably allowing the
# variables above to be overridden from the command line; confirm.
# Exit with 1 rather than -1: exit statuses outside 0-255 are not portable.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

mkdir -p "${exp_dir}"

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Processing "
    # Quote the paths so directories containing spaces are passed intact.
    python3 local/test_normalization.py "${data_dir}/${filename}" "${exp_dir}/normalized.txt"
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text save at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0
|
@ -0,0 +1,42 @@
|
|||||||
|
from .sentence_split import split
|
||||||
|
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM
|
||||||
|
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num
|
||||||
|
|
||||||
|
from .chronology import RE_TIME, RE_DATE, RE_DATE2
|
||||||
|
from .chronology import replace_time, replace_date, replace_date2
|
||||||
|
|
||||||
|
from .quantifier import RE_TEMPERATURE
|
||||||
|
from .quantifier import replace_temperature
|
||||||
|
|
||||||
|
from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone
|
||||||
|
|
||||||
|
from .char_convert import tranditional_to_simplified
|
||||||
|
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_sentence(sentence):
    """Normalize a single sentence.

    The sentence is first converted with ``tranditional_to_simplified`` and
    the three ``F2H_*`` translation tables (presumably full-width to
    half-width mappings; see ``constants``). Then each regex rule below is
    applied in sequence, every rule operating on the output of the previous
    one.

    Parameters
    ----------
    sentence : str
        The sentence to normalize.

    Returns
    -------
    str
        The normalized sentence.
    """
    # basic character conversions
    text = tranditional_to_simplified(sentence)
    for table in (F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE):
        text = text.translate(table)

    # number related NSW verbalization; the order is significant because the
    # generic number rules at the end would otherwise consume the digits that
    # the more specific rules need to see.
    rules = (
        (RE_DATE, replace_date),
        (RE_DATE2, replace_date2),
        (RE_TIME, replace_time),
        (RE_TEMPERATURE, replace_temperature),
        (RE_RANGE, replace_range),
        (RE_FRAC, replace_frac),
        (RE_PERCENTAGE, replace_percentage),
        (RE_MOBILE_PHONE, replace_phone),
        (RE_TELEPHONE, replace_phone),
        (RE_DEFAULT_NUM, replace_default_num),
        (RE_NUMBER, replace_number),
    )
    for pattern, replacer in rules:
        text = pattern.sub(replacer, text)

    return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text):
    """Split ``text`` into sentences and normalize each of them.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    list
        One normalized string per sentence returned by ``split``.
    """
    return list(map(normalize_sentence, split(text)))
|
@ -0,0 +1,14 @@
|
|||||||
|
"""Traditional and simplified Chinese conversion with
|
||||||
|
`opencc <https://github.com/BYVoid/OpenCC>`_.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import opencc
|
||||||
|
|
||||||
|
# Module-level OpenCC converters, created once at import time and shared by
# the two conversion functions of this module.
_t2s_converter = opencc.OpenCC('t2s.json')
_s2t_converter = opencc.OpenCC('s2t.json')
|
||||||
|
|
||||||
|
def tranditional_to_simplified(text: str) -> str:
    """Convert ``text`` with the module-level ``t2s.json`` OpenCC converter."""
    converted = _t2s_converter.convert(text)
    return converted
|
||||||
|
|
||||||
|
def simplified_to_traditional(text: str) -> str:
    """Convert ``text`` with the module-level ``s2t.json`` OpenCC converter."""
    converted = _s2t_converter.convert(text)
    return converted
|
@ -0,0 +1,63 @@
|
|||||||
|
import re
|
||||||
|
from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS
|
||||||
|
|
||||||
|
def _time_num2str(num_string: str) -> str:
    """Verbalize a minute/second field, keeping one leading zero.

    The field is verbalized without its leading zeros; when the original
    string started with '0', the character for zero is put in front.
    """
    verbalized = num2str(num_string.lstrip('0'))
    prefix = DIGITS['0'] if num_string.startswith('0') else ''
    return prefix + verbalized
|
||||||
|
|
||||||
|
# Time-of-day expression: H:MM with an optional :SS part
RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
                     r':([0-5][0-9])'
                     r'(:([0-5][0-9]))?')


def replace_time(match: re.Match) -> str:
    """Verbalize a ``RE_TIME`` match as hour / minute / second.

    The hour is always emitted. Minute and second are emitted only when
    they contain a non-zero digit.
    """
    # groups: hour, minute, optional ':SS' wrapper (unused), second
    hour, minute, _, second = match.groups()

    pieces = [f"{num2str(hour)}点"]
    if minute.lstrip('0'):
        pieces.append(f"{_time_num2str(minute)}分")
    if second and second.lstrip('0'):
        pieces.append(f"{_time_num2str(second)}秒")
    return ''.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
# Date written with 年 / 月 / 日(号) markers; month and day are optional.
RE_DATE = re.compile(r'(\d{4}|\d{2})年'
                     r'((0?[1-9]|1[0-2])月)?'
                     r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')


def replace_date(match: re.Match) -> str:
    """Verbalize a ``RE_DATE`` match.

    The year is read digit by digit, month and day as cardinal numbers. The
    day keeps whichever suffix character (日 or 号) appeared in the input.
    """
    # Group 1: year, 3: month number, 5: day number, 9: day suffix.
    year, month, day, day_suffix = match.group(1, 3, 5, 9)

    pieces = []
    if year:
        pieces.append(f"{verbalize_digit(year)}年")
    if month:
        pieces.append(f"{verbalize_cardinal(month)}月")
    if day:
        pieces.append(f"{verbalize_cardinal(day)}{day_suffix}")
    return "".join(pieces)
|
||||||
|
|
||||||
|
# Dates of the form YYYY/MM/DD or YYYY-MM-DD (the pattern also accepts a space
# or '.' as separator; the same separator must be used in both positions)
RE_DATE2 = re.compile(r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')


def replace_date2(match: re.Match) -> str:
    """Verbalize a ``RE_DATE2`` match as year (digit by digit), month, day."""
    # Group 2 is the separator and is not needed for the output.
    year, month, day = match.group(1, 3, 4)

    pieces = []
    if year:
        pieces.append(f"{verbalize_digit(year)}年")
    if month:
        pieces.append(f"{verbalize_cardinal(month)}月")
    if day:
        pieces.append(f"{verbalize_cardinal(day)}日")
    return "".join(pieces)
|
@ -0,0 +1,154 @@
|
|||||||
|
"""
|
||||||
|
Rules to verbalize numbers into Chinese characters.
|
||||||
|
https://zh.wikipedia.org/wiki/中文数字#現代中文
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
# Chinese numeral for each decimal digit, keyed by the ASCII digit character.
DIGITS = dict(zip('0123456789', '零一二三四五六七八九'))

# Unit characters keyed by the power of ten they stand for, in ascending order.
UNITS = OrderedDict([
    (1, '十'),
    (2, '百'),
    (3, '千'),
    (4, '万'),
    (8, '亿'),
])
|
||||||
|
|
||||||
|
# Fraction expression, e.g. 1/3 or -2/5
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')


def replace_frac(match: re.Match) -> str:
    """Verbalize a ``RE_FRAC`` match; the denominator is read first."""
    minus, numerator, denominator = match.groups()
    prefix = "负" if minus else ""
    numerator_text = num2str(numerator)
    denominator_text = num2str(denominator)
    return f"{prefix}{denominator_text}分之{numerator_text}"
|
||||||
|
|
||||||
|
|
||||||
|
# Percentage expression, e.g. 15% or -3.5%
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')


def replace_percentage(match: re.Match) -> str:
    """Verbalize a ``RE_PERCENTAGE`` match as an optional 负 plus 百分之 and the number."""
    # Group 3 (the fractional part) is already contained in group 2.
    minus, percent = match.group(1, 2)
    prefix = "负" if minus else ""
    return f"{prefix}百分之{num2str(percent)}"
|
||||||
|
|
||||||
|
# Integer expression
# An integer with or without a minus sign: 12, -10
RE_INTEGER = re.compile(r'(-?)' r'(\d+)')
|
||||||
|
|
||||||
|
# Serial-number style unsigned integer (four or more digits), read digit by digit
# e.g. 00078
RE_DEFAULT_NUM = re.compile(r'\d{4}\d*')


def replace_default_num(match: re.Match):
    """Verbalize the whole ``RE_DEFAULT_NUM`` match digit by digit."""
    return verbalize_digit(match.group(0))
|
||||||
|
|
||||||
|
# Number expression
# 1. integers: -10, 10;
# 2. floating point numbers: 10.2, -0.3
# 3. pure decimals without sign and integer part: .22, .38
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)'
                       r'|(\.(\d+))')


def replace_number(match: re.Match) -> str:
    """Verbalize a ``RE_NUMBER`` match.

    A pure decimal (group 5, e.g. ``.22``) is passed to ``num2str`` as is.
    Otherwise the number (group 2) is verbalized and prefixed with 负 when
    a minus sign (group 1) was matched.
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        return num2str(pure_decimal)

    prefix = "负" if match.group(1) else ""
    return f"{prefix}{num2str(match.group(2))}"
|
||||||
|
|
||||||
|
# Range expression
# e.g. 12-23, 12~23
RE_RANGE = re.compile(r'(\d+)[-~](\d+)')


def replace_range(match: re.Match) -> str:
    """Verbalize a ``RE_RANGE`` match as the two numbers joined by 到."""
    low, high = match.groups()
    low_text = num2str(low)
    high_text = num2str(high)
    return f"{low_text}到{high_text}"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
    """Recursively turn a digit string into numeral and unit symbols.

    Parameters
    ----------
    value_string : str
        A string of decimal digits; it may carry leading zeros.
    use_zero : bool
        Whether a single remaining digit preceded by zeros is emitted with
        the character for zero in front of it.

    Returns
    -------
    List[str]
        The symbols in reading order; empty when the string holds no
        non-zero digit.
    """
    significant = value_string.lstrip('0')
    n_digits = len(significant)

    if n_digits == 0:
        return []

    if n_digits == 1:
        had_leading_zero = n_digits < len(value_string)
        if use_zero and had_leading_zero:
            return [DIGITS['0'], DIGITS[significant]]
        return [DIGITS[significant]]

    # Largest unit whose power is smaller than the number of significant digits.
    largest_unit = next(
        power for power in reversed(UNITS.keys()) if power < n_digits)
    # Slice the unstripped string on purpose: the zeros that end up at the
    # front of either part let the recursion emit the zero character.
    high_part = value_string[:-largest_unit]
    low_part = value_string[-largest_unit:]
    return _get_value(high_part) + [UNITS[largest_unit]] + _get_value(low_part)
|
||||||
|
|
||||||
|
def verbalize_cardinal(value_string: str) -> str:
    """Verbalize a digit string as a cardinal number.

    An empty string gives an empty string; a string made only of zeros
    gives the character for zero.
    """
    if not value_string:
        return ''

    # 000 -> '零' , 0 -> '零'
    digits = value_string.lstrip('0')
    if not digits:
        return DIGITS['0']

    symbols = _get_value(digits)
    # verbalized number starting with '一十*' is abbreviated as `十*`
    if symbols[:2] == [DIGITS['1'], UNITS[1]]:
        symbols = symbols[1:]
    return ''.join(symbols)
|
||||||
|
|
||||||
|
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Verbalize a digit string one digit at a time.

    Parameters
    ----------
    value_string : str
        A string consisting of decimal digits only.
    alt_one : bool
        When true, every "一" in the output is replaced by "幺".

    Returns
    -------
    str
        The concatenated numerals.

    Raises
    ------
    KeyError
        If ``value_string`` contains a character that is not in ``DIGITS``.
    """
    result = ''.join(DIGITS[digit] for digit in value_string)
    if alt_one:
        # str.replace returns a new string; the result has to be re-bound,
        # otherwise the substitution is silently lost.
        result = result.replace("一", "幺")
    return result
|
||||||
|
|
||||||
|
def num2str(value_string: str) -> str:
    """Verbalize an unsigned decimal number given as a string.

    Parameters
    ----------
    value_string : str
        Digits with at most one decimal point, e.g. ``'12'``, ``'3.20'``
        or ``'.22'``.

    Returns
    -------
    str
        The integer part read as a cardinal number, followed by 点 and the
        decimal digits (trailing zeros dropped) when any remain.

    Raises
    ------
    ValueError
        If ``value_string`` contains more than one decimal point.
    """
    integer_decimal = value_string.split('.')
    if len(integer_decimal) > 2:
        # The message previously contained a stray '$' in front of the value.
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it.")

    integer = integer_decimal[0]
    decimal = integer_decimal[1] if len(integer_decimal) == 2 else ''

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip('0')
    if decimal:
        # '.22' is verbalized as '点二二'
        # '3.20' is verbalized as '三点二'
        result += '点' + verbalize_digit(decimal)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,17 @@
|
|||||||
|
import re
|
||||||
|
from .num import num2str
|
||||||
|
|
||||||
|
# Temperature expression; a temperature changes how the minus sign is read:
# -3°C -> 零下三度
RE_TEMPERATURE = re.compile(
    r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)'
)


def replace_temperature(match: re.Match) -> str:
    """Verbalize a ``RE_TEMPERATURE`` match.

    A minus sign is read as 零下. The unit is kept as 摄氏度 when the input
    spelled it that way; every other accepted unit is read as 度.
    """
    sign = match.group(1)
    temperature = match.group(2)
    # Group 3 is the optional fractional part nested inside group 2; the
    # unit is group 4. Reading group 3 here made the 摄氏度 branch unreachable.
    unit = match.group(4)
    sign: str = "零下" if sign else ""
    temperature: str = num2str(temperature)
    unit: str = "摄氏度" if unit == "摄氏度" else "度"
    result = f"{sign}{temperature}{unit}"
    return result
|
@ -0,0 +1,22 @@
|
|||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# A sentence-ending punctuation mark, optionally followed by a closing quote.
SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)')


def split(text: str) -> List[str]:
    """Break a text into sentences at sentence-ending punctuation.

    A newline is inserted after every match of ``SENTENCE_SPLITOR`` and the
    text is then cut at runs of newlines, so newlines already present in the
    input also separate sentences.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    List[str]
        The sentences, each stripped of surrounding whitespace.
    """
    marked = SENTENCE_SPLITOR.sub(r'\1\n', text).strip()
    return [piece.strip() for piece in re.split(r'\n+', marked)]
|
Loading…
Reference in new issue