You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
43 lines
1.6 KiB
43 lines
1.6 KiB
from .sentence_split import split
|
|
from .num import RE_NUMBER, RE_FRAC, RE_PERCENTAGE, RE_RANGE, RE_INTEGER, RE_DEFAULT_NUM
|
|
from .num import replace_number, replace_frac, replace_percentage, replace_range, replace_default_num
|
|
|
|
from .chronology import RE_TIME, RE_DATE, RE_DATE2
|
|
from .chronology import replace_time, replace_date, replace_date2
|
|
|
|
from .quantifier import RE_TEMPERATURE
|
|
from .quantifier import replace_temperature
|
|
|
|
from .phone import RE_MOBILE_PHONE, RE_TELEPHONE, replace_phone
|
|
|
|
from .char_convert import tranditional_to_simplified
|
|
from .constants import F2H_ASCII_LETTERS, F2H_DIGITS, F2H_SPACE
|
|
|
|
|
|
def normalize_sentence(sentence):
|
|
# basic character conversions
|
|
sentence = tranditional_to_simplified(sentence)
|
|
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
|
|
F2H_DIGITS).translate(F2H_SPACE)
|
|
|
|
# number related NSW verbalization
|
|
sentence = RE_DATE.sub(replace_date, sentence)
|
|
sentence = RE_DATE2.sub(replace_date2, sentence)
|
|
sentence = RE_TIME.sub(replace_time, sentence)
|
|
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
|
|
sentence = RE_RANGE.sub(replace_range, sentence)
|
|
sentence = RE_FRAC.sub(replace_frac, sentence)
|
|
sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
|
|
sentence = RE_MOBILE_PHONE.sub(replace_phone, sentence)
|
|
sentence = RE_TELEPHONE.sub(replace_phone, sentence)
|
|
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
|
|
sentence = RE_NUMBER.sub(replace_number, sentence)
|
|
|
|
return sentence
|
|
|
|
|
|
def normalize(text):
|
|
sentences = split(text)
|
|
sentences = [normalize_sentence(sent) for sent in sentences]
|
|
return sentences
|