remove useless third lib

pull/960/head
Hui Zhang 3 years ago
parent aba37810ff
commit 3f3442b98a

@ -1,2 +0,0 @@
data
exp

@ -1,3 +0,0 @@
# G2P
* zh - Chinese G2P

@ -1,93 +0,0 @@
# G2P
* WS: jieba
* G2P: pypinyin
* Tone sandhi: simple
We recommend using [Parakeet](https://github.com/PaddlePaddle/Parakeet)'s [TextFrontEnd](https://github.com/PaddlePaddle/Parakeet/blob/develop/parakeet/frontend/__init__.py) to do G2P.
The phoneme set should be changed; you can refer to `examples/thchs30/a0/data/dict/syllable.lexicon`.
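For reference, a minimal sketch of the jieba + pypinyin pipeline listed above (it mirrors the extraction script in `local/`; the sample sentence is utterance 000001 of the Baker data with prosody marks removed):
```
import jieba
from pypinyin import Style, lazy_pinyin

text = "卡尔普陪外孙玩滑梯"
words = jieba.lcut(text)  # WS: word segmentation
# G2P: pinyin with tone numbers; neutral tone is written as 5
syllables = lazy_pinyin(
    words, style=Style.TONE3, neutral_tone_with_five=True)
print(' '.join(syllables))  # ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
```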
## Download Baker dataset
The [Baker](https://test.data-baker.com/#/data/index/source) dataset has to be downloaded manually and moved to `./data`,
because you have to pass a CAPTCHA in a browser to download it.
## RUN
```
. path.sh
./run.sh
```
## Result
```
exp/
|-- 000001-010000.txt
|-- ref.pinyin
|-- trans.jieba.pinyin
`-- trans.pinyin
0 directories, 4 files
```
```
4f5a368441eb16aaf43dc1972f8b63dd exp/000001-010000.txt
01707896391c2de9b6fc4a39654be942 exp/ref.pinyin
43380ef160f65a23a3a0544700aa49b8 exp/trans.jieba.pinyin
8e6ff1fc22d8e8584082e804e8bcdeb7 exp/trans.pinyin
```
```
==> exp/000001-010000.txt <==
000001 卡尔普#2陪外孙#1玩滑梯#4。
ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 假语村言#2别再#1拥抱我#4。
jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 宝马#1配挂#1跛骡鞍#3貂蝉#1怨枕#2董翁榻#4。
bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 邓小平#2与#1撒切尔#2会晤#4。
deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 老虎#1幼崽#2与#1宠物犬#1玩耍#4。
lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
==> exp/ref.pinyin <==
000001 ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4
000005 lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu2 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan2 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi2 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.jieba.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
==> exp/trans.pinyin <==
000001 ka3 er3 pu3 pei2 wai4 sun1 wan2 hua2 ti1
000002 jia3 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3
000003 bao3 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4
000004 deng4 xiao3 ping2 yu3 sa1 qie4 er3 hui4 wu4
000005 lao3 hu3 you4 zai3 yu3 chong3 wu4 quan3 wan2 shua3
000006 shen1 chang2 yue1 wu3 chi3 er4 cun4 wu3 fen1 huo4 yi3 shang4
000007 zhao4 di2 yue1 cao2 yun2 teng2 qu4 gui3 wu1
000008 zhan3 pin3 sui1 you3 zhan3 yuan2 que4 tui2
000009 yi3 san3 ju1 er2 tong2 he2 you4 tuo1 er2 tong2 wei2 zhu3
000010 ke1 te4 ni1 shen1 chuan1 bao4 wen2 da4 yi1
```

@ -1,53 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
def extract_pinyin(source, target, use_jieba=False):
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                # even lines hold "<sentence_id> <raw_text>"; odd lines hold
                # the human pinyin annotation, which is not needed here
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    # strip prosody marks such as #1 ... #4
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id} {transcription}\n')
                else:
                    continue
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="extract baker pinyin labels")
parser.add_argument(
"input", type=str, help="source file of baker's prosody label file")
parser.add_argument(
"output", type=str, help="target file to write pinyin lables")
parser.add_argument(
"--use-jieba",
action='store_true',
help="use jieba for word segmentation.")
args = parser.parse_args()
extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)

@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def extract_pinyin_labels(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                # even lines hold "<sentence_id> <raw_text>"; the odd line
                # that follows holds the annotated pinyin transcription
                if i % 2 == 0:
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id} ')
                else:
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)

@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List
from typing import Union
def erized(syllable: str) -> bool:
"""Whether the syllable contains erhua effect.
Example
--------
huar -> True
guanr -> True
er -> False
"""
# note: for pinyin, len(syllable) >=2 is always true
# if not: there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
return syllable[:2] != "er" and syllable[-2] == 'r'
def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
"""
    Given a sequence of syllables from human annotation (reference),
    which makes sandhi explicit, and a sequence of syllables from some
    simple g2p program (generated), which does not consider sandhi,
    return the reference sequence with sandhi ignored.
Example
--------
['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
"""
i = 0
j = 0
# sandhi ignored in the result while other errors are not included
result = []
while i < len(reference):
if erized(reference[i]):
result.append(reference[i])
i += 1
j += 2
elif reference[i][:-1] == generated[i][:-1] and reference[i][
-1] == '2' and generated[i][-1] == '3':
result.append(generated[i])
i += 1
j += 1
else:
result.append(reference[i])
i += 1
j += 1
assert j == len(
generated
), "length of transcriptions mismatch, There may be some characters that are ignored in the generated transcription."
return result
def convert_transcriptions(reference: Union[str, Path],
generated: Union[str, Path],
output: Union[str, Path]):
with open(reference, 'rt') as f_ref:
with open(generated, 'rt') as f_gen:
with open(output, 'wt') as f_out:
for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
sentence_id, ref_transcription = ref.strip().split(' ', 1)
_, gen_transcription = gen.strip().split(' ', 1)
try:
result = ignore_sandhi(ref_transcription.split(),
gen_transcription.split())
result = ' '.join(result)
except Exception:
print(
f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
)
result = ref_transcription
f_out.write(f"{sentence_id} {result}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="reference transcription but ignore sandhi.")
parser.add_argument(
"--reference",
type=str,
help="path to the reference transcription of baker dataset.")
parser.add_argument(
"--generated", type=str, help="path to the generated transcription.")
parser.add_argument("--output", type=str, help="path to save result.")
args = parser.parse_args()
convert_transcriptions(args.reference, args.generated, args.output)

@ -1,33 +0,0 @@
#!/bin/bash
exp_dir="exp"
data_dir="data"
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
archive=${data_dir}/"BZNSYP.rar"
if [ ! -f ${archive} ]; then
echo "Baker Dataset not found! Download it first to the data_dir."
exit -1
fi
MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
if [ ${md5_result} != ${MD5} ]; then
echo "MD5 mismatch! The Archive has been changed."
exit -1
fi
label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e ${archive} ${label_file}
cp ${filename} ${exp_dir}
rm -f ${filename}
if [ ! -f ${exp_dir}/${filename} ]; then
    echo "File extraction failed!"
    exit 1
fi
exit 0

@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

@ -1,37 +0,0 @@
#!/usr/bin/env bash
source path.sh
stage=-1
stop_stage=100
exp_dir=exp
data=data
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
mkdir -p ${exp_dir}
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
mkdir -p ${data}
test -e ${data}/BZNSYP.rar || wget -c https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar -P ${data}
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "stage 0: Extracting Prosody Labeling"
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi
# convert Chinese transcriptions into pinyin with pypinyin or jieba + pypinyin
filename="000001-010000.txt"
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
echo "stage 1: Processing transcriptions..."
python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin
    python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi
echo "done"
exit 0

@ -1,36 +0,0 @@
# Regular expression based text normalization for Chinese
For simplicity and ease of implementation, text normalization is basically done with rules and dictionaries. A toy illustration follows; the sections below then show a runnable example.
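A hypothetical digit-verbalization rule (not the module's actual rule set, which lives in `text_processing.normalization`) could look like:
```
import re

DIGITS = "零一二三四五六七八九"

def spell_digits(match):
    # read a digit string aloud digit by digit, e.g. "2020" -> "二零二零"
    return ''.join(DIGITS[int(d)] for d in match.group())

print(re.sub(r"\d+", spell_digits, "事情发生在2020年"))  # 事情发生在二零二零年
```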
## Run
```
. path.sh
bash run.sh
```
## Results
```
exp/
`-- normalized.txt
0 directories, 1 file
```
```
aff31f8aa08e2a7360228c9ce5886b98 exp/normalized.txt
```
```
今天的最低气温达到零下十度.
只要有四分之三十三的人同意,就可以通过决议。
一九四五年五月二日,苏联士兵在德国国会大厦上升起了胜利旗,象征着攻占柏林并战胜了纳粹德国。
四月十六日,清晨的战斗以炮击揭幕,数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地,炮击持续了数天之久。
如果剩下的百分之三十点六是过去,那么还有百分之六十九点四.
事情发生在二零二零年三月三十一日的上午八点.
警方正在找一支点二二口径的手枪。
欢迎致电中国联通,北京二零二二年冬奥会官方合作伙伴为您服务
充值缴费请按一,查询话费及余量请按二,跳过本次提醒请按井号键。
快速解除流量封顶请按星号键腾讯王卡产品介绍、使用说明、特权及活动请按九查询话费、套餐余量、积分及活动返款请按一手机上网流量开通及取消请按二查询本机号码及本号所使用套餐请按四密码修改及重置请按五紧急开机请按六挂失请按七查询充值记录请按八其它自助服务及人工服务请按零
```

@ -1,26 +0,0 @@
今天的最低气温达到-10°C.
只要有33/4的人同意，就可以通过决议。
1945年5月2日，苏联士兵在德国国会大厦上升起了胜利旗，象征着攻占柏林并战胜了纳粹德国。
4月16日，清晨的战斗以炮击揭幕，数以千计的大炮和喀秋莎火箭炮开始炮轰德军阵地，炮击持续了数天之久。
如果剩下的30.6%是过去，那么还有69.4%.
事情发生在2020/03/31的上午8:00.
警方正在找一支.22口径的手枪。
欢迎致电中国联通，北京2022年冬奥会官方合作伙伴为您服务
充值缴费请按1，查询话费及余量请按2，跳过本次提醒请按井号键。
快速解除流量封顶请按星号键腾讯王卡产品介绍、使用说明、特权及活动请按9查询话费、套餐余量、积分及活动返款请按1手机上网流量开通及取消请按2查询本机号码及本号所使用套餐请按4密码修改及重置请按5紧急开机请按6挂失请按7查询充值记录请按8其它自助服务及人工服务请按0
智能客服助理快速查话费、查流量请按9了解北京联通业务请按1宽带IPTV新装、查询请按2障碍报修请按3充值缴费请按4投诉建议请按5政企业务请按7人工服务请按0for english severice press star key
您的帐户当前可用余额为63.89元本月消费为2.17元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
您的帐户当前可用余额为负15.5元本月消费为59.6元。您的消费、套餐余量和其它信息将以短信形式下发,请您注意查收。谢谢使用,再见!。
尊敬的客户您目前的话费余额为负14.60元已低于10元为保证您的通信畅通请及时缴纳费用。
您的流量已用完,为避免您产生额外费用,建议您根据需求开通一个流量包以作补充。
您可以直接说,查询话费及余量、开通流量包、缴费,您也可以说出其它需求,请问有什么可以帮您?
您的账户当前可用余额为负36.00元本月消费36.00元。
请问你是电话13985608526的机主吗
如您对处理结果不满意可拨打中国联通集团投诉电话10015进行投诉按本地通话费收费返回自助服务请按井号键
“26314”号VIP客服代表为您服务。
尊敬的5G用户欢迎您致电中国联通
首先是应用了M1芯片的iPad Pro新款的iPad Pro支持5G这也是苹果的第二款5G产品线。
除此之外,摄像头方面再次升级,增加了前摄全新超广角摄像头,支持人物居中功能,搭配超广角可实现视频中始终让人物居中效果。
屏幕方面iPad Pro 12.9版本支持XDR体验的Mini-LEDS显示屏支持HDR10、杜比视界还支持杜比全景声。
iPad Pro的秒控键盘这次也推出白色版本。
售价方面11英寸版本售价799美元起12.9英寸售价1099美元起。

@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from text_processing import normalization
parser = argparse.ArgumentParser(
description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()
with open(args.input, 'rt') as fin:
with open(args.output, 'wt') as fout:
for sent in fin:
sent = normalization.normalize_sentence(sent.strip())
fout.write(sent)
fout.write('\n')

@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}

@ -1,26 +0,0 @@
#!/usr/bin/env bash
source path.sh
stage=-1
stop_stage=100
exp_dir=exp
data_dir=data
filename="sentences.txt"
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
mkdir -p ${exp_dir}
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
echo "stage 1: Processing "
python3 local/test_normalization.py ${data_dir}/${filename} ${exp_dir}/normalized.txt
if [ -f "${exp_dir}/normalized.txt" ]; then
echo "Normalized text save at ${exp_dir}/normalized.txt"
fi
# TODO(chenfeiyu): compute edit distance against ground-truth
fi
echo "done"
exit 0

@ -22,3 +22,7 @@ licence: MIT
* [phkit](https://github.com/KuangDD/phkit.git)
commit: b2100293c1e36da531d7f30bd52c9b955a649522
licence: None
* [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
licence: MIT

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2020 SpeechIO
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -1,112 +0,0 @@
# Chinese Text Normalization for Speech Processing
## Problem
Search for "Text Normalization" (TN) on Google and GitHub, and you can hardly find open-source projects that are "ready-to-use" for text normalization tasks. Instead, you find a bunch of NLP toolkits or frameworks that *support* TN functionality. There is quite a lot of work between "supporting text normalization" and "doing text normalization".
## Reason
* TN is language-dependent, more or less.
Some TN processing methods are shared across languages, but a good TN module always involves language-specific knowledge and treatments.
* TN is task-specific.
Even for the same language, different applications require quite different TN.
* TN is "dirty"
Constructing and maintaining a set of TN rewrite-rules is painful, whatever toolkits and frameworks you choose. Subtle and intrinsic complexities hide inside TN task itself, not in tools or frameworks.
* a mature TN module is an asset
Since constructing and maintaining TN is hard, it is actually an asset for commercial companies, so you are unlikely to find a product-level TN module in the open-source community (correct me if you find any).
* TN is a less important topic for both academia and industry.
## Goal
This project sets up a ready-to-use TN module for **Chinese**. Since my background is **speech processing**, this project should be able to handle most common TN tasks in **Chinese ASR** text processing pipelines.
## Normalizers
1. supported NSW (Non-Standard-Word) Normalization
|NSW type|raw|normalized|
|-|-|-|
|cardinal|这块黄金重达324.75克|这块黄金重达三百二十四点七五克|
|date|她出生于86年8月18日，她弟弟出生于1995年3月1日|她出生于八六年八月十八日 她弟弟出生于一九九五年三月一日|
|digit|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|money|随便来几个价格12块5，34.5元，20.1万|随便来几个价格十二块五 三十四点五元 二十点一万|
|percentage|明天有62％的概率降雨|明天有百分之六十二的概率降雨|
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
Acknowledgement: the NSW normalization code is based on [Zhiyang Zhou's work](https://github.com/Joee1995/chn_text_norm.git)
1. punctuation removal
For Chinese, it removes the punctuation list collected in the [Zhon](https://github.com/tsroten/zhon) project, containing
* non-stop puncs
```
'"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
```
* stop puncs
```
'!?。。'
```
For English, it removes Python's `string.punctuation` (see the sketch right after this list)
1. multilingual English word upper/lower case conversion
Since ASR/TTS lexicons usually unify English entries to uppercase or lowercase, the TN module should adapt to the lexicon accordingly.
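A minimal sketch of the punctuation-removal step described above (the character lists are abbreviated stand-ins here; `cn_tn.py` builds the full ones):
```
import string

cn_puncs = '、。「」'  # abbreviated stand-in for the Zhon-based lists
old_chars = cn_puncs + string.punctuation
# map every punctuation mark to a space, as cn_tn.py does
table = str.maketrans(old_chars, ' ' * len(old_chars))
print('今天早饭,吃了没?'.translate(table).strip())  # 今天早饭 吃了没
```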
## Supported text format
1. plain text, preferably one sentence per line (the most common case in ASR processing).
```
今天早饭吃了没
没吃回家吃去吧
...
```
Plain text is the default format.
2. Kaldi's transcription format
```
KALDI_KEY_UTT001 今天早饭吃了没
KALDI_KEY_UTT002 没吃回家吃去吧
...
```
TN will skip the first-column key and normalize the remaining transcription text.
Pass the `--has_key` option to switch to the Kaldi format.
_note: All input text should be UTF-8 encoded._
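In code, the key handling amounts to something like this sketch (mirroring the `--has_key` branch of `cn_tn.py`):
```
line = "KALDI_KEY_UTT001 今天早饭吃了没"
key, text = line.split(maxsplit=1)  # keep the key, normalize only the text
# here text would go through NSW normalization and punctuation removal
print(key + '\t' + text)
```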
## Run examples
* TN (python)
make sure you have **python3**; Python 2.X won't work correctly.
`sh run.sh` in `TN` dir, and compare raw text and normalized text.
* ITN (thrax)
make sure you have **thrax** installed, and that your PATH can find the thrax binaries.
`sh run.sh` in the `ITN` dir. Check the Makefile for grammar dependencies.
## Possible future work
Since TN is a typical "done is better than perfect" module in the context of ASR, and the current state is sufficient for my purposes, I probably won't update this repo frequently.
There are indeed some things that need to be improved:
* For TN, the NSW normalizers in the TN dir are based on regular expressions. I've found some unintended matches; those pattern regexps need to be refined for more precise TN coverage.
* For ITN, extend those thrax rewriting grammars to cover more scenarios.
* Furthermore, commercial systems nowadays have started to introduce RNN-like models into TN, and a mixed (rule-based & model-based) system is state-of-the-art. For more reading on this, look for Richard Sproat and Kyle Gorman's work at Google.
END

@ -1,794 +0,0 @@
#!/usr/bin/env python3
# coding=utf-8
# Authors:
# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
# 2019.9 Jiayu DU
#
# requirements:
# - python 3.X
# notes: python 2.X WILL fail or produce misleading results
import sys, os, argparse, codecs, string, re
# ================================================================================ #
# basic constant
# ================================================================================ #
CHINESE_DIGIS = u'零一二三四五六七八九'
BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
ZERO_ALT = u'〇'
ONE_ALT = u'幺'
TWO_ALTS = [u'两', u'兩']
POSITIVE = [u'正', u'正']
NEGATIVE = [u'负', u'負']
POINT = [u'点', u'點']
# PLUS = [u'加', u'加']
# SIL = [u'杠', u'槓']
# 中文数字系统类型
NUMBERING_TYPES = ['low', 'mid', 'high']
CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
'里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
'砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
'针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
'毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
'盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
'纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
# punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git)
CHINESE_PUNC_STOP = '!?。。'
CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
CHINESE_PUNC_OTHER = '·〈〉-'
CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER
# ================================================================================ #
# basic class
# ================================================================================ #
class ChineseChar(object):
"""
中文字符
每个字符对应简体和繁体,
    e.g. 简体 = '负', 繁体 = '負'
转换时可转换为简体或繁体
"""
def __init__(self, simplified, traditional):
self.simplified = simplified
self.traditional = traditional
#self.__repr__ = self.__str__
def __str__(self):
return self.simplified or self.traditional or None
def __repr__(self):
return self.__str__()
class ChineseNumberUnit(ChineseChar):
"""
中文数字/数位字符
每个字符除繁简体外还有一个额外的大写字符
    e.g. '陆' '陸'
"""
def __init__(self, power, simplified, traditional, big_s, big_t):
super(ChineseNumberUnit, self).__init__(simplified, traditional)
self.power = power
self.big_s = big_s
self.big_t = big_t
def __str__(self):
return '10^{}'.format(self.power)
@classmethod
def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
if small_unit:
return ChineseNumberUnit(power=index + 1,
simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
elif numbering_type == NUMBERING_TYPES[0]:
return ChineseNumberUnit(power=index + 8,
simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
elif numbering_type == NUMBERING_TYPES[1]:
return ChineseNumberUnit(power=(index + 2) * 4,
simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
elif numbering_type == NUMBERING_TYPES[2]:
return ChineseNumberUnit(power=pow(2, index + 3),
simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
else:
raise ValueError(
'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
class ChineseNumberDigit(ChineseChar):
"""
中文数字字符
"""
def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
super(ChineseNumberDigit, self).__init__(simplified, traditional)
self.value = value
self.big_s = big_s
self.big_t = big_t
self.alt_s = alt_s
self.alt_t = alt_t
def __str__(self):
return str(self.value)
@classmethod
def create(cls, i, v):
return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
class ChineseMath(ChineseChar):
"""
中文数位字符
"""
def __init__(self, simplified, traditional, symbol, expression=None):
super(ChineseMath, self).__init__(simplified, traditional)
self.symbol = symbol
self.expression = expression
self.big_s = simplified
self.big_t = traditional
CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
class NumberSystem(object):
"""
中文数字系统
"""
pass
class MathSymbol(object):
"""
    用于中文数字系统的数学符号 (繁/简体), e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
"""
def __init__(self, positive, negative, point):
self.positive = positive
self.negative = negative
self.point = point
def __iter__(self):
for v in self.__dict__.values():
yield v
# class OtherSymbol(object):
# """
# 其他符号
# """
#
# def __init__(self, sil):
# self.sil = sil
#
# def __iter__(self):
# for v in self.__dict__.values():
# yield v
# ================================================================================ #
# basic utils
# ================================================================================ #
def create_system(numbering_type=NUMBERING_TYPES[1]):
"""
    根据数字系统类型返回创建相应的数字系统，默认为 mid
    NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型
        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
返回对应的数字系统
"""
# chinese number units of '亿' and larger
all_larger_units = zip(
LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
larger_units = [CNU.create(i, v, numbering_type, False)
for i, v in enumerate(all_larger_units)]
# chinese number units of '十, 百, 千, 万'
all_smaller_units = zip(
SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
smaller_units = [CNU.create(i, v, small_unit=True)
for i, v in enumerate(all_smaller_units)]
# digis
chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
# symbols
positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
point_cn = CM(POINT[0], POINT[1], '.', lambda x,
y: float(str(x) + '.' + str(y)))
# sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
system = NumberSystem()
system.units = smaller_units + larger_units
system.digits = digits
system.math = MathSymbol(positive_cn, negative_cn, point_cn)
# system.symbols = OtherSymbol(sil_cn)
return system
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
def get_symbol(char, system):
for u in system.units:
if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
return u
for d in system.digits:
if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
return d
for m in system.math:
if char in [m.traditional, m.simplified]:
return m
def string2symbols(chinese_string, system):
int_string, dec_string = chinese_string, ''
for p in [system.math.point.simplified, system.math.point.traditional]:
if p in chinese_string:
int_string, dec_string = chinese_string.split(p)
break
return [get_symbol(c, system) for c in int_string], \
[get_symbol(c, system) for c in dec_string]
def correct_symbols(integer_symbols, system):
"""
一百八 to 一百八十
一亿一千三百万 to 一亿 一千万 三百万
"""
if integer_symbols and isinstance(integer_symbols[0], CNU):
if integer_symbols[0].power == 1:
integer_symbols = [system.digits[1]] + integer_symbols
if len(integer_symbols) > 1:
if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
integer_symbols.append(
CNU(integer_symbols[-2].power - 1, None, None, None, None))
result = []
unit_count = 0
for s in integer_symbols:
if isinstance(s, CND):
result.append(s)
unit_count = 0
elif isinstance(s, CNU):
current_unit = CNU(s.power, None, None, None, None)
unit_count += 1
if unit_count == 1:
result.append(current_unit)
elif unit_count > 1:
for i in range(len(result)):
if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
result[-i - 1] = CNU(result[-i - 1].power +
current_unit.power, None, None, None, None)
return result
def compute_value(integer_symbols):
"""
Compute the value.
When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
e.g. '两千万' = 2000 * 10000 not 2000 + 10000
"""
value = [0]
last_power = 0
for s in integer_symbols:
if isinstance(s, CND):
value[-1] = s.value
elif isinstance(s, CNU):
value[-1] *= pow(10, s.power)
if s.power > last_power:
value[:-1] = list(map(lambda v: v *
pow(10, s.power), value[:-1]))
last_power = s.power
value.append(0)
return sum(value)
system = create_system(numbering_type)
int_part, dec_part = string2symbols(chinese_string, system)
int_part = correct_symbols(int_part, system)
int_str = str(compute_value(int_part))
dec_str = ''.join([str(d.value) for d in dec_part])
if dec_part:
return '{0}.{1}'.format(int_str, dec_str)
else:
return int_str
def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
traditional=False, alt_zero=False, alt_one=False, alt_two=True,
use_zeros=True, use_units=True):
def get_value(value_string, use_zeros=True):
striped_string = value_string.lstrip('0')
# record nothing if all zeros
if not striped_string:
return []
# record one digits
elif len(striped_string) == 1:
if use_zeros and len(value_string) != len(striped_string):
return [system.digits[0], system.digits[int(striped_string)]]
else:
return [system.digits[int(striped_string)]]
# recursively record multiple digits
else:
result_unit = next(u for u in reversed(
system.units) if u.power < len(striped_string))
result_string = value_string[:-result_unit.power]
return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
system = create_system(numbering_type)
int_dec = number_string.split('.')
if len(int_dec) == 1:
int_string = int_dec[0]
dec_string = ""
elif len(int_dec) == 2:
int_string = int_dec[0]
dec_string = int_dec[1]
else:
raise ValueError(
"invalid input num string with more than one dot: {}".format(number_string))
if use_units and len(int_string) > 1:
result_symbols = get_value(int_string)
else:
result_symbols = [system.digits[int(c)] for c in int_string]
dec_symbols = [system.digits[int(c)] for c in dec_string]
if dec_string:
result_symbols += [system.math.point] + dec_symbols
if alt_two:
liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
system.digits[2].big_s, system.digits[2].big_t)
for i, v in enumerate(result_symbols):
if isinstance(v, CND) and v.value == 2:
next_symbol = result_symbols[i +
1] if i < len(result_symbols) - 1 else None
previous_symbol = result_symbols[i - 1] if i > 0 else None
if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
result_symbols[i] = liang
# if big is True, '两' will not be used and `alt_two` has no impact on output
if big:
attr_name = 'big_'
if traditional:
attr_name += 't'
else:
attr_name += 's'
else:
if traditional:
attr_name = 'traditional'
else:
attr_name = 'simplified'
result = ''.join([getattr(s, attr_name) for s in result_symbols])
# if not use_zeros:
# result = result.strip(getattr(system.digits[0], attr_name))
if alt_zero:
result = result.replace(
getattr(system.digits[0], attr_name), system.digits[0].alt_s)
if alt_one:
result = result.replace(
getattr(system.digits[1], attr_name), system.digits[1].alt_s)
for i, p in enumerate(POINT):
if result.startswith(p):
return CHINESE_DIGIS[0] + result
# ^10, 11, .., 19
if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
result = result[1:]
return result
# ================================================================================ #
# different types of rewriters
# ================================================================================ #
class Cardinal:
"""
CARDINAL类
"""
def __init__(self, cardinal=None, chntext=None):
self.cardinal = cardinal
self.chntext = chntext
def chntext2cardinal(self):
return chn2num(self.chntext)
def cardinal2chntext(self):
return num2chn(self.cardinal)
class Digit:
"""
DIGIT类
"""
def __init__(self, digit=None, chntext=None):
self.digit = digit
self.chntext = chntext
# def chntext2digit(self):
# return chn2num(self.chntext)
def digit2chntext(self):
return num2chn(self.digit, alt_two=False, use_units=False)
class TelePhone:
"""
TELEPHONE类
"""
def __init__(self, telephone=None, raw_chntext=None, chntext=None):
self.telephone = telephone
self.raw_chntext = raw_chntext
self.chntext = chntext
# def chntext2telephone(self):
# sil_parts = self.raw_chntext.split('<SIL>')
# self.telephone = '-'.join([
# str(chn2num(p)) for p in sil_parts
# ])
# return self.telephone
def telephone2chntext(self, fixed=False):
if fixed:
sil_parts = self.telephone.split('-')
self.raw_chntext = '<SIL>'.join([
num2chn(part, alt_two=False, use_units=False) for part in sil_parts
])
self.chntext = self.raw_chntext.replace('<SIL>', '')
else:
sp_parts = self.telephone.strip('+').split()
self.raw_chntext = '<SP>'.join([
num2chn(part, alt_two=False, use_units=False) for part in sp_parts
])
self.chntext = self.raw_chntext.replace('<SP>', '')
return self.chntext
class Fraction:
"""
FRACTION类
"""
def __init__(self, fraction=None, chntext=None):
self.fraction = fraction
self.chntext = chntext
def chntext2fraction(self):
denominator, numerator = self.chntext.split('分之')
return chn2num(numerator) + '/' + chn2num(denominator)
def fraction2chntext(self):
numerator, denominator = self.fraction.split('/')
return num2chn(denominator) + '分之' + num2chn(numerator)
class Date:
"""
DATE类
"""
def __init__(self, date=None, chntext=None):
self.date = date
self.chntext = chntext
# def chntext2date(self):
# chntext = self.chntext
# try:
# year, other = chntext.strip().split('年', maxsplit=1)
# year = Digit(chntext=year).digit2chntext() + '年'
# except ValueError:
# other = chntext
# year = ''
# if other:
# try:
# month, day = other.strip().split('月', maxsplit=1)
# month = Cardinal(chntext=month).chntext2cardinal() + '月'
# except ValueError:
# day = chntext
# month = ''
# if day:
# day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
# else:
# month = ''
# day = ''
# date = year + month + day
# self.date = date
# return self.date
def date2chntext(self):
date = self.date
try:
            year, other = date.strip().split('年', 1)
            year = Digit(digit=year).digit2chntext() + '年'
except ValueError:
other = date
year = ''
if other:
try:
                month, day = other.strip().split('月', 1)
                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
except ValueError:
day = date
month = ''
if day:
day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
else:
month = ''
day = ''
chntext = year + month + day
self.chntext = chntext
return self.chntext
class Money:
"""
MONEY类
"""
def __init__(self, money=None, chntext=None):
self.money = money
self.chntext = chntext
# def chntext2money(self):
# return self.money
def money2chntext(self):
money = self.money
pattern = re.compile(r'(\d+(\.\d+)?)')
matchers = pattern.findall(money)
if matchers:
for matcher in matchers:
money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
self.chntext = money
return self.chntext
class Percentage:
"""
PERCENTAGE类
"""
def __init__(self, percentage=None, chntext=None):
self.percentage = percentage
self.chntext = chntext
def chntext2percentage(self):
return chn2num(self.chntext.strip().strip('百分之')) + '%'
def percentage2chntext(self):
return '百分之' + num2chn(self.percentage.strip().strip('%'))
# ================================================================================ #
# NSW Normalizer
# ================================================================================ #
class NSWNormalizer:
def __init__(self, raw_text):
self.raw_text = '^' + raw_text + '$'
self.norm_text = ''
def _particular(self):
text = self.norm_text
pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
matchers = pattern.findall(text)
if matchers:
# print('particular')
for matcher in matchers:
text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
self.norm_text = text
return self.norm_text
def normalize(self):
text = self.raw_text
# 规范化日期
pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
matchers = pattern.findall(text)
if matchers:
#print('date')
for matcher in matchers:
text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
# 规范化金钱
pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
matchers = pattern.findall(text)
if matchers:
#print('money')
for matcher in matchers:
text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
# 规范化固话/手机号码
# 手机
# http://www.jihaoba.com/news/show/13680
# 移动139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
# 联通130、131、132、156、155、186、185、176
# 电信133、153、189、180、181、177
pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
matchers = pattern.findall(text)
if matchers:
#print('telephone')
for matcher in matchers:
text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
# 固话
pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
matchers = pattern.findall(text)
if matchers:
# print('fixed telephone')
for matcher in matchers:
text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
# 规范化分数
pattern = re.compile(r"(\d+/\d+)")
matchers = pattern.findall(text)
if matchers:
#print('fraction')
for matcher in matchers:
text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
# 规范化百分数
        text = text.replace('％', '%')
pattern = re.compile(r"(\d+(\.\d+)?%)")
matchers = pattern.findall(text)
if matchers:
#print('percentage')
for matcher in matchers:
text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
# 规范化纯数+量词
pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
matchers = pattern.findall(text)
if matchers:
#print('cardinal+quantifier')
for matcher in matchers:
text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
# 规范化数字编号
pattern = re.compile(r"(\d{4,32})")
matchers = pattern.findall(text)
if matchers:
#print('digit')
for matcher in matchers:
text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
# 规范化纯数
pattern = re.compile(r"(\d+(\.\d+)?)")
matchers = pattern.findall(text)
if matchers:
#print('cardinal')
for matcher in matchers:
text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
self.norm_text = text
self._particular()
return self.norm_text.lstrip('^').rstrip('$')
def nsw_test_case(raw_text):
print('I:' + raw_text)
print('O:' + NSWNormalizer(raw_text).normalize())
print('')
def nsw_test():
nsw_test_case('固话0595-23865596或23880880。')
nsw_test_case('固话0595-23865596或23880880。')
nsw_test_case('手机:+86 19859213959或15659451527。')
nsw_test_case('分数32477/76391。')
nsw_test_case('百分数80.03%')
nsw_test_case('编号31520181154418。')
nsw_test_case('纯数2983.07克或12345.60米。')
nsw_test_case('日期1999年2月20日或09年3月15号。')
    nsw_test_case('金钱12块5，34.5元，20.1万')
nsw_test_case('特殊O2O或B2C。')
nsw_test_case('3456万吨')
nsw_test_case('2938个')
nsw_test_case('938')
nsw_test_case('今天吃了115个小笼包231个馒头')
    nsw_test_case('有62％的概率')
if __name__ == '__main__':
#nsw_test()
p = argparse.ArgumentParser()
p.add_argument('ifile', help='input filename, assume utf-8 encoding')
p.add_argument('ofile', help='output filename')
p.add_argument('--to_upper', action='store_true', help='convert to upper case')
p.add_argument('--to_lower', action='store_true', help='convert to lower case')
p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
p.add_argument('--log_interval', type=int, default=100000, help='log interval in number of processed lines')
args = p.parse_args()
ifile = codecs.open(args.ifile, 'r', 'utf8')
ofile = codecs.open(args.ofile, 'w+', 'utf8')
n = 0
for l in ifile:
key = ''
text = ''
if args.has_key:
cols = l.split(maxsplit=1)
key = cols[0]
if len(cols) == 2:
text = cols[1].strip()
else:
text = ''
else:
text = l.strip()
# cases
if args.to_upper and args.to_lower:
sys.stderr.write('cn_tn.py: to_upper OR to_lower?')
exit(1)
if args.to_upper:
text = text.upper()
if args.to_lower:
text = text.lower()
# NSW(Non-Standard-Word) normalization
text = NSWNormalizer(text).normalize()
# Punctuations removal
old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations
new_chars = ' ' * len(old_chars)
del_chars = ''
text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
#
if args.has_key:
ofile.write(key + '\t' + text + '\n')
else:
if text.strip() != '': # skip empty line in pure text format(without Kaldi's utt key)
ofile.write(text + '\n')
n += 1
if n % args.log_interval == 0:
sys.stderr.write("cn_tn.py: {} lines done.\n".format(n))
sys.stderr.flush()
sys.stderr.write("cn_tn.py: {} lines done in total.\n".format(n))
sys.stderr.flush()
ifile.close()
ofile.close()

@ -1,7 +0,0 @@
UTT000 这块黄金重达324.75克
UTT001 她出生于86年8月18日，她弟弟出生于1995年3月1日
UTT002 电影中梁朝伟扮演的陈永仁的编号27149
UTT003 现场有7/12的观众投出了赞成票
UTT004 随便来几个价格12块5，34.5元，20.1万
UTT005 明天有62％的概率降雨
UTT006 这是固话0421-33441122或这是手机+86 18544139121

@ -1,7 +0,0 @@
这块黄金重达324.75克
她出生于86年8月18日，她弟弟出生于1995年3月1日
电影中梁朝伟扮演的陈永仁的编号27149
现场有7/12的观众投出了赞成票
随便来几个价格12块5，34.5元，20.1万
明天有62％的概率降雨
这是固话0421-33441122或这是手机+86 18544139121

@ -1,8 +0,0 @@
# for plain text
python3 cn_tn.py example_plain.txt output_plain.txt
diff example_plain.txt output_plain.txt
# for Kaldi's trans format
python3 cn_tn.py --has_key example_kaldi.txt output_kaldi.txt
diff example_kaldi.txt output_kaldi.txt

@ -1,24 +0,0 @@
0. Place install_thrax.sh into $KALDI_ROOT/tools/extras/
1. recompile openfst with necessary option "--enable-grm" to support thrax:
* cd $KALDI_ROOT/tools
* make clean
* edit $KALDI_ROOT/tools/Makefile, append "--enable-grm" option to OPENFST_CONFIGURE:
OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --enable-grm
* make -j 10
2. install thrax
cd $KALDI_ROOT/tools
sh extras/install_thrax.sh
3. add thrax binary path into $KALDI_ROOT/tools/env.sh:
export PATH=/path/to/your/kaldi_root/tools/thrax-1.2.9/src/bin:${PATH}
usage:
before you run anything related to thrax, use:
. $KALDI_ROOT/tools/env.sh
to enable binary finding, like what we always do in kaldi.
sample usage:
sh run_en.sh
sh run_cn.sh

@ -1,12 +0,0 @@
#!/bin/bash
## This script should be placed under $KALDI_ROOT/tools/extras/, and see INSTALL.txt for installation guide
if [ ! -f thrax-1.2.9.tar.gz ]; then
wget http://www.openfst.org/twiki/pub/GRM/ThraxDownload/thrax-1.2.9.tar.gz
tar -zxf thrax-1.2.9.tar.gz
fi
cd thrax-1.2.9
OPENFSTPREFIX=`pwd`/../openfst
LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix ${OPENFSTPREFIX}
make -j 10; make install
cd ..

@ -1,6 +0,0 @@
cd src/cn
thraxmakedep itn.grm
make
#thraxrewrite-tester --far=itn.far --rules=ITN
cat ../../testcase_cn.txt | thraxrewrite-tester --far=itn.far --rules=ITN
cd -

@ -1,6 +0,0 @@
cd src
thraxmakedep en/verbalizer/podspeech.grm
make
cat ../testcase_en.txt
cat ../testcase_en.txt | thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN
cd -

@ -1,202 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -1,65 +0,0 @@
en/verbalizer/podspeech.far: en/verbalizer/podspeech.grm util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
thraxcompiler --input_grammar=$< --output_far=$@
util/util.far: util/util.grm util/byte.far util/case.far
thraxcompiler --input_grammar=$< --output_far=$@
util/byte.far: util/byte.grm
thraxcompiler --input_grammar=$< --output_far=$@
util/case.far: util/case.grm util/byte.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/number_names.far: en/verbalizer/number_names.grm util/arithmetic.far en/verbalizer/g.fst en/verbalizer/cardinals.tsv en/verbalizer/ordinals.tsv
thraxcompiler --input_grammar=$< --output_far=$@
util/arithmetic.far: util/arithmetic.grm util/byte.far util/germanic.tsv
thraxcompiler --input_grammar=$< --output_far=$@
universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far
thraxcompiler --input_grammar=$< --output_far=$@
ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
thraxcompiler --input_grammar=$< --output_far=$@
en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far
thraxcompiler --input_grammar=$< --output_far=$@
clean:
rm -f util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far util/byte.far en/verbalizer/number_names.far universal/thousands_punct.far util/arithmetic.far en/verbalizer/factorization.far en/verbalizer/lexical_map.far ru/classifier/cyrillic.far

@ -1,24 +0,0 @@
# Text normalization covering grammars
This repository provides covering grammars for English and Russian text normalization as
documented in:
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
_Transactions of the Association for Computational Linguistics_ 4: 507-519.
Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
written-to-spoken text normalization. In _ASRU_, pages 665-670.
If you use these grammars in a publication, we would appreciate it if you cited these works.
## Building
The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://openfst.org) FAR (FST archive) files. To compile, simply run `make` in the `src/` directory.
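A minimal smoke test of the build, assuming the Thrax tools (`thraxcompiler`, `thraxrewrite-tester`) are on your `PATH`; the archive and rule names below are the ones this repository exports, but paths are illustrative:
```
cd src
make
# Interactively try inputs against the compiled English verbalizer:
thraxrewrite-tester --far=en/verbalizer/verbalizer.far --rules=VERBALIZER
```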
## License
See `LICENSE`.
## Mandatory disclaimer
This is not an official Google product.

@ -1,23 +0,0 @@
itn.far: itn.grm byte.far number.far hotfix.far percentage.far date.far amount.far
thraxcompiler --input_grammar=$< --output_far=$@
byte.far: byte.grm
thraxcompiler --input_grammar=$< --output_far=$@
number.far: number.grm byte.far
thraxcompiler --input_grammar=$< --output_far=$@
hotfix.far: hotfix.grm byte.far hotfix.list
thraxcompiler --input_grammar=$< --output_far=$@
percentage.far: percentage.grm byte.far number.far
thraxcompiler --input_grammar=$< --output_far=$@
date.far: date.grm byte.far number.far
thraxcompiler --input_grammar=$< --output_far=$@
amount.far: amount.grm byte.far number.far
thraxcompiler --input_grammar=$< --output_far=$@
clean:
rm -f byte.far number.far hotfix.far percentage.far date.far amount.far

@ -1,24 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;
unit = (
"匹"|"张"|"座"|"回"|"场"|"尾"|"条"|"个"|"首"|"阙"|"阵"|"网"|"炮"|
"顶"|"丘"|"棵"|"只"|"支"|"袭"|"辆"|"挑"|"担"|"颗"|"壳"|"窠"|"曲"|
"墙"|"群"|"腔"|"砣"|"座"|"客"|"贯"|"扎"|"捆"|"刀"|"令"|"打"|"手"|
"罗"|"坡"|"山"|"岭"|"江"|"溪"|"钟"|"队"|"单"|"双"|"对"|"出"|"口"|
"头"|"脚"|"板"|"跳"|"枝"|"件"|"贴"|"针"|"线"|"管"|"名"|"位"|"身"|
"堂"|"课"|"本"|"页"|"家"|"户"|"层"|"丝"|"毫"|"厘"|"分"|"钱"|"两"|
"斤"|"担"|"铢"|"石"|"钧"|"锱"|"忽"|"毫"|"厘"|"分"|"寸"|"尺"|"丈"|
"里"|"寻"|"常"|"铺"|"程"|"撮"|"勺"|"合"|"升"|"斗"|"石"|"盘"|"碗"|
"碟"|"叠"|"桶"|"笼"|"盆"|"盒"|"杯"|"钟"|"斛"|"锅"|"簋"|"篮"|"盘"|
"桶"|"罐"|"瓶"|"壶"|"卮"|"盏"|"箩"|"箱"|"煲"|"啖"|"袋"|"钵"|"年"|
"月"|"日"|"季"|"刻"|"时"|"周"|"天"|"秒"|"分"|"旬"|"纪"|"岁"|"世"|
"更"|"夜"|"春"|"夏"|"秋"|"冬"|"代"|"伏"|"辈"|"丸"|"泡"|"粒"|"颗"|
"幢"|"堆"|"条"|"根"|"支"|"道"|"面"|"片"|"张"|"颗"|"块"|
(("千克":"kg")|("毫克":"mg")|("微克":"µg"))|
(("千米":"km")|("厘米":"cm")|("毫米":"mm")|("微米":"µm")|("纳米":"nm"))
);
amount = n.number unit;
export AMOUNT = CDRewrite[amount, "", "", b.kBytes*];
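# Added sketch, not part of the original grammar: composing an input
# string with AMOUNT applies the rewrite in context, e.g. "三个" should
# become "3个" and "五千克" should become "5kg".
example_amount = Optimize["五千克" @ AMOUNT];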

@ -1,76 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright 2005-2011 Google, Inc.
# Author: ttai@google.com (Terry Tai)
# Standard constants for ASCII (byte) based strings. This mirrors the
# functions provided by C/C++'s ctype.h library.
# Note that [0] is missing. Matching the string-termination character is kinda weird.
export kBytes = Optimize[
"[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" |
"[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" |
"[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
"[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" |
"[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" |
"[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
"[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" |
"[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" |
"[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
"[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" |
"[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
"[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
"[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
"[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
"[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
"[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
"[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
"[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
"[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
"[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
"[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
"[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
"[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
"[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
"[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
"[251]" | "[252]" | "[253]" | "[254]" | "[255]"
];
export kDigit = Optimize[
"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
];
export kLower = Optimize[
"a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
"n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];
export kUpper = Optimize[
"A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
"N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
];
export kAlpha = Optimize[kLower | kUpper];
export kAlnum = Optimize[kDigit | kAlpha];
export kSpace = Optimize[
" " | "\t" | "\n" | "\r"
];
export kNotSpace = Optimize[kBytes - kSpace];
export kPunct = Optimize[
"!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
"-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
"\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
];
export kGraph = Optimize[kAlnum | kPunct];

@ -1,10 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;
date_day = n.number_1_to_99 ("日"|"号");
date_month_day = n.number_1_to_99 "月" date_day;
date_year_month_day = ((n.number_0_to_9){2,4} | n.number) "年" date_month_day;
date = date_year_month_day | date_month_day | date_day;
export DATE = CDRewrite[date, "", "", b.kBytes*];
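# Added sketch, not part of the original grammar: DATE rewrites dates in
# place, e.g. "五月四日" -> "5月4日" and, via the 2-4 digit year branch,
# "二零二一年五月四日" -> "2021年5月4日".
example_date = Optimize["二零二一年五月四日" @ DATE];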

@ -1,5 +0,0 @@
import 'byte.grm' as b;
hotfix = StringFile['hotfix.list'];
export HOTFIX = CDRewrite[hotfix, "", "", b.kBytes*];
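# Added sketch, not part of the original grammar: HOTFIX patches known
# bad rewrites from hotfix.list, e.g. "4平市" -> "四平市"; negatively
# weighted identity entries such as "东4环" keep their digit form.
example_hotfix = Optimize["4平市" @ HOTFIX];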

@ -1,18 +0,0 @@
0头 零头
10字 十字
东4环 东4环 -1.0
东4 东四 -0.5
4惠 四惠
3元桥 三元桥
4平市 四平市
5台山 五台山
西2旗 西二旗
西3旗 西三旗
4道口 四道口 -1.0
5道口 五道口 -1.0
6道口 六道口 -1.0
6里桥 六里桥
7里庄 七里庄
8宝山 八宝山
9颗松 九棵松
10里堡 十里堡

@ -1,9 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as number;
import 'hotfix.grm' as hotfix;
import 'percentage.grm' as percentage;
import 'date.grm' as date;
import 'amount.grm' as amount; # imported but not composed into ITN for now
export ITN = Optimize[percentage.PERCENTAGE @ (date.DATE <-1>) @ number.NUMBER @ hotfix.HOTFIX];
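# Added sketch, not part of the original grammar: the composed pipeline
# applies percentage, date, number and hotfix rewrites in order, e.g.
# "百分之三十" -> "30%" and "五月四日" -> "5月4日".
example_itn = Optimize["百分之三十" @ ITN];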

@ -1,61 +0,0 @@
import 'byte.grm' as b;
number_1_to_9 = (
("一":"1") | ("幺":"1") |
("二":"2") | ("两":"2") |
("三":"3") |
("四":"4") |
("五":"5") |
("六":"6") |
("七":"7") |
("八":"8") |
("九":"9")
);
export number_0_to_9 = (("零":"0") | number_1_to_9);
number_10_to_19 = (
("十":"10") |
("十一":"11") |
("十二":"12") |
("十三":"13") |
("十四":"14") |
("十五":"15") |
("十六":"16") |
("十七":"17") |
("十八":"18") |
("十九":"19")
);
number_10s = (number_1_to_9 ("十":""));
number_100s = (number_1_to_9 ("百":""));
number_1000s = (number_1_to_9 ("千":""));
number_10000s = (number_1_to_9 ("万":""));
number_10_to_99 = (
((number_10s number_1_to_9)<-0.3>) |
((number_10s ("":"0"))<-0.2>) |
(number_10_to_19 <-0.1>)
);
export number_1_to_99 = (number_1_to_9 | number_10_to_99);
number_100_to_999 = (
((number_100s ("零":"0") number_1_to_9)<0.0>)|
((number_100s number_10_to_99)<0.0>) |
((number_100s number_1_to_9 ("":"0"))<0.0>) |
((number_100s ("":"00"))<0.1>)
);
number_1000_to_9999 = (
((number_1000s number_100_to_999)<0.0>) |
((number_1000s ("零":"0") number_10_to_99)<0.0>)|
((number_1000s ("零":"00") number_1_to_9)<0.0>)|
((number_1000s ("":"000"))<1>) |
((number_1000s number_1_to_9 ("":"00"))<0.0>)
);
export number = number_1_to_99 | (number_100_to_999 <-1>) | (number_1000_to_9999 <-2>);
export NUMBER = CDRewrite[number, "", "", b.kBytes*];
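# Added sketch, not part of the original grammar: the negative weights
# make the longest reading cheapest, so "两百三十" rewrites to "230"
# rather than the piecewise "2百30".
example_number = Optimize["两百三十" @ NUMBER];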

@ -1,8 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;
percentage = (
("百分之":"") n.number_1_to_99 ("":"%")
);
export PERCENTAGE = CDRewrite[percentage, "", "", b.kBytes*];
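# Added sketch, not part of the original grammar: "百分之" is deleted,
# the numeral converted, and "%" appended, e.g. "百分之五十" -> "50%".
example_percentage = Optimize["百分之五十" @ PERCENTAGE];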

@ -1,6 +0,0 @@
# English covering grammar definitions
This directory defines an English text normalization covering grammar. The
primary entry-point is the FST `VERBALIZER`, defined in
`verbalizer/verbalizer.grm` and compiled in the FST archive
`verbalizer/verbalizer.far`.

@ -1,3 +0,0 @@
verbalizer.far: verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
thraxcompiler --input_grammar=$< --output_far=$@

@ -1,32 +0,0 @@
0 zero
1 one
2 two
3 three
4 four
5 five
6 six
7 seven
8 eight
9 nine
10 ten
11 eleven
12 twelve
13 thirteen
14 fourteen
15 fifteen
16 sixteen
17 seventeen
18 eighteen
19 nineteen
20 twenty
30 thirty
40 forty
50 fifty
60 sixty
70 seventy
80 eighty
90 ninety
100 hundred
1000 thousand
1000000 million
1000000000 billion

@ -1,35 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'en/verbalizer/numbers.grm' as n;
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
export DIGITS = digit (n.I[" "] digit)*;
# Various common factorizations
two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
mixed =
(digit n.I[" "] two_digits)
| (two_digits n.I[" "] two_digits)
| (two_digits n.I[" "] three_digits)
| (two_digits n.I[" "] two_digits n.I[" "] two_digits)
;
export MIXED_NUMBERS = Optimize[mixed];
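# Added sketch, not part of the original grammar: DIGITS reads a string
# digit by digit, so "907" yields "nine zero seven" or, once the lexical
# map downstream resolves @@OTHER_ZERO_VERBALIZATIONS@@, "nine oh seven".
example_digits = Optimize["907" @ DIGITS];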

@ -1,40 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'util/util.grm' as u;
import 'en/verbalizer/numbers.grm' as n;
func ToNumberName[expr] {
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
return Optimize[expr @ number_name_seq];
}
d = b.kDigit;
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
by_ones = d n.I[" "];
by_twos = (d{2} @ leading_zero) n.I[" "];
by_threes = (d{3} @ leading_zero) n.I[" "];
groupings = by_twos* (by_threes | by_twos | by_ones);
export FRACTIONAL_PART_UNGROUPED =
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_GROUPED =
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
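# Added sketch, not part of the original grammar: the grouped reading
# splits fractional digits into pairs, e.g. "1234" -> "twelve thirty
# four", while the ungrouped reading gives "one two three four".
example_grouped = Optimize["1234" @ FRACTIONAL_PART_GROUPED];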

@ -1,30 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'en/verbalizer/factorization.grm' as f;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
__decimal_marker__ = ".";
export FLOAT = Optimize[
(n.CARDINAL_NUMBERS
(__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
__fractional_part__) @ l.LEXICAL_MAP]
;
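# Added sketch, not part of the original grammar: "3.14" verbalizes as
# "three point one four" (the unparsed fractional branch also permits
# "three point fourteen").
example_float = Optimize["3.14" @ FLOAT];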

@ -1,25 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
lexical_map = StringFile['en/verbalizer/lexical_map.tsv'];
sigma_star = b.kBytes*;
del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
export LEXICAL_MAP = Optimize[
CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
;
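# Added sketch, not part of the original grammar: LEXICAL_MAP turns the
# @@...@@ markup emitted by the other grammars into surface words, e.g.
# "fifty @@PERCENT@@" -> "fifty percent".
example_map = Optimize["fifty @@PERCENT@@" @ LEXICAL_MAP];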

@ -1,74 +0,0 @@
@@CONNECTOR_RANGE@@ to
@@CONNECTOR_RATIO@@ to
@@CONNECTOR_BY@@ by
@@CONNECTOR_CONSECUTIVE_YEAR@@ to
@@JANUARY@@ january
@@FEBRUARY@@ february
@@MARCH@@ march
@@APRIL@@ april
@@MAY@@ may
@@JUNE@@ june
@@JULY@@ july
@@AUGUST@@ august
@@SEPTEMBER@@ september
@@OCTOBER@@ october
@@NOVEMBER@@ november
@@DECEMBER@@ december
@@MINUS@@ minus
@@DECIMAL_DOT_EXPRESSION@@ point
@@URL_DOT_EXPRESSION@@ dot
@@DECIMAL_EXPONENT@@ to the
@@DECIMAL_EXPONENT@@ to the power of
@@COLON@@ colon
@@SLASH@@ slash
@@SLASH@@ forward slash
@@DASH@@ dash
@@PASSWORD@@ password
@@AT@@ at
@@PORT@@ port
@@QUESTION_MARK@@ question mark
@@HASH@@ hash
@@HASH@@ hash tag
@@FRACTION_OVER@@ over
@@MONEY_AND@@ and
@@AND@@ and
@@PHONE_PLUS@@ plus
@@PHONE_EXTENSION@@ extension
@@TIME_AM@@ a m
@@TIME_PM@@ p m
@@HOUR@@ o'clock
@@MINUTE@@ minute
@@MINUTE@@ minutes
@@TIME_AFTER@@ after
@@TIME_AFTER@@ past
@@TIME_BEFORE@@ to
@@TIME_BEFORE@@ till
@@TIME_QUARTER@@ quarter
@@TIME_HALF@@ half
@@TIME_ZERO@@ oh
@@TIME_THREE_QUARTER@@ three quarters
@@ARITHMETIC_PLUS@@ plus
@@ARITHMETIC_TIMES@@ times
@@ARITHMETIC_TIMES@@ multiplied by
@@ARITHMETIC_MINUS@@ minus
@@ARITHMETIC_DIVISION@@ divided by
@@ARITHMETIC_DIVISION@@ over
@@ARITHMETIC_EQUALS@@ equals
@@PERCENT@@ percent
@@DEGREE@@ degree
@@DEGREE@@ degrees
@@SQUARE_ROOT@@ square root of
@@SQUARE_ROOT@@ the square root of
@@STAR@@ star
@@HYPHEN@@ hyphen
@@AT@@ at
@@PER@@ per
@@PERIOD@@ period
@@PERIOD@@ full stop
@@PERIOD@@ dot
@@EXCLAMATION_MARK@@ exclamation mark
@@EXCLAMATION_MARK@@ exclamation point
@@COMMA@@ comma
@@POSITIVE@@ positive
@@NEGATIVE@@ negative
@@OTHER_ZERO_VERBALIZATIONS@@ oh

@ -1,34 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'en/verbalizer/float.grm' as f;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
float = f.FLOAT;
card = n.CARDINAL_NUMBERS;
number = card | float;
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
times = "*" : " @@ARITHMETIC_TIMES@@ ";
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
operator = plus | times | minus | division;
percent = "%" : " @@PERCENT@@";
export ARITHMETIC =
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
;
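# Added sketch, not part of the original grammar: ARITHMETIC verbalizes
# simple expressions, e.g. "2+3" -> "two plus three" and "50%" ->
# "fifty percent".
example_arithmetic = Optimize["2+3" @ ARITHMETIC];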

@ -1,78 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'en/verbalizer/extra_numbers.grm' as e;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
import 'en/verbalizer/spelled.grm' as s;
letter = b.kAlpha | c.kCyrillicAlpha;
dash = "-";
word = letter+;
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
post_word_symbol =
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
("*" : "@@STAR@@")
;
pre_word_symbol =
("@" : "@@AT@@") |
("/" : "@@SLASH@@") |
("#" : "@@HASH@@")
;
post_word = possibly_split_word n.I[" "] post_word_symbol;
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
## Number/digit sequence combos, maybe with a dash
spelled_word = word @ s.SPELLED_NO_LETTER;
word_number =
(word | spelled_word)
(n.I[" "] | (dash : " "))
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
;
number_word =
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
(n.I[" "] | (dash : " "))
(word | spelled_word)
;
## Two-digit year.
# Note that in this case to be fair we really have to allow ordinals too since
# in some languages that's what you would have.
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
miscellaneous = Optimize[
possibly_split_word
| post_word
| pre_word
| word_number
| number_word
| two_digit_year
| dot_com
];
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
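# Added sketch, not part of the original grammar: e.g. the two-digit
# year rule reads "'90" as "ninety" (digit-by-digit readings are also
# in the lattice), and dot_com reads ".com" as "dot com".
example_misc = Optimize["'90" @ MISCELLANEOUS];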

@ -1,44 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
card = n.CARDINAL_NUMBERS;
__currency__ = StringFile['en/verbalizer/money.tsv'];
d = b.kDigit;
D = d - "0";
cents = ((n.D["0"] | D) d) @ card;
# Only the dollar is handled in the English verbalizer tests. Other
# currencies will need to be added.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
usd_min = Project["usd_min" @ __currency__, 'output'];
and = " @@MONEY_AND@@ " | " ";
dollar1 =
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
;
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
dollar3 = n.D["$"] card n.I[" " usd_maj];
dollar = Optimize[dollar1 | dollar2 | dollar3];
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
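# Added sketch, not part of the original grammar: one possible
# verbalization of "$2.50" is "two dollars and fifty cents"; both
# "dollar" and "dollars" are licensed by money.tsv.
example_money = Optimize["$2.50" @ MONEY];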

@ -1,4 +0,0 @@
usd_maj dollar
usd_maj dollars
usd_min cent
usd_min cents

@ -1,54 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# English minimally supervised number grammar.
#
# Supports both cardinals and ordinals without overt marking.
#
# The language-specific acceptor G was compiled with digit, teen, and decade
# preterminals. The lexicon transducer L is unambiguous so no LM is used.
import 'util/arithmetic.grm' as a;
# Intersects the universal factorization transducer (F) with the
# language-specific acceptor (G).
d = a.DELTA_STAR;
f = a.IARITHMETIC_RESTRICTED;
g = LoadFst['en/verbalizer/g.fst'];
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"];
# Compiles lexicon transducer (L).
cardinal_name = StringFile['en/verbalizer/cardinals.tsv'];
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"];
ordinal_name = StringFile['en/verbalizer/ordinals.tsv'];
# In English, ordinals have the same syntax as cardinals and all but the final
# element is verbalized using a cardinal number word; e.g., "two hundred
# thirtieth".
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"];
# Composes L with the leaf transducer (P), then composes that with FG.
p = a.LEAVES;
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"];
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"];

@ -1,57 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'en/verbalizer/number_names.grm' as n;
import 'util/byte.grm' as bytelib;
import 'universal/thousands_punct.grm' as t;
cardinal = n.CARDINAL_NUMBER_NAME;
ordinal = n.ORDINAL_NUMBER_NAME;
# Putting these here since this grammar gets incorporated by all the others.
func I[expr] {
return "" : expr;
}
func D[expr] {
return expr : "";
}
separators = t.comma_thousands | t.no_delimiter;
# Language specific endings for ordinals.
d = bytelib.kDigit;
endings = "st" | "nd" | "rd" | "th";
st = (d* "1") - (d* "11");
nd = (d* "2") - (d* "12");
rd = (d* "3") - (d* "13");
th = Optimize[d* - st - nd - rd];
first = st ("st" : "");
second = nd ("nd" : "");
third = rd ("rd" : "");
other = th ("th" : "");
marked_ordinal = Optimize[first | second | third | other];
# The separator is a no-op here but will be needed once we replace
# the above targets.
export CARDINAL_NUMBERS = Optimize[separators @ cardinal];
export ORDINAL_NUMBERS =
Optimize[(separators endings) @ marked_ordinal @ ordinal]
;
export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal];
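# Added sketch, not part of the original grammar: the ending check only
# strips a suffix that agrees with the digits, so "23rd" verbalizes as
# "twenty third" while "23st" is rejected.
example_ordinal = Optimize["23rd" @ ORDINAL_NUMBERS];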

@ -1,133 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Grammar for things built mostly on numbers.
import 'en/verbalizer/factorization.grm' as f;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
num = n.CARDINAL_NUMBERS;
ord = n.ORDINAL_NUMBERS_UNMARKED;
digits = f.FRACTIONAL_PART_UNGROUPED;
# Various symbols.
plus = "+" : "@@ARITHMETIC_PLUS@@";
minus = "-" : "@@ARITHMETIC_MINUS@@";
slash = "/" : "@@SLASH@@";
dot = "." : "@@URL_DOT_EXPRESSION@@";
dash = "-" : "@@DASH@@";
equals = "=" : "@@ARITHMETIC_EQUALS@@";
degree = "°" : "@@DEGREE@@";
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
power = "^" : "@@DECIMAL_EXPONENT@@";
square_root = "√" : "@@SQUARE_ROOT@@";
percent = "%" : "@@PERCENT@@";
# Safe roman numbers.
# NB: Do not change the formatting here. NO_EDIT must be on the same
# line as the path.
rfile =
'universal/roman_numerals.tsv' # NO_EDIT
;
roman = StringFile[rfile];
## Main categories.
cat_dot_number =
num
n.I[" "] dot n.I[" "] num
(n.I[" "] dot n.I[" "] num)+
;
cat_slash_number =
num
n.I[" "] slash n.I[" "] num
(n.I[" "] slash n.I[" "] num)*
;
cat_dash_number =
num
n.I[" "] dash n.I[" "] num
(n.I[" "] dash n.I[" "] num)*
;
cat_signed_number = ((plus | minus) n.I[" "])? num;
cat_degree = cat_signed_number n.I[" "] degree;
cat_country_code = plus n.I[" "] (num | digits);
cat_math_operations =
plus
| minus
| division
| times
| equals
| percent
| power
| square_root
;
# Roman numbers are often either cardinals or ordinals in various languages.
cat_roman = roman @ (num | ord);
# Allow
#
# number:number
# number-number
#
# to just be
#
# number number.
cat_number_number =
num ((":" | "-") : " ") num
;
# Some additional readings for these symbols.
cat_additional_readings =
("/" : "@@PER@@") |
("+" : "@@AND@@") |
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
("*" : "@@STAR@@") |
("x" : ("x" | "@@CONNECTOR_BY@@")) |
("@" : "@@AT@@")
;
numbers_plus = Optimize[
cat_dot_number
| cat_slash_number
| cat_dash_number
| cat_signed_number
| cat_degree
| cat_country_code
| cat_math_operations
| cat_roman
| cat_number_number
| cat_additional_readings
];
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
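# Added sketch, not part of the original grammar: e.g. the signed-number
# and degree rules read "-4°" as "minus four degrees" (or the singular
# "minus four degree").
example_numbers_plus = Optimize["-4°" @ NUMBERS_PLUS];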

@ -1,32 +0,0 @@
0 zeroth
1 first
2 second
3 third
4 fourth
5 fifth
6 sixth
7 seventh
8 eighth
9 ninth
10 tenth
11 eleventh
12 twelfth
13 thirteenth
14 fourteenth
15 fifteenth
16 sixteenth
17 seventeenth
18 eighteenth
19 nineteenth
20 twentieth
30 thirtieth
40 fortieth
50 fiftieth
60 sixtieth
70 seventieth
80 eightieth
90 ninetieth
100 hundredth
1000 thousandth
1000000 millionth
1000000000 billionth

@ -1,7 +0,0 @@
float.grm __fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
telephone.grm __grouping__ = f.UNGROUPED;
measure.grm __measure__ = StringFile['en/verbalizer/measures.tsv'];
money.grm __currency__ = StringFile['en/verbalizer/money.tsv'];
time.grm __sep__ = ":";
time.grm __am__ = "a.m." | "am" | "AM";
time.grm __pm__ = "p.m." | "pm" | "PM";

@ -1,46 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/util.grm' as util;
import 'util/case.grm' as case;
import 'en/verbalizer/extra_numbers.grm' as e;
import 'en/verbalizer/float.grm' as f;
import 'en/verbalizer/math.grm' as ma;
import 'en/verbalizer/miscellaneous.grm' as mi;
import 'en/verbalizer/money.grm' as mo;
import 'en/verbalizer/numbers.grm' as n;
import 'en/verbalizer/numbers_plus.grm' as np;
import 'en/verbalizer/spelled.grm' as s;
import 'en/verbalizer/spoken_punct.grm' as sp;
import 'en/verbalizer/time.grm' as t;
import 'en/verbalizer/urls.grm' as u;
export POD_SPEECH_TN = Optimize[RmWeight[
(u.URL
| e.MIXED_NUMBERS
| e.DIGITS
| f.FLOAT
| ma.ARITHMETIC
| mo.MONEY
| n.CARDINAL_NUMBERS
| n.ORDINAL_NUMBERS
| np.NUMBERS_PLUS
| s.SPELLED
| sp.SPOKEN_PUNCT
| t.TIME
| u.URL
| u.EMAILS) @ util.CLEAN_SPACES @ case.TOUPPER
]];
#export POD_SPEECH_TN = Optimize[RmWeight[(mi.MISCELLANEOUS) @ util.CLEAN_SPACES @ case.TOUPPER]];

@ -1,77 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This verbalizer is used whenever there is an LM symbol that consists of
# letters immediately followed by "{spelled}". This strips the "{spelled}"
# suffix.
import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
digit = b.kDigit @ n.CARDINAL_NUMBERS;
char_set = (("a" | "A") : "letter-a")
| (("b" | "B") : "letter-b")
| (("c" | "C") : "letter-c")
| (("d" | "D") : "letter-d")
| (("e" | "E") : "letter-e")
| (("f" | "F") : "letter-f")
| (("g" | "G") : "letter-g")
| (("h" | "H") : "letter-h")
| (("i" | "I") : "letter-i")
| (("j" | "J") : "letter-j")
| (("k" | "K") : "letter-k")
| (("l" | "L") : "letter-l")
| (("m" | "M") : "letter-m")
| (("n" | "N") : "letter-n")
| (("o" | "O") : "letter-o")
| (("p" | "P") : "letter-p")
| (("q" | "Q") : "letter-q")
| (("r" | "R") : "letter-r")
| (("s" | "S") : "letter-s")
| (("t" | "T") : "letter-t")
| (("u" | "U") : "letter-u")
| (("v" | "V") : "letter-v")
| (("w" | "W") : "letter-w")
| (("x" | "X") : "letter-x")
| (("y" | "Y") : "letter-y")
| (("z" | "Z") : "letter-z")
| (digit)
| ("&" : "@@AND@@")
| ("." : "")
| ("-" : "")
| ("_" : "")
| ("/" : "")
| (n.I["letter-"] c.kCyrillicAlpha)
;
ins_space = "" : " ";
suffix = "{spelled}" : "";
spelled = Optimize[char_set (ins_space char_set)* suffix];
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
sigma_star = b.kBytes*;
# Gets rid of the letter- prefix since in some cases we don't want it.
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
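# Added sketch, not part of the original grammar: "DVD{spelled}" maps to
# "letter-d letter-v letter-d", and SPELLED_NO_LETTER drops the prefix,
# reading plain "DVD" as "d v d".
example_spelled = Optimize["DVD{spelled}" @ SPELLED];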

@ -1,24 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'en/verbalizer/lexical_map.grm' as l;
punct =
("." : "@@PERIOD@@")
| ("," : "@@COMMA@@")
| ("!" : "@@EXCLAMATION_MARK@@")
| ("?" : "@@QUESTION_MARK@@")
;
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];

@ -1,108 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
increment_hour =
("0" : "1")
| ("1" : "2")
| ("2" : "3")
| ("3" : "4")
| ("4" : "5")
| ("5" : "6")
| ("6" : "7")
| ("7" : "8")
| ("8" : "9")
| ("9" : "10")
| ("10" : "11")
| ("11" : "12")
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
| ("13" : "14")
| ("14" : "15")
| ("15" : "16")
| ("16" : "17")
| ("17" : "18")
| ("18" : "19")
| ("19" : "20")
| ("20" : "21")
| ("21" : "22")
| ("22" : "23")
| ("23" : "12")
;
hours = Project[increment_hour, 'input'];
d = b.kDigit;
D = d - "0";
minutes09 = "0" D;
minutes = ("1" | "2" | "3" | "4" | "5") d;
__sep__ = ":";
sep_space = __sep__ : " ";
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
verbalize_minutes =
("00" : "@@HOUR@@")
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
| (minutes @ n.CARDINAL_NUMBERS)
;
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
# Special cases we handle right now.
# TODO: Need to allow for cases like
#
# half twelve (in the UK English sense)
# half twaalf (in the Dutch sense)
time_quarter_past =
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
verbalize_hours
n.D[__sep__ "15"];
time_half_past =
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
verbalize_hours
n.D[__sep__ "30"];
time_quarter_to =
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
(increment_hour @ verbalize_hours)
n.D[__sep__ "45"];
time_extra = Optimize[
time_quarter_past | time_half_past | time_quarter_to]
;
# Basic time periods which most languages can be expected to have.
__am__ = "a.m." | "am" | "AM";
__pm__ = "p.m." | "pm" | "PM";
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
time_variants = time_basic | time_extra;
time = Optimize[
(period (" " | n.I[" "]))? time_variants
| time_variants ((" " | n.I[" "]) period)?]
;
export TIME = Optimize[time @ l.LEXICAL_MAP];
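# Added sketch, not part of the original grammar: "8:30" can verbalize
# as "eight thirty" or "half past eight", and "6:45" as "quarter to
# seven" via the incremented hour.
example_time = Optimize["8:30" @ TIME];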

@ -1,68 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Rules for URLs and email addresses.
import 'util/byte.grm' as bytelib;
import 'en/verbalizer/lexical_map.grm' as l;
ins_space = "" : " ";
dot = "." : "@@URL_DOT_EXPRESSION@@";
at = "@" : "@@AT@@";
url_suffix =
(".com" : dot ins_space "com") |
(".gov" : dot ins_space "gov") |
(".edu" : dot ins_space "e d u") |
(".org" : dot ins_space "org") |
(".net" : dot ins_space "net")
;
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
letter_string_dot =
((letter_string ins_space dot ins_space)* letter_string)
;
# Rules for URLs.
export URL = Optimize[
((letter_string_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
# Rules for email addresses.
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
letter_by_letter_dot =
((letter_by_letter ins_space dot ins_space)*
letter_by_letter)
;
export EMAIL1 = Optimize[
((letter_by_letter) (ins_space)
(at) (ins_space)
(letter_by_letter_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
export EMAIL2 = Optimize[
((letter_by_letter) (ins_space)
(at) (ins_space)
(letter_string_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
export EMAILS = Optimize[
EMAIL1 | EMAIL2
];
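# Added sketch, not part of the original grammar: URL is intended to
# read "abc.com" as "abc dot com", and EMAILS to spell addresses letter
# by letter, e.g. "ab@cd.org" -> "a b at c d dot org".
example_url = Optimize["abc.com" @ URL];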

@ -1,42 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/util.grm' as util;
import 'en/verbalizer/extra_numbers.grm' as e;
import 'en/verbalizer/float.grm' as f;
import 'en/verbalizer/math.grm' as ma;
import 'en/verbalizer/miscellaneous.grm' as mi;
import 'en/verbalizer/money.grm' as mo;
import 'en/verbalizer/numbers.grm' as n;
import 'en/verbalizer/numbers_plus.grm' as np;
import 'en/verbalizer/spelled.grm' as s;
import 'en/verbalizer/spoken_punct.grm' as sp;
import 'en/verbalizer/time.grm' as t;
import 'en/verbalizer/urls.grm' as u;
export VERBALIZER = Optimize[RmWeight[
( e.MIXED_NUMBERS
| e.DIGITS
| f.FLOAT
| ma.ARITHMETIC
| mi.MISCELLANEOUS
| mo.MONEY
| n.CARDINAL_NUMBERS
| n.ORDINAL_NUMBERS
| np.NUMBERS_PLUS
| s.SPELLED
| sp.SPOKEN_PUNCT
| t.TIME
| u.URL) @ util.CLEAN_SPACES
]];

@ -1,17 +0,0 @@
This directory contains data used in:
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
Transactions of the Association for Computational Linguistics 4: 507-519.
* `minimal.txt`: A list of 30 curated numbers used as the "minimal" training
set.
* `random-trn.txt`: A list of 9000 randomly-generated numbers used as the
"medium" training set.
* `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test
set.
Note that `random-trn.txt` and `random-tst.txt` are completely disjoint, but
a small number of examples occur in both `minimal.txt` and `random-tst.txt`.
For information about the sampling procedure used to generate the random data
sets, see appendix A of the aforementioned paper.

@ -1,300 +0,0 @@
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
220
221
230
300
400
500
600
700
800
900
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1020
1021
1030
1200
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2020
2021
2030
2100
2200
5001
10000
12000
20000
21000
50001
100000
120000
200000
210000
500001
1000000
1001000
1200000
2000000
2100000
5000001
10000000
10001000
12000000
20000000
50000001
100000000
100001000
120000000
200000000
500000001
1000000000
1000001000
1200000000
2000000000
5000000001
10000000000
10000001000
12000000000
20000000000
50000000001
100000000000
100000001000
120000000000
200000000000
500000000001

@ -1,6 +0,0 @@
# Russian covering grammar definitions
This directory defines a Russian text normalization covering grammar. The
primary entry-point is the FST `VERBALIZER`, defined in
`verbalizer/verbalizer.grm` and compiled in the FST archive
`verbalizer/verbalizer.far`.

@ -1,58 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export kRussianLowerAlpha = Optimize[
"а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" | "и" | "й" |
"к" | "л" | "м" | "н" | "о" | "п" | "р" | "с" | "т" | "у" | "ф" |
"х" | "ц" | "ч" | "ш" | "щ" | "ъ" | "ы" | "ь" | "э" | "ю" | "я" ];
export kRussianUpperAlpha = Optimize[
"А" | "Б" | "В" | "Г" | "Д" | "Е" | "Ё" | "Ж" | "З" | "И" | "Й" |
"К" | "Л" | "М" | "Н" | "О" | "П" | "Р" | "С" | "Т" | "У" | "Ф" |
"Х" | "Ц" | "Ч" | "Ш" | "Щ" | "Ъ" | "Ы" | "Ь" | "Э" | "Ю" | "Я" ];
export kRussianLowerAlphaStressed = Optimize[
"а́" | "е́" | "ё́" | "и́" | "о́" | "у́" | "ы́" | "э́" | "ю́" | "я́" ];
export kRussianUpperAlphaStressed = Optimize[
"А́" | "Е́" | "Ё́" | "И́" | "О́" | "У́" | "Ы́" | "Э́" | "Ю́" | "Я́" ];
export kRussianRewriteStress = Optimize[
("А́" : "А'") | ("Е́" : "Е'") | ("Ё́" : "Ё'") | ("И́" : "И'") |
("О́" : "О'") | ("У́" : "У'") | ("Ы́" : "Ы'") | ("Э́" : "Э'") |
("Ю́" : "Ю'") | ("Я́" : "Я'") |
("а́" : "а'") | ("е́" : "е'") | ("ё́" : "ё'") | ("и́" : "и'") |
("о́" : "о'") | ("у́" : "у'") | ("ы́" : "ы'") | ("э́" : "э'") |
("ю́" : "ю'") | ("я́" : "я'")
];
export kRussianRemoveStress = Optimize[
("А́" : "А") | ("Е́" : "Е") | ("Ё́" : "Ё") | ("И́" : "И") | ("О́" : "О") |
("У́" : "У") | ("Ы́" : "Ы") | ("Э́" : "Э") | ("Ю́" : "Ю") | ("Я́" : "Я") |
("а́" : "а") | ("е́" : "е") | ("ё́" : "ё") | ("и́" : "и") | ("о́" : "о") |
("у́" : "у") | ("ы́" : "ы") | ("э́" : "э") | ("ю́" : "ю") | ("я́" : "я")
];
# Pre-reform characters, just in case.
export kRussianPreReform = Optimize[
"ѣ" | "Ѣ" # http://en.wikipedia.org/wiki/Yat
];
export kCyrillicAlphaStressed = Optimize[
kRussianLowerAlphaStressed | kRussianUpperAlphaStressed
];
export kCyrillicAlpha = Optimize[
kRussianLowerAlpha | kRussianUpperAlpha | kRussianPreReform
];

@ -1,338 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# AUTOMATICALLY GENERATED: DO NOT EDIT.
import 'util/byte.grm' as b;
# Utilities for insertion and deletion.
func I[expr] {
return "" : expr;
}
func D[expr] {
return expr : "";
}
# Powers of base 10.
export POWERS =
"[E15]"
| "[E14]"
| "[E13]"
| "[E12]"
| "[E11]"
| "[E10]"
| "[E9]"
| "[E8]"
| "[E7]"
| "[E6]"
| "[E5]"
| "[E4]"
| "[E3]"
| "[E2]"
| "[E1]"
;
export SIGMA = b.kBytes | POWERS;
export SIGMA_STAR = SIGMA*;
export SIGMA_PLUS = SIGMA+;
################################################################################
# BEGIN LANGUAGE SPECIFIC DATA
revaluations =
("[E4]" : "[E1]")
| ("[E5]" : "[E2]")
| ("[E7]" : "[E1]")
| ("[E8]" : "[E2]")
;
Ms = "[E3]" | "[E6]" | "[E9]";
func Zero[expr] {
return expr : ("");
}
space = " ";
lexset3 = Optimize[
("1[E1]+1" : "одиннадцати")
| ("1[E1]+1" : "одиннадцать")
| ("1[E1]+1" : "одиннадцатью")
| ("1[E1]+2" : "двенадцати")
| ("1[E1]+2" : "двенадцать")
| ("1[E1]+2" : "двенадцатью")
| ("1[E1]+3" : "тринадцати")
| ("1[E1]+3" : "тринадцать")
| ("1[E1]+3" : "тринадцатью")
| ("1[E1]+4" : "четырнадцати")
| ("1[E1]+4" : "четырнадцать")
| ("1[E1]+4" : "четырнадцатью")
| ("1[E1]+5" : "пятнадцати")
| ("1[E1]+5" : "пятнадцать")
| ("1[E1]+5" : "пятнадцатью")
| ("1[E1]+6" : "шестнадцати")
| ("1[E1]+6" : "шестнадцать")
| ("1[E1]+6" : "шестнадцатью")
| ("1[E1]+7" : "семнадцати")
| ("1[E1]+7" : "семнадцать")
| ("1[E1]+7" : "семнадцатью")
| ("1[E1]+8" : "восемнадцати")
| ("1[E1]+8" : "восемнадцать")
| ("1[E1]+8" : "восемнадцатью")
| ("1[E1]+9" : "девятнадцати")
| ("1[E1]+9" : "девятнадцать")
| ("1[E1]+9" : "девятнадцатью")]
;
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
lexset2 = Optimize[
("1[E1]" : "десяти")
| ("1[E1]" : "десять")
| ("1[E1]" : "десятью")
| ("1[E2]" : "ста")
| ("1[E2]" : "сто")
| ("2[E1]" : "двадцати")
| ("2[E1]" : "двадцать")
| ("2[E1]" : "двадцатью")
| ("2[E2]" : "двести")
| ("2[E2]" : "двумстам")
| ("2[E2]" : "двумястами")
| ("2[E2]" : "двухсот")
| ("2[E2]" : "двухстах")
| ("3[E1]" : "тридцати")
| ("3[E1]" : "тридцать")
| ("3[E1]" : "тридцатью")
| ("3[E2]" : "тремстам")
| ("3[E2]" : "тремястами")
| ("3[E2]" : "трехсот")
| ("3[E2]" : "трехстах")
| ("3[E2]" : "триста")
| ("4[E1]" : "сорок")
| ("4[E1]" : "сорока")
| ("4[E2]" : "четыремстам")
| ("4[E2]" : "четыреста")
| ("4[E2]" : "четырехсот")
| ("4[E2]" : "четырехстах")
| ("4[E2]" : "четырьмястами")
| ("5[E1]" : "пятидесяти")
| ("5[E1]" : "пятьдесят")
| ("5[E1]" : "пятьюдесятью")
| ("5[E2]" : "пятисот")
| ("5[E2]" : "пятистам")
| ("5[E2]" : "пятистах")
| ("5[E2]" : "пятьсот")
| ("5[E2]" : "пятьюстами")
| ("6[E1]" : "шестидесяти")
| ("6[E1]" : "шестьдесят")
| ("6[E1]" : "шестьюдесятью")
| ("6[E2]" : "шестисот")
| ("6[E2]" : "шестистам")
| ("6[E2]" : "шестистах")
| ("6[E2]" : "шестьсот")
| ("6[E2]" : "шестьюстами")
| ("7[E1]" : "семидесяти")
| ("7[E1]" : "семьдесят")
| ("7[E1]" : "семьюдесятью")
| ("7[E2]" : "семисот")
| ("7[E2]" : "семистам")
| ("7[E2]" : "семистах")
| ("7[E2]" : "семьсот")
| ("7[E2]" : "семьюстами")
| ("8[E1]" : "восемьдесят")
| ("8[E1]" : "восьмидесяти")
| ("8[E1]" : "восьмьюдесятью")
| ("8[E2]" : "восемьсот")
| ("8[E2]" : "восемьюстами")
| ("8[E2]" : "восьмисот")
| ("8[E2]" : "восьмистам")
| ("8[E2]" : "восьмистах")
| ("8[E2]" : "восьмьюстами")
| ("9[E1]" : "девяноста")
| ("9[E1]" : "девяносто")
| ("9[E2]" : "девятисот")
| ("9[E2]" : "девятистам")
| ("9[E2]" : "девятистах")
| ("9[E2]" : "девятьсот")
| ("9[E2]" : "девятьюстами")]
;
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
lexset1 = Optimize[
("+" : "")
| ("1" : "один")
| ("1" : "одна")
| ("1" : "одни")
| ("1" : "одним")
| ("1" : "одними")
| ("1" : "одних")
| ("1" : "одно")
| ("1" : "одного")
| ("1" : "одной")
| ("1" : "одном")
| ("1" : "одному")
| ("1" : "одною")
| ("1" : "одну")
| ("2" : "два")
| ("2" : "две")
| ("2" : "двум")
| ("2" : "двумя")
| ("2" : "двух")
| ("3" : "трем")
| ("3" : "тремя")
| ("3" : "трех")
| ("3" : "три")
| ("4" : "четыре")
| ("4" : "четырем")
| ("4" : "четырех")
| ("4" : "четырьмя")
| ("5" : "пяти")
| ("5" : "пять")
| ("5" : "пятью")
| ("6" : "шести")
| ("6" : "шесть")
| ("6" : "шестью")
| ("7" : "семи")
| ("7" : "семь")
| ("7" : "семью")
| ("8" : "восемь")
| ("8" : "восьми")
| ("8" : "восьмью")
| ("9" : "девяти")
| ("9" : "девять")
| ("9" : "девятью")
| ("[E3]" : "тысяч")
| ("[E3]" : "тысяча")
| ("[E3]" : "тысячам")
| ("[E3]" : "тысячами")
| ("[E3]" : "тысячах")
| ("[E3]" : "тысяче")
| ("[E3]" : "тысячей")
| ("[E3]" : "тысячи")
| ("[E3]" : "тысячу")
| ("[E3]" : "тысячью")
| ("[E6]" : "миллион")
| ("[E6]" : "миллиона")
| ("[E6]" : "миллионам")
| ("[E6]" : "миллионами")
| ("[E6]" : "миллионах")
| ("[E6]" : "миллионе")
| ("[E6]" : "миллионов")
| ("[E6]" : "миллионом")
| ("[E6]" : "миллиону")
| ("[E6]" : "миллионы")
| ("[E9]" : "миллиард")
| ("[E9]" : "миллиарда")
| ("[E9]" : "миллиардам")
| ("[E9]" : "миллиардами")
| ("[E9]" : "миллиардах")
| ("[E9]" : "миллиарде")
| ("[E9]" : "миллиардов")
| ("[E9]" : "миллиардом")
| ("[E9]" : "миллиарду")
| ("[E9]" : "миллиарды")
| ("|0|" : "ноле")
| ("|0|" : "нолем")
| ("|0|" : "ноль")
| ("|0|" : "нолю")
| ("|0|" : "ноля")
| ("|0|" : "нуле")
| ("|0|" : "нулем")
| ("|0|" : "нуль")
| ("|0|" : "нулю")
| ("|0|" : "нуля")]
;
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
export LEX = Optimize[lex3 @ lex2 @ lex1];
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
# END LANGUAGE SPECIFIC DATA
################################################################################
# Inserts a marker after the Ms.
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
# Deletes all powers and "+".
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
# Deletes leading zeros at the beginning of a number, so that "0003" does not
# get treated as an ordinary number.
export DELETE_INITIAL_ZEROS =
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
;
NonMs = Optimize[POWERS - Ms];
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted.
export DELETE_INTERMEDIATE_ZEROS1 =
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
;
# Deletes (usually) zeros before an M, if there is no non-zero element between
# that and the previous boundary. Thus, if after the result of the rule above we
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
# zero.
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
;
# Final clean up of stray zeros.
export DELETE_REMAINING_ZEROS = Optimize[
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
;
# Applies the revaluation map. For example in English, changes [E4] to [E1] as a
# modifier of [E3].
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
# Deletes the various marks and powers in the input and output.
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
export CLEAN_SPACES = Optimize[
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
;
d = b.kDigit;
# Germanic inversion rule.
germanic =
(I["1+"] d "[E1]" D["+1"])
| (I["2+"] d "[E1]" D["+2"])
| (I["3+"] d "[E1]" D["+3"])
| (I["4+"] d "[E1]" D["+4"])
| (I["5+"] d "[E1]" D["+5"])
| (I["6+"] d "[E1]" D["+6"])
| (I["7+"] d "[E1]" D["+7"])
| (I["8+"] d "[E1]" D["+8"])
| (I["9+"] d "[E1]" D["+9"])
;
germanic_inversion =
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
;
export GERMANIC_INVERSION = SIGMA_STAR;
export ORDINAL_RESTRICTION = SIGMA_STAR;
nondigits = b.kBytes - b.kDigit;
export ORDINAL_SUFFIX = D[nondigits*];
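# Added sketch, not part of the original (auto-generated) grammar: for a
# factorization of 230 such as "2[E2]+3[E1]+0", the zero-deletion rules
# drop the final "+0", LEX verbalizes the factors, and DELETE_MARKS plus
# CLEAN_SPACES leave e.g. "двести тридцать" in the output lattice.
example_230 = Optimize[
  "2[E2]+3[E1]+0" @ DELETE_INTERMEDIATE_ZEROS2 @ LEX @ DELETE_MARKS
  @ CLEAN_SPACES];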

@ -1,177 +0,0 @@
0 ноле
0 ноль
0 нолю
0 ноля
0 нолём
0 нуле
0 нуль
0 нулю
0 нуля
0 нулём
1 один
1 одна
1 одни
1 одним
1 одними
1 одних
1 одно
1 одного
1 одной
1 одном
1 одному
1 одною
1 раз
1 одну
2 два
2 две
2 двум
2 двумя
2 двух
3 тремя
3 три
3 трём
3 трёх
4 четыре
4 четырьмя
4 четырём
4 четырёх
5 пяти
5 пять
5 пятью
6 шести
6 шесть
6 шестью
7 семи
7 семь
7 семью
8 восемь
8 восьми
8 восьмью
9 девяти
9 девять
9 девятью
10 десяти
10 десять
10 десятью
11 одиннадцати
11 одиннадцать
11 одиннадцатью
12 двенадцати
12 двенадцать
12 двенадцатью
13 тринадцати
13 тринадцать
13 тринадцатью
14 четырнадцати
14 четырнадцать
14 четырнадцатью
15 пятнадцати
15 пятнадцать
15 пятнадцатью
16 шестнадцати
16 шестнадцать
16 шестнадцатью
17 семнадцати
17 семнадцать
17 семнадцатью
18 восемнадцати
18 восемнадцать
18 восемнадцатью
19 девятнадцати
19 девятнадцать
19 девятнадцатью
20 двадцати
20 двадцать
20 двадцатью
30 тридцати
30 тридцать
30 тридцатью
40 сорок
40 сорока
50 пятидесяти
50 пятьдесят
50 пятьюдесятью
60 шестидесяти
60 шестьдесят
60 шестьюдесятью
70 семидесяти
70 семьдесят
70 семьюдесятью
80 восемьдесят
80 восьмидесяти
80 восьмьюдесятью
90 девяноста
90 девяносто
100 ста
100 сто
200 двести
200 двумстам
200 двумястами
200 двухсот
200 двухстах
300 тремястами
300 трехсот
300 триста
300 трёмстам
300 трёхстах
400 четыреста
400 четырьмястами
400 четырёмстам
400 четырёхсот
400 четырёхстах
500 пятисот
500 пятистам
500 пятистах
500 пятьсот
500 пятьюстами
600 шестисот
600 шестистам
600 шестистах
600 шестьсот
600 шестьюстами
700 семисот
700 семистам
700 семистах
700 семьсот
700 семьюстами
800 восемьсот
800 восемьюстами
800 восьмисот
800 восьмистам
800 восьмистах
800 восьмьюстами
900 девятисот
900 девятистам
900 девятистах
900 девятьсот
900 девятьюстами
1000 тысяч
1000 тысяча
1000 тысячам
1000 тысячами
1000 тысячах
1000 тысяче
1000 тысячей
1000 тысячи
1000 тысячу
1000 тысячью
1000000 миллион
1000000 миллиона
1000000 миллионам
1000000 миллионами
1000000 миллионах
1000000 миллионе
1000000 миллионов
1000000 миллионом
1000000 миллиону
1000000 миллионы
1000000000 миллиард
1000000000 миллиарда
1000000000 миллиардам
1000000000 миллиардами
1000000000 миллиардах
1000000000 миллиарде
1000000000 миллиардов
1000000000 миллиардом
1000000000 миллиарду
1000000000 миллиарды

@ -1,35 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'ru/verbalizer/numbers.grm' as n;
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
export DIGITS = digit (n.I[" "] digit)*;
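# E.g., DIGITS should read "12" digit by digit, roughly "один два" (any of
# the inflected forms licensed by CARDINAL_NUMBERS may surface).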
# Various common factorizations
two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
mixed =
(digit n.I[" "] two_digits)
| (two_digits n.I[" "] two_digits)
| (two_digits n.I[" "] three_digits)
| (two_digits n.I[" "] two_digits n.I[" "] two_digits)
;
export MIXED_NUMBERS = Optimize[mixed];
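# E.g., "123" can be factored as "1 23" and read roughly as
# "один двадцать три".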

@ -1,40 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'util/util.grm' as u;
import 'ru/verbalizer/numbers.grm' as n;
func ToNumberName[expr] {
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
return Optimize[expr @ number_name_seq];
}
d = b.kDigit;
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
by_ones = d n.I[" "];
by_twos = (d{2} @ leading_zero) n.I[" "];
by_threes = (d{3} @ leading_zero) n.I[" "];
groupings = by_twos* (by_threes | by_twos | by_ones);
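# E.g., a six-digit fractional part "456789" groups as "45 67 89": pairs
# first, closed off by a final pair, triple, or single digit.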
export FRACTIONAL_PART_UNGROUPED =
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_GROUPED =
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];

@ -1,30 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'ru/verbalizer/factorization.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
__fractional_part__ = fractional_part_unparsed;
__decimal_marker__ = ",";
export FLOAT = Optimize[
(n.CARDINAL_NUMBERS
(__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
__fractional_part__) @ l.LEXICAL_MAP]
;
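# E.g., "3,14" should verbalize roughly as "три целых четырнадцать" (or,
# digit by digit, "три целых один четыре"), with the comma resolved to one of
# the @@DECIMAL_DOT_EXPRESSION@@ readings.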

@ -1,25 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
lexical_map = StringFile['ru/verbalizer/lexical_map.tsv'];
sigma_star = b.kBytes*;
del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
export LEXICAL_MAP = Optimize[
CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
;
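# E.g., " @@MONEY_AND@@ " rewrites to " и ", while any marker mapped to
# __NULL__ in the TSV is deleted outright.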

@ -1,221 +0,0 @@
@@CONNECTOR_RANGE@@ до
@@CONNECTOR_RATIO@@ к
@@CONNECTOR_BY@@ на
@@CONNECTOR_CONSECUTIVE_YEAR@@ до
@@JANUARY@@ январь
@@JANUARY@@ январи
@@JANUARY@@ января
@@JANUARY@@ январей
@@JANUARY@@ январю
@@JANUARY@@ январям
@@JANUARY@@ январь
@@JANUARY@@ январи
@@JANUARY@@ январём
@@JANUARY@@ январями
@@JANUARY@@ январе
@@JANUARY@@ январях
@@FEBRUARY@@ февраль
@@FEBRUARY@@ феврали
@@FEBRUARY@@ февраля
@@FEBRUARY@@ февралей
@@FEBRUARY@@ февралю
@@FEBRUARY@@ февралям
@@FEBRUARY@@ февраль
@@FEBRUARY@@ феврали
@@FEBRUARY@@ февралём
@@FEBRUARY@@ февралями
@@FEBRUARY@@ феврале
@@FEBRUARY@@ февралях
@@MARCH@@ март
@@MARCH@@ марты
@@MARCH@@ марта
@@MARCH@@ мартов
@@MARCH@@ марту
@@MARCH@@ мартам
@@MARCH@@ март
@@MARCH@@ марты
@@MARCH@@ мартом
@@MARCH@@ мартами
@@MARCH@@ марте
@@MARCH@@ мартах
@@APRIL@@ апрель
@@APRIL@@ апрели
@@APRIL@@ апреля
@@APRIL@@ апрелей
@@APRIL@@ апрелю
@@APRIL@@ апрелям
@@APRIL@@ апрель
@@APRIL@@ апрели
@@APRIL@@ апрелем
@@APRIL@@ апрелями
@@APRIL@@ апреле
@@APRIL@@ апрелях
@@MAY@@ май
@@MAY@@ маи
@@MAY@@ мая
@@MAY@@ маев
@@MAY@@ маю
@@MAY@@ маям
@@MAY@@ май
@@MAY@@ маи
@@MAY@@ маем
@@MAY@@ маями
@@MAY@@ мае
@@MAY@@ маях
@@JUN@@ июнь
@@JUN@@ июни
@@JUN@@ июня
@@JUN@@ июней
@@JUN@@ июню
@@JUN@@ июням
@@JUN@@ июнь
@@JUN@@ июни
@@JUN@@ июнем
@@JUN@@ июнями
@@JUN@@ июне
@@JUN@@ июнях
@@JUL@@ июль
@@JUL@@ июли
@@JUL@@ июля
@@JUL@@ июлей
@@JUL@@ июлю
@@JUL@@ июлям
@@JUL@@ июль
@@JUL@@ июли
@@JUL@@ июлем
@@JUL@@ июлями
@@JUL@@ июле
@@JUL@@ июлях
@@AUGUST@@ август
@@AUGUST@@ августы
@@AUGUST@@ августа
@@AUGUST@@ августов
@@AUGUST@@ августу
@@AUGUST@@ августам
@@AUGUST@@ август
@@AUGUST@@ августы
@@AUGUST@@ августом
@@AUGUST@@ августами
@@AUGUST@@ августе
@@AUGUST@@ августах
@@SEPTEMBER@@ сентябрь
@@SEPTEMBER@@ сентябри
@@SEPTEMBER@@ сентября
@@SEPTEMBER@@ сентябрей
@@SEPTEMBER@@ сентябрю
@@SEPTEMBER@@ сентябрям
@@SEPTEMBER@@ сентябрь
@@SEPTEMBER@@ сентябри
@@SEPTEMBER@@ сентябрём
@@SEPTEMBER@@ сентябрями
@@SEPTEMBER@@ сентябре
@@SEPTEMBER@@ сентябрях
@@OCTOBER@@ октябрь
@@OCTOBER@@ октябри
@@OCTOBER@@ октября
@@OCTOBER@@ октябрей
@@OCTOBER@@ октябрю
@@OCTOBER@@ октябрям
@@OCTOBER@@ октябрь
@@OCTOBER@@ октябри
@@OCTOBER@@ октябрём
@@OCTOBER@@ октябрями
@@OCTOBER@@ октябре
@@OCTOBER@@ октябрях
@@NOVEMBER@@ ноябрь
@@NOVEMBER@@ ноябри
@@NOVEMBER@@ ноября
@@NOVEMBER@@ ноябрей
@@NOVEMBER@@ ноябрю
@@NOVEMBER@@ ноябрям
@@NOVEMBER@@ ноябрь
@@NOVEMBER@@ ноябри
@@NOVEMBER@@ ноябрём
@@NOVEMBER@@ ноябрями
@@NOVEMBER@@ ноябре
@@NOVEMBER@@ ноябрях
@@DECEMBER@@ декабрь
@@DECEMBER@@ декабри
@@DECEMBER@@ декабря
@@DECEMBER@@ декабрей
@@DECEMBER@@ декабрю
@@DECEMBER@@ декабрям
@@DECEMBER@@ декабрь
@@DECEMBER@@ декабри
@@DECEMBER@@ декабрём
@@DECEMBER@@ декабрями
@@DECEMBER@@ декабре
@@DECEMBER@@ декабрях
@@MINUS@@ минус
@@DECIMAL_DOT_EXPRESSION@@ целая
@@DECIMAL_DOT_EXPRESSION@@ целой
@@DECIMAL_DOT_EXPRESSION@@ целой
@@DECIMAL_DOT_EXPRESSION@@ целую
@@DECIMAL_DOT_EXPRESSION@@ целой
@@DECIMAL_DOT_EXPRESSION@@ целой
@@DECIMAL_DOT_EXPRESSION@@ целым
@@DECIMAL_DOT_EXPRESSION@@ целыми
@@DECIMAL_DOT_EXPRESSION@@ целых
@@DECIMAL_DOT_EXPRESSION@@ целых
@@URL_DOT_EXPRESSION@@ точка
@@PERIOD@@ точка
@@DECIMAL_EXPONENT@@ умножить на десять в степени
@@COLON@@ двоеточие
@@SLASH@@ косая черта
@@PASSWORD@@ пароль
@@AT@@ собака
@@PORT@@ порт
@@QUESTION_MARK@@ вопросительный знак
@@HASH@@ решётка
@@HASH@@ решетка
@@MONEY_AND@@ и
@@AND@@ и
@@PHONE_PLUS@@ плюс
@@ARITHMETIC_PLUS@@ плюс
@@PHONE_EXTENSION@@ добавочный номер
@@TIME_AM@@ утра
@@TIME_PM@@ вечера
@@HOUR@@ час
@@HOUR@@ часа
@@HOUR@@ часам
@@HOUR@@ часами
@@HOUR@@ часах
@@HOUR@@ часе
@@HOUR@@ часов
@@HOUR@@ часом
@@HOUR@@ часу
@@HOUR@@ часы
@@MINUTE@@ минут
@@MINUTE@@ минута
@@MINUTE@@ минутам
@@MINUTE@@ минутами
@@MINUTE@@ минутах
@@MINUTE@@ минуте
@@MINUTE@@ минутой
@@MINUTE@@ минутою
@@MINUTE@@ минуту
@@MINUTE@@ минуты
@@TIME_AFTER@@ __NULL__
@@TIME_BEFORE_PRE@@ без
@@TIME_QUARTER@@ четверть
@@TIME_QUARTER@@ четверти
@@TIME_HALF@@ половина
@@TIME_HALF@@ половины
@@TIME_HALF@@ половину
@@TIME_HALF@@ половин
@@TIME_HALF@@ половине
@@TIME_HALF@@ половинам
@@TIME_HALF@@ половиной
@@TIME_HALF@@ половинами
@@TIME_HALF@@ половинах
@@PERCENT@@ процент
@@PERCENT@@ процента
@@PERCENT@@ процентам
@@PERCENT@@ процентами
@@PERCENT@@ процентах
@@PERCENT@@ проценте
@@PERCENT@@ процентов
@@PERCENT@@ процентом
@@PERCENT@@ проценту
@@PERCENT@@ проценты
@@PERCENT@@ проценты

@ -1,34 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'ru/verbalizer/float.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
float = f.FLOAT;
card = n.CARDINAL_NUMBERS;
number = card | float;
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
times = "*" : " @@ARITHMETIC_TIMES@@ ";
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
operator = plus | times | minus | division;
percent = "%" : " @@PERCENT@@";
export ARITHMETIC =
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
;
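# E.g., "2+3" should come out roughly as "два плюс три", and "5%" as
# "пять процентов" (the exact case forms are left open by the lexical map).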

@ -1,78 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'ru/verbalizer/extra_numbers.grm' as e;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
import 'ru/verbalizer/spelled.grm' as s;
letter = b.kAlpha | c.kCyrillicAlpha;
dash = "-";
word = letter+;
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
post_word_symbol =
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
("*" : "@@STAR@@")
;
pre_word_symbol =
("@" : "@@AT@@") |
("/" : "@@SLASH@@") |
("#" : "@@HASH@@")
;
post_word = possibly_split_word n.I[" "] post_word_symbol;
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
## Number/digit sequence combos, maybe with a dash
spelled_word = word @ s.SPELLED_NO_LETTER;
word_number =
(word | spelled_word)
(n.I[" "] | (dash : " "))
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
;
number_word =
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
(n.I[" "] | (dash : " "))
(word | spelled_word)
;
## Two-digit year.
# Note that in this case to be fair we really have to allow ordinals too since
# in some languages that's what you would have.
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
miscellaneous = Optimize[
possibly_split_word
| post_word
| pre_word
| word_number
| number_word
| two_digit_year
| dot_com
];
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
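# E.g., ".com" reads as "точка com", and "'95" is read as a two-digit year,
# roughly "девяносто пять".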

@ -1,44 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
card = n.CARDINAL_NUMBERS;
__currency__ = StringFile['ru/verbalizer/money.tsv'];
d = b.kDigit;
D = d - "0";
cents = ((n.D["0"] | D) d) @ card;
# Only the dollar is covered here, as in the English verbalizer tests. Other
# currencies will need to be added.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
usd_min = Project["usd_min" @ __currency__, 'output'];
and = " @@MONEY_AND@@ " | " ";
dollar1 =
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
;
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
dollar3 = n.D["$"] card n.I[" " usd_maj];
dollar = Optimize[dollar1 | dollar2 | dollar3];
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
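# E.g., "$2.50" matches dollar1 and should read roughly as
# "два доллара и пятьдесят центов"; "$3" matches dollar3.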

@ -1,24 +0,0 @@
usd_maj доллара
usd_maj долларами
usd_maj долларам
usd_maj долларах
usd_maj долларе
usd_maj долларов
usd_maj долларом
usd_maj доллар
usd_maj доллар
usd_maj доллару
usd_maj доллары
usd_maj доллары
usd_min цент
usd_min цент
usd_min цента
usd_min центам
usd_min центами
usd_min центах
usd_min центе
usd_min центов
usd_min центом
usd_min центу
usd_min центы
usd_min центы

@ -1,166 +0,0 @@
нуль
ноль
один
два
две
три
четыре
пять
шесть
семь
восемь
девять
десять
одиннадцать
двенадцать
тринадцать
четырнадцать
пятнадцать
шестнадцать
семнадцать
восемнадцать
девятнадцать
двадцать
тридцать
сорок
пятьдесят
шестьдесят
семьдесят
восемьдесят
девяносто
сто
двести
триста
четыреста
пятьсот
шестьсот
семьсот
восемьсот
девятьсот
тысячи
тысяч
тысяча
миллионов
миллион
миллиона
миллиардов
миллиард
миллиарда
первая
первого
первое
первый
вторая
второе
второй
третий
третье
третья
четвертая
четвертое
четвертой
пятая
пятое
пятой
шестая
шестое
шестой
седьмая
седьмое
седьмой
восьмая
восьмое
восьмой
девятая
девятое
девятой
десятая
десятое
десятой
одиннадцатая
одиннадцатое
одиннадцатой
двенадцатая
двенадцатое
двенадцатой
тринадцатая
тринадцатое
тринадцатой
четырнадцатая
четырнадцатое
четырнадцатой
пятнадцатая
пятнадцатое
пятнадцатой
шестнадцатая
шестнадцатое
шестнадцатой
семнадцатая
семнадцатое
семнадцатой
восемнадцатая
восемнадцатое
восемнадцатой
девятнадцатая
девятнадцатое
девятнадцатой
двадцатая
двадцатое
двадцатой
тридцатая
тридцатое
тридцатой
сороковая
сороковое
сороковой
пятидесятая
пятидесятое
пятидесятой
шестидесятая
шестидесятое
шестидесятой
семидесятая
семидесятое
семидесятой
восьмидесятая
восьмидесятое
восьмидесятой
девяностая
девяностое
девяностой
сотая
сотое
сотой
двухсотая
двухсотое
двухсотой
трехсотая
трехсотое
трехсотой
четырехсотая
четырехсотое
четырехсотой
пятисотая
пятисотое
пятисотой
шестисотая
шестисотое
шестисотой
семисотая
семисотое
семисотой
восьмисотая
восьмисотое
восьмисотой
девятисотая
девятисотое
девятисотой
тысячная
тысячное
тысячной
миллионная
миллионное
миллионной
миллиардная
миллиардное
миллиардной

@ -1,48 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Russian minimally supervised number grammar.
#
# Supports cardinals and ordinals in all inflected forms.
#
# The language-specific acceptor G was compiled with digit, teen, decade,
# century, and big power-of-ten preterminals. The lexicon transducer is
# highly ambiguous, but no LM is used.
import 'util/arithmetic.grm' as a;
# Intersects the universal factorization transducer (F) with the
# language-specific acceptor (G).
d = a.DELTA_STAR;
f = a.IARITHMETIC_RESTRICTED;
g = LoadFst['ru/verbalizer/g.fst'];
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];
# Compiles lexicon transducers (L).
cardinal_name = StringFile['ru/verbalizer/cardinals.tsv'];
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
ordinal_name = StringFile['ru/verbalizer/ordinals.tsv'];
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
# Composes L with the leaf transducer (P), then composes that with FG.
p = a.LEAVES;
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
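# Sketch: by analogy with test1 above, "21" should factor to "(+ 20 1 +)" and
# then verbalize to forms like "двадцать один" (cardinal) or
# "двадцать первый" (ordinal).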

@ -1,68 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'ru/verbalizer/number_names.grm' as n;
import 'universal/thousands_punct.grm' as t;
import 'util/byte.grm' as b;
nominatives = StringFile['ru/verbalizer/nominatives.tsv'];
sigma_star = b.kBytes*;
nominative_filter =
CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star]
;
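# (Sketch of the intent: the <-1> weight rewards paths that realize a listed
# nominative form as a whole word, so those readings win under shortest path.)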
cardinal = n.CARDINAL_NUMBER_NAME;
ordinal = n.ORDINAL_NUMBER_NAME;
# Putting these here since this grammar gets incorporated by all the others.
func I[expr] {
return "" : expr;
}
func D[expr] {
return expr : "";
}
# Since we know this is the default for Russian, it's fair game to set it.
separators = t.dot_thousands | t.no_delimiter;
export CARDINAL_NUMBERS = Optimize[
separators
@ cardinal
];
export ORDINAL_NUMBERS_UNMARKED = Optimize[
separators
@ ordinal
];
endings = StringFile['ru/verbalizer/ordinal_endings.tsv'];
not_dash = (b.kBytes - "-")+;
del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star];
# Needs nominative_filter here if we take out Kyle's models.
export ORDINAL_NUMBERS_MARKED = Optimize[
Optimize[Optimize[separators @ ordinal] "-" not_dash]
@ Optimize[sigma_star endings]
@ del_ending]
;
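# E.g., for input "21-й" the endings table keeps only ordinal readings whose
# ending is compatible with "-й", roughly "двадцать первый", and del_ending
# then drops the written suffix.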
export ORDINAL_NUMBERS =
Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
;

@ -1,133 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Grammar for things built mostly on numbers.
import 'ru/verbalizer/factorization.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
num = n.CARDINAL_NUMBERS;
ord = n.ORDINAL_NUMBERS_UNMARKED;
digits = f.FRACTIONAL_PART_UNGROUPED;
# Various symbols.
plus = "+" : "@@ARITHMETIC_PLUS@@";
minus = "-" : "@@ARITHMETIC_MINUS@@";
slash = "/" : "@@SLASH@@";
dot = "." : "@@URL_DOT_EXPRESSION@@";
dash = "-" : "@@DASH@@";
equals = "=" : "@@ARITHMETIC_EQUALS@@";
degree = "°" : "@@DEGREE@@";
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
power = "^" : "@@DECIMAL_EXPONENT@@";
square_root = "√" : "@@SQUARE_ROOT@@";
percent = "%" : "@@PERCENT@@";
# Safe roman numbers.
# NB: Do not change the formatting here. NO_EDIT must be on the same
# line as the path.
rfile =
'universal/roman_numerals.tsv' # NO_EDIT
;
roman = StringFile[rfile];
## Main categories.
cat_dot_number =
num
n.I[" "] dot n.I[" "] num
(n.I[" "] dot n.I[" "] num)+
;
cat_slash_number =
num
n.I[" "] slash n.I[" "] num
(n.I[" "] slash n.I[" "] num)*
;
cat_dash_number =
num
n.I[" "] dash n.I[" "] num
(n.I[" "] dash n.I[" "] num)*
;
cat_signed_number = ((plus | minus) n.I[" "])? num;
cat_degree = cat_signed_number n.I[" "] degree;
cat_country_code = plus n.I[" "] (num | digits);
cat_math_operations =
plus
| minus
| division
| times
| equals
| percent
| power
| square_root
;
# Roman numbers are often either cardinals or ordinals in various languages.
cat_roman = roman @ (num | ord);
# Allow
#
# number:number
# number-number
#
# to just be
#
# number number.
cat_number_number =
num ((":" | "-") : " ") num
;
# Some additional readings for these symbols.
cat_additional_readings =
("/" : "@@PER@@") |
("+" : "@@AND@@") |
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
("*" : "@@STAR@@") |
("x" : ("x" | "@@CONNECTOR_BY@@")) |
("@" : "@@AT@@")
;
numbers_plus = Optimize[
cat_dot_number
| cat_slash_number
| cat_dash_number
| cat_signed_number
| cat_degree
| cat_country_code
| cat_math_operations
| cat_roman
| cat_number_number
| cat_additional_readings
];
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
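# E.g., "+7" should read as "плюс семь" (country code), "3:2" as plain
# "три два", and a roman numeral like "XIV" as the cardinal or ordinal for
# 14, assuming the universal table covers it.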

@ -1,39 +0,0 @@
ая-ая
ого-го
ьего-го
ьего-его
ьей-ей
ьему-ему
ьем-ем
ое-е
ые-е
ье-е
ий-ий
ьими-ими
ьим-им
ьих-их
ьи-и
ий-й
ой-й
ый-й
ыми-ми
ьими-ми
ому-му
ьему-му
ого-ого
ое-ое
ой-ой
ом-ом
ому-ому
ую-ую
ых-х
ьих-х
ые-ые
ый-ый
ыми-ыми
ым-ым
ых-ых
ую-ю
ью-ю
ая-я
ья-я

@ -1,804 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# AUTOMATICALLY GENERATED: DO NOT EDIT.
import 'util/byte.grm' as b;
# Utilities for insertion and deletion.
func I[expr] {
return "" : expr;
}
func D[expr] {
return expr : "";
}
# Powers of base 10.
export POWERS =
"[E15]"
| "[E14]"
| "[E13]"
| "[E12]"
| "[E11]"
| "[E10]"
| "[E9]"
| "[E8]"
| "[E7]"
| "[E6]"
| "[E5]"
| "[E4]"
| "[E3]"
| "[E2]"
| "[E1]"
;
export SIGMA = b.kBytes | POWERS;
export SIGMA_STAR = SIGMA*;
export SIGMA_PLUS = SIGMA+;
################################################################################
# BEGIN LANGUAGE SPECIFIC DATA
revaluations =
("[E4]" : "[E1]")
| ("[E5]" : "[E2]")
| ("[E7]" : "[E1]")
| ("[E8]" : "[E2]")
;
Ms = "[E3]" | "[E6]" | "[E9]";
func Zero[expr] {
return expr : ("");
}
space = " ";
lexset3 = Optimize[
("1[E1]+1" : "одиннадцатая@")
| ("1[E1]+1" : "одиннадцати")
| ("1[E1]+1" : "одиннадцатого@")
| ("1[E1]+1" : "одиннадцатое@")
| ("1[E1]+1" : "одиннадцатой@")
| ("1[E1]+1" : "одиннадцатом@")
| ("1[E1]+1" : "одиннадцатому@")
| ("1[E1]+1" : "одиннадцатую@")
| ("1[E1]+1" : "одиннадцатые@")
| ("1[E1]+1" : "одиннадцатый@")
| ("1[E1]+1" : "одиннадцатым@")
| ("1[E1]+1" : "одиннадцатыми@")
| ("1[E1]+1" : "одиннадцатых@")
| ("1[E1]+1" : "одиннадцать")
| ("1[E1]+1" : "одиннадцатью")
| ("1[E1]+2" : "двенадцатая@")
| ("1[E1]+2" : "двенадцати")
| ("1[E1]+2" : "двенадцатого@")
| ("1[E1]+2" : "двенадцатое@")
| ("1[E1]+2" : "двенадцатой@")
| ("1[E1]+2" : "двенадцатом@")
| ("1[E1]+2" : "двенадцатому@")
| ("1[E1]+2" : "двенадцатую@")
| ("1[E1]+2" : "двенадцатые@")
| ("1[E1]+2" : "двенадцатый@")
| ("1[E1]+2" : "двенадцатым@")
| ("1[E1]+2" : "двенадцатыми@")
| ("1[E1]+2" : "двенадцатых@")
| ("1[E1]+2" : "двенадцать")
| ("1[E1]+2" : "двенадцатью")
| ("1[E1]+3" : "тринадцатая@")
| ("1[E1]+3" : "тринадцати")
| ("1[E1]+3" : "тринадцатого@")
| ("1[E1]+3" : "тринадцатое@")
| ("1[E1]+3" : "тринадцатой@")
| ("1[E1]+3" : "тринадцатом@")
| ("1[E1]+3" : "тринадцатому@")
| ("1[E1]+3" : "тринадцатую@")
| ("1[E1]+3" : "тринадцатые@")
| ("1[E1]+3" : "тринадцатый@")
| ("1[E1]+3" : "тринадцатым@")
| ("1[E1]+3" : "тринадцатыми@")
| ("1[E1]+3" : "тринадцатых@")
| ("1[E1]+3" : "тринадцать")
| ("1[E1]+3" : "тринадцатью")
| ("1[E1]+4" : "четырнадцатая@")
| ("1[E1]+4" : "четырнадцати")
| ("1[E1]+4" : "четырнадцатого@")
| ("1[E1]+4" : "четырнадцатое@")
| ("1[E1]+4" : "четырнадцатой@")
| ("1[E1]+4" : "четырнадцатом@")
| ("1[E1]+4" : "четырнадцатому@")
| ("1[E1]+4" : "четырнадцатую@")
| ("1[E1]+4" : "четырнадцатые@")
| ("1[E1]+4" : "четырнадцатый@")
| ("1[E1]+4" : "четырнадцатым@")
| ("1[E1]+4" : "четырнадцатыми@")
| ("1[E1]+4" : "четырнадцатых@")
| ("1[E1]+4" : "четырнадцать")
| ("1[E1]+4" : "четырнадцатью")
| ("1[E1]+5" : "пятнадцатая@")
| ("1[E1]+5" : "пятнадцати")
| ("1[E1]+5" : "пятнадцатого@")
| ("1[E1]+5" : "пятнадцатое@")
| ("1[E1]+5" : "пятнадцатой@")
| ("1[E1]+5" : "пятнадцатом@")
| ("1[E1]+5" : "пятнадцатому@")
| ("1[E1]+5" : "пятнадцатую@")
| ("1[E1]+5" : "пятнадцатые@")
| ("1[E1]+5" : "пятнадцатый@")
| ("1[E1]+5" : "пятнадцатым@")
| ("1[E1]+5" : "пятнадцатыми@")
| ("1[E1]+5" : "пятнадцатых@")
| ("1[E1]+5" : "пятнадцать")
| ("1[E1]+5" : "пятнадцатью")
| ("1[E1]+6" : "шестнадцатая@")
| ("1[E1]+6" : "шестнадцати")
| ("1[E1]+6" : "шестнадцатого@")
| ("1[E1]+6" : "шестнадцатое@")
| ("1[E1]+6" : "шестнадцатой@")
| ("1[E1]+6" : "шестнадцатом@")
| ("1[E1]+6" : "шестнадцатому@")
| ("1[E1]+6" : "шестнадцатую@")
| ("1[E1]+6" : "шестнадцатые@")
| ("1[E1]+6" : "шестнадцатый@")
| ("1[E1]+6" : "шестнадцатым@")
| ("1[E1]+6" : "шестнадцатыми@")
| ("1[E1]+6" : "шестнадцатых@")
| ("1[E1]+6" : "шестнадцать")
| ("1[E1]+6" : "шестнадцатью")
| ("1[E1]+7" : "семнадцатая@")
| ("1[E1]+7" : "семнадцати")
| ("1[E1]+7" : "семнадцатого@")
| ("1[E1]+7" : "семнадцатое@")
| ("1[E1]+7" : "семнадцатой@")
| ("1[E1]+7" : "семнадцатом@")
| ("1[E1]+7" : "семнадцатому@")
| ("1[E1]+7" : "семнадцатую@")
| ("1[E1]+7" : "семнадцатые@")
| ("1[E1]+7" : "семнадцатый@")
| ("1[E1]+7" : "семнадцатым@")
| ("1[E1]+7" : "семнадцатыми@")
| ("1[E1]+7" : "семнадцатых@")
| ("1[E1]+7" : "семнадцать")
| ("1[E1]+7" : "семнадцатью")
| ("1[E1]+8" : "восемнадцатая@")
| ("1[E1]+8" : "восемнадцати")
| ("1[E1]+8" : "восемнадцатого@")
| ("1[E1]+8" : "восемнадцатое@")
| ("1[E1]+8" : "восемнадцатой@")
| ("1[E1]+8" : "восемнадцатом@")
| ("1[E1]+8" : "восемнадцатому@")
| ("1[E1]+8" : "восемнадцатую@")
| ("1[E1]+8" : "восемнадцатые@")
| ("1[E1]+8" : "восемнадцатый@")
| ("1[E1]+8" : "восемнадцатым@")
| ("1[E1]+8" : "восемнадцатыми@")
| ("1[E1]+8" : "восемнадцатых@")
| ("1[E1]+8" : "восемнадцать")
| ("1[E1]+8" : "восемнадцатью")
| ("1[E1]+9" : "девятнадцатая@")
| ("1[E1]+9" : "девятнадцати")
| ("1[E1]+9" : "девятнадцатого@")
| ("1[E1]+9" : "девятнадцатое@")
| ("1[E1]+9" : "девятнадцатой@")
| ("1[E1]+9" : "девятнадцатом@")
| ("1[E1]+9" : "девятнадцатому@")
| ("1[E1]+9" : "девятнадцатую@")
| ("1[E1]+9" : "девятнадцатые@")
| ("1[E1]+9" : "девятнадцатый@")
| ("1[E1]+9" : "девятнадцатым@")
| ("1[E1]+9" : "девятнадцатыми@")
| ("1[E1]+9" : "девятнадцатых@")
| ("1[E1]+9" : "девятнадцать")
| ("1[E1]+9" : "девятнадцатью")]
;
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
lexset2 = Optimize[
("1[E1]" : "десятая@")
| ("1[E1]" : "десяти")
| ("1[E1]" : "десятого@")
| ("1[E1]" : "десятое@")
| ("1[E1]" : "десятой@")
| ("1[E1]" : "десятом@")
| ("1[E1]" : "десятому@")
| ("1[E1]" : "десятую@")
| ("1[E1]" : "десятые@")
| ("1[E1]" : "десятый@")
| ("1[E1]" : "десятым@")
| ("1[E1]" : "десятыми@")
| ("1[E1]" : "десятых@")
| ("1[E1]" : "десять")
| ("1[E1]" : "десятью")
| ("1[E2]" : "сотая@")
| ("1[E2]" : "сотого@")
| ("1[E2]" : "сотое@")
| ("1[E2]" : "сотой@")
| ("1[E2]" : "сотом@")
| ("1[E2]" : "сотому@")
| ("1[E2]" : "сотую@")
| ("1[E2]" : "сотые@")
| ("1[E2]" : "сотый@")
| ("1[E2]" : "сотым@")
| ("1[E2]" : "сотыми@")
| ("1[E2]" : "сотых@")
| ("1[E2]" : "ста")
| ("1[E2]" : "сто")
| ("1[E3]" : "тысячная@")
| ("1[E3]" : "тысячного@")
| ("1[E3]" : "тысячное@")
| ("1[E3]" : "тысячной@")
| ("1[E3]" : "тысячном@")
| ("1[E3]" : "тысячному@")
| ("1[E3]" : "тысячную@")
| ("1[E3]" : "тысячные@")
| ("1[E3]" : "тысячный@")
| ("1[E3]" : "тысячным@")
| ("1[E3]" : "тысячными@")
| ("1[E3]" : "тысячных@")
| ("1[E6]" : "миллионная@")
| ("1[E6]" : "миллионного@")
| ("1[E6]" : "миллионное@")
| ("1[E6]" : "миллионной@")
| ("1[E6]" : "миллионном@")
| ("1[E6]" : "миллионному@")
| ("1[E6]" : "миллионную@")
| ("1[E6]" : "миллионные@")
| ("1[E6]" : "миллионный@")
| ("1[E6]" : "миллионным@")
| ("1[E6]" : "миллионными@")
| ("1[E6]" : "миллионных@")
| ("1[E9]" : "миллиардная@")
| ("1[E9]" : "миллиардного@")
| ("1[E9]" : "миллиардное@")
| ("1[E9]" : "миллиардной@")
| ("1[E9]" : "миллиардном@")
| ("1[E9]" : "миллиардному@")
| ("1[E9]" : "миллиардную@")
| ("1[E9]" : "миллиардные@")
| ("1[E9]" : "миллиардный@")
| ("1[E9]" : "миллиардным@")
| ("1[E9]" : "миллиардными@")
| ("1[E9]" : "миллиардных@")
| ("2[E1]" : "двадцатая@")
| ("2[E1]" : "двадцати")
| ("2[E1]" : "двадцатого@")
| ("2[E1]" : "двадцатое@")
| ("2[E1]" : "двадцатой@")
| ("2[E1]" : "двадцатом@")
| ("2[E1]" : "двадцатому@")
| ("2[E1]" : "двадцатую@")
| ("2[E1]" : "двадцатые@")
| ("2[E1]" : "двадцатый@")
| ("2[E1]" : "двадцатым@")
| ("2[E1]" : "двадцатыми@")
| ("2[E1]" : "двадцатых@")
| ("2[E1]" : "двадцать")
| ("2[E1]" : "двадцатью")
| ("2[E2]" : "двести")
| ("2[E2]" : "двумстам")
| ("2[E2]" : "двумястами")
| ("2[E2]" : "двухсот")
| ("2[E2]" : "двухсотая@")
| ("2[E2]" : "двухсотого@")
| ("2[E2]" : "двухсотое@")
| ("2[E2]" : "двухсотой@")
| ("2[E2]" : "двухсотом@")
| ("2[E2]" : "двухсотому@")
| ("2[E2]" : "двухсотую@")
| ("2[E2]" : "двухсотые@")
| ("2[E2]" : "двухсотый@")
| ("2[E2]" : "двухсотым@")
| ("2[E2]" : "двухсотыми@")
| ("2[E2]" : "двухсотых@")
| ("2[E2]" : "двухстах")
| ("3[E1]" : "тридцатая@")
| ("3[E1]" : "тридцати")
| ("3[E1]" : "тридцатого@")
| ("3[E1]" : "тридцатое@")
| ("3[E1]" : "тридцатой@")
| ("3[E1]" : "тридцатом@")
| ("3[E1]" : "тридцатому@")
| ("3[E1]" : "тридцатую@")
| ("3[E1]" : "тридцатые@")
| ("3[E1]" : "тридцатый@")
| ("3[E1]" : "тридцатым@")
| ("3[E1]" : "тридцатыми@")
| ("3[E1]" : "тридцатых@")
| ("3[E1]" : "тридцать")
| ("3[E1]" : "тридцатью")
| ("3[E2]" : "тремстам")
| ("3[E2]" : "тремястами")
| ("3[E2]" : "трехсот")
| ("3[E2]" : "трехсотая@")
| ("3[E2]" : "трехсотого@")
| ("3[E2]" : "трехсотое@")
| ("3[E2]" : "трехсотой@")
| ("3[E2]" : "трехсотом@")
| ("3[E2]" : "трехсотому@")
| ("3[E2]" : "трехсотую@")
| ("3[E2]" : "трехсотые@")
| ("3[E2]" : "трехсотый@")
| ("3[E2]" : "трехсотым@")
| ("3[E2]" : "трехсотыми@")
| ("3[E2]" : "трехсотых@")
| ("3[E2]" : "трехстах")
| ("3[E2]" : "триста")
| ("4[E1]" : "сорок")
| ("4[E1]" : "сорока")
| ("4[E1]" : "сороковая@")
| ("4[E1]" : "сорокового@")
| ("4[E1]" : "сороковое@")
| ("4[E1]" : "сороковой@")
| ("4[E1]" : "сороковом@")
| ("4[E1]" : "сороковому@")
| ("4[E1]" : "сороковую@")
| ("4[E1]" : "сороковые@")
| ("4[E1]" : "сороковым@")
| ("4[E1]" : "сороковыми@")
| ("4[E1]" : "сороковых@")
| ("4[E2]" : "четыремстам")
| ("4[E2]" : "четыреста")
| ("4[E2]" : "четырехсот")
| ("4[E2]" : "четырехсотая@")
| ("4[E2]" : "четырехсотого@")
| ("4[E2]" : "четырехсотое@")
| ("4[E2]" : "четырехсотой@")
| ("4[E2]" : "четырехсотом@")
| ("4[E2]" : "четырехсотому@")
| ("4[E2]" : "четырехсотую@")
| ("4[E2]" : "четырехсотые@")
| ("4[E2]" : "четырехсотый@")
| ("4[E2]" : "четырехсотым@")
| ("4[E2]" : "четырехсотыми@")
| ("4[E2]" : "четырехсотых@")
| ("4[E2]" : "четырехстах")
| ("4[E2]" : "четырьмястами")
| ("5[E1]" : "пятидесятая@")
| ("5[E1]" : "пятидесяти")
| ("5[E1]" : "пятидесятого@")
| ("5[E1]" : "пятидесятое@")
| ("5[E1]" : "пятидесятой@")
| ("5[E1]" : "пятидесятом@")
| ("5[E1]" : "пятидесятому@")
| ("5[E1]" : "пятидесятую@")
| ("5[E1]" : "пятидесятые@")
| ("5[E1]" : "пятидесятый@")
| ("5[E1]" : "пятидесятым@")
| ("5[E1]" : "пятидесятыми@")
| ("5[E1]" : "пятидесятых@")
| ("5[E1]" : "пятьдесят")
| ("5[E1]" : "пятьюдесятью")
| ("5[E2]" : "пятисот")
| ("5[E2]" : "пятисотая@")
| ("5[E2]" : "пятисотого@")
| ("5[E2]" : "пятисотое@")
| ("5[E2]" : "пятисотой@")
| ("5[E2]" : "пятисотом@")
| ("5[E2]" : "пятисотому@")
| ("5[E2]" : "пятисотую@")
| ("5[E2]" : "пятисотые@")
| ("5[E2]" : "пятисотый@")
| ("5[E2]" : "пятисотым@")
| ("5[E2]" : "пятисотыми@")
| ("5[E2]" : "пятисотых@")
| ("5[E2]" : "пятистам")
| ("5[E2]" : "пятистах")
| ("5[E2]" : "пятьсот")
| ("5[E2]" : "пятьюстами")
| ("6[E1]" : "шестидесятая@")
| ("6[E1]" : "шестидесяти")
| ("6[E1]" : "шестидесятого@")
| ("6[E1]" : "шестидесятое@")
| ("6[E1]" : "шестидесятой@")
| ("6[E1]" : "шестидесятом@")
| ("6[E1]" : "шестидесятому@")
| ("6[E1]" : "шестидесятую@")
| ("6[E1]" : "шестидесятые@")
| ("6[E1]" : "шестидесятый@")
| ("6[E1]" : "шестидесятым@")
| ("6[E1]" : "шестидесятыми@")
| ("6[E1]" : "шестидесятых@")
| ("6[E1]" : "шестьдесят")
| ("6[E1]" : "шестьюдесятью")
| ("6[E2]" : "шестисот")
| ("6[E2]" : "шестисотая@")
| ("6[E2]" : "шестисотого@")
| ("6[E2]" : "шестисотое@")
| ("6[E2]" : "шестисотой@")
| ("6[E2]" : "шестисотом@")
| ("6[E2]" : "шестисотому@")
| ("6[E2]" : "шестисотую@")
| ("6[E2]" : "шестисотые@")
| ("6[E2]" : "шестисотый@")
| ("6[E2]" : "шестисотым@")
| ("6[E2]" : "шестисотыми@")
| ("6[E2]" : "шестисотых@")
| ("6[E2]" : "шестистам")
| ("6[E2]" : "шестистах")
| ("6[E2]" : "шестьсот")
| ("6[E2]" : "шестьюстами")
| ("7[E1]" : "семидесятая@")
| ("7[E1]" : "семидесяти")
| ("7[E1]" : "семидесятого@")
| ("7[E1]" : "семидесятое@")
| ("7[E1]" : "семидесятой@")
| ("7[E1]" : "семидесятом@")
| ("7[E1]" : "семидесятому@")
| ("7[E1]" : "семидесятую@")
| ("7[E1]" : "семидесятые@")
| ("7[E1]" : "семидесятый@")
| ("7[E1]" : "семидесятым@")
| ("7[E1]" : "семидесятыми@")
| ("7[E1]" : "семидесятых@")
| ("7[E1]" : "семьдесят")
| ("7[E1]" : "семьюдесятью")
| ("7[E2]" : "семисот")
| ("7[E2]" : "семисотая@")
| ("7[E2]" : "семисотого@")
| ("7[E2]" : "семисотое@")
| ("7[E2]" : "семисотой@")
| ("7[E2]" : "семисотом@")
| ("7[E2]" : "семисотому@")
| ("7[E2]" : "семисотую@")
| ("7[E2]" : "семисотые@")
| ("7[E2]" : "семисотый@")
| ("7[E2]" : "семисотым@")
| ("7[E2]" : "семисотыми@")
| ("7[E2]" : "семисотых@")
| ("7[E2]" : "семистам")
| ("7[E2]" : "семистах")
| ("7[E2]" : "семьсот")
| ("7[E2]" : "семьюстами")
| ("8[E1]" : "восемьдесят")
| ("8[E1]" : "восьмидесятая@")
| ("8[E1]" : "восьмидесяти")
| ("8[E1]" : "восьмидесятого@")
| ("8[E1]" : "восьмидесятое@")
| ("8[E1]" : "восьмидесятой@")
| ("8[E1]" : "восьмидесятом@")
| ("8[E1]" : "восьмидесятому@")
| ("8[E1]" : "восьмидесятую@")
| ("8[E1]" : "восьмидесятые@")
| ("8[E1]" : "восьмидесятый@")
| ("8[E1]" : "восьмидесятым@")
| ("8[E1]" : "восьмидесятыми@")
| ("8[E1]" : "восьмидесятых@")
| ("8[E1]" : "восьмьюдесятью")
| ("8[E2]" : "восемьсот")
| ("8[E2]" : "восемьюстами")
| ("8[E2]" : "восьмисот")
| ("8[E2]" : "восьмисотая@")
| ("8[E2]" : "восьмисотого@")
| ("8[E2]" : "восьмисотое@")
| ("8[E2]" : "восьмисотой@")
| ("8[E2]" : "восьмисотом@")
| ("8[E2]" : "восьмисотому@")
| ("8[E2]" : "восьмисотую@")
| ("8[E2]" : "восьмисотые@")
| ("8[E2]" : "восьмисотый@")
| ("8[E2]" : "восьмисотым@")
| ("8[E2]" : "восьмисотыми@")
| ("8[E2]" : "восьмисотых@")
| ("8[E2]" : "восьмистам")
| ("8[E2]" : "восьмистах")
| ("8[E2]" : "восьмьюстами")
| ("9[E1]" : "девяноста")
| ("9[E1]" : "девяностая@")
| ("9[E1]" : "девяносто")
| ("9[E1]" : "девяностого@")
| ("9[E1]" : "девяностое@")
| ("9[E1]" : "девяностой@")
| ("9[E1]" : "девяностом@")
| ("9[E1]" : "девяностому@")
| ("9[E1]" : "девяностую@")
| ("9[E1]" : "девяностые@")
| ("9[E1]" : "девяностый@")
| ("9[E1]" : "девяностым@")
| ("9[E1]" : "девяностыми@")
| ("9[E1]" : "девяностых@")
| ("9[E2]" : "девятисот")
| ("9[E2]" : "девятисотая@")
| ("9[E2]" : "девятисотого@")
| ("9[E2]" : "девятисотое@")
| ("9[E2]" : "девятисотой@")
| ("9[E2]" : "девятисотом@")
| ("9[E2]" : "девятисотому@")
| ("9[E2]" : "девятисотую@")
| ("9[E2]" : "девятисотые@")
| ("9[E2]" : "девятисотый@")
| ("9[E2]" : "девятисотым@")
| ("9[E2]" : "девятисотыми@")
| ("9[E2]" : "девятисотых@")
| ("9[E2]" : "девятистам")
| ("9[E2]" : "девятистах")
| ("9[E2]" : "девятьсот")
| ("9[E2]" : "девятьюстами")]
;
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
lexset1 = Optimize[
("+" : "")
| ("1" : "один")
| ("1" : "одна")
| ("1" : "одни")
| ("1" : "одним")
| ("1" : "одними")
| ("1" : "одних")
| ("1" : "одно")
| ("1" : "одного")
| ("1" : "одной")
| ("1" : "одном")
| ("1" : "одному")
| ("1" : "одною")
| ("1" : "одну")
| ("1" : "первая@")
| ("1" : "первого@")
| ("1" : "первое@")
| ("1" : "первой@")
| ("1" : "первом@")
| ("1" : "первому@")
| ("1" : "первую@")
| ("1" : "первые@")
| ("1" : "первый@")
| ("1" : "первым@")
| ("1" : "первыми@")
| ("1" : "первых@")
| ("2" : "вторая@")
| ("2" : "второго@")
| ("2" : "второе@")
| ("2" : "второй@")
| ("2" : "втором@")
| ("2" : "второму@")
| ("2" : "вторую@")
| ("2" : "вторые@")
| ("2" : "вторым@")
| ("2" : "вторыми@")
| ("2" : "вторых@")
| ("2" : "два")
| ("2" : "две")
| ("2" : "двум")
| ("2" : "двумя")
| ("2" : "двух")
| ("3" : "трем")
| ("3" : "тремя")
| ("3" : "третий@")
| ("3" : "третье@")
| ("3" : "третьего@")
| ("3" : "третьей@")
| ("3" : "третьем@")
| ("3" : "третьему@")
| ("3" : "третьи@")
| ("3" : "третьим@")
| ("3" : "третьими@")
| ("3" : "третьих@")
| ("3" : "третью@")
| ("3" : "третья@")
| ("3" : "трех")
| ("3" : "три")
| ("4" : "четвертая@")
| ("4" : "четвертого@")
| ("4" : "четвертое@")
| ("4" : "четвертой@")
| ("4" : "четвертом@")
| ("4" : "четвертому@")
| ("4" : "четвертую@")
| ("4" : "четвертые@")
| ("4" : "четвертый@")
| ("4" : "четвертым@")
| ("4" : "четвертыми@")
| ("4" : "четвертых@")
| ("4" : "четыре")
| ("4" : "четырем")
| ("4" : "четырех")
| ("4" : "четырьмя")
| ("5" : "пятая@")
| ("5" : "пяти")
| ("5" : "пятого@")
| ("5" : "пятое@")
| ("5" : "пятой@")
| ("5" : "пятом@")
| ("5" : "пятому@")
| ("5" : "пятую@")
| ("5" : "пятые@")
| ("5" : "пятый@")
| ("5" : "пятым@")
| ("5" : "пятыми@")
| ("5" : "пятых@")
| ("5" : "пять")
| ("5" : "пятью")
| ("6" : "шестая@")
| ("6" : "шести")
| ("6" : "шестого@")
| ("6" : "шестое@")
| ("6" : "шестой@")
| ("6" : "шестом@")
| ("6" : "шестому@")
| ("6" : "шестую@")
| ("6" : "шестые@")
| ("6" : "шестым@")
| ("6" : "шестыми@")
| ("6" : "шестых@")
| ("6" : "шесть")
| ("6" : "шестью")
| ("7" : "седьмая@")
| ("7" : "седьмого@")
| ("7" : "седьмое@")
| ("7" : "седьмой@")
| ("7" : "седьмом@")
| ("7" : "седьмому@")
| ("7" : "седьмую@")
| ("7" : "седьмые@")
| ("7" : "седьмым@")
| ("7" : "седьмыми@")
| ("7" : "седьмых@")
| ("7" : "семи")
| ("7" : "семь")
| ("7" : "семью")
| ("8" : "восемь")
| ("8" : "восьмая@")
| ("8" : "восьми")
| ("8" : "восьмого@")
| ("8" : "восьмое@")
| ("8" : "восьмой@")
| ("8" : "восьмом@")
| ("8" : "восьмому@")
| ("8" : "восьмую@")
| ("8" : "восьмые@")
| ("8" : "восьмым@")
| ("8" : "восьмыми@")
| ("8" : "восьмых@")
| ("8" : "восьмью")
| ("9" : "девятая@")
| ("9" : "девяти")
| ("9" : "девятого@")
| ("9" : "девятое@")
| ("9" : "девятой@")
| ("9" : "девятом@")
| ("9" : "девятому@")
| ("9" : "девятую@")
| ("9" : "девятые@")
| ("9" : "девятый@")
| ("9" : "девятым@")
| ("9" : "девятыми@")
| ("9" : "девятых@")
| ("9" : "девять")
| ("9" : "девятью")
| ("[E3]" : "тысяч")
| ("[E3]" : "тысяча")
| ("[E3]" : "тысячам")
| ("[E3]" : "тысячами")
| ("[E3]" : "тысячах")
| ("[E3]" : "тысяче")
| ("[E3]" : "тысячей")
| ("[E3]" : "тысячи")
| ("[E3]" : "тысячу")
| ("[E3]" : "тысячью")
| ("[E6]" : "миллион")
| ("[E6]" : "миллиона")
| ("[E6]" : "миллионам")
| ("[E6]" : "миллионами")
| ("[E6]" : "миллионах")
| ("[E6]" : "миллионе")
| ("[E6]" : "миллионов")
| ("[E6]" : "миллионом")
| ("[E6]" : "миллиону")
| ("[E6]" : "миллионы")
| ("[E9]" : "миллиард")
| ("[E9]" : "миллиарда")
| ("[E9]" : "миллиардам")
| ("[E9]" : "миллиардами")
| ("[E9]" : "миллиардах")
| ("[E9]" : "миллиарде")
| ("[E9]" : "миллиардов")
| ("[E9]" : "миллиардом")
| ("[E9]" : "миллиарду")
| ("[E9]" : "миллиарды")
| ("|0|" : "ноле")
| ("|0|" : "нолем")
| ("|0|" : "ноль")
| ("|0|" : "нолю")
| ("|0|" : "ноля")
| ("|0|" : "нуле")
| ("|0|" : "нулем")
| ("|0|" : "нуль")
| ("|0|" : "нулю")
| ("|0|" : "нуля")]
;
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
export LEX = Optimize[lex3 @ lex2 @ lex1];
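# E.g., "2[E1]+1" should pass through lex2 ("двадцать") and lex1 ("один"),
# yielding roughly "двадцать один"; stray spaces are removed later by
# CLEAN_SPACES.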
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
# END LANGUAGE SPECIFIC DATA
################################################################################
# Inserts a marker after the Ms.
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
# Deletes all powers and "+".
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
# Deletes leading zeros at the beginning of a number, so that "0003" does not
# get treated as an ordinary number.
export DELETE_INITIAL_ZEROS =
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
;
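# E.g., "0003" factored as "0[E3]+0[E2]+0[E1]+3" should reduce to just "3".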
NonMs = Optimize[POWERS - Ms];
# Deletes (usually) zeros before a non-M. E.g., "+0[E1]" should be
# deleted.
export DELETE_INTERMEDIATE_ZEROS1 =
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
;
# Deletes (usually) zeros before an M, if there is no non-zero element between
# that and the previous boundary. Thus, if after the result of the rule above we
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
# zero.
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
;
# Final clean up of stray zeros.
export DELETE_REMAINING_ZEROS = Optimize[
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
;
# Applies the revaluation map. For example, in English [E4] changes to [E1]
# when it modifies [E3].
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
# Deletes the various marks and powers in the input and output.
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
export CLEAN_SPACES = Optimize[
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
;
d = b.kDigit;
# Germanic inversion rule.
germanic =
(I["1+"] d "[E1]" D["+1"])
| (I["2+"] d "[E1]" D["+2"])
| (I["3+"] d "[E1]" D["+3"])
| (I["4+"] d "[E1]" D["+4"])
| (I["5+"] d "[E1]" D["+5"])
| (I["6+"] d "[E1]" D["+6"])
| (I["7+"] d "[E1]" D["+7"])
| (I["8+"] d "[E1]" D["+8"])
| (I["9+"] d "[E1]" D["+9"])
;
germanic_inversion =
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
;
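# For Russian this stays a no-op (GERMANIC_INVERSION below accepts anything);
# in a Germanic language the rule would turn "2[E1]+1" into "1+2[E1]", i.e.
# ones before tens, as in "einundzwanzig".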
export GERMANIC_INVERSION = SIGMA_STAR;
export ORDINAL_RESTRICTION =
Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]]
;
nondigits = b.kBytes - b.kDigit;
export ORDINAL_SUFFIX = D[nondigits*];

@ -1,527 +0,0 @@
0 нулевая
0 нулевого
0 нулевое
0 нулевой
0 нулевом
0 нулевому
0 нулевую
0 нулевые
0 нулевым
0 нулевым
0 нулевыми
0 нулевых
1 первая
1 первого
1 первое
1 первой
1 первом
1 первому
1 первую
1 первые
1 первый
1 первым
1 первым
1 первыми
1 первых
2 вторая
2 второго
2 второе
2 второй
2 втором
2 второму
2 вторую
2 вторые
2 вторым
2 вторым
2 вторыми
2 вторых
3 третий
3 третье
3 третьего
3 третьей
3 третьем
3 третьему
3 третьи
3 третьим
3 третьим
3 третьими
3 третьих
3 третью
3 третья
4 четвертая
4 четвертого
4 четвертое
4 четвертой
4 четвертом
4 четвертому
4 четвертую
4 четвертые
4 четвертый
4 четвертым
4 четвертым
4 четвертыми
4 четвертых
4 четвёртая
4 четвёртого
4 четвёртое
4 четвёртой
4 четвёртом
4 четвёртому
4 четвёртую
4 четвёртые
4 четвёртый
4 четвёртым
4 четвёртым
4 четвёртыми
4 четвёртых
5 пятая
5 пятого
5 пятое
5 пятой
5 пятом
5 пятому
5 пятую
5 пятые
5 пятый
5 пятым
5 пятым
5 пятыми
5 пятых
6 шестая
6 шестого
6 шестое
6 шестой
6 шестом
6 шестому
6 шестую
6 шестые
6 шестым
6 шестым
6 шестыми
6 шестых
7 седьмая
7 седьмого
7 седьмое
7 седьмой
7 седьмом
7 седьмому
7 седьмую
7 седьмые
7 седьмым
7 седьмым
7 седьмыми
7 седьмых
8 восьмая
8 восьмого
8 восьмое
8 восьмой
8 восьмом
8 восьмому
8 восьмую
8 восьмые
8 восьмым
8 восьмым
8 восьмыми
8 восьмых
9 девятая
9 девятого
9 девятое
9 девятой
9 девятом
9 девятому
9 девятую
9 девятые
9 девятый
9 девятым
9 девятым
9 девятыми
9 девятых
10 десятая
10 десятого
10 десятое
10 десятой
10 десятом
10 десятому
10 десятую
10 десятые
10 десятый
10 десятым
10 десятым
10 десятыми
10 десятых
11 одиннадцатая
11 одиннадцатого
11 одиннадцатое
11 одиннадцатой
11 одиннадцатом
11 одиннадцатому
11 одиннадцатую
11 одиннадцатые
11 одиннадцатый
11 одиннадцатым
11 одиннадцатым
11 одиннадцатыми
11 одиннадцатых
12 двенадцатая
12 двенадцатого
12 двенадцатое
12 двенадцатой
12 двенадцатом
12 двенадцатому
12 двенадцатую
12 двенадцатые
12 двенадцатый
12 двенадцатым
12 двенадцатым
12 двенадцатыми
12 двенадцатых
13 тринадцатая
13 тринадцатого
13 тринадцатое
13 тринадцатой
13 тринадцатом
13 тринадцатому
13 тринадцатую
13 тринадцатые
13 тринадцатый
13 тринадцатым
13 тринадцатым
13 тринадцатыми
13 тринадцатых
14 четырнадцатая
14 четырнадцатого
14 четырнадцатое
14 четырнадцатой
14 четырнадцатом
14 четырнадцатому
14 четырнадцатую
14 четырнадцатые
14 четырнадцатый
14 четырнадцатым
14 четырнадцатым
14 четырнадцатыми
14 четырнадцатых
15 пятнадцатая
15 пятнадцатого
15 пятнадцатое
15 пятнадцатой
15 пятнадцатом
15 пятнадцатому
15 пятнадцатую
15 пятнадцатые
15 пятнадцатый
15 пятнадцатым
15 пятнадцатым
15 пятнадцатыми
15 пятнадцатых
16 шестнадцатая
16 шестнадцатого
16 шестнадцатое
16 шестнадцатой
16 шестнадцатом
16 шестнадцатому
16 шестнадцатую
16 шестнадцатые
16 шестнадцатый
16 шестнадцатым
16 шестнадцатым
16 шестнадцатыми
16 шестнадцатых
17 семнадцатая
17 семнадцатого
17 семнадцатое
17 семнадцатой
17 семнадцатом
17 семнадцатому
17 семнадцатую
17 семнадцатые
17 семнадцатый
17 семнадцатым
17 семнадцатым
17 семнадцатыми
17 семнадцатых
18 восемнадцатая
18 восемнадцатого
18 восемнадцатое
18 восемнадцатой
18 восемнадцатом
18 восемнадцатому
18 восемнадцатую
18 восемнадцатые
18 восемнадцатый
18 восемнадцатым
18 восемнадцатым
18 восемнадцатыми
18 восемнадцатых
19 девятнадцатая
19 девятнадцатого
19 девятнадцатое
19 девятнадцатой
19 девятнадцатом
19 девятнадцатому
19 девятнадцатую
19 девятнадцатые
19 девятнадцатый
19 девятнадцатым
19 девятнадцатым
19 девятнадцатыми
19 девятнадцатых
20 двадцатая
20 двадцатого
20 двадцатое
20 двадцатой
20 двадцатом
20 двадцатому
20 двадцатую
20 двадцатые
20 двадцатый
20 двадцатым
20 двадцатым
20 двадцатыми
20 двадцатых
30 тридцатая
30 тридцатого
30 тридцатое
30 тридцатой
30 тридцатом
30 тридцатому
30 тридцатую
30 тридцатые
30 тридцатый
30 тридцатым
30 тридцатым
30 тридцатыми
30 тридцатых
40 сороковая
40 сорокового
40 сороковое
40 сороковой
40 сороковом
40 сороковому
40 сороковую
40 сороковые
40 сороковым
40 сороковым
40 сороковыми
40 сороковых
50 пятидесятая
50 пятидесятого
50 пятидесятое
50 пятидесятой
50 пятидесятом
50 пятидесятому
50 пятидесятую
50 пятидесятые
50 пятидесятый
50 пятидесятым
50 пятидесятым
50 пятидесятыми
50 пятидесятых
60 шестидесятая
60 шестидесятого
60 шестидесятое
60 шестидесятой
60 шестидесятом
60 шестидесятому
60 шестидесятую
60 шестидесятые
60 шестидесятый
60 шестидесятым
60 шестидесятым
60 шестидесятыми
60 шестидесятых
70 семидесятая
70 семидесятого
70 семидесятое
70 семидесятой
70 семидесятом
70 семидесятому
70 семидесятую
70 семидесятые
70 семидесятый
70 семидесятым
70 семидесятым
70 семидесятыми
70 семидесятых
80 восьмидесятая
80 восьмидесятого
80 восьмидесятое
80 восьмидесятой
80 восьмидесятом
80 восьмидесятому
80 восьмидесятую
80 восьмидесятые
80 восьмидесятый
80 восьмидесятым
80 восьмидесятым
80 восьмидесятыми
80 восьмидесятых
90 девяностая
90 девяностого
90 девяностое
90 девяностой
90 девяностом
90 девяностому
90 девяностую
90 девяностые
90 девяностый
90 девяностым
90 девяностым
90 девяностыми
90 девяностых
100 сотая
100 сотого
100 сотое
100 сотой
100 сотом
100 сотому
100 сотую
100 сотые
100 сотый
100 сотым
100 сотым
100 сотыми
100 сотых
200 двухсотая
200 двухсотого
200 двухсотое
200 двухсотой
200 двухсотом
200 двухсотому
200 двухсотую
200 двухсотые
200 двухсотый
200 двухсотым
200 двухсотым
200 двухсотыми
200 двухсотых
300 трехсотая
300 трехсотого
300 трехсотое
300 трехсотой
300 трехсотом
300 трехсотому
300 трехсотую
300 трехсотые
300 трехсотый
300 трехсотым
300 трехсотым
300 трехсотыми
300 трехсотых
400 четырехсотая
400 четырехсотого
400 четырехсотое
400 четырехсотой
400 четырехсотом
400 четырехсотому
400 четырехсотую
400 четырехсотые
400 четырехсотый
400 четырехсотым
400 четырехсотым
400 четырехсотыми
400 четырехсотых
500 пятисотая
500 пятисотого
500 пятисотое
500 пятисотой
500 пятисотом
500 пятисотому
500 пятисотую
500 пятисотые
500 пятисотый
500 пятисотым
500 пятисотым
500 пятисотыми
500 пятисотых
600 шестисотая
600 шестисотого
600 шестисотое
600 шестисотой
600 шестисотом
600 шестисотому
600 шестисотую
600 шестисотые
600 шестисотый
600 шестисотым
600 шестисотым
600 шестисотыми
600 шестисотых
700 семисотая
700 семисотого
700 семисотое
700 семисотой
700 семисотом
700 семисотому
700 семисотую
700 семисотые
700 семисотый
700 семисотым
700 семисотым
700 семисотыми
700 семисотых
800 восьмисотая
800 восьмисотого
800 восьмисотое
800 восьмисотой
800 восьмисотом
800 восьмисотому
800 восьмисотую
800 восьмисотые
800 восьмисотый
800 восьмисотым
800 восьмисотым
800 восьмисотыми
800 восьмисотых
900 девятисотая
900 девятисотого
900 девятисотое
900 девятисотой
900 девятисотом
900 девятисотому
900 девятисотую
900 девятисотые
900 девятисотый
900 девятисотым
900 девятисотым
900 девятисотыми
900 девятисотых
1000 тысячная
1000 тысячного
1000 тысячное
1000 тысячной
1000 тысячном
1000 тысячному
1000 тысячную
1000 тысячные
1000 тысячный
1000 тысячным
1000 тысячным
1000 тысячными
1000 тысячных
1000000 миллионная
1000000 миллионного
1000000 миллионное
1000000 миллионной
1000000 миллионном
1000000 миллионному
1000000 миллионную
1000000 миллионные
1000000 миллионный
1000000 миллионным
1000000 миллионным
1000000 миллионными
1000000 миллионных
1000000000 миллиардная
1000000000 миллиардного
1000000000 миллиардное
1000000000 миллиардной
1000000000 миллиардном
1000000000 миллиардному
1000000000 миллиардную
1000000000 миллиардные
1000000000 миллиардный
1000000000 миллиардным
1000000000 миллиардным
1000000000 миллиардными
1000000000 миллиардных

@ -1,77 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This verbalizer is used whenever there is an LM symbol that consists of
# letters immediately followed by "{spelled}". This strips the "{spelled}"
# suffix.
import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
digit = b.kDigit @ n.CARDINAL_NUMBERS;
char_set = (("a" | "A") : "letter-a")
| (("b" | "B") : "letter-b")
| (("c" | "C") : "letter-c")
| (("d" | "D") : "letter-d")
| (("e" | "E") : "letter-e")
| (("f" | "F") : "letter-f")
| (("g" | "G") : "letter-g")
| (("h" | "H") : "letter-h")
| (("i" | "I") : "letter-i")
| (("j" | "J") : "letter-j")
| (("k" | "K") : "letter-k")
| (("l" | "L") : "letter-l")
| (("m" | "M") : "letter-m")
| (("n" | "N") : "letter-n")
| (("o" | "O") : "letter-o")
| (("p" | "P") : "letter-p")
| (("q" | "Q") : "letter-q")
| (("r" | "R") : "letter-r")
| (("s" | "S") : "letter-s")
| (("t" | "T") : "letter-t")
| (("u" | "U") : "letter-u")
| (("v" | "V") : "letter-v")
| (("w" | "W") : "letter-w")
| (("x" | "X") : "letter-x")
| (("y" | "Y") : "letter-y")
| (("z" | "Z") : "letter-z")
| (digit)
| ("&" : "@@AND@@")
| ("." : "")
| ("-" : "")
| ("_" : "")
| ("/" : "")
| (n.I["letter-"] c.kCyrillicAlpha)
;
ins_space = "" : " ";
suffix = "{spelled}" : "";
spelled = Optimize[char_set (ins_space char_set)* suffix];
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
sigma_star = b.kBytes*;
# Gets rid of the letter- prefix since in some cases we don't want it.
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
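For readers who don't use Thrax: the following Python sketch approximates what `SPELLED` does for the ASCII branch of `char_set`. It is illustrative only — the Cyrillic branch, the `LEXICAL_MAP` composition, and digit verbalization through `CARDINAL_NUMBERS` are simplified away, and `verbalize_spelled` is a hypothetical helper, not part of this grammar.

```python
# Approximate sketch of SPELLED for ASCII input; see caveats above.
def verbalize_spelled(token: str) -> str:
    suffix = "{spelled}"
    assert token.endswith(suffix)       # the grammar requires, then deletes, it
    out = []
    for ch in token[: -len(suffix)]:
        if ch.isalpha():
            out.append("letter-" + ch.lower())
        elif ch.isdigit():
            out.append(ch)              # the grammar composes this with CARDINAL_NUMBERS
        elif ch == "&":
            out.append("@@AND@@")
        # ".", "-", "_", "/" are silently deleted, exactly as in char_set
    return " ".join(out)

print(verbalize_spelled("AT&T{spelled}"))  # letter-a letter-t @@AND@@ letter-t
```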

@ -1,24 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'ru/verbalizer/lexical_map.grm' as l;
punct =
("." : "@@PERIOD@@")
| ("," : "@@COMMA@@")
| ("!" : "@@EXCLAMATION_MARK@@")
| ("?" : "@@QUESTION_MARK@@")
;
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
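The rewrite here is just a four-entry map from punctuation to spoken placeholders, which `LEXICAL_MAP` later turns into Russian words; in plain Python terms (illustrative only, not part of the grammar):

```python
SPOKEN_PUNCT = {".": "@@PERIOD@@", ",": "@@COMMA@@",
                "!": "@@EXCLAMATION_MARK@@", "?": "@@QUESTION_MARK@@"}
print(SPOKEN_PUNCT["?"])  # @@QUESTION_MARK@@
```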

@ -1,108 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/byte.grm' as b;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
increment_hour =
("0" : "1")
| ("1" : "2")
| ("2" : "3")
| ("3" : "4")
| ("4" : "5")
| ("5" : "6")
| ("6" : "7")
| ("7" : "8")
| ("8" : "9")
| ("9" : "10")
| ("10" : "11")
| ("11" : "12")
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
| ("13" : "14")
| ("14" : "15")
| ("15" : "16")
| ("16" : "17")
| ("17" : "18")
| ("18" : "19")
| ("19" : "20")
| ("20" : "21")
| ("21" : "22")
| ("22" : "23")
| ("23" : "12")
;
hours = Project[increment_hour, 'input'];
d = b.kDigit;
D = d - "0";
minutes09 = "0" D;
minutes = ("1" | "2" | "3" | "4" | "5") d;
__sep__ = ":";
sep_space = __sep__ : " ";
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
verbalize_minutes =
("00" : "@@HOUR@@")
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
| (minutes @ n.CARDINAL_NUMBERS)
;
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
# Special cases we handle right now.
# TODO: Need to allow for cases like
#
# half twelve (in the UK English sense)
# half twaalf (in the Dutch sense)
time_quarter_past =
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
verbalize_hours
n.D[__sep__ "15"];
time_half_past =
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
verbalize_hours
n.D[__sep__ "30"];
time_quarter_to =
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
(increment_hour @ verbalize_hours)
n.D[__sep__ "45"];
time_extra = Optimize[
time_quarter_past | time_half_past | time_quarter_to]
;
# Basic time periods which most languages can be expected to have.
__am__ = "a.m." | "am" | "AM" | "утра";
__pm__ = "p.m." | "pm" | "PM" | "вечера";
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
time_variants = time_basic | time_extra;
time = Optimize[
(period (" " | n.I[" "]))? time_variants
| time_variants ((" " | n.I[" "]) period)?]
;
export TIME = Optimize[time @ l.LEXICAL_MAP];
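A procedural sketch of the special cases (quarter-past, half-past, quarter-to) may help. `verbalize_time` is a hypothetical helper: hours and minutes are left as digits where the grammar would verbalize them with `CARDINAL_NUMBERS`, and the grammar additionally keeps the plain `time_basic` reading as an alternative for :15/:30/:45.

```python
def increment_hour(h: int) -> int:
    # Mirrors the grammar's table: 12 wraps to 1 (12-hour assumption), 23 to 12.
    if h == 12:
        return 1
    if h == 23:
        return 12
    return h + 1

def verbalize_time(hh: int, mm: int) -> str:
    if mm == 15:
        return f"@@TIME_QUARTER@@ @@TIME_AFTER@@ {hh}"
    if mm == 30:
        return f"@@TIME_HALF@@ @@TIME_AFTER@@ {hh}"
    if mm == 45:
        return f"@@TIME_QUARTER@@ @@TIME_BEFORE@@ {increment_hour(hh)}"
    if mm == 0:
        return f"{hh} @@HOUR@@"
    if mm < 10:
        return f"{hh} @@TIME_ZERO@@ {mm}"   # the minutes09 branch
    return f"{hh} {mm}"

print(verbalize_time(16, 45))  # @@TIME_QUARTER@@ @@TIME_BEFORE@@ 17
```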

@ -1,68 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Rules for URLs and email addresses.
import 'util/byte.grm' as bytelib;
import 'ru/verbalizer/lexical_map.grm' as l;
ins_space = "" : " ";
dot = "." : "@@URL_DOT_EXPRESSION@@";
at = "@" : "@@AT@@";
url_suffix =
(".com" : dot ins_space "com") |
(".gov" : dot ins_space "gov") |
(".edu" : dot ins_space "e d u") |
(".org" : dot ins_space "org") |
(".net" : dot ins_space "net")
;
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
letter_string_dot =
((letter_string ins_space dot ins_space)* letter_string)
;
# Rules for URLs.
export URL = Optimize[
((letter_string_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
# Rules for email addresses.
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
letter_by_letter_dot =
((letter_by_letter ins_space dot ins_space)*
letter_by_letter)
;
export EMAIL1 = Optimize[
((letter_by_letter) (ins_space)
(at) (ins_space)
(letter_by_letter_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
export EMAIL2 = Optimize[
((letter_by_letter) (ins_space)
(at) (ins_space)
(letter_string_dot) (ins_space)
(url_suffix)) @ l.LEXICAL_MAP
];
export EMAILS = Optimize[
EMAIL1 | EMAIL2
];
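For intuition, an `EMAIL1`-style reading spells the local part and host letter by letter, replaces `@` and `.` with placeholders, and expands the suffix. A minimal sketch under those assumptions (`read_email` is hypothetical; digit handling and `LEXICAL_MAP` are omitted):

```python
SUFFIXES = {"com": "com", "gov": "gov", "edu": "e d u", "org": "org", "net": "net"}

def read_email(addr: str) -> str:
    local, _, domain = addr.partition("@")
    host, _, suffix = domain.rpartition(".")
    spell = lambda s: " ".join("@@URL_DOT_EXPRESSION@@" if c == "." else c for c in s)
    return (f"{spell(local)} @@AT@@ {spell(host)} "
            f"@@URL_DOT_EXPRESSION@@ {SUFFIXES[suffix]}")

print(read_email("abc@mail.example.com"))
# a b c @@AT@@ m a i l @@URL_DOT_EXPRESSION@@ e x a m p l e @@URL_DOT_EXPRESSION@@ com
```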

@ -1,42 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import 'util/util.grm' as util;
import 'ru/verbalizer/extra_numbers.grm' as e;
import 'ru/verbalizer/float.grm' as f;
import 'ru/verbalizer/math.grm' as ma;
import 'ru/verbalizer/miscellaneous.grm' as mi;
import 'ru/verbalizer/money.grm' as mo;
import 'ru/verbalizer/numbers.grm' as n;
import 'ru/verbalizer/numbers_plus.grm' as np;
import 'ru/verbalizer/spelled.grm' as s;
import 'ru/verbalizer/spoken_punct.grm' as sp;
import 'ru/verbalizer/time.grm' as t;
import 'ru/verbalizer/urls.grm' as u;
export VERBALIZER = Optimize[RmWeight[
( e.MIXED_NUMBERS
| e.DIGITS
| f.FLOAT
| ma.ARITHMETIC
| mi.MISCELLANEOUS
| mo.MONEY
| n.CARDINAL_NUMBERS
| n.ORDINAL_NUMBERS
| np.NUMBERS_PLUS
| s.SPELLED
| sp.SPOKEN_PUNCT
| t.TIME
| u.URL) @ util.CLEAN_SPACES
]];
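`VERBALIZER` is simply the union of all the sub-verbalizers, cleaned up by `util.CLEAN_SPACES` and stripped of weights by `RmWeight`. A rough procedural analogue, with two toy stand-in rules (everything here is hypothetical, shown only to make the union semantics concrete):

```python
from typing import Callable, Optional

def punct_rule(tok: str) -> Optional[str]:      # stand-in for sp.SPOKEN_PUNCT
    return {"?": "@@QUESTION_MARK@@"}.get(tok)

def time_rule(tok: str) -> Optional[str]:       # stand-in for t.TIME
    return "@@TIME@@" if ":" in tok else None

RULES: list[Callable[[str], Optional[str]]] = [punct_rule, time_rule]

def verbalize(tok: str) -> Optional[str]:
    for rule in RULES:                     # union: any sub-grammar may accept
        out = rule(tok)
        if out is not None:
            return " ".join(out.split())   # crude analogue of CLEAN_SPACES
    return None                            # not covered by any sub-grammar

print(verbalize("?"))  # @@QUESTION_MARK@@
```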

@ -1,3 +0,0 @@
# Language-universal grammar definitions
This directory contains various language-universal grammar definitions.

@ -1,91 +0,0 @@
i 1
ii 2
iii 3
iv 4
v 5
vi 6
vii 7
viii 8
ix 9
x 10
xi 11
xii 12
xiii 13
xiv 14
xv 15
xvi 16
xvii 17
xviii 18
xix 19
xx 20
xxi 21
xxii 22
xxiii 23
xxiv 24
xxv 25
xxvi 26
xxvii 27
xxviii 28
xxix 29
xxx 30
xxxi 31
xxxii 32
xxxiii 33
xxxiv 34
xxxv 35
xxxvi 36
xxxvii 37
xxxviii 38
xxxix 39
xl 40
xli 41
xlii 42
xliii 43
xliv 44
xlv 45
xlvi 46
xlvii 47
xlviii 48
xlix 49
mcmxciv 1994
mcmxcv 1995
mcmxcvi 1996
mcmxcvii 1997
mcmxcviii 1998
mcmxcix 1999
mm 2000
mmi 2001
mmii 2002
mmiii 2003
mmiv 2004
mmv 2005
mmvi 2006
mmvii 2007
mmviii 2008
mmix 2009
mmx 2010
mmxi 2011
mmxii 2012
mmxiii 2013
mmxiv 2014
mmxv 2015
mmxvi 2016
mmxvii 2017
mmxviii 2018
mmxix 2019
mmxx 2020
mmxxi 2021
mmxxii 2022
mmxxiii 2023
mmxxiv 2024
mmxxv 2025
mmxxvi 2026
mmxxvii 2027
mmxxviii 2028
mmxxix 2029
mmxxx 2030
mmxxxi 2031
mmxxxii 2032
mmxxxiii 2033
mmxxxiv 2034
mmxxxv 2035
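The table above is a plain lookup from Roman numeral to value. The equivalent computation, shown only to make the mapping explicit (a hypothetical helper, not part of the grammars):

```python
VALS = {"i": 1, "v": 5, "x": 10, "l": 50, "c": 100, "d": 500, "m": 1000}

def roman_to_int(s: str) -> int:
    total = 0
    for a, b in zip(s, s[1:] + " "):
        v = VALS[a]
        total += -v if VALS.get(b, 0) > v else v  # subtractive pairs like "iv"
    return total

assert roman_to_int("mcmxciv") == 1994
assert roman_to_int("xlix") == 49
```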

@ -1,126 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Specifies common ways of delimiting thousands in digit strings.
import 'util/byte.grm' as bytelib;
import 'util/util.grm' as util;
killcomma = "," : "";
dot2comma = "." : ",";
spaces2comma = " "+ : ",";
zero = "0";
# no_delimiter = zero | "[1-9][0-9]*";
export no_delimiter = zero | (util.d1to9 bytelib.kDigit*);
# delim_map_dot = ("[0-9]" | ("\." : ","))*;
delim_map_dot = (bytelib.kDigit | dot2comma)*;
# delim_map_space = ("[0-9]" | (" +" : ","))*;
delim_map_space = (bytelib.kDigit | spaces2comma)*;
## Western systems group thousands. Korean goes this way too.
# comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*);
export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*);
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
# cannot match on input labels (sort?).
export dot_thousands = delim_map_dot @ comma_thousands;
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
# cannot match on input labels (sort?).
export space_thousands = delim_map_space @ comma_thousands;
## Chinese prefers grouping by fours (by ten-thousands).
# chinese_comma =
# zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*);
export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*);
## The Indian system is more complex because of the Stravinskian alternation
## between lakhs and crores.
##
## According to Wikipedia:
##
## Indian English Value
## One 1
## Ten 10
## Hundred 100
## Thousand 1,000
## Lakh 1,00,000
## Crore 1,00,00,000
## Arab 1,00,00,00,000
## Kharab 1,00,00,00,00,000
# indian_hundreds = "[1-9][0-9]?[0-9]?";
indian_hundreds = util.d1to9 bytelib.kDigit{0,2};
## Up to 99,999.
# indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]";
indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3};
## Up to 99,99,999.
# indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]";
indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3};
## Up to 999,99,99,999
indian_comma_crores =
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
(bytelib.kDigit{2} killcomma)?
bytelib.kDigit{2} killcomma
bytelib.kDigit{3}
;
## Up to 99,999,99,99,999.
indian_comma_thousand_crores =
util.d1to9 bytelib.kDigit? killcomma
bytelib.kDigit{3} killcomma
bytelib.kDigit{2} killcomma
bytelib.kDigit{2} killcomma
bytelib.kDigit{3}
;
## Up to 999,99,999,99,99,999.
indian_comma_lakh_crores =
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
bytelib.kDigit{2} killcomma
bytelib.kDigit{3} killcomma
bytelib.kDigit{2} killcomma
bytelib.kDigit{2} killcomma
bytelib.kDigit{3}
;
export indian_comma =
zero
| indian_hundreds
| indian_comma_thousands
| indian_comma_lakhs
| indian_comma_crores
| indian_comma_thousand_crores
| indian_comma_lakh_crores
;
# Indian number system with dots.
export indian_dot_number = delim_map_dot @ indian_comma;
# Indian number system with spaces.
export indian_space_number = delim_map_space @ indian_comma;
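As a sanity check, the commented regular expressions above can be made runnable. A minimal Python sketch of a few of the exported acceptors — validation only; the Thrax rules additionally delete or normalize the delimiters:

```python
import re

# Regex analogues of the acceptors (cf. the commented patterns in the grammar).
comma_thousands        = re.compile(r"^(0|[1-9]\d{0,2}(?:,\d{3})*)$")
chinese_comma          = re.compile(r"^(0|[1-9]\d{0,3}(?:,\d{4})*)$")
indian_comma_thousands = re.compile(r"^[1-9]\d?,\d{3}$")        # up to 99,999
indian_comma_lakhs     = re.compile(r"^[1-9]\d?,\d{2},\d{3}$")  # up to 99,99,999

assert comma_thousands.match("1,234,567")
assert chinese_comma.match("1,2345,6789")
assert indian_comma_lakhs.match("12,34,567")
assert not comma_thousands.match("12,34,567")   # Indian grouping, not Western
```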

@ -1,3 +0,0 @@
# Utility grammar definitions
This directory contains various utility grammar definitions.
