chinese char/word ngram lm (#613)
* add ngram lm egs * add zhon repo * install kenlm, zhon * format * add chinese_text_normalization repo * add ngram lm egspull/619/head
parent
2bdf4c946a
commit
538bf271eb
@ -0,0 +1 @@
|
|||||||
|
exp/
|
@ -0,0 +1,2 @@
|
|||||||
|
text_correct.txt: https://github.com/shibing624/pycorrector/raw/master/tests/test_file.txt
|
||||||
|
custom_confusion.txt: https://github.com/shibing624/pycorrector/raw/master/tests/custom_confusion.txt
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,37 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=100
|
||||||
|
|
||||||
|
order=5
|
||||||
|
mem=80%
|
||||||
|
prune=0
|
||||||
|
a=22
|
||||||
|
q=8
|
||||||
|
b=8
|
||||||
|
|
||||||
|
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
|
||||||
|
|
||||||
|
if [ $# != 3 ]; then
|
||||||
|
echo "$0 token_type exp/text exp/text.arpa"
|
||||||
|
echo $@
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# char or word
|
||||||
|
type=$1
|
||||||
|
text=$2
|
||||||
|
arpa=$3
|
||||||
|
|
||||||
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
|
||||||
|
# text tn & wordseg preprocess
|
||||||
|
echo "process text."
|
||||||
|
python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
|
||||||
|
# train ngram lm
|
||||||
|
echo "build lm."
|
||||||
|
bash ${MAIN_ROOT}/utils/ngram_train.sh --order ${order} --mem ${mem} --prune "${prune}" ${text}.${type}.tn ${arpa}
|
||||||
|
fi
|
@ -0,0 +1,21 @@
|
|||||||
|
#! /usr/bin/env bash
|
||||||
|
|
||||||
|
. ${MAIN_ROOT}/utils/utility.sh
|
||||||
|
|
||||||
|
DIR=data/lm
|
||||||
|
mkdir -p ${DIR}
|
||||||
|
|
||||||
|
URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
|
||||||
|
MD5="29e02312deb2e59b3c8686c7966d4fe3"
|
||||||
|
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
|
||||||
|
|
||||||
|
|
||||||
|
echo "Download language model ..."
|
||||||
|
download $URL $MD5 $TARGET
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Fail to download the language model!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
exit 0
|
@ -0,0 +1,10 @@
|
|||||||
|
export MAIN_ROOT=${PWD}/../../
|
||||||
|
|
||||||
|
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||||
|
export LC_ALL=C
|
||||||
|
|
||||||
|
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||||
|
export PYTHONIOENCODING=UTF-8
|
||||||
|
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||||
|
|
||||||
|
export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH}
|
@ -0,0 +1 @@
|
|||||||
|
jieba>=0.39
|
@ -0,0 +1,57 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
source path.sh
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=100
|
||||||
|
|
||||||
|
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||||
|
|
||||||
|
python3 -c 'import kenlm;' || { echo "kenlm package not install!"; exit -1; }
|
||||||
|
|
||||||
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # case 1, test kenlm
    # Download the pretrained language model; stop the whole run if that fails.
    # The explicit `|| exit 1` keeps the failure handling visible even though
    # this script runs under `set -e`.
    bash local/download_lm_zh.sh || exit 1

    # test kenlm `score` and `full_score`
    # Use python3 here: it is the interpreter whose kenlm import was checked
    # at the top of this script, and a bare `python` may be a different
    # interpreter (or missing).
    python3 local/kenlm_score_test.py data/lm/zh_giga.no_cna_cmn.prune01244.klm
fi
|
||||||
|
|
||||||
|
mkdir -p exp
|
||||||
|
cp data/text_correct.txt exp/text
|
||||||
|
|
||||||
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    # case 2, chinese character ngram lm build
    # output: xxx.arpa xxx.kenlm.bin
    input=exp/text
    token_type=char
    lang=zh
    order=5
    prune="0 1 2 4 4"
    a=22
    q=8
    b=8
    # Every setting is encoded in the output name; the spaces inside ${prune}
    # are replaced with underscores so the path stays a single word.
    output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
    echo "build ${token_type} lm."
    # --q must receive ${q} (not ${a}) so the option matches the q value
    # that is encoded in ${output}.
    bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${q} --b ${b} ${token_type} ${input} ${output}
fi
|
||||||
|
|
||||||
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    # case 3, chinese word ngram lm build
    # output: xxx.arpa xxx.kenlm.bin
    input=exp/text
    token_type=word
    lang=zh
    order=3
    prune="0 0 0"
    a=22
    q=8
    b=8
    # Every setting is encoded in the output name; the spaces inside ${prune}
    # are replaced with underscores so the path stays a single word.
    output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
    echo "build ${token_type} lm."
    # --q must receive ${q} (not ${a}) so the option matches the q value
    # that is encoded in ${output}.
    bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${q} --b ${b} ${token_type} ${input} ${output}
fi
|
@ -1,8 +1,20 @@
|
|||||||
|
|
||||||
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
|
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
|
||||||
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
|
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
|
||||||
ref: https://zhuanlan.zhihu.com/p/55371926
|
ref: https://zhuanlan.zhihu.com/p/55371926
|
||||||
|
licence: MIT
|
||||||
|
|
||||||
* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
|
* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
|
||||||
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
|
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
|
||||||
licence: MIT
|
licence: MIT
|
||||||
|
|
||||||
|
* [zhon](https://github.com/tsroten/zhon)
|
||||||
|
commit: 09bf543696277f71de502506984661a60d24494c
|
||||||
|
licence: MIT
|
||||||
|
|
||||||
|
* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
|
||||||
|
commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
|
||||||
|
licence: MIT
|
||||||
|
|
||||||
|
* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
|
||||||
|
commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
|
||||||
|
licence: MIT
|
||||||
|
@ -0,0 +1,2 @@
|
|||||||
|
*~
|
||||||
|
*.far
|
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2020 SpeechIO
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
@ -0,0 +1,8 @@
|
|||||||
|
# for plain text
|
||||||
|
python3 cn_tn.py example_plain.txt output_plain.txt
|
||||||
|
diff example_plain.txt output_plain.txt
|
||||||
|
|
||||||
|
# for Kaldi's trans format
|
||||||
|
python3 cn_tn.py --has_key example_kaldi.txt output_kaldi.txt
|
||||||
|
diff example_kaldi.txt output_kaldi.txt
|
||||||
|
|
@ -0,0 +1,24 @@
|
|||||||
|
0. place install_thrax.sh into $KALDI_ROOT/tools/extras/
|
||||||
|
|
||||||
|
1. recompile openfst with necessary option "--enable-grm" to support thrax:
|
||||||
|
* cd $KALDI_ROOT/tools
|
||||||
|
* make clean
|
||||||
|
* edit $KALDI_ROOT/tools/Makefile, append "--enable-grm" option to OPENFST_CONFIGURE:
|
||||||
|
OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --enable-grm
|
||||||
|
* make -j 10
|
||||||
|
|
||||||
|
2. install thrax
|
||||||
|
cd $KALDI_ROOT/tools
|
||||||
|
sh extras/install_thrax.sh
|
||||||
|
|
||||||
|
3. add thrax binary path into $KALDI_ROOT/tools/env.sh:
|
||||||
|
export PATH=/path/to/your/kaldi_root/tools/thrax-1.2.9/src/bin:${PATH}
|
||||||
|
|
||||||
|
usage:
|
||||||
|
before you run anything related to thrax, use:
|
||||||
|
. $KALDI_ROOT/tools/env.sh
|
||||||
|
to enable binary finding, like what we always do in kaldi.
|
||||||
|
|
||||||
|
sample usage:
|
||||||
|
sh run_en.sh
|
||||||
|
sh run_cn.sh
|
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
## This script should be placed under $KALDI_ROOT/tools/extras/, and see INSTALL.txt for installation guide
#
# Fetches the thrax 1.2.9 tarball into the current directory if it is not
# already there, unpacks it, then configures, builds and installs it with
# ../openfst (relative to the unpacked source directory) as the prefix.

# Stop at the first failing step so a broken download, configure or build
# is never followed by `make install`.
set -e

if [ ! -f thrax-1.2.9.tar.gz ]; then
    wget http://www.openfst.org/twiki/pub/GRM/ThraxDownload/thrax-1.2.9.tar.gz
fi

# Unpack independently of the download: a tarball that is already present
# but was never extracted must still produce the source directory.
if [ ! -d thrax-1.2.9 ]; then
    tar -zxf thrax-1.2.9.tar.gz
fi

cd thrax-1.2.9
OPENFSTPREFIX=`pwd`/../openfst
LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix ${OPENFSTPREFIX}
make -j 10
make install
cd ..
|
||||||
|
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,6 @@
|
|||||||
|
cd src/cn
|
||||||
|
thraxmakedep itn.grm
|
||||||
|
make
|
||||||
|
#thraxrewrite-tester --far=itn.far --rules=ITN
|
||||||
|
cat ../../testcase_cn.txt | thraxrewrite-tester --far=itn.far --rules=ITN
|
||||||
|
cd -
|
@ -0,0 +1,6 @@
|
|||||||
|
cd src
|
||||||
|
thraxmakedep en/verbalizer/podspeech.grm
|
||||||
|
make
|
||||||
|
cat ../testcase_en.txt
|
||||||
|
cat ../testcase_en.txt | thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN
|
||||||
|
cd -
|
@ -0,0 +1,202 @@
|
|||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
@ -0,0 +1,65 @@
|
|||||||
|
en/verbalizer/podspeech.far: en/verbalizer/podspeech.grm util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
util/util.far: util/util.grm util/byte.far util/case.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
util/byte.far: util/byte.grm
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
util/case.far: util/case.grm util/byte.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/number_names.far: en/verbalizer/number_names.grm util/arithmetic.far en/verbalizer/g.fst en/verbalizer/cardinals.tsv en/verbalizer/ordinals.tsv
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
util/arithmetic.far: util/arithmetic.grm util/byte.far util/germanic.tsv
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far util/byte.far en/verbalizer/number_names.far universal/thousands_punct.far util/arithmetic.far en/verbalizer/factorization.far en/verbalizer/lexical_map.far ru/classifier/cyrillic.far
|
@ -0,0 +1,24 @@
|
|||||||
|
# Text normalization covering grammars
|
||||||
|
|
||||||
|
This repository provides covering grammars for English and Russian text normalization as
|
||||||
|
documented in:
|
||||||
|
|
||||||
|
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
|
||||||
|
_Transactions of the Association for Computational Linguistics_ 4: 507-519.
|
||||||
|
|
||||||
|
Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
|
||||||
|
written-to-spoken text normalization. In _ASRU_, pages 665-670.
|
||||||
|
|
||||||
|
If you use these grammars in a publication, we would appreciate if you cite these works.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://www.openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
See `LICENSE`.
|
||||||
|
|
||||||
|
## Mandatory disclaimer
|
||||||
|
|
||||||
|
This is not an official Google product.
|
@ -0,0 +1,23 @@
|
|||||||
|
itn.far: itn.grm byte.far number.far hotfix.far percentage.far date.far amount.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
byte.far: byte.grm
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
number.far: number.grm byte.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
hotfix.far: hotfix.grm byte.far hotfix.list
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
percentage.far: percentage.grm byte.far number.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
date.far: date.grm byte.far number.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
amount.far: amount.grm byte.far number.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -f byte.far number.far hotfix.far percentage.far date.far amount.far
|
@ -0,0 +1,24 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
import 'number.grm' as n;
|
||||||
|
|
||||||
|
unit = (
|
||||||
|
"匹"|"张"|"座"|"回"|"场"|"尾"|"条"|"个"|"首"|"阙"|"阵"|"网"|"炮"|
|
||||||
|
"顶"|"丘"|"棵"|"只"|"支"|"袭"|"辆"|"挑"|"担"|"颗"|"壳"|"窠"|"曲"|
|
||||||
|
"墙"|"群"|"腔"|"砣"|"座"|"客"|"贯"|"扎"|"捆"|"刀"|"令"|"打"|"手"|
|
||||||
|
"罗"|"坡"|"山"|"岭"|"江"|"溪"|"钟"|"队"|"单"|"双"|"对"|"出"|"口"|
|
||||||
|
"头"|"脚"|"板"|"跳"|"枝"|"件"|"贴"|"针"|"线"|"管"|"名"|"位"|"身"|
|
||||||
|
"堂"|"课"|"本"|"页"|"家"|"户"|"层"|"丝"|"毫"|"厘"|"分"|"钱"|"两"|
|
||||||
|
"斤"|"担"|"铢"|"石"|"钧"|"锱"|"忽"|"毫"|"厘"|"分"|"寸"|"尺"|"丈"|
|
||||||
|
"里"|"寻"|"常"|"铺"|"程"|"撮"|"勺"|"合"|"升"|"斗"|"石"|"盘"|"碗"|
|
||||||
|
"碟"|"叠"|"桶"|"笼"|"盆"|"盒"|"杯"|"钟"|"斛"|"锅"|"簋"|"篮"|"盘"|
|
||||||
|
"桶"|"罐"|"瓶"|"壶"|"卮"|"盏"|"箩"|"箱"|"煲"|"啖"|"袋"|"钵"|"年"|
|
||||||
|
"月"|"日"|"季"|"刻"|"时"|"周"|"天"|"秒"|"分"|"旬"|"纪"|"岁"|"世"|
|
||||||
|
"更"|"夜"|"春"|"夏"|"秋"|"冬"|"代"|"伏"|"辈"|"丸"|"泡"|"粒"|"颗"|
|
||||||
|
"幢"|"堆"|"条"|"根"|"支"|"道"|"面"|"片"|"张"|"颗"|"块"|
|
||||||
|
(("千克":"kg")|("毫克":"mg")|("微克":"µg"))|
|
||||||
|
(("千米":"km")|("厘米":"cm")|("毫米":"mm")|("微米":"µm")|("纳米":"nm"))
|
||||||
|
);
|
||||||
|
|
||||||
|
amount = n.number unit;
|
||||||
|
export AMOUNT = CDRewrite[amount, "", "", b.kBytes*];
|
||||||
|
|
@ -0,0 +1,76 @@
|
|||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Copyright 2005-2011 Google, Inc.
|
||||||
|
# Author: ttai@google.com (Terry Tai)
|
||||||
|
|
||||||
|
# Standard constants for ASCII (byte) based strings. This mirrors the
|
||||||
|
# functions provided by C/C++'s ctype.h library.
|
||||||
|
|
||||||
|
# Note that [0] is missing. Matching the string-termination character is kinda weird.
|
||||||
|
# Union of every single-byte symbol from [1] through [255]; [0] is
# deliberately left out.
export kBytes = Optimize[
  "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" | "[11]" | "[12]" | "[13]" | "[14]" | "[15]" |
  "[16]" | "[17]" | "[18]" | "[19]" | "[20]" | "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
  "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" | "[41]" | "[42]" | "[43]" | "[44]" | "[45]" |
  "[46]" | "[47]" | "[48]" | "[49]" | "[50]" | "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
  "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" | "[71]" | "[72]" | "[73]" | "[74]" | "[75]" |
  "[76]" | "[77]" | "[78]" | "[79]" | "[80]" | "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
  "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" | "[101]" | "[102]" | "[103]" | "[104]" | "[105]" |
  "[106]" | "[107]" | "[108]" | "[109]" | "[110]" | "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
  "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" | "[131]" | "[132]" | "[133]" | "[134]" | "[135]" |
  "[136]" | "[137]" | "[138]" | "[139]" | "[140]" | "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
  "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" | "[161]" | "[162]" | "[163]" | "[164]" | "[165]" |
  "[166]" | "[167]" | "[168]" | "[169]" | "[170]" | "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
  "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" | "[191]" | "[192]" | "[193]" | "[194]" | "[195]" |
  "[196]" | "[197]" | "[198]" | "[199]" | "[200]" | "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
  "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" | "[221]" | "[222]" | "[223]" | "[224]" | "[225]" |
  "[226]" | "[227]" | "[228]" | "[229]" | "[230]" | "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
  "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" | "[251]" | "[252]" | "[253]" | "[254]" | "[255]"
];
|
||||||
|
|
||||||
|
# ASCII decimal digits.
export kDigit = Optimize["0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"];

# ASCII lowercase letters.
export kLower = Optimize[
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
  "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
  "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];

# ASCII uppercase letters.
export kUpper = Optimize[
  "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
  "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
  "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
];

# Letters of either case.
export kAlpha = Optimize[kUpper | kLower];

# Letters or digits.
export kAlnum = Optimize[kAlpha | kDigit];

# Space, tab, newline and carriage return.
export kSpace = Optimize[" " | "\t" | "\n" | "\r"];

# Any byte in kBytes that is not one of the kSpace characters.
export kNotSpace = Optimize[kBytes - kSpace];

# ASCII punctuation characters.
export kPunct = Optimize[
  "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" |
  ")" | "*" | "+" | "," | "-" | "." | "/" | ":" |
  ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
  "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
];

# Letters, digits or punctuation.
export kGraph = Optimize[kPunct | kAlnum];
|
@ -0,0 +1,10 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
import 'number.grm' as n;
|
||||||
|
|
||||||
|
# Day of the month: a 1-99 numeral followed by "日" or "号".
day_suffix = "日" | "号";
day_of_month = n.number_1_to_99 day_suffix;

# Month and day: a 1-99 numeral, "月", then the day of the month.
month_and_day = n.number_1_to_99 "月" day_of_month;

# Year: either 2 to 4 single digits read one by one, or a regular number.
year_part = (n.number_0_to_9){2,4} | n.number;
full_date = year_part "年" month_and_day;

date = day_of_month | month_and_day | full_date;

# Rewrites date expressions anywhere in the input (empty contexts).
export DATE = CDRewrite[date, "", "", b.kBytes*];
|
@ -0,0 +1,5 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
# Applies the string pairs loaded from hotfix.list anywhere in the input
# (empty left/right contexts, any byte string).
export HOTFIX = CDRewrite[StringFile['hotfix.list'], "", "", b.kBytes*];
|
||||||
|
|
@ -0,0 +1,18 @@
|
|||||||
|
0头 零头
|
||||||
|
10字 十字
|
||||||
|
东4环 东4环 -1.0
|
||||||
|
东4 东四 -0.5
|
||||||
|
4惠 四惠
|
||||||
|
3元桥 三元桥
|
||||||
|
4平市 四平市
|
||||||
|
5台山 五台山
|
||||||
|
西2旗 西二旗
|
||||||
|
西3旗 西三旗
|
||||||
|
4道口 四道口 -1.0
|
||||||
|
5道口 五道口 -1.0
|
||||||
|
6道口 六道口 -1.0
|
||||||
|
6里桥 六里桥
|
||||||
|
7里庄 七里庄
|
||||||
|
8宝山 八宝山
|
||||||
|
9颗松 九棵松
|
||||||
|
10里堡 十里堡
|
@ -0,0 +1,9 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
import 'number.grm' as number;
|
||||||
|
import 'hotfix.grm' as hotfix;
|
||||||
|
import 'percentage.grm' as percentage;
|
||||||
|
import 'date.grm' as date;
|
||||||
|
import 'amount.grm' as amount; # seems not useful for now
|
||||||
|
|
||||||
|
# The date rewrite carries an explicit weight of -1.
weighted_date = date.DATE <-1>;

# Rewrite pipeline, composed in this order: percentage, date, number, hotfix.
export ITN = Optimize[
  percentage.PERCENTAGE @ weighted_date @ number.NUMBER @ hotfix.HOTFIX
];
|
||||||
|
|
@ -0,0 +1,61 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
|
||||||
|
# Single Chinese digits 1-9 mapped to Arabic digits. "幺" is accepted as an
# alternative spelling of 1 and "两" as an alternative spelling of 2.
number_1_to_9 = (
    ("一":"1") | ("幺":"1") | ("二":"2") | ("两":"2") | ("三":"3")
  | ("四":"4") | ("五":"5") | ("六":"6") | ("七":"7") | ("八":"8")
  | ("九":"9")
);

# Same as above, plus "零" -> "0".
export number_0_to_9 = (("零":"0") | number_1_to_9);

# 10-19 written with a bare leading "十" (e.g. "十五" -> "15").
number_10_to_19 = (
    ("十":"10")   | ("十一":"11") | ("十二":"12") | ("十三":"13")
  | ("十四":"14") | ("十五":"15") | ("十六":"16") | ("十七":"17")
  | ("十八":"18") | ("十九":"19")
);

# A leading digit followed by a place marker; the marker itself is deleted.
number_10s = (number_1_to_9 ("十":""));
number_100s = (number_1_to_9 ("百":""));
number_1000s = (number_1_to_9 ("千":""));
# Not referenced by any rule below.
number_10000s = (number_1_to_9 ("万":""));

# Two-digit numbers; each alternative carries its own weight.
number_10_to_99 = (
    ((number_10s number_1_to_9)<-0.3>)   # e.g. "二十三" -> "23"
  | ((number_10s ("":"0"))<-0.2>)        # e.g. "二十"   -> "20"
  | (number_10_to_19 <-0.1>)             # e.g. "十五"   -> "15"
);

export number_1_to_99 = (number_1_to_9 | number_10_to_99);

# Three-digit numbers.
number_100_to_999 = (
    ((number_100s ("零":"0") number_1_to_9)<0.0>)    # e.g. "一百零五"   -> "105"
  | ((number_100s number_10_to_99)<0.0>)             # e.g. "一百二十三" -> "123"
  | ((number_100s number_1_to_9 ("":"0"))<0.0>)      # e.g. "一百二"     -> "120"
  | ((number_100s ("":"00"))<0.1>)                   # e.g. "一百"       -> "100"
);

# Four-digit numbers.
number_1000_to_9999 = (
    ((number_1000s number_100_to_999)<0.0>)               # e.g. "一千二百三十四" -> "1234"
  | ((number_1000s ("零":"0") number_10_to_99)<0.0>)      # e.g. "一千零二十"     -> "1020"
  | ((number_1000s ("零":"00") number_1_to_9)<0.0>)       # e.g. "一千零五"       -> "1005"
  | ((number_1000s ("":"000"))<1>)                        # e.g. "一千"           -> "1000"
  | ((number_1000s number_1_to_9 ("":"00"))<0.0>)         # e.g. "一千二"         -> "1200"
);

# Any supported number from 1 to 9999; the longer forms get weights -1 and -2.
export number =
    number_1_to_99
  | (number_100_to_999 <-1>)
  | (number_1000_to_9999 <-2>);

# Rewrites numbers anywhere in the input (empty contexts, any byte string).
export NUMBER = CDRewrite[number, "", "", b.kBytes*];
|
||||||
|
|
@ -0,0 +1,8 @@
|
|||||||
|
import 'byte.grm' as b;
|
||||||
|
import 'number.grm' as n;
|
||||||
|
|
||||||
|
# "百分之" is deleted in front of the number and "%" is inserted after it,
# e.g. "百分之五十" -> "50%". Only numbers in the 1-99 range are covered.
drop_prefix = "百分之" : "";
add_percent_sign = "" : "%";
percentage = drop_prefix n.number_1_to_99 add_percent_sign;

# Rewrites percentages anywhere in the input (empty contexts).
export PERCENTAGE = CDRewrite[percentage, "", "", b.kBytes*];
|
@ -0,0 +1,6 @@
|
|||||||
|
# English covering grammar definitions
|
||||||
|
|
||||||
|
This directory defines an English text normalization covering grammar. The
|
||||||
|
primary entry-point is the FST `VERBALIZER`, defined in
|
||||||
|
`verbalizer/verbalizer.grm` and compiled in the FST archive
|
||||||
|
`verbalizer/verbalizer.far`.
|
@ -0,0 +1,3 @@
|
|||||||
|
verbalizer.far: verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
|
||||||
|
thraxcompiler --input_grammar=$< --output_far=$@
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Verbalization of one digit, built from CARDINAL_NUMBERS, plus an extra
# reading of "0" through the @@OTHER_ZERO_VERBALIZATIONS@@ marker.
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");

# Inserted space that separates consecutive verbalized groups.
gap = n.I[" "];

# A digit string read one digit at a time.
export DIGITS = digit (gap digit)*;

# Various common factorizations: groups of two or three digits, each read as
# a cardinal number.
pair = b.kDigit{2} @ n.CARDINAL_NUMBERS;

triple = b.kDigit{3} @ n.CARDINAL_NUMBERS;

# Supported groupings: 1+2, 2+2, 2+3 and 2+2+2 digits.
export MIXED_NUMBERS = Optimize[
    (digit gap pair)
  | (pair gap pair)
  | (pair gap triple)
  | (pair gap pair gap pair)
];
|
@ -0,0 +1,40 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'util/util.grm' as u;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
func ToNumberName[expr] {
|
||||||
|
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
|
||||||
|
return Optimize[expr @ number_name_seq];
|
||||||
|
}
|
||||||
|
|
||||||
|
d = b.kDigit;
|
||||||
|
|
||||||
|
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
|
||||||
|
|
||||||
|
by_ones = d n.I[" "];
|
||||||
|
by_twos = (d{2} @ leading_zero) n.I[" "];
|
||||||
|
by_threes = (d{3} @ leading_zero) n.I[" "];
|
||||||
|
|
||||||
|
groupings = by_twos* (by_threes | by_twos | by_ones);
|
||||||
|
|
||||||
|
export FRACTIONAL_PART_UNGROUPED =
|
||||||
|
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
|
||||||
|
;
|
||||||
|
export FRACTIONAL_PART_GROUPED =
|
||||||
|
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
|
||||||
|
;
|
||||||
|
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
|
@ -0,0 +1,30 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'en/verbalizer/factorization.grm' as f;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Local aliases for the fractional-part readings exported by
# factorization.grm. The grouped reading is aliased but is not part of
# __fractional_part__ below.
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;

__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
__decimal_marker__ = ".";

# The decimal marker is replaced by the @@DECIMAL_DOT_EXPRESSION@@ token,
# padded with a space on each side.
spoken_marker = __decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ";

# Cardinal integer part, spoken marker, then the fractional part.
float_reading = n.CARDINAL_NUMBERS spoken_marker __fractional_part__;

export FLOAT = Optimize[float_reading @ l.LEXICAL_MAP];
|
Binary file not shown.
@ -0,0 +1,25 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
|
||||||
|
# String pairs loaded from the lexical map table.
entries = StringFile['en/verbalizer/lexical_map.tsv'];

# Any byte string; used as the alphabet for both rewrites below.
any_bytes = b.kBytes*;

# Deletes every occurrence of the "__NULL__" placeholder.
strip_null = CDRewrite["__NULL__" : "", "", "", any_bytes];

# Applies the table entries anywhere in the input.
apply_entries = CDRewrite[entries, "", "", any_bytes];

# Table substitution first, then "__NULL__" removal.
export LEXICAL_MAP = Optimize[apply_entries @ strip_null];
|
Can't render this file because it has a wrong number of fields in line 37.
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'en/verbalizer/float.grm' as f;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# An operand is either a cardinal number or a float.
number = n.CARDINAL_NUMBERS | f.FLOAT;

# Binary operators; each symbol becomes a marker token with a space on
# both sides.
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
times = "*" : " @@ARITHMETIC_TIMES@@ ";
division = "/" : " @@ARITHMETIC_DIVISION@@ ";

operator = plus | times | minus | division;

# The percent sign becomes a marker token preceded by a space.
percent = "%" : " @@PERCENT@@";

# Two supported shapes: "number operator number" and "number%".
binary_expression = number operator number;
percent_expression = number percent;

export ARITHMETIC =
  Optimize[(binary_expression | percent_expression) @ l.LEXICAL_MAP]
;
|
@ -0,0 +1,78 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/classifier/cyrillic.grm' as c;
|
||||||
|
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
import 'en/verbalizer/spelled.grm' as s;
|
||||||
|
|
||||||
|
letter = b.kAlpha | c.kCyrillicAlpha;
|
||||||
|
dash = "-";
|
||||||
|
word = letter+;
|
||||||
|
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
|
||||||
|
|
||||||
|
post_word_symbol =
|
||||||
|
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
|
||||||
|
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
|
||||||
|
("*" : "@@STAR@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
pre_word_symbol =
|
||||||
|
("@" : "@@AT@@") |
|
||||||
|
("/" : "@@SLASH@@") |
|
||||||
|
("#" : "@@HASH@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
post_word = possibly_split_word n.I[" "] post_word_symbol;
|
||||||
|
|
||||||
|
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
|
||||||
|
|
||||||
|
## Number/digit sequence combos, maybe with a dash
|
||||||
|
|
||||||
|
spelled_word = word @ s.SPELLED_NO_LETTER;
|
||||||
|
|
||||||
|
word_number =
|
||||||
|
(word | spelled_word)
|
||||||
|
(n.I[" "] | (dash : " "))
|
||||||
|
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||||
|
;
|
||||||
|
|
||||||
|
number_word =
|
||||||
|
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||||
|
(n.I[" "] | (dash : " "))
|
||||||
|
(word | spelled_word)
|
||||||
|
;
|
||||||
|
|
||||||
|
## Two-digit year.
|
||||||
|
|
||||||
|
# Note that in this case to be fair we really have to allow ordinals too since
|
||||||
|
# in some languages that's what you would have.
|
||||||
|
|
||||||
|
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
|
||||||
|
|
||||||
|
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
|
||||||
|
|
||||||
|
miscellaneous = Optimize[
|
||||||
|
possibly_split_word
|
||||||
|
| post_word
|
||||||
|
| pre_word
|
||||||
|
| word_number
|
||||||
|
| number_word
|
||||||
|
| two_digit_year
|
||||||
|
| dot_com
|
||||||
|
];
|
||||||
|
|
||||||
|
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
|
@ -0,0 +1,44 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
card = n.CARDINAL_NUMBERS;

__currency__ = StringFile['en/verbalizer/money.tsv'];

any_digit = b.kDigit;
nonzero_digit = any_digit - "0";

# Two-digit cents read as a cardinal; a leading "0" is deleted first.
cents = ((n.D["0"] | nonzero_digit) any_digit) @ card;

# Only dollar for the verbalizer tests for English. Will need to add other
# currencies.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
usd_min = Project["usd_min" @ __currency__, 'output'];

# Connector inserted between the major and minor amounts: either the
# @@MONEY_AND@@ marker or a plain space.
connector = " @@MONEY_AND@@ " | " ";

# Shared prefix: "$" is deleted, the amount is read as a cardinal, and the
# major-unit name is inserted after a space.
whole_dollars = n.D["$"] card n.I[" " usd_maj];

# "$D.CC": whole dollars, connector, cents, then the minor-unit name.
with_cents = whole_dollars n.I[connector] n.D["."] cents n.I[" " usd_min];

# "$D.00": the ".00" is deleted.
zero_cents = whole_dollars n.D["."] n.D["00"];

dollar = Optimize[with_cents | zero_cents | whole_dollars];

export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
|
|
@ -0,0 +1,54 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# English minimally supervised number grammar.
|
||||||
|
#
|
||||||
|
# Supports both cardinals and ordinals without overt marking.
|
||||||
|
#
|
||||||
|
# The language-specific acceptor G was compiled with digit, teen, and decade
|
||||||
|
# preterminals. The lexicon transducer L is unambiguous so no LM is used.
|
||||||
|
|
||||||
|
import 'util/arithmetic.grm' as a;
|
||||||
|
|
||||||
|
# Intersects the universal factorization transducer (F) with the
|
||||||
|
# language-specific acceptor (G).
|
||||||
|
|
||||||
|
d = a.DELTA_STAR;
|
||||||
|
f = a.IARITHMETIC_RESTRICTED;
|
||||||
|
g = LoadFst['en/verbalizer/g.fst'];
|
||||||
|
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
|
||||||
|
test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"];
|
||||||
|
|
||||||
|
# Compiles lexicon transducer (L).
|
||||||
|
|
||||||
|
cardinal_name = StringFile['en/verbalizer/cardinals.tsv'];
|
||||||
|
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
|
||||||
|
test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"];
|
||||||
|
|
||||||
|
ordinal_name = StringFile['en/verbalizer/ordinals.tsv'];
|
||||||
|
# In English, ordinals have the same syntax as cardinals and all but the final
|
||||||
|
# element is verbalized using a cardinal number word; e.g., "two hundred
|
||||||
|
# thirtieth".
|
||||||
|
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
|
||||||
|
test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"];
|
||||||
|
|
||||||
|
# Composes L with the leaf transducer (P), then composes that with FG.
|
||||||
|
|
||||||
|
p = a.LEAVES;
|
||||||
|
|
||||||
|
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
|
||||||
|
test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"];
|
||||||
|
|
||||||
|
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
|
||||||
|
test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"];
|
@ -0,0 +1,57 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'en/verbalizer/number_names.grm' as n;
|
||||||
|
import 'util/byte.grm' as bytelib;
|
||||||
|
import 'universal/thousands_punct.grm' as t;
|
||||||
|
|
||||||
|
cardinal = n.CARDINAL_NUMBER_NAME;
|
||||||
|
ordinal = n.ORDINAL_NUMBER_NAME;
|
||||||
|
|
||||||
|
# Putting these here since this grammar gets incorporated by all the others.
|
||||||
|
|
||||||
|
func I[expr] {
|
||||||
|
return "" : expr;
|
||||||
|
}
|
||||||
|
|
||||||
|
func D[expr] {
|
||||||
|
return expr : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
separators = t.comma_thousands | t.no_delimiter;
|
||||||
|
|
||||||
|
# Language specific endings for ordinals.
|
||||||
|
d = bytelib.kDigit;
|
||||||
|
endings = "st" | "nd" | "rd" | "th";
|
||||||
|
|
||||||
|
st = (d* "1") - (d* "11");
|
||||||
|
nd = (d* "2") - (d* "12");
|
||||||
|
rd = (d* "3") - (d* "13");
|
||||||
|
th = Optimize[d* - st - nd - rd];
|
||||||
|
first = st ("st" : "");
|
||||||
|
second = nd ("nd" : "");
|
||||||
|
third = rd ("rd" : "");
|
||||||
|
other = th ("th" : "");
|
||||||
|
marked_ordinal = Optimize[first | second | third | other];
|
||||||
|
|
||||||
|
# The separator is a no-op here but will be needed once we replace
|
||||||
|
# the above targets.
|
||||||
|
|
||||||
|
export CARDINAL_NUMBERS = Optimize[separators @ cardinal];
|
||||||
|
|
||||||
|
export ORDINAL_NUMBERS =
|
||||||
|
Optimize[(separators endings) @ marked_ordinal @ ordinal]
|
||||||
|
;
|
||||||
|
|
||||||
|
export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal];
|
@ -0,0 +1,133 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# Grammar for things built mostly on numbers.
|
||||||
|
|
||||||
|
import 'en/verbalizer/factorization.grm' as f;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
num = n.CARDINAL_NUMBERS;
|
||||||
|
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
||||||
|
digits = f.FRACTIONAL_PART_UNGROUPED;
|
||||||
|
|
||||||
|
# Various symbols.
|
||||||
|
|
||||||
|
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
||||||
|
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
||||||
|
slash = "/" : "@@SLASH@@";
|
||||||
|
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||||
|
dash = "-" : "@@DASH@@";
|
||||||
|
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
||||||
|
|
||||||
|
degree = "°" : "@@DEGREE@@";
|
||||||
|
|
||||||
|
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
||||||
|
|
||||||
|
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
||||||
|
|
||||||
|
power = "^" : "@@DECIMAL_EXPONENT@@";
|
||||||
|
|
||||||
|
square_root = "√" : "@@SQUARE_ROOT@@";
|
||||||
|
|
||||||
|
percent = "%" : "@@PERCENT@@";
|
||||||
|
|
||||||
|
# Safe roman numbers.
|
||||||
|
|
||||||
|
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
||||||
|
# line as the path.
|
||||||
|
rfile =
|
||||||
|
'universal/roman_numerals.tsv' # NO_EDIT
|
||||||
|
;
|
||||||
|
|
||||||
|
roman = StringFile[rfile];
|
||||||
|
|
||||||
|
## Main categories.
|
||||||
|
|
||||||
|
cat_dot_number =
|
||||||
|
num
|
||||||
|
n.I[" "] dot n.I[" "] num
|
||||||
|
(n.I[" "] dot n.I[" "] num)+
|
||||||
|
;
|
||||||
|
|
||||||
|
cat_slash_number =
|
||||||
|
num
|
||||||
|
n.I[" "] slash n.I[" "] num
|
||||||
|
(n.I[" "] slash n.I[" "] num)*
|
||||||
|
;
|
||||||
|
|
||||||
|
cat_dash_number =
|
||||||
|
num
|
||||||
|
n.I[" "] dash n.I[" "] num
|
||||||
|
(n.I[" "] dash n.I[" "] num)*
|
||||||
|
;
|
||||||
|
|
||||||
|
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
||||||
|
|
||||||
|
cat_degree = cat_signed_number n.I[" "] degree;
|
||||||
|
|
||||||
|
cat_country_code = plus n.I[" "] (num | digits);
|
||||||
|
|
||||||
|
cat_math_operations =
|
||||||
|
plus
|
||||||
|
| minus
|
||||||
|
| division
|
||||||
|
| times
|
||||||
|
| equals
|
||||||
|
| percent
|
||||||
|
| power
|
||||||
|
| square_root
|
||||||
|
;
|
||||||
|
|
||||||
|
# Roman numbers are often either cardinals or ordinals in various languages.
|
||||||
|
cat_roman = roman @ (num | ord);
|
||||||
|
|
||||||
|
# Allow
|
||||||
|
#
|
||||||
|
# number:number
|
||||||
|
# number-number
|
||||||
|
#
|
||||||
|
# to just be
|
||||||
|
#
|
||||||
|
# number number.
|
||||||
|
|
||||||
|
cat_number_number =
|
||||||
|
num ((":" | "-") : " ") num
|
||||||
|
;
|
||||||
|
|
||||||
|
# Some additional readings for these symbols.
|
||||||
|
|
||||||
|
cat_additional_readings =
|
||||||
|
("/" : "@@PER@@") |
|
||||||
|
("+" : "@@AND@@") |
|
||||||
|
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
||||||
|
("*" : "@@STAR@@") |
|
||||||
|
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
||||||
|
("@" : "@@AT@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
numbers_plus = Optimize[
|
||||||
|
cat_dot_number
|
||||||
|
| cat_slash_number
|
||||||
|
| cat_dash_number
|
||||||
|
| cat_signed_number
|
||||||
|
| cat_degree
|
||||||
|
| cat_country_code
|
||||||
|
| cat_math_operations
|
||||||
|
| cat_roman
|
||||||
|
| cat_number_number
|
||||||
|
| cat_additional_readings
|
||||||
|
];
|
||||||
|
|
||||||
|
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
Can't render this file because it contains an unexpected character in line 5 and column 20.
|
@ -0,0 +1,46 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/util.grm' as util;
|
||||||
|
import 'util/case.grm' as case;
|
||||||
|
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||||
|
import 'en/verbalizer/float.grm' as f;
|
||||||
|
import 'en/verbalizer/math.grm' as ma;
|
||||||
|
import 'en/verbalizer/miscellaneous.grm' as mi;
|
||||||
|
import 'en/verbalizer/money.grm' as mo;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
import 'en/verbalizer/numbers_plus.grm' as np;
|
||||||
|
import 'en/verbalizer/spelled.grm' as s;
|
||||||
|
import 'en/verbalizer/spoken_punct.grm' as sp;
|
||||||
|
import 'en/verbalizer/time.grm' as t;
|
||||||
|
import 'en/verbalizer/urls.grm' as u;
|
||||||
|
|
||||||
|
# Top-level verbalizer: the union of all category grammars, composed with
# util.CLEAN_SPACES and then case.TOUPPER, with weights removed and the
# result optimized.
# NOTE(review): CLEAN_SPACES and TOUPPER live in util/util.grm and
# util/case.grm, which are not shown here; their behaviour is presumed from
# their names.
# u.URL used to be listed twice in the union; the redundant second operand
# has been dropped, which leaves the accepted relation unchanged.
export POD_SPEECH_TN = Optimize[RmWeight[
  (u.URL
  | e.MIXED_NUMBERS
  | e.DIGITS
  | f.FLOAT
  | ma.ARITHMETIC
  | mo.MONEY
  | n.CARDINAL_NUMBERS
  | n.ORDINAL_NUMBERS
  | np.NUMBERS_PLUS
  | s.SPELLED
  | sp.SPOKEN_PUNCT
  | t.TIME
  | u.EMAILS) @ util.CLEAN_SPACES @ case.TOUPPER
]];
|
||||||
|
|
||||||
|
#export POD_SPEECH_TN = Optimize[RmWeight[(mi.MISCELLANEOUS) @ util.CLEAN_SPACES @ case.TOUPPER]];
|
@ -0,0 +1,77 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# This verbalizer is used whenever there is an LM symbol that consists of
|
||||||
|
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
||||||
|
# suffix.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/classifier/cyrillic.grm' as c;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# A single digit, verbalized by composing it with the cardinal-number grammar.
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
char_set = (("a" | "A") : "letter-a")
|
||||||
|
| (("b" | "B") : "letter-b")
|
||||||
|
| (("c" | "C") : "letter-c")
|
||||||
|
| (("d" | "D") : "letter-d")
|
||||||
|
| (("e" | "E") : "letter-e")
|
||||||
|
| (("f" | "F") : "letter-f")
|
||||||
|
| (("g" | "G") : "letter-g")
|
||||||
|
| (("h" | "H") : "letter-h")
|
||||||
|
| (("i" | "I") : "letter-i")
|
||||||
|
| (("j" | "J") : "letter-j")
|
||||||
|
| (("k" | "K") : "letter-k")
|
||||||
|
| (("l" | "L") : "letter-l")
|
||||||
|
| (("m" | "M") : "letter-m")
|
||||||
|
| (("n" | "N") : "letter-n")
|
||||||
|
| (("o" | "O") : "letter-o")
|
||||||
|
| (("p" | "P") : "letter-p")
|
||||||
|
| (("q" | "Q") : "letter-q")
|
||||||
|
| (("r" | "R") : "letter-r")
|
||||||
|
| (("s" | "S") : "letter-s")
|
||||||
|
| (("t" | "T") : "letter-t")
|
||||||
|
| (("u" | "U") : "letter-u")
|
||||||
|
| (("v" | "V") : "letter-v")
|
||||||
|
| (("w" | "W") : "letter-w")
|
||||||
|
| (("x" | "X") : "letter-x")
|
||||||
|
| (("y" | "Y") : "letter-y")
|
||||||
|
| (("z" | "Z") : "letter-z")
|
||||||
|
| (digit)
|
||||||
|
| ("&" : "@@AND@@")
|
||||||
|
| ("." : "")
|
||||||
|
| ("-" : "")
|
||||||
|
| ("_" : "")
|
||||||
|
| ("/" : "")
|
||||||
|
| (n.I["letter-"] c.kCyrillicAlpha)
|
||||||
|
;
|
||||||
|
|
||||||
|
ins_space = "" : " ";
|
||||||
|
|
||||||
|
suffix = "{spelled}" : "";
|
||||||
|
|
||||||
|
# One or more char_set items with a space inserted between neighbours,
# followed by the "{spelled}" tag, which `suffix` deletes.
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
||||||
|
|
||||||
|
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
||||||
|
|
||||||
|
sigma_star = b.kBytes*;
|
||||||
|
|
||||||
|
# Gets rid of the letter- prefix since in some cases we don't want it.
|
||||||
|
|
||||||
|
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
||||||
|
|
||||||
|
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
||||||
|
|
||||||
|
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
@ -0,0 +1,24 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
|
||||||
|
punct =
|
||||||
|
("." : "@@PERIOD@@")
|
||||||
|
| ("," : "@@COMMA@@")
|
||||||
|
| ("!" : "@@EXCLAMATION_MARK@@")
|
||||||
|
| ("?" : "@@QUESTION_MARK@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
|
@ -0,0 +1,108 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
||||||
|
|
||||||
|
increment_hour =
|
||||||
|
("0" : "1")
|
||||||
|
| ("1" : "2")
|
||||||
|
| ("2" : "3")
|
||||||
|
| ("3" : "4")
|
||||||
|
| ("4" : "5")
|
||||||
|
| ("5" : "6")
|
||||||
|
| ("6" : "7")
|
||||||
|
| ("7" : "8")
|
||||||
|
| ("8" : "9")
|
||||||
|
| ("9" : "10")
|
||||||
|
| ("10" : "11")
|
||||||
|
| ("11" : "12")
|
||||||
|
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
||||||
|
| ("13" : "14")
|
||||||
|
| ("14" : "15")
|
||||||
|
| ("15" : "16")
|
||||||
|
| ("16" : "17")
|
||||||
|
| ("17" : "18")
|
||||||
|
| ("18" : "19")
|
||||||
|
| ("19" : "20")
|
||||||
|
| ("20" : "21")
|
||||||
|
| ("21" : "22")
|
||||||
|
| ("22" : "23")
|
||||||
|
| ("23" : "12")
|
||||||
|
;
|
||||||
|
|
||||||
|
# The accepted hour strings: the input side of increment_hour ("0" to "23").
hours = Project[increment_hour, 'input'];
|
||||||
|
|
||||||
|
d = b.kDigit;
|
||||||
|
# Any digit accepted by `d` except "0".
D = d - "0";
|
||||||
|
|
||||||
|
minutes09 = "0" D;
|
||||||
|
|
||||||
|
# Two-character minutes whose first character is "1" to "5" followed by any
# digit, i.e. "10" through "59". "00" and "01"-"09" are handled separately.
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
||||||
|
|
||||||
|
__sep__ = ":";
|
||||||
|
sep_space = __sep__ : " ";
|
||||||
|
|
||||||
|
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
verbalize_minutes =
|
||||||
|
("00" : "@@HOUR@@")
|
||||||
|
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
||||||
|
| (minutes @ n.CARDINAL_NUMBERS)
|
||||||
|
;
|
||||||
|
|
||||||
|
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
||||||
|
|
||||||
|
# Special cases we handle right now.
|
||||||
|
# TODO: Need to allow for cases like
|
||||||
|
#
|
||||||
|
# half twelve (in the UK English sense)
|
||||||
|
# half twaalf (in the Dutch sense)
|
||||||
|
|
||||||
|
time_quarter_past =
|
||||||
|
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
||||||
|
verbalize_hours
|
||||||
|
n.D[__sep__ "15"];
|
||||||
|
|
||||||
|
time_half_past =
|
||||||
|
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
||||||
|
verbalize_hours
|
||||||
|
n.D[__sep__ "30"];
|
||||||
|
|
||||||
|
time_quarter_to =
|
||||||
|
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
||||||
|
(increment_hour @ verbalize_hours)
|
||||||
|
n.D[__sep__ "45"];
|
||||||
|
|
||||||
|
# Union of the three special-case readings defined above: quarter past,
# half past and quarter to.
time_extra =
  Optimize[time_quarter_past | time_half_past | time_quarter_to];
|
||||||
|
|
||||||
|
# Basic time periods which most languages can be expected to have.
|
||||||
|
__am__ = "a.m." | "am" | "AM";
|
||||||
|
__pm__ = "p.m." | "pm" | "PM";
|
||||||
|
|
||||||
|
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
||||||
|
|
||||||
|
time_variants = time_basic | time_extra;
|
||||||
|
|
||||||
|
time = Optimize[
|
||||||
|
(period (" " | n.I[" "]))? time_variants
|
||||||
|
| time_variants ((" " | n.I[" "]) period)?]
|
||||||
|
;
|
||||||
|
|
||||||
|
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
@ -0,0 +1,68 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Rules for URLs and email addresses.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as bytelib;
|
||||||
|
import 'en/verbalizer/lexical_map.grm' as l;
|
||||||
|
|
||||||
|
ins_space = "" : " ";
|
||||||
|
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||||
|
at = "@" : "@@AT@@";
|
||||||
|
|
||||||
|
url_suffix =
|
||||||
|
(".com" : dot ins_space "com") |
|
||||||
|
(".gov" : dot ins_space "gov") |
|
||||||
|
(".edu" : dot ins_space "e d u") |
|
||||||
|
(".org" : dot ins_space "org") |
|
||||||
|
(".net" : dot ins_space "net")
|
||||||
|
;
|
||||||
|
|
||||||
|
# A non-empty run of bytelib.kAlnum symbols, passed through unchanged.
letter_string = bytelib.kAlnum+;
|
||||||
|
|
||||||
|
letter_string_dot =
|
||||||
|
((letter_string ins_space dot ins_space)* letter_string)
|
||||||
|
;
|
||||||
|
|
||||||
|
# Rules for URLs.
|
||||||
|
# URL verbalizer: a dot-separated name, an inserted space and one of the
# known suffixes from url_suffix, all composed with the lexical map.
export URL = Optimize[
  (letter_string_dot ins_space url_suffix) @ l.LEXICAL_MAP
];
|
||||||
|
|
||||||
|
# Rules for email addresses.
|
||||||
|
# A non-empty run of bytelib.kAlnum symbols read one at a time: a space is
# inserted after every symbol except the last.
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
||||||
|
|
||||||
|
letter_by_letter_dot =
|
||||||
|
((letter_by_letter ins_space dot ins_space)*
|
||||||
|
letter_by_letter)
|
||||||
|
;
|
||||||
|
|
||||||
|
export EMAIL1 = Optimize[
|
||||||
|
((letter_by_letter) (ins_space)
|
||||||
|
(at) (ins_space)
|
||||||
|
(letter_by_letter_dot) (ins_space)
|
||||||
|
(url_suffix)) @ l.LEXICAL_MAP
|
||||||
|
];
|
||||||
|
|
||||||
|
export EMAIL2 = Optimize[
|
||||||
|
((letter_by_letter) (ins_space)
|
||||||
|
(at) (ins_space)
|
||||||
|
(letter_string_dot) (ins_space)
|
||||||
|
(url_suffix)) @ l.LEXICAL_MAP
|
||||||
|
];
|
||||||
|
|
||||||
|
# Either email reading: EMAIL1 or EMAIL2 as defined above.
export EMAILS = Optimize[EMAIL1 | EMAIL2];
|
@ -0,0 +1,42 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/util.grm' as util;
|
||||||
|
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||||
|
import 'en/verbalizer/float.grm' as f;
|
||||||
|
import 'en/verbalizer/math.grm' as ma;
|
||||||
|
import 'en/verbalizer/miscellaneous.grm' as mi;
|
||||||
|
import 'en/verbalizer/money.grm' as mo;
|
||||||
|
import 'en/verbalizer/numbers.grm' as n;
|
||||||
|
import 'en/verbalizer/numbers_plus.grm' as np;
|
||||||
|
import 'en/verbalizer/spelled.grm' as s;
|
||||||
|
import 'en/verbalizer/spoken_punct.grm' as sp;
|
||||||
|
import 'en/verbalizer/time.grm' as t;
|
||||||
|
import 'en/verbalizer/urls.grm' as u;
|
||||||
|
|
||||||
|
export VERBALIZER = Optimize[RmWeight[
|
||||||
|
( e.MIXED_NUMBERS
|
||||||
|
| e.DIGITS
|
||||||
|
| f.FLOAT
|
||||||
|
| ma.ARITHMETIC
|
||||||
|
| mi.MISCELLANEOUS
|
||||||
|
| mo.MONEY
|
||||||
|
| n.CARDINAL_NUMBERS
|
||||||
|
| n.ORDINAL_NUMBERS
|
||||||
|
| np.NUMBERS_PLUS
|
||||||
|
| s.SPELLED
|
||||||
|
| sp.SPOKEN_PUNCT
|
||||||
|
| t.TIME
|
||||||
|
| u.URL) @ util.CLEAN_SPACES
|
||||||
|
]];
|
@ -0,0 +1,17 @@
|
|||||||
|
This directory contains data used in:
|
||||||
|
|
||||||
|
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
|
||||||
|
Transactions of the Association for Computational Linguistics 4: 507-519.
|
||||||
|
|
||||||
|
* `minimal.txt`: A list of 30 curated numbers used as the "minimal" training
|
||||||
|
set.
|
||||||
|
* `random-trn.txt`: A list of 9000 randomly-generated numbers used as the
|
||||||
|
"medium" training set.
|
||||||
|
* `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test
|
||||||
|
set.
|
||||||
|
|
||||||
|
Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that
|
||||||
|
a small number of examples occur both in `minimal.txt` and `random-tst.txt`.
|
||||||
|
|
||||||
|
For information about the sampling procedure used to generate the random data
|
||||||
|
sets, see appendix A of the aforementioned paper.
|
@ -0,0 +1,300 @@
|
|||||||
|
0
|
||||||
|
1
|
||||||
|
2
|
||||||
|
3
|
||||||
|
4
|
||||||
|
5
|
||||||
|
6
|
||||||
|
7
|
||||||
|
8
|
||||||
|
9
|
||||||
|
10
|
||||||
|
11
|
||||||
|
12
|
||||||
|
13
|
||||||
|
14
|
||||||
|
15
|
||||||
|
16
|
||||||
|
17
|
||||||
|
18
|
||||||
|
19
|
||||||
|
20
|
||||||
|
21
|
||||||
|
22
|
||||||
|
23
|
||||||
|
24
|
||||||
|
25
|
||||||
|
26
|
||||||
|
27
|
||||||
|
28
|
||||||
|
29
|
||||||
|
30
|
||||||
|
31
|
||||||
|
32
|
||||||
|
33
|
||||||
|
34
|
||||||
|
35
|
||||||
|
36
|
||||||
|
37
|
||||||
|
38
|
||||||
|
39
|
||||||
|
40
|
||||||
|
41
|
||||||
|
42
|
||||||
|
43
|
||||||
|
44
|
||||||
|
45
|
||||||
|
46
|
||||||
|
47
|
||||||
|
48
|
||||||
|
49
|
||||||
|
50
|
||||||
|
51
|
||||||
|
52
|
||||||
|
53
|
||||||
|
54
|
||||||
|
55
|
||||||
|
56
|
||||||
|
57
|
||||||
|
58
|
||||||
|
59
|
||||||
|
60
|
||||||
|
61
|
||||||
|
62
|
||||||
|
63
|
||||||
|
64
|
||||||
|
65
|
||||||
|
66
|
||||||
|
67
|
||||||
|
68
|
||||||
|
69
|
||||||
|
70
|
||||||
|
71
|
||||||
|
72
|
||||||
|
73
|
||||||
|
74
|
||||||
|
75
|
||||||
|
76
|
||||||
|
77
|
||||||
|
78
|
||||||
|
79
|
||||||
|
80
|
||||||
|
81
|
||||||
|
82
|
||||||
|
83
|
||||||
|
84
|
||||||
|
85
|
||||||
|
86
|
||||||
|
87
|
||||||
|
88
|
||||||
|
89
|
||||||
|
90
|
||||||
|
91
|
||||||
|
92
|
||||||
|
93
|
||||||
|
94
|
||||||
|
95
|
||||||
|
96
|
||||||
|
97
|
||||||
|
98
|
||||||
|
99
|
||||||
|
100
|
||||||
|
101
|
||||||
|
102
|
||||||
|
103
|
||||||
|
104
|
||||||
|
105
|
||||||
|
106
|
||||||
|
107
|
||||||
|
108
|
||||||
|
109
|
||||||
|
110
|
||||||
|
111
|
||||||
|
112
|
||||||
|
113
|
||||||
|
114
|
||||||
|
115
|
||||||
|
116
|
||||||
|
117
|
||||||
|
118
|
||||||
|
119
|
||||||
|
120
|
||||||
|
121
|
||||||
|
122
|
||||||
|
123
|
||||||
|
124
|
||||||
|
125
|
||||||
|
126
|
||||||
|
127
|
||||||
|
128
|
||||||
|
129
|
||||||
|
130
|
||||||
|
131
|
||||||
|
132
|
||||||
|
133
|
||||||
|
134
|
||||||
|
135
|
||||||
|
136
|
||||||
|
137
|
||||||
|
138
|
||||||
|
139
|
||||||
|
140
|
||||||
|
141
|
||||||
|
142
|
||||||
|
143
|
||||||
|
144
|
||||||
|
145
|
||||||
|
146
|
||||||
|
147
|
||||||
|
148
|
||||||
|
149
|
||||||
|
150
|
||||||
|
151
|
||||||
|
152
|
||||||
|
153
|
||||||
|
154
|
||||||
|
155
|
||||||
|
156
|
||||||
|
157
|
||||||
|
158
|
||||||
|
159
|
||||||
|
160
|
||||||
|
161
|
||||||
|
162
|
||||||
|
163
|
||||||
|
164
|
||||||
|
165
|
||||||
|
166
|
||||||
|
167
|
||||||
|
168
|
||||||
|
169
|
||||||
|
170
|
||||||
|
171
|
||||||
|
172
|
||||||
|
173
|
||||||
|
174
|
||||||
|
175
|
||||||
|
176
|
||||||
|
177
|
||||||
|
178
|
||||||
|
179
|
||||||
|
180
|
||||||
|
181
|
||||||
|
182
|
||||||
|
183
|
||||||
|
184
|
||||||
|
185
|
||||||
|
186
|
||||||
|
187
|
||||||
|
188
|
||||||
|
189
|
||||||
|
190
|
||||||
|
191
|
||||||
|
192
|
||||||
|
193
|
||||||
|
194
|
||||||
|
195
|
||||||
|
196
|
||||||
|
197
|
||||||
|
198
|
||||||
|
199
|
||||||
|
200
|
||||||
|
201
|
||||||
|
202
|
||||||
|
203
|
||||||
|
204
|
||||||
|
205
|
||||||
|
206
|
||||||
|
207
|
||||||
|
208
|
||||||
|
209
|
||||||
|
210
|
||||||
|
211
|
||||||
|
212
|
||||||
|
220
|
||||||
|
221
|
||||||
|
230
|
||||||
|
300
|
||||||
|
400
|
||||||
|
500
|
||||||
|
600
|
||||||
|
700
|
||||||
|
800
|
||||||
|
900
|
||||||
|
1000
|
||||||
|
1001
|
||||||
|
1002
|
||||||
|
1003
|
||||||
|
1004
|
||||||
|
1005
|
||||||
|
1006
|
||||||
|
1007
|
||||||
|
1008
|
||||||
|
1009
|
||||||
|
1010
|
||||||
|
1011
|
||||||
|
1012
|
||||||
|
1020
|
||||||
|
1021
|
||||||
|
1030
|
||||||
|
1200
|
||||||
|
2000
|
||||||
|
2001
|
||||||
|
2002
|
||||||
|
2003
|
||||||
|
2004
|
||||||
|
2005
|
||||||
|
2006
|
||||||
|
2007
|
||||||
|
2008
|
||||||
|
2009
|
||||||
|
2010
|
||||||
|
2011
|
||||||
|
2012
|
||||||
|
2020
|
||||||
|
2021
|
||||||
|
2030
|
||||||
|
2100
|
||||||
|
2200
|
||||||
|
5001
|
||||||
|
10000
|
||||||
|
12000
|
||||||
|
20000
|
||||||
|
21000
|
||||||
|
50001
|
||||||
|
100000
|
||||||
|
120000
|
||||||
|
200000
|
||||||
|
210000
|
||||||
|
500001
|
||||||
|
1000000
|
||||||
|
1001000
|
||||||
|
1200000
|
||||||
|
2000000
|
||||||
|
2100000
|
||||||
|
5000001
|
||||||
|
10000000
|
||||||
|
10001000
|
||||||
|
12000000
|
||||||
|
20000000
|
||||||
|
50000001
|
||||||
|
100000000
|
||||||
|
100001000
|
||||||
|
120000000
|
||||||
|
200000000
|
||||||
|
500000001
|
||||||
|
1000000000
|
||||||
|
1000001000
|
||||||
|
1200000000
|
||||||
|
2000000000
|
||||||
|
5000000001
|
||||||
|
10000000000
|
||||||
|
10000001000
|
||||||
|
12000000000
|
||||||
|
20000000000
|
||||||
|
50000000001
|
||||||
|
100000000000
|
||||||
|
100000001000
|
||||||
|
120000000000
|
||||||
|
200000000000
|
||||||
|
500000000001
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,6 @@
|
|||||||
|
# Russian covering grammar definitions
|
||||||
|
|
||||||
|
This directory defines a Russian text normalization covering grammar. The
|
||||||
|
primary entry-point is the FST `VERBALIZER`, defined in
|
||||||
|
`verbalizer/verbalizer.grm` and compiled in the FST archive
|
||||||
|
`verbalizer/verbalizer.far`.
|
@ -0,0 +1,338 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
|
||||||
|
# Utilities for insertion and deletion.
|
||||||
|
|
||||||
|
func I[expr] {
|
||||||
|
return "" : expr;
|
||||||
|
}
|
||||||
|
|
||||||
|
func D[expr] {
|
||||||
|
return expr : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
# Powers of base 10.
|
||||||
|
export POWERS =
|
||||||
|
"[E15]"
|
||||||
|
| "[E14]"
|
||||||
|
| "[E13]"
|
||||||
|
| "[E12]"
|
||||||
|
| "[E11]"
|
||||||
|
| "[E10]"
|
||||||
|
| "[E9]"
|
||||||
|
| "[E8]"
|
||||||
|
| "[E7]"
|
||||||
|
| "[E6]"
|
||||||
|
| "[E5]"
|
||||||
|
| "[E4]"
|
||||||
|
| "[E3]"
|
||||||
|
| "[E2]"
|
||||||
|
| "[E1]"
|
||||||
|
;
|
||||||
|
|
||||||
|
export SIGMA = b.kBytes | POWERS;
|
||||||
|
|
||||||
|
export SIGMA_STAR = SIGMA*;
|
||||||
|
|
||||||
|
export SIGMA_PLUS = SIGMA+;
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# BEGIN LANGUAGE SPECIFIC DATA
|
||||||
|
revaluations =
|
||||||
|
("[E4]" : "[E1]")
|
||||||
|
| ("[E5]" : "[E2]")
|
||||||
|
| ("[E7]" : "[E1]")
|
||||||
|
| ("[E8]" : "[E2]")
|
||||||
|
;
|
||||||
|
|
||||||
|
Ms = "[E3]" | "[E6]" | "[E9]";
|
||||||
|
|
||||||
|
|
||||||
|
func Zero[expr] {
|
||||||
|
return expr : ("");
|
||||||
|
}
|
||||||
|
|
||||||
|
space = " ";
|
||||||
|
|
||||||
|
lexset3 = Optimize[
|
||||||
|
("1[E1]+1" : "одиннадцати")
|
||||||
|
| ("1[E1]+1" : "одиннадцать")
|
||||||
|
| ("1[E1]+1" : "одиннадцатью")
|
||||||
|
| ("1[E1]+2" : "двенадцати")
|
||||||
|
| ("1[E1]+2" : "двенадцать")
|
||||||
|
| ("1[E1]+2" : "двенадцатью")
|
||||||
|
| ("1[E1]+3" : "тринадцати")
|
||||||
|
| ("1[E1]+3" : "тринадцать")
|
||||||
|
| ("1[E1]+3" : "тринадцатью")
|
||||||
|
| ("1[E1]+4" : "четырнадцати")
|
||||||
|
| ("1[E1]+4" : "четырнадцать")
|
||||||
|
| ("1[E1]+4" : "четырнадцатью")
|
||||||
|
| ("1[E1]+5" : "пятнадцати")
|
||||||
|
| ("1[E1]+5" : "пятнадцать")
|
||||||
|
| ("1[E1]+5" : "пятнадцатью")
|
||||||
|
| ("1[E1]+6" : "шестнадцати")
|
||||||
|
| ("1[E1]+6" : "шестнадцать")
|
||||||
|
| ("1[E1]+6" : "шестнадцатью")
|
||||||
|
| ("1[E1]+7" : "семнадцати")
|
||||||
|
| ("1[E1]+7" : "семнадцать")
|
||||||
|
| ("1[E1]+7" : "семнадцатью")
|
||||||
|
| ("1[E1]+8" : "восемнадцати")
|
||||||
|
| ("1[E1]+8" : "восемнадцать")
|
||||||
|
| ("1[E1]+8" : "восемнадцатью")
|
||||||
|
| ("1[E1]+9" : "девятнадцати")
|
||||||
|
| ("1[E1]+9" : "девятнадцать")
|
||||||
|
| ("1[E1]+9" : "девятнадцатью")]
|
||||||
|
;
|
||||||
|
|
||||||
|
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
lexset2 = Optimize[
|
||||||
|
("1[E1]" : "десяти")
|
||||||
|
| ("1[E1]" : "десять")
|
||||||
|
| ("1[E1]" : "десятью")
|
||||||
|
| ("1[E2]" : "ста")
|
||||||
|
| ("1[E2]" : "сто")
|
||||||
|
| ("2[E1]" : "двадцати")
|
||||||
|
| ("2[E1]" : "двадцать")
|
||||||
|
| ("2[E1]" : "двадцатью")
|
||||||
|
| ("2[E2]" : "двести")
|
||||||
|
| ("2[E2]" : "двумстам")
|
||||||
|
| ("2[E2]" : "двумястами")
|
||||||
|
| ("2[E2]" : "двухсот")
|
||||||
|
| ("2[E2]" : "двухстах")
|
||||||
|
| ("3[E1]" : "тридцати")
|
||||||
|
| ("3[E1]" : "тридцать")
|
||||||
|
| ("3[E1]" : "тридцатью")
|
||||||
|
| ("3[E2]" : "тремстам")
|
||||||
|
| ("3[E2]" : "тремястами")
|
||||||
|
| ("3[E2]" : "трехсот")
|
||||||
|
| ("3[E2]" : "трехстах")
|
||||||
|
| ("3[E2]" : "триста")
|
||||||
|
| ("4[E1]" : "сорок")
|
||||||
|
| ("4[E1]" : "сорока")
|
||||||
|
| ("4[E2]" : "четыремстам")
|
||||||
|
| ("4[E2]" : "четыреста")
|
||||||
|
| ("4[E2]" : "четырехсот")
|
||||||
|
| ("4[E2]" : "четырехстах")
|
||||||
|
| ("4[E2]" : "четырьмястами")
|
||||||
|
| ("5[E1]" : "пятидесяти")
|
||||||
|
| ("5[E1]" : "пятьдесят")
|
||||||
|
| ("5[E1]" : "пятьюдесятью")
|
||||||
|
| ("5[E2]" : "пятисот")
|
||||||
|
| ("5[E2]" : "пятистам")
|
||||||
|
| ("5[E2]" : "пятистах")
|
||||||
|
| ("5[E2]" : "пятьсот")
|
||||||
|
| ("5[E2]" : "пятьюстами")
|
||||||
|
| ("6[E1]" : "шестидесяти")
|
||||||
|
| ("6[E1]" : "шестьдесят")
|
||||||
|
| ("6[E1]" : "шестьюдесятью")
|
||||||
|
| ("6[E2]" : "шестисот")
|
||||||
|
| ("6[E2]" : "шестистам")
|
||||||
|
| ("6[E2]" : "шестистах")
|
||||||
|
| ("6[E2]" : "шестьсот")
|
||||||
|
| ("6[E2]" : "шестьюстами")
|
||||||
|
| ("7[E1]" : "семидесяти")
|
||||||
|
| ("7[E1]" : "семьдесят")
|
||||||
|
| ("7[E1]" : "семьюдесятью")
|
||||||
|
| ("7[E2]" : "семисот")
|
||||||
|
| ("7[E2]" : "семистам")
|
||||||
|
| ("7[E2]" : "семистах")
|
||||||
|
| ("7[E2]" : "семьсот")
|
||||||
|
| ("7[E2]" : "семьюстами")
|
||||||
|
| ("8[E1]" : "восемьдесят")
|
||||||
|
| ("8[E1]" : "восьмидесяти")
|
||||||
|
| ("8[E1]" : "восьмьюдесятью")
|
||||||
|
| ("8[E2]" : "восемьсот")
|
||||||
|
| ("8[E2]" : "восемьюстами")
|
||||||
|
| ("8[E2]" : "восьмисот")
|
||||||
|
| ("8[E2]" : "восьмистам")
|
||||||
|
| ("8[E2]" : "восьмистах")
|
||||||
|
| ("8[E2]" : "восьмьюстами")
|
||||||
|
| ("9[E1]" : "девяноста")
|
||||||
|
| ("9[E1]" : "девяносто")
|
||||||
|
| ("9[E2]" : "девятисот")
|
||||||
|
| ("9[E2]" : "девятистам")
|
||||||
|
| ("9[E2]" : "девятистах")
|
||||||
|
| ("9[E2]" : "девятьсот")
|
||||||
|
| ("9[E2]" : "девятьюстами")]
|
||||||
|
;
|
||||||
|
|
||||||
|
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
lexset1 = Optimize[
|
||||||
|
("+" : "")
|
||||||
|
| ("1" : "один")
|
||||||
|
| ("1" : "одна")
|
||||||
|
| ("1" : "одни")
|
||||||
|
| ("1" : "одним")
|
||||||
|
| ("1" : "одними")
|
||||||
|
| ("1" : "одних")
|
||||||
|
| ("1" : "одно")
|
||||||
|
| ("1" : "одного")
|
||||||
|
| ("1" : "одной")
|
||||||
|
| ("1" : "одном")
|
||||||
|
| ("1" : "одному")
|
||||||
|
| ("1" : "одною")
|
||||||
|
| ("1" : "одну")
|
||||||
|
| ("2" : "два")
|
||||||
|
| ("2" : "две")
|
||||||
|
| ("2" : "двум")
|
||||||
|
| ("2" : "двумя")
|
||||||
|
| ("2" : "двух")
|
||||||
|
| ("3" : "трем")
|
||||||
|
| ("3" : "тремя")
|
||||||
|
| ("3" : "трех")
|
||||||
|
| ("3" : "три")
|
||||||
|
| ("4" : "четыре")
|
||||||
|
| ("4" : "четырем")
|
||||||
|
| ("4" : "четырех")
|
||||||
|
| ("4" : "четырьмя")
|
||||||
|
| ("5" : "пяти")
|
||||||
|
| ("5" : "пять")
|
||||||
|
| ("5" : "пятью")
|
||||||
|
| ("6" : "шести")
|
||||||
|
| ("6" : "шесть")
|
||||||
|
| ("6" : "шестью")
|
||||||
|
| ("7" : "семи")
|
||||||
|
| ("7" : "семь")
|
||||||
|
| ("7" : "семью")
|
||||||
|
| ("8" : "восемь")
|
||||||
|
| ("8" : "восьми")
|
||||||
|
| ("8" : "восьмью")
|
||||||
|
| ("9" : "девяти")
|
||||||
|
| ("9" : "девять")
|
||||||
|
| ("9" : "девятью")
|
||||||
|
| ("[E3]" : "тысяч")
|
||||||
|
| ("[E3]" : "тысяча")
|
||||||
|
| ("[E3]" : "тысячам")
|
||||||
|
| ("[E3]" : "тысячами")
|
||||||
|
| ("[E3]" : "тысячах")
|
||||||
|
| ("[E3]" : "тысяче")
|
||||||
|
| ("[E3]" : "тысячей")
|
||||||
|
| ("[E3]" : "тысячи")
|
||||||
|
| ("[E3]" : "тысячу")
|
||||||
|
| ("[E3]" : "тысячью")
|
||||||
|
| ("[E6]" : "миллион")
|
||||||
|
| ("[E6]" : "миллиона")
|
||||||
|
| ("[E6]" : "миллионам")
|
||||||
|
| ("[E6]" : "миллионами")
|
||||||
|
| ("[E6]" : "миллионах")
|
||||||
|
| ("[E6]" : "миллионе")
|
||||||
|
| ("[E6]" : "миллионов")
|
||||||
|
| ("[E6]" : "миллионом")
|
||||||
|
| ("[E6]" : "миллиону")
|
||||||
|
| ("[E6]" : "миллионы")
|
||||||
|
| ("[E9]" : "миллиард")
|
||||||
|
| ("[E9]" : "миллиарда")
|
||||||
|
| ("[E9]" : "миллиардам")
|
||||||
|
| ("[E9]" : "миллиардами")
|
||||||
|
| ("[E9]" : "миллиардах")
|
||||||
|
| ("[E9]" : "миллиарде")
|
||||||
|
| ("[E9]" : "миллиардов")
|
||||||
|
| ("[E9]" : "миллиардом")
|
||||||
|
| ("[E9]" : "миллиарду")
|
||||||
|
| ("[E9]" : "миллиарды")
|
||||||
|
| ("|0|" : "ноле")
|
||||||
|
| ("|0|" : "нолем")
|
||||||
|
| ("|0|" : "ноль")
|
||||||
|
| ("|0|" : "нолю")
|
||||||
|
| ("|0|" : "ноля")
|
||||||
|
| ("|0|" : "нуле")
|
||||||
|
| ("|0|" : "нулем")
|
||||||
|
| ("|0|" : "нуль")
|
||||||
|
| ("|0|" : "нулю")
|
||||||
|
| ("|0|" : "нуля")]
|
||||||
|
;
|
||||||
|
|
||||||
|
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
||||||
|
|
||||||
|
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
||||||
|
|
||||||
|
# END LANGUAGE SPECIFIC DATA
|
||||||
|
################################################################################
|
||||||
|
# Inserts a marker after the Ms.
|
||||||
|
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes all powers and "+".
|
||||||
|
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes leading zeros at the beginning of a number, so that "0003" does not
|
||||||
|
# get treated as an ordinary number.
|
||||||
|
export DELETE_INITIAL_ZEROS =
|
||||||
|
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
||||||
|
;
|
||||||
|
|
||||||
|
NonMs = Optimize[POWERS - Ms];
|
||||||
|
|
||||||
|
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted.
|
||||||
|
export DELETE_INTERMEDIATE_ZEROS1 =
|
||||||
|
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
||||||
|
# that and the previous boundary. Thus, if after the result of the rule above we
|
||||||
|
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
||||||
|
# zero.
|
||||||
|
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
||||||
|
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Final clean up of stray zeros.
|
||||||
|
export DELETE_REMAINING_ZEROS = Optimize[
|
||||||
|
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Applies the revaluation map. For example in English, changes [E4] to [E1] as a
|
||||||
|
# modifier of [E3].
|
||||||
|
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes the various marks and powers in the input and output.
|
||||||
|
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
export CLEAN_SPACES = Optimize[
|
||||||
|
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
||||||
|
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
d = b.kDigit;
|
||||||
|
|
||||||
|
# Germanic inversion rule.
|
||||||
|
germanic =
|
||||||
|
(I["1+"] d "[E1]" D["+1"])
|
||||||
|
| (I["2+"] d "[E1]" D["+2"])
|
||||||
|
| (I["3+"] d "[E1]" D["+3"])
|
||||||
|
| (I["4+"] d "[E1]" D["+4"])
|
||||||
|
| (I["5+"] d "[E1]" D["+5"])
|
||||||
|
| (I["6+"] d "[E1]" D["+6"])
|
||||||
|
| (I["7+"] d "[E1]" D["+7"])
|
||||||
|
| (I["8+"] d "[E1]" D["+8"])
|
||||||
|
| (I["9+"] d "[E1]" D["+9"])
|
||||||
|
;
|
||||||
|
|
||||||
|
germanic_inversion =
|
||||||
|
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
||||||
|
;
|
||||||
|
|
||||||
|
export GERMANIC_INVERSION = SIGMA_STAR;
|
||||||
|
export ORDINAL_RESTRICTION = SIGMA_STAR;
|
||||||
|
nondigits = b.kBytes - b.kDigit;
|
||||||
|
export ORDINAL_SUFFIX = D[nondigits*];
|
|
@ -0,0 +1,35 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# A single digit read through the cardinal-number grammar; "0" may instead be
# rewritten to the @@OTHER_ZERO_VERBALIZATIONS@@ tag.
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
|
||||||
|
|
||||||
|
export DIGITS = digit (n.I[" "] digit)*;
|
||||||
|
|
||||||
|
# Various common factorizations
|
||||||
|
|
||||||
|
two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
mixed =
|
||||||
|
(digit n.I[" "] two_digits)
|
||||||
|
| (two_digits n.I[" "] two_digits)
|
||||||
|
| (two_digits n.I[" "] three_digits)
|
||||||
|
| (two_digits n.I[" "] two_digits n.I[" "] two_digits)
|
||||||
|
;
|
||||||
|
|
||||||
|
export MIXED_NUMBERS = Optimize[mixed];
|
@ -0,0 +1,40 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'util/util.grm' as u;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Composes `expr` with a sequence of one or more cardinal-number readings
# separated by single spaces, and returns the optimized result.
func ToNumberName[expr] {
|
||||||
|
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
|
||||||
|
return Optimize[expr @ number_name_seq];
|
||||||
|
}
|
||||||
|
|
||||||
|
# Shorthand for a single digit.
d = b.kDigit;
|
||||||
|
|
||||||
|
# Inserts a space right after a "0" that stands at the beginning of the string
# or directly after a space, separating a leading zero from what follows.
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
|
||||||
|
|
||||||
|
# One digit followed by an inserted space.
by_ones = d n.I[" "];
|
||||||
|
# Two digits (leading zero split off) followed by an inserted space.
by_twos = (d{2} @ leading_zero) n.I[" "];
|
||||||
|
# Three digits (leading zero split off) followed by an inserted space.
by_threes = (d{3} @ leading_zero) n.I[" "];
|
||||||
|
|
||||||
|
# Any number of two-digit groups, closed by one final group of three, two or
# one digit(s).
groupings = by_twos* (by_threes | by_twos | by_ones);
|
||||||
|
|
||||||
|
# Digit-by-digit reading: one or more single-digit groups, passed through
# u.CLEAN_SPACES and then read as number names.
# NOTE(review): u.CLEAN_SPACES is defined in util/util.grm; presumably it
# normalizes the inserted spaces — confirm.
export FRACTIONAL_PART_UNGROUPED =
|
||||||
|
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
|
||||||
|
;
|
||||||
|
# Grouped reading: same pipeline, using the `groupings` chunking.
export FRACTIONAL_PART_GROUPED =
|
||||||
|
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
|
||||||
|
;
|
||||||
|
# Unparsed reading: the raw digit string (possibly empty) is handed to
# ToNumberName without any inserted spaces.
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
|
@ -0,0 +1,30 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'ru/verbalizer/factorization.grm' as f;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# The three available readings of the fractional part, imported from
# factorization.grm.
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
|
||||||
|
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
|
||||||
|
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
|
||||||
|
|
||||||
|
# Selected reading. Only the unparsed variant is used; the ungrouped and
# grouped aliases above are not referenced elsewhere in this file.
__fractional_part__ = fractional_part_unparsed;
|
||||||
|
# The decimal marker recognized in the input is a comma.
__decimal_marker__ = ",";
|
||||||
|
|
||||||
|
# Decimal number: cardinal integer part, the decimal marker rewritten to the
# " @@DECIMAL_DOT_EXPRESSION@@ " placeholder, then the fractional part. The
# whole sequence is composed with l.LEXICAL_MAP and optimized.
export FLOAT = Optimize[
|
||||||
|
(n.CARDINAL_NUMBERS
|
||||||
|
(__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
|
||||||
|
__fractional_part__) @ l.LEXICAL_MAP]
|
||||||
|
;
|
Binary file not shown.
@ -0,0 +1,25 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
|
||||||
|
# String-pair mapping loaded from the TSV file.
# NOTE(review): presumably maps @@...@@ placeholders to Russian words —
# confirm against lexical_map.tsv.
lexical_map = StringFile['ru/verbalizer/lexical_map.tsv'];
|
||||||
|
|
||||||
|
# Any byte string.
sigma_star = b.kBytes*;
|
||||||
|
|
||||||
|
# Deletes every occurrence of the literal "__NULL__".
del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
|
||||||
|
|
||||||
|
# Applies the lexical map everywhere in the string, then removes "__NULL__".
export LEXICAL_MAP = Optimize[
|
||||||
|
CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
|
||||||
|
;
|
Can't render this file because it has a wrong number of fields in line 176.
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'ru/verbalizer/float.grm' as f;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Operands: a number is either a cardinal or a decimal (float).
float = f.FLOAT;
|
||||||
|
card = n.CARDINAL_NUMBERS;
|
||||||
|
number = card | float;
|
||||||
|
|
||||||
|
# Binary operators, each rewritten to a placeholder padded with a space on
# both sides.
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
|
||||||
|
times = "*" : " @@ARITHMETIC_TIMES@@ ";
|
||||||
|
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
|
||||||
|
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
|
||||||
|
|
||||||
|
operator = plus | times | minus | division;
|
||||||
|
|
||||||
|
# Percent sign; it is the final token, so only a leading space is inserted.
percent = "%" : " @@PERCENT@@";
|
||||||
|
|
||||||
|
# Either "<number> <operator> <number>" or "<number> percent", composed with
# the lexical map.
export ARITHMETIC =
|
||||||
|
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
|
||||||
|
;
|
@ -0,0 +1,78 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/classifier/cyrillic.grm' as c;
|
||||||
|
import 'ru/verbalizer/extra_numbers.grm' as e;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
import 'ru/verbalizer/spelled.grm' as s;
|
||||||
|
|
||||||
|
# A letter from either b.kAlpha or c.kCyrillicAlpha.
letter = b.kAlpha | c.kCyrillicAlpha;
|
||||||
|
dash = "-";
|
||||||
|
# One or more letters.
word = letter+;
|
||||||
|
# A word optionally continued by further words joined with "-" or "." (each
# joiner becomes a space), with an optional trailing "." deleted.
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
|
||||||
|
|
||||||
|
# Symbols that may follow a word. "+" and "-" each have two alternative
# placeholder readings.
post_word_symbol =
|
||||||
|
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
|
||||||
|
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
|
||||||
|
("*" : "@@STAR@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
# Symbols that may precede a word.
pre_word_symbol =
|
||||||
|
("@" : "@@AT@@") |
|
||||||
|
("/" : "@@SLASH@@") |
|
||||||
|
("#" : "@@HASH@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
# Word followed by a symbol, with a space inserted between them.
post_word = possibly_split_word n.I[" "] post_word_symbol;
|
||||||
|
|
||||||
|
# Symbol followed by a word, with a space inserted between them.
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
|
||||||
|
|
||||||
|
## Number/digit sequence combos, maybe with a dash
|
||||||
|
|
||||||
|
# A word passed through s.SPELLED_NO_LETTER.
# NOTE(review): presumably the letter-by-letter spelling from spelled.grm —
# confirm.
spelled_word = word @ s.SPELLED_NO_LETTER;
|
||||||
|
|
||||||
|
# Word (plain or spelled) followed by a number. The two parts are separated
# by an inserted space or by a dash rewritten to a space. The number may be
# read digit by digit, as a cardinal, or in mixed chunks.
word_number =
|
||||||
|
(word | spelled_word)
|
||||||
|
(n.I[" "] | (dash : " "))
|
||||||
|
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||||
|
;
|
||||||
|
|
||||||
|
# Mirror image of word_number: the number comes first, then the word.
number_word =
|
||||||
|
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||||
|
(n.I[" "] | (dash : " "))
|
||||||
|
(word | spelled_word)
|
||||||
|
;
|
||||||
|
|
||||||
|
## Two-digit year.
|
||||||
|
|
||||||
|
# Note that in this case to be fair we really have to allow ordinals too since
|
||||||
|
# in some languages that's what you would have.
|
||||||
|
|
||||||
|
# Apostrophe (deleted) followed by exactly two digits, read either as a
# cardinal or digit by digit.
# NOTE(review): the comment preceding this rule mentions ordinals, but no
# ordinal reading is included here.
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
|
||||||
|
|
||||||
|
# ".com": the dot becomes a placeholder, a space is inserted, "com" is kept.
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
|
||||||
|
|
||||||
|
# Union of all miscellaneous token shapes defined in this file.
miscellaneous = Optimize[
|
||||||
|
possibly_split_word
|
||||||
|
| post_word
|
||||||
|
| pre_word
|
||||||
|
| word_number
|
||||||
|
| number_word
|
||||||
|
| two_digit_year
|
||||||
|
| dot_com
|
||||||
|
];
|
||||||
|
|
||||||
|
# Final export: the union above composed with the lexical map.
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
|
@ -0,0 +1,44 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Cardinal reading used for both the major and the minor amount.
card = n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
# Currency table loaded from the TSV file; queried below by key.
__currency__ = StringFile['ru/verbalizer/money.tsv'];
|
||||||
|
|
||||||
|
# d: any digit; D: any non-zero digit.
d = b.kDigit;
|
||||||
|
D = d - "0";
|
||||||
|
|
||||||
|
# Two-digit minor amount read as a cardinal. A leading "0" is deleted first,
# so the input "05" reaches `card` as "5".
cents = ((n.D["0"] | D) d) @ card;
|
||||||
|
|
||||||
|
# Only dollar for the verbalizer tests for English. Will need to add other
|
||||||
|
# currencies.
|
||||||
|
# Output side of the currency-table entries keyed "usd_maj" and "usd_min".
# NOTE(review): presumably the major/minor unit names — confirm in money.tsv.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
|
||||||
|
usd_min = Project["usd_min" @ __currency__, 'output'];
|
||||||
|
# Connector between major and minor amounts: either the @@MONEY_AND@@
# placeholder padded with spaces, or a single space.
and = " @@MONEY_AND@@ " | " ";
|
||||||
|
|
||||||
|
# "$<major>.<minor>": deletes "$" and ".", reads both amounts, and inserts
# the unit names plus the connector between the two parts.
dollar1 =
|
||||||
|
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
|
||||||
|
;
|
||||||
|
|
||||||
|
# "$<major>.00": the ".00" is deleted, so only the major amount is read.
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
|
||||||
|
|
||||||
|
# "$<major>" with no fractional part.
dollar3 = n.D["$"] card n.I[" " usd_maj];
|
||||||
|
|
||||||
|
dollar = Optimize[dollar1 | dollar2 | dollar3];
|
||||||
|
|
||||||
|
# Final export: the dollar readings composed with the lexical map.
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
|
|
|
@ -0,0 +1,48 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Russian minimally supervised number grammar.
|
||||||
|
#
|
||||||
|
# Supports cardinals and ordinals in all inflected forms.
|
||||||
|
#
|
||||||
|
# The language-specific acceptor G was compiled with digit, teen, decade,
|
||||||
|
# century, and big power-of-ten preterminals. The lexicon transducer is
|
||||||
|
# highly ambiguous, but no LM is used.
|
||||||
|
|
||||||
|
import 'util/arithmetic.grm' as a;
|
||||||
|
|
||||||
|
# Intersects the universal factorization transducer (F) with language-specific
|
||||||
|
# acceptor (G).
|
||||||
|
|
||||||
|
# d and f come from util/arithmetic.grm; g is the precompiled
# language-specific acceptor loaded from disk.
d = a.DELTA_STAR;
|
||||||
|
f = a.IARITHMETIC_RESTRICTED;
|
||||||
|
g = LoadFst['ru/verbalizer/g.fst'];
|
||||||
|
# f is composed onto g three times, then d is applied in front; every
# intermediate result is optimized.
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
|
||||||
|
# Compile-time check: "230" must factorize to "(+ 200 30 +)".
test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];
|
||||||
|
|
||||||
|
# Compiles lexicon transducers (L).
|
||||||
|
|
||||||
|
# Cardinal lexicon entries, and a sequence of one or more of them separated
# by single spaces.
cardinal_name = StringFile['ru/verbalizer/cardinals.tsv'];
|
||||||
|
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
|
||||||
|
|
||||||
|
# Ordinal lexicon entries. In an ordinal sequence only the last word is an
# ordinal; every preceding word is a cardinal.
ordinal_name = StringFile['ru/verbalizer/ordinals.tsv'];
|
||||||
|
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
|
||||||
|
|
||||||
|
# Composes L with the leaf transducer (P), then composes that with FG.
|
||||||
|
|
||||||
|
# Leaf transducer (P) from util/arithmetic.grm.
p = a.LEAVES;
|
||||||
|
|
||||||
|
# Digit string -> cardinal number name: FG composed with (P composed with L).
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
|
||||||
|
|
||||||
|
# Digit string -> ordinal number name, built the same way from ordinal_l.
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
|
@ -0,0 +1,68 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'ru/verbalizer/number_names.grm' as n;
|
||||||
|
import 'universal/thousands_punct.grm' as t;
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
|
||||||
|
# String pairs loaded from the nominatives TSV file.
nominatives = StringFile['ru/verbalizer/nominatives.tsv'];
|
||||||
|
|
||||||
|
# Any byte string.
sigma_star = b.kBytes*;
|
||||||
|
|
||||||
|
# Applies `nominatives` to whole words only (bounded by start-of-string or a
# space on the left, and a space or end-of-string on the right), attaching a
# weight of -1 to each application.
# NOTE(review): no visible statement in this file references this rule; see
# the comment above ORDINAL_NUMBERS_MARKED.
nominative_filter =
|
||||||
|
CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Local aliases for the number-name transducers exported by number_names.grm.
cardinal = n.CARDINAL_NUMBER_NAME;
|
||||||
|
ordinal = n.ORDINAL_NUMBER_NAME;
|
||||||
|
|
||||||
|
# Putting these here since this grammar gets incorporated by all the others.
|
||||||
|
|
||||||
|
# Insertion: returns a transducer that maps the empty string to `expr`.
func I[expr] {
|
||||||
|
return "" : expr;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Deletion: returns a transducer that maps `expr` to the empty string.
func D[expr] {
|
||||||
|
return expr : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
# Since we know this is the default for Russian, it's fair game to set it.
|
||||||
|
# Accepted digit-string formats: dot as thousands separator, or no delimiter
# (both defined in universal/thousands_punct.grm).
separators = t.dot_thousands | t.no_delimiter;
|
||||||
|
|
||||||
|
# Digit string in either accepted format, read as a cardinal.
export CARDINAL_NUMBERS = Optimize[
|
||||||
|
separators
|
||||||
|
@ cardinal
|
||||||
|
];
|
||||||
|
|
||||||
|
# Digit string with no explicit ending, read as an ordinal.
export ORDINAL_NUMBERS_UNMARKED = Optimize[
|
||||||
|
separators
|
||||||
|
@ ordinal
|
||||||
|
];
|
||||||
|
|
||||||
|
|
||||||
|
# String pairs loaded from the ordinal-endings TSV file.
endings = StringFile['ru/verbalizer/ordinal_endings.tsv'];
|
||||||
|
|
||||||
|
# One or more bytes, none of which is a dash.
not_dash = (b.kBytes - "-")+;
|
||||||
|
# Deletes a "-<suffix>" that ends the string.
del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star];
|
||||||
|
|
||||||
|
# Needs nominative_filter here if we take out Kyle's models.
|
||||||
|
# Ordinals written as "<number>-<ending>". Three composed stages:
#   1. read the number as an ordinal, passing "-" and the suffix through;
#   2. apply `endings` at the end of the string;
#   3. delete the trailing "-<suffix>".
# NOTE(review): stage 2 presumably keeps only readings whose inflection
# matches the written ending — confirm against ordinal_endings.tsv.
export ORDINAL_NUMBERS_MARKED = Optimize[
|
||||||
|
Optimize[Optimize[separators @ ordinal] "-" not_dash]
|
||||||
|
@ Optimize[sigma_star endings]
|
||||||
|
@ del_ending]
|
||||||
|
;
|
||||||
|
|
||||||
|
# All ordinals: with or without an explicit ending.
export ORDINAL_NUMBERS =
|
||||||
|
Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
|
||||||
|
;
|
@ -0,0 +1,133 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Grammar for things built mostly on numbers.
|
||||||
|
|
||||||
|
import 'ru/verbalizer/factorization.grm' as f;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Short aliases: cardinal reading, unmarked ordinal reading, and
# digit-by-digit reading.
num = n.CARDINAL_NUMBERS;
|
||||||
|
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
||||||
|
digits = f.FRACTIONAL_PART_UNGROUPED;
|
||||||
|
|
||||||
|
# Various symbols.
|
||||||
|
|
||||||
|
# Each rule rewrites one input symbol to a placeholder token. Some symbols
# have more than one rule: "-" is both `minus` and `dash`, and "/" is both
# `slash` and `division`.
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
||||||
|
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
||||||
|
slash = "/" : "@@SLASH@@";
|
||||||
|
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||||
|
dash = "-" : "@@DASH@@";
|
||||||
|
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
||||||
|
|
||||||
|
degree = "°" : "@@DEGREE@@";
|
||||||
|
|
||||||
|
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
||||||
|
|
||||||
|
# Both the letter "x" and "*" are read as multiplication.
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
||||||
|
|
||||||
|
power = "^" : "@@DECIMAL_EXPONENT@@";
|
||||||
|
|
||||||
|
square_root = "√" : "@@SQUARE_ROOT@@";
|
||||||
|
|
||||||
|
percent = "%" : "@@PERCENT@@";
|
||||||
|
|
||||||
|
# Safe roman numbers.
|
||||||
|
|
||||||
|
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
||||||
|
# line as the path.
|
||||||
|
# Path of the Roman-numeral table, kept in its own variable.
rfile =
|
||||||
|
'universal/roman_numerals.tsv' # NO_EDIT
|
||||||
|
;
|
||||||
|
|
||||||
|
# String pairs loaded from that table.
roman = StringFile[rfile];
|
||||||
|
|
||||||
|
## Main categories.
|
||||||
|
|
||||||
|
# Numbers joined by dots, with a space inserted on each side of every dot.
# The trailing group uses "+", so at least three numbers are required.
# NOTE(review): the slash and dash variants below use "*" and so accept two
# numbers; presumably intentional so that two-part dotted strings are left
# to other rules — confirm.
cat_dot_number =
|
||||||
|
num
|
||||||
|
n.I[" "] dot n.I[" "] num
|
||||||
|
(n.I[" "] dot n.I[" "] num)+
|
||||||
|
;
|
||||||
|
|
||||||
|
# Two or more numbers joined by slashes.
cat_slash_number =
|
||||||
|
num
|
||||||
|
n.I[" "] slash n.I[" "] num
|
||||||
|
(n.I[" "] slash n.I[" "] num)*
|
||||||
|
;
|
||||||
|
|
||||||
|
# Two or more numbers joined by dashes.
cat_dash_number =
|
||||||
|
num
|
||||||
|
n.I[" "] dash n.I[" "] num
|
||||||
|
(n.I[" "] dash n.I[" "] num)*
|
||||||
|
;
|
||||||
|
|
||||||
|
# A number with an optional leading "+" or "-"; a space is inserted after
# the sign.
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
||||||
|
|
||||||
|
# An optionally signed number followed by the degree sign.
cat_degree = cat_signed_number n.I[" "] degree;
|
||||||
|
|
||||||
|
# "+" followed by a number read either as a cardinal or digit by digit.
cat_country_code = plus n.I[" "] (num | digits);
|
||||||
|
|
||||||
|
# Any single mathematical symbol on its own.
cat_math_operations =
|
||||||
|
plus
|
||||||
|
| minus
|
||||||
|
| division
|
||||||
|
| times
|
||||||
|
| equals
|
||||||
|
| percent
|
||||||
|
| power
|
||||||
|
| square_root
|
||||||
|
;
|
||||||
|
|
||||||
|
# Roman numbers are often either cardinals or ordinals in various languages.
|
||||||
|
# The output of the Roman-numeral table is read as a cardinal or an ordinal.
# NOTE(review): assumes the table maps Roman numerals to digit strings.
cat_roman = roman @ (num | ord);
|
||||||
|
|
||||||
|
# Allow
|
||||||
|
#
|
||||||
|
# number:number
|
||||||
|
# number-number
|
||||||
|
#
|
||||||
|
# to just be
|
||||||
|
#
|
||||||
|
# number number.
|
||||||
|
|
||||||
|
# Two numbers separated by ":" or "-"; the separator is replaced by a space.
cat_number_number =
|
||||||
|
num ((":" | "-") : " ") num
|
||||||
|
;
|
||||||
|
|
||||||
|
# Some additional readings for these symbols.
|
||||||
|
|
||||||
|
# Alternative placeholder readings for stand-alone symbols. "-" has two
# possible readings, and "x" may also be passed through unchanged.
cat_additional_readings =
|
||||||
|
("/" : "@@PER@@") |
|
||||||
|
("+" : "@@AND@@") |
|
||||||
|
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
||||||
|
("*" : "@@STAR@@") |
|
||||||
|
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
||||||
|
("@" : "@@AT@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
# Union of every category defined in this file.
numbers_plus = Optimize[
|
||||||
|
cat_dot_number
|
||||||
|
| cat_slash_number
|
||||||
|
| cat_dash_number
|
||||||
|
| cat_signed_number
|
||||||
|
| cat_degree
|
||||||
|
| cat_country_code
|
||||||
|
| cat_math_operations
|
||||||
|
| cat_roman
|
||||||
|
| cat_number_number
|
||||||
|
| cat_additional_readings
|
||||||
|
];
|
||||||
|
|
||||||
|
# Final export: the union above composed with the lexical map.
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
@ -0,0 +1,804 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
|
||||||
|
# Utilities for insertion and deletion.
|
||||||
|
|
||||||
|
# Insertion: returns a transducer that maps the empty string to `expr`.
func I[expr] {
|
||||||
|
return "" : expr;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Deletion: returns a transducer that maps `expr` to the empty string.
func D[expr] {
|
||||||
|
return expr : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
# Powers of base 10.
|
||||||
|
# Union of the bracketed power-of-ten markers "[E1]" through "[E15]".
export POWERS =
|
||||||
|
"[E15]"
|
||||||
|
| "[E14]"
|
||||||
|
| "[E13]"
|
||||||
|
| "[E12]"
|
||||||
|
| "[E11]"
|
||||||
|
| "[E10]"
|
||||||
|
| "[E9]"
|
||||||
|
| "[E8]"
|
||||||
|
| "[E7]"
|
||||||
|
| "[E6]"
|
||||||
|
| "[E5]"
|
||||||
|
| "[E4]"
|
||||||
|
| "[E3]"
|
||||||
|
| "[E2]"
|
||||||
|
| "[E1]"
|
||||||
|
;
|
||||||
|
|
||||||
|
# Alphabet of this grammar: any byte, or one of the power-of-ten markers.
export SIGMA = b.kBytes | POWERS;
|
||||||
|
|
||||||
|
# Zero or more alphabet symbols.
export SIGMA_STAR = SIGMA*;
|
||||||
|
|
||||||
|
# One or more alphabet symbols.
export SIGMA_PLUS = SIGMA+;
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# BEGIN LANGUAGE SPECIFIC DATA
|
||||||
|
# Rewrites of power markers to lower ones: E4->E1, E5->E2, E7->E1, E8->E2.
# NOTE(review): presumably expresses tens/hundreds of thousands and of
# millions in terms of the tens/hundreds markers — confirm.
revaluations =
|
||||||
|
("[E4]" : "[E1]")
|
||||||
|
| ("[E5]" : "[E2]")
|
||||||
|
| ("[E7]" : "[E1]")
|
||||||
|
| ("[E8]" : "[E2]")
|
||||||
|
;
|
||||||
|
|
||||||
|
# The markers for 10^3, 10^6 and 10^9.
Ms = "[E3]" | "[E6]" | "[E9]";
|
||||||
|
|
||||||
|
|
||||||
|
# Returns a transducer that maps `expr` to the empty string (same mapping as
# D above).
func Zero[expr] {
|
||||||
|
return expr : ("");
|
||||||
|
}
|
||||||
|
|
||||||
|
# A single space character, inserted after lexicon entries below.
space = " ";
|
||||||
|
|
||||||
|
lexset3 = Optimize[
|
||||||
|
("1[E1]+1" : "одиннадцатая@")
|
||||||
|
| ("1[E1]+1" : "одиннадцати")
|
||||||
|
| ("1[E1]+1" : "одиннадцатого@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатое@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатой@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатом@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатому@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатую@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатые@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатый@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатым@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатыми@")
|
||||||
|
| ("1[E1]+1" : "одиннадцатых@")
|
||||||
|
| ("1[E1]+1" : "одиннадцать")
|
||||||
|
| ("1[E1]+1" : "одиннадцатью")
|
||||||
|
| ("1[E1]+2" : "двенадцатая@")
|
||||||
|
| ("1[E1]+2" : "двенадцати")
|
||||||
|
| ("1[E1]+2" : "двенадцатого@")
|
||||||
|
| ("1[E1]+2" : "двенадцатое@")
|
||||||
|
| ("1[E1]+2" : "двенадцатой@")
|
||||||
|
| ("1[E1]+2" : "двенадцатом@")
|
||||||
|
| ("1[E1]+2" : "двенадцатому@")
|
||||||
|
| ("1[E1]+2" : "двенадцатую@")
|
||||||
|
| ("1[E1]+2" : "двенадцатые@")
|
||||||
|
| ("1[E1]+2" : "двенадцатый@")
|
||||||
|
| ("1[E1]+2" : "двенадцатым@")
|
||||||
|
| ("1[E1]+2" : "двенадцатыми@")
|
||||||
|
| ("1[E1]+2" : "двенадцатых@")
|
||||||
|
| ("1[E1]+2" : "двенадцать")
|
||||||
|
| ("1[E1]+2" : "двенадцатью")
|
||||||
|
| ("1[E1]+3" : "тринадцатая@")
|
||||||
|
| ("1[E1]+3" : "тринадцати")
|
||||||
|
| ("1[E1]+3" : "тринадцатого@")
|
||||||
|
| ("1[E1]+3" : "тринадцатое@")
|
||||||
|
| ("1[E1]+3" : "тринадцатой@")
|
||||||
|
| ("1[E1]+3" : "тринадцатом@")
|
||||||
|
| ("1[E1]+3" : "тринадцатому@")
|
||||||
|
| ("1[E1]+3" : "тринадцатую@")
|
||||||
|
| ("1[E1]+3" : "тринадцатые@")
|
||||||
|
| ("1[E1]+3" : "тринадцатый@")
|
||||||
|
| ("1[E1]+3" : "тринадцатым@")
|
||||||
|
| ("1[E1]+3" : "тринадцатыми@")
|
||||||
|
| ("1[E1]+3" : "тринадцатых@")
|
||||||
|
| ("1[E1]+3" : "тринадцать")
|
||||||
|
| ("1[E1]+3" : "тринадцатью")
|
||||||
|
| ("1[E1]+4" : "четырнадцатая@")
|
||||||
|
| ("1[E1]+4" : "четырнадцати")
|
||||||
|
| ("1[E1]+4" : "четырнадцатого@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатое@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатой@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатом@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатому@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатую@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатые@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатый@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатым@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатыми@")
|
||||||
|
| ("1[E1]+4" : "четырнадцатых@")
|
||||||
|
| ("1[E1]+4" : "четырнадцать")
|
||||||
|
| ("1[E1]+4" : "четырнадцатью")
|
||||||
|
| ("1[E1]+5" : "пятнадцатая@")
|
||||||
|
| ("1[E1]+5" : "пятнадцати")
|
||||||
|
| ("1[E1]+5" : "пятнадцатого@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатое@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатой@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатом@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатому@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатую@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатые@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатый@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатым@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатыми@")
|
||||||
|
| ("1[E1]+5" : "пятнадцатых@")
|
||||||
|
| ("1[E1]+5" : "пятнадцать")
|
||||||
|
| ("1[E1]+5" : "пятнадцатью")
|
||||||
|
| ("1[E1]+6" : "шестнадцатая@")
|
||||||
|
| ("1[E1]+6" : "шестнадцати")
|
||||||
|
| ("1[E1]+6" : "шестнадцатого@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатое@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатой@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатом@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатому@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатую@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатые@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатый@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатым@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатыми@")
|
||||||
|
| ("1[E1]+6" : "шестнадцатых@")
|
||||||
|
| ("1[E1]+6" : "шестнадцать")
|
||||||
|
| ("1[E1]+6" : "шестнадцатью")
|
||||||
|
| ("1[E1]+7" : "семнадцатая@")
|
||||||
|
| ("1[E1]+7" : "семнадцати")
|
||||||
|
| ("1[E1]+7" : "семнадцатого@")
|
||||||
|
| ("1[E1]+7" : "семнадцатое@")
|
||||||
|
| ("1[E1]+7" : "семнадцатой@")
|
||||||
|
| ("1[E1]+7" : "семнадцатом@")
|
||||||
|
| ("1[E1]+7" : "семнадцатому@")
|
||||||
|
| ("1[E1]+7" : "семнадцатую@")
|
||||||
|
| ("1[E1]+7" : "семнадцатые@")
|
||||||
|
| ("1[E1]+7" : "семнадцатый@")
|
||||||
|
| ("1[E1]+7" : "семнадцатым@")
|
||||||
|
| ("1[E1]+7" : "семнадцатыми@")
|
||||||
|
| ("1[E1]+7" : "семнадцатых@")
|
||||||
|
| ("1[E1]+7" : "семнадцать")
|
||||||
|
| ("1[E1]+7" : "семнадцатью")
|
||||||
|
| ("1[E1]+8" : "восемнадцатая@")
|
||||||
|
| ("1[E1]+8" : "восемнадцати")
|
||||||
|
| ("1[E1]+8" : "восемнадцатого@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатое@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатой@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатом@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатому@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатую@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатые@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатый@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатым@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатыми@")
|
||||||
|
| ("1[E1]+8" : "восемнадцатых@")
|
||||||
|
| ("1[E1]+8" : "восемнадцать")
|
||||||
|
| ("1[E1]+8" : "восемнадцатью")
|
||||||
|
| ("1[E1]+9" : "девятнадцатая@")
|
||||||
|
| ("1[E1]+9" : "девятнадцати")
|
||||||
|
| ("1[E1]+9" : "девятнадцатого@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатое@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатой@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатом@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатому@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатую@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатые@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатый@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатым@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатыми@")
|
||||||
|
| ("1[E1]+9" : "девятнадцатых@")
|
||||||
|
| ("1[E1]+9" : "девятнадцать")
|
||||||
|
| ("1[E1]+9" : "девятнадцатью")]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Applies the lexset3 entries anywhere in the string (empty contexts),
# inserting a space after each rewritten entry.
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
lexset2 = Optimize[
|
||||||
|
("1[E1]" : "десятая@")
|
||||||
|
| ("1[E1]" : "десяти")
|
||||||
|
| ("1[E1]" : "десятого@")
|
||||||
|
| ("1[E1]" : "десятое@")
|
||||||
|
| ("1[E1]" : "десятой@")
|
||||||
|
| ("1[E1]" : "десятом@")
|
||||||
|
| ("1[E1]" : "десятому@")
|
||||||
|
| ("1[E1]" : "десятую@")
|
||||||
|
| ("1[E1]" : "десятые@")
|
||||||
|
| ("1[E1]" : "десятый@")
|
||||||
|
| ("1[E1]" : "десятым@")
|
||||||
|
| ("1[E1]" : "десятыми@")
|
||||||
|
| ("1[E1]" : "десятых@")
|
||||||
|
| ("1[E1]" : "десять")
|
||||||
|
| ("1[E1]" : "десятью")
|
||||||
|
| ("1[E2]" : "сотая@")
|
||||||
|
| ("1[E2]" : "сотого@")
|
||||||
|
| ("1[E2]" : "сотое@")
|
||||||
|
| ("1[E2]" : "сотой@")
|
||||||
|
| ("1[E2]" : "сотом@")
|
||||||
|
| ("1[E2]" : "сотому@")
|
||||||
|
| ("1[E2]" : "сотую@")
|
||||||
|
| ("1[E2]" : "сотые@")
|
||||||
|
| ("1[E2]" : "сотый@")
|
||||||
|
| ("1[E2]" : "сотым@")
|
||||||
|
| ("1[E2]" : "сотыми@")
|
||||||
|
| ("1[E2]" : "сотых@")
|
||||||
|
| ("1[E2]" : "ста")
|
||||||
|
| ("1[E2]" : "сто")
|
||||||
|
| ("1[E3]" : "тысячная@")
|
||||||
|
| ("1[E3]" : "тысячного@")
|
||||||
|
| ("1[E3]" : "тысячное@")
|
||||||
|
| ("1[E3]" : "тысячной@")
|
||||||
|
| ("1[E3]" : "тысячном@")
|
||||||
|
| ("1[E3]" : "тысячному@")
|
||||||
|
| ("1[E3]" : "тысячную@")
|
||||||
|
| ("1[E3]" : "тысячные@")
|
||||||
|
| ("1[E3]" : "тысячный@")
|
||||||
|
| ("1[E3]" : "тысячным@")
|
||||||
|
| ("1[E3]" : "тысячными@")
|
||||||
|
| ("1[E3]" : "тысячных@")
|
||||||
|
| ("1[E6]" : "миллионная@")
|
||||||
|
| ("1[E6]" : "миллионного@")
|
||||||
|
| ("1[E6]" : "миллионное@")
|
||||||
|
| ("1[E6]" : "миллионной@")
|
||||||
|
| ("1[E6]" : "миллионном@")
|
||||||
|
| ("1[E6]" : "миллионному@")
|
||||||
|
| ("1[E6]" : "миллионную@")
|
||||||
|
| ("1[E6]" : "миллионные@")
|
||||||
|
| ("1[E6]" : "миллионный@")
|
||||||
|
| ("1[E6]" : "миллионным@")
|
||||||
|
| ("1[E6]" : "миллионными@")
|
||||||
|
| ("1[E6]" : "миллионных@")
|
||||||
|
| ("1[E9]" : "миллиардная@")
|
||||||
|
| ("1[E9]" : "миллиардного@")
|
||||||
|
| ("1[E9]" : "миллиардное@")
|
||||||
|
| ("1[E9]" : "миллиардной@")
|
||||||
|
| ("1[E9]" : "миллиардном@")
|
||||||
|
| ("1[E9]" : "миллиардному@")
|
||||||
|
| ("1[E9]" : "миллиардную@")
|
||||||
|
| ("1[E9]" : "миллиардные@")
|
||||||
|
| ("1[E9]" : "миллиардный@")
|
||||||
|
| ("1[E9]" : "миллиардным@")
|
||||||
|
| ("1[E9]" : "миллиардными@")
|
||||||
|
| ("1[E9]" : "миллиардных@")
|
||||||
|
| ("2[E1]" : "двадцатая@")
|
||||||
|
| ("2[E1]" : "двадцати")
|
||||||
|
| ("2[E1]" : "двадцатого@")
|
||||||
|
| ("2[E1]" : "двадцатое@")
|
||||||
|
| ("2[E1]" : "двадцатой@")
|
||||||
|
| ("2[E1]" : "двадцатом@")
|
||||||
|
| ("2[E1]" : "двадцатому@")
|
||||||
|
| ("2[E1]" : "двадцатую@")
|
||||||
|
| ("2[E1]" : "двадцатые@")
|
||||||
|
| ("2[E1]" : "двадцатый@")
|
||||||
|
| ("2[E1]" : "двадцатым@")
|
||||||
|
| ("2[E1]" : "двадцатыми@")
|
||||||
|
| ("2[E1]" : "двадцатых@")
|
||||||
|
| ("2[E1]" : "двадцать")
|
||||||
|
| ("2[E1]" : "двадцатью")
|
||||||
|
| ("2[E2]" : "двести")
|
||||||
|
| ("2[E2]" : "двумстам")
|
||||||
|
| ("2[E2]" : "двумястами")
|
||||||
|
| ("2[E2]" : "двухсот")
|
||||||
|
| ("2[E2]" : "двухсотая@")
|
||||||
|
| ("2[E2]" : "двухсотого@")
|
||||||
|
| ("2[E2]" : "двухсотое@")
|
||||||
|
| ("2[E2]" : "двухсотой@")
|
||||||
|
| ("2[E2]" : "двухсотом@")
|
||||||
|
| ("2[E2]" : "двухсотому@")
|
||||||
|
| ("2[E2]" : "двухсотую@")
|
||||||
|
| ("2[E2]" : "двухсотые@")
|
||||||
|
| ("2[E2]" : "двухсотый@")
|
||||||
|
| ("2[E2]" : "двухсотым@")
|
||||||
|
| ("2[E2]" : "двухсотыми@")
|
||||||
|
| ("2[E2]" : "двухсотых@")
|
||||||
|
| ("2[E2]" : "двухстах")
|
||||||
|
| ("3[E1]" : "тридцатая@")
|
||||||
|
| ("3[E1]" : "тридцати")
|
||||||
|
| ("3[E1]" : "тридцатого@")
|
||||||
|
| ("3[E1]" : "тридцатое@")
|
||||||
|
| ("3[E1]" : "тридцатой@")
|
||||||
|
| ("3[E1]" : "тридцатом@")
|
||||||
|
| ("3[E1]" : "тридцатому@")
|
||||||
|
| ("3[E1]" : "тридцатую@")
|
||||||
|
| ("3[E1]" : "тридцатые@")
|
||||||
|
| ("3[E1]" : "тридцатый@")
|
||||||
|
| ("3[E1]" : "тридцатым@")
|
||||||
|
| ("3[E1]" : "тридцатыми@")
|
||||||
|
| ("3[E1]" : "тридцатых@")
|
||||||
|
| ("3[E1]" : "тридцать")
|
||||||
|
| ("3[E1]" : "тридцатью")
|
||||||
|
| ("3[E2]" : "тремстам")
|
||||||
|
| ("3[E2]" : "тремястами")
|
||||||
|
| ("3[E2]" : "трехсот")
|
||||||
|
| ("3[E2]" : "трехсотая@")
|
||||||
|
| ("3[E2]" : "трехсотого@")
|
||||||
|
| ("3[E2]" : "трехсотое@")
|
||||||
|
| ("3[E2]" : "трехсотой@")
|
||||||
|
| ("3[E2]" : "трехсотом@")
|
||||||
|
| ("3[E2]" : "трехсотому@")
|
||||||
|
| ("3[E2]" : "трехсотую@")
|
||||||
|
| ("3[E2]" : "трехсотые@")
|
||||||
|
| ("3[E2]" : "трехсотый@")
|
||||||
|
| ("3[E2]" : "трехсотым@")
|
||||||
|
| ("3[E2]" : "трехсотыми@")
|
||||||
|
| ("3[E2]" : "трехсотых@")
|
||||||
|
| ("3[E2]" : "трехстах")
|
||||||
|
| ("3[E2]" : "триста")
|
||||||
|
| ("4[E1]" : "сорок")
|
||||||
|
| ("4[E1]" : "сорока")
|
||||||
|
| ("4[E1]" : "сороковая@")
|
||||||
|
| ("4[E1]" : "сорокового@")
|
||||||
|
| ("4[E1]" : "сороковое@")
|
||||||
|
| ("4[E1]" : "сороковой@")
|
||||||
|
| ("4[E1]" : "сороковом@")
|
||||||
|
| ("4[E1]" : "сороковому@")
|
||||||
|
| ("4[E1]" : "сороковую@")
|
||||||
|
| ("4[E1]" : "сороковые@")
|
||||||
|
| ("4[E1]" : "сороковым@")
|
||||||
|
| ("4[E1]" : "сороковыми@")
|
||||||
|
| ("4[E1]" : "сороковых@")
|
||||||
|
| ("4[E2]" : "четыремстам")
|
||||||
|
| ("4[E2]" : "четыреста")
|
||||||
|
| ("4[E2]" : "четырехсот")
|
||||||
|
| ("4[E2]" : "четырехсотая@")
|
||||||
|
| ("4[E2]" : "четырехсотого@")
|
||||||
|
| ("4[E2]" : "четырехсотое@")
|
||||||
|
| ("4[E2]" : "четырехсотой@")
|
||||||
|
| ("4[E2]" : "четырехсотом@")
|
||||||
|
| ("4[E2]" : "четырехсотому@")
|
||||||
|
| ("4[E2]" : "четырехсотую@")
|
||||||
|
| ("4[E2]" : "четырехсотые@")
|
||||||
|
| ("4[E2]" : "четырехсотый@")
|
||||||
|
| ("4[E2]" : "четырехсотым@")
|
||||||
|
| ("4[E2]" : "четырехсотыми@")
|
||||||
|
| ("4[E2]" : "четырехсотых@")
|
||||||
|
| ("4[E2]" : "четырехстах")
|
||||||
|
| ("4[E2]" : "четырьмястами")
|
||||||
|
| ("5[E1]" : "пятидесятая@")
|
||||||
|
| ("5[E1]" : "пятидесяти")
|
||||||
|
| ("5[E1]" : "пятидесятого@")
|
||||||
|
| ("5[E1]" : "пятидесятое@")
|
||||||
|
| ("5[E1]" : "пятидесятой@")
|
||||||
|
| ("5[E1]" : "пятидесятом@")
|
||||||
|
| ("5[E1]" : "пятидесятому@")
|
||||||
|
| ("5[E1]" : "пятидесятую@")
|
||||||
|
| ("5[E1]" : "пятидесятые@")
|
||||||
|
| ("5[E1]" : "пятидесятый@")
|
||||||
|
| ("5[E1]" : "пятидесятым@")
|
||||||
|
| ("5[E1]" : "пятидесятыми@")
|
||||||
|
| ("5[E1]" : "пятидесятых@")
|
||||||
|
| ("5[E1]" : "пятьдесят")
|
||||||
|
| ("5[E1]" : "пятьюдесятью")
|
||||||
|
| ("5[E2]" : "пятисот")
|
||||||
|
| ("5[E2]" : "пятисотая@")
|
||||||
|
| ("5[E2]" : "пятисотого@")
|
||||||
|
| ("5[E2]" : "пятисотое@")
|
||||||
|
| ("5[E2]" : "пятисотой@")
|
||||||
|
| ("5[E2]" : "пятисотом@")
|
||||||
|
| ("5[E2]" : "пятисотому@")
|
||||||
|
| ("5[E2]" : "пятисотую@")
|
||||||
|
| ("5[E2]" : "пятисотые@")
|
||||||
|
| ("5[E2]" : "пятисотый@")
|
||||||
|
| ("5[E2]" : "пятисотым@")
|
||||||
|
| ("5[E2]" : "пятисотыми@")
|
||||||
|
| ("5[E2]" : "пятисотых@")
|
||||||
|
| ("5[E2]" : "пятистам")
|
||||||
|
| ("5[E2]" : "пятистах")
|
||||||
|
| ("5[E2]" : "пятьсот")
|
||||||
|
| ("5[E2]" : "пятьюстами")
|
||||||
|
| ("6[E1]" : "шестидесятая@")
|
||||||
|
| ("6[E1]" : "шестидесяти")
|
||||||
|
| ("6[E1]" : "шестидесятого@")
|
||||||
|
| ("6[E1]" : "шестидесятое@")
|
||||||
|
| ("6[E1]" : "шестидесятой@")
|
||||||
|
| ("6[E1]" : "шестидесятом@")
|
||||||
|
| ("6[E1]" : "шестидесятому@")
|
||||||
|
| ("6[E1]" : "шестидесятую@")
|
||||||
|
| ("6[E1]" : "шестидесятые@")
|
||||||
|
| ("6[E1]" : "шестидесятый@")
|
||||||
|
| ("6[E1]" : "шестидесятым@")
|
||||||
|
| ("6[E1]" : "шестидесятыми@")
|
||||||
|
| ("6[E1]" : "шестидесятых@")
|
||||||
|
| ("6[E1]" : "шестьдесят")
|
||||||
|
| ("6[E1]" : "шестьюдесятью")
|
||||||
|
| ("6[E2]" : "шестисот")
|
||||||
|
| ("6[E2]" : "шестисотая@")
|
||||||
|
| ("6[E2]" : "шестисотого@")
|
||||||
|
| ("6[E2]" : "шестисотое@")
|
||||||
|
| ("6[E2]" : "шестисотой@")
|
||||||
|
| ("6[E2]" : "шестисотом@")
|
||||||
|
| ("6[E2]" : "шестисотому@")
|
||||||
|
| ("6[E2]" : "шестисотую@")
|
||||||
|
| ("6[E2]" : "шестисотые@")
|
||||||
|
| ("6[E2]" : "шестисотый@")
|
||||||
|
| ("6[E2]" : "шестисотым@")
|
||||||
|
| ("6[E2]" : "шестисотыми@")
|
||||||
|
| ("6[E2]" : "шестисотых@")
|
||||||
|
| ("6[E2]" : "шестистам")
|
||||||
|
| ("6[E2]" : "шестистах")
|
||||||
|
| ("6[E2]" : "шестьсот")
|
||||||
|
| ("6[E2]" : "шестьюстами")
|
||||||
|
| ("7[E1]" : "семидесятая@")
|
||||||
|
| ("7[E1]" : "семидесяти")
|
||||||
|
| ("7[E1]" : "семидесятого@")
|
||||||
|
| ("7[E1]" : "семидесятое@")
|
||||||
|
| ("7[E1]" : "семидесятой@")
|
||||||
|
| ("7[E1]" : "семидесятом@")
|
||||||
|
| ("7[E1]" : "семидесятому@")
|
||||||
|
| ("7[E1]" : "семидесятую@")
|
||||||
|
| ("7[E1]" : "семидесятые@")
|
||||||
|
| ("7[E1]" : "семидесятый@")
|
||||||
|
| ("7[E1]" : "семидесятым@")
|
||||||
|
| ("7[E1]" : "семидесятыми@")
|
||||||
|
| ("7[E1]" : "семидесятых@")
|
||||||
|
| ("7[E1]" : "семьдесят")
|
||||||
|
| ("7[E1]" : "семьюдесятью")
|
||||||
|
| ("7[E2]" : "семисот")
|
||||||
|
| ("7[E2]" : "семисотая@")
|
||||||
|
| ("7[E2]" : "семисотого@")
|
||||||
|
| ("7[E2]" : "семисотое@")
|
||||||
|
| ("7[E2]" : "семисотой@")
|
||||||
|
| ("7[E2]" : "семисотом@")
|
||||||
|
| ("7[E2]" : "семисотому@")
|
||||||
|
| ("7[E2]" : "семисотую@")
|
||||||
|
| ("7[E2]" : "семисотые@")
|
||||||
|
| ("7[E2]" : "семисотый@")
|
||||||
|
| ("7[E2]" : "семисотым@")
|
||||||
|
| ("7[E2]" : "семисотыми@")
|
||||||
|
| ("7[E2]" : "семисотых@")
|
||||||
|
| ("7[E2]" : "семистам")
|
||||||
|
| ("7[E2]" : "семистах")
|
||||||
|
| ("7[E2]" : "семьсот")
|
||||||
|
| ("7[E2]" : "семьюстами")
|
||||||
|
| ("8[E1]" : "восемьдесят")
|
||||||
|
| ("8[E1]" : "восьмидесятая@")
|
||||||
|
| ("8[E1]" : "восьмидесяти")
|
||||||
|
| ("8[E1]" : "восьмидесятого@")
|
||||||
|
| ("8[E1]" : "восьмидесятое@")
|
||||||
|
| ("8[E1]" : "восьмидесятой@")
|
||||||
|
| ("8[E1]" : "восьмидесятом@")
|
||||||
|
| ("8[E1]" : "восьмидесятому@")
|
||||||
|
| ("8[E1]" : "восьмидесятую@")
|
||||||
|
| ("8[E1]" : "восьмидесятые@")
|
||||||
|
| ("8[E1]" : "восьмидесятый@")
|
||||||
|
| ("8[E1]" : "восьмидесятым@")
|
||||||
|
| ("8[E1]" : "восьмидесятыми@")
|
||||||
|
| ("8[E1]" : "восьмидесятых@")
|
||||||
|
| ("8[E1]" : "восьмьюдесятью")
|
||||||
|
| ("8[E2]" : "восемьсот")
|
||||||
|
| ("8[E2]" : "восемьюстами")
|
||||||
|
| ("8[E2]" : "восьмисот")
|
||||||
|
| ("8[E2]" : "восьмисотая@")
|
||||||
|
| ("8[E2]" : "восьмисотого@")
|
||||||
|
| ("8[E2]" : "восьмисотое@")
|
||||||
|
| ("8[E2]" : "восьмисотой@")
|
||||||
|
| ("8[E2]" : "восьмисотом@")
|
||||||
|
| ("8[E2]" : "восьмисотому@")
|
||||||
|
| ("8[E2]" : "восьмисотую@")
|
||||||
|
| ("8[E2]" : "восьмисотые@")
|
||||||
|
| ("8[E2]" : "восьмисотый@")
|
||||||
|
| ("8[E2]" : "восьмисотым@")
|
||||||
|
| ("8[E2]" : "восьмисотыми@")
|
||||||
|
| ("8[E2]" : "восьмисотых@")
|
||||||
|
| ("8[E2]" : "восьмистам")
|
||||||
|
| ("8[E2]" : "восьмистах")
|
||||||
|
| ("8[E2]" : "восьмьюстами")
|
||||||
|
| ("9[E1]" : "девяноста")
|
||||||
|
| ("9[E1]" : "девяностая@")
|
||||||
|
| ("9[E1]" : "девяносто")
|
||||||
|
| ("9[E1]" : "девяностого@")
|
||||||
|
| ("9[E1]" : "девяностое@")
|
||||||
|
| ("9[E1]" : "девяностой@")
|
||||||
|
| ("9[E1]" : "девяностом@")
|
||||||
|
| ("9[E1]" : "девяностому@")
|
||||||
|
| ("9[E1]" : "девяностую@")
|
||||||
|
| ("9[E1]" : "девяностые@")
|
||||||
|
| ("9[E1]" : "девяностый@")
|
||||||
|
| ("9[E1]" : "девяностым@")
|
||||||
|
| ("9[E1]" : "девяностыми@")
|
||||||
|
| ("9[E1]" : "девяностых@")
|
||||||
|
| ("9[E2]" : "девятисот")
|
||||||
|
| ("9[E2]" : "девятисотая@")
|
||||||
|
| ("9[E2]" : "девятисотого@")
|
||||||
|
| ("9[E2]" : "девятисотое@")
|
||||||
|
| ("9[E2]" : "девятисотой@")
|
||||||
|
| ("9[E2]" : "девятисотом@")
|
||||||
|
| ("9[E2]" : "девятисотому@")
|
||||||
|
| ("9[E2]" : "девятисотую@")
|
||||||
|
| ("9[E2]" : "девятисотые@")
|
||||||
|
| ("9[E2]" : "девятисотый@")
|
||||||
|
| ("9[E2]" : "девятисотым@")
|
||||||
|
| ("9[E2]" : "девятисотыми@")
|
||||||
|
| ("9[E2]" : "девятисотых@")
|
||||||
|
| ("9[E2]" : "девятистам")
|
||||||
|
| ("9[E2]" : "девятистах")
|
||||||
|
| ("9[E2]" : "девятьсот")
|
||||||
|
| ("9[E2]" : "девятьюстами")]
|
||||||
|
;
|
||||||
|
|
||||||
|
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
lexset1 = Optimize[
|
||||||
|
("+" : "")
|
||||||
|
| ("1" : "один")
|
||||||
|
| ("1" : "одна")
|
||||||
|
| ("1" : "одни")
|
||||||
|
| ("1" : "одним")
|
||||||
|
| ("1" : "одними")
|
||||||
|
| ("1" : "одних")
|
||||||
|
| ("1" : "одно")
|
||||||
|
| ("1" : "одного")
|
||||||
|
| ("1" : "одной")
|
||||||
|
| ("1" : "одном")
|
||||||
|
| ("1" : "одному")
|
||||||
|
| ("1" : "одною")
|
||||||
|
| ("1" : "одну")
|
||||||
|
| ("1" : "первая@")
|
||||||
|
| ("1" : "первого@")
|
||||||
|
| ("1" : "первое@")
|
||||||
|
| ("1" : "первой@")
|
||||||
|
| ("1" : "первом@")
|
||||||
|
| ("1" : "первому@")
|
||||||
|
| ("1" : "первую@")
|
||||||
|
| ("1" : "первые@")
|
||||||
|
| ("1" : "первый@")
|
||||||
|
| ("1" : "первым@")
|
||||||
|
| ("1" : "первыми@")
|
||||||
|
| ("1" : "первых@")
|
||||||
|
| ("2" : "вторая@")
|
||||||
|
| ("2" : "второго@")
|
||||||
|
| ("2" : "второе@")
|
||||||
|
| ("2" : "второй@")
|
||||||
|
| ("2" : "втором@")
|
||||||
|
| ("2" : "второму@")
|
||||||
|
| ("2" : "вторую@")
|
||||||
|
| ("2" : "вторые@")
|
||||||
|
| ("2" : "вторым@")
|
||||||
|
| ("2" : "вторыми@")
|
||||||
|
| ("2" : "вторых@")
|
||||||
|
| ("2" : "два")
|
||||||
|
| ("2" : "две")
|
||||||
|
| ("2" : "двум")
|
||||||
|
| ("2" : "двумя")
|
||||||
|
| ("2" : "двух")
|
||||||
|
| ("3" : "трем")
|
||||||
|
| ("3" : "тремя")
|
||||||
|
| ("3" : "третий@")
|
||||||
|
| ("3" : "третье@")
|
||||||
|
| ("3" : "третьего@")
|
||||||
|
| ("3" : "третьей@")
|
||||||
|
| ("3" : "третьем@")
|
||||||
|
| ("3" : "третьему@")
|
||||||
|
| ("3" : "третьи@")
|
||||||
|
| ("3" : "третьим@")
|
||||||
|
| ("3" : "третьими@")
|
||||||
|
| ("3" : "третьих@")
|
||||||
|
| ("3" : "третью@")
|
||||||
|
| ("3" : "третья@")
|
||||||
|
| ("3" : "трех")
|
||||||
|
| ("3" : "три")
|
||||||
|
| ("4" : "четвертая@")
|
||||||
|
| ("4" : "четвертого@")
|
||||||
|
| ("4" : "четвертое@")
|
||||||
|
| ("4" : "четвертой@")
|
||||||
|
| ("4" : "четвертом@")
|
||||||
|
| ("4" : "четвертому@")
|
||||||
|
| ("4" : "четвертую@")
|
||||||
|
| ("4" : "четвертые@")
|
||||||
|
| ("4" : "четвертый@")
|
||||||
|
| ("4" : "четвертым@")
|
||||||
|
| ("4" : "четвертыми@")
|
||||||
|
| ("4" : "четвертых@")
|
||||||
|
| ("4" : "четыре")
|
||||||
|
| ("4" : "четырем")
|
||||||
|
| ("4" : "четырех")
|
||||||
|
| ("4" : "четырьмя")
|
||||||
|
| ("5" : "пятая@")
|
||||||
|
| ("5" : "пяти")
|
||||||
|
| ("5" : "пятого@")
|
||||||
|
| ("5" : "пятое@")
|
||||||
|
| ("5" : "пятой@")
|
||||||
|
| ("5" : "пятом@")
|
||||||
|
| ("5" : "пятому@")
|
||||||
|
| ("5" : "пятую@")
|
||||||
|
| ("5" : "пятые@")
|
||||||
|
| ("5" : "пятый@")
|
||||||
|
| ("5" : "пятым@")
|
||||||
|
| ("5" : "пятыми@")
|
||||||
|
| ("5" : "пятых@")
|
||||||
|
| ("5" : "пять")
|
||||||
|
| ("5" : "пятью")
|
||||||
|
| ("6" : "шестая@")
|
||||||
|
| ("6" : "шести")
|
||||||
|
| ("6" : "шестого@")
|
||||||
|
| ("6" : "шестое@")
|
||||||
|
| ("6" : "шестой@")
|
||||||
|
| ("6" : "шестом@")
|
||||||
|
| ("6" : "шестому@")
|
||||||
|
| ("6" : "шестую@")
|
||||||
|
| ("6" : "шестые@")
|
||||||
|
| ("6" : "шестым@")
|
||||||
|
| ("6" : "шестыми@")
|
||||||
|
| ("6" : "шестых@")
|
||||||
|
| ("6" : "шесть")
|
||||||
|
| ("6" : "шестью")
|
||||||
|
| ("7" : "седьмая@")
|
||||||
|
| ("7" : "седьмого@")
|
||||||
|
| ("7" : "седьмое@")
|
||||||
|
| ("7" : "седьмой@")
|
||||||
|
| ("7" : "седьмом@")
|
||||||
|
| ("7" : "седьмому@")
|
||||||
|
| ("7" : "седьмую@")
|
||||||
|
| ("7" : "седьмые@")
|
||||||
|
| ("7" : "седьмым@")
|
||||||
|
| ("7" : "седьмыми@")
|
||||||
|
| ("7" : "седьмых@")
|
||||||
|
| ("7" : "семи")
|
||||||
|
| ("7" : "семь")
|
||||||
|
| ("7" : "семью")
|
||||||
|
| ("8" : "восемь")
|
||||||
|
| ("8" : "восьмая@")
|
||||||
|
| ("8" : "восьми")
|
||||||
|
| ("8" : "восьмого@")
|
||||||
|
| ("8" : "восьмое@")
|
||||||
|
| ("8" : "восьмой@")
|
||||||
|
| ("8" : "восьмом@")
|
||||||
|
| ("8" : "восьмому@")
|
||||||
|
| ("8" : "восьмую@")
|
||||||
|
| ("8" : "восьмые@")
|
||||||
|
| ("8" : "восьмым@")
|
||||||
|
| ("8" : "восьмыми@")
|
||||||
|
| ("8" : "восьмых@")
|
||||||
|
| ("8" : "восьмью")
|
||||||
|
| ("9" : "девятая@")
|
||||||
|
| ("9" : "девяти")
|
||||||
|
| ("9" : "девятого@")
|
||||||
|
| ("9" : "девятое@")
|
||||||
|
| ("9" : "девятой@")
|
||||||
|
| ("9" : "девятом@")
|
||||||
|
| ("9" : "девятому@")
|
||||||
|
| ("9" : "девятую@")
|
||||||
|
| ("9" : "девятые@")
|
||||||
|
| ("9" : "девятый@")
|
||||||
|
| ("9" : "девятым@")
|
||||||
|
| ("9" : "девятыми@")
|
||||||
|
| ("9" : "девятых@")
|
||||||
|
| ("9" : "девять")
|
||||||
|
| ("9" : "девятью")
|
||||||
|
| ("[E3]" : "тысяч")
|
||||||
|
| ("[E3]" : "тысяча")
|
||||||
|
| ("[E3]" : "тысячам")
|
||||||
|
| ("[E3]" : "тысячами")
|
||||||
|
| ("[E3]" : "тысячах")
|
||||||
|
| ("[E3]" : "тысяче")
|
||||||
|
| ("[E3]" : "тысячей")
|
||||||
|
| ("[E3]" : "тысячи")
|
||||||
|
| ("[E3]" : "тысячу")
|
||||||
|
| ("[E3]" : "тысячью")
|
||||||
|
| ("[E6]" : "миллион")
|
||||||
|
| ("[E6]" : "миллиона")
|
||||||
|
| ("[E6]" : "миллионам")
|
||||||
|
| ("[E6]" : "миллионами")
|
||||||
|
| ("[E6]" : "миллионах")
|
||||||
|
| ("[E6]" : "миллионе")
|
||||||
|
| ("[E6]" : "миллионов")
|
||||||
|
| ("[E6]" : "миллионом")
|
||||||
|
| ("[E6]" : "миллиону")
|
||||||
|
| ("[E6]" : "миллионы")
|
||||||
|
| ("[E9]" : "миллиард")
|
||||||
|
| ("[E9]" : "миллиарда")
|
||||||
|
| ("[E9]" : "миллиардам")
|
||||||
|
| ("[E9]" : "миллиардами")
|
||||||
|
| ("[E9]" : "миллиардах")
|
||||||
|
| ("[E9]" : "миллиарде")
|
||||||
|
| ("[E9]" : "миллиардов")
|
||||||
|
| ("[E9]" : "миллиардом")
|
||||||
|
| ("[E9]" : "миллиарду")
|
||||||
|
| ("[E9]" : "миллиарды")
|
||||||
|
| ("|0|" : "ноле")
|
||||||
|
| ("|0|" : "нолем")
|
||||||
|
| ("|0|" : "ноль")
|
||||||
|
| ("|0|" : "нолю")
|
||||||
|
| ("|0|" : "ноля")
|
||||||
|
| ("|0|" : "нуле")
|
||||||
|
| ("|0|" : "нулем")
|
||||||
|
| ("|0|" : "нуль")
|
||||||
|
| ("|0|" : "нулю")
|
||||||
|
| ("|0|" : "нуля")]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Rewrites every lexset1 match in any context (both context arguments are
# empty), with I[space] concatenated after the rewritten entry.
# NOTE(review): I and space are defined outside this excerpt - presumably I
# inserts its argument, so each word is followed by a separator; confirm.
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Single exported lexicon transducer: the composition of lex3, lex2 and lex1,
# in that order.
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
||||||
|
|
||||||
|
# The exponent symbols that lexset1 maps to a word on their own, without a
# preceding digit: [E3], [E6] and [E9] (the "thousand", "million" and
# "billion" forms).
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
||||||
|
|
||||||
|
# END LANGUAGE SPECIFIC DATA
|
||||||
|
################################################################################
|
||||||
|
# Inserts a marker after the Ms.
|
||||||
|
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes all powers and "+".
|
||||||
|
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes leading zeros at the beginning of a number, so that "0003" does not
|
||||||
|
# get treated as an ordinary number.
|
||||||
|
export DELETE_INITIAL_ZEROS =
|
||||||
|
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Every symbol of POWERS that is not in Ms.
NonMs = Optimize[POWERS - Ms];
|
||||||
|
|
||||||
|
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be
|
||||||
|
# deleted
|
||||||
|
export DELETE_INTERMEDIATE_ZEROS1 =
|
||||||
|
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
||||||
|
# that and the previous boundary. Thus, if after the result of the rule above we
|
||||||
|
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
||||||
|
# zero.
|
||||||
|
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
||||||
|
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Final clean up of stray zeros.
|
||||||
|
export DELETE_REMAINING_ZEROS = Optimize[
|
||||||
|
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Applies the revaluation map. For example in English, change [E4] to [E1] as a
|
||||||
|
# modifier of [E3]
|
||||||
|
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
# Deletes the various marks and powers in the input and output.
|
||||||
|
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
||||||
|
|
||||||
|
export CLEAN_SPACES = Optimize[
|
||||||
|
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
||||||
|
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
||||||
|
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
|
||||||
|
d = b.kDigit;
|
||||||
|
|
||||||
|
# Germanic inversion rule.
|
||||||
|
germanic =
|
||||||
|
(I["1+"] d "[E1]" D["+1"])
|
||||||
|
| (I["2+"] d "[E1]" D["+2"])
|
||||||
|
| (I["3+"] d "[E1]" D["+3"])
|
||||||
|
| (I["4+"] d "[E1]" D["+4"])
|
||||||
|
| (I["5+"] d "[E1]" D["+5"])
|
||||||
|
| (I["6+"] d "[E1]" D["+6"])
|
||||||
|
| (I["7+"] d "[E1]" D["+7"])
|
||||||
|
| (I["8+"] d "[E1]" D["+8"])
|
||||||
|
| (I["9+"] d "[E1]" D["+9"])
|
||||||
|
;
|
||||||
|
|
||||||
|
germanic_inversion =
|
||||||
|
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
||||||
|
;
|
||||||
|
|
||||||
|
# Exported as plain SIGMA_STAR rather than the germanic_inversion rule built
# above, so no inversion rewrite is applied through this export.
export GERMANIC_INVERSION = SIGMA_STAR;
|
||||||
|
# Accepts only strings made of non-"@" symbols followed by one final "@" (the
# marker carried by the ordinal entries in the lexicon above), then passes
# them through a rewrite of D["@"].
# NOTE(review): D is defined outside this excerpt; the "Deletes ..." comments
# on the other D[...] rules suggest it deletes its argument - confirm.
export ORDINAL_RESTRICTION =
|
||||||
|
Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]]
|
||||||
|
;
|
||||||
|
# The byte set b.kBytes with the digits b.kDigit removed.
nondigits = b.kBytes - b.kDigit;
|
||||||
|
# D applied to any run (possibly empty) of non-digit bytes.
# NOTE(review): presumably this strips a written suffix that follows the
# digits of an ordinal - confirm against D's definition and the callers.
export ORDINAL_SUFFIX = D[nondigits*];
|
|
@ -0,0 +1,77 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# This verbalizer is used whenever there is an LM symbol that consists of
|
||||||
|
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
||||||
|
# suffix.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/classifier/cyrillic.grm' as c;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
char_set = (("a" | "A") : "letter-a")
|
||||||
|
| (("b" | "B") : "letter-b")
|
||||||
|
| (("c" | "C") : "letter-c")
|
||||||
|
| (("d" | "D") : "letter-d")
|
||||||
|
| (("e" | "E") : "letter-e")
|
||||||
|
| (("f" | "F") : "letter-f")
|
||||||
|
| (("g" | "G") : "letter-g")
|
||||||
|
| (("h" | "H") : "letter-h")
|
||||||
|
| (("i" | "I") : "letter-i")
|
||||||
|
| (("j" | "J") : "letter-j")
|
||||||
|
| (("k" | "K") : "letter-k")
|
||||||
|
| (("l" | "L") : "letter-l")
|
||||||
|
| (("m" | "M") : "letter-m")
|
||||||
|
| (("n" | "N") : "letter-n")
|
||||||
|
| (("o" | "O") : "letter-o")
|
||||||
|
| (("p" | "P") : "letter-p")
|
||||||
|
| (("q" | "Q") : "letter-q")
|
||||||
|
| (("r" | "R") : "letter-r")
|
||||||
|
| (("s" | "S") : "letter-s")
|
||||||
|
| (("t" | "T") : "letter-t")
|
||||||
|
| (("u" | "U") : "letter-u")
|
||||||
|
| (("v" | "V") : "letter-v")
|
||||||
|
| (("w" | "W") : "letter-w")
|
||||||
|
| (("x" | "X") : "letter-x")
|
||||||
|
| (("y" | "Y") : "letter-y")
|
||||||
|
| (("z" | "Z") : "letter-z")
|
||||||
|
| (digit)
|
||||||
|
| ("&" : "@@AND@@")
|
||||||
|
| ("." : "")
|
||||||
|
| ("-" : "")
|
||||||
|
| ("_" : "")
|
||||||
|
| ("/" : "")
|
||||||
|
| (n.I["letter-"] c.kCyrillicAlpha)
|
||||||
|
;
|
||||||
|
|
||||||
|
ins_space = "" : " ";
|
||||||
|
|
||||||
|
suffix = "{spelled}" : "";
|
||||||
|
|
||||||
|
# One or more char_set symbols, with a space inserted between consecutive
# ones (ins_space), followed by the "{spelled}" tag, which suffix maps to the
# empty string.
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
||||||
|
|
||||||
|
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
||||||
|
|
||||||
|
sigma_star = b.kBytes*;
|
||||||
|
|
||||||
|
# Gets rid of the letter- prefix since in some cases we don't want it.
|
||||||
|
|
||||||
|
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
||||||
|
|
||||||
|
# One or more char_set symbols separated by inserted spaces, with no
# "{spelled}" tag required at the end.
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
||||||
|
|
||||||
|
# spelled_no_tag composed with del_letter, which rewrites the "letter-"
# prefixes produced by char_set.
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
@ -0,0 +1,24 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
|
||||||
|
punct =
|
||||||
|
("." : "@@PERIOD@@")
|
||||||
|
| ("," : "@@COMMA@@")
|
||||||
|
| ("!" : "@@EXCLAMATION_MARK@@")
|
||||||
|
| ("?" : "@@QUESTION_MARK@@")
|
||||||
|
;
|
||||||
|
|
||||||
|
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
|
@ -0,0 +1,108 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as b;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
|
||||||
|
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
||||||
|
|
||||||
|
increment_hour =
|
||||||
|
("0" : "1")
|
||||||
|
| ("1" : "2")
|
||||||
|
| ("2" : "3")
|
||||||
|
| ("3" : "4")
|
||||||
|
| ("4" : "5")
|
||||||
|
| ("5" : "6")
|
||||||
|
| ("6" : "7")
|
||||||
|
| ("7" : "8")
|
||||||
|
| ("8" : "9")
|
||||||
|
| ("9" : "10")
|
||||||
|
| ("10" : "11")
|
||||||
|
| ("11" : "12")
|
||||||
|
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
||||||
|
| ("13" : "14")
|
||||||
|
| ("14" : "15")
|
||||||
|
| ("15" : "16")
|
||||||
|
| ("16" : "17")
|
||||||
|
| ("17" : "18")
|
||||||
|
| ("18" : "19")
|
||||||
|
| ("19" : "20")
|
||||||
|
| ("20" : "21")
|
||||||
|
| ("21" : "22")
|
||||||
|
| ("22" : "23")
|
||||||
|
| ("23" : "12")
|
||||||
|
;
|
||||||
|
|
||||||
|
# The accepted hour strings: the input side of increment_hour, i.e. "0"
# through "23" written without a leading zero.
hours = Project[increment_hour, 'input'];
|
||||||
|
|
||||||
|
# Any digit.
d = b.kDigit;
|
||||||
|
# Any digit except "0".
D = d - "0";
|
||||||
|
|
||||||
|
# Two-character minute strings with a leading zero and a non-zero second
# digit ("01" through "09"); "00" is handled separately in verbalize_minutes.
minutes09 = "0" D;
|
||||||
|
|
||||||
|
# Two-character minute strings whose first digit is 1-5 ("10" through "59").
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
||||||
|
|
||||||
|
__sep__ = ":";
|
||||||
|
sep_space = __sep__ : " ";
|
||||||
|
|
||||||
|
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
||||||
|
|
||||||
|
verbalize_minutes =
|
||||||
|
("00" : "@@HOUR@@")
|
||||||
|
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
||||||
|
| (minutes @ n.CARDINAL_NUMBERS)
|
||||||
|
;
|
||||||
|
|
||||||
|
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
||||||
|
|
||||||
|
# Special cases we handle right now.
|
||||||
|
# TODO: Need to allow for cases like
|
||||||
|
#
|
||||||
|
# half twelve (in the UK English sense)
|
||||||
|
# half twaalf (in the Dutch sense)
|
||||||
|
|
||||||
|
time_quarter_past =
|
||||||
|
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
||||||
|
verbalize_hours
|
||||||
|
n.D[__sep__ "15"];
|
||||||
|
|
||||||
|
time_half_past =
|
||||||
|
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
||||||
|
verbalize_hours
|
||||||
|
n.D[__sep__ "30"];
|
||||||
|
|
||||||
|
time_quarter_to =
|
||||||
|
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
||||||
|
(increment_hour @ verbalize_hours)
|
||||||
|
n.D[__sep__ "45"];
|
||||||
|
|
||||||
|
time_extra = Optimize[
|
||||||
|
time_quarter_past | time_half_past | time_quarter_to]
|
||||||
|
;
|
||||||
|
|
||||||
|
# Basic time periods which most languages can be expected to have.
|
||||||
|
__am__ = "a.m." | "am" | "AM" | "утра";
|
||||||
|
__pm__ = "p.m." | "pm" | "PM" | "вечера";
|
||||||
|
|
||||||
|
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
||||||
|
|
||||||
|
time_variants = time_basic | time_extra;
|
||||||
|
|
||||||
|
time = Optimize[
|
||||||
|
(period (" " | n.I[" "]))? time_variants
|
||||||
|
| time_variants ((" " | n.I[" "]) period)?]
|
||||||
|
;
|
||||||
|
|
||||||
|
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
@ -0,0 +1,68 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Rules for URLs and email addresses.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as bytelib;
|
||||||
|
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||||
|
|
||||||
|
ins_space = "" : " ";
|
||||||
|
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||||
|
at = "@" : "@@AT@@";
|
||||||
|
|
||||||
|
url_suffix =
|
||||||
|
(".com" : dot ins_space "com") |
|
||||||
|
(".gov" : dot ins_space "gov") |
|
||||||
|
(".edu" : dot ins_space "e d u") |
|
||||||
|
(".org" : dot ins_space "org") |
|
||||||
|
(".net" : dot ins_space "net")
|
||||||
|
;
|
||||||
|
|
||||||
|
# One or more bytelib.kAlnum characters, passed through unchanged.
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
|
||||||
|
|
||||||
|
letter_string_dot =
|
||||||
|
((letter_string ins_space dot ins_space)* letter_string)
|
||||||
|
;
|
||||||
|
|
||||||
|
# Rules for URLs.
|
||||||
|
export URL = Optimize[
|
||||||
|
((letter_string_dot) (ins_space)
|
||||||
|
(url_suffix)) @ l.LEXICAL_MAP
|
||||||
|
];
|
||||||
|
|
||||||
|
# Rules for email addresses.
|
||||||
|
# One or more bytelib.kAlnum characters with a space inserted after every
# character except the last, so the string is read out character by character.
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
||||||
|
|
||||||
|
letter_by_letter_dot =
|
||||||
|
((letter_by_letter ins_space dot ins_space)*
|
||||||
|
letter_by_letter)
|
||||||
|
;
|
||||||
|
|
||||||
|
export EMAIL1 = Optimize[
|
||||||
|
((letter_by_letter) (ins_space)
|
||||||
|
(at) (ins_space)
|
||||||
|
(letter_by_letter_dot) (ins_space)
|
||||||
|
(url_suffix)) @ l.LEXICAL_MAP
|
||||||
|
];
|
||||||
|
|
||||||
|
export EMAIL2 = Optimize[
|
||||||
|
((letter_by_letter) (ins_space)
|
||||||
|
(at) (ins_space)
|
||||||
|
(letter_string_dot) (ins_space)
|
||||||
|
(url_suffix)) @ l.LEXICAL_MAP
|
||||||
|
];
|
||||||
|
|
||||||
|
export EMAILS = Optimize[
|
||||||
|
EMAIL1 | EMAIL2
|
||||||
|
];
|
@ -0,0 +1,42 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import 'util/util.grm' as util;
|
||||||
|
import 'ru/verbalizer/extra_numbers.grm' as e;
|
||||||
|
import 'ru/verbalizer/float.grm' as f;
|
||||||
|
import 'ru/verbalizer/math.grm' as ma;
|
||||||
|
import 'ru/verbalizer/miscellaneous.grm' as mi;
|
||||||
|
import 'ru/verbalizer/money.grm' as mo;
|
||||||
|
import 'ru/verbalizer/numbers.grm' as n;
|
||||||
|
import 'ru/verbalizer/numbers_plus.grm' as np;
|
||||||
|
import 'ru/verbalizer/spelled.grm' as s;
|
||||||
|
import 'ru/verbalizer/spoken_punct.grm' as sp;
|
||||||
|
import 'ru/verbalizer/time.grm' as t;
|
||||||
|
import 'ru/verbalizer/urls.grm' as u;
|
||||||
|
|
||||||
|
export VERBALIZER = Optimize[RmWeight[
|
||||||
|
( e.MIXED_NUMBERS
|
||||||
|
| e.DIGITS
|
||||||
|
| f.FLOAT
|
||||||
|
| ma.ARITHMETIC
|
||||||
|
| mi.MISCELLANEOUS
|
||||||
|
| mo.MONEY
|
||||||
|
| n.CARDINAL_NUMBERS
|
||||||
|
| n.ORDINAL_NUMBERS
|
||||||
|
| np.NUMBERS_PLUS
|
||||||
|
| s.SPELLED
|
||||||
|
| sp.SPOKEN_PUNCT
|
||||||
|
| t.TIME
|
||||||
|
| u.URL) @ util.CLEAN_SPACES
|
||||||
|
]];
|
@ -0,0 +1,3 @@
|
|||||||
|
# Language-universal grammar definitions
|
||||||
|
|
||||||
|
This directory contains various language-universal grammar definitions.
|
|
@ -0,0 +1,126 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Specifies common ways of delimiting thousands in digit strings.
|
||||||
|
|
||||||
|
import 'util/byte.grm' as bytelib;
|
||||||
|
import 'util/util.grm' as util;
|
||||||
|
|
||||||
|
killcomma = "," : "";
|
||||||
|
dot2comma = "." : ",";
|
||||||
|
spaces2comma = " "+ : ",";
|
||||||
|
|
||||||
|
zero = "0";
|
||||||
|
|
||||||
|
# no_delimiter = zero | "[1-9][0-9]*";
|
||||||
|
export no_delimiter = zero | (util.d1to9 bytelib.kDigit*);
|
||||||
|
|
||||||
|
# delim_map_dot = ("[0-9]" | ("\." : ","))*;
|
||||||
|
delim_map_dot = (bytelib.kDigit | dot2comma)*;
|
||||||
|
|
||||||
|
# delim_map_space = ("[0-9]" | (" +" : ","))*;
|
||||||
|
delim_map_space = (bytelib.kDigit | spaces2comma)*;
|
||||||
|
|
||||||
|
## Western systems group thousands. Korean goes this way too.
|
||||||
|
|
||||||
|
# comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*);
|
||||||
|
export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*);
|
||||||
|
|
||||||
|
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
||||||
|
# cannot match on input labels (sort?).
|
||||||
|
export dot_thousands = delim_map_dot @ comma_thousands;
|
||||||
|
|
||||||
|
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
||||||
|
# cannot match on input labels (sort?).
|
||||||
|
# Space-delimited thousands: map runs of spaces to "," first, then apply
# comma_thousands.
export space_thousands = delim_map_space @ comma_thousands;
|
||||||
|
|
||||||
|
## Chinese prefers grouping by fours (by ten-thousands).
|
||||||
|
|
||||||
|
# chinese_comma =
|
||||||
|
# zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*);
|
||||||
|
# Accepts "0", or a leading group of one to four digits (the first from
# util.d1to9) followed by any number of ","-prefixed four-digit groups.
# The commas are deleted on the output side.
export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*);
|
||||||
|
|
||||||
|
## The Indian system is more complex because of the Stravinskian alternation
|
||||||
|
## between lakhs and crores.
|
||||||
|
##
|
||||||
|
## According to Wikipedia:
|
||||||
|
##
|
||||||
|
## Indian English Value
|
||||||
|
## One 1
|
||||||
|
## Ten 10
|
||||||
|
## Hundred 100
|
||||||
|
## Thousand 1,000
|
||||||
|
## Lakh 1,00,000
|
||||||
|
## Crore 1,00,00,000
|
||||||
|
## Arab 1,00,00,00,000
|
||||||
|
## Kharab 1,00,00,00,00,000
|
||||||
|
|
||||||
|
# indian_hundreds = "[1-9][0-9]?[0-9]?";
|
||||||
|
# One to three digits, the first taken from util.d1to9; no separator involved.
indian_hundreds = util.d1to9 bytelib.kDigit{0,2};
|
||||||
|
|
||||||
|
## Up to 99,999.
|
||||||
|
|
||||||
|
# indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]";
|
||||||
|
# One or two leading digits, a deleted comma, then exactly three digits.
indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3};
|
||||||
|
|
||||||
|
## Up to 99,99,999.
|
||||||
|
|
||||||
|
# indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]";
|
||||||
|
# One or two leading digits, then a two-digit group and a three-digit group,
# each preceded by a comma that is deleted on output.
indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3};
|
||||||
|
|
||||||
|
## Up to 999,99,99,999.
|
||||||
|
|
||||||
|
# One to three leading digits, an optional two-digit group, then a two-digit
# group and a final three-digit group. Every separating comma is deleted on
# output.
indian_comma_crores =
  util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
  (bytelib.kDigit{2} killcomma)?
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{3}
;
|
||||||
|
|
||||||
|
## Up to 99,999,99,99,999.
|
||||||
|
|
||||||
|
# One or two leading digits followed by groups of 3, 2, 2 and 3 digits.
# Every separating comma is deleted on output.
indian_comma_thousand_crores =
  util.d1to9 bytelib.kDigit? killcomma
  bytelib.kDigit{3} killcomma
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{3}
;
|
||||||
|
|
||||||
|
## Up to 999,99,999,99,99,999.
|
||||||
|
|
||||||
|
# One to three leading digits followed by groups of 2, 3, 2, 2 and 3 digits.
# Every separating comma is deleted on output.
indian_comma_lakh_crores =
  util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{3} killcomma
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{2} killcomma
  bytelib.kDigit{3}
;
|
||||||
|
|
||||||
|
# Comma-delimited numbers in the Indian grouping system: the union of "0" and
# each of the magnitude-specific grammars defined above, from hundreds up to
# lakh crores.
export indian_comma =
  zero
  | indian_hundreds
  | indian_comma_thousands
  | indian_comma_lakhs
  | indian_comma_crores
  | indian_comma_thousand_crores
  | indian_comma_lakh_crores
;
|
||||||
|
|
||||||
|
# Indian number system with dots.
|
||||||
|
# Dot-delimited Indian grouping: map "." to "," first, then apply indian_comma.
export indian_dot_number = delim_map_dot @ indian_comma;
|
||||||
|
|
||||||
|
# Indian number system with spaces.
|
||||||
|
# Space-delimited Indian grouping: map runs of spaces to "," first, then apply
# indian_comma.
export indian_space_number = delim_map_space @ indian_comma;
|
@ -0,0 +1,3 @@
|
|||||||
|
# Utility grammar definitions
|
||||||
|
|
||||||
|
This directory contains various utility grammar definitions.
|
@ -0,0 +1,75 @@
|
|||||||
|
# Copyright 2017 Google Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# Standard constants for ASCII (byte) based strings. This mirrors the
|
||||||
|
# functions provided by C/C++'s ctype.h library.
|
||||||
|
|
||||||
|
# Note that [0] is missing; matching the string-termination character is kinda weird.
|
||||||
|
# Union of the bracketed numeric symbols "[1]" through "[255]", i.e. every
# byte value except 0 (the string-termination character is left out on
# purpose).
export kBytes = Optimize[
  "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" |
  "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" |
  "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
  "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" |
  "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" |
  "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
  "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" |
  "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" |
  "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
  "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" |
  "[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
  "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
  "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
  "[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
  "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
  "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
  "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
  "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
  "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
  "[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
  "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
  "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
  "[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
  "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
  "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
  "[251]" | "[252]" | "[253]" | "[254]" | "[255]"
];
|
||||||
|
|
||||||
|
# The ten ASCII decimal digits "0" through "9".
export kDigit = Optimize[
  "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
];
|
||||||
|
|
||||||
|
# The 26 ASCII lowercase letters "a" through "z".
export kLower = Optimize[
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
  "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];
|
||||||
|
# The 26 ASCII uppercase letters "A" through "Z".
export kUpper = Optimize[
  "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
  "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
];
|
||||||
|
# ASCII letters: the union of kLower and kUpper.
export kAlpha = Optimize[kLower | kUpper];
|
||||||
|
|
||||||
|
# ASCII letters and digits: the union of kDigit and kAlpha.
export kAlnum = Optimize[kDigit | kAlpha];
|
||||||
|
|
||||||
|
# Whitespace: space, tab, newline and carriage return.
# NOTE(review): C's isspace() also accepts "\v" and "\f" in the "C" locale;
# they are not part of this set -- confirm whether that is intentional.
export kSpace = Optimize[
  " " | "\t" | "\n" | "\r"
];
|
||||||
|
# Every byte in kBytes that is not in kSpace.
export kNotSpace = Optimize[kBytes - kSpace];
|
||||||
|
|
||||||
|
# The 32 ASCII punctuation characters. The double quote, backslash and square
# brackets are backslash-escaped because they are special inside a Thrax
# string literal.
export kPunct = Optimize[
  "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
  "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
  "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
];
|
||||||
|
|
||||||
|
# Alphanumerics plus punctuation: the union of kAlnum and kPunct.
export kGraph = Optimize[kAlnum | kPunct];
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue