parent
aba37810ff
commit
3f3442b98a
@ -1,2 +0,0 @@
|
||||
data
|
||||
exp
|
@ -1,3 +0,0 @@
|
||||
# G2P
|
||||
|
||||
* zh - Chinese G2P
|
@ -1,53 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import re
|
||||
|
||||
import jieba
|
||||
from pypinyin import lazy_pinyin
|
||||
from pypinyin import Style
|
||||
|
||||
|
||||
def extract_pinyin(source, target, use_jieba=False):
    """Transcribe every even-indexed line of ``source`` into pinyin.

    Lines are counted from zero. Each even-indexed line must consist of
    exactly two whitespace-separated fields, a sentence id and the text;
    odd-indexed lines are skipped. Every ``#`` followed by a digit is
    removed from the text before conversion (presumably prosody markers
    of the label file -- confirm against the data).

    Args:
        source: path of the UTF-8 input file.
        target: path of the UTF-8 output file; one line
            ``"<sentence_id> <syllables joined by spaces>"`` is written
            per processed input line.
        use_jieba: when true, the text is segmented with ``jieba.lcut``
            and the resulting word list is handed to ``lazy_pinyin``
            instead of the raw string.

    Raises:
        ValueError: if an even-indexed line does not split into exactly
            two fields.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for index, line in enumerate(fin):
            # Only lines 0, 2, 4, ... carry the text to transcribe.
            if index % 2 != 0:
                continue

            sentence_id, text = line.strip().split()
            text = re.sub(r'#\d', '', text)
            units = jieba.lcut(text) if use_jieba else text

            syllables = lazy_pinyin(
                units,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)
            fout.write(sentence_id + ' ' + ' '.join(syllables) + '\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: two positional paths and one switch.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
    for arg_name, arg_help in (
            ("input", "source file of baker's prosody label file"),
            ("output", "target file to write pinyin lables"), ):
        cli.add_argument(arg_name, type=str, help=arg_help)
    cli.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    options = cli.parse_args()

    extract_pinyin(
        options.input, options.output, use_jieba=options.use_jieba)
|
@ -1,37 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
|
||||
def extract_pinyin_lables(source, target):
    """Extract pinyin labels from Baker's prosody labeling.

    ``source`` is read as pairs of lines (counted from zero): the first
    whitespace-separated field of each even-indexed line is the sentence
    id, and the following odd-indexed line is its transcription. For each
    pair one line ``"<sentence_id> <transcription>"`` is written.

    Only the sentence id is taken from the even-indexed line, so the rest
    of that line may contain any amount of whitespace.

    Args:
        source: path of the UTF-8 label file.
        target: path of the UTF-8 output file.

    Raises:
        ValueError: if an even-indexed line is blank and therefore has no
            sentence id.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i % 2 == 0:
                # Split once: the text after the id is not needed, and it
                # must not break the parse if it contains spaces.
                fields = line.split(maxsplit=1)
                if not fields:
                    raise ValueError(
                        f"line {i + 1} of {source} has no sentence id")
                fout.write(f'{fields[0]} ')
            else:
                transcription = line.strip()
                fout.write(f'{transcription}\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: input label file and output path.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
    for arg_name, arg_help in (
            ("input", "source file of baker's prosody label file"),
            ("output", "target file to write pinyin lables"), ):
        cli.add_argument(arg_name, type=str, help=arg_help)
    options = cli.parse_args()

    extract_pinyin_lables(options.input, options.output)
|
@ -1,103 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import Union
|
||||
|
||||
|
||||
def erized(syllable: str) -> bool:
    """Whether the syllable contains erhua effect.

    The test inspects the second-to-last character, so it expects
    syllables that end with a single trailing character after the ``r``
    (the tone digit used throughout this module).

    Example
    --------
    huar1 -> True
    guanr1 -> True
    er2 -> False
    hua1 -> False
    """
    # note: for pinyin, len(syllable) >=2 is always true
    # if not: there is something wrong in the data
    assert len(syllable) >= 2, f"inavlid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """Return the reference syllables with tone sandhi undone.

    ``reference`` is a human annotation that makes sandhi explicit;
    ``generated`` comes from a simple g2p program that does not consider
    sandhi. Wherever a reference syllable differs from the aligned
    generated syllable only by carrying tone 2 where the generated one
    has tone 3, the generated syllable is used. Every other reference
    syllable is kept as is.

    The two sequences are walked with separate cursors: an erized
    reference syllable is copied unchanged and consumes two generated
    syllables (presumably the base syllable plus a separate "er" --
    confirm against the g2p output); any other syllable consumes one.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    ['huar1', 'lao2', 'hu3'], ['hua1', 'er2', 'lao3', 'hu3']
        -> ['huar1', 'lao3', 'hu3']

    Raises:
        IndexError: if ``generated`` runs out before ``reference`` does.
        AssertionError: if ``generated`` is not fully consumed, or a
            reference syllable is shorter than two characters.
    """
    # cursor into ``generated``; it runs ahead of the reference position
    # after every erized syllable, so it must be used for all lookups.
    j = 0

    # sandhi ignored in the result while other errors are not included
    result = []
    for ref_syllable in reference:
        if erized(ref_syllable):
            result.append(ref_syllable)
            j += 2
            continue

        gen_syllable = generated[j]
        same_base = ref_syllable[:-1] == gen_syllable[:-1]
        if same_base and ref_syllable[-1] == '2' and gen_syllable[-1] == '3':
            result.append(gen_syllable)
        else:
            result.append(ref_syllable)
        j += 1

    assert j == len(
        generated
    ), "length of transcriptions mismatch, There may be some characters that are ignored in the generated transcription."
    return result
|
||||
|
||||
|
||||
def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    """Write the reference transcriptions with sandhi ignored.

    The two input files are read in lockstep, one sentence per line, and
    iteration stops at the end of the shorter file. Every line is
    ``"<sentence_id> <syllables...>"``, split at the first space. The
    sentence id is taken from the reference file; the id of the generated
    line is discarded.

    If ``ignore_sandhi`` raises for a sentence, a message is printed and
    the reference transcription is written unchanged.

    Args:
        reference: path to the reference transcription.
        generated: path to the generated transcription.
        output: path of the file to write.

    Raises:
        ValueError: if a line of either input has no space-separated
            transcription after the id.
    """
    with open(reference, 'rt') as f_ref, \
            open(generated, 'rt') as f_gen, \
            open(output, 'wt') as f_out:
        for ref_line, gen_line in zip(f_ref, f_gen):
            sentence_id, ref_transcription = ref_line.strip().split(' ', 1)
            _, gen_transcription = gen_line.strip().split(' ', 1)

            try:
                merged = ignore_sandhi(ref_transcription.split(),
                                       gen_transcription.split())
                result = ' '.join(merged)
            except Exception:
                # Best effort: keep the reference when alignment fails.
                print(
                    f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
                )
                result = ref_transcription

            f_out.write(f"{sentence_id} {result}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point. All three paths are handed straight to
    # open() by convert_transcriptions, so each flag is marked required:
    # a missing one is then reported by argparse as a usage error instead
    # of surfacing later as a TypeError from open(None).
    parser = argparse.ArgumentParser(
        description="reference transcription but ignore sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        required=True,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated",
        type=str,
        required=True,
        help="path to the generated transcription.")
    parser.add_argument(
        "--output", type=str, required=True, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)
|
@ -1,33 +0,0 @@
|
||||
#!/bin/bash
# Verify the Baker archive in data_dir and extract its prosody label file
# into exp_dir. Both directories can be overridden through the options
# handled by parse_options.sh.

exp_dir="exp"
data_dir="data"

source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

archive="${data_dir}/BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum "${archive}" | awk '{print $1}')
# Quoted on purpose: if md5sum produced nothing, an unquoted test becomes
# a syntax error and the check would be skipped instead of failing.
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The Archive has been changed."
    exit -1
fi


label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
# "unrar e" drops the file into the current directory; copy it to exp_dir
# and remove the local copy.
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ]; then
    echo "File extraction failed!"
    # A bare "exit" would return the status of the echo above (success).
    exit 1
fi

exit 0
|
@ -1,8 +0,0 @@
|
||||
# Environment for this recipe: repository root, tool path and Python setup.
MAIN_ROOT=$(realpath ${PWD}/../../../../)
export MAIN_ROOT

PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
LC_ALL=C
export PATH LC_ALL

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
PYTHONIOENCODING=UTF-8
PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export PYTHONIOENCODING PYTHONPATH
|
@ -1 +0,0 @@
|
||||
jieba
|
@ -1,37 +0,0 @@
|
||||
#!/usr/bin/env bash
# Staged G2P recipe: download the Baker archive (stage -1), extract its
# prosody label file (stage 0) and derive pinyin transcriptions (stage 1).
# Path expansions are quoted so directories containing spaces still work.

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data=data

source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

mkdir -p "${exp_dir}"

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    mkdir -p "${data}"
    # Skip the download when the archive is already present.
    test -e "${data}/BZNSYP.rar" || wget -c https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar -P "${data}"
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "stage 0: Extracting Prosody Labeling"
    bash local/prepare_dataset.sh --exp-dir "${exp_dir}" --data-dir "${data}"
fi

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py "${exp_dir}/${filename}" "${exp_dir}/ref.pinyin"

    python3 local/convert_transcription.py "${exp_dir}/${filename}" "${exp_dir}/trans.pinyin"
    python3 local/convert_transcription.py --use-jieba "${exp_dir}/${filename}" "${exp_dir}/trans.jieba.pinyin"
fi

echo "done"
exit 0
|
@ -1 +0,0 @@
|
||||
exp
|
@ -1,29 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
|
||||
from text_processing import normalization
|
||||
|
||||
# Script body: normalize every line of the input file and write the
# results, one sentence per line, to the output file.
parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

# The encoding is given explicitly: without it open() falls back to the
# locale's preferred encoding, which is not guaranteed to be UTF-8 and
# would then fail on Chinese text.
with open(args.input, 'rt', encoding='utf-8') as fin, \
        open(args.output, 'wt', encoding='utf-8') as fout:
    for sent in fin:
        sent = normalization.normalize_sentence(sent.strip())
        fout.write(sent)
        fout.write('\n')
|
@ -1,8 +0,0 @@
|
||||
# Environment for this recipe: repository root, tool path and Python setup.
export MAIN_ROOT=$(realpath "${PWD}/../../../")

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# No trailing "#" here: glued to the value it is not a comment, so the
# shell would append it to the last PYTHONPATH entry.
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
|
@ -1,26 +0,0 @@
|
||||
#!/usr/bin/env bash
# Text-normalization recipe: run the normalizer over the sentence file in
# data_dir and store the result in exp_dir. Path expansions are quoted so
# directories containing spaces still work.
source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

mkdir -p "${exp_dir}"


if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Processing "
    python3 local/test_normalization.py "${data_dir}/${filename}" "${exp_dir}/normalized.txt"
    # Only announce the output when the file was actually produced.
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text save at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0
|
@ -1,2 +0,0 @@
|
||||
*~
|
||||
*.far
|
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 SpeechIO
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,8 +0,0 @@
|
||||
# for plain text
|
||||
python3 cn_tn.py example_plain.txt output_plain.txt
|
||||
diff example_plain.txt output_plain.txt
|
||||
|
||||
# for Kaldi's trans format
|
||||
python3 cn_tn.py --has_key example_kaldi.txt output_kaldi.txt
|
||||
diff example_kaldi.txt output_kaldi.txt
|
||||
|
@ -1,24 +0,0 @@
|
||||
0. place install_thrax.sh into $KALDI_ROOT/tools/extras/
|
||||
|
||||
1. recompile openfst with necessary option "--enable-grm" to support thrax:
|
||||
* cd $KALDI_ROOT/tools
|
||||
* make clean
|
||||
* edit $KALDI_ROOT/tools/Makefile, append "--enable-grm" option to OPENFST_CONFIGURE:
|
||||
OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --enable-grm
|
||||
* make -j 10
|
||||
|
||||
2. install thrax
|
||||
cd $KALDI_ROOT/tools
|
||||
sh extras/install_thrax.sh
|
||||
|
||||
3. add thrax binary path into $KALDI_ROOT/tools/env.sh:
|
||||
export PATH=/path/to/your/kaldi_root/tools/thrax-1.2.9/src/bin:${PATH}
|
||||
|
||||
usage:
|
||||
before you run anything related to thrax, use:
|
||||
. $KALDI_ROOT/tools/env.sh
|
||||
to enable binary finding, like what we always do in kaldi.
|
||||
|
||||
sample usage:
|
||||
sh run_en.sh
|
||||
sh run_cn.sh
|
@ -1,12 +0,0 @@
|
||||
#!/bin/bash
## This script should be placed under $KALDI_ROOT/tools/extras/, and see INSTALL.txt for installation guide
## It downloads (if needed), unpacks, builds and installs thrax 1.2.9 into
## the sibling "openfst" directory, stopping at the first failing step.
tarball=thrax-1.2.9.tar.gz
srcdir=thrax-1.2.9

if [ ! -f "${tarball}" ]; then
    wget http://www.openfst.org/twiki/pub/GRM/ThraxDownload/thrax-1.2.9.tar.gz || exit 1
    tar -zxf "${tarball}" || exit 1
elif [ ! -d "${srcdir}" ]; then
    # The tarball is already present but was never unpacked.
    tar -zxf "${tarball}" || exit 1
fi

# Without this check a failed cd would run configure/make in the wrong
# directory.
cd "${srcdir}" || exit 1
OPENFSTPREFIX=$(pwd)/../openfst
LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix "${OPENFSTPREFIX}" || exit 1
# Install only when the build succeeded.
make -j 10 || exit 1
make install || exit 1
cd ..
|
||||
|
Binary file not shown.
Binary file not shown.
@ -1,6 +0,0 @@
|
||||
# Build the Chinese ITN grammar and run the test cases through it.
# Stop if the directory is missing so the build tools never run elsewhere.
cd src/cn || exit 1
thraxmakedep itn.grm
make
#thraxrewrite-tester --far=itn.far --rules=ITN
thraxrewrite-tester --far=itn.far --rules=ITN < ../../testcase_cn.txt
cd -
|
@ -1,6 +0,0 @@
|
||||
# Build the English verbalizer grammar and run the test cases through it.
# Stop if the directory is missing so the build tools never run elsewhere.
cd src || exit 1
thraxmakedep en/verbalizer/podspeech.grm
make
# Show the inputs first, then the rewritten output.
cat ../testcase_en.txt
thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN < ../testcase_en.txt
cd -
|
@ -1,202 +0,0 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
@ -1,65 +0,0 @@
|
||||
en/verbalizer/podspeech.far: en/verbalizer/podspeech.grm util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
util/util.far: util/util.grm util/byte.far util/case.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
util/byte.far: util/byte.grm
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
util/case.far: util/case.grm util/byte.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/number_names.far: en/verbalizer/number_names.grm util/arithmetic.far en/verbalizer/g.fst en/verbalizer/cardinals.tsv en/verbalizer/ordinals.tsv
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
util/arithmetic.far: util/arithmetic.grm util/byte.far util/germanic.tsv
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
clean:
|
||||
rm -f util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far util/byte.far en/verbalizer/number_names.far universal/thousands_punct.far util/arithmetic.far en/verbalizer/factorization.far en/verbalizer/lexical_map.far ru/classifier/cyrillic.far
|
@ -1,24 +0,0 @@
|
||||
# Text normalization covering grammars
|
||||
|
||||
This repository provides covering grammars for English and Russian text normalization as
|
||||
documented in:
|
||||
|
||||
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
|
||||
_Transactions of the Association for Computational Linguistics_ 4: 507-519.
|
||||
|
||||
Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
|
||||
written-to-spoken text normalization. In _ASRU_, pages 665-670.
|
||||
|
||||
If you use these grammars in a publication, we would appreciate it if you cite these works.
|
||||
|
||||
## Building
|
||||
|
||||
The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory.
|
||||
|
||||
## License
|
||||
|
||||
See `LICENSE`.
|
||||
|
||||
## Mandatory disclaimer
|
||||
|
||||
This is not an official Google product.
|
@ -1,23 +0,0 @@
|
||||
itn.far: itn.grm byte.far number.far hotfix.far percentage.far date.far amount.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
byte.far: byte.grm
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
number.far: number.grm byte.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
hotfix.far: hotfix.grm byte.far hotfix.list
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
percentage.far: percentage.grm byte.far number.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
date.far: date.grm byte.far number.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
amount.far: amount.grm byte.far number.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
||||
clean:
|
||||
# Remove every generated archive, including the top-level itn.far target.
rm -f itn.far byte.far number.far hotfix.far percentage.far date.far amount.far
|
@ -1,24 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
import 'number.grm' as n;
|
||||
|
||||
unit = (
|
||||
"匹"|"张"|"座"|"回"|"场"|"尾"|"条"|"个"|"首"|"阙"|"阵"|"网"|"炮"|
|
||||
"顶"|"丘"|"棵"|"只"|"支"|"袭"|"辆"|"挑"|"担"|"颗"|"壳"|"窠"|"曲"|
|
||||
"墙"|"群"|"腔"|"砣"|"座"|"客"|"贯"|"扎"|"捆"|"刀"|"令"|"打"|"手"|
|
||||
"罗"|"坡"|"山"|"岭"|"江"|"溪"|"钟"|"队"|"单"|"双"|"对"|"出"|"口"|
|
||||
"头"|"脚"|"板"|"跳"|"枝"|"件"|"贴"|"针"|"线"|"管"|"名"|"位"|"身"|
|
||||
"堂"|"课"|"本"|"页"|"家"|"户"|"层"|"丝"|"毫"|"厘"|"分"|"钱"|"两"|
|
||||
"斤"|"担"|"铢"|"石"|"钧"|"锱"|"忽"|"毫"|"厘"|"分"|"寸"|"尺"|"丈"|
|
||||
"里"|"寻"|"常"|"铺"|"程"|"撮"|"勺"|"合"|"升"|"斗"|"石"|"盘"|"碗"|
|
||||
"碟"|"叠"|"桶"|"笼"|"盆"|"盒"|"杯"|"钟"|"斛"|"锅"|"簋"|"篮"|"盘"|
|
||||
"桶"|"罐"|"瓶"|"壶"|"卮"|"盏"|"箩"|"箱"|"煲"|"啖"|"袋"|"钵"|"年"|
|
||||
"月"|"日"|"季"|"刻"|"时"|"周"|"天"|"秒"|"分"|"旬"|"纪"|"岁"|"世"|
|
||||
"更"|"夜"|"春"|"夏"|"秋"|"冬"|"代"|"伏"|"辈"|"丸"|"泡"|"粒"|"颗"|
|
||||
"幢"|"堆"|"条"|"根"|"支"|"道"|"面"|"片"|"张"|"颗"|"块"|
|
||||
(("千克":"kg")|("毫克":"mg")|("微克":"µg"))|
|
||||
(("千米":"km")|("厘米":"cm")|("毫米":"mm")|("微米":"µm")|("纳米":"nm"))
|
||||
);
|
||||
|
||||
amount = n.number unit;
|
||||
export AMOUNT = CDRewrite[amount, "", "", b.kBytes*];
|
||||
|
@ -1,76 +0,0 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Copyright 2005-2011 Google, Inc.
|
||||
# Author: ttai@google.com (Terry Tai)
|
||||
|
||||
# Standard constants for ASCII (byte) based strings. This mirrors the
|
||||
# functions provided by C/C++'s ctype.h library.
|
||||
|
||||
# Note that [0] is missing. Matching the string-termination character is kinda weird.
|
||||
export kBytes = Optimize[
|
||||
"[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" |
|
||||
"[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" |
|
||||
"[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
|
||||
"[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" |
|
||||
"[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" |
|
||||
"[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
|
||||
"[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" |
|
||||
"[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" |
|
||||
"[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
|
||||
"[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" |
|
||||
"[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
|
||||
"[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
|
||||
"[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
|
||||
"[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
|
||||
"[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
|
||||
"[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
|
||||
"[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
|
||||
"[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
|
||||
"[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
|
||||
"[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
|
||||
"[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
|
||||
"[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
|
||||
"[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
|
||||
"[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
|
||||
"[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
|
||||
"[251]" | "[252]" | "[253]" | "[254]" | "[255]"
|
||||
];
|
||||
|
||||
export kDigit = Optimize[
|
||||
"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
|
||||
];
|
||||
|
||||
export kLower = Optimize[
|
||||
"a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
|
||||
"n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
|
||||
];
|
||||
export kUpper = Optimize[
|
||||
"A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
|
||||
"N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
|
||||
];
|
||||
export kAlpha = Optimize[kLower | kUpper];
|
||||
|
||||
export kAlnum = Optimize[kDigit | kAlpha];
|
||||
|
||||
export kSpace = Optimize[
|
||||
" " | "\t" | "\n" | "\r"
|
||||
];
|
||||
export kNotSpace = Optimize[kBytes - kSpace];
|
||||
|
||||
export kPunct = Optimize[
|
||||
"!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
|
||||
"-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
|
||||
"\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
|
||||
];
|
||||
|
||||
export kGraph = Optimize[kAlnum | kPunct];
|
@ -1,10 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
import 'number.grm' as n;
|
||||
|
||||
date_day = n.number_1_to_99 ("日"|"号");
|
||||
date_month_day = n.number_1_to_99 "月" date_day;
|
||||
date_year_month_day = ((n.number_0_to_9){2,4} | n.number) "年" date_month_day;
|
||||
|
||||
date = date_year_month_day | date_month_day | date_day;
|
||||
|
||||
export DATE = CDRewrite[date, "", "", b.kBytes*];
|
@ -1,5 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
hotfix = StringFile['hotfix.list'];
|
||||
|
||||
export HOTFIX = CDRewrite[hotfix, "", "", b.kBytes*];
|
||||
|
@ -1,18 +0,0 @@
|
||||
0头 零头
|
||||
10字 十字
|
||||
东4环 东4环 -1.0
|
||||
东4 东四 -0.5
|
||||
4惠 四惠
|
||||
3元桥 三元桥
|
||||
4平市 四平市
|
||||
5台山 五台山
|
||||
西2旗 西二旗
|
||||
西3旗 西三旗
|
||||
4道口 四道口 -1.0
|
||||
5道口 五道口 -1.0
|
||||
6道口 六道口 -1.0
|
||||
6里桥 六里桥
|
||||
7里庄 七里庄
|
||||
8宝山 八宝山
|
||||
9颗松 九棵松
|
||||
10里堡 十里堡
|
@ -1,9 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
import 'number.grm' as number;
|
||||
import 'hotfix.grm' as hotfix;
|
||||
import 'percentage.grm' as percentage;
|
||||
import 'date.grm' as date;
|
||||
import 'amount.grm' as amount; # seems not useful for now
|
||||
|
||||
export ITN = Optimize[percentage.PERCENTAGE @ (date.DATE <-1>) @ number.NUMBER @ hotfix.HOTFIX];
|
||||
|
@ -1,61 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
|
||||
number_1_to_9 = (
|
||||
("一":"1") | ("幺":"1") |
|
||||
("二":"2") | ("两":"2") |
|
||||
("三":"3") |
|
||||
("四":"4") |
|
||||
("五":"5") |
|
||||
("六":"6") |
|
||||
("七":"7") |
|
||||
("八":"8") |
|
||||
("九":"9")
|
||||
);
|
||||
|
||||
export number_0_to_9 = (("零":"0") | number_1_to_9);
|
||||
|
||||
number_10_to_19 = (
|
||||
("十":"10") |
|
||||
("十一":"11") |
|
||||
("十二":"12") |
|
||||
("十三":"13") |
|
||||
("十四":"14") |
|
||||
("十五":"15") |
|
||||
("十六":"16") |
|
||||
("十七":"17") |
|
||||
("十八":"18") |
|
||||
("十九":"19")
|
||||
);
|
||||
|
||||
number_10s = (number_1_to_9 ("十":""));
|
||||
number_100s = (number_1_to_9 ("百":""));
|
||||
number_1000s = (number_1_to_9 ("千":""));
|
||||
number_10000s = (number_1_to_9 ("万":""));
|
||||
|
||||
number_10_to_99 = (
|
||||
((number_10s number_1_to_9)<-0.3>) |
|
||||
((number_10s ("":"0"))<-0.2>) |
|
||||
(number_10_to_19 <-0.1>)
|
||||
);
|
||||
|
||||
export number_1_to_99 = (number_1_to_9 | number_10_to_99);
|
||||
|
||||
number_100_to_999 = (
|
||||
((number_100s ("零":"0") number_1_to_9)<0.0>)|
|
||||
((number_100s number_10_to_99)<0.0>) |
|
||||
((number_100s number_1_to_9 ("":"0"))<0.0>) |
|
||||
((number_100s ("":"00"))<0.1>)
|
||||
);
|
||||
|
||||
number_1000_to_9999 = (
|
||||
((number_1000s number_100_to_999)<0.0>) |
|
||||
((number_1000s ("零":"0") number_10_to_99)<0.0>)|
|
||||
((number_1000s ("零":"00") number_1_to_9)<0.0>)|
|
||||
((number_1000s ("":"000"))<1>) |
|
||||
((number_1000s number_1_to_9 ("":"00"))<0.0>)
|
||||
);
|
||||
|
||||
export number = number_1_to_99 | (number_100_to_999 <-1>) | (number_1000_to_9999 <-2>);
|
||||
|
||||
export NUMBER = CDRewrite[number, "", "", b.kBytes*];
|
||||
|
@ -1,8 +0,0 @@
|
||||
import 'byte.grm' as b;
|
||||
import 'number.grm' as n;
|
||||
|
||||
percentage = (
|
||||
("百分之":"") n.number_1_to_99 ("":"%")
|
||||
);
|
||||
|
||||
export PERCENTAGE = CDRewrite[percentage, "", "", b.kBytes*];
|
@ -1,6 +0,0 @@
|
||||
# English covering grammar definitions
|
||||
|
||||
This directory defines an English text normalization covering grammar. The
|
||||
primary entry-point is the FST `VERBALIZER`, defined in
|
||||
`verbalizer/verbalizer.grm` and compiled in the FST archive
|
||||
`verbalizer/verbalizer.far`.
|
@ -1,3 +0,0 @@
|
||||
verbalizer.far: verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
|
||||
thraxcompiler --input_grammar=$< --output_far=$@
|
||||
|
|
@ -1,35 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
|
||||
|
||||
export DIGITS = digit (n.I[" "] digit)*;
|
||||
|
||||
# Various common factorizations
|
||||
|
||||
two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
|
||||
|
||||
three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
|
||||
|
||||
mixed =
|
||||
(digit n.I[" "] two_digits)
|
||||
| (two_digits n.I[" "] two_digits)
|
||||
| (two_digits n.I[" "] three_digits)
|
||||
| (two_digits n.I[" "] two_digits n.I[" "] two_digits)
|
||||
;
|
||||
|
||||
export MIXED_NUMBERS = Optimize[mixed];
|
@ -1,40 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'util/util.grm' as u;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
func ToNumberName[expr] {
|
||||
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
|
||||
return Optimize[expr @ number_name_seq];
|
||||
}
|
||||
|
||||
d = b.kDigit;
|
||||
|
||||
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
|
||||
|
||||
by_ones = d n.I[" "];
|
||||
by_twos = (d{2} @ leading_zero) n.I[" "];
|
||||
by_threes = (d{3} @ leading_zero) n.I[" "];
|
||||
|
||||
groupings = by_twos* (by_threes | by_twos | by_ones);
|
||||
|
||||
export FRACTIONAL_PART_UNGROUPED =
|
||||
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
|
||||
;
|
||||
export FRACTIONAL_PART_GROUPED =
|
||||
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
|
||||
;
|
||||
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
|
@ -1,30 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'en/verbalizer/factorization.grm' as f;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
|
||||
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
|
||||
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
|
||||
|
||||
__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
|
||||
__decimal_marker__ = ".";
|
||||
|
||||
export FLOAT = Optimize[
|
||||
(n.CARDINAL_NUMBERS
|
||||
(__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
|
||||
__fractional_part__) @ l.LEXICAL_MAP]
|
||||
;
|
Binary file not shown.
@ -1,25 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
|
||||
lexical_map = StringFile['en/verbalizer/lexical_map.tsv'];
|
||||
|
||||
sigma_star = b.kBytes*;
|
||||
|
||||
del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
|
||||
|
||||
export LEXICAL_MAP = Optimize[
|
||||
CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
|
||||
;
|
Can't render this file because it has a wrong number of fields in line 37.
|
@ -1,34 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'en/verbalizer/float.grm' as f;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
float = f.FLOAT;
|
||||
card = n.CARDINAL_NUMBERS;
|
||||
number = card | float;
|
||||
|
||||
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
|
||||
times = "*" : " @@ARITHMETIC_TIMES@@ ";
|
||||
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
|
||||
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
|
||||
|
||||
operator = plus | times | minus | division;
|
||||
|
||||
percent = "%" : " @@PERCENT@@";
|
||||
|
||||
export ARITHMETIC =
|
||||
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
|
||||
;
|
@ -1,78 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/classifier/cyrillic.grm' as c;
|
||||
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
import 'en/verbalizer/spelled.grm' as s;
|
||||
|
||||
letter = b.kAlpha | c.kCyrillicAlpha;
|
||||
dash = "-";
|
||||
word = letter+;
|
||||
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
|
||||
|
||||
post_word_symbol =
|
||||
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
|
||||
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
|
||||
("*" : "@@STAR@@")
|
||||
;
|
||||
|
||||
pre_word_symbol =
|
||||
("@" : "@@AT@@") |
|
||||
("/" : "@@SLASH@@") |
|
||||
("#" : "@@HASH@@")
|
||||
;
|
||||
|
||||
post_word = possibly_split_word n.I[" "] post_word_symbol;
|
||||
|
||||
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
|
||||
|
||||
## Number/digit sequence combos, maybe with a dash
|
||||
|
||||
spelled_word = word @ s.SPELLED_NO_LETTER;
|
||||
|
||||
word_number =
|
||||
(word | spelled_word)
|
||||
(n.I[" "] | (dash : " "))
|
||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||
;
|
||||
|
||||
number_word =
|
||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||
(n.I[" "] | (dash : " "))
|
||||
(word | spelled_word)
|
||||
;
|
||||
|
||||
## Two-digit year.
|
||||
|
||||
# Note that in this case to be fair we really have to allow ordinals too since
|
||||
# in some languages that's what you would have.
|
||||
|
||||
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
|
||||
|
||||
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
|
||||
|
||||
miscellaneous = Optimize[
|
||||
possibly_split_word
|
||||
| post_word
|
||||
| pre_word
|
||||
| word_number
|
||||
| number_word
|
||||
| two_digit_year
|
||||
| dot_com
|
||||
];
|
||||
|
||||
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
|
@ -1,44 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
card = n.CARDINAL_NUMBERS;
|
||||
|
||||
__currency__ = StringFile['en/verbalizer/money.tsv'];
|
||||
|
||||
d = b.kDigit;
|
||||
D = d - "0";
|
||||
|
||||
cents = ((n.D["0"] | D) d) @ card;
|
||||
|
||||
# Only dollar for the verbalizer tests for English. Will need to add other
|
||||
# currencies.
|
||||
usd_maj = Project["usd_maj" @ __currency__, 'output'];
|
||||
usd_min = Project["usd_min" @ __currency__, 'output'];
|
||||
and = " @@MONEY_AND@@ " | " ";
|
||||
|
||||
dollar1 =
|
||||
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
|
||||
;
|
||||
|
||||
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
|
||||
|
||||
dollar3 = n.D["$"] card n.I[" " usd_maj];
|
||||
|
||||
dollar = Optimize[dollar1 | dollar2 | dollar3];
|
||||
|
||||
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
|
|
@ -1,54 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# English minimally supervised number grammar.
|
||||
#
|
||||
# Supports both cardinals and ordinals without overt marking.
|
||||
#
|
||||
# The language-specific acceptor G was compiled with digit, teen, and decade
|
||||
# preterminals. The lexicon transducer L is unambiguous so no LM is used.
|
||||
|
||||
import 'util/arithmetic.grm' as a;
|
||||
|
||||
# Intersects the universal factorization transducer (F) with the
|
||||
# language-specific acceptor (G).
|
||||
|
||||
d = a.DELTA_STAR;
|
||||
f = a.IARITHMETIC_RESTRICTED;
|
||||
g = LoadFst['en/verbalizer/g.fst'];
|
||||
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
|
||||
test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"];
|
||||
|
||||
# Compiles lexicon transducer (L).
|
||||
|
||||
cardinal_name = StringFile['en/verbalizer/cardinals.tsv'];
|
||||
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
|
||||
test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"];
|
||||
|
||||
ordinal_name = StringFile['en/verbalizer/ordinals.tsv'];
|
||||
# In English, ordinals have the same syntax as cardinals and all but the final
|
||||
# element is verbalized using a cardinal number word; e.g., "two hundred
|
||||
# thirtieth".
|
||||
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
|
||||
test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"];
|
||||
|
||||
# Composes L with the leaf transducer (P), then composes that with FG.
|
||||
|
||||
p = a.LEAVES;
|
||||
|
||||
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
|
||||
test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"];
|
||||
|
||||
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
|
||||
test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"];
|
@ -1,57 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'en/verbalizer/number_names.grm' as n;
|
||||
import 'util/byte.grm' as bytelib;
|
||||
import 'universal/thousands_punct.grm' as t;
|
||||
|
||||
cardinal = n.CARDINAL_NUMBER_NAME;
|
||||
ordinal = n.ORDINAL_NUMBER_NAME;
|
||||
|
||||
# Putting these here since this grammar gets incorporated by all the others.
|
||||
|
||||
func I[expr] {
|
||||
return "" : expr;
|
||||
}
|
||||
|
||||
func D[expr] {
|
||||
return expr : "";
|
||||
}
|
||||
|
||||
separators = t.comma_thousands | t.no_delimiter;
|
||||
|
||||
# Language specific endings for ordinals.
|
||||
d = bytelib.kDigit;
|
||||
endings = "st" | "nd" | "rd" | "th";
|
||||
|
||||
st = (d* "1") - (d* "11");
|
||||
nd = (d* "2") - (d* "12");
|
||||
rd = (d* "3") - (d* "13");
|
||||
th = Optimize[d* - st - nd - rd];
|
||||
first = st ("st" : "");
|
||||
second = nd ("nd" : "");
|
||||
third = rd ("rd" : "");
|
||||
other = th ("th" : "");
|
||||
marked_ordinal = Optimize[first | second | third | other];
|
||||
|
||||
# The separator is a no-op here but will be needed once we replace
|
||||
# the above targets.
|
||||
|
||||
export CARDINAL_NUMBERS = Optimize[separators @ cardinal];
|
||||
|
||||
export ORDINAL_NUMBERS =
|
||||
Optimize[(separators endings) @ marked_ordinal @ ordinal]
|
||||
;
|
||||
|
||||
export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal];
|
@ -1,133 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Grammar for things built mostly on numbers.
|
||||
|
||||
import 'en/verbalizer/factorization.grm' as f;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
num = n.CARDINAL_NUMBERS;
|
||||
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
||||
digits = f.FRACTIONAL_PART_UNGROUPED;
|
||||
|
||||
# Various symbols.
|
||||
|
||||
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
||||
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
||||
slash = "/" : "@@SLASH@@";
|
||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||
dash = "-" : "@@DASH@@";
|
||||
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
||||
|
||||
degree = "°" : "@@DEGREE@@";
|
||||
|
||||
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
||||
|
||||
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
||||
|
||||
power = "^" : "@@DECIMAL_EXPONENT@@";
|
||||
|
||||
square_root = "√" : "@@SQUARE_ROOT@@";
|
||||
|
||||
percent = "%" : "@@PERCENT@@";
|
||||
|
||||
# Safe roman numbers.
|
||||
|
||||
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
||||
# line as the path.
|
||||
rfile =
|
||||
'universal/roman_numerals.tsv' # NO_EDIT
|
||||
;
|
||||
|
||||
roman = StringFile[rfile];
|
||||
|
||||
## Main categories.
|
||||
|
||||
cat_dot_number =
|
||||
num
|
||||
n.I[" "] dot n.I[" "] num
|
||||
(n.I[" "] dot n.I[" "] num)+
|
||||
;
|
||||
|
||||
cat_slash_number =
|
||||
num
|
||||
n.I[" "] slash n.I[" "] num
|
||||
(n.I[" "] slash n.I[" "] num)*
|
||||
;
|
||||
|
||||
cat_dash_number =
|
||||
num
|
||||
n.I[" "] dash n.I[" "] num
|
||||
(n.I[" "] dash n.I[" "] num)*
|
||||
;
|
||||
|
||||
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
||||
|
||||
cat_degree = cat_signed_number n.I[" "] degree;
|
||||
|
||||
cat_country_code = plus n.I[" "] (num | digits);
|
||||
|
||||
cat_math_operations =
|
||||
plus
|
||||
| minus
|
||||
| division
|
||||
| times
|
||||
| equals
|
||||
| percent
|
||||
| power
|
||||
| square_root
|
||||
;
|
||||
|
||||
# Roman numbers are often either cardinals or ordinals in various languages.
|
||||
cat_roman = roman @ (num | ord);
|
||||
|
||||
# Allow
|
||||
#
|
||||
# number:number
|
||||
# number-number
|
||||
#
|
||||
# to just be
|
||||
#
|
||||
# number number.
|
||||
|
||||
cat_number_number =
|
||||
num ((":" | "-") : " ") num
|
||||
;
|
||||
|
||||
# Some additional readings for these symbols.
|
||||
|
||||
cat_additional_readings =
|
||||
("/" : "@@PER@@") |
|
||||
("+" : "@@AND@@") |
|
||||
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
||||
("*" : "@@STAR@@") |
|
||||
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
||||
("@" : "@@AT@@")
|
||||
;
|
||||
|
||||
numbers_plus = Optimize[
|
||||
cat_dot_number
|
||||
| cat_slash_number
|
||||
| cat_dash_number
|
||||
| cat_signed_number
|
||||
| cat_degree
|
||||
| cat_country_code
|
||||
| cat_math_operations
|
||||
| cat_roman
|
||||
| cat_number_number
|
||||
| cat_additional_readings
|
||||
];
|
||||
|
||||
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
Can't render this file because it contains an unexpected character in line 5 and column 20.
|
@ -1,46 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/util.grm' as util;
|
||||
import 'util/case.grm' as case;
|
||||
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||
import 'en/verbalizer/float.grm' as f;
|
||||
import 'en/verbalizer/math.grm' as ma;
|
||||
import 'en/verbalizer/miscellaneous.grm' as mi;
|
||||
import 'en/verbalizer/money.grm' as mo;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
import 'en/verbalizer/numbers_plus.grm' as np;
|
||||
import 'en/verbalizer/spelled.grm' as s;
|
||||
import 'en/verbalizer/spoken_punct.grm' as sp;
|
||||
import 'en/verbalizer/time.grm' as t;
|
||||
import 'en/verbalizer/urls.grm' as u;
|
||||
|
||||
# Exported normalizer: the union of the verbalizers listed below, composed
# with util.CLEAN_SPACES and then case.TOUPPER, with weights removed and the
# result optimized. Each verbalizer is listed exactly once; repeating an
# operand in a union does not change the resulting relation.
export POD_SPEECH_TN = Optimize[RmWeight[
|
||||
(u.URL
|
||||
| e.MIXED_NUMBERS
|
||||
| e.DIGITS
|
||||
| f.FLOAT
|
||||
| ma.ARITHMETIC
|
||||
| mo.MONEY
|
||||
| n.CARDINAL_NUMBERS
|
||||
| n.ORDINAL_NUMBERS
|
||||
| np.NUMBERS_PLUS
|
||||
| s.SPELLED
|
||||
| sp.SPOKEN_PUNCT
|
||||
| t.TIME
|
||||
| u.EMAILS) @ util.CLEAN_SPACES @ case.TOUPPER
|
||||
]];
|
||||
|
||||
#export POD_SPEECH_TN = Optimize[RmWeight[(mi.MISCELLANEOUS) @ util.CLEAN_SPACES @ case.TOUPPER]];
|
@ -1,77 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This verbalizer is used whenever there is an LM symbol that consists of
|
||||
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
||||
# suffix.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/classifier/cyrillic.grm' as c;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
||||
|
||||
char_set = (("a" | "A") : "letter-a")
|
||||
| (("b" | "B") : "letter-b")
|
||||
| (("c" | "C") : "letter-c")
|
||||
| (("d" | "D") : "letter-d")
|
||||
| (("e" | "E") : "letter-e")
|
||||
| (("f" | "F") : "letter-f")
|
||||
| (("g" | "G") : "letter-g")
|
||||
| (("h" | "H") : "letter-h")
|
||||
| (("i" | "I") : "letter-i")
|
||||
| (("j" | "J") : "letter-j")
|
||||
| (("k" | "K") : "letter-k")
|
||||
| (("l" | "L") : "letter-l")
|
||||
| (("m" | "M") : "letter-m")
|
||||
| (("n" | "N") : "letter-n")
|
||||
| (("o" | "O") : "letter-o")
|
||||
| (("p" | "P") : "letter-p")
|
||||
| (("q" | "Q") : "letter-q")
|
||||
| (("r" | "R") : "letter-r")
|
||||
| (("s" | "S") : "letter-s")
|
||||
| (("t" | "T") : "letter-t")
|
||||
| (("u" | "U") : "letter-u")
|
||||
| (("v" | "V") : "letter-v")
|
||||
| (("w" | "W") : "letter-w")
|
||||
| (("x" | "X") : "letter-x")
|
||||
| (("y" | "Y") : "letter-y")
|
||||
| (("z" | "Z") : "letter-z")
|
||||
| (digit)
|
||||
| ("&" : "@@AND@@")
|
||||
| ("." : "")
|
||||
| ("-" : "")
|
||||
| ("_" : "")
|
||||
| ("/" : "")
|
||||
| (n.I["letter-"] c.kCyrillicAlpha)
|
||||
;
|
||||
|
||||
ins_space = "" : " ";
|
||||
|
||||
suffix = "{spelled}" : "";
|
||||
|
||||
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
||||
|
||||
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
||||
|
||||
sigma_star = b.kBytes*;
|
||||
|
||||
# Gets rid of the letter- prefix since in some cases we don't want it.
|
||||
|
||||
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
||||
|
||||
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
||||
|
||||
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
@ -1,24 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
|
||||
punct =
|
||||
("." : "@@PERIOD@@")
|
||||
| ("," : "@@COMMA@@")
|
||||
| ("!" : "@@EXCLAMATION_MARK@@")
|
||||
| ("?" : "@@QUESTION_MARK@@")
|
||||
;
|
||||
|
||||
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
|
@ -1,108 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
||||
|
||||
increment_hour =
|
||||
("0" : "1")
|
||||
| ("1" : "2")
|
||||
| ("2" : "3")
|
||||
| ("3" : "4")
|
||||
| ("4" : "5")
|
||||
| ("5" : "6")
|
||||
| ("6" : "7")
|
||||
| ("7" : "8")
|
||||
| ("8" : "9")
|
||||
| ("9" : "10")
|
||||
| ("10" : "11")
|
||||
| ("11" : "12")
|
||||
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
||||
| ("13" : "14")
|
||||
| ("14" : "15")
|
||||
| ("15" : "16")
|
||||
| ("16" : "17")
|
||||
| ("17" : "18")
|
||||
| ("18" : "19")
|
||||
| ("19" : "20")
|
||||
| ("20" : "21")
|
||||
| ("21" : "22")
|
||||
| ("22" : "23")
|
||||
| ("23" : "12")
|
||||
;
|
||||
|
||||
hours = Project[increment_hour, 'input'];
|
||||
|
||||
d = b.kDigit;
|
||||
D = d - "0";
|
||||
|
||||
minutes09 = "0" D;
|
||||
|
||||
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
||||
|
||||
__sep__ = ":";
|
||||
sep_space = __sep__ : " ";
|
||||
|
||||
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
||||
|
||||
# Verbalizes the two-digit minutes field. Three cases:
#   "00"        -> the @@HOUR@@ marker (no minute number is read);
#   "01".."09"  -> @@TIME_ZERO@@, a space, then the cardinal for the digit
#                  (minutes09 is "0" followed by a non-zero digit);
#   "10".."59"  -> the cardinal for the two-digit number.
# NOTE(review): n.I is assumed to be the insertion helper from
# en/verbalizer/numbers.grm -- confirm.
verbalize_minutes =
|
||||
("00" : "@@HOUR@@")
|
||||
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
||||
| (minutes @ n.CARDINAL_NUMBERS)
|
||||
;
|
||||
|
||||
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
||||
|
||||
# Special cases we handle right now.
|
||||
# TODO: Need to allow for cases like
|
||||
#
|
||||
# half twelve (in the UK English sense)
|
||||
# half twaalf (in the Dutch sense)
|
||||
|
||||
time_quarter_past =
|
||||
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
||||
verbalize_hours
|
||||
n.D[__sep__ "15"];
|
||||
|
||||
time_half_past =
|
||||
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
||||
verbalize_hours
|
||||
n.D[__sep__ "30"];
|
||||
|
||||
# Reads "H:45" relative to the following hour: the quarter/before markers are
# emitted first, the hour digits are mapped through increment_hour and only
# then verbalized, and the ":45" itself is consumed without output.
# NOTE(review): n.I / n.D are assumed to be the insert / delete helpers from
# en/verbalizer/numbers.grm -- confirm.
time_quarter_to =
|
||||
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
||||
(increment_hour @ verbalize_hours)
|
||||
n.D[__sep__ "45"];
|
||||
|
||||
time_extra = Optimize[
|
||||
time_quarter_past | time_half_past | time_quarter_to]
|
||||
;
|
||||
|
||||
# Basic time periods which most languages can be expected to have.
|
||||
__am__ = "a.m." | "am" | "AM";
|
||||
__pm__ = "p.m." | "pm" | "PM";
|
||||
|
||||
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
||||
|
||||
time_variants = time_basic | time_extra;
|
||||
|
||||
time = Optimize[
|
||||
(period (" " | n.I[" "]))? time_variants
|
||||
| time_variants ((" " | n.I[" "]) period)?]
|
||||
;
|
||||
|
||||
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
@ -1,68 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Rules for URLs and email addresses.
|
||||
|
||||
import 'util/byte.grm' as bytelib;
|
||||
import 'en/verbalizer/lexical_map.grm' as l;
|
||||
|
||||
ins_space = "" : " ";
|
||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||
at = "@" : "@@AT@@";
|
||||
|
||||
url_suffix =
|
||||
(".com" : dot ins_space "com") |
|
||||
(".gov" : dot ins_space "gov") |
|
||||
(".edu" : dot ins_space "e d u") |
|
||||
(".org" : dot ins_space "org") |
|
||||
(".net" : dot ins_space "net")
|
||||
;
|
||||
|
||||
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
|
||||
|
||||
letter_string_dot =
|
||||
((letter_string ins_space dot ins_space)* letter_string)
|
||||
;
|
||||
|
||||
# Rules for URLs.
|
||||
export URL = Optimize[
|
||||
((letter_string_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
# Rules for email addresses.
|
||||
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
||||
|
||||
letter_by_letter_dot =
|
||||
((letter_by_letter ins_space dot ins_space)*
|
||||
letter_by_letter)
|
||||
;
|
||||
|
||||
export EMAIL1 = Optimize[
|
||||
((letter_by_letter) (ins_space)
|
||||
(at) (ins_space)
|
||||
(letter_by_letter_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
export EMAIL2 = Optimize[
|
||||
((letter_by_letter) (ins_space)
|
||||
(at) (ins_space)
|
||||
(letter_string_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
export EMAILS = Optimize[
|
||||
EMAIL1 | EMAIL2
|
||||
];
|
@ -1,42 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/util.grm' as util;
|
||||
import 'en/verbalizer/extra_numbers.grm' as e;
|
||||
import 'en/verbalizer/float.grm' as f;
|
||||
import 'en/verbalizer/math.grm' as ma;
|
||||
import 'en/verbalizer/miscellaneous.grm' as mi;
|
||||
import 'en/verbalizer/money.grm' as mo;
|
||||
import 'en/verbalizer/numbers.grm' as n;
|
||||
import 'en/verbalizer/numbers_plus.grm' as np;
|
||||
import 'en/verbalizer/spelled.grm' as s;
|
||||
import 'en/verbalizer/spoken_punct.grm' as sp;
|
||||
import 'en/verbalizer/time.grm' as t;
|
||||
import 'en/verbalizer/urls.grm' as u;
|
||||
|
||||
export VERBALIZER = Optimize[RmWeight[
|
||||
( e.MIXED_NUMBERS
|
||||
| e.DIGITS
|
||||
| f.FLOAT
|
||||
| ma.ARITHMETIC
|
||||
| mi.MISCELLANEOUS
|
||||
| mo.MONEY
|
||||
| n.CARDINAL_NUMBERS
|
||||
| n.ORDINAL_NUMBERS
|
||||
| np.NUMBERS_PLUS
|
||||
| s.SPELLED
|
||||
| sp.SPOKEN_PUNCT
|
||||
| t.TIME
|
||||
| u.URL) @ util.CLEAN_SPACES
|
||||
]];
|
@ -1,17 +0,0 @@
|
||||
This directory contains data used in:
|
||||
|
||||
Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
|
||||
Transactions of the Association for Computational Linguistics 4: 507-519.
|
||||
|
||||
* `minimal.txt`: A list of 30 curated numbers used as the "minimal" training
|
||||
set.
|
||||
* `random-trn.txt`: A list of 9000 randomly-generated numbers used as the
|
||||
"medium" training set.
|
||||
* `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test
|
||||
set.
|
||||
|
||||
Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that
|
||||
a small number of examples occur both in `minimal.txt` and `random-tst.txt`.
|
||||
|
||||
For information about the sampling procedure used to generate the random data
|
||||
sets, see appendix A of the aforementioned paper.
|
@ -1,300 +0,0 @@
|
||||
0
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
18
|
||||
19
|
||||
20
|
||||
21
|
||||
22
|
||||
23
|
||||
24
|
||||
25
|
||||
26
|
||||
27
|
||||
28
|
||||
29
|
||||
30
|
||||
31
|
||||
32
|
||||
33
|
||||
34
|
||||
35
|
||||
36
|
||||
37
|
||||
38
|
||||
39
|
||||
40
|
||||
41
|
||||
42
|
||||
43
|
||||
44
|
||||
45
|
||||
46
|
||||
47
|
||||
48
|
||||
49
|
||||
50
|
||||
51
|
||||
52
|
||||
53
|
||||
54
|
||||
55
|
||||
56
|
||||
57
|
||||
58
|
||||
59
|
||||
60
|
||||
61
|
||||
62
|
||||
63
|
||||
64
|
||||
65
|
||||
66
|
||||
67
|
||||
68
|
||||
69
|
||||
70
|
||||
71
|
||||
72
|
||||
73
|
||||
74
|
||||
75
|
||||
76
|
||||
77
|
||||
78
|
||||
79
|
||||
80
|
||||
81
|
||||
82
|
||||
83
|
||||
84
|
||||
85
|
||||
86
|
||||
87
|
||||
88
|
||||
89
|
||||
90
|
||||
91
|
||||
92
|
||||
93
|
||||
94
|
||||
95
|
||||
96
|
||||
97
|
||||
98
|
||||
99
|
||||
100
|
||||
101
|
||||
102
|
||||
103
|
||||
104
|
||||
105
|
||||
106
|
||||
107
|
||||
108
|
||||
109
|
||||
110
|
||||
111
|
||||
112
|
||||
113
|
||||
114
|
||||
115
|
||||
116
|
||||
117
|
||||
118
|
||||
119
|
||||
120
|
||||
121
|
||||
122
|
||||
123
|
||||
124
|
||||
125
|
||||
126
|
||||
127
|
||||
128
|
||||
129
|
||||
130
|
||||
131
|
||||
132
|
||||
133
|
||||
134
|
||||
135
|
||||
136
|
||||
137
|
||||
138
|
||||
139
|
||||
140
|
||||
141
|
||||
142
|
||||
143
|
||||
144
|
||||
145
|
||||
146
|
||||
147
|
||||
148
|
||||
149
|
||||
150
|
||||
151
|
||||
152
|
||||
153
|
||||
154
|
||||
155
|
||||
156
|
||||
157
|
||||
158
|
||||
159
|
||||
160
|
||||
161
|
||||
162
|
||||
163
|
||||
164
|
||||
165
|
||||
166
|
||||
167
|
||||
168
|
||||
169
|
||||
170
|
||||
171
|
||||
172
|
||||
173
|
||||
174
|
||||
175
|
||||
176
|
||||
177
|
||||
178
|
||||
179
|
||||
180
|
||||
181
|
||||
182
|
||||
183
|
||||
184
|
||||
185
|
||||
186
|
||||
187
|
||||
188
|
||||
189
|
||||
190
|
||||
191
|
||||
192
|
||||
193
|
||||
194
|
||||
195
|
||||
196
|
||||
197
|
||||
198
|
||||
199
|
||||
200
|
||||
201
|
||||
202
|
||||
203
|
||||
204
|
||||
205
|
||||
206
|
||||
207
|
||||
208
|
||||
209
|
||||
210
|
||||
211
|
||||
212
|
||||
220
|
||||
221
|
||||
230
|
||||
300
|
||||
400
|
||||
500
|
||||
600
|
||||
700
|
||||
800
|
||||
900
|
||||
1000
|
||||
1001
|
||||
1002
|
||||
1003
|
||||
1004
|
||||
1005
|
||||
1006
|
||||
1007
|
||||
1008
|
||||
1009
|
||||
1010
|
||||
1011
|
||||
1012
|
||||
1020
|
||||
1021
|
||||
1030
|
||||
1200
|
||||
2000
|
||||
2001
|
||||
2002
|
||||
2003
|
||||
2004
|
||||
2005
|
||||
2006
|
||||
2007
|
||||
2008
|
||||
2009
|
||||
2010
|
||||
2011
|
||||
2012
|
||||
2020
|
||||
2021
|
||||
2030
|
||||
2100
|
||||
2200
|
||||
5001
|
||||
10000
|
||||
12000
|
||||
20000
|
||||
21000
|
||||
50001
|
||||
100000
|
||||
120000
|
||||
200000
|
||||
210000
|
||||
500001
|
||||
1000000
|
||||
1001000
|
||||
1200000
|
||||
2000000
|
||||
2100000
|
||||
5000001
|
||||
10000000
|
||||
10001000
|
||||
12000000
|
||||
20000000
|
||||
50000001
|
||||
100000000
|
||||
100001000
|
||||
120000000
|
||||
200000000
|
||||
500000001
|
||||
1000000000
|
||||
1000001000
|
||||
1200000000
|
||||
2000000000
|
||||
5000000001
|
||||
10000000000
|
||||
10000001000
|
||||
12000000000
|
||||
20000000000
|
||||
50000000001
|
||||
100000000000
|
||||
100000001000
|
||||
120000000000
|
||||
200000000000
|
||||
500000000001
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,6 +0,0 @@
|
||||
# Russian covering grammar definitions
|
||||
|
||||
This directory defines a Russian text normalization covering grammar. The
|
||||
primary entry-point is the FST `VERBALIZER`, defined in
|
||||
`verbalizer/verbalizer.grm` and compiled in the FST archive
|
||||
`verbalizer/verbalizer.far`.
|
@ -1,338 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
||||
import 'util/byte.grm' as b;
|
||||
|
||||
# Utilities for insertion and deletion.
|
||||
|
||||
# Insertion helper: returns a transducer that rewrites the empty string to
# expr, i.e. emits expr without consuming any input.
func I[expr] {
|
||||
return "" : expr;
|
||||
}
|
||||
|
||||
# Deletion helper: returns a transducer that rewrites expr to the empty
# string, i.e. consumes expr and emits nothing.
func D[expr] {
|
||||
return expr : "";
|
||||
}
|
||||
|
||||
# Powers of base 10.
|
||||
export POWERS =
|
||||
"[E15]"
|
||||
| "[E14]"
|
||||
| "[E13]"
|
||||
| "[E12]"
|
||||
| "[E11]"
|
||||
| "[E10]"
|
||||
| "[E9]"
|
||||
| "[E8]"
|
||||
| "[E7]"
|
||||
| "[E6]"
|
||||
| "[E5]"
|
||||
| "[E4]"
|
||||
| "[E3]"
|
||||
| "[E2]"
|
||||
| "[E1]"
|
||||
;
|
||||
|
||||
export SIGMA = b.kBytes | POWERS;
|
||||
|
||||
export SIGMA_STAR = SIGMA*;
|
||||
|
||||
export SIGMA_PLUS = SIGMA+;
|
||||
|
||||
################################################################################
|
||||
# BEGIN LANGUAGE SPECIFIC DATA
|
||||
revaluations =
|
||||
("[E4]" : "[E1]")
|
||||
| ("[E5]" : "[E2]")
|
||||
| ("[E7]" : "[E1]")
|
||||
| ("[E8]" : "[E2]")
|
||||
;
|
||||
|
||||
Ms = "[E3]" | "[E6]" | "[E9]";
|
||||
|
||||
|
||||
# Rewrites expr to the empty string. This is the same rewrite that D performs;
# it is used by the zero-deletion rules further down in this file.
func Zero[expr] {
|
||||
return expr : ("");
|
||||
}
|
||||
|
||||
space = " ";
|
||||
|
||||
lexset3 = Optimize[
|
||||
("1[E1]+1" : "одиннадцати")
|
||||
| ("1[E1]+1" : "одиннадцать")
|
||||
| ("1[E1]+1" : "одиннадцатью")
|
||||
| ("1[E1]+2" : "двенадцати")
|
||||
| ("1[E1]+2" : "двенадцать")
|
||||
| ("1[E1]+2" : "двенадцатью")
|
||||
| ("1[E1]+3" : "тринадцати")
|
||||
| ("1[E1]+3" : "тринадцать")
|
||||
| ("1[E1]+3" : "тринадцатью")
|
||||
| ("1[E1]+4" : "четырнадцати")
|
||||
| ("1[E1]+4" : "четырнадцать")
|
||||
| ("1[E1]+4" : "четырнадцатью")
|
||||
| ("1[E1]+5" : "пятнадцати")
|
||||
| ("1[E1]+5" : "пятнадцать")
|
||||
| ("1[E1]+5" : "пятнадцатью")
|
||||
| ("1[E1]+6" : "шестнадцати")
|
||||
| ("1[E1]+6" : "шестнадцать")
|
||||
| ("1[E1]+6" : "шестнадцатью")
|
||||
| ("1[E1]+7" : "семнадцати")
|
||||
| ("1[E1]+7" : "семнадцать")
|
||||
| ("1[E1]+7" : "семнадцатью")
|
||||
| ("1[E1]+8" : "восемнадцати")
|
||||
| ("1[E1]+8" : "восемнадцать")
|
||||
| ("1[E1]+8" : "восемнадцатью")
|
||||
| ("1[E1]+9" : "девятнадцати")
|
||||
| ("1[E1]+9" : "девятнадцать")
|
||||
| ("1[E1]+9" : "девятнадцатью")]
|
||||
;
|
||||
|
||||
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
lexset2 = Optimize[
|
||||
("1[E1]" : "десяти")
|
||||
| ("1[E1]" : "десять")
|
||||
| ("1[E1]" : "десятью")
|
||||
| ("1[E2]" : "ста")
|
||||
| ("1[E2]" : "сто")
|
||||
| ("2[E1]" : "двадцати")
|
||||
| ("2[E1]" : "двадцать")
|
||||
| ("2[E1]" : "двадцатью")
|
||||
| ("2[E2]" : "двести")
|
||||
| ("2[E2]" : "двумстам")
|
||||
| ("2[E2]" : "двумястами")
|
||||
| ("2[E2]" : "двухсот")
|
||||
| ("2[E2]" : "двухстах")
|
||||
| ("3[E1]" : "тридцати")
|
||||
| ("3[E1]" : "тридцать")
|
||||
| ("3[E1]" : "тридцатью")
|
||||
| ("3[E2]" : "тремстам")
|
||||
| ("3[E2]" : "тремястами")
|
||||
| ("3[E2]" : "трехсот")
|
||||
| ("3[E2]" : "трехстах")
|
||||
| ("3[E2]" : "триста")
|
||||
| ("4[E1]" : "сорок")
|
||||
| ("4[E1]" : "сорока")
|
||||
| ("4[E2]" : "четыремстам")
|
||||
| ("4[E2]" : "четыреста")
|
||||
| ("4[E2]" : "четырехсот")
|
||||
| ("4[E2]" : "четырехстах")
|
||||
| ("4[E2]" : "четырьмястами")
|
||||
| ("5[E1]" : "пятидесяти")
|
||||
| ("5[E1]" : "пятьдесят")
|
||||
| ("5[E1]" : "пятьюдесятью")
|
||||
| ("5[E2]" : "пятисот")
|
||||
| ("5[E2]" : "пятистам")
|
||||
| ("5[E2]" : "пятистах")
|
||||
| ("5[E2]" : "пятьсот")
|
||||
| ("5[E2]" : "пятьюстами")
|
||||
| ("6[E1]" : "шестидесяти")
|
||||
| ("6[E1]" : "шестьдесят")
|
||||
| ("6[E1]" : "шестьюдесятью")
|
||||
| ("6[E2]" : "шестисот")
|
||||
| ("6[E2]" : "шестистам")
|
||||
| ("6[E2]" : "шестистах")
|
||||
| ("6[E2]" : "шестьсот")
|
||||
| ("6[E2]" : "шестьюстами")
|
||||
| ("7[E1]" : "семидесяти")
|
||||
| ("7[E1]" : "семьдесят")
|
||||
| ("7[E1]" : "семьюдесятью")
|
||||
| ("7[E2]" : "семисот")
|
||||
| ("7[E2]" : "семистам")
|
||||
| ("7[E2]" : "семистах")
|
||||
| ("7[E2]" : "семьсот")
|
||||
| ("7[E2]" : "семьюстами")
|
||||
| ("8[E1]" : "восемьдесят")
|
||||
| ("8[E1]" : "восьмидесяти")
|
||||
| ("8[E1]" : "восьмьюдесятью")
|
||||
| ("8[E2]" : "восемьсот")
|
||||
| ("8[E2]" : "восемьюстами")
|
||||
| ("8[E2]" : "восьмисот")
|
||||
| ("8[E2]" : "восьмистам")
|
||||
| ("8[E2]" : "восьмистах")
|
||||
| ("8[E2]" : "восьмьюстами")
|
||||
| ("9[E1]" : "девяноста")
|
||||
| ("9[E1]" : "девяносто")
|
||||
| ("9[E2]" : "девятисот")
|
||||
| ("9[E2]" : "девятистам")
|
||||
| ("9[E2]" : "девятистах")
|
||||
| ("9[E2]" : "девятьсот")
|
||||
| ("9[E2]" : "девятьюстами")]
|
||||
;
|
||||
|
||||
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
lexset1 = Optimize[
|
||||
("+" : "")
|
||||
| ("1" : "один")
|
||||
| ("1" : "одна")
|
||||
| ("1" : "одни")
|
||||
| ("1" : "одним")
|
||||
| ("1" : "одними")
|
||||
| ("1" : "одних")
|
||||
| ("1" : "одно")
|
||||
| ("1" : "одного")
|
||||
| ("1" : "одной")
|
||||
| ("1" : "одном")
|
||||
| ("1" : "одному")
|
||||
| ("1" : "одною")
|
||||
| ("1" : "одну")
|
||||
| ("2" : "два")
|
||||
| ("2" : "две")
|
||||
| ("2" : "двум")
|
||||
| ("2" : "двумя")
|
||||
| ("2" : "двух")
|
||||
| ("3" : "трем")
|
||||
| ("3" : "тремя")
|
||||
| ("3" : "трех")
|
||||
| ("3" : "три")
|
||||
| ("4" : "четыре")
|
||||
| ("4" : "четырем")
|
||||
| ("4" : "четырех")
|
||||
| ("4" : "четырьмя")
|
||||
| ("5" : "пяти")
|
||||
| ("5" : "пять")
|
||||
| ("5" : "пятью")
|
||||
| ("6" : "шести")
|
||||
| ("6" : "шесть")
|
||||
| ("6" : "шестью")
|
||||
| ("7" : "семи")
|
||||
| ("7" : "семь")
|
||||
| ("7" : "семью")
|
||||
| ("8" : "восемь")
|
||||
| ("8" : "восьми")
|
||||
| ("8" : "восьмью")
|
||||
| ("9" : "девяти")
|
||||
| ("9" : "девять")
|
||||
| ("9" : "девятью")
|
||||
| ("[E3]" : "тысяч")
|
||||
| ("[E3]" : "тысяча")
|
||||
| ("[E3]" : "тысячам")
|
||||
| ("[E3]" : "тысячами")
|
||||
| ("[E3]" : "тысячах")
|
||||
| ("[E3]" : "тысяче")
|
||||
| ("[E3]" : "тысячей")
|
||||
| ("[E3]" : "тысячи")
|
||||
| ("[E3]" : "тысячу")
|
||||
| ("[E3]" : "тысячью")
|
||||
| ("[E6]" : "миллион")
|
||||
| ("[E6]" : "миллиона")
|
||||
| ("[E6]" : "миллионам")
|
||||
| ("[E6]" : "миллионами")
|
||||
| ("[E6]" : "миллионах")
|
||||
| ("[E6]" : "миллионе")
|
||||
| ("[E6]" : "миллионов")
|
||||
| ("[E6]" : "миллионом")
|
||||
| ("[E6]" : "миллиону")
|
||||
| ("[E6]" : "миллионы")
|
||||
| ("[E9]" : "миллиард")
|
||||
| ("[E9]" : "миллиарда")
|
||||
| ("[E9]" : "миллиардам")
|
||||
| ("[E9]" : "миллиардами")
|
||||
| ("[E9]" : "миллиардах")
|
||||
| ("[E9]" : "миллиарде")
|
||||
| ("[E9]" : "миллиардов")
|
||||
| ("[E9]" : "миллиардом")
|
||||
| ("[E9]" : "миллиарду")
|
||||
| ("[E9]" : "миллиарды")
|
||||
| ("|0|" : "ноле")
|
||||
| ("|0|" : "нолем")
|
||||
| ("|0|" : "ноль")
|
||||
| ("|0|" : "нолю")
|
||||
| ("|0|" : "ноля")
|
||||
| ("|0|" : "нуле")
|
||||
| ("|0|" : "нулем")
|
||||
| ("|0|" : "нуль")
|
||||
| ("|0|" : "нулю")
|
||||
| ("|0|" : "нуля")]
|
||||
;
|
||||
|
||||
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
||||
|
||||
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
||||
|
||||
# END LANGUAGE SPECIFIC DATA
|
||||
################################################################################
|
||||
# Inserts a marker after the Ms.
|
||||
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
||||
|
||||
# Deletes all powers and "+".
|
||||
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
||||
|
||||
# Deletes leading zeros at the beginning of a number, so that "0003" does not
|
||||
# get treated as an ordinary number.
|
||||
export DELETE_INITIAL_ZEROS =
|
||||
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
||||
;
|
||||
|
||||
NonMs = Optimize[POWERS - Ms];
|
||||
|
||||
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted.
|
||||
export DELETE_INTERMEDIATE_ZEROS1 =
|
||||
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
||||
;
|
||||
|
||||
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
||||
# that and the previous boundary. Thus, if after the result of the rule above we
|
||||
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
||||
# zero.
|
||||
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
||||
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
||||
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
# Final clean up of stray zeros.
|
||||
export DELETE_REMAINING_ZEROS = Optimize[
|
||||
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
||||
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
# Applies the revaluation map. For example in English, changes [E4] to [E1] as a
|
||||
# modifier of [E3].
|
||||
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
||||
|
||||
# Deletes the various marks and powers in the input and output.
|
||||
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
||||
|
||||
export CLEAN_SPACES = Optimize[
|
||||
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
||||
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
||||
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
d = b.kDigit;
|
||||
|
||||
# Germanic inversion rule.
|
||||
germanic =
|
||||
(I["1+"] d "[E1]" D["+1"])
|
||||
| (I["2+"] d "[E1]" D["+2"])
|
||||
| (I["3+"] d "[E1]" D["+3"])
|
||||
| (I["4+"] d "[E1]" D["+4"])
|
||||
| (I["5+"] d "[E1]" D["+5"])
|
||||
| (I["6+"] d "[E1]" D["+6"])
|
||||
| (I["7+"] d "[E1]" D["+7"])
|
||||
| (I["8+"] d "[E1]" D["+8"])
|
||||
| (I["9+"] d "[E1]" D["+9"])
|
||||
;
|
||||
|
||||
germanic_inversion =
|
||||
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
||||
;
|
||||
|
||||
# The exported rule is SIGMA_STAR rather than germanic_inversion, so the
# inversion defined above is not applied through this export.
export GERMANIC_INVERSION = SIGMA_STAR;
|
||||
export ORDINAL_RESTRICTION = SIGMA_STAR;
|
||||
nondigits = b.kBytes - b.kDigit;
|
||||
export ORDINAL_SUFFIX = D[nondigits*];
|
|
@ -1,35 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
|
||||
|
||||
export DIGITS = digit (n.I[" "] digit)*;
|
||||
|
||||
# Various common factorizations
|
||||
|
||||
two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
|
||||
|
||||
three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
|
||||
|
||||
mixed =
|
||||
(digit n.I[" "] two_digits)
|
||||
| (two_digits n.I[" "] two_digits)
|
||||
| (two_digits n.I[" "] three_digits)
|
||||
| (two_digits n.I[" "] two_digits n.I[" "] two_digits)
|
||||
;
|
||||
|
||||
export MIXED_NUMBERS = Optimize[mixed];
|
@ -1,40 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'util/util.grm' as u;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Composes expr with a sequence of one or more n.CARDINAL_NUMBERS readings
# separated by single spaces, so every space-delimited group on the output
# side of expr must be readable as a cardinal number. The result is optimized.
func ToNumberName[expr] {
|
||||
number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
|
||||
return Optimize[expr @ number_name_seq];
|
||||
}
|
||||
|
||||
d = b.kDigit;
|
||||
|
||||
# Inserts a space after a "0" that stands at the beginning of the string or
# directly after a space, which splits a leading zero off from the digits that
# follow it.
# NOTE(review): n.I is assumed to be the insertion helper exported by
# ru/verbalizer/numbers.grm -- confirm.
leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
|
||||
|
||||
by_ones = d n.I[" "];
|
||||
by_twos = (d{2} @ leading_zero) n.I[" "];
|
||||
by_threes = (d{3} @ leading_zero) n.I[" "];
|
||||
|
||||
groupings = by_twos* (by_threes | by_twos | by_ones);
|
||||
|
||||
export FRACTIONAL_PART_UNGROUPED =
|
||||
Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
|
||||
;
|
||||
export FRACTIONAL_PART_GROUPED =
|
||||
Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
|
||||
;
|
||||
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
|
@ -1,30 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'ru/verbalizer/factorization.grm' as f;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
|
||||
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
|
||||
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
|
||||
|
||||
__fractional_part__ = fractional_part_unparsed;
|
||||
__decimal_marker__ = ",";
|
||||
|
||||
export FLOAT = Optimize[
|
||||
(n.CARDINAL_NUMBERS
|
||||
(__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
|
||||
__fractional_part__) @ l.LEXICAL_MAP]
|
||||
;
|
Binary file not shown.
@ -1,25 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
|
||||
# String-to-string mapping loaded from the TSV file named below.
lexical_map = StringFile['ru/verbalizer/lexical_map.tsv'];
|
||||
|
||||
# Closure over b.kBytes; passed as the alphabet argument to the CDRewrite
# rules in this grammar.
sigma_star = b.kBytes*;
|
||||
|
||||
# Rewrites the literal "__NULL__" to the empty string in any context (both
# the left and right contexts are empty).
del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
|
||||
|
||||
# LEXICAL_MAP: applies the lexical_map rewrite with no context restriction,
# then composes with del_null so that any "__NULL__" in the result is removed.
export LEXICAL_MAP = Optimize[
|
||||
CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
|
||||
;
|
Can't render this file because it has a wrong number of fields in line 176.
|
@ -1,34 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'ru/verbalizer/float.grm' as f;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Operands for arithmetic expressions: a number is either a cardinal or a
# float (union of the two imported transducers).
float = f.FLOAT;
|
||||
card = n.CARDINAL_NUMBERS;
|
||||
number = card | float;
|
||||
|
||||
# Each operator character is rewritten to a @@...@@ placeholder padded with a
# space on both sides.
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
|
||||
times = "*" : " @@ARITHMETIC_TIMES@@ ";
|
||||
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
|
||||
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
|
||||
|
||||
# Any one of the four binary operators defined above in this grammar.
operator = plus | times | minus | division;
|
||||
|
||||
# Percent sign rewritten to its placeholder; note the output has a leading
# space only (no trailing space), unlike the binary operators.
percent = "%" : " @@PERCENT@@";
|
||||
|
||||
# ARITHMETIC: either `number operator number` or `number percent`, composed
# with l.LEXICAL_MAP. Only a single binary operation is accepted; longer
# chains of operators are not covered by this expression.
export ARITHMETIC =
|
||||
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
|
||||
;
|
@ -1,78 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/classifier/cyrillic.grm' as c;
|
||||
import 'ru/verbalizer/extra_numbers.grm' as e;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
import 'ru/verbalizer/spelled.grm' as s;
|
||||
|
||||
# Basic word building blocks.
#   letter: a symbol from b.kAlpha or c.kCyrillicAlpha.
#   word:   one or more letters.
#   possibly_split_word: a word followed by zero or more further words, where
#     the dash or period joining consecutive words is rewritten to a space;
#     an optional trailing period is deleted.
letter = b.kAlpha | c.kCyrillicAlpha;
|
||||
dash = "-";
|
||||
word = letter+;
|
||||
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
|
||||
|
||||
# Symbols rewritten to placeholders for use after a word (see post_word).
# "+" and "-" each map to two alternative placeholders (arithmetic reading or
# sign reading), so this transducer yields multiple outputs for those inputs.
post_word_symbol =
|
||||
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
|
||||
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
|
||||
("*" : "@@STAR@@")
|
||||
;
|
||||
|
||||
# Symbols rewritten to placeholders for use before a word (see pre_word):
# "@", "/" and "#", each with a single output.
pre_word_symbol =
|
||||
("@" : "@@AT@@") |
|
||||
("/" : "@@SLASH@@") |
|
||||
("#" : "@@HASH@@")
|
||||
;
|
||||
|
||||
# A (possibly split) word immediately followed by a trailing symbol; a space
# is inserted between the word and the symbol's placeholder.
post_word = possibly_split_word n.I[" "] post_word_symbol;
|
||||
|
||||
# A leading symbol immediately followed by a (possibly split) word; a space
# is inserted between the symbol's placeholder and the word.
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
|
||||
|
||||
## Number/digit sequence combos, maybe with a dash
|
||||
|
||||
# A word passed through s.SPELLED_NO_LETTER.
# NOTE(review): presumably this yields a letter-by-letter spelling; the
# definition lives in ru/verbalizer/spelled.grm -- confirm.
spelled_word = word @ s.SPELLED_NO_LETTER;
|
||||
|
||||
# Word followed by a number. Between the two, either a space is inserted
# (input has no separator) or an input dash is rewritten to a space. The
# number may be read as a digit sequence, a cardinal, or a mixed number.
word_number =
|
||||
(word | spelled_word)
|
||||
(n.I[" "] | (dash : " "))
|
||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||
;
|
||||
|
||||
# Mirror image of word_number: a number (digit sequence, cardinal, or mixed
# number) followed by a word, joined by an inserted space or by an input dash
# rewritten to a space.
number_word =
|
||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
||||
(n.I[" "] | (dash : " "))
|
||||
(word | spelled_word)
|
||||
;
|
||||
|
||||
## Two-digit year.
|
||||
|
||||
# Note that in this case to be fair we really have to allow ordinals too since
|
||||
# in some languages that's what you would have.
|
||||
|
||||
# Deletes a leading apostrophe, then takes exactly two digits and reads them
# through n.CARDINAL_NUMBERS or e.DIGITS.
# NOTE(review): only cardinal and digit-sequence readings are composed in
# here; no ordinal transducer appears in this expression -- confirm whether
# ordinal readings were meant to be included.
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
|
||||
|
||||
# Matches the literal input ".com": the period becomes the
# @@URL_DOT_EXPRESSION@@ placeholder, a space is inserted, and "com" is
# passed through unchanged.
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
|
||||
|
||||
# Union of every pattern defined above in this grammar, before placeholder
# expansion. An input only needs to match one alternative.
miscellaneous = Optimize[
|
||||
possibly_split_word
|
||||
| post_word
|
||||
| pre_word
|
||||
| word_number
|
||||
| number_word
|
||||
| two_digit_year
|
||||
| dot_com
|
||||
];
|
||||
|
||||
# Exported entry point: the union above composed with l.LEXICAL_MAP.
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
|
@ -1,44 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Short alias for the cardinal-number transducer from numbers.grm.
card = n.CARDINAL_NUMBERS;
|
||||
|
||||
# Currency table loaded from TSV; looked up below by the keys "usd_maj" and
# "usd_min".
__currency__ = StringFile['ru/verbalizer/money.tsv'];
|
||||
|
||||
# d: any digit. D: any digit except "0" (set difference).
d = b.kDigit;
|
||||
D = d - "0";
|
||||
|
||||
# Exactly two input digits. A leading "0" is deleted (so "05" is read as "5");
# a nonzero first digit is kept. The resulting string is then read as a
# cardinal number.
cents = ((n.D["0"] | D) d) @ card;
|
||||
|
||||
# Only dollar for the verbalizer tests for English. Will need to add other
|
||||
# currencies.
|
||||
# usd_maj / usd_min: the output side of looking up the keys "usd_maj" and
# "usd_min" in the currency table (Project[..., 'output'] keeps only the
# output labels). Presumably the major and minor unit names -- the table
# contents are not visible here.
# and: the connector inserted between the major and minor amounts; either the
# space-padded @@MONEY_AND@@ placeholder or a single space.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
|
||||
usd_min = Project["usd_min" @ __currency__, 'output'];
|
||||
and = " @@MONEY_AND@@ " | " ";
|
||||
|
||||
# "$<number>.<two digits>": delete "$", read the number as a cardinal, insert
# a space plus the major-unit name, insert the connector, delete the ".",
# read the cents, then insert a space plus the minor-unit name.
dollar1 =
|
||||
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
|
||||
;
|
||||
|
||||
# "$<number>.00": the ".00" is deleted outright, so only the major amount and
# its unit name are produced.
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
|
||||
|
||||
# "$<number>" with no fractional part: major amount plus its unit name.
dollar3 = n.D["$"] card n.I[" " usd_maj];
|
||||
|
||||
# Union of the three dollar-amount patterns.
dollar = Optimize[dollar1 | dollar2 | dollar3];
|
||||
|
||||
# Exported entry point: dollar amounts composed with l.LEXICAL_MAP.
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
|
|
|
@ -1,48 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Russian minimally supervised number grammar.
|
||||
#
|
||||
# Supports cardinals and ordinals in all inflected forms.
|
||||
#
|
||||
# The language-specific acceptor G was compiled with digit, teen, decade,
|
||||
# century, and big power-of-ten preterminals. The lexicon transducer is
|
||||
# highly ambiguous, but no LM is used.
|
||||
|
||||
import 'util/arithmetic.grm' as a;
|
||||
|
||||
# Intersects the universal factorization transducer (F) with language-specific
|
||||
# acceptor (G).
|
||||
|
||||
# d, f: transducers imported from util/arithmetic.grm.
# g:    precompiled FST loaded from disk.
# fg:   g composed with f three times in succession, then with d on the
#       input side, optimizing after each composition.
#       NOTE(review): the reason for exactly three applications of f is not
#       stated in this file -- confirm against util/arithmetic.grm.
# test1: compile-time check that "230" composed with fg equals the string
#       "(+ 200 30 +)".
d = a.DELTA_STAR;
|
||||
f = a.IARITHMETIC_RESTRICTED;
|
||||
g = LoadFst['ru/verbalizer/g.fst'];
|
||||
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
|
||||
test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];
|
||||
|
||||
# Compiles lexicon transducers (L).
|
||||
|
||||
# cardinal_name: single-entry mappings loaded from the cardinals TSV.
# cardinal_l: one or more cardinal_name entries, separated by single spaces
# (zero or more "entry + space" followed by a final entry).
cardinal_name = StringFile['ru/verbalizer/cardinals.tsv'];
|
||||
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
|
||||
|
||||
# ordinal_name: single-entry mappings loaded from the ordinals TSV.
# ordinal_l: zero or more cardinal_name entries (each followed by a space)
# and then exactly one ordinal_name, i.e. only the final entry is ordinal.
ordinal_name = StringFile['ru/verbalizer/ordinals.tsv'];
|
||||
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
|
||||
|
||||
# Composes L with the leaf transducer (P), then composes that with FG.
|
||||
|
||||
# Leaf transducer imported from util/arithmetic.grm; composed in front of the
# lexicon transducers in the exports of this grammar.
p = a.LEAVES;
|
||||
|
||||
# Exported cardinal reading: fg composed with (p composed with cardinal_l).
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
|
||||
|
||||
# Exported ordinal reading: fg composed with (p composed with ordinal_l).
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
|
@ -1,68 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'ru/verbalizer/number_names.grm' as n;
|
||||
import 'universal/thousands_punct.grm' as t;
|
||||
import 'util/byte.grm' as b;
|
||||
|
||||
# Mappings loaded from the nominatives TSV; used only by nominative_filter.
nominatives = StringFile['ru/verbalizer/nominatives.tsv'];
|
||||
|
||||
# Closure over b.kBytes; the alphabet argument for the CDRewrite rules and
# the free prefix in the ordinal-ending filter in this grammar.
sigma_star = b.kBytes*;
|
||||
|
||||
# Context-dependent rewrite applying the nominatives mapping to tokens that
# are bounded on the left by start-of-string or a space and on the right by a
# space or end-of-string. The appended ("" : "" <-1>) attaches a weight of -1
# to each application.
# No rule in this grammar references nominative_filter; it is only defined.
nominative_filter =
|
||||
CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star]
|
||||
;
|
||||
|
||||
# Short aliases for the number-name transducers from number_names.grm.
cardinal = n.CARDINAL_NUMBER_NAME;
|
||||
ordinal = n.ORDINAL_NUMBER_NAME;
|
||||
|
||||
# Putting these here since this grammar gets incorporated by all the others.
|
||||
|
||||
# I[expr]: insertion. Returns a transducer that maps the empty string to
# expr, i.e. emits expr without consuming any input.
func I[expr] {
|
||||
return "" : expr;
|
||||
}
|
||||
|
||||
# D[expr]: deletion. Returns a transducer that maps expr to the empty string,
# i.e. consumes expr from the input and emits nothing.
func D[expr] {
|
||||
return expr : "";
|
||||
}
|
||||
|
||||
# Since we know this is the default for Russian, it's fair game to set it.
|
||||
# Accepted digit-string formats: union of t.dot_thousands and t.no_delimiter.
# NOTE(review): going by the names, these are dot-grouped thousands and
# ungrouped digits; both are defined in universal/thousands_punct.grm.
separators = t.dot_thousands | t.no_delimiter;
|
||||
|
||||
# Exported cardinal reading: the separators transducer composed with the
# cardinal number-name transducer.
export CARDINAL_NUMBERS = Optimize[
|
||||
separators
|
||||
@ cardinal
|
||||
];
|
||||
|
||||
# Exported ordinal reading for inputs without an explicit "-suffix" marker:
# the separators transducer composed with the ordinal number-name transducer.
export ORDINAL_NUMBERS_UNMARKED = Optimize[
|
||||
separators
|
||||
@ ordinal
|
||||
];
|
||||
|
||||
|
||||
# Ordinal-ending mappings loaded from TSV; used as the string-final filter in
# ORDINAL_NUMBERS_MARKED.
endings = StringFile['ru/verbalizer/ordinal_endings.tsv'];
|
||||
|
||||
# not_dash: one or more bytes, none of which is "-".
# del_ending: deletes a "-" plus the dash-free run that follows it, but only
# when that run extends to the end of the string ([EOS] right context).
not_dash = (b.kBytes - "-")+;
|
||||
del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star];
|
||||
|
||||
# Needs nominative_filter here if we take out Kyle's models.
|
||||
# Ordinals written as digits plus an explicit "-suffix". Three composed
# stages:
#   1. read the digits as an ordinal and pass the "-" and the dash-free
#      suffix through unchanged;
#   2. require the result to end in an entry of the endings table (any prefix
#      is allowed via sigma_star);
#   3. delete the trailing "-suffix" with del_ending.
# NOTE(review): stage 2 appears to act as a filter matching the spelled-out
# ordinal's ending against the written suffix; the exact shape of the endings
# table is not visible here -- confirm.
export ORDINAL_NUMBERS_MARKED = Optimize[
|
||||
Optimize[Optimize[separators @ ordinal] "-" not_dash]
|
||||
@ Optimize[sigma_star endings]
|
||||
@ del_ending]
|
||||
;
|
||||
|
||||
# Exported ordinal reading: union of the suffix-marked and unmarked forms.
export ORDINAL_NUMBERS =
|
||||
Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
|
||||
;
|
@ -1,133 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Grammar for things built mostly on numbers.
|
||||
|
||||
import 'ru/verbalizer/factorization.grm' as f;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Short aliases: cardinal reading, unmarked ordinal reading, and the
# ungrouped fractional-part reading (used here as a plain digit sequence).
num = n.CARDINAL_NUMBERS;
|
||||
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
||||
digits = f.FRACTIONAL_PART_UNGROUPED;
|
||||
|
||||
# Various symbols.
|
||||
|
||||
# Single-character symbols rewritten to @@...@@ placeholders (no padding
# spaces; callers insert spaces with n.I[" "]). Note that "-" has two
# readings defined here: minus (arithmetic) and dash.
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
||||
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
||||
slash = "/" : "@@SLASH@@";
|
||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||
dash = "-" : "@@DASH@@";
|
||||
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
||||
|
||||
# Further symbol-to-placeholder rewrites. division accepts either "/" or "÷";
# times accepts either "x" or "*".
degree = "°" : "@@DEGREE@@";
|
||||
|
||||
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
||||
|
||||
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
||||
|
||||
power = "^" : "@@DECIMAL_EXPONENT@@";
|
||||
|
||||
square_root = "√" : "@@SQUARE_ROOT@@";
|
||||
|
||||
percent = "%" : "@@PERCENT@@";
|
||||
|
||||
# Safe roman numbers.
|
||||
|
||||
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
||||
# line as the path.
|
||||
rfile =
|
||||
'universal/roman_numerals.tsv' # NO_EDIT
|
||||
;
|
||||
|
||||
# Roman-numeral mapping loaded from the TSV path held in rfile.
roman = StringFile[rfile];
|
||||
|
||||
## Main categories.
|
||||
|
||||
# Dot-separated numbers, with a space inserted on each side of the dot
# placeholder. The final repeated group uses `+`, so at least three numbers
# (two dots) are required, unlike the slash and dash variants which use `*`.
cat_dot_number =
|
||||
num
|
||||
n.I[" "] dot n.I[" "] num
|
||||
(n.I[" "] dot n.I[" "] num)+
|
||||
;
|
||||
|
||||
# Two or more numbers separated by "/", with a space inserted on each side of
# the slash placeholder.
cat_slash_number =
|
||||
num
|
||||
n.I[" "] slash n.I[" "] num
|
||||
(n.I[" "] slash n.I[" "] num)*
|
||||
;
|
||||
|
||||
# Two or more numbers separated by "-", with a space inserted on each side of
# the dash placeholder.
cat_dash_number =
|
||||
num
|
||||
n.I[" "] dash n.I[" "] num
|
||||
(n.I[" "] dash n.I[" "] num)*
|
||||
;
|
||||
|
||||
# A number with an optional leading "+" or "-"; when a sign is present a
# space is inserted after its placeholder. Because the sign is optional, a
# bare number also matches.
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
||||
|
||||
# An optionally signed number followed by the degree sign, with a space
# inserted before the degree placeholder.
cat_degree = cat_signed_number n.I[" "] degree;
|
||||
|
||||
# "+" followed by a number read either as a cardinal or as a digit sequence,
# with a space inserted after the plus placeholder.
cat_country_code = plus n.I[" "] (num | digits);
|
||||
|
||||
# Any single mathematical symbol on its own (no operands), rewritten to its
# placeholder.
cat_math_operations =
|
||||
plus
|
||||
| minus
|
||||
| division
|
||||
| times
|
||||
| equals
|
||||
| percent
|
||||
| power
|
||||
| square_root
|
||||
;
|
||||
|
||||
# Roman numbers are often either cardinals or ordinals in various languages.
|
||||
# The roman table's output is composed with both the cardinal and the
# unmarked-ordinal readings, so each numeral yields both kinds of output.
cat_roman = roman @ (num | ord);
|
||||
|
||||
# Allow
|
||||
#
|
||||
# number:number
|
||||
# number-number
|
||||
#
|
||||
# to just be
|
||||
#
|
||||
# number number.
|
||||
|
||||
# Exactly two numbers joined by ":" or "-"; the joiner is rewritten to a
# single space, so no placeholder is emitted for it.
cat_number_number =
|
||||
num ((":" | "-") : " ") num
|
||||
;
|
||||
|
||||
# Some additional readings for these symbols.
|
||||
|
||||
# Alternative placeholder readings for standalone symbols. "-" has two
# possible outputs, and "x" may either pass through unchanged or become
# @@CONNECTOR_BY@@.
cat_additional_readings =
|
||||
("/" : "@@PER@@") |
|
||||
("+" : "@@AND@@") |
|
||||
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
||||
("*" : "@@STAR@@") |
|
||||
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
||||
("@" : "@@AT@@")
|
||||
;
|
||||
|
||||
# Union of every category defined above in this grammar, before placeholder
# expansion. Several categories overlap on the same input (for example a
# lone "-" or "number-number"), so the result can be ambiguous.
numbers_plus = Optimize[
|
||||
cat_dot_number
|
||||
| cat_slash_number
|
||||
| cat_dash_number
|
||||
| cat_signed_number
|
||||
| cat_degree
|
||||
| cat_country_code
|
||||
| cat_math_operations
|
||||
| cat_roman
|
||||
| cat_number_number
|
||||
| cat_additional_readings
|
||||
];
|
||||
|
||||
# Exported entry point: the union above composed with l.LEXICAL_MAP.
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
@ -1,804 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
||||
import 'util/byte.grm' as b;
|
||||
|
||||
# Utilities for insertion and deletion.
|
||||
|
||||
func I[expr] {
|
||||
return "" : expr;
|
||||
}
|
||||
|
||||
func D[expr] {
|
||||
return expr : "";
|
||||
}
|
||||
|
||||
# Powers of base 10.
|
||||
export POWERS =
|
||||
"[E15]"
|
||||
| "[E14]"
|
||||
| "[E13]"
|
||||
| "[E12]"
|
||||
| "[E11]"
|
||||
| "[E10]"
|
||||
| "[E9]"
|
||||
| "[E8]"
|
||||
| "[E7]"
|
||||
| "[E6]"
|
||||
| "[E5]"
|
||||
| "[E4]"
|
||||
| "[E3]"
|
||||
| "[E2]"
|
||||
| "[E1]"
|
||||
;
|
||||
|
||||
export SIGMA = b.kBytes | POWERS;
|
||||
|
||||
export SIGMA_STAR = SIGMA*;
|
||||
|
||||
export SIGMA_PLUS = SIGMA+;
|
||||
|
||||
################################################################################
|
||||
# BEGIN LANGUAGE SPECIFIC DATA
|
||||
revaluations =
|
||||
("[E4]" : "[E1]")
|
||||
| ("[E5]" : "[E2]")
|
||||
| ("[E7]" : "[E1]")
|
||||
| ("[E8]" : "[E2]")
|
||||
;
|
||||
|
||||
Ms = "[E3]" | "[E6]" | "[E9]";
|
||||
|
||||
|
||||
func Zero[expr] {
|
||||
return expr : ("");
|
||||
}
|
||||
|
||||
space = " ";
|
||||
|
||||
lexset3 = Optimize[
|
||||
("1[E1]+1" : "одиннадцатая@")
|
||||
| ("1[E1]+1" : "одиннадцати")
|
||||
| ("1[E1]+1" : "одиннадцатого@")
|
||||
| ("1[E1]+1" : "одиннадцатое@")
|
||||
| ("1[E1]+1" : "одиннадцатой@")
|
||||
| ("1[E1]+1" : "одиннадцатом@")
|
||||
| ("1[E1]+1" : "одиннадцатому@")
|
||||
| ("1[E1]+1" : "одиннадцатую@")
|
||||
| ("1[E1]+1" : "одиннадцатые@")
|
||||
| ("1[E1]+1" : "одиннадцатый@")
|
||||
| ("1[E1]+1" : "одиннадцатым@")
|
||||
| ("1[E1]+1" : "одиннадцатыми@")
|
||||
| ("1[E1]+1" : "одиннадцатых@")
|
||||
| ("1[E1]+1" : "одиннадцать")
|
||||
| ("1[E1]+1" : "одиннадцатью")
|
||||
| ("1[E1]+2" : "двенадцатая@")
|
||||
| ("1[E1]+2" : "двенадцати")
|
||||
| ("1[E1]+2" : "двенадцатого@")
|
||||
| ("1[E1]+2" : "двенадцатое@")
|
||||
| ("1[E1]+2" : "двенадцатой@")
|
||||
| ("1[E1]+2" : "двенадцатом@")
|
||||
| ("1[E1]+2" : "двенадцатому@")
|
||||
| ("1[E1]+2" : "двенадцатую@")
|
||||
| ("1[E1]+2" : "двенадцатые@")
|
||||
| ("1[E1]+2" : "двенадцатый@")
|
||||
| ("1[E1]+2" : "двенадцатым@")
|
||||
| ("1[E1]+2" : "двенадцатыми@")
|
||||
| ("1[E1]+2" : "двенадцатых@")
|
||||
| ("1[E1]+2" : "двенадцать")
|
||||
| ("1[E1]+2" : "двенадцатью")
|
||||
| ("1[E1]+3" : "тринадцатая@")
|
||||
| ("1[E1]+3" : "тринадцати")
|
||||
| ("1[E1]+3" : "тринадцатого@")
|
||||
| ("1[E1]+3" : "тринадцатое@")
|
||||
| ("1[E1]+3" : "тринадцатой@")
|
||||
| ("1[E1]+3" : "тринадцатом@")
|
||||
| ("1[E1]+3" : "тринадцатому@")
|
||||
| ("1[E1]+3" : "тринадцатую@")
|
||||
| ("1[E1]+3" : "тринадцатые@")
|
||||
| ("1[E1]+3" : "тринадцатый@")
|
||||
| ("1[E1]+3" : "тринадцатым@")
|
||||
| ("1[E1]+3" : "тринадцатыми@")
|
||||
| ("1[E1]+3" : "тринадцатых@")
|
||||
| ("1[E1]+3" : "тринадцать")
|
||||
| ("1[E1]+3" : "тринадцатью")
|
||||
| ("1[E1]+4" : "четырнадцатая@")
|
||||
| ("1[E1]+4" : "четырнадцати")
|
||||
| ("1[E1]+4" : "четырнадцатого@")
|
||||
| ("1[E1]+4" : "четырнадцатое@")
|
||||
| ("1[E1]+4" : "четырнадцатой@")
|
||||
| ("1[E1]+4" : "четырнадцатом@")
|
||||
| ("1[E1]+4" : "четырнадцатому@")
|
||||
| ("1[E1]+4" : "четырнадцатую@")
|
||||
| ("1[E1]+4" : "четырнадцатые@")
|
||||
| ("1[E1]+4" : "четырнадцатый@")
|
||||
| ("1[E1]+4" : "четырнадцатым@")
|
||||
| ("1[E1]+4" : "четырнадцатыми@")
|
||||
| ("1[E1]+4" : "четырнадцатых@")
|
||||
| ("1[E1]+4" : "четырнадцать")
|
||||
| ("1[E1]+4" : "четырнадцатью")
|
||||
| ("1[E1]+5" : "пятнадцатая@")
|
||||
| ("1[E1]+5" : "пятнадцати")
|
||||
| ("1[E1]+5" : "пятнадцатого@")
|
||||
| ("1[E1]+5" : "пятнадцатое@")
|
||||
| ("1[E1]+5" : "пятнадцатой@")
|
||||
| ("1[E1]+5" : "пятнадцатом@")
|
||||
| ("1[E1]+5" : "пятнадцатому@")
|
||||
| ("1[E1]+5" : "пятнадцатую@")
|
||||
| ("1[E1]+5" : "пятнадцатые@")
|
||||
| ("1[E1]+5" : "пятнадцатый@")
|
||||
| ("1[E1]+5" : "пятнадцатым@")
|
||||
| ("1[E1]+5" : "пятнадцатыми@")
|
||||
| ("1[E1]+5" : "пятнадцатых@")
|
||||
| ("1[E1]+5" : "пятнадцать")
|
||||
| ("1[E1]+5" : "пятнадцатью")
|
||||
| ("1[E1]+6" : "шестнадцатая@")
|
||||
| ("1[E1]+6" : "шестнадцати")
|
||||
| ("1[E1]+6" : "шестнадцатого@")
|
||||
| ("1[E1]+6" : "шестнадцатое@")
|
||||
| ("1[E1]+6" : "шестнадцатой@")
|
||||
| ("1[E1]+6" : "шестнадцатом@")
|
||||
| ("1[E1]+6" : "шестнадцатому@")
|
||||
| ("1[E1]+6" : "шестнадцатую@")
|
||||
| ("1[E1]+6" : "шестнадцатые@")
|
||||
| ("1[E1]+6" : "шестнадцатый@")
|
||||
| ("1[E1]+6" : "шестнадцатым@")
|
||||
| ("1[E1]+6" : "шестнадцатыми@")
|
||||
| ("1[E1]+6" : "шестнадцатых@")
|
||||
| ("1[E1]+6" : "шестнадцать")
|
||||
| ("1[E1]+6" : "шестнадцатью")
|
||||
| ("1[E1]+7" : "семнадцатая@")
|
||||
| ("1[E1]+7" : "семнадцати")
|
||||
| ("1[E1]+7" : "семнадцатого@")
|
||||
| ("1[E1]+7" : "семнадцатое@")
|
||||
| ("1[E1]+7" : "семнадцатой@")
|
||||
| ("1[E1]+7" : "семнадцатом@")
|
||||
| ("1[E1]+7" : "семнадцатому@")
|
||||
| ("1[E1]+7" : "семнадцатую@")
|
||||
| ("1[E1]+7" : "семнадцатые@")
|
||||
| ("1[E1]+7" : "семнадцатый@")
|
||||
| ("1[E1]+7" : "семнадцатым@")
|
||||
| ("1[E1]+7" : "семнадцатыми@")
|
||||
| ("1[E1]+7" : "семнадцатых@")
|
||||
| ("1[E1]+7" : "семнадцать")
|
||||
| ("1[E1]+7" : "семнадцатью")
|
||||
| ("1[E1]+8" : "восемнадцатая@")
|
||||
| ("1[E1]+8" : "восемнадцати")
|
||||
| ("1[E1]+8" : "восемнадцатого@")
|
||||
| ("1[E1]+8" : "восемнадцатое@")
|
||||
| ("1[E1]+8" : "восемнадцатой@")
|
||||
| ("1[E1]+8" : "восемнадцатом@")
|
||||
| ("1[E1]+8" : "восемнадцатому@")
|
||||
| ("1[E1]+8" : "восемнадцатую@")
|
||||
| ("1[E1]+8" : "восемнадцатые@")
|
||||
| ("1[E1]+8" : "восемнадцатый@")
|
||||
| ("1[E1]+8" : "восемнадцатым@")
|
||||
| ("1[E1]+8" : "восемнадцатыми@")
|
||||
| ("1[E1]+8" : "восемнадцатых@")
|
||||
| ("1[E1]+8" : "восемнадцать")
|
||||
| ("1[E1]+8" : "восемнадцатью")
|
||||
| ("1[E1]+9" : "девятнадцатая@")
|
||||
| ("1[E1]+9" : "девятнадцати")
|
||||
| ("1[E1]+9" : "девятнадцатого@")
|
||||
| ("1[E1]+9" : "девятнадцатое@")
|
||||
| ("1[E1]+9" : "девятнадцатой@")
|
||||
| ("1[E1]+9" : "девятнадцатом@")
|
||||
| ("1[E1]+9" : "девятнадцатому@")
|
||||
| ("1[E1]+9" : "девятнадцатую@")
|
||||
| ("1[E1]+9" : "девятнадцатые@")
|
||||
| ("1[E1]+9" : "девятнадцатый@")
|
||||
| ("1[E1]+9" : "девятнадцатым@")
|
||||
| ("1[E1]+9" : "девятнадцатыми@")
|
||||
| ("1[E1]+9" : "девятнадцатых@")
|
||||
| ("1[E1]+9" : "девятнадцать")
|
||||
| ("1[E1]+9" : "девятнадцатью")]
|
||||
;
|
||||
|
||||
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
lexset2 = Optimize[
|
||||
("1[E1]" : "десятая@")
|
||||
| ("1[E1]" : "десяти")
|
||||
| ("1[E1]" : "десятого@")
|
||||
| ("1[E1]" : "десятое@")
|
||||
| ("1[E1]" : "десятой@")
|
||||
| ("1[E1]" : "десятом@")
|
||||
| ("1[E1]" : "десятому@")
|
||||
| ("1[E1]" : "десятую@")
|
||||
| ("1[E1]" : "десятые@")
|
||||
| ("1[E1]" : "десятый@")
|
||||
| ("1[E1]" : "десятым@")
|
||||
| ("1[E1]" : "десятыми@")
|
||||
| ("1[E1]" : "десятых@")
|
||||
| ("1[E1]" : "десять")
|
||||
| ("1[E1]" : "десятью")
|
||||
| ("1[E2]" : "сотая@")
|
||||
| ("1[E2]" : "сотого@")
|
||||
| ("1[E2]" : "сотое@")
|
||||
| ("1[E2]" : "сотой@")
|
||||
| ("1[E2]" : "сотом@")
|
||||
| ("1[E2]" : "сотому@")
|
||||
| ("1[E2]" : "сотую@")
|
||||
| ("1[E2]" : "сотые@")
|
||||
| ("1[E2]" : "сотый@")
|
||||
| ("1[E2]" : "сотым@")
|
||||
| ("1[E2]" : "сотыми@")
|
||||
| ("1[E2]" : "сотых@")
|
||||
| ("1[E2]" : "ста")
|
||||
| ("1[E2]" : "сто")
|
||||
| ("1[E3]" : "тысячная@")
|
||||
| ("1[E3]" : "тысячного@")
|
||||
| ("1[E3]" : "тысячное@")
|
||||
| ("1[E3]" : "тысячной@")
|
||||
| ("1[E3]" : "тысячном@")
|
||||
| ("1[E3]" : "тысячному@")
|
||||
| ("1[E3]" : "тысячную@")
|
||||
| ("1[E3]" : "тысячные@")
|
||||
| ("1[E3]" : "тысячный@")
|
||||
| ("1[E3]" : "тысячным@")
|
||||
| ("1[E3]" : "тысячными@")
|
||||
| ("1[E3]" : "тысячных@")
|
||||
| ("1[E6]" : "миллионная@")
|
||||
| ("1[E6]" : "миллионного@")
|
||||
| ("1[E6]" : "миллионное@")
|
||||
| ("1[E6]" : "миллионной@")
|
||||
| ("1[E6]" : "миллионном@")
|
||||
| ("1[E6]" : "миллионному@")
|
||||
| ("1[E6]" : "миллионную@")
|
||||
| ("1[E6]" : "миллионные@")
|
||||
| ("1[E6]" : "миллионный@")
|
||||
| ("1[E6]" : "миллионным@")
|
||||
| ("1[E6]" : "миллионными@")
|
||||
| ("1[E6]" : "миллионных@")
|
||||
| ("1[E9]" : "миллиардная@")
|
||||
| ("1[E9]" : "миллиардного@")
|
||||
| ("1[E9]" : "миллиардное@")
|
||||
| ("1[E9]" : "миллиардной@")
|
||||
| ("1[E9]" : "миллиардном@")
|
||||
| ("1[E9]" : "миллиардному@")
|
||||
| ("1[E9]" : "миллиардную@")
|
||||
| ("1[E9]" : "миллиардные@")
|
||||
| ("1[E9]" : "миллиардный@")
|
||||
| ("1[E9]" : "миллиардным@")
|
||||
| ("1[E9]" : "миллиардными@")
|
||||
| ("1[E9]" : "миллиардных@")
|
||||
| ("2[E1]" : "двадцатая@")
|
||||
| ("2[E1]" : "двадцати")
|
||||
| ("2[E1]" : "двадцатого@")
|
||||
| ("2[E1]" : "двадцатое@")
|
||||
| ("2[E1]" : "двадцатой@")
|
||||
| ("2[E1]" : "двадцатом@")
|
||||
| ("2[E1]" : "двадцатому@")
|
||||
| ("2[E1]" : "двадцатую@")
|
||||
| ("2[E1]" : "двадцатые@")
|
||||
| ("2[E1]" : "двадцатый@")
|
||||
| ("2[E1]" : "двадцатым@")
|
||||
| ("2[E1]" : "двадцатыми@")
|
||||
| ("2[E1]" : "двадцатых@")
|
||||
| ("2[E1]" : "двадцать")
|
||||
| ("2[E1]" : "двадцатью")
|
||||
| ("2[E2]" : "двести")
|
||||
| ("2[E2]" : "двумстам")
|
||||
| ("2[E2]" : "двумястами")
|
||||
| ("2[E2]" : "двухсот")
|
||||
| ("2[E2]" : "двухсотая@")
|
||||
| ("2[E2]" : "двухсотого@")
|
||||
| ("2[E2]" : "двухсотое@")
|
||||
| ("2[E2]" : "двухсотой@")
|
||||
| ("2[E2]" : "двухсотом@")
|
||||
| ("2[E2]" : "двухсотому@")
|
||||
| ("2[E2]" : "двухсотую@")
|
||||
| ("2[E2]" : "двухсотые@")
|
||||
| ("2[E2]" : "двухсотый@")
|
||||
| ("2[E2]" : "двухсотым@")
|
||||
| ("2[E2]" : "двухсотыми@")
|
||||
| ("2[E2]" : "двухсотых@")
|
||||
| ("2[E2]" : "двухстах")
|
||||
| ("3[E1]" : "тридцатая@")
|
||||
| ("3[E1]" : "тридцати")
|
||||
| ("3[E1]" : "тридцатого@")
|
||||
| ("3[E1]" : "тридцатое@")
|
||||
| ("3[E1]" : "тридцатой@")
|
||||
| ("3[E1]" : "тридцатом@")
|
||||
| ("3[E1]" : "тридцатому@")
|
||||
| ("3[E1]" : "тридцатую@")
|
||||
| ("3[E1]" : "тридцатые@")
|
||||
| ("3[E1]" : "тридцатый@")
|
||||
| ("3[E1]" : "тридцатым@")
|
||||
| ("3[E1]" : "тридцатыми@")
|
||||
| ("3[E1]" : "тридцатых@")
|
||||
| ("3[E1]" : "тридцать")
|
||||
| ("3[E1]" : "тридцатью")
|
||||
| ("3[E2]" : "тремстам")
|
||||
| ("3[E2]" : "тремястами")
|
||||
| ("3[E2]" : "трехсот")
|
||||
| ("3[E2]" : "трехсотая@")
|
||||
| ("3[E2]" : "трехсотого@")
|
||||
| ("3[E2]" : "трехсотое@")
|
||||
| ("3[E2]" : "трехсотой@")
|
||||
| ("3[E2]" : "трехсотом@")
|
||||
| ("3[E2]" : "трехсотому@")
|
||||
| ("3[E2]" : "трехсотую@")
|
||||
| ("3[E2]" : "трехсотые@")
|
||||
| ("3[E2]" : "трехсотый@")
|
||||
| ("3[E2]" : "трехсотым@")
|
||||
| ("3[E2]" : "трехсотыми@")
|
||||
| ("3[E2]" : "трехсотых@")
|
||||
| ("3[E2]" : "трехстах")
|
||||
| ("3[E2]" : "триста")
|
||||
| ("4[E1]" : "сорок")
|
||||
| ("4[E1]" : "сорока")
|
||||
| ("4[E1]" : "сороковая@")
|
||||
| ("4[E1]" : "сорокового@")
|
||||
| ("4[E1]" : "сороковое@")
|
||||
| ("4[E1]" : "сороковой@")
|
||||
| ("4[E1]" : "сороковом@")
|
||||
| ("4[E1]" : "сороковому@")
|
||||
| ("4[E1]" : "сороковую@")
|
||||
| ("4[E1]" : "сороковые@")
|
||||
| ("4[E1]" : "сороковым@")
|
||||
| ("4[E1]" : "сороковыми@")
|
||||
| ("4[E1]" : "сороковых@")
|
||||
| ("4[E2]" : "четыремстам")
|
||||
| ("4[E2]" : "четыреста")
|
||||
| ("4[E2]" : "четырехсот")
|
||||
| ("4[E2]" : "четырехсотая@")
|
||||
| ("4[E2]" : "четырехсотого@")
|
||||
| ("4[E2]" : "четырехсотое@")
|
||||
| ("4[E2]" : "четырехсотой@")
|
||||
| ("4[E2]" : "четырехсотом@")
|
||||
| ("4[E2]" : "четырехсотому@")
|
||||
| ("4[E2]" : "четырехсотую@")
|
||||
| ("4[E2]" : "четырехсотые@")
|
||||
| ("4[E2]" : "четырехсотый@")
|
||||
| ("4[E2]" : "четырехсотым@")
|
||||
| ("4[E2]" : "четырехсотыми@")
|
||||
| ("4[E2]" : "четырехсотых@")
|
||||
| ("4[E2]" : "четырехстах")
|
||||
| ("4[E2]" : "четырьмястами")
|
||||
| ("5[E1]" : "пятидесятая@")
|
||||
| ("5[E1]" : "пятидесяти")
|
||||
| ("5[E1]" : "пятидесятого@")
|
||||
| ("5[E1]" : "пятидесятое@")
|
||||
| ("5[E1]" : "пятидесятой@")
|
||||
| ("5[E1]" : "пятидесятом@")
|
||||
| ("5[E1]" : "пятидесятому@")
|
||||
| ("5[E1]" : "пятидесятую@")
|
||||
| ("5[E1]" : "пятидесятые@")
|
||||
| ("5[E1]" : "пятидесятый@")
|
||||
| ("5[E1]" : "пятидесятым@")
|
||||
| ("5[E1]" : "пятидесятыми@")
|
||||
| ("5[E1]" : "пятидесятых@")
|
||||
| ("5[E1]" : "пятьдесят")
|
||||
| ("5[E1]" : "пятьюдесятью")
|
||||
| ("5[E2]" : "пятисот")
|
||||
| ("5[E2]" : "пятисотая@")
|
||||
| ("5[E2]" : "пятисотого@")
|
||||
| ("5[E2]" : "пятисотое@")
|
||||
| ("5[E2]" : "пятисотой@")
|
||||
| ("5[E2]" : "пятисотом@")
|
||||
| ("5[E2]" : "пятисотому@")
|
||||
| ("5[E2]" : "пятисотую@")
|
||||
| ("5[E2]" : "пятисотые@")
|
||||
| ("5[E2]" : "пятисотый@")
|
||||
| ("5[E2]" : "пятисотым@")
|
||||
| ("5[E2]" : "пятисотыми@")
|
||||
| ("5[E2]" : "пятисотых@")
|
||||
| ("5[E2]" : "пятистам")
|
||||
| ("5[E2]" : "пятистах")
|
||||
| ("5[E2]" : "пятьсот")
|
||||
| ("5[E2]" : "пятьюстами")
|
||||
| ("6[E1]" : "шестидесятая@")
|
||||
| ("6[E1]" : "шестидесяти")
|
||||
| ("6[E1]" : "шестидесятого@")
|
||||
| ("6[E1]" : "шестидесятое@")
|
||||
| ("6[E1]" : "шестидесятой@")
|
||||
| ("6[E1]" : "шестидесятом@")
|
||||
| ("6[E1]" : "шестидесятому@")
|
||||
| ("6[E1]" : "шестидесятую@")
|
||||
| ("6[E1]" : "шестидесятые@")
|
||||
| ("6[E1]" : "шестидесятый@")
|
||||
| ("6[E1]" : "шестидесятым@")
|
||||
| ("6[E1]" : "шестидесятыми@")
|
||||
| ("6[E1]" : "шестидесятых@")
|
||||
| ("6[E1]" : "шестьдесят")
|
||||
| ("6[E1]" : "шестьюдесятью")
|
||||
| ("6[E2]" : "шестисот")
|
||||
| ("6[E2]" : "шестисотая@")
|
||||
| ("6[E2]" : "шестисотого@")
|
||||
| ("6[E2]" : "шестисотое@")
|
||||
| ("6[E2]" : "шестисотой@")
|
||||
| ("6[E2]" : "шестисотом@")
|
||||
| ("6[E2]" : "шестисотому@")
|
||||
| ("6[E2]" : "шестисотую@")
|
||||
| ("6[E2]" : "шестисотые@")
|
||||
| ("6[E2]" : "шестисотый@")
|
||||
| ("6[E2]" : "шестисотым@")
|
||||
| ("6[E2]" : "шестисотыми@")
|
||||
| ("6[E2]" : "шестисотых@")
|
||||
| ("6[E2]" : "шестистам")
|
||||
| ("6[E2]" : "шестистах")
|
||||
| ("6[E2]" : "шестьсот")
|
||||
| ("6[E2]" : "шестьюстами")
|
||||
| ("7[E1]" : "семидесятая@")
|
||||
| ("7[E1]" : "семидесяти")
|
||||
| ("7[E1]" : "семидесятого@")
|
||||
| ("7[E1]" : "семидесятое@")
|
||||
| ("7[E1]" : "семидесятой@")
|
||||
| ("7[E1]" : "семидесятом@")
|
||||
| ("7[E1]" : "семидесятому@")
|
||||
| ("7[E1]" : "семидесятую@")
|
||||
| ("7[E1]" : "семидесятые@")
|
||||
| ("7[E1]" : "семидесятый@")
|
||||
| ("7[E1]" : "семидесятым@")
|
||||
| ("7[E1]" : "семидесятыми@")
|
||||
| ("7[E1]" : "семидесятых@")
|
||||
| ("7[E1]" : "семьдесят")
|
||||
| ("7[E1]" : "семьюдесятью")
|
||||
| ("7[E2]" : "семисот")
|
||||
| ("7[E2]" : "семисотая@")
|
||||
| ("7[E2]" : "семисотого@")
|
||||
| ("7[E2]" : "семисотое@")
|
||||
| ("7[E2]" : "семисотой@")
|
||||
| ("7[E2]" : "семисотом@")
|
||||
| ("7[E2]" : "семисотому@")
|
||||
| ("7[E2]" : "семисотую@")
|
||||
| ("7[E2]" : "семисотые@")
|
||||
| ("7[E2]" : "семисотый@")
|
||||
| ("7[E2]" : "семисотым@")
|
||||
| ("7[E2]" : "семисотыми@")
|
||||
| ("7[E2]" : "семисотых@")
|
||||
| ("7[E2]" : "семистам")
|
||||
| ("7[E2]" : "семистах")
|
||||
| ("7[E2]" : "семьсот")
|
||||
| ("7[E2]" : "семьюстами")
|
||||
| ("8[E1]" : "восемьдесят")
|
||||
| ("8[E1]" : "восьмидесятая@")
|
||||
| ("8[E1]" : "восьмидесяти")
|
||||
| ("8[E1]" : "восьмидесятого@")
|
||||
| ("8[E1]" : "восьмидесятое@")
|
||||
| ("8[E1]" : "восьмидесятой@")
|
||||
| ("8[E1]" : "восьмидесятом@")
|
||||
| ("8[E1]" : "восьмидесятому@")
|
||||
| ("8[E1]" : "восьмидесятую@")
|
||||
| ("8[E1]" : "восьмидесятые@")
|
||||
| ("8[E1]" : "восьмидесятый@")
|
||||
| ("8[E1]" : "восьмидесятым@")
|
||||
| ("8[E1]" : "восьмидесятыми@")
|
||||
| ("8[E1]" : "восьмидесятых@")
|
||||
| ("8[E1]" : "восьмьюдесятью")
|
||||
| ("8[E2]" : "восемьсот")
|
||||
| ("8[E2]" : "восемьюстами")
|
||||
| ("8[E2]" : "восьмисот")
|
||||
| ("8[E2]" : "восьмисотая@")
|
||||
| ("8[E2]" : "восьмисотого@")
|
||||
| ("8[E2]" : "восьмисотое@")
|
||||
| ("8[E2]" : "восьмисотой@")
|
||||
| ("8[E2]" : "восьмисотом@")
|
||||
| ("8[E2]" : "восьмисотому@")
|
||||
| ("8[E2]" : "восьмисотую@")
|
||||
| ("8[E2]" : "восьмисотые@")
|
||||
| ("8[E2]" : "восьмисотый@")
|
||||
| ("8[E2]" : "восьмисотым@")
|
||||
| ("8[E2]" : "восьмисотыми@")
|
||||
| ("8[E2]" : "восьмисотых@")
|
||||
| ("8[E2]" : "восьмистам")
|
||||
| ("8[E2]" : "восьмистах")
|
||||
| ("8[E2]" : "восьмьюстами")
|
||||
| ("9[E1]" : "девяноста")
|
||||
| ("9[E1]" : "девяностая@")
|
||||
| ("9[E1]" : "девяносто")
|
||||
| ("9[E1]" : "девяностого@")
|
||||
| ("9[E1]" : "девяностое@")
|
||||
| ("9[E1]" : "девяностой@")
|
||||
| ("9[E1]" : "девяностом@")
|
||||
| ("9[E1]" : "девяностому@")
|
||||
| ("9[E1]" : "девяностую@")
|
||||
| ("9[E1]" : "девяностые@")
|
||||
| ("9[E1]" : "девяностый@")
|
||||
| ("9[E1]" : "девяностым@")
|
||||
| ("9[E1]" : "девяностыми@")
|
||||
| ("9[E1]" : "девяностых@")
|
||||
| ("9[E2]" : "девятисот")
|
||||
| ("9[E2]" : "девятисотая@")
|
||||
| ("9[E2]" : "девятисотого@")
|
||||
| ("9[E2]" : "девятисотое@")
|
||||
| ("9[E2]" : "девятисотой@")
|
||||
| ("9[E2]" : "девятисотом@")
|
||||
| ("9[E2]" : "девятисотому@")
|
||||
| ("9[E2]" : "девятисотую@")
|
||||
| ("9[E2]" : "девятисотые@")
|
||||
| ("9[E2]" : "девятисотый@")
|
||||
| ("9[E2]" : "девятисотым@")
|
||||
| ("9[E2]" : "девятисотыми@")
|
||||
| ("9[E2]" : "девятисотых@")
|
||||
| ("9[E2]" : "девятистам")
|
||||
| ("9[E2]" : "девятистах")
|
||||
| ("9[E2]" : "девятьсот")
|
||||
| ("9[E2]" : "девятьюстами")]
|
||||
;
|
||||
|
||||
# Lexicon stage 2: applies the lexset2 mappings (digit-plus-power keys such as
# "8[E1]" or "9[E2]") anywhere in the string, with empty left/right contexts.
# NOTE(review): I and space are defined elsewhere; presumably I[space] inserts
# a separator after each verbalized word -- confirm.
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
lexset1 = Optimize[
|
||||
("+" : "")
|
||||
| ("1" : "один")
|
||||
| ("1" : "одна")
|
||||
| ("1" : "одни")
|
||||
| ("1" : "одним")
|
||||
| ("1" : "одними")
|
||||
| ("1" : "одних")
|
||||
| ("1" : "одно")
|
||||
| ("1" : "одного")
|
||||
| ("1" : "одной")
|
||||
| ("1" : "одном")
|
||||
| ("1" : "одному")
|
||||
| ("1" : "одною")
|
||||
| ("1" : "одну")
|
||||
| ("1" : "первая@")
|
||||
| ("1" : "первого@")
|
||||
| ("1" : "первое@")
|
||||
| ("1" : "первой@")
|
||||
| ("1" : "первом@")
|
||||
| ("1" : "первому@")
|
||||
| ("1" : "первую@")
|
||||
| ("1" : "первые@")
|
||||
| ("1" : "первый@")
|
||||
| ("1" : "первым@")
|
||||
| ("1" : "первыми@")
|
||||
| ("1" : "первых@")
|
||||
| ("2" : "вторая@")
|
||||
| ("2" : "второго@")
|
||||
| ("2" : "второе@")
|
||||
| ("2" : "второй@")
|
||||
| ("2" : "втором@")
|
||||
| ("2" : "второму@")
|
||||
| ("2" : "вторую@")
|
||||
| ("2" : "вторые@")
|
||||
| ("2" : "вторым@")
|
||||
| ("2" : "вторыми@")
|
||||
| ("2" : "вторых@")
|
||||
| ("2" : "два")
|
||||
| ("2" : "две")
|
||||
| ("2" : "двум")
|
||||
| ("2" : "двумя")
|
||||
| ("2" : "двух")
|
||||
| ("3" : "трем")
|
||||
| ("3" : "тремя")
|
||||
| ("3" : "третий@")
|
||||
| ("3" : "третье@")
|
||||
| ("3" : "третьего@")
|
||||
| ("3" : "третьей@")
|
||||
| ("3" : "третьем@")
|
||||
| ("3" : "третьему@")
|
||||
| ("3" : "третьи@")
|
||||
| ("3" : "третьим@")
|
||||
| ("3" : "третьими@")
|
||||
| ("3" : "третьих@")
|
||||
| ("3" : "третью@")
|
||||
| ("3" : "третья@")
|
||||
| ("3" : "трех")
|
||||
| ("3" : "три")
|
||||
| ("4" : "четвертая@")
|
||||
| ("4" : "четвертого@")
|
||||
| ("4" : "четвертое@")
|
||||
| ("4" : "четвертой@")
|
||||
| ("4" : "четвертом@")
|
||||
| ("4" : "четвертому@")
|
||||
| ("4" : "четвертую@")
|
||||
| ("4" : "четвертые@")
|
||||
| ("4" : "четвертый@")
|
||||
| ("4" : "четвертым@")
|
||||
| ("4" : "четвертыми@")
|
||||
| ("4" : "четвертых@")
|
||||
| ("4" : "четыре")
|
||||
| ("4" : "четырем")
|
||||
| ("4" : "четырех")
|
||||
| ("4" : "четырьмя")
|
||||
| ("5" : "пятая@")
|
||||
| ("5" : "пяти")
|
||||
| ("5" : "пятого@")
|
||||
| ("5" : "пятое@")
|
||||
| ("5" : "пятой@")
|
||||
| ("5" : "пятом@")
|
||||
| ("5" : "пятому@")
|
||||
| ("5" : "пятую@")
|
||||
| ("5" : "пятые@")
|
||||
| ("5" : "пятый@")
|
||||
| ("5" : "пятым@")
|
||||
| ("5" : "пятыми@")
|
||||
| ("5" : "пятых@")
|
||||
| ("5" : "пять")
|
||||
| ("5" : "пятью")
|
||||
| ("6" : "шестая@")
|
||||
| ("6" : "шести")
|
||||
| ("6" : "шестого@")
|
||||
| ("6" : "шестое@")
|
||||
| ("6" : "шестой@")
|
||||
| ("6" : "шестом@")
|
||||
| ("6" : "шестому@")
|
||||
| ("6" : "шестую@")
|
||||
| ("6" : "шестые@")
|
||||
| ("6" : "шестым@")
|
||||
| ("6" : "шестыми@")
|
||||
| ("6" : "шестых@")
|
||||
| ("6" : "шесть")
|
||||
| ("6" : "шестью")
|
||||
| ("7" : "седьмая@")
|
||||
| ("7" : "седьмого@")
|
||||
| ("7" : "седьмое@")
|
||||
| ("7" : "седьмой@")
|
||||
| ("7" : "седьмом@")
|
||||
| ("7" : "седьмому@")
|
||||
| ("7" : "седьмую@")
|
||||
| ("7" : "седьмые@")
|
||||
| ("7" : "седьмым@")
|
||||
| ("7" : "седьмыми@")
|
||||
| ("7" : "седьмых@")
|
||||
| ("7" : "семи")
|
||||
| ("7" : "семь")
|
||||
| ("7" : "семью")
|
||||
| ("8" : "восемь")
|
||||
| ("8" : "восьмая@")
|
||||
| ("8" : "восьми")
|
||||
| ("8" : "восьмого@")
|
||||
| ("8" : "восьмое@")
|
||||
| ("8" : "восьмой@")
|
||||
| ("8" : "восьмом@")
|
||||
| ("8" : "восьмому@")
|
||||
| ("8" : "восьмую@")
|
||||
| ("8" : "восьмые@")
|
||||
| ("8" : "восьмым@")
|
||||
| ("8" : "восьмыми@")
|
||||
| ("8" : "восьмых@")
|
||||
| ("8" : "восьмью")
|
||||
| ("9" : "девятая@")
|
||||
| ("9" : "девяти")
|
||||
| ("9" : "девятого@")
|
||||
| ("9" : "девятое@")
|
||||
| ("9" : "девятой@")
|
||||
| ("9" : "девятом@")
|
||||
| ("9" : "девятому@")
|
||||
| ("9" : "девятую@")
|
||||
| ("9" : "девятые@")
|
||||
| ("9" : "девятый@")
|
||||
| ("9" : "девятым@")
|
||||
| ("9" : "девятыми@")
|
||||
| ("9" : "девятых@")
|
||||
| ("9" : "девять")
|
||||
| ("9" : "девятью")
|
||||
| ("[E3]" : "тысяч")
|
||||
| ("[E3]" : "тысяча")
|
||||
| ("[E3]" : "тысячам")
|
||||
| ("[E3]" : "тысячами")
|
||||
| ("[E3]" : "тысячах")
|
||||
| ("[E3]" : "тысяче")
|
||||
| ("[E3]" : "тысячей")
|
||||
| ("[E3]" : "тысячи")
|
||||
| ("[E3]" : "тысячу")
|
||||
| ("[E3]" : "тысячью")
|
||||
| ("[E6]" : "миллион")
|
||||
| ("[E6]" : "миллиона")
|
||||
| ("[E6]" : "миллионам")
|
||||
| ("[E6]" : "миллионами")
|
||||
| ("[E6]" : "миллионах")
|
||||
| ("[E6]" : "миллионе")
|
||||
| ("[E6]" : "миллионов")
|
||||
| ("[E6]" : "миллионом")
|
||||
| ("[E6]" : "миллиону")
|
||||
| ("[E6]" : "миллионы")
|
||||
| ("[E9]" : "миллиард")
|
||||
| ("[E9]" : "миллиарда")
|
||||
| ("[E9]" : "миллиардам")
|
||||
| ("[E9]" : "миллиардами")
|
||||
| ("[E9]" : "миллиардах")
|
||||
| ("[E9]" : "миллиарде")
|
||||
| ("[E9]" : "миллиардов")
|
||||
| ("[E9]" : "миллиардом")
|
||||
| ("[E9]" : "миллиарду")
|
||||
| ("[E9]" : "миллиарды")
|
||||
| ("|0|" : "ноле")
|
||||
| ("|0|" : "нолем")
|
||||
| ("|0|" : "ноль")
|
||||
| ("|0|" : "нолю")
|
||||
| ("|0|" : "ноля")
|
||||
| ("|0|" : "нуле")
|
||||
| ("|0|" : "нулем")
|
||||
| ("|0|" : "нуль")
|
||||
| ("|0|" : "нулю")
|
||||
| ("|0|" : "нуля")]
|
||||
;
|
||||
|
||||
# Lexicon stage 1: applies the lexset1 mappings (bare digits, the "[E3]",
# "[E6]" and "[E9]" power markers, "|0|" and "+") anywhere in the string, with
# empty left/right contexts.
# NOTE(review): I and space are defined elsewhere; presumably I[space] inserts
# a separator after each verbalized word -- confirm.
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
||||
|
||||
# Full lexicon transducer: lex3, lex2 and lex1 composed in that order, so the
# digit-plus-power keys handled by lex2 (e.g. "8[E1]") are rewritten before
# lex1 rewrites the remaining bare digits and power markers.
# NOTE(review): lex3 is assumed to be defined earlier in this file.
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
||||
|
||||
# Power markers that have lexicon words of their own in lexset1 (the forms of
# "тысяча", "миллион" and "миллиард"), whereas the [E1]/[E2] entries of
# lexset2 are keyed on a digit plus the marker.
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
||||
|
||||
# END LANGUAGE SPECIFIC DATA
|
||||
################################################################################
|
||||
# Inserts a marker after the Ms.
|
||||
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
||||
|
||||
# Deletes all powers and "+".
|
||||
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
||||
|
||||
# Deletes leading zeros at the beginning of a number, so that "0003" does not
|
||||
# get treated as an ordinary number.
|
||||
export DELETE_INITIAL_ZEROS =
|
||||
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
||||
;
|
||||
|
||||
# Every power marker in POWERS that is not one of the Ms.
# NOTE(review): POWERS and Ms are defined elsewhere; Ms presumably holds the
# markers after which INSERT_BOUNDARY places a "%" boundary -- confirm.
NonMs = Optimize[POWERS - Ms];
|
||||
|
||||
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be
|
||||
# deleted
|
||||
export DELETE_INTERMEDIATE_ZEROS1 =
|
||||
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
||||
;
|
||||
|
||||
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
||||
# that and the previous boundary. Thus, if after the result of the rule above we
|
||||
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
||||
# zero.
|
||||
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
||||
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
||||
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
# Final clean up of stray zeros.
|
||||
export DELETE_REMAINING_ZEROS = Optimize[
|
||||
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
||||
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
# Applies the revaluation map. For example in English, change [E4] to [E1] as a
|
||||
# modifier of [E3]
|
||||
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
||||
|
||||
# Deletes the various marks and powers in the input and output.
|
||||
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
||||
|
||||
# Normalizes spacing in three composed steps:
#   1. a run of one or more spaces between two b.kNotSpace symbols becomes a
#      single space;
#   2. any spaces at the beginning of the string ([BOS]) are removed;
#   3. any spaces at the end of the string ([EOS]) are removed.
export CLEAN_SPACES = Optimize[
|
||||
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
||||
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
||||
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
||||
;
|
||||
|
||||
d = b.kDigit;
|
||||
|
||||
# Germanic inversion rule.
|
||||
# One alternative per unit digit N in 1..9: matches a digit followed by "[E1]"
# and then "+N", and moves the unit in front of the tens, i.e. rewrites
# d[E1]+N as N+d[E1].
# NOTE(review): assumes I[...] inserts and D[...] deletes its argument, as the
# helper names suggest -- confirm against their definitions.
germanic =
|
||||
(I["1+"] d "[E1]" D["+1"])
|
||||
| (I["2+"] d "[E1]" D["+2"])
|
||||
| (I["3+"] d "[E1]" D["+3"])
|
||||
| (I["4+"] d "[E1]" D["+4"])
|
||||
| (I["5+"] d "[E1]" D["+5"])
|
||||
| (I["6+"] d "[E1]" D["+6"])
|
||||
| (I["7+"] d "[E1]" D["+7"])
|
||||
| (I["8+"] d "[E1]" D["+8"])
|
||||
| (I["9+"] d "[E1]" D["+9"])
|
||||
;
|
||||
|
||||
# Applies the germanic rule with no context restriction, left to right ('ltr')
# and optionally ('opt'), so each match may or may not be rewritten.
# Not referenced by the GERMANIC_INVERSION export in this file.
germanic_inversion =
|
||||
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
||||
;
|
||||
|
||||
# Exported as SIGMA_STAR rather than as the germanic_inversion rule defined in
# this file, so no tens/units inversion is applied for this language.
# NOTE(review): assumes SIGMA_STAR accepts every string unchanged -- confirm.
export GERMANIC_INVERSION = SIGMA_STAR;
|
||||
# Keeps only strings in which "@" occurs exactly once, as the last symbol
# ((SIGMA - "@")* "@"), and then rewrites that "@" away. In the lexicon of this
# file the entries carrying a trailing "@" are the ordinal forms
# (e.g. "первая@").
# NOTE(review): assumes D[...] deletes its argument -- confirm.
export ORDINAL_RESTRICTION =
|
||||
Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]]
|
||||
;
|
||||
nondigits = b.kBytes - b.kDigit;
|
||||
# Matches any (possibly empty) run of non-digit bytes.
# NOTE(review): presumably D[...] deletes it, stripping a written ordinal
# suffix that follows the digits -- confirm against the caller.
export ORDINAL_SUFFIX = D[nondigits*];
|
|
@ -1,77 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# This verbalizer is used whenever there is an LM symbol that consists of
|
||||
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
||||
# suffix.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/classifier/cyrillic.grm' as c;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# A single digit, verbalized through the cardinal-number grammar.
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
||||
|
||||
# Verbalization of one character of a spelled-out token:
#   - a Latin letter (either case) becomes "letter-<lowercase letter>";
#   - a digit is read as a cardinal number;
#   - "&" becomes the "@@AND@@" token;
#   - ".", "-", "_" and "/" are dropped;
#   - a c.kCyrillicAlpha symbol is kept and prefixed with "letter-".
# NOTE(review): assumes n.I[...] inserts its argument -- confirm.
char_set = (("a" | "A") : "letter-a")
|
||||
| (("b" | "B") : "letter-b")
|
||||
| (("c" | "C") : "letter-c")
|
||||
| (("d" | "D") : "letter-d")
|
||||
| (("e" | "E") : "letter-e")
|
||||
| (("f" | "F") : "letter-f")
|
||||
| (("g" | "G") : "letter-g")
|
||||
| (("h" | "H") : "letter-h")
|
||||
| (("i" | "I") : "letter-i")
|
||||
| (("j" | "J") : "letter-j")
|
||||
| (("k" | "K") : "letter-k")
|
||||
| (("l" | "L") : "letter-l")
|
||||
| (("m" | "M") : "letter-m")
|
||||
| (("n" | "N") : "letter-n")
|
||||
| (("o" | "O") : "letter-o")
|
||||
| (("p" | "P") : "letter-p")
|
||||
| (("q" | "Q") : "letter-q")
|
||||
| (("r" | "R") : "letter-r")
|
||||
| (("s" | "S") : "letter-s")
|
||||
| (("t" | "T") : "letter-t")
|
||||
| (("u" | "U") : "letter-u")
|
||||
| (("v" | "V") : "letter-v")
|
||||
| (("w" | "W") : "letter-w")
|
||||
| (("x" | "X") : "letter-x")
|
||||
| (("y" | "Y") : "letter-y")
|
||||
| (("z" | "Z") : "letter-z")
|
||||
| (digit)
|
||||
| ("&" : "@@AND@@")
|
||||
| ("." : "")
|
||||
| ("-" : "")
|
||||
| ("_" : "")
|
||||
| ("/" : "")
|
||||
| (n.I["letter-"] c.kCyrillicAlpha)
|
||||
;
|
||||
|
||||
ins_space = "" : " ";
|
||||
|
||||
suffix = "{spelled}" : "";
|
||||
|
||||
# One or more char_set symbols with a space inserted between consecutive ones,
# followed by the "{spelled}" tag, which is removed.
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
||||
|
||||
# Exported spelled-token verbalizer: the output of spelled is passed through
# l.LEXICAL_MAP.
# NOTE(review): presumably LEXICAL_MAP turns the "letter-x" and "@@...@@"
# tokens into actual words -- confirm in lexical_map.grm.
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
||||
|
||||
sigma_star = b.kBytes*;
|
||||
|
||||
# Gets rid of the letter- prefix since in some cases we don't want it.
|
||||
|
||||
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
||||
|
||||
# Same character-by-character spelling as spelled, but the input carries no
# "{spelled}" tag.
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
||||
|
||||
# Spells an untagged token character by character and then applies del_letter
# to every "letter-" prefix. Unlike SPELLED, the result is not passed through
# l.LEXICAL_MAP.
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
@ -1,24 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
|
||||
# Maps a single punctuation mark (".", ",", "!" or "?") to its
# spoken-punctuation token.
punct =
|
||||
("." : "@@PERIOD@@")
|
||||
| ("," : "@@COMMA@@")
|
||||
| ("!" : "@@EXCLAMATION_MARK@@")
|
||||
| ("?" : "@@QUESTION_MARK@@")
|
||||
;
|
||||
|
||||
# Exported spoken-punctuation verbalizer: the punctuation token produced by
# punct is passed through l.LEXICAL_MAP.
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
|
@ -1,108 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/byte.grm' as b;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
|
||||
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
||||
|
||||
# Maps an hour ("0" to "23", no leading zero) to the following hour. Used by
# time_quarter_to, which reads H:45 relative to the next hour.
# Two entries are not a plain +1: "12" maps to "1" (12-hour reading) and "23"
# maps to "12" rather than "0" or "24".
# NOTE(review): "23" -> "12" presumably makes 23:45 read as a quarter to
# twelve (midnight) -- confirm this is intended.
increment_hour =
|
||||
("0" : "1")
|
||||
| ("1" : "2")
|
||||
| ("2" : "3")
|
||||
| ("3" : "4")
|
||||
| ("4" : "5")
|
||||
| ("5" : "6")
|
||||
| ("6" : "7")
|
||||
| ("7" : "8")
|
||||
| ("8" : "9")
|
||||
| ("9" : "10")
|
||||
| ("10" : "11")
|
||||
| ("11" : "12")
|
||||
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
||||
| ("13" : "14")
|
||||
| ("14" : "15")
|
||||
| ("15" : "16")
|
||||
| ("16" : "17")
|
||||
| ("17" : "18")
|
||||
| ("18" : "19")
|
||||
| ("19" : "20")
|
||||
| ("20" : "21")
|
||||
| ("21" : "22")
|
||||
| ("22" : "23")
|
||||
| ("23" : "12")
|
||||
;
|
||||
|
||||
# The accepted hour strings: the input side of increment_hour, i.e. "0"
# through "23" written without a leading zero.
hours = Project[increment_hour, 'input'];
|
||||
|
||||
d = b.kDigit;
|
||||
D = d - "0";
|
||||
|
||||
# Minutes 01-09: a "0" followed by a non-zero digit.
minutes09 = "0" D;
|
||||
|
||||
# Minutes 10-59: a tens digit from 1 to 5 followed by any digit.
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
||||
|
||||
__sep__ = ":";
|
||||
sep_space = __sep__ : " ";
|
||||
|
||||
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
||||
|
||||
# Verbalizes the two-digit minutes field:
#   "00"       -> the "@@HOUR@@" token;
#   "01".."09" -> "@@TIME_ZERO@@", then n.I[" "], then the units digit read
#                 as a cardinal;
#   "10".."59" -> the whole number read as a cardinal.
# NOTE(review): assumes n.I[" "] inserts a space -- confirm.
verbalize_minutes =
|
||||
("00" : "@@HOUR@@")
|
||||
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
||||
| (minutes @ n.CARDINAL_NUMBERS)
|
||||
;
|
||||
|
||||
# Plain reading of H:MM: the verbalized hour, the ":" separator replaced by a
# space, then the verbalized minutes.
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
||||
|
||||
# Special cases we handle right now.
|
||||
# TODO: Need to allow for cases like
|
||||
#
|
||||
# half twelve (in the UK English sense)
|
||||
# half twaalf (in the Dutch sense)
|
||||
|
||||
# H:15 read as "quarter after H": the quarter/after tokens come first, then
# the verbalized hour; the ":15" part of the input produces no output.
# NOTE(review): assumes n.I[...] inserts and n.D[...] deletes -- confirm.
time_quarter_past =
|
||||
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
||||
verbalize_hours
|
||||
n.D[__sep__ "15"];
|
||||
|
||||
# H:30 read as "half after H": the half/after tokens come first, then the
# verbalized hour; the ":30" part of the input produces no output.
# NOTE(review): assumes n.I[...] inserts and n.D[...] deletes -- confirm.
time_half_past =
|
||||
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
||||
verbalize_hours
|
||||
n.D[__sep__ "30"];
|
||||
|
||||
# H:45 read as "quarter before H+1": the quarter/before tokens come first,
# then the hour is mapped to the following hour by increment_hour and
# verbalized; the ":45" part of the input produces no output.
# NOTE(review): assumes n.I[...] inserts and n.D[...] deletes -- confirm.
time_quarter_to =
|
||||
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
||||
(increment_hour @ verbalize_hours)
|
||||
n.D[__sep__ "45"];
|
||||
|
||||
time_extra = Optimize[
|
||||
time_quarter_past | time_half_past | time_quarter_to]
|
||||
;
|
||||
|
||||
# Basic time periods which most languages can be expected to have.
|
||||
__am__ = "a.m." | "am" | "AM" | "утра";
|
||||
__pm__ = "p.m." | "pm" | "PM" | "вечера";
|
||||
|
||||
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
||||
|
||||
time_variants = time_basic | time_extra;
|
||||
|
||||
# A time expression with an optional AM/PM period either before or after it.
# The period is separated from the time by a space that is either already in
# the input (" ") or supplied by n.I[" "].
time = Optimize[
|
||||
(period (" " | n.I[" "]))? time_variants
|
||||
| time_variants ((" " | n.I[" "]) period)?]
|
||||
;
|
||||
|
||||
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
@ -1,68 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Rules for URLs and email addresses.
|
||||
|
||||
import 'util/byte.grm' as bytelib;
|
||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
||||
|
||||
ins_space = "" : " ";
|
||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
||||
at = "@" : "@@AT@@";
|
||||
|
||||
# Recognized domain suffixes: only ".com", ".gov", ".edu", ".org" and ".net".
# Each is rewritten using the dot token, a space and the suffix text; ".edu"
# is written out letter by letter ("e d u"), the others as a whole word.
url_suffix =
|
||||
(".com" : dot ins_space "com") |
|
||||
(".gov" : dot ins_space "gov") |
|
||||
(".edu" : dot ins_space "e d u") |
|
||||
(".org" : dot ins_space "org") |
|
||||
(".net" : dot ins_space "net")
|
||||
;
|
||||
|
||||
# One or more alphanumeric characters, passed through unchanged as one string.
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
|
||||
|
||||
# One or more letter_strings joined by dots; each joining dot becomes the dot
# token with a space inserted on either side.
letter_string_dot =
|
||||
((letter_string ins_space dot ins_space)* letter_string)
|
||||
;
|
||||
|
||||
# Rules for URLs.
|
||||
# A URL of the form label(.label)*<suffix>: the dot-separated labels are kept
# as whole strings, followed by an inserted space and one of the url_suffix
# endings. The result is passed through l.LEXICAL_MAP. No scheme, path or
# other URL parts are matched by this rule.
export URL = Optimize[
|
||||
((letter_string_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
# Rules for email addresses.
|
||||
# One or more alphanumeric characters with a space inserted between
# consecutive characters, i.e. the string is spelled out.
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
||||
|
||||
letter_by_letter_dot =
|
||||
((letter_by_letter ins_space dot ins_space)*
|
||||
letter_by_letter)
|
||||
;
|
||||
|
||||
# Email address, variant 1: the local part is spelled letter by letter, "@"
# becomes the "@@AT@@" token, and the domain labels are also spelled letter by
# letter, followed by one of the url_suffix endings. The result is passed
# through l.LEXICAL_MAP.
export EMAIL1 = Optimize[
|
||||
((letter_by_letter) (ins_space)
|
||||
(at) (ins_space)
|
||||
(letter_by_letter_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
# Email address, variant 2: the local part is spelled letter by letter, "@"
# becomes the "@@AT@@" token, and the domain labels are kept as whole strings
# (letter_string_dot), followed by one of the url_suffix endings. The result
# is passed through l.LEXICAL_MAP.
export EMAIL2 = Optimize[
|
||||
((letter_by_letter) (ins_space)
|
||||
(at) (ins_space)
|
||||
(letter_string_dot) (ins_space)
|
||||
(url_suffix)) @ l.LEXICAL_MAP
|
||||
];
|
||||
|
||||
export EMAILS = Optimize[
|
||||
EMAIL1 | EMAIL2
|
||||
];
|
@ -1,42 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import 'util/util.grm' as util;
|
||||
import 'ru/verbalizer/extra_numbers.grm' as e;
|
||||
import 'ru/verbalizer/float.grm' as f;
|
||||
import 'ru/verbalizer/math.grm' as ma;
|
||||
import 'ru/verbalizer/miscellaneous.grm' as mi;
|
||||
import 'ru/verbalizer/money.grm' as mo;
|
||||
import 'ru/verbalizer/numbers.grm' as n;
|
||||
import 'ru/verbalizer/numbers_plus.grm' as np;
|
||||
import 'ru/verbalizer/spelled.grm' as s;
|
||||
import 'ru/verbalizer/spoken_punct.grm' as sp;
|
||||
import 'ru/verbalizer/time.grm' as t;
|
||||
import 'ru/verbalizer/urls.grm' as u;
|
||||
|
||||
# Top-level verbalizer: the union of the category verbalizers listed below,
# composed with util.CLEAN_SPACES, with weights removed from the result
# (RmWeight) before optimization.
# From the URL grammar only u.URL is included; u.EMAILS is not part of this
# union.
export VERBALIZER = Optimize[RmWeight[
|
||||
( e.MIXED_NUMBERS
|
||||
| e.DIGITS
|
||||
| f.FLOAT
|
||||
| ma.ARITHMETIC
|
||||
| mi.MISCELLANEOUS
|
||||
| mo.MONEY
|
||||
| n.CARDINAL_NUMBERS
|
||||
| n.ORDINAL_NUMBERS
|
||||
| np.NUMBERS_PLUS
|
||||
| s.SPELLED
|
||||
| sp.SPOKEN_PUNCT
|
||||
| t.TIME
|
||||
| u.URL) @ util.CLEAN_SPACES
|
||||
]];
|
@ -1,3 +0,0 @@
|
||||
# Language-universal grammar definitions
|
||||
|
||||
This directory contains various language-universal grammar definitions.
|
|
@ -1,126 +0,0 @@
|
||||
# Copyright 2017 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Specifies common ways of delimiting thousands in digit strings.
|
||||
|
||||
import 'util/byte.grm' as bytelib;
|
||||
import 'util/util.grm' as util;
|
||||
|
||||
killcomma = "," : "";
|
||||
dot2comma = "." : ",";
|
||||
spaces2comma = " "+ : ",";
|
||||
|
||||
zero = "0";
|
||||
|
||||
# no_delimiter = zero | "[1-9][0-9]*";
|
||||
export no_delimiter = zero | (util.d1to9 bytelib.kDigit*);
|
||||
|
||||
# delim_map_dot = ("[0-9]" | ("\." : ","))*;
|
||||
delim_map_dot = (bytelib.kDigit | dot2comma)*;
|
||||
|
||||
# delim_map_space = ("[0-9]" | (" +" : ","))*;
|
||||
delim_map_space = (bytelib.kDigit | spaces2comma)*;
|
||||
|
||||
## Western systems group thousands. Korean goes this way too.
|
||||
|
||||
# comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*);
|
||||
export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*);
|
||||
|
||||
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
||||
# cannot match on input labels (sort?).
|
||||
# Dot-delimited thousands (e.g. "1.234.567"): every "." is first mapped to ","
# by delim_map_dot, and the result must then match comma_thousands, which
# removes the commas.
export dot_thousands = delim_map_dot @ comma_thousands;
|
||||
|
||||
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
||||
# cannot match on input labels (sort?).
|
||||
# Space-delimited thousands (e.g. "1 234 567"): every run of spaces is first
# mapped to "," by delim_map_space, and the result must then match
# comma_thousands, which removes the commas.
export space_thousands = delim_map_space @ comma_thousands;
|
||||
|
||||
## Chinese prefers grouping by fours (by ten-thousands).
|
||||
|
||||
# chinese_comma =
|
||||
# zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*);
|
||||
export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*);
|
||||
|
||||
## The Indian system is more complex because of the Stravinskian alternation
|
||||
## between lakhs and crores.
|
||||
##
|
||||
## According to Wikipedia:
|
||||
##
|
||||
## Indian English Value
|
||||
## One 1
|
||||
## Ten 10
|
||||
## Hundred 100
|
||||
## Thousand 1,000
|
||||
## Lakh 1,00,000
|
||||
## Crore 1,00,00,000
|
||||
## Arab 1,00,00,00,000
|
||||
## Kharab 1,00,00,00,00,000
|
||||
|
||||
# indian_hundreds = "[1-9][0-9]?[0-9]?";
|
||||
indian_hundreds = util.d1to9 bytelib.kDigit{0,2};
|
||||
|
||||
## Up to 99,999.
|
||||
|
||||
# indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]";
|
||||
indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3};
|
||||
|
||||
## Up to 99,99,999.
|
||||
|
||||
# indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]";
|
||||
indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3};
|
||||
|
||||
## Up to 999,99,99,999
|
||||
|
||||
indian_comma_crores =
|
||||
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
|
||||
(bytelib.kDigit{2} killcomma)?
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{3}
|
||||
;
|
||||
|
||||
## Up to 99,999,99,99,999.
|
||||
|
||||
indian_comma_thousand_crores =
|
||||
util.d1to9 bytelib.kDigit? killcomma
|
||||
bytelib.kDigit{3} killcomma
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{3}
|
||||
;
|
||||
|
||||
## Up to 999,99,999,99,99,999.
|
||||
|
||||
indian_comma_lakh_crores =
|
||||
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{3} killcomma
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{2} killcomma
|
||||
bytelib.kDigit{3}
|
||||
;
|
||||
|
||||
# A number grouped with commas in the Indian style: either zero or any one of
# the size classes defined in this file, from plain hundreds (no comma) up to
# lakh-crores.
export indian_comma =
|
||||
zero
|
||||
| indian_hundreds
|
||||
| indian_comma_thousands
|
||||
| indian_comma_lakhs
|
||||
| indian_comma_crores
|
||||
| indian_comma_thousand_crores
|
||||
| indian_comma_lakh_crores
|
||||
;
|
||||
|
||||
# Indian number system with dots.
|
||||
export indian_dot_number = delim_map_dot @ indian_comma;
|
||||
|
||||
# Indian number system with spaces.
|
||||
export indian_space_number = delim_map_space @ indian_comma;
|
@ -1,3 +0,0 @@
|
||||
# Utility grammar definitions
|
||||
|
||||
This directory contains various utility grammar definitions.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue