parent aba37810ff
commit 3f3442b98a
@@ -1,2 +0,0 @@
data
exp
@@ -1,3 +0,0 @@
# G2P

* zh - Chinese G2P
@@ -1,53 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # even lines hold "<sentence_id> <text with #N prosody marks>"
                    sentence_id, raw_text = line.strip().split()
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id} {transcription}\n')
                else:
                    # odd lines carry the hand-annotated pinyin; skip them here
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
@@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_labels(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # even lines: "<sentence_id> <raw text>"; keep only the id
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id} ')
                else:
                    # odd lines hold the annotated pinyin transcription
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)
@@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from typing import List
from typing import Union


def erized(syllable: str) -> bool:
    """Whether the syllable contains an erhua effect.

    Example
    --------
    huar -> True
    guanr -> True
    er -> False
    """
    # note: for pinyin, len(syllable) >= 2 is always true
    # if not, there is something wrong in the data
    assert len(syllable) >= 2, f"invalid syllable {syllable}"
    return syllable[:2] != "er" and syllable[-2] == 'r'


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """
    Given a sequence of syllables from human annotation (reference),
    which makes sandhi explicit, and a sequence of syllables from a
    simple g2p program (generated), which does not consider sandhi,
    return the reference sequence with sandhi ignored.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    """
    i = 0
    j = 0

    # sandhi is ignored in the result, while other differences are kept
    result = []
    while i < len(reference):
        if erized(reference[i]):
            # an erized reference syllable corresponds to two generated syllables
            result.append(reference[i])
            i += 1
            j += 2
        elif (reference[i][:-1] == generated[j][:-1] and
              reference[i][-1] == '2' and generated[j][-1] == '3'):
            # a 3-3 -> 2-3 tone-sandhi case: keep the generated (unsandhied) tone.
            # compare with generated[j]; i and j can diverge after erhua syllables
            result.append(generated[j])
            i += 1
            j += 1
        else:
            result.append(reference[i])
            i += 1
            j += 1
    assert j == len(
        generated
    ), "length of transcriptions mismatch; there may be some characters that are ignored in the generated transcription."
    return result


def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    with open(reference, 'rt') as f_ref:
        with open(generated, 'rt') as f_gen:
            with open(output, 'wt') as f_out:
                for i, (ref, gen) in enumerate(zip(f_ref, f_gen)):
                    sentence_id, ref_transcription = ref.strip().split(' ', 1)
                    _, gen_transcription = gen.strip().split(' ', 1)
                    try:
                        result = ignore_sandhi(ref_transcription.split(),
                                               gen_transcription.split())
                        result = ' '.join(result)
                    except Exception:
                        print(
                            f"sentence_id: {sentence_id}: there is some annotation error in the reference or generated transcription; using the reference."
                        )
                        result = ref_transcription
                    f_out.write(f"{sentence_id} {result}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="get the reference transcription with tone sandhi ignored.")
    parser.add_argument(
        "--reference",
        type=str,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated", type=str, help="path to the generated transcription.")
    parser.add_argument("--output", type=str, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)
@@ -1,33 +0,0 @@
#!/bin/bash

exp_dir="exp"
data_dir="data"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

archive=${data_dir}/"BZNSYP.rar"
if [ ! -f ${archive} ]; then
    echo "Baker Dataset not found! Download it first to the data_dir."
    exit -1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
if [ ${md5_result} != ${MD5} ]; then
    echo "MD5 mismatch! The archive has been changed."
    exit -1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e ${archive} ${label_file}
cp ${filename} ${exp_dir}
rm -f ${filename}

if [ ! -f ${exp_dir}/${filename} ]; then
    echo "File extraction failed!"
    exit 1
fi

exit 0
@@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@@ -1 +0,0 @@
jieba
@@ -1,37 +0,0 @@
#!/usr/bin/env bash

source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data=data

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    mkdir -p ${data}
    test -e ${data}/BZNSYP.rar || wget -c https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar -P ${data}
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
    echo "stage 0: Extracting Prosody Labeling"
    bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data}
fi

# convert the Chinese transcription into pinyin with pypinyin or jieba+pypinyin
filename="000001-010000.txt"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing transcriptions..."
    python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/ref.pinyin

    python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/trans.pinyin
    python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/trans.jieba.pinyin
fi

echo "done"
exit 0
@@ -1 +0,0 @@
exp
@@ -1,29 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

from text_processing import normalization

parser = argparse.ArgumentParser(
    description="Normalize text in Chinese with some rules.")
parser.add_argument("input", type=str, help="the input sentences")
parser.add_argument("output", type=str, help="path to save the output file.")
args = parser.parse_args()

with open(args.input, 'rt') as fin:
    with open(args.output, 'wt') as fout:
        for sent in fin:
            sent = normalization.normalize_sentence(sent.strip())
            fout.write(sent)
            fout.write('\n')
@@ -1,8 +0,0 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${MAIN_ROOT}/third_party:${PYTHONPATH}
@@ -1,26 +0,0 @@
#!/usr/bin/env bash
source path.sh

stage=-1
stop_stage=100

exp_dir=exp
data_dir=data
filename="sentences.txt"

source ${MAIN_ROOT}/utils/parse_options.sh || exit -1

mkdir -p ${exp_dir}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "stage 1: Processing"
    python3 local/test_normalization.py ${data_dir}/${filename} ${exp_dir}/normalized.txt
    if [ -f "${exp_dir}/normalized.txt" ]; then
        echo "Normalized text saved at ${exp_dir}/normalized.txt"
    fi
    # TODO(chenfeiyu): compute edit distance against ground-truth
fi

echo "done"
exit 0
@@ -1,2 +0,0 @@
*~
*.far
@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2020 SpeechIO

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,8 +0,0 @@
# for plain text
python3 cn_tn.py example_plain.txt output_plain.txt
diff example_plain.txt output_plain.txt

# for Kaldi's trans format
python3 cn_tn.py --has_key example_kaldi.txt output_kaldi.txt
diff example_kaldi.txt output_kaldi.txt
@@ -1,24 +0,0 @@
0. Place install_thrax.sh into $KALDI_ROOT/tools/extras/.

1. Recompile OpenFst with the "--enable-grm" option needed to support Thrax:
   * cd $KALDI_ROOT/tools
   * make clean
   * edit $KALDI_ROOT/tools/Makefile and append the "--enable-grm" option to OPENFST_CONFIGURE:
     OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --enable-grm
   * make -j 10

2. Install Thrax:
   cd $KALDI_ROOT/tools
   sh extras/install_thrax.sh

3. Add the Thrax binary path to $KALDI_ROOT/tools/env.sh:
   export PATH=/path/to/your/kaldi_root/tools/thrax-1.2.9/src/bin:${PATH}

Usage:
Before running anything related to Thrax, source
   . $KALDI_ROOT/tools/env.sh
so that the Thrax binaries can be found, as is standard practice in Kaldi.

Sample usage:
   sh run_en.sh
   sh run_cn.sh
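Taken together, the numbered steps above amount to roughly the following session (a sketch only, assuming a Kaldi checkout at $KALDI_ROOT and the thrax-1.2.9 tarball named in install_thrax.sh):

   cd $KALDI_ROOT/tools
   make clean
   # append --enable-grm to OPENFST_CONFIGURE in this Makefile before rebuilding
   make -j 10
   sh extras/install_thrax.sh
   # after adding the Thrax bin directory to env.sh as in step 3:
   . ./env.sh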
@@ -1,12 +0,0 @@
#!/bin/bash
## This script should be placed under $KALDI_ROOT/tools/extras/; see INSTALL.txt for the installation guide
if [ ! -f thrax-1.2.9.tar.gz ]; then
    wget http://www.openfst.org/twiki/pub/GRM/ThraxDownload/thrax-1.2.9.tar.gz
    tar -zxf thrax-1.2.9.tar.gz
fi
cd thrax-1.2.9
OPENFSTPREFIX=`pwd`/../openfst
LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix ${OPENFSTPREFIX}
make -j 10; make install
cd ..
Binary file not shown.
Binary file not shown.
@@ -1,6 +0,0 @@
cd src/cn
thraxmakedep itn.grm
make
#thraxrewrite-tester --far=itn.far --rules=ITN
cat ../../testcase_cn.txt | thraxrewrite-tester --far=itn.far --rules=ITN
cd -
@@ -1,6 +0,0 @@
cd src
thraxmakedep en/verbalizer/podspeech.grm
make
cat ../testcase_en.txt
cat ../testcase_en.txt | thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN
cd -
@@ -1,202 +0,0 @@

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
@@ -1,65 +0,0 @@
en/verbalizer/podspeech.far: en/verbalizer/podspeech.grm util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
	thraxcompiler --input_grammar=$< --output_far=$@

util/util.far: util/util.grm util/byte.far util/case.far
	thraxcompiler --input_grammar=$< --output_far=$@

util/byte.far: util/byte.grm
	thraxcompiler --input_grammar=$< --output_far=$@

util/case.far: util/case.grm util/byte.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/number_names.far: en/verbalizer/number_names.grm util/arithmetic.far en/verbalizer/g.fst en/verbalizer/cardinals.tsv en/verbalizer/ordinals.tsv
	thraxcompiler --input_grammar=$< --output_far=$@

util/arithmetic.far: util/arithmetic.grm util/byte.far util/germanic.tsv
	thraxcompiler --input_grammar=$< --output_far=$@

universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far
	thraxcompiler --input_grammar=$< --output_far=$@

ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
	thraxcompiler --input_grammar=$< --output_far=$@

en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far
	thraxcompiler --input_grammar=$< --output_far=$@

clean:
	rm -f util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far util/byte.far en/verbalizer/number_names.far universal/thousands_punct.far util/arithmetic.far en/verbalizer/factorization.far en/verbalizer/lexical_map.far ru/classifier/cyrillic.far
@@ -1,24 +0,0 @@
# Text normalization covering grammars

This repository provides covering grammars for English and Russian text normalization as
documented in:

Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
_Transactions of the Association for Computational Linguistics_ 4: 507-519.

Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
written-to-spoken text normalization. In _ASRU_, pages 665-670.

If you use these grammars in a publication, we would appreciate it if you cite these works.

## Building

The grammars are written in [Thrax](thrax.opengrm.org) and compile into [OpenFst](openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory.
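For example, the `run_en.sh` script elsewhere in this diff drives the build and then feeds a test file through `thraxrewrite-tester` roughly as follows (a sketch taken from that script; the test file and rule name are the ones it uses):

    cd src
    thraxmakedep en/verbalizer/podspeech.grm
    make
    cat ../testcase_en.txt | thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN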
## License

See `LICENSE`.

## Mandatory disclaimer

This is not an official Google product.
@@ -1,23 +0,0 @@
itn.far: itn.grm byte.far number.far hotfix.far percentage.far date.far amount.far
	thraxcompiler --input_grammar=$< --output_far=$@

byte.far: byte.grm
	thraxcompiler --input_grammar=$< --output_far=$@

number.far: number.grm byte.far
	thraxcompiler --input_grammar=$< --output_far=$@

hotfix.far: hotfix.grm byte.far hotfix.list
	thraxcompiler --input_grammar=$< --output_far=$@

percentage.far: percentage.grm byte.far number.far
	thraxcompiler --input_grammar=$< --output_far=$@

date.far: date.grm byte.far number.far
	thraxcompiler --input_grammar=$< --output_far=$@

amount.far: amount.grm byte.far number.far
	thraxcompiler --input_grammar=$< --output_far=$@

clean:
	rm -f byte.far number.far hotfix.far percentage.far date.far amount.far
@@ -1,24 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;

unit = (
    "匹"|"张"|"座"|"回"|"场"|"尾"|"条"|"个"|"首"|"阙"|"阵"|"网"|"炮"|
    "顶"|"丘"|"棵"|"只"|"支"|"袭"|"辆"|"挑"|"担"|"颗"|"壳"|"窠"|"曲"|
    "墙"|"群"|"腔"|"砣"|"座"|"客"|"贯"|"扎"|"捆"|"刀"|"令"|"打"|"手"|
    "罗"|"坡"|"山"|"岭"|"江"|"溪"|"钟"|"队"|"单"|"双"|"对"|"出"|"口"|
    "头"|"脚"|"板"|"跳"|"枝"|"件"|"贴"|"针"|"线"|"管"|"名"|"位"|"身"|
    "堂"|"课"|"本"|"页"|"家"|"户"|"层"|"丝"|"毫"|"厘"|"分"|"钱"|"两"|
    "斤"|"担"|"铢"|"石"|"钧"|"锱"|"忽"|"毫"|"厘"|"分"|"寸"|"尺"|"丈"|
    "里"|"寻"|"常"|"铺"|"程"|"撮"|"勺"|"合"|"升"|"斗"|"石"|"盘"|"碗"|
    "碟"|"叠"|"桶"|"笼"|"盆"|"盒"|"杯"|"钟"|"斛"|"锅"|"簋"|"篮"|"盘"|
    "桶"|"罐"|"瓶"|"壶"|"卮"|"盏"|"箩"|"箱"|"煲"|"啖"|"袋"|"钵"|"年"|
    "月"|"日"|"季"|"刻"|"时"|"周"|"天"|"秒"|"分"|"旬"|"纪"|"岁"|"世"|
    "更"|"夜"|"春"|"夏"|"秋"|"冬"|"代"|"伏"|"辈"|"丸"|"泡"|"粒"|"颗"|
    "幢"|"堆"|"条"|"根"|"支"|"道"|"面"|"片"|"张"|"颗"|"块"|
    (("千克":"kg")|("毫克":"mg")|("微克":"µg"))|
    (("千米":"km")|("厘米":"cm")|("毫米":"mm")|("微米":"µm")|("纳米":"nm"))
);

amount = n.number unit;
export AMOUNT = CDRewrite[amount, "", "", b.kBytes*];
@@ -1,76 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright 2005-2011 Google, Inc.
# Author: ttai@google.com (Terry Tai)

# Standard constants for ASCII (byte) based strings. This mirrors the
# functions provided by C/C++'s ctype.h library.

# Note that [0] is missing. Matching the string-termination character is kinda weird.
export kBytes = Optimize[
  "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" |
  "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" |
  "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
  "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" |
  "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" |
  "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
  "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" |
  "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" |
  "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
  "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" |
  "[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
  "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
  "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
  "[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
  "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
  "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
  "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
  "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
  "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
  "[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
  "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
  "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
  "[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
  "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
  "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
  "[251]" | "[252]" | "[253]" | "[254]" | "[255]"
];

export kDigit = Optimize[
  "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
];

export kLower = Optimize[
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
  "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];
export kUpper = Optimize[
  "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
  "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
];
export kAlpha = Optimize[kLower | kUpper];

export kAlnum = Optimize[kDigit | kAlpha];

export kSpace = Optimize[
  " " | "\t" | "\n" | "\r"
];
export kNotSpace = Optimize[kBytes - kSpace];

export kPunct = Optimize[
  "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
  "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
  "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
];

export kGraph = Optimize[kAlnum | kPunct];
@@ -1,10 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;

date_day = n.number_1_to_99 ("日"|"号");
date_month_day = n.number_1_to_99 "月" date_day;
date_year_month_day = ((n.number_0_to_9){2,4} | n.number) "年" date_month_day;

date = date_year_month_day | date_month_day | date_day;

export DATE = CDRewrite[date, "", "", b.kBytes*];
@@ -1,5 +0,0 @@
import 'byte.grm' as b;
hotfix = StringFile['hotfix.list'];

export HOTFIX = CDRewrite[hotfix, "", "", b.kBytes*];
@@ -1,18 +0,0 @@
0头	零头
10字	十字
东4环	东4环	-1.0
东4	东四	-0.5
4惠	四惠
3元桥	三元桥
4平市	四平市
5台山	五台山
西2旗	西二旗
西3旗	西三旗
4道口	四道口	-1.0
5道口	五道口	-1.0
6道口	六道口	-1.0
6里桥	六里桥
7里庄	七里庄
8宝山	八宝山
9颗松	九棵松
10里堡	十里堡
@@ -1,9 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as number;
import 'hotfix.grm' as hotfix;
import 'percentage.grm' as percentage;
import 'date.grm' as date;
import 'amount.grm' as amount; # seems not useful for now

export ITN = Optimize[percentage.PERCENTAGE @ (date.DATE <-1>) @ number.NUMBER @ hotfix.HOTFIX];
@@ -1,61 +0,0 @@
import 'byte.grm' as b;

number_1_to_9 = (
    ("一":"1") | ("幺":"1") |
    ("二":"2") | ("两":"2") |
    ("三":"3") |
    ("四":"4") |
    ("五":"5") |
    ("六":"6") |
    ("七":"7") |
    ("八":"8") |
    ("九":"9")
);

export number_0_to_9 = (("零":"0") | number_1_to_9);

number_10_to_19 = (
    ("十":"10") |
    ("十一":"11") |
    ("十二":"12") |
    ("十三":"13") |
    ("十四":"14") |
    ("十五":"15") |
    ("十六":"16") |
    ("十七":"17") |
    ("十八":"18") |
    ("十九":"19")
);

number_10s = (number_1_to_9 ("十":""));
number_100s = (number_1_to_9 ("百":""));
number_1000s = (number_1_to_9 ("千":""));
number_10000s = (number_1_to_9 ("万":""));

number_10_to_99 = (
    ((number_10s number_1_to_9)<-0.3>) |
    ((number_10s ("":"0"))<-0.2>) |
    (number_10_to_19 <-0.1>)
);

export number_1_to_99 = (number_1_to_9 | number_10_to_99);

number_100_to_999 = (
    ((number_100s ("零":"0") number_1_to_9)<0.0>)|
    ((number_100s number_10_to_99)<0.0>) |
    ((number_100s number_1_to_9 ("":"0"))<0.0>) |
    ((number_100s ("":"00"))<0.1>)
);

number_1000_to_9999 = (
    ((number_1000s number_100_to_999)<0.0>) |
    ((number_1000s ("零":"0") number_10_to_99)<0.0>)|
    ((number_1000s ("零":"00") number_1_to_9)<0.0>)|
    ((number_1000s ("":"000"))<1>) |
    ((number_1000s number_1_to_9 ("":"00"))<0.0>)
);

export number = number_1_to_99 | (number_100_to_999 <-1>) | (number_1000_to_9999 <-2>);

export NUMBER = CDRewrite[number, "", "", b.kBytes*];
@@ -1,8 +0,0 @@
import 'byte.grm' as b;
import 'number.grm' as n;

percentage = (
    ("百分之":"") n.number_1_to_99 ("":"%")
);

export PERCENTAGE = CDRewrite[percentage, "", "", b.kBytes*];
@@ -1,6 +0,0 @@
# English covering grammar definitions

This directory defines an English text normalization covering grammar. The
primary entry-point is the FST `VERBALIZER`, defined in
`verbalizer/verbalizer.grm` and compiled in the FST archive
`verbalizer/verbalizer.far`.
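As a rough usage sketch (not part of the original README): once the archive is built, the `VERBALIZER` rule can be exercised with `thraxrewrite-tester` in the same way `run_en.sh` drives `podspeech.far`; the input file name below is only a placeholder.

    # from the src/ directory, after make
    cat my_inputs.txt | thraxrewrite-tester --far=en/verbalizer/verbalizer.far --rules=VERBALIZER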
@@ -1,3 +0,0 @@
verbalizer.far: verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
	thraxcompiler --input_grammar=$< --output_far=$@
@@ -1,35 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'en/verbalizer/numbers.grm' as n;

digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");

export DIGITS = digit (n.I[" "] digit)*;

# Various common factorizations

two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;

three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;

mixed =
    (digit n.I[" "] two_digits)
  | (two_digits n.I[" "] two_digits)
  | (two_digits n.I[" "] three_digits)
  | (two_digits n.I[" "] two_digits n.I[" "] two_digits)
;

export MIXED_NUMBERS = Optimize[mixed];
@@ -1,40 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'util/util.grm' as u;
import 'en/verbalizer/numbers.grm' as n;

func ToNumberName[expr] {
  number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
  return Optimize[expr @ number_name_seq];
}

d = b.kDigit;

leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];

by_ones = d n.I[" "];
by_twos = (d{2} @ leading_zero) n.I[" "];
by_threes = (d{3} @ leading_zero) n.I[" "];

groupings = by_twos* (by_threes | by_twos | by_ones);

export FRACTIONAL_PART_UNGROUPED =
  Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_GROUPED =
  Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
@@ -1,30 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'en/verbalizer/factorization.grm' as f;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;

fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;

__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
__decimal_marker__ = ".";

export FLOAT = Optimize[
  (n.CARDINAL_NUMBERS
   (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
   __fractional_part__) @ l.LEXICAL_MAP]
;
Binary file not shown.
@@ -1,25 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;

lexical_map = StringFile['en/verbalizer/lexical_map.tsv'];

sigma_star = b.kBytes*;

del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];

export LEXICAL_MAP = Optimize[
  CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
;
Can't render this file because it has a wrong number of fields in line 37.
@@ -1,34 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'en/verbalizer/float.grm' as f;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;

float = f.FLOAT;
card = n.CARDINAL_NUMBERS;
number = card | float;

plus = "+" : " @@ARITHMETIC_PLUS@@ ";
times = "*" : " @@ARITHMETIC_TIMES@@ ";
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
division = "/" : " @@ARITHMETIC_DIVISION@@ ";

operator = plus | times | minus | division;

percent = "%" : " @@PERCENT@@";

export ARITHMETIC =
  Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
;
@@ -1,78 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'en/verbalizer/extra_numbers.grm' as e;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;
import 'en/verbalizer/spelled.grm' as s;

letter = b.kAlpha | c.kCyrillicAlpha;
dash = "-";
word = letter+;
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;

post_word_symbol =
    ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
    ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
    ("*" : "@@STAR@@")
;

pre_word_symbol =
    ("@" : "@@AT@@") |
    ("/" : "@@SLASH@@") |
    ("#" : "@@HASH@@")
;

post_word = possibly_split_word n.I[" "] post_word_symbol;

pre_word = pre_word_symbol n.I[" "] possibly_split_word;

## Number/digit sequence combos, maybe with a dash

spelled_word = word @ s.SPELLED_NO_LETTER;

word_number =
    (word | spelled_word)
    (n.I[" "] | (dash : " "))
    (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
;

number_word =
    (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
    (n.I[" "] | (dash : " "))
    (word | spelled_word)
;

## Two-digit year.

# Note that in this case to be fair we really have to allow ordinals too since
# in some languages that's what you would have.

two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));

dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";

miscellaneous = Optimize[
    possibly_split_word
  | post_word
  | pre_word
  | word_number
  | number_word
  | two_digit_year
  | dot_com
];

export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
@ -1,44 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'en/verbalizer/lexical_map.grm' as l;
import 'en/verbalizer/numbers.grm' as n;

card = n.CARDINAL_NUMBERS;

__currency__ = StringFile['en/verbalizer/money.tsv'];

d = b.kDigit;
D = d - "0";

cents = ((n.D["0"] | D) d) @ card;

# Only dollar for the verbalizer tests for English. Will need to add other
# currencies.
usd_maj = Project["usd_maj" @ __currency__, 'output'];
usd_min = Project["usd_min" @ __currency__, 'output'];
and = " @@MONEY_AND@@ " | " ";

dollar1 =
  n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
;

dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];

dollar3 = n.D["$"] card n.I[" " usd_maj];

dollar = Optimize[dollar1 | dollar2 | dollar3];

export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
@ -1,54 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# English minimally supervised number grammar.
#
# Supports both cardinals and ordinals without overt marking.
#
# The language-specific acceptor G was compiled with digit, teen, and decade
# preterminals. The lexicon transducer L is unambiguous so no LM is used.

import 'util/arithmetic.grm' as a;

# Intersects the universal factorization transducer (F) with the
# language-specific acceptor (G).

d = a.DELTA_STAR;
f = a.IARITHMETIC_RESTRICTED;
g = LoadFst['en/verbalizer/g.fst'];
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"];

# Compiles lexicon transducer (L).

cardinal_name = StringFile['en/verbalizer/cardinals.tsv'];
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"];

ordinal_name = StringFile['en/verbalizer/ordinals.tsv'];
# In English, ordinals have the same syntax as cardinals and all but the final
# element is verbalized using a cardinal number word; e.g., "two hundred
# thirtieth".
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"];

# Composes L with the leaf transducer (P), then composes that with FG.

p = a.LEAVES;

export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"];

export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"];
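The grammar above composes a factorization FG with a leaf transducer P and a lexicon L built from cardinals.tsv/ordinals.tsv. The sketch below reproduces only the lexicon stage of test2 in Python with pynini; the three-entry map is an assumption standing in for cardinals.tsv, and F, G and P from util/arithmetic.grm and g.fst are not modeled.

    # Lexicon (L) stage only: the factorization "2 100 30" is verbalized
    # term by term, mirroring test2 above. The map contents are assumed.
    import pynini

    cardinal_name = pynini.string_map(
        [("2", "two"), ("100", "hundred"), ("30", "thirty")])
    cardinal_l = (pynini.closure(cardinal_name + " ") + cardinal_name).optimize()

    print(pynini.shortestpath("2 100 30" @ cardinal_l).string())
    # -> "two hundred thirty"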
@ -1,57 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'en/verbalizer/number_names.grm' as n;
import 'util/byte.grm' as bytelib;
import 'universal/thousands_punct.grm' as t;

cardinal = n.CARDINAL_NUMBER_NAME;
ordinal = n.ORDINAL_NUMBER_NAME;

# Putting these here since this grammar gets incorporated by all the others.

func I[expr] {
  return "" : expr;
}

func D[expr] {
  return expr : "";
}

separators = t.comma_thousands | t.no_delimiter;

# Language specific endings for ordinals.
d = bytelib.kDigit;
endings = "st" | "nd" | "rd" | "th";

st = (d* "1") - (d* "11");
nd = (d* "2") - (d* "12");
rd = (d* "3") - (d* "13");
th = Optimize[d* - st - nd - rd];
first = st ("st" : "");
second = nd ("nd" : "");
third = rd ("rd" : "");
other = th ("th" : "");
marked_ordinal = Optimize[first | second | third | other];

# The separator is a no-op here but will be needed once we replace
# the above targets.

export CARDINAL_NUMBERS = Optimize[separators @ cardinal];

export ORDINAL_NUMBERS =
  Optimize[(separators endings) @ marked_ordinal @ ordinal]
;

export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal];
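The st/nd/rd/th machinery above pairs each digit string with the one ending English allows, then deletes the ending before the digits are verbalized as an ordinal. A plain-Python restatement of that check, with a hypothetical helper name:

    # Plain-Python restatement of the marked-ordinal check: "st" is only
    # valid after ...1 (but not ...11), "nd" after ...2 (not ...12),
    # "rd" after ...3 (not ...13), and "th" everywhere else.
    def strip_ordinal_ending(token: str) -> str:
        digits, ending = token[:-2], token[-2:]
        if not digits.isdigit() or ending not in ("st", "nd", "rd", "th"):
            raise ValueError(f"not a marked ordinal: {token!r}")
        if digits.endswith("1") and not digits.endswith("11"):
            expected = "st"
        elif digits.endswith("2") and not digits.endswith("12"):
            expected = "nd"
        elif digits.endswith("3") and not digits.endswith("13"):
            expected = "rd"
        else:
            expected = "th"
        if ending != expected:
            raise ValueError(f"expected {expected!r} after {digits}")
        return digits  # the digits then go through the ordinal name grammar

    assert strip_ordinal_ending("231st") == "231"
    assert strip_ordinal_ending("12th") == "12"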
@ -1,133 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# Grammar for things built mostly on numbers.
|
|
||||||
|
|
||||||
import 'en/verbalizer/factorization.grm' as f;
|
|
||||||
import 'en/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'en/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
num = n.CARDINAL_NUMBERS;
|
|
||||||
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
|
||||||
digits = f.FRACTIONAL_PART_UNGROUPED;
|
|
||||||
|
|
||||||
# Various symbols.
|
|
||||||
|
|
||||||
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
|
||||||
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
|
||||||
slash = "/" : "@@SLASH@@";
|
|
||||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
|
||||||
dash = "-" : "@@DASH@@";
|
|
||||||
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
|
||||||
|
|
||||||
degree = "°" : "@@DEGREE@@";
|
|
||||||
|
|
||||||
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
|
||||||
|
|
||||||
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
|
||||||
|
|
||||||
power = "^" : "@@DECIMAL_EXPONENT@@";
|
|
||||||
|
|
||||||
square_root = "√" : "@@SQUARE_ROOT@@";
|
|
||||||
|
|
||||||
percent = "%" : "@@PERCENT@@";
|
|
||||||
|
|
||||||
# Safe roman numbers.
|
|
||||||
|
|
||||||
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
|
||||||
# line as the path.
|
|
||||||
rfile =
|
|
||||||
'universal/roman_numerals.tsv' # NO_EDIT
|
|
||||||
;
|
|
||||||
|
|
||||||
roman = StringFile[rfile];
|
|
||||||
|
|
||||||
## Main categories.
|
|
||||||
|
|
||||||
cat_dot_number =
|
|
||||||
num
|
|
||||||
n.I[" "] dot n.I[" "] num
|
|
||||||
(n.I[" "] dot n.I[" "] num)+
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_slash_number =
|
|
||||||
num
|
|
||||||
n.I[" "] slash n.I[" "] num
|
|
||||||
(n.I[" "] slash n.I[" "] num)*
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_dash_number =
|
|
||||||
num
|
|
||||||
n.I[" "] dash n.I[" "] num
|
|
||||||
(n.I[" "] dash n.I[" "] num)*
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
|
||||||
|
|
||||||
cat_degree = cat_signed_number n.I[" "] degree;
|
|
||||||
|
|
||||||
cat_country_code = plus n.I[" "] (num | digits);
|
|
||||||
|
|
||||||
cat_math_operations =
|
|
||||||
plus
|
|
||||||
| minus
|
|
||||||
| division
|
|
||||||
| times
|
|
||||||
| equals
|
|
||||||
| percent
|
|
||||||
| power
|
|
||||||
| square_root
|
|
||||||
;
|
|
||||||
|
|
||||||
# Roman numbers are often either cardinals or ordinals in various languages.
|
|
||||||
cat_roman = roman @ (num | ord);
|
|
||||||
|
|
||||||
# Allow
|
|
||||||
#
|
|
||||||
# number:number
|
|
||||||
# number-number
|
|
||||||
#
|
|
||||||
# to just be
|
|
||||||
#
|
|
||||||
# number number.
|
|
||||||
|
|
||||||
cat_number_number =
|
|
||||||
num ((":" | "-") : " ") num
|
|
||||||
;
|
|
||||||
|
|
||||||
# Some additional readings for these symbols.
|
|
||||||
|
|
||||||
cat_additional_readings =
|
|
||||||
("/" : "@@PER@@") |
|
|
||||||
("+" : "@@AND@@") |
|
|
||||||
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
|
||||||
("*" : "@@STAR@@") |
|
|
||||||
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
|
||||||
("@" : "@@AT@@")
|
|
||||||
;
|
|
||||||
|
|
||||||
numbers_plus = Optimize[
|
|
||||||
cat_dot_number
|
|
||||||
| cat_slash_number
|
|
||||||
| cat_dash_number
|
|
||||||
| cat_signed_number
|
|
||||||
| cat_degree
|
|
||||||
| cat_country_code
|
|
||||||
| cat_math_operations
|
|
||||||
| cat_roman
|
|
||||||
| cat_number_number
|
|
||||||
| cat_additional_readings
|
|
||||||
];
|
|
||||||
|
|
||||||
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
|
@ -1,46 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/util.grm' as util;
|
|
||||||
import 'util/case.grm' as case;
|
|
||||||
import 'en/verbalizer/extra_numbers.grm' as e;
|
|
||||||
import 'en/verbalizer/float.grm' as f;
|
|
||||||
import 'en/verbalizer/math.grm' as ma;
|
|
||||||
import 'en/verbalizer/miscellaneous.grm' as mi;
|
|
||||||
import 'en/verbalizer/money.grm' as mo;
|
|
||||||
import 'en/verbalizer/numbers.grm' as n;
|
|
||||||
import 'en/verbalizer/numbers_plus.grm' as np;
|
|
||||||
import 'en/verbalizer/spelled.grm' as s;
|
|
||||||
import 'en/verbalizer/spoken_punct.grm' as sp;
|
|
||||||
import 'en/verbalizer/time.grm' as t;
|
|
||||||
import 'en/verbalizer/urls.grm' as u;
|
|
||||||
|
|
||||||
export POD_SPEECH_TN = Optimize[RmWeight[
|
|
||||||
(u.URL
|
|
||||||
| e.MIXED_NUMBERS
|
|
||||||
| e.DIGITS
|
|
||||||
| f.FLOAT
|
|
||||||
| ma.ARITHMETIC
|
|
||||||
| mo.MONEY
|
|
||||||
| n.CARDINAL_NUMBERS
|
|
||||||
| n.ORDINAL_NUMBERS
|
|
||||||
| np.NUMBERS_PLUS
|
|
||||||
| s.SPELLED
|
|
||||||
| sp.SPOKEN_PUNCT
|
|
||||||
| t.TIME
|
|
||||||
| u.URL
|
|
||||||
| u.EMAILS) @ util.CLEAN_SPACES @ case.TOUPPER
|
|
||||||
]];
|
|
||||||
|
|
||||||
#export POD_SPEECH_TN = Optimize[RmWeight[(mi.MISCELLANEOUS) @ util.CLEAN_SPACES @ case.TOUPPER]];
|
|
@ -1,77 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# This verbalizer is used whenever there is an LM symbol that consists of
|
|
||||||
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
|
||||||
# suffix.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'ru/classifier/cyrillic.grm' as c;
|
|
||||||
import 'en/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'en/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
|
||||||
|
|
||||||
char_set = (("a" | "A") : "letter-a")
|
|
||||||
| (("b" | "B") : "letter-b")
|
|
||||||
| (("c" | "C") : "letter-c")
|
|
||||||
| (("d" | "D") : "letter-d")
|
|
||||||
| (("e" | "E") : "letter-e")
|
|
||||||
| (("f" | "F") : "letter-f")
|
|
||||||
| (("g" | "G") : "letter-g")
|
|
||||||
| (("h" | "H") : "letter-h")
|
|
||||||
| (("i" | "I") : "letter-i")
|
|
||||||
| (("j" | "J") : "letter-j")
|
|
||||||
| (("k" | "K") : "letter-k")
|
|
||||||
| (("l" | "L") : "letter-l")
|
|
||||||
| (("m" | "M") : "letter-m")
|
|
||||||
| (("n" | "N") : "letter-n")
|
|
||||||
| (("o" | "O") : "letter-o")
|
|
||||||
| (("p" | "P") : "letter-p")
|
|
||||||
| (("q" | "Q") : "letter-q")
|
|
||||||
| (("r" | "R") : "letter-r")
|
|
||||||
| (("s" | "S") : "letter-s")
|
|
||||||
| (("t" | "T") : "letter-t")
|
|
||||||
| (("u" | "U") : "letter-u")
|
|
||||||
| (("v" | "V") : "letter-v")
|
|
||||||
| (("w" | "W") : "letter-w")
|
|
||||||
| (("x" | "X") : "letter-x")
|
|
||||||
| (("y" | "Y") : "letter-y")
|
|
||||||
| (("z" | "Z") : "letter-z")
|
|
||||||
| (digit)
|
|
||||||
| ("&" : "@@AND@@")
|
|
||||||
| ("." : "")
|
|
||||||
| ("-" : "")
|
|
||||||
| ("_" : "")
|
|
||||||
| ("/" : "")
|
|
||||||
| (n.I["letter-"] c.kCyrillicAlpha)
|
|
||||||
;
|
|
||||||
|
|
||||||
ins_space = "" : " ";
|
|
||||||
|
|
||||||
suffix = "{spelled}" : "";
|
|
||||||
|
|
||||||
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
|
||||||
|
|
||||||
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
|
||||||
|
|
||||||
sigma_star = b.kBytes*;
|
|
||||||
|
|
||||||
# Gets rid of the letter- prefix since in some cases we don't want it.
|
|
||||||
|
|
||||||
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
|
||||||
|
|
||||||
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
|
||||||
|
|
||||||
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
|
@ -1,24 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'en/verbalizer/lexical_map.grm' as l;

punct =
    ("." : "@@PERIOD@@")
  | ("," : "@@COMMA@@")
  | ("!" : "@@EXCLAMATION_MARK@@")
  | ("?" : "@@QUESTION_MARK@@")
;

export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
@ -1,108 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'en/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'en/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
|
||||||
|
|
||||||
increment_hour =
|
|
||||||
("0" : "1")
|
|
||||||
| ("1" : "2")
|
|
||||||
| ("2" : "3")
|
|
||||||
| ("3" : "4")
|
|
||||||
| ("4" : "5")
|
|
||||||
| ("5" : "6")
|
|
||||||
| ("6" : "7")
|
|
||||||
| ("7" : "8")
|
|
||||||
| ("8" : "9")
|
|
||||||
| ("9" : "10")
|
|
||||||
| ("10" : "11")
|
|
||||||
| ("11" : "12")
|
|
||||||
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
|
||||||
| ("13" : "14")
|
|
||||||
| ("14" : "15")
|
|
||||||
| ("15" : "16")
|
|
||||||
| ("16" : "17")
|
|
||||||
| ("17" : "18")
|
|
||||||
| ("18" : "19")
|
|
||||||
| ("19" : "20")
|
|
||||||
| ("20" : "21")
|
|
||||||
| ("21" : "22")
|
|
||||||
| ("22" : "23")
|
|
||||||
| ("23" : "12")
|
|
||||||
;
|
|
||||||
|
|
||||||
hours = Project[increment_hour, 'input'];
|
|
||||||
|
|
||||||
d = b.kDigit;
|
|
||||||
D = d - "0";
|
|
||||||
|
|
||||||
minutes09 = "0" D;
|
|
||||||
|
|
||||||
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
|
||||||
|
|
||||||
__sep__ = ":";
|
|
||||||
sep_space = __sep__ : " ";
|
|
||||||
|
|
||||||
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
|
||||||
|
|
||||||
verbalize_minutes =
|
|
||||||
("00" : "@@HOUR@@")
|
|
||||||
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
|
||||||
| (minutes @ n.CARDINAL_NUMBERS)
|
|
||||||
;
|
|
||||||
|
|
||||||
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
|
||||||
|
|
||||||
# Special cases we handle right now.
|
|
||||||
# TODO: Need to allow for cases like
|
|
||||||
#
|
|
||||||
# half twelve (in the UK English sense)
|
|
||||||
# half twaalf (in the Dutch sense)
|
|
||||||
|
|
||||||
time_quarter_past =
|
|
||||||
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
|
||||||
verbalize_hours
|
|
||||||
n.D[__sep__ "15"];
|
|
||||||
|
|
||||||
time_half_past =
|
|
||||||
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
|
||||||
verbalize_hours
|
|
||||||
n.D[__sep__ "30"];
|
|
||||||
|
|
||||||
time_quarter_to =
|
|
||||||
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
|
||||||
(increment_hour @ verbalize_hours)
|
|
||||||
n.D[__sep__ "45"];
|
|
||||||
|
|
||||||
time_extra = Optimize[
|
|
||||||
time_quarter_past | time_half_past | time_quarter_to]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Basic time periods which most languages can be expected to have.
|
|
||||||
__am__ = "a.m." | "am" | "AM";
|
|
||||||
__pm__ = "p.m." | "pm" | "PM";
|
|
||||||
|
|
||||||
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
|
||||||
|
|
||||||
time_variants = time_basic | time_extra;
|
|
||||||
|
|
||||||
time = Optimize[
|
|
||||||
(period (" " | n.I[" "]))? time_variants
|
|
||||||
| time_variants ((" " | n.I[" "]) period)?]
|
|
||||||
;
|
|
||||||
|
|
||||||
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
|
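The time grammar that precedes this point only special-cases HH:15, HH:30 and HH:45 on top of a plain hours-and-minutes reading, bumping the hour for quarter-to (with 12 wrapping to 1 and 23 to 12). A plain-Python restatement of just that branching, with hypothetical names and the @@...@@ markup left unexpanded:

    # Plain-Python restatement of the quarter/half special cases above;
    # everything else falls back to "hours minutes". Helper names are
    # hypothetical and the hour is left as digits for brevity.
    def special_time_reading(hh: int, mm: int) -> str:
        next_hour = 1 if hh == 12 else 12 if hh == 23 else hh + 1
        if mm == 15:
            return f"@@TIME_QUARTER@@ @@TIME_AFTER@@ {hh}"
        if mm == 30:
            return f"@@TIME_HALF@@ @@TIME_AFTER@@ {hh}"
        if mm == 45:
            return f"@@TIME_QUARTER@@ @@TIME_BEFORE@@ {next_hour}"
        return f"{hh} {mm:02d}"

    assert special_time_reading(6, 45) == "@@TIME_QUARTER@@ @@TIME_BEFORE@@ 7"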
@ -1,68 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Rules for URLs and email addresses.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as bytelib;
|
|
||||||
import 'en/verbalizer/lexical_map.grm' as l;
|
|
||||||
|
|
||||||
ins_space = "" : " ";
|
|
||||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
|
||||||
at = "@" : "@@AT@@";
|
|
||||||
|
|
||||||
url_suffix =
|
|
||||||
(".com" : dot ins_space "com") |
|
|
||||||
(".gov" : dot ins_space "gov") |
|
|
||||||
(".edu" : dot ins_space "e d u") |
|
|
||||||
(".org" : dot ins_space "org") |
|
|
||||||
(".net" : dot ins_space "net")
|
|
||||||
;
|
|
||||||
|
|
||||||
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
|
|
||||||
|
|
||||||
letter_string_dot =
|
|
||||||
((letter_string ins_space dot ins_space)* letter_string)
|
|
||||||
;
|
|
||||||
|
|
||||||
# Rules for URLs.
|
|
||||||
export URL = Optimize[
|
|
||||||
((letter_string_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
# Rules for email addresses.
|
|
||||||
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
|
||||||
|
|
||||||
letter_by_letter_dot =
|
|
||||||
((letter_by_letter ins_space dot ins_space)*
|
|
||||||
letter_by_letter)
|
|
||||||
;
|
|
||||||
|
|
||||||
export EMAIL1 = Optimize[
|
|
||||||
((letter_by_letter) (ins_space)
|
|
||||||
(at) (ins_space)
|
|
||||||
(letter_by_letter_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
export EMAIL2 = Optimize[
|
|
||||||
((letter_by_letter) (ins_space)
|
|
||||||
(at) (ins_space)
|
|
||||||
(letter_string_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
export EMAILS = Optimize[
|
|
||||||
EMAIL1 | EMAIL2
|
|
||||||
];
|
|
@ -1,42 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/util.grm' as util;
import 'en/verbalizer/extra_numbers.grm' as e;
import 'en/verbalizer/float.grm' as f;
import 'en/verbalizer/math.grm' as ma;
import 'en/verbalizer/miscellaneous.grm' as mi;
import 'en/verbalizer/money.grm' as mo;
import 'en/verbalizer/numbers.grm' as n;
import 'en/verbalizer/numbers_plus.grm' as np;
import 'en/verbalizer/spelled.grm' as s;
import 'en/verbalizer/spoken_punct.grm' as sp;
import 'en/verbalizer/time.grm' as t;
import 'en/verbalizer/urls.grm' as u;

export VERBALIZER = Optimize[RmWeight[
  ( e.MIXED_NUMBERS
  | e.DIGITS
  | f.FLOAT
  | ma.ARITHMETIC
  | mi.MISCELLANEOUS
  | mo.MONEY
  | n.CARDINAL_NUMBERS
  | n.ORDINAL_NUMBERS
  | np.NUMBERS_PLUS
  | s.SPELLED
  | sp.SPOKEN_PUNCT
  | t.TIME
  | u.URL) @ util.CLEAN_SPACES
]];
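The top-level VERBALIZER is just the unweighted union of the per-category verbalizers, composed with a space cleaner. A minimal pynini sketch of that shape, with two toy sub-grammars standing in for the real ones; RmWeight and util.CLEAN_SPACES are omitted here.

    # Minimal sketch of the top-level union: input accepted by any one of
    # the sub-grammars is verbalized by that sub-grammar. The two toy
    # sub-grammars stand in for TIME, MONEY, CARDINAL_NUMBERS, etc.
    import pynini

    spoken_punct = pynini.cross("!", "exclamation mark")
    cardinal = pynini.string_map([("2", "two"), ("3", "three")])

    verbalizer = pynini.union(spoken_punct, cardinal).optimize()

    print(pynini.shortestpath("3" @ verbalizer).string())  # -> "three"
    print(pynini.shortestpath("!" @ verbalizer).string())  # -> "exclamation mark"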
@ -1,17 +0,0 @@
This directory contains data used in:

Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
Transactions of the Association for Computational Linguistics 4: 507-519.

* `minimal.txt`: A list of 30 curated numbers used as the "minimal" training
  set.
* `random-trn.txt`: A list of 9000 randomly-generated numbers used as the
  "medium" training set.
* `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test
  set.

Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that
a small number of examples occur both in `minimal.txt` and `random-tst.txt`.

For information about the sampling procedure used to generate the random data
sets, see appendix A of the aforementioned paper.
@ -1,300 +0,0 @@
|
|||||||
0
|
|
||||||
1
|
|
||||||
2
|
|
||||||
3
|
|
||||||
4
|
|
||||||
5
|
|
||||||
6
|
|
||||||
7
|
|
||||||
8
|
|
||||||
9
|
|
||||||
10
|
|
||||||
11
|
|
||||||
12
|
|
||||||
13
|
|
||||||
14
|
|
||||||
15
|
|
||||||
16
|
|
||||||
17
|
|
||||||
18
|
|
||||||
19
|
|
||||||
20
|
|
||||||
21
|
|
||||||
22
|
|
||||||
23
|
|
||||||
24
|
|
||||||
25
|
|
||||||
26
|
|
||||||
27
|
|
||||||
28
|
|
||||||
29
|
|
||||||
30
|
|
||||||
31
|
|
||||||
32
|
|
||||||
33
|
|
||||||
34
|
|
||||||
35
|
|
||||||
36
|
|
||||||
37
|
|
||||||
38
|
|
||||||
39
|
|
||||||
40
|
|
||||||
41
|
|
||||||
42
|
|
||||||
43
|
|
||||||
44
|
|
||||||
45
|
|
||||||
46
|
|
||||||
47
|
|
||||||
48
|
|
||||||
49
|
|
||||||
50
|
|
||||||
51
|
|
||||||
52
|
|
||||||
53
|
|
||||||
54
|
|
||||||
55
|
|
||||||
56
|
|
||||||
57
|
|
||||||
58
|
|
||||||
59
|
|
||||||
60
|
|
||||||
61
|
|
||||||
62
|
|
||||||
63
|
|
||||||
64
|
|
||||||
65
|
|
||||||
66
|
|
||||||
67
|
|
||||||
68
|
|
||||||
69
|
|
||||||
70
|
|
||||||
71
|
|
||||||
72
|
|
||||||
73
|
|
||||||
74
|
|
||||||
75
|
|
||||||
76
|
|
||||||
77
|
|
||||||
78
|
|
||||||
79
|
|
||||||
80
|
|
||||||
81
|
|
||||||
82
|
|
||||||
83
|
|
||||||
84
|
|
||||||
85
|
|
||||||
86
|
|
||||||
87
|
|
||||||
88
|
|
||||||
89
|
|
||||||
90
|
|
||||||
91
|
|
||||||
92
|
|
||||||
93
|
|
||||||
94
|
|
||||||
95
|
|
||||||
96
|
|
||||||
97
|
|
||||||
98
|
|
||||||
99
|
|
||||||
100
|
|
||||||
101
|
|
||||||
102
|
|
||||||
103
|
|
||||||
104
|
|
||||||
105
|
|
||||||
106
|
|
||||||
107
|
|
||||||
108
|
|
||||||
109
|
|
||||||
110
|
|
||||||
111
|
|
||||||
112
|
|
||||||
113
|
|
||||||
114
|
|
||||||
115
|
|
||||||
116
|
|
||||||
117
|
|
||||||
118
|
|
||||||
119
|
|
||||||
120
|
|
||||||
121
|
|
||||||
122
|
|
||||||
123
|
|
||||||
124
|
|
||||||
125
|
|
||||||
126
|
|
||||||
127
|
|
||||||
128
|
|
||||||
129
|
|
||||||
130
|
|
||||||
131
|
|
||||||
132
|
|
||||||
133
|
|
||||||
134
|
|
||||||
135
|
|
||||||
136
|
|
||||||
137
|
|
||||||
138
|
|
||||||
139
|
|
||||||
140
|
|
||||||
141
|
|
||||||
142
|
|
||||||
143
|
|
||||||
144
|
|
||||||
145
|
|
||||||
146
|
|
||||||
147
|
|
||||||
148
|
|
||||||
149
|
|
||||||
150
|
|
||||||
151
|
|
||||||
152
|
|
||||||
153
|
|
||||||
154
|
|
||||||
155
|
|
||||||
156
|
|
||||||
157
|
|
||||||
158
|
|
||||||
159
|
|
||||||
160
|
|
||||||
161
|
|
||||||
162
|
|
||||||
163
|
|
||||||
164
|
|
||||||
165
|
|
||||||
166
|
|
||||||
167
|
|
||||||
168
|
|
||||||
169
|
|
||||||
170
|
|
||||||
171
|
|
||||||
172
|
|
||||||
173
|
|
||||||
174
|
|
||||||
175
|
|
||||||
176
|
|
||||||
177
|
|
||||||
178
|
|
||||||
179
|
|
||||||
180
|
|
||||||
181
|
|
||||||
182
|
|
||||||
183
|
|
||||||
184
|
|
||||||
185
|
|
||||||
186
|
|
||||||
187
|
|
||||||
188
|
|
||||||
189
|
|
||||||
190
|
|
||||||
191
|
|
||||||
192
|
|
||||||
193
|
|
||||||
194
|
|
||||||
195
|
|
||||||
196
|
|
||||||
197
|
|
||||||
198
|
|
||||||
199
|
|
||||||
200
|
|
||||||
201
|
|
||||||
202
|
|
||||||
203
|
|
||||||
204
|
|
||||||
205
|
|
||||||
206
|
|
||||||
207
|
|
||||||
208
|
|
||||||
209
|
|
||||||
210
|
|
||||||
211
|
|
||||||
212
|
|
||||||
220
|
|
||||||
221
|
|
||||||
230
|
|
||||||
300
|
|
||||||
400
|
|
||||||
500
|
|
||||||
600
|
|
||||||
700
|
|
||||||
800
|
|
||||||
900
|
|
||||||
1000
|
|
||||||
1001
|
|
||||||
1002
|
|
||||||
1003
|
|
||||||
1004
|
|
||||||
1005
|
|
||||||
1006
|
|
||||||
1007
|
|
||||||
1008
|
|
||||||
1009
|
|
||||||
1010
|
|
||||||
1011
|
|
||||||
1012
|
|
||||||
1020
|
|
||||||
1021
|
|
||||||
1030
|
|
||||||
1200
|
|
||||||
2000
|
|
||||||
2001
|
|
||||||
2002
|
|
||||||
2003
|
|
||||||
2004
|
|
||||||
2005
|
|
||||||
2006
|
|
||||||
2007
|
|
||||||
2008
|
|
||||||
2009
|
|
||||||
2010
|
|
||||||
2011
|
|
||||||
2012
|
|
||||||
2020
|
|
||||||
2021
|
|
||||||
2030
|
|
||||||
2100
|
|
||||||
2200
|
|
||||||
5001
|
|
||||||
10000
|
|
||||||
12000
|
|
||||||
20000
|
|
||||||
21000
|
|
||||||
50001
|
|
||||||
100000
|
|
||||||
120000
|
|
||||||
200000
|
|
||||||
210000
|
|
||||||
500001
|
|
||||||
1000000
|
|
||||||
1001000
|
|
||||||
1200000
|
|
||||||
2000000
|
|
||||||
2100000
|
|
||||||
5000001
|
|
||||||
10000000
|
|
||||||
10001000
|
|
||||||
12000000
|
|
||||||
20000000
|
|
||||||
50000001
|
|
||||||
100000000
|
|
||||||
100001000
|
|
||||||
120000000
|
|
||||||
200000000
|
|
||||||
500000001
|
|
||||||
1000000000
|
|
||||||
1000001000
|
|
||||||
1200000000
|
|
||||||
2000000000
|
|
||||||
5000000001
|
|
||||||
10000000000
|
|
||||||
10000001000
|
|
||||||
12000000000
|
|
||||||
20000000000
|
|
||||||
50000000001
|
|
||||||
100000000000
|
|
||||||
100000001000
|
|
||||||
120000000000
|
|
||||||
200000000000
|
|
||||||
500000000001
|
|
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -1,6 +0,0 @@
# Russian covering grammar definitions

This directory defines a Russian text normalization covering grammar. The
primary entry-point is the FST `VERBALIZER`, defined in
`verbalizer/verbalizer.grm` and compiled in the FST archive
`verbalizer/verbalizer.far`.
@ -1,338 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
|
|
||||||
# Utilities for insertion and deletion.
|
|
||||||
|
|
||||||
func I[expr] {
|
|
||||||
return "" : expr;
|
|
||||||
}
|
|
||||||
|
|
||||||
func D[expr] {
|
|
||||||
return expr : "";
|
|
||||||
}
|
|
||||||
|
|
||||||
# Powers of base 10.
|
|
||||||
export POWERS =
|
|
||||||
"[E15]"
|
|
||||||
| "[E14]"
|
|
||||||
| "[E13]"
|
|
||||||
| "[E12]"
|
|
||||||
| "[E11]"
|
|
||||||
| "[E10]"
|
|
||||||
| "[E9]"
|
|
||||||
| "[E8]"
|
|
||||||
| "[E7]"
|
|
||||||
| "[E6]"
|
|
||||||
| "[E5]"
|
|
||||||
| "[E4]"
|
|
||||||
| "[E3]"
|
|
||||||
| "[E2]"
|
|
||||||
| "[E1]"
|
|
||||||
;
|
|
||||||
|
|
||||||
export SIGMA = b.kBytes | POWERS;
|
|
||||||
|
|
||||||
export SIGMA_STAR = SIGMA*;
|
|
||||||
|
|
||||||
export SIGMA_PLUS = SIGMA+;
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
# BEGIN LANGUAGE SPECIFIC DATA
|
|
||||||
revaluations =
|
|
||||||
("[E4]" : "[E1]")
|
|
||||||
| ("[E5]" : "[E2]")
|
|
||||||
| ("[E7]" : "[E1]")
|
|
||||||
| ("[E8]" : "[E2]")
|
|
||||||
;
|
|
||||||
|
|
||||||
Ms = "[E3]" | "[E6]" | "[E9]";
|
|
||||||
|
|
||||||
|
|
||||||
func Zero[expr] {
|
|
||||||
return expr : ("");
|
|
||||||
}
|
|
||||||
|
|
||||||
space = " ";
|
|
||||||
|
|
||||||
lexset3 = Optimize[
|
|
||||||
("1[E1]+1" : "одиннадцати")
|
|
||||||
| ("1[E1]+1" : "одиннадцать")
|
|
||||||
| ("1[E1]+1" : "одиннадцатью")
|
|
||||||
| ("1[E1]+2" : "двенадцати")
|
|
||||||
| ("1[E1]+2" : "двенадцать")
|
|
||||||
| ("1[E1]+2" : "двенадцатью")
|
|
||||||
| ("1[E1]+3" : "тринадцати")
|
|
||||||
| ("1[E1]+3" : "тринадцать")
|
|
||||||
| ("1[E1]+3" : "тринадцатью")
|
|
||||||
| ("1[E1]+4" : "четырнадцати")
|
|
||||||
| ("1[E1]+4" : "четырнадцать")
|
|
||||||
| ("1[E1]+4" : "четырнадцатью")
|
|
||||||
| ("1[E1]+5" : "пятнадцати")
|
|
||||||
| ("1[E1]+5" : "пятнадцать")
|
|
||||||
| ("1[E1]+5" : "пятнадцатью")
|
|
||||||
| ("1[E1]+6" : "шестнадцати")
|
|
||||||
| ("1[E1]+6" : "шестнадцать")
|
|
||||||
| ("1[E1]+6" : "шестнадцатью")
|
|
||||||
| ("1[E1]+7" : "семнадцати")
|
|
||||||
| ("1[E1]+7" : "семнадцать")
|
|
||||||
| ("1[E1]+7" : "семнадцатью")
|
|
||||||
| ("1[E1]+8" : "восемнадцати")
|
|
||||||
| ("1[E1]+8" : "восемнадцать")
|
|
||||||
| ("1[E1]+8" : "восемнадцатью")
|
|
||||||
| ("1[E1]+9" : "девятнадцати")
|
|
||||||
| ("1[E1]+9" : "девятнадцать")
|
|
||||||
| ("1[E1]+9" : "девятнадцатью")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
lexset2 = Optimize[
|
|
||||||
("1[E1]" : "десяти")
|
|
||||||
| ("1[E1]" : "десять")
|
|
||||||
| ("1[E1]" : "десятью")
|
|
||||||
| ("1[E2]" : "ста")
|
|
||||||
| ("1[E2]" : "сто")
|
|
||||||
| ("2[E1]" : "двадцати")
|
|
||||||
| ("2[E1]" : "двадцать")
|
|
||||||
| ("2[E1]" : "двадцатью")
|
|
||||||
| ("2[E2]" : "двести")
|
|
||||||
| ("2[E2]" : "двумстам")
|
|
||||||
| ("2[E2]" : "двумястами")
|
|
||||||
| ("2[E2]" : "двухсот")
|
|
||||||
| ("2[E2]" : "двухстах")
|
|
||||||
| ("3[E1]" : "тридцати")
|
|
||||||
| ("3[E1]" : "тридцать")
|
|
||||||
| ("3[E1]" : "тридцатью")
|
|
||||||
| ("3[E2]" : "тремстам")
|
|
||||||
| ("3[E2]" : "тремястами")
|
|
||||||
| ("3[E2]" : "трехсот")
|
|
||||||
| ("3[E2]" : "трехстах")
|
|
||||||
| ("3[E2]" : "триста")
|
|
||||||
| ("4[E1]" : "сорок")
|
|
||||||
| ("4[E1]" : "сорока")
|
|
||||||
| ("4[E2]" : "четыремстам")
|
|
||||||
| ("4[E2]" : "четыреста")
|
|
||||||
| ("4[E2]" : "четырехсот")
|
|
||||||
| ("4[E2]" : "четырехстах")
|
|
||||||
| ("4[E2]" : "четырьмястами")
|
|
||||||
| ("5[E1]" : "пятидесяти")
|
|
||||||
| ("5[E1]" : "пятьдесят")
|
|
||||||
| ("5[E1]" : "пятьюдесятью")
|
|
||||||
| ("5[E2]" : "пятисот")
|
|
||||||
| ("5[E2]" : "пятистам")
|
|
||||||
| ("5[E2]" : "пятистах")
|
|
||||||
| ("5[E2]" : "пятьсот")
|
|
||||||
| ("5[E2]" : "пятьюстами")
|
|
||||||
| ("6[E1]" : "шестидесяти")
|
|
||||||
| ("6[E1]" : "шестьдесят")
|
|
||||||
| ("6[E1]" : "шестьюдесятью")
|
|
||||||
| ("6[E2]" : "шестисот")
|
|
||||||
| ("6[E2]" : "шестистам")
|
|
||||||
| ("6[E2]" : "шестистах")
|
|
||||||
| ("6[E2]" : "шестьсот")
|
|
||||||
| ("6[E2]" : "шестьюстами")
|
|
||||||
| ("7[E1]" : "семидесяти")
|
|
||||||
| ("7[E1]" : "семьдесят")
|
|
||||||
| ("7[E1]" : "семьюдесятью")
|
|
||||||
| ("7[E2]" : "семисот")
|
|
||||||
| ("7[E2]" : "семистам")
|
|
||||||
| ("7[E2]" : "семистах")
|
|
||||||
| ("7[E2]" : "семьсот")
|
|
||||||
| ("7[E2]" : "семьюстами")
|
|
||||||
| ("8[E1]" : "восемьдесят")
|
|
||||||
| ("8[E1]" : "восьмидесяти")
|
|
||||||
| ("8[E1]" : "восьмьюдесятью")
|
|
||||||
| ("8[E2]" : "восемьсот")
|
|
||||||
| ("8[E2]" : "восемьюстами")
|
|
||||||
| ("8[E2]" : "восьмисот")
|
|
||||||
| ("8[E2]" : "восьмистам")
|
|
||||||
| ("8[E2]" : "восьмистах")
|
|
||||||
| ("8[E2]" : "восьмьюстами")
|
|
||||||
| ("9[E1]" : "девяноста")
|
|
||||||
| ("9[E1]" : "девяносто")
|
|
||||||
| ("9[E2]" : "девятисот")
|
|
||||||
| ("9[E2]" : "девятистам")
|
|
||||||
| ("9[E2]" : "девятистах")
|
|
||||||
| ("9[E2]" : "девятьсот")
|
|
||||||
| ("9[E2]" : "девятьюстами")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
lexset1 = Optimize[
|
|
||||||
("+" : "")
|
|
||||||
| ("1" : "один")
|
|
||||||
| ("1" : "одна")
|
|
||||||
| ("1" : "одни")
|
|
||||||
| ("1" : "одним")
|
|
||||||
| ("1" : "одними")
|
|
||||||
| ("1" : "одних")
|
|
||||||
| ("1" : "одно")
|
|
||||||
| ("1" : "одного")
|
|
||||||
| ("1" : "одной")
|
|
||||||
| ("1" : "одном")
|
|
||||||
| ("1" : "одному")
|
|
||||||
| ("1" : "одною")
|
|
||||||
| ("1" : "одну")
|
|
||||||
| ("2" : "два")
|
|
||||||
| ("2" : "две")
|
|
||||||
| ("2" : "двум")
|
|
||||||
| ("2" : "двумя")
|
|
||||||
| ("2" : "двух")
|
|
||||||
| ("3" : "трем")
|
|
||||||
| ("3" : "тремя")
|
|
||||||
| ("3" : "трех")
|
|
||||||
| ("3" : "три")
|
|
||||||
| ("4" : "четыре")
|
|
||||||
| ("4" : "четырем")
|
|
||||||
| ("4" : "четырех")
|
|
||||||
| ("4" : "четырьмя")
|
|
||||||
| ("5" : "пяти")
|
|
||||||
| ("5" : "пять")
|
|
||||||
| ("5" : "пятью")
|
|
||||||
| ("6" : "шести")
|
|
||||||
| ("6" : "шесть")
|
|
||||||
| ("6" : "шестью")
|
|
||||||
| ("7" : "семи")
|
|
||||||
| ("7" : "семь")
|
|
||||||
| ("7" : "семью")
|
|
||||||
| ("8" : "восемь")
|
|
||||||
| ("8" : "восьми")
|
|
||||||
| ("8" : "восьмью")
|
|
||||||
| ("9" : "девяти")
|
|
||||||
| ("9" : "девять")
|
|
||||||
| ("9" : "девятью")
|
|
||||||
| ("[E3]" : "тысяч")
|
|
||||||
| ("[E3]" : "тысяча")
|
|
||||||
| ("[E3]" : "тысячам")
|
|
||||||
| ("[E3]" : "тысячами")
|
|
||||||
| ("[E3]" : "тысячах")
|
|
||||||
| ("[E3]" : "тысяче")
|
|
||||||
| ("[E3]" : "тысячей")
|
|
||||||
| ("[E3]" : "тысячи")
|
|
||||||
| ("[E3]" : "тысячу")
|
|
||||||
| ("[E3]" : "тысячью")
|
|
||||||
| ("[E6]" : "миллион")
|
|
||||||
| ("[E6]" : "миллиона")
|
|
||||||
| ("[E6]" : "миллионам")
|
|
||||||
| ("[E6]" : "миллионами")
|
|
||||||
| ("[E6]" : "миллионах")
|
|
||||||
| ("[E6]" : "миллионе")
|
|
||||||
| ("[E6]" : "миллионов")
|
|
||||||
| ("[E6]" : "миллионом")
|
|
||||||
| ("[E6]" : "миллиону")
|
|
||||||
| ("[E6]" : "миллионы")
|
|
||||||
| ("[E9]" : "миллиард")
|
|
||||||
| ("[E9]" : "миллиарда")
|
|
||||||
| ("[E9]" : "миллиардам")
|
|
||||||
| ("[E9]" : "миллиардами")
|
|
||||||
| ("[E9]" : "миллиардах")
|
|
||||||
| ("[E9]" : "миллиарде")
|
|
||||||
| ("[E9]" : "миллиардов")
|
|
||||||
| ("[E9]" : "миллиардом")
|
|
||||||
| ("[E9]" : "миллиарду")
|
|
||||||
| ("[E9]" : "миллиарды")
|
|
||||||
| ("|0|" : "ноле")
|
|
||||||
| ("|0|" : "нолем")
|
|
||||||
| ("|0|" : "ноль")
|
|
||||||
| ("|0|" : "нолю")
|
|
||||||
| ("|0|" : "ноля")
|
|
||||||
| ("|0|" : "нуле")
|
|
||||||
| ("|0|" : "нулем")
|
|
||||||
| ("|0|" : "нуль")
|
|
||||||
| ("|0|" : "нулю")
|
|
||||||
| ("|0|" : "нуля")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
|
||||||
|
|
||||||
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
|
||||||
|
|
||||||
# END LANGUAGE SPECIFIC DATA
|
|
||||||
################################################################################
|
|
||||||
# Inserts a marker after the Ms.
|
|
||||||
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes all powers and "+".
|
|
||||||
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes trailing zeros at the beginning of a number, so that "0003" does not
|
|
||||||
# get treated as an ordinary number.
|
|
||||||
export DELETE_INITIAL_ZEROS =
|
|
||||||
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
|
||||||
;
|
|
||||||
|
|
||||||
NonMs = Optimize[POWERS - Ms];
|
|
||||||
|
|
||||||
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted.
|
|
||||||
export DELETE_INTERMEDIATE_ZEROS1 =
|
|
||||||
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
|
||||||
# that and the previous boundary. Thus, if after the result of the rule above we
|
|
||||||
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
|
||||||
# zero.
|
|
||||||
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
|
||||||
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Final clean up of stray zeros.
|
|
||||||
export DELETE_REMAINING_ZEROS = Optimize[
|
|
||||||
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Applies the revaluation map. For example in English, changes [E4] to [E1] as a
|
|
||||||
# modifier of [E3].
|
|
||||||
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes the various marks and powers in the input and output.
|
|
||||||
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
export CLEAN_SPACES = Optimize[
|
|
||||||
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
|
||||||
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
d = b.kDigit;
|
|
||||||
|
|
||||||
# Germanic inversion rule.
|
|
||||||
germanic =
|
|
||||||
(I["1+"] d "[E1]" D["+1"])
|
|
||||||
| (I["2+"] d "[E1]" D["+2"])
|
|
||||||
| (I["3+"] d "[E1]" D["+3"])
|
|
||||||
| (I["4+"] d "[E1]" D["+4"])
|
|
||||||
| (I["5+"] d "[E1]" D["+5"])
|
|
||||||
| (I["6+"] d "[E1]" D["+6"])
|
|
||||||
| (I["7+"] d "[E1]" D["+7"])
|
|
||||||
| (I["8+"] d "[E1]" D["+8"])
|
|
||||||
| (I["9+"] d "[E1]" D["+9"])
|
|
||||||
;
|
|
||||||
|
|
||||||
germanic_inversion =
|
|
||||||
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
|
||||||
;
|
|
||||||
|
|
||||||
export GERMANIC_INVERSION = SIGMA_STAR;
|
|
||||||
export ORDINAL_RESTRICTION = SIGMA_STAR;
|
|
||||||
nondigits = b.kBytes - b.kDigit;
|
|
||||||
export ORDINAL_SUFFIX = D[nondigits*];
|
|
|
@ -1,35 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'ru/verbalizer/numbers.grm' as n;

digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");

export DIGITS = digit (n.I[" "] digit)*;

# Various common factorizations

two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;

three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;

mixed =
    (digit n.I[" "] two_digits)
  | (two_digits n.I[" "] two_digits)
  | (two_digits n.I[" "] three_digits)
  | (two_digits n.I[" "] two_digits n.I[" "] two_digits)
;

export MIXED_NUMBERS = Optimize[mixed];
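DIGITS above reads a number digit by digit, inserting a space between readings and giving "0" its own @@OTHER_ZERO_VERBALIZATIONS@@ token. A rough pynini sketch of that pattern, with a two-word digit map assumed in place of the full CARDINAL_NUMBERS grammar:

    # Sketch of the DIGITS pattern: each digit verbalized on its own, with
    # a space inserted between readings. The digit map is an assumption.
    import pynini

    digit = pynini.union(
        pynini.string_map([("1", "один"), ("2", "два")]),
        pynini.cross("0", "@@OTHER_ZERO_VERBALIZATIONS@@"),
    )
    insert_space = pynini.cross("", " ")
    DIGITS = (digit + pynini.closure(insert_space + digit)).optimize()

    print(pynini.shortestpath("102" @ DIGITS).string())
    # -> "один @@OTHER_ZERO_VERBALIZATIONS@@ два"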
@ -1,40 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;
import 'util/util.grm' as u;
import 'ru/verbalizer/numbers.grm' as n;

func ToNumberName[expr] {
  number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
  return Optimize[expr @ number_name_seq];
}

d = b.kDigit;

leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];

by_ones = d n.I[" "];
by_twos = (d{2} @ leading_zero) n.I[" "];
by_threes = (d{3} @ leading_zero) n.I[" "];

groupings = by_twos* (by_threes | by_twos | by_ones);

export FRACTIONAL_PART_UNGROUPED =
  Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_GROUPED =
  Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
;
export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];
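The grouping scheme above reads a decimal's fractional digits in pairs, allows a final group of one, two or three digits, and splits a leading zero off a group so that "05" is read as "0 5". The FST admits several factorizations; the plain-Python sketch below picks one deterministically and uses hypothetical helper names.

    # One deterministic rendering of the grouping scheme above, for
    # illustration only; the FST itself is nondeterministic.
    def group_fractional_digits(digits: str) -> list[str]:
        groups, i = [], 0
        while len(digits) - i > 3:
            groups.append(digits[i:i + 2])
            i += 2
        groups.append(digits[i:])  # final group of one to three digits
        return groups

    def expand_leading_zero(group: str) -> str:
        return "0 " + group[1:] if group.startswith("0") and len(group) > 1 else group

    print([expand_leading_zero(g) for g in group_fractional_digits("140522")])
    # -> ['14', '0 5', '22']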
@ -1,30 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'ru/verbalizer/factorization.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;

fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;

__fractional_part__ = fractional_part_unparsed;
__decimal_marker__ = ",";

export FLOAT = Optimize[
  (n.CARDINAL_NUMBERS
  (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
  __fractional_part__) @ l.LEXICAL_MAP]
;
Binary file not shown.
@ -1,25 +0,0 @@
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import 'util/byte.grm' as b;

lexical_map = StringFile['ru/verbalizer/lexical_map.tsv'];

sigma_star = b.kBytes*;

del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];

export LEXICAL_MAP = Optimize[
  CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
;
@ -1,34 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'ru/verbalizer/float.grm' as f;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
float = f.FLOAT;
|
|
||||||
card = n.CARDINAL_NUMBERS;
|
|
||||||
number = card | float;
|
|
||||||
|
|
||||||
plus = "+" : " @@ARITHMETIC_PLUS@@ ";
|
|
||||||
times = "*" : " @@ARITHMETIC_TIMES@@ ";
|
|
||||||
minus = "-" : " @@ARITHMETIC_MINUS@@ ";
|
|
||||||
division = "/" : " @@ARITHMETIC_DIVISION@@ ";
|
|
||||||
|
|
||||||
operator = plus | times | minus | division;
|
|
||||||
|
|
||||||
percent = "%" : " @@PERCENT@@";
|
|
||||||
|
|
||||||
export ARITHMETIC =
|
|
||||||
Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
|
|
||||||
;
|
|
@ -1,78 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'ru/classifier/cyrillic.grm' as c;
|
|
||||||
import 'ru/verbalizer/extra_numbers.grm' as e;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
import 'ru/verbalizer/spelled.grm' as s;
|
|
||||||
|
|
||||||
letter = b.kAlpha | c.kCyrillicAlpha;
|
|
||||||
dash = "-";
|
|
||||||
word = letter+;
|
|
||||||
possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
|
|
||||||
|
|
||||||
post_word_symbol =
|
|
||||||
("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
|
|
||||||
("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
|
|
||||||
("*" : "@@STAR@@")
|
|
||||||
;
|
|
||||||
|
|
||||||
pre_word_symbol =
|
|
||||||
("@" : "@@AT@@") |
|
|
||||||
("/" : "@@SLASH@@") |
|
|
||||||
("#" : "@@HASH@@")
|
|
||||||
;
|
|
||||||
|
|
||||||
post_word = possibly_split_word n.I[" "] post_word_symbol;
|
|
||||||
|
|
||||||
pre_word = pre_word_symbol n.I[" "] possibly_split_word;
|
|
||||||
|
|
||||||
## Number/digit sequence combos, maybe with a dash
|
|
||||||
|
|
||||||
spelled_word = word @ s.SPELLED_NO_LETTER;
|
|
||||||
|
|
||||||
word_number =
|
|
||||||
(word | spelled_word)
|
|
||||||
(n.I[" "] | (dash : " "))
|
|
||||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
|
||||||
;
|
|
||||||
|
|
||||||
number_word =
|
|
||||||
(e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
|
|
||||||
(n.I[" "] | (dash : " "))
|
|
||||||
(word | spelled_word)
|
|
||||||
;
|
|
||||||
|
|
||||||
## Two-digit year.
|
|
||||||
|
|
||||||
# Note that in this case to be fair we really have to allow ordinals too since
|
|
||||||
# in some languages that's what you would have.
|
|
||||||
|
|
||||||
two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
|
|
||||||
|
|
||||||
dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
|
|
||||||
|
|
||||||
miscellaneous = Optimize[
|
|
||||||
possibly_split_word
|
|
||||||
| post_word
|
|
||||||
| pre_word
|
|
||||||
| word_number
|
|
||||||
| number_word
|
|
||||||
| two_digit_year
|
|
||||||
| dot_com
|
|
||||||
];
|
|
||||||
|
|
||||||
export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
|
|
@ -1,44 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
card = n.CARDINAL_NUMBERS;
|
|
||||||
|
|
||||||
__currency__ = StringFile['ru/verbalizer/money.tsv'];
|
|
||||||
|
|
||||||
d = b.kDigit;
|
|
||||||
D = d - "0";
|
|
||||||
|
|
||||||
cents = ((n.D["0"] | D) d) @ card;
|
|
||||||
|
|
||||||
# Only dollar for the verbalizer tests for English. Will need to add other
|
|
||||||
# currencies.
|
|
||||||
usd_maj = Project["usd_maj" @ __currency__, 'output'];
|
|
||||||
usd_min = Project["usd_min" @ __currency__, 'output'];
|
|
||||||
and = " @@MONEY_AND@@ " | " ";
|
|
||||||
|
|
||||||
dollar1 =
|
|
||||||
n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
|
|
||||||
;
|
|
||||||
|
|
||||||
dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
|
|
||||||
|
|
||||||
dollar3 = n.D["$"] card n.I[" " usd_maj];
|
|
||||||
|
|
||||||
dollar = Optimize[dollar1 | dollar2 | dollar3];
|
|
||||||
|
|
||||||
export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
|
|
|
|
@ -1,48 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Russian minimally supervised number grammar.
|
|
||||||
#
|
|
||||||
# Supports cardinals and ordinals in all inflected forms.
|
|
||||||
#
|
|
||||||
# The language-specific acceptor G was compiled with digit, teen, decade,
|
|
||||||
# century, and big power-of-ten preterminals. The lexicon transducer is
|
|
||||||
# highly ambiguous, but no LM is used.
|
|
||||||
|
|
||||||
import 'util/arithmetic.grm' as a;
|
|
||||||
|
|
||||||
# Intersects the universal factorization transducer (F) with language-specific
|
|
||||||
# acceptor (G).
|
|
||||||
|
|
||||||
d = a.DELTA_STAR;
|
|
||||||
f = a.IARITHMETIC_RESTRICTED;
|
|
||||||
g = LoadFst['ru/verbalizer/g.fst'];
|
|
||||||
fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
|
|
||||||
test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];
|
|
||||||
|
|
||||||
# Compiles lexicon transducers (L).
|
|
||||||
|
|
||||||
cardinal_name = StringFile['ru/verbalizer/cardinals.tsv'];
|
|
||||||
cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
|
|
||||||
|
|
||||||
ordinal_name = StringFile['ru/verbalizer/ordinals.tsv'];
|
|
||||||
ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
|
|
||||||
|
|
||||||
# Composes L with the leaf transducer (P), then composes that with FG.
|
|
||||||
|
|
||||||
p = a.LEAVES;
|
|
||||||
|
|
||||||
export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
|
|
||||||
|
|
||||||
export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
|
|
@ -1,68 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'ru/verbalizer/number_names.grm' as n;
|
|
||||||
import 'universal/thousands_punct.grm' as t;
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
|
|
||||||
nominatives = StringFile['ru/verbalizer/nominatives.tsv'];
|
|
||||||
|
|
||||||
sigma_star = b.kBytes*;
|
|
||||||
|
|
||||||
nominative_filter =
|
|
||||||
CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star]
|
|
||||||
;
|
|
||||||
|
|
||||||
cardinal = n.CARDINAL_NUMBER_NAME;
|
|
||||||
ordinal = n.ORDINAL_NUMBER_NAME;
|
|
||||||
|
|
||||||
# Putting these here since this grammar gets incorporated by all the others.
|
|
||||||
|
|
||||||
func I[expr] {
|
|
||||||
return "" : expr;
|
|
||||||
}
|
|
||||||
|
|
||||||
func D[expr] {
|
|
||||||
return expr : "";
|
|
||||||
}
|
|
||||||
|
|
||||||
# Since we know this is the default for Russian, it's fair game to set it.
|
|
||||||
separators = t.dot_thousands | t.no_delimiter;
|
|
||||||
|
|
||||||
export CARDINAL_NUMBERS = Optimize[
|
|
||||||
separators
|
|
||||||
@ cardinal
|
|
||||||
];
|
|
||||||
|
|
||||||
export ORDINAL_NUMBERS_UNMARKED = Optimize[
|
|
||||||
separators
|
|
||||||
@ ordinal
|
|
||||||
];
|
|
||||||
|
|
||||||
|
|
||||||
endings = StringFile['ru/verbalizer/ordinal_endings.tsv'];
|
|
||||||
|
|
||||||
not_dash = (b.kBytes - "-")+;
|
|
||||||
del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star];
|
|
||||||
|
|
||||||
# Needs nominative_filter here if we take out Kyle's models.
|
|
||||||
export ORDINAL_NUMBERS_MARKED = Optimize[
|
|
||||||
Optimize[Optimize[separators @ ordinal] "-" not_dash]
|
|
||||||
@ Optimize[sigma_star endings]
|
|
||||||
@ del_ending]
|
|
||||||
;
|
|
||||||
|
|
||||||
export ORDINAL_NUMBERS =
|
|
||||||
Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
|
|
||||||
;
|
|
@ -1,133 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Grammar for things built mostly on numbers.
|
|
||||||
|
|
||||||
import 'ru/verbalizer/factorization.grm' as f;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
num = n.CARDINAL_NUMBERS;
|
|
||||||
ord = n.ORDINAL_NUMBERS_UNMARKED;
|
|
||||||
digits = f.FRACTIONAL_PART_UNGROUPED;
|
|
||||||
|
|
||||||
# Various symbols.
|
|
||||||
|
|
||||||
plus = "+" : "@@ARITHMETIC_PLUS@@";
|
|
||||||
minus = "-" : "@@ARITHMETIC_MINUS@@";
|
|
||||||
slash = "/" : "@@SLASH@@";
|
|
||||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
|
||||||
dash = "-" : "@@DASH@@";
|
|
||||||
equals = "=" : "@@ARITHMETIC_EQUALS@@";
|
|
||||||
|
|
||||||
degree = "°" : "@@DEGREE@@";
|
|
||||||
|
|
||||||
division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
|
|
||||||
|
|
||||||
times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
|
|
||||||
|
|
||||||
power = "^" : "@@DECIMAL_EXPONENT@@";
|
|
||||||
|
|
||||||
square_root = "√" : "@@SQUARE_ROOT@@";
|
|
||||||
|
|
||||||
percent = "%" : "@@PERCENT@@";
|
|
||||||
|
|
||||||
# Safe roman numbers.
|
|
||||||
|
|
||||||
# NB: Do not change the formatting here. NO_EDIT must be on the same
|
|
||||||
# line as the path.
|
|
||||||
rfile =
|
|
||||||
'universal/roman_numerals.tsv' # NO_EDIT
|
|
||||||
;
|
|
||||||
|
|
||||||
roman = StringFile[rfile];
|
|
||||||
|
|
||||||
## Main categories.
|
|
||||||
|
|
||||||
cat_dot_number =
|
|
||||||
num
|
|
||||||
n.I[" "] dot n.I[" "] num
|
|
||||||
(n.I[" "] dot n.I[" "] num)+
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_slash_number =
|
|
||||||
num
|
|
||||||
n.I[" "] slash n.I[" "] num
|
|
||||||
(n.I[" "] slash n.I[" "] num)*
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_dash_number =
|
|
||||||
num
|
|
||||||
n.I[" "] dash n.I[" "] num
|
|
||||||
(n.I[" "] dash n.I[" "] num)*
|
|
||||||
;
|
|
||||||
|
|
||||||
cat_signed_number = ((plus | minus) n.I[" "])? num;
|
|
||||||
|
|
||||||
cat_degree = cat_signed_number n.I[" "] degree;
|
|
||||||
|
|
||||||
cat_country_code = plus n.I[" "] (num | digits);
|
|
||||||
|
|
||||||
cat_math_operations =
|
|
||||||
plus
|
|
||||||
| minus
|
|
||||||
| division
|
|
||||||
| times
|
|
||||||
| equals
|
|
||||||
| percent
|
|
||||||
| power
|
|
||||||
| square_root
|
|
||||||
;
|
|
||||||
|
|
||||||
# Roman numbers are often either cardinals or ordinals in various languages.
|
|
||||||
cat_roman = roman @ (num | ord);
|
|
||||||
|
|
||||||
# Allow
|
|
||||||
#
|
|
||||||
# number:number
|
|
||||||
# number-number
|
|
||||||
#
|
|
||||||
# to just be
|
|
||||||
#
|
|
||||||
# number number.
|
|
||||||
|
|
||||||
cat_number_number =
|
|
||||||
num ((":" | "-") : " ") num
|
|
||||||
;
|
|
||||||
|
|
||||||
# Some additional readings for these symbols.
|
|
||||||
|
|
||||||
cat_additional_readings =
|
|
||||||
("/" : "@@PER@@") |
|
|
||||||
("+" : "@@AND@@") |
|
|
||||||
("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |
|
|
||||||
("*" : "@@STAR@@") |
|
|
||||||
("x" : ("x" | "@@CONNECTOR_BY@@")) |
|
|
||||||
("@" : "@@AT@@")
|
|
||||||
;
|
|
||||||
|
|
||||||
numbers_plus = Optimize[
|
|
||||||
cat_dot_number
|
|
||||||
| cat_slash_number
|
|
||||||
| cat_dash_number
|
|
||||||
| cat_signed_number
|
|
||||||
| cat_degree
|
|
||||||
| cat_country_code
|
|
||||||
| cat_math_operations
|
|
||||||
| cat_roman
|
|
||||||
| cat_number_number
|
|
||||||
| cat_additional_readings
|
|
||||||
];
|
|
||||||
|
|
||||||
export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
|
|
|
@ -1,804 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
# AUTOMATICALLY GENERATED: DO NOT EDIT.
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
|
|
||||||
# Utilities for insertion and deletion.
|
|
||||||
|
|
||||||
func I[expr] {
|
|
||||||
return "" : expr;
|
|
||||||
}
|
|
||||||
|
|
||||||
func D[expr] {
|
|
||||||
return expr : "";
|
|
||||||
}
|
|
||||||
|
|
||||||
# Powers of base 10.
|
|
||||||
export POWERS =
|
|
||||||
"[E15]"
|
|
||||||
| "[E14]"
|
|
||||||
| "[E13]"
|
|
||||||
| "[E12]"
|
|
||||||
| "[E11]"
|
|
||||||
| "[E10]"
|
|
||||||
| "[E9]"
|
|
||||||
| "[E8]"
|
|
||||||
| "[E7]"
|
|
||||||
| "[E6]"
|
|
||||||
| "[E5]"
|
|
||||||
| "[E4]"
|
|
||||||
| "[E3]"
|
|
||||||
| "[E2]"
|
|
||||||
| "[E1]"
|
|
||||||
;
|
|
||||||
|
|
||||||
export SIGMA = b.kBytes | POWERS;
|
|
||||||
|
|
||||||
export SIGMA_STAR = SIGMA*;
|
|
||||||
|
|
||||||
export SIGMA_PLUS = SIGMA+;
|
|
||||||
|
|
||||||
################################################################################
|
|
||||||
# BEGIN LANGUAGE SPECIFIC DATA
|
|
||||||
revaluations =
|
|
||||||
("[E4]" : "[E1]")
|
|
||||||
| ("[E5]" : "[E2]")
|
|
||||||
| ("[E7]" : "[E1]")
|
|
||||||
| ("[E8]" : "[E2]")
|
|
||||||
;
|
|
||||||
|
|
||||||
Ms = "[E3]" | "[E6]" | "[E9]";
|
|
||||||
|
|
||||||
|
|
||||||
func Zero[expr] {
|
|
||||||
return expr : ("");
|
|
||||||
}
|
|
||||||
|
|
||||||
space = " ";
|
|
||||||
|
|
||||||
lexset3 = Optimize[
|
|
||||||
("1[E1]+1" : "одиннадцатая@")
|
|
||||||
| ("1[E1]+1" : "одиннадцати")
|
|
||||||
| ("1[E1]+1" : "одиннадцатого@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатое@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатой@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатом@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатому@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатую@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатые@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатый@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатым@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатыми@")
|
|
||||||
| ("1[E1]+1" : "одиннадцатых@")
|
|
||||||
| ("1[E1]+1" : "одиннадцать")
|
|
||||||
| ("1[E1]+1" : "одиннадцатью")
|
|
||||||
| ("1[E1]+2" : "двенадцатая@")
|
|
||||||
| ("1[E1]+2" : "двенадцати")
|
|
||||||
| ("1[E1]+2" : "двенадцатого@")
|
|
||||||
| ("1[E1]+2" : "двенадцатое@")
|
|
||||||
| ("1[E1]+2" : "двенадцатой@")
|
|
||||||
| ("1[E1]+2" : "двенадцатом@")
|
|
||||||
| ("1[E1]+2" : "двенадцатому@")
|
|
||||||
| ("1[E1]+2" : "двенадцатую@")
|
|
||||||
| ("1[E1]+2" : "двенадцатые@")
|
|
||||||
| ("1[E1]+2" : "двенадцатый@")
|
|
||||||
| ("1[E1]+2" : "двенадцатым@")
|
|
||||||
| ("1[E1]+2" : "двенадцатыми@")
|
|
||||||
| ("1[E1]+2" : "двенадцатых@")
|
|
||||||
| ("1[E1]+2" : "двенадцать")
|
|
||||||
| ("1[E1]+2" : "двенадцатью")
|
|
||||||
| ("1[E1]+3" : "тринадцатая@")
|
|
||||||
| ("1[E1]+3" : "тринадцати")
|
|
||||||
| ("1[E1]+3" : "тринадцатого@")
|
|
||||||
| ("1[E1]+3" : "тринадцатое@")
|
|
||||||
| ("1[E1]+3" : "тринадцатой@")
|
|
||||||
| ("1[E1]+3" : "тринадцатом@")
|
|
||||||
| ("1[E1]+3" : "тринадцатому@")
|
|
||||||
| ("1[E1]+3" : "тринадцатую@")
|
|
||||||
| ("1[E1]+3" : "тринадцатые@")
|
|
||||||
| ("1[E1]+3" : "тринадцатый@")
|
|
||||||
| ("1[E1]+3" : "тринадцатым@")
|
|
||||||
| ("1[E1]+3" : "тринадцатыми@")
|
|
||||||
| ("1[E1]+3" : "тринадцатых@")
|
|
||||||
| ("1[E1]+3" : "тринадцать")
|
|
||||||
| ("1[E1]+3" : "тринадцатью")
|
|
||||||
| ("1[E1]+4" : "четырнадцатая@")
|
|
||||||
| ("1[E1]+4" : "четырнадцати")
|
|
||||||
| ("1[E1]+4" : "четырнадцатого@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатое@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатой@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатом@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатому@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатую@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатые@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатый@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатым@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатыми@")
|
|
||||||
| ("1[E1]+4" : "четырнадцатых@")
|
|
||||||
| ("1[E1]+4" : "четырнадцать")
|
|
||||||
| ("1[E1]+4" : "четырнадцатью")
|
|
||||||
| ("1[E1]+5" : "пятнадцатая@")
|
|
||||||
| ("1[E1]+5" : "пятнадцати")
|
|
||||||
| ("1[E1]+5" : "пятнадцатого@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатое@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатой@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатом@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатому@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатую@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатые@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатый@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатым@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатыми@")
|
|
||||||
| ("1[E1]+5" : "пятнадцатых@")
|
|
||||||
| ("1[E1]+5" : "пятнадцать")
|
|
||||||
| ("1[E1]+5" : "пятнадцатью")
|
|
||||||
| ("1[E1]+6" : "шестнадцатая@")
|
|
||||||
| ("1[E1]+6" : "шестнадцати")
|
|
||||||
| ("1[E1]+6" : "шестнадцатого@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатое@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатой@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатом@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатому@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатую@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатые@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатый@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатым@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатыми@")
|
|
||||||
| ("1[E1]+6" : "шестнадцатых@")
|
|
||||||
| ("1[E1]+6" : "шестнадцать")
|
|
||||||
| ("1[E1]+6" : "шестнадцатью")
|
|
||||||
| ("1[E1]+7" : "семнадцатая@")
|
|
||||||
| ("1[E1]+7" : "семнадцати")
|
|
||||||
| ("1[E1]+7" : "семнадцатого@")
|
|
||||||
| ("1[E1]+7" : "семнадцатое@")
|
|
||||||
| ("1[E1]+7" : "семнадцатой@")
|
|
||||||
| ("1[E1]+7" : "семнадцатом@")
|
|
||||||
| ("1[E1]+7" : "семнадцатому@")
|
|
||||||
| ("1[E1]+7" : "семнадцатую@")
|
|
||||||
| ("1[E1]+7" : "семнадцатые@")
|
|
||||||
| ("1[E1]+7" : "семнадцатый@")
|
|
||||||
| ("1[E1]+7" : "семнадцатым@")
|
|
||||||
| ("1[E1]+7" : "семнадцатыми@")
|
|
||||||
| ("1[E1]+7" : "семнадцатых@")
|
|
||||||
| ("1[E1]+7" : "семнадцать")
|
|
||||||
| ("1[E1]+7" : "семнадцатью")
|
|
||||||
| ("1[E1]+8" : "восемнадцатая@")
|
|
||||||
| ("1[E1]+8" : "восемнадцати")
|
|
||||||
| ("1[E1]+8" : "восемнадцатого@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатое@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатой@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатом@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатому@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатую@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатые@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатый@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатым@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатыми@")
|
|
||||||
| ("1[E1]+8" : "восемнадцатых@")
|
|
||||||
| ("1[E1]+8" : "восемнадцать")
|
|
||||||
| ("1[E1]+8" : "восемнадцатью")
|
|
||||||
| ("1[E1]+9" : "девятнадцатая@")
|
|
||||||
| ("1[E1]+9" : "девятнадцати")
|
|
||||||
| ("1[E1]+9" : "девятнадцатого@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатое@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатой@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатом@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатому@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатую@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатые@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатый@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатым@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатыми@")
|
|
||||||
| ("1[E1]+9" : "девятнадцатых@")
|
|
||||||
| ("1[E1]+9" : "девятнадцать")
|
|
||||||
| ("1[E1]+9" : "девятнадцатью")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
lexset2 = Optimize[
|
|
||||||
("1[E1]" : "десятая@")
|
|
||||||
| ("1[E1]" : "десяти")
|
|
||||||
| ("1[E1]" : "десятого@")
|
|
||||||
| ("1[E1]" : "десятое@")
|
|
||||||
| ("1[E1]" : "десятой@")
|
|
||||||
| ("1[E1]" : "десятом@")
|
|
||||||
| ("1[E1]" : "десятому@")
|
|
||||||
| ("1[E1]" : "десятую@")
|
|
||||||
| ("1[E1]" : "десятые@")
|
|
||||||
| ("1[E1]" : "десятый@")
|
|
||||||
| ("1[E1]" : "десятым@")
|
|
||||||
| ("1[E1]" : "десятыми@")
|
|
||||||
| ("1[E1]" : "десятых@")
|
|
||||||
| ("1[E1]" : "десять")
|
|
||||||
| ("1[E1]" : "десятью")
|
|
||||||
| ("1[E2]" : "сотая@")
|
|
||||||
| ("1[E2]" : "сотого@")
|
|
||||||
| ("1[E2]" : "сотое@")
|
|
||||||
| ("1[E2]" : "сотой@")
|
|
||||||
| ("1[E2]" : "сотом@")
|
|
||||||
| ("1[E2]" : "сотому@")
|
|
||||||
| ("1[E2]" : "сотую@")
|
|
||||||
| ("1[E2]" : "сотые@")
|
|
||||||
| ("1[E2]" : "сотый@")
|
|
||||||
| ("1[E2]" : "сотым@")
|
|
||||||
| ("1[E2]" : "сотыми@")
|
|
||||||
| ("1[E2]" : "сотых@")
|
|
||||||
| ("1[E2]" : "ста")
|
|
||||||
| ("1[E2]" : "сто")
|
|
||||||
| ("1[E3]" : "тысячная@")
|
|
||||||
| ("1[E3]" : "тысячного@")
|
|
||||||
| ("1[E3]" : "тысячное@")
|
|
||||||
| ("1[E3]" : "тысячной@")
|
|
||||||
| ("1[E3]" : "тысячном@")
|
|
||||||
| ("1[E3]" : "тысячному@")
|
|
||||||
| ("1[E3]" : "тысячную@")
|
|
||||||
| ("1[E3]" : "тысячные@")
|
|
||||||
| ("1[E3]" : "тысячный@")
|
|
||||||
| ("1[E3]" : "тысячным@")
|
|
||||||
| ("1[E3]" : "тысячными@")
|
|
||||||
| ("1[E3]" : "тысячных@")
|
|
||||||
| ("1[E6]" : "миллионная@")
|
|
||||||
| ("1[E6]" : "миллионного@")
|
|
||||||
| ("1[E6]" : "миллионное@")
|
|
||||||
| ("1[E6]" : "миллионной@")
|
|
||||||
| ("1[E6]" : "миллионном@")
|
|
||||||
| ("1[E6]" : "миллионному@")
|
|
||||||
| ("1[E6]" : "миллионную@")
|
|
||||||
| ("1[E6]" : "миллионные@")
|
|
||||||
| ("1[E6]" : "миллионный@")
|
|
||||||
| ("1[E6]" : "миллионным@")
|
|
||||||
| ("1[E6]" : "миллионными@")
|
|
||||||
| ("1[E6]" : "миллионных@")
|
|
||||||
| ("1[E9]" : "миллиардная@")
|
|
||||||
| ("1[E9]" : "миллиардного@")
|
|
||||||
| ("1[E9]" : "миллиардное@")
|
|
||||||
| ("1[E9]" : "миллиардной@")
|
|
||||||
| ("1[E9]" : "миллиардном@")
|
|
||||||
| ("1[E9]" : "миллиардному@")
|
|
||||||
| ("1[E9]" : "миллиардную@")
|
|
||||||
| ("1[E9]" : "миллиардные@")
|
|
||||||
| ("1[E9]" : "миллиардный@")
|
|
||||||
| ("1[E9]" : "миллиардным@")
|
|
||||||
| ("1[E9]" : "миллиардными@")
|
|
||||||
| ("1[E9]" : "миллиардных@")
|
|
||||||
| ("2[E1]" : "двадцатая@")
|
|
||||||
| ("2[E1]" : "двадцати")
|
|
||||||
| ("2[E1]" : "двадцатого@")
|
|
||||||
| ("2[E1]" : "двадцатое@")
|
|
||||||
| ("2[E1]" : "двадцатой@")
|
|
||||||
| ("2[E1]" : "двадцатом@")
|
|
||||||
| ("2[E1]" : "двадцатому@")
|
|
||||||
| ("2[E1]" : "двадцатую@")
|
|
||||||
| ("2[E1]" : "двадцатые@")
|
|
||||||
| ("2[E1]" : "двадцатый@")
|
|
||||||
| ("2[E1]" : "двадцатым@")
|
|
||||||
| ("2[E1]" : "двадцатыми@")
|
|
||||||
| ("2[E1]" : "двадцатых@")
|
|
||||||
| ("2[E1]" : "двадцать")
|
|
||||||
| ("2[E1]" : "двадцатью")
|
|
||||||
| ("2[E2]" : "двести")
|
|
||||||
| ("2[E2]" : "двумстам")
|
|
||||||
| ("2[E2]" : "двумястами")
|
|
||||||
| ("2[E2]" : "двухсот")
|
|
||||||
| ("2[E2]" : "двухсотая@")
|
|
||||||
| ("2[E2]" : "двухсотого@")
|
|
||||||
| ("2[E2]" : "двухсотое@")
|
|
||||||
| ("2[E2]" : "двухсотой@")
|
|
||||||
| ("2[E2]" : "двухсотом@")
|
|
||||||
| ("2[E2]" : "двухсотому@")
|
|
||||||
| ("2[E2]" : "двухсотую@")
|
|
||||||
| ("2[E2]" : "двухсотые@")
|
|
||||||
| ("2[E2]" : "двухсотый@")
|
|
||||||
| ("2[E2]" : "двухсотым@")
|
|
||||||
| ("2[E2]" : "двухсотыми@")
|
|
||||||
| ("2[E2]" : "двухсотых@")
|
|
||||||
| ("2[E2]" : "двухстах")
|
|
||||||
| ("3[E1]" : "тридцатая@")
|
|
||||||
| ("3[E1]" : "тридцати")
|
|
||||||
| ("3[E1]" : "тридцатого@")
|
|
||||||
| ("3[E1]" : "тридцатое@")
|
|
||||||
| ("3[E1]" : "тридцатой@")
|
|
||||||
| ("3[E1]" : "тридцатом@")
|
|
||||||
| ("3[E1]" : "тридцатому@")
|
|
||||||
| ("3[E1]" : "тридцатую@")
|
|
||||||
| ("3[E1]" : "тридцатые@")
|
|
||||||
| ("3[E1]" : "тридцатый@")
|
|
||||||
| ("3[E1]" : "тридцатым@")
|
|
||||||
| ("3[E1]" : "тридцатыми@")
|
|
||||||
| ("3[E1]" : "тридцатых@")
|
|
||||||
| ("3[E1]" : "тридцать")
|
|
||||||
| ("3[E1]" : "тридцатью")
|
|
||||||
| ("3[E2]" : "тремстам")
|
|
||||||
| ("3[E2]" : "тремястами")
|
|
||||||
| ("3[E2]" : "трехсот")
|
|
||||||
| ("3[E2]" : "трехсотая@")
|
|
||||||
| ("3[E2]" : "трехсотого@")
|
|
||||||
| ("3[E2]" : "трехсотое@")
|
|
||||||
| ("3[E2]" : "трехсотой@")
|
|
||||||
| ("3[E2]" : "трехсотом@")
|
|
||||||
| ("3[E2]" : "трехсотому@")
|
|
||||||
| ("3[E2]" : "трехсотую@")
|
|
||||||
| ("3[E2]" : "трехсотые@")
|
|
||||||
| ("3[E2]" : "трехсотый@")
|
|
||||||
| ("3[E2]" : "трехсотым@")
|
|
||||||
| ("3[E2]" : "трехсотыми@")
|
|
||||||
| ("3[E2]" : "трехсотых@")
|
|
||||||
| ("3[E2]" : "трехстах")
|
|
||||||
| ("3[E2]" : "триста")
|
|
||||||
| ("4[E1]" : "сорок")
|
|
||||||
| ("4[E1]" : "сорока")
|
|
||||||
| ("4[E1]" : "сороковая@")
|
|
||||||
| ("4[E1]" : "сорокового@")
|
|
||||||
| ("4[E1]" : "сороковое@")
|
|
||||||
| ("4[E1]" : "сороковой@")
|
|
||||||
| ("4[E1]" : "сороковом@")
|
|
||||||
| ("4[E1]" : "сороковому@")
|
|
||||||
| ("4[E1]" : "сороковую@")
|
|
||||||
| ("4[E1]" : "сороковые@")
|
|
||||||
| ("4[E1]" : "сороковым@")
|
|
||||||
| ("4[E1]" : "сороковыми@")
|
|
||||||
| ("4[E1]" : "сороковых@")
|
|
||||||
| ("4[E2]" : "четыремстам")
|
|
||||||
| ("4[E2]" : "четыреста")
|
|
||||||
| ("4[E2]" : "четырехсот")
|
|
||||||
| ("4[E2]" : "четырехсотая@")
|
|
||||||
| ("4[E2]" : "четырехсотого@")
|
|
||||||
| ("4[E2]" : "четырехсотое@")
|
|
||||||
| ("4[E2]" : "четырехсотой@")
|
|
||||||
| ("4[E2]" : "четырехсотом@")
|
|
||||||
| ("4[E2]" : "четырехсотому@")
|
|
||||||
| ("4[E2]" : "четырехсотую@")
|
|
||||||
| ("4[E2]" : "четырехсотые@")
|
|
||||||
| ("4[E2]" : "четырехсотый@")
|
|
||||||
| ("4[E2]" : "четырехсотым@")
|
|
||||||
| ("4[E2]" : "четырехсотыми@")
|
|
||||||
| ("4[E2]" : "четырехсотых@")
|
|
||||||
| ("4[E2]" : "четырехстах")
|
|
||||||
| ("4[E2]" : "четырьмястами")
|
|
||||||
| ("5[E1]" : "пятидесятая@")
|
|
||||||
| ("5[E1]" : "пятидесяти")
|
|
||||||
| ("5[E1]" : "пятидесятого@")
|
|
||||||
| ("5[E1]" : "пятидесятое@")
|
|
||||||
| ("5[E1]" : "пятидесятой@")
|
|
||||||
| ("5[E1]" : "пятидесятом@")
|
|
||||||
| ("5[E1]" : "пятидесятому@")
|
|
||||||
| ("5[E1]" : "пятидесятую@")
|
|
||||||
| ("5[E1]" : "пятидесятые@")
|
|
||||||
| ("5[E1]" : "пятидесятый@")
|
|
||||||
| ("5[E1]" : "пятидесятым@")
|
|
||||||
| ("5[E1]" : "пятидесятыми@")
|
|
||||||
| ("5[E1]" : "пятидесятых@")
|
|
||||||
| ("5[E1]" : "пятьдесят")
|
|
||||||
| ("5[E1]" : "пятьюдесятью")
|
|
||||||
| ("5[E2]" : "пятисот")
|
|
||||||
| ("5[E2]" : "пятисотая@")
|
|
||||||
| ("5[E2]" : "пятисотого@")
|
|
||||||
| ("5[E2]" : "пятисотое@")
|
|
||||||
| ("5[E2]" : "пятисотой@")
|
|
||||||
| ("5[E2]" : "пятисотом@")
|
|
||||||
| ("5[E2]" : "пятисотому@")
|
|
||||||
| ("5[E2]" : "пятисотую@")
|
|
||||||
| ("5[E2]" : "пятисотые@")
|
|
||||||
| ("5[E2]" : "пятисотый@")
|
|
||||||
| ("5[E2]" : "пятисотым@")
|
|
||||||
| ("5[E2]" : "пятисотыми@")
|
|
||||||
| ("5[E2]" : "пятисотых@")
|
|
||||||
| ("5[E2]" : "пятистам")
|
|
||||||
| ("5[E2]" : "пятистах")
|
|
||||||
| ("5[E2]" : "пятьсот")
|
|
||||||
| ("5[E2]" : "пятьюстами")
|
|
||||||
| ("6[E1]" : "шестидесятая@")
|
|
||||||
| ("6[E1]" : "шестидесяти")
|
|
||||||
| ("6[E1]" : "шестидесятого@")
|
|
||||||
| ("6[E1]" : "шестидесятое@")
|
|
||||||
| ("6[E1]" : "шестидесятой@")
|
|
||||||
| ("6[E1]" : "шестидесятом@")
|
|
||||||
| ("6[E1]" : "шестидесятому@")
|
|
||||||
| ("6[E1]" : "шестидесятую@")
|
|
||||||
| ("6[E1]" : "шестидесятые@")
|
|
||||||
| ("6[E1]" : "шестидесятый@")
|
|
||||||
| ("6[E1]" : "шестидесятым@")
|
|
||||||
| ("6[E1]" : "шестидесятыми@")
|
|
||||||
| ("6[E1]" : "шестидесятых@")
|
|
||||||
| ("6[E1]" : "шестьдесят")
|
|
||||||
| ("6[E1]" : "шестьюдесятью")
|
|
||||||
| ("6[E2]" : "шестисот")
|
|
||||||
| ("6[E2]" : "шестисотая@")
|
|
||||||
| ("6[E2]" : "шестисотого@")
|
|
||||||
| ("6[E2]" : "шестисотое@")
|
|
||||||
| ("6[E2]" : "шестисотой@")
|
|
||||||
| ("6[E2]" : "шестисотом@")
|
|
||||||
| ("6[E2]" : "шестисотому@")
|
|
||||||
| ("6[E2]" : "шестисотую@")
|
|
||||||
| ("6[E2]" : "шестисотые@")
|
|
||||||
| ("6[E2]" : "шестисотый@")
|
|
||||||
| ("6[E2]" : "шестисотым@")
|
|
||||||
| ("6[E2]" : "шестисотыми@")
|
|
||||||
| ("6[E2]" : "шестисотых@")
|
|
||||||
| ("6[E2]" : "шестистам")
|
|
||||||
| ("6[E2]" : "шестистах")
|
|
||||||
| ("6[E2]" : "шестьсот")
|
|
||||||
| ("6[E2]" : "шестьюстами")
|
|
||||||
| ("7[E1]" : "семидесятая@")
|
|
||||||
| ("7[E1]" : "семидесяти")
|
|
||||||
| ("7[E1]" : "семидесятого@")
|
|
||||||
| ("7[E1]" : "семидесятое@")
|
|
||||||
| ("7[E1]" : "семидесятой@")
|
|
||||||
| ("7[E1]" : "семидесятом@")
|
|
||||||
| ("7[E1]" : "семидесятому@")
|
|
||||||
| ("7[E1]" : "семидесятую@")
|
|
||||||
| ("7[E1]" : "семидесятые@")
|
|
||||||
| ("7[E1]" : "семидесятый@")
|
|
||||||
| ("7[E1]" : "семидесятым@")
|
|
||||||
| ("7[E1]" : "семидесятыми@")
|
|
||||||
| ("7[E1]" : "семидесятых@")
|
|
||||||
| ("7[E1]" : "семьдесят")
|
|
||||||
| ("7[E1]" : "семьюдесятью")
|
|
||||||
| ("7[E2]" : "семисот")
|
|
||||||
| ("7[E2]" : "семисотая@")
|
|
||||||
| ("7[E2]" : "семисотого@")
|
|
||||||
| ("7[E2]" : "семисотое@")
|
|
||||||
| ("7[E2]" : "семисотой@")
|
|
||||||
| ("7[E2]" : "семисотом@")
|
|
||||||
| ("7[E2]" : "семисотому@")
|
|
||||||
| ("7[E2]" : "семисотую@")
|
|
||||||
| ("7[E2]" : "семисотые@")
|
|
||||||
| ("7[E2]" : "семисотый@")
|
|
||||||
| ("7[E2]" : "семисотым@")
|
|
||||||
| ("7[E2]" : "семисотыми@")
|
|
||||||
| ("7[E2]" : "семисотых@")
|
|
||||||
| ("7[E2]" : "семистам")
|
|
||||||
| ("7[E2]" : "семистах")
|
|
||||||
| ("7[E2]" : "семьсот")
|
|
||||||
| ("7[E2]" : "семьюстами")
|
|
||||||
| ("8[E1]" : "восемьдесят")
|
|
||||||
| ("8[E1]" : "восьмидесятая@")
|
|
||||||
| ("8[E1]" : "восьмидесяти")
|
|
||||||
| ("8[E1]" : "восьмидесятого@")
|
|
||||||
| ("8[E1]" : "восьмидесятое@")
|
|
||||||
| ("8[E1]" : "восьмидесятой@")
|
|
||||||
| ("8[E1]" : "восьмидесятом@")
|
|
||||||
| ("8[E1]" : "восьмидесятому@")
|
|
||||||
| ("8[E1]" : "восьмидесятую@")
|
|
||||||
| ("8[E1]" : "восьмидесятые@")
|
|
||||||
| ("8[E1]" : "восьмидесятый@")
|
|
||||||
| ("8[E1]" : "восьмидесятым@")
|
|
||||||
| ("8[E1]" : "восьмидесятыми@")
|
|
||||||
| ("8[E1]" : "восьмидесятых@")
|
|
||||||
| ("8[E1]" : "восьмьюдесятью")
|
|
||||||
| ("8[E2]" : "восемьсот")
|
|
||||||
| ("8[E2]" : "восемьюстами")
|
|
||||||
| ("8[E2]" : "восьмисот")
|
|
||||||
| ("8[E2]" : "восьмисотая@")
|
|
||||||
| ("8[E2]" : "восьмисотого@")
|
|
||||||
| ("8[E2]" : "восьмисотое@")
|
|
||||||
| ("8[E2]" : "восьмисотой@")
|
|
||||||
| ("8[E2]" : "восьмисотом@")
|
|
||||||
| ("8[E2]" : "восьмисотому@")
|
|
||||||
| ("8[E2]" : "восьмисотую@")
|
|
||||||
| ("8[E2]" : "восьмисотые@")
|
|
||||||
| ("8[E2]" : "восьмисотый@")
|
|
||||||
| ("8[E2]" : "восьмисотым@")
|
|
||||||
| ("8[E2]" : "восьмисотыми@")
|
|
||||||
| ("8[E2]" : "восьмисотых@")
|
|
||||||
| ("8[E2]" : "восьмистам")
|
|
||||||
| ("8[E2]" : "восьмистах")
|
|
||||||
| ("8[E2]" : "восьмьюстами")
|
|
||||||
| ("9[E1]" : "девяноста")
|
|
||||||
| ("9[E1]" : "девяностая@")
|
|
||||||
| ("9[E1]" : "девяносто")
|
|
||||||
| ("9[E1]" : "девяностого@")
|
|
||||||
| ("9[E1]" : "девяностое@")
|
|
||||||
| ("9[E1]" : "девяностой@")
|
|
||||||
| ("9[E1]" : "девяностом@")
|
|
||||||
| ("9[E1]" : "девяностому@")
|
|
||||||
| ("9[E1]" : "девяностую@")
|
|
||||||
| ("9[E1]" : "девяностые@")
|
|
||||||
| ("9[E1]" : "девяностый@")
|
|
||||||
| ("9[E1]" : "девяностым@")
|
|
||||||
| ("9[E1]" : "девяностыми@")
|
|
||||||
| ("9[E1]" : "девяностых@")
|
|
||||||
| ("9[E2]" : "девятисот")
|
|
||||||
| ("9[E2]" : "девятисотая@")
|
|
||||||
| ("9[E2]" : "девятисотого@")
|
|
||||||
| ("9[E2]" : "девятисотое@")
|
|
||||||
| ("9[E2]" : "девятисотой@")
|
|
||||||
| ("9[E2]" : "девятисотом@")
|
|
||||||
| ("9[E2]" : "девятисотому@")
|
|
||||||
| ("9[E2]" : "девятисотую@")
|
|
||||||
| ("9[E2]" : "девятисотые@")
|
|
||||||
| ("9[E2]" : "девятисотый@")
|
|
||||||
| ("9[E2]" : "девятисотым@")
|
|
||||||
| ("9[E2]" : "девятисотыми@")
|
|
||||||
| ("9[E2]" : "девятисотых@")
|
|
||||||
| ("9[E2]" : "девятистам")
|
|
||||||
| ("9[E2]" : "девятистах")
|
|
||||||
| ("9[E2]" : "девятьсот")
|
|
||||||
| ("9[E2]" : "девятьюстами")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
lexset1 = Optimize[
|
|
||||||
("+" : "")
|
|
||||||
| ("1" : "один")
|
|
||||||
| ("1" : "одна")
|
|
||||||
| ("1" : "одни")
|
|
||||||
| ("1" : "одним")
|
|
||||||
| ("1" : "одними")
|
|
||||||
| ("1" : "одних")
|
|
||||||
| ("1" : "одно")
|
|
||||||
| ("1" : "одного")
|
|
||||||
| ("1" : "одной")
|
|
||||||
| ("1" : "одном")
|
|
||||||
| ("1" : "одному")
|
|
||||||
| ("1" : "одною")
|
|
||||||
| ("1" : "одну")
|
|
||||||
| ("1" : "первая@")
|
|
||||||
| ("1" : "первого@")
|
|
||||||
| ("1" : "первое@")
|
|
||||||
| ("1" : "первой@")
|
|
||||||
| ("1" : "первом@")
|
|
||||||
| ("1" : "первому@")
|
|
||||||
| ("1" : "первую@")
|
|
||||||
| ("1" : "первые@")
|
|
||||||
| ("1" : "первый@")
|
|
||||||
| ("1" : "первым@")
|
|
||||||
| ("1" : "первыми@")
|
|
||||||
| ("1" : "первых@")
|
|
||||||
| ("2" : "вторая@")
|
|
||||||
| ("2" : "второго@")
|
|
||||||
| ("2" : "второе@")
|
|
||||||
| ("2" : "второй@")
|
|
||||||
| ("2" : "втором@")
|
|
||||||
| ("2" : "второму@")
|
|
||||||
| ("2" : "вторую@")
|
|
||||||
| ("2" : "вторые@")
|
|
||||||
| ("2" : "вторым@")
|
|
||||||
| ("2" : "вторыми@")
|
|
||||||
| ("2" : "вторых@")
|
|
||||||
| ("2" : "два")
|
|
||||||
| ("2" : "две")
|
|
||||||
| ("2" : "двум")
|
|
||||||
| ("2" : "двумя")
|
|
||||||
| ("2" : "двух")
|
|
||||||
| ("3" : "трем")
|
|
||||||
| ("3" : "тремя")
|
|
||||||
| ("3" : "третий@")
|
|
||||||
| ("3" : "третье@")
|
|
||||||
| ("3" : "третьего@")
|
|
||||||
| ("3" : "третьей@")
|
|
||||||
| ("3" : "третьем@")
|
|
||||||
| ("3" : "третьему@")
|
|
||||||
| ("3" : "третьи@")
|
|
||||||
| ("3" : "третьим@")
|
|
||||||
| ("3" : "третьими@")
|
|
||||||
| ("3" : "третьих@")
|
|
||||||
| ("3" : "третью@")
|
|
||||||
| ("3" : "третья@")
|
|
||||||
| ("3" : "трех")
|
|
||||||
| ("3" : "три")
|
|
||||||
| ("4" : "четвертая@")
|
|
||||||
| ("4" : "четвертого@")
|
|
||||||
| ("4" : "четвертое@")
|
|
||||||
| ("4" : "четвертой@")
|
|
||||||
| ("4" : "четвертом@")
|
|
||||||
| ("4" : "четвертому@")
|
|
||||||
| ("4" : "четвертую@")
|
|
||||||
| ("4" : "четвертые@")
|
|
||||||
| ("4" : "четвертый@")
|
|
||||||
| ("4" : "четвертым@")
|
|
||||||
| ("4" : "четвертыми@")
|
|
||||||
| ("4" : "четвертых@")
|
|
||||||
| ("4" : "четыре")
|
|
||||||
| ("4" : "четырем")
|
|
||||||
| ("4" : "четырех")
|
|
||||||
| ("4" : "четырьмя")
|
|
||||||
| ("5" : "пятая@")
|
|
||||||
| ("5" : "пяти")
|
|
||||||
| ("5" : "пятого@")
|
|
||||||
| ("5" : "пятое@")
|
|
||||||
| ("5" : "пятой@")
|
|
||||||
| ("5" : "пятом@")
|
|
||||||
| ("5" : "пятому@")
|
|
||||||
| ("5" : "пятую@")
|
|
||||||
| ("5" : "пятые@")
|
|
||||||
| ("5" : "пятый@")
|
|
||||||
| ("5" : "пятым@")
|
|
||||||
| ("5" : "пятыми@")
|
|
||||||
| ("5" : "пятых@")
|
|
||||||
| ("5" : "пять")
|
|
||||||
| ("5" : "пятью")
|
|
||||||
| ("6" : "шестая@")
|
|
||||||
| ("6" : "шести")
|
|
||||||
| ("6" : "шестого@")
|
|
||||||
| ("6" : "шестое@")
|
|
||||||
| ("6" : "шестой@")
|
|
||||||
| ("6" : "шестом@")
|
|
||||||
| ("6" : "шестому@")
|
|
||||||
| ("6" : "шестую@")
|
|
||||||
| ("6" : "шестые@")
|
|
||||||
| ("6" : "шестым@")
|
|
||||||
| ("6" : "шестыми@")
|
|
||||||
| ("6" : "шестых@")
|
|
||||||
| ("6" : "шесть")
|
|
||||||
| ("6" : "шестью")
|
|
||||||
| ("7" : "седьмая@")
|
|
||||||
| ("7" : "седьмого@")
|
|
||||||
| ("7" : "седьмое@")
|
|
||||||
| ("7" : "седьмой@")
|
|
||||||
| ("7" : "седьмом@")
|
|
||||||
| ("7" : "седьмому@")
|
|
||||||
| ("7" : "седьмую@")
|
|
||||||
| ("7" : "седьмые@")
|
|
||||||
| ("7" : "седьмым@")
|
|
||||||
| ("7" : "седьмыми@")
|
|
||||||
| ("7" : "седьмых@")
|
|
||||||
| ("7" : "семи")
|
|
||||||
| ("7" : "семь")
|
|
||||||
| ("7" : "семью")
|
|
||||||
| ("8" : "восемь")
|
|
||||||
| ("8" : "восьмая@")
|
|
||||||
| ("8" : "восьми")
|
|
||||||
| ("8" : "восьмого@")
|
|
||||||
| ("8" : "восьмое@")
|
|
||||||
| ("8" : "восьмой@")
|
|
||||||
| ("8" : "восьмом@")
|
|
||||||
| ("8" : "восьмому@")
|
|
||||||
| ("8" : "восьмую@")
|
|
||||||
| ("8" : "восьмые@")
|
|
||||||
| ("8" : "восьмым@")
|
|
||||||
| ("8" : "восьмыми@")
|
|
||||||
| ("8" : "восьмых@")
|
|
||||||
| ("8" : "восьмью")
|
|
||||||
| ("9" : "девятая@")
|
|
||||||
| ("9" : "девяти")
|
|
||||||
| ("9" : "девятого@")
|
|
||||||
| ("9" : "девятое@")
|
|
||||||
| ("9" : "девятой@")
|
|
||||||
| ("9" : "девятом@")
|
|
||||||
| ("9" : "девятому@")
|
|
||||||
| ("9" : "девятую@")
|
|
||||||
| ("9" : "девятые@")
|
|
||||||
| ("9" : "девятый@")
|
|
||||||
| ("9" : "девятым@")
|
|
||||||
| ("9" : "девятыми@")
|
|
||||||
| ("9" : "девятых@")
|
|
||||||
| ("9" : "девять")
|
|
||||||
| ("9" : "девятью")
|
|
||||||
| ("[E3]" : "тысяч")
|
|
||||||
| ("[E3]" : "тысяча")
|
|
||||||
| ("[E3]" : "тысячам")
|
|
||||||
| ("[E3]" : "тысячами")
|
|
||||||
| ("[E3]" : "тысячах")
|
|
||||||
| ("[E3]" : "тысяче")
|
|
||||||
| ("[E3]" : "тысячей")
|
|
||||||
| ("[E3]" : "тысячи")
|
|
||||||
| ("[E3]" : "тысячу")
|
|
||||||
| ("[E3]" : "тысячью")
|
|
||||||
| ("[E6]" : "миллион")
|
|
||||||
| ("[E6]" : "миллиона")
|
|
||||||
| ("[E6]" : "миллионам")
|
|
||||||
| ("[E6]" : "миллионами")
|
|
||||||
| ("[E6]" : "миллионах")
|
|
||||||
| ("[E6]" : "миллионе")
|
|
||||||
| ("[E6]" : "миллионов")
|
|
||||||
| ("[E6]" : "миллионом")
|
|
||||||
| ("[E6]" : "миллиону")
|
|
||||||
| ("[E6]" : "миллионы")
|
|
||||||
| ("[E9]" : "миллиард")
|
|
||||||
| ("[E9]" : "миллиарда")
|
|
||||||
| ("[E9]" : "миллиардам")
|
|
||||||
| ("[E9]" : "миллиардами")
|
|
||||||
| ("[E9]" : "миллиардах")
|
|
||||||
| ("[E9]" : "миллиарде")
|
|
||||||
| ("[E9]" : "миллиардов")
|
|
||||||
| ("[E9]" : "миллиардом")
|
|
||||||
| ("[E9]" : "миллиарду")
|
|
||||||
| ("[E9]" : "миллиарды")
|
|
||||||
| ("|0|" : "ноле")
|
|
||||||
| ("|0|" : "нолем")
|
|
||||||
| ("|0|" : "ноль")
|
|
||||||
| ("|0|" : "нолю")
|
|
||||||
| ("|0|" : "ноля")
|
|
||||||
| ("|0|" : "нуле")
|
|
||||||
| ("|0|" : "нулем")
|
|
||||||
| ("|0|" : "нуль")
|
|
||||||
| ("|0|" : "нулю")
|
|
||||||
| ("|0|" : "нуля")]
|
|
||||||
;
|
|
||||||
|
|
||||||
lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
export LEX = Optimize[lex3 @ lex2 @ lex1];
|
|
||||||
|
|
||||||
export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";
|
|
||||||
|
|
||||||
# END LANGUAGE SPECIFIC DATA
|
|
||||||
################################################################################
|
|
||||||
# Inserts a marker after the Ms.
|
|
||||||
export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes all powers and "+".
|
|
||||||
export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes trailing zeros at the beginning of a number, so that "0003" does not
|
|
||||||
# get treated as an ordinary number.
|
|
||||||
export DELETE_INITIAL_ZEROS =
|
|
||||||
CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
|
|
||||||
;
|
|
||||||
|
|
||||||
NonMs = Optimize[POWERS - Ms];
|
|
||||||
|
|
||||||
# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be
|
|
||||||
# deleted
|
|
||||||
export DELETE_INTERMEDIATE_ZEROS1 =
|
|
||||||
CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Deletes (usually) zeros before an M, if there is no non-zero element between
|
|
||||||
# that and the previous boundary. Thus, if after the result of the rule above we
|
|
||||||
# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
|
|
||||||
# zero.
|
|
||||||
export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
|
|
||||||
CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Final clean up of stray zeros.
|
|
||||||
export DELETE_REMAINING_ZEROS = Optimize[
|
|
||||||
CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Applies the revaluation map. For example in English, change [E4] to [E1] as a
|
|
||||||
# modifier of [E3]
|
|
||||||
export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
# Deletes the various marks and powers in the input and output.
|
|
||||||
export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
|
|
||||||
|
|
||||||
export CLEAN_SPACES = Optimize[
|
|
||||||
CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]
|
|
||||||
@ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]
|
|
||||||
@ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
|
|
||||||
d = b.kDigit;
|
|
||||||
|
|
||||||
# Germanic inversion rule.
|
|
||||||
germanic =
|
|
||||||
(I["1+"] d "[E1]" D["+1"])
|
|
||||||
| (I["2+"] d "[E1]" D["+2"])
|
|
||||||
| (I["3+"] d "[E1]" D["+3"])
|
|
||||||
| (I["4+"] d "[E1]" D["+4"])
|
|
||||||
| (I["5+"] d "[E1]" D["+5"])
|
|
||||||
| (I["6+"] d "[E1]" D["+6"])
|
|
||||||
| (I["7+"] d "[E1]" D["+7"])
|
|
||||||
| (I["8+"] d "[E1]" D["+8"])
|
|
||||||
| (I["9+"] d "[E1]" D["+9"])
|
|
||||||
;
|
|
||||||
|
|
||||||
germanic_inversion =
|
|
||||||
CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
|
|
||||||
;
|
|
||||||
|
|
||||||
export GERMANIC_INVERSION = SIGMA_STAR;
|
|
||||||
export ORDINAL_RESTRICTION =
|
|
||||||
Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]]
|
|
||||||
;
|
|
||||||
nondigits = b.kBytes - b.kDigit;
|
|
||||||
export ORDINAL_SUFFIX = D[nondigits*];
|
|
|
@ -1,77 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# This verbalizer is used whenever there is an LM symbol that consists of
|
|
||||||
# letters immediately followed by "{spelled}". This strips the "{spelled}"
|
|
||||||
# suffix.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'ru/classifier/cyrillic.grm' as c;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
digit = b.kDigit @ n.CARDINAL_NUMBERS;
|
|
||||||
|
|
||||||
char_set = (("a" | "A") : "letter-a")
|
|
||||||
| (("b" | "B") : "letter-b")
|
|
||||||
| (("c" | "C") : "letter-c")
|
|
||||||
| (("d" | "D") : "letter-d")
|
|
||||||
| (("e" | "E") : "letter-e")
|
|
||||||
| (("f" | "F") : "letter-f")
|
|
||||||
| (("g" | "G") : "letter-g")
|
|
||||||
| (("h" | "H") : "letter-h")
|
|
||||||
| (("i" | "I") : "letter-i")
|
|
||||||
| (("j" | "J") : "letter-j")
|
|
||||||
| (("k" | "K") : "letter-k")
|
|
||||||
| (("l" | "L") : "letter-l")
|
|
||||||
| (("m" | "M") : "letter-m")
|
|
||||||
| (("n" | "N") : "letter-n")
|
|
||||||
| (("o" | "O") : "letter-o")
|
|
||||||
| (("p" | "P") : "letter-p")
|
|
||||||
| (("q" | "Q") : "letter-q")
|
|
||||||
| (("r" | "R") : "letter-r")
|
|
||||||
| (("s" | "S") : "letter-s")
|
|
||||||
| (("t" | "T") : "letter-t")
|
|
||||||
| (("u" | "U") : "letter-u")
|
|
||||||
| (("v" | "V") : "letter-v")
|
|
||||||
| (("w" | "W") : "letter-w")
|
|
||||||
| (("x" | "X") : "letter-x")
|
|
||||||
| (("y" | "Y") : "letter-y")
|
|
||||||
| (("z" | "Z") : "letter-z")
|
|
||||||
| (digit)
|
|
||||||
| ("&" : "@@AND@@")
|
|
||||||
| ("." : "")
|
|
||||||
| ("-" : "")
|
|
||||||
| ("_" : "")
|
|
||||||
| ("/" : "")
|
|
||||||
| (n.I["letter-"] c.kCyrillicAlpha)
|
|
||||||
;
|
|
||||||
|
|
||||||
ins_space = "" : " ";
|
|
||||||
|
|
||||||
suffix = "{spelled}" : "";
|
|
||||||
|
|
||||||
spelled = Optimize[char_set (ins_space char_set)* suffix];
|
|
||||||
|
|
||||||
export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
|
|
||||||
|
|
||||||
sigma_star = b.kBytes*;
|
|
||||||
|
|
||||||
# Gets rid of the letter- prefix since in some cases we don't want it.
|
|
||||||
|
|
||||||
del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
|
|
||||||
|
|
||||||
spelled_no_tag = Optimize[char_set (ins_space char_set)*];
|
|
||||||
|
|
||||||
export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
|
|
@ -1,24 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
|
|
||||||
punct =
|
|
||||||
("." : "@@PERIOD@@")
|
|
||||||
| ("," : "@@COMMA@@")
|
|
||||||
| ("!" : "@@EXCLAMATION_MARK@@")
|
|
||||||
| ("?" : "@@QUESTION_MARK@@")
|
|
||||||
;
|
|
||||||
|
|
||||||
export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
|
|
@ -1,108 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as b;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
|
|
||||||
# Only handles 24-hour time with quarter-to, half-past and quarter-past.
|
|
||||||
|
|
||||||
increment_hour =
|
|
||||||
("0" : "1")
|
|
||||||
| ("1" : "2")
|
|
||||||
| ("2" : "3")
|
|
||||||
| ("3" : "4")
|
|
||||||
| ("4" : "5")
|
|
||||||
| ("5" : "6")
|
|
||||||
| ("6" : "7")
|
|
||||||
| ("7" : "8")
|
|
||||||
| ("8" : "9")
|
|
||||||
| ("9" : "10")
|
|
||||||
| ("10" : "11")
|
|
||||||
| ("11" : "12")
|
|
||||||
| ("12" : "1") # If someone uses 12, we assume 12-hour by default.
|
|
||||||
| ("13" : "14")
|
|
||||||
| ("14" : "15")
|
|
||||||
| ("15" : "16")
|
|
||||||
| ("16" : "17")
|
|
||||||
| ("17" : "18")
|
|
||||||
| ("18" : "19")
|
|
||||||
| ("19" : "20")
|
|
||||||
| ("20" : "21")
|
|
||||||
| ("21" : "22")
|
|
||||||
| ("22" : "23")
|
|
||||||
| ("23" : "12")
|
|
||||||
;
|
|
||||||
|
|
||||||
hours = Project[increment_hour, 'input'];
|
|
||||||
|
|
||||||
d = b.kDigit;
|
|
||||||
D = d - "0";
|
|
||||||
|
|
||||||
minutes09 = "0" D;
|
|
||||||
|
|
||||||
minutes = ("1" | "2" | "3" | "4" | "5") d;
|
|
||||||
|
|
||||||
__sep__ = ":";
|
|
||||||
sep_space = __sep__ : " ";
|
|
||||||
|
|
||||||
verbalize_hours = hours @ n.CARDINAL_NUMBERS;
|
|
||||||
|
|
||||||
verbalize_minutes =
|
|
||||||
("00" : "@@HOUR@@")
|
|
||||||
| (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
|
|
||||||
| (minutes @ n.CARDINAL_NUMBERS)
|
|
||||||
;
|
|
||||||
|
|
||||||
time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
|
|
||||||
|
|
||||||
# Special cases we handle right now.
|
|
||||||
# TODO: Need to allow for cases like
|
|
||||||
#
|
|
||||||
# half twelve (in the UK English sense)
|
|
||||||
# half twaalf (in the Dutch sense)
|
|
||||||
|
|
||||||
time_quarter_past =
|
|
||||||
n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
|
|
||||||
verbalize_hours
|
|
||||||
n.D[__sep__ "15"];
|
|
||||||
|
|
||||||
time_half_past =
|
|
||||||
n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
|
|
||||||
verbalize_hours
|
|
||||||
n.D[__sep__ "30"];
|
|
||||||
|
|
||||||
time_quarter_to =
|
|
||||||
n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
|
|
||||||
(increment_hour @ verbalize_hours)
|
|
||||||
n.D[__sep__ "45"];
|
|
||||||
|
|
||||||
time_extra = Optimize[
|
|
||||||
time_quarter_past | time_half_past | time_quarter_to]
|
|
||||||
;
|
|
||||||
|
|
||||||
# Basic time periods which most languages can be expected to have.
|
|
||||||
__am__ = "a.m." | "am" | "AM" | "утра";
|
|
||||||
__pm__ = "p.m." | "pm" | "PM" | "вечера";
|
|
||||||
|
|
||||||
period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
|
|
||||||
|
|
||||||
time_variants = time_basic | time_extra;
|
|
||||||
|
|
||||||
time = Optimize[
|
|
||||||
(period (" " | n.I[" "]))? time_variants
|
|
||||||
| time_variants ((" " | n.I[" "]) period)?]
|
|
||||||
;
|
|
||||||
|
|
||||||
export TIME = Optimize[time @ l.LEXICAL_MAP];
|
|
@ -1,68 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Rules for URLs and email addresses.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as bytelib;
|
|
||||||
import 'ru/verbalizer/lexical_map.grm' as l;
|
|
||||||
|
|
||||||
ins_space = "" : " ";
|
|
||||||
dot = "." : "@@URL_DOT_EXPRESSION@@";
|
|
||||||
at = "@" : "@@AT@@";
|
|
||||||
|
|
||||||
url_suffix =
|
|
||||||
(".com" : dot ins_space "com") |
|
|
||||||
(".gov" : dot ins_space "gov") |
|
|
||||||
(".edu" : dot ins_space "e d u") |
|
|
||||||
(".org" : dot ins_space "org") |
|
|
||||||
(".net" : dot ins_space "net")
|
|
||||||
;
|
|
||||||
|
|
||||||
letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
|
|
||||||
|
|
||||||
letter_string_dot =
|
|
||||||
((letter_string ins_space dot ins_space)* letter_string)
|
|
||||||
;
|
|
||||||
|
|
||||||
# Rules for URLs.
|
|
||||||
export URL = Optimize[
|
|
||||||
((letter_string_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
# Rules for email addresses.
|
|
||||||
letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
|
|
||||||
|
|
||||||
letter_by_letter_dot =
|
|
||||||
((letter_by_letter ins_space dot ins_space)*
|
|
||||||
letter_by_letter)
|
|
||||||
;
|
|
||||||
|
|
||||||
export EMAIL1 = Optimize[
|
|
||||||
((letter_by_letter) (ins_space)
|
|
||||||
(at) (ins_space)
|
|
||||||
(letter_by_letter_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
export EMAIL2 = Optimize[
|
|
||||||
((letter_by_letter) (ins_space)
|
|
||||||
(at) (ins_space)
|
|
||||||
(letter_string_dot) (ins_space)
|
|
||||||
(url_suffix)) @ l.LEXICAL_MAP
|
|
||||||
];
|
|
||||||
|
|
||||||
export EMAILS = Optimize[
|
|
||||||
EMAIL1 | EMAIL2
|
|
||||||
];
|
|
@ -1,42 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import 'util/util.grm' as util;
|
|
||||||
import 'ru/verbalizer/extra_numbers.grm' as e;
|
|
||||||
import 'ru/verbalizer/float.grm' as f;
|
|
||||||
import 'ru/verbalizer/math.grm' as ma;
|
|
||||||
import 'ru/verbalizer/miscellaneous.grm' as mi;
|
|
||||||
import 'ru/verbalizer/money.grm' as mo;
|
|
||||||
import 'ru/verbalizer/numbers.grm' as n;
|
|
||||||
import 'ru/verbalizer/numbers_plus.grm' as np;
|
|
||||||
import 'ru/verbalizer/spelled.grm' as s;
|
|
||||||
import 'ru/verbalizer/spoken_punct.grm' as sp;
|
|
||||||
import 'ru/verbalizer/time.grm' as t;
|
|
||||||
import 'ru/verbalizer/urls.grm' as u;
|
|
||||||
|
|
||||||
export VERBALIZER = Optimize[RmWeight[
|
|
||||||
( e.MIXED_NUMBERS
|
|
||||||
| e.DIGITS
|
|
||||||
| f.FLOAT
|
|
||||||
| ma.ARITHMETIC
|
|
||||||
| mi.MISCELLANEOUS
|
|
||||||
| mo.MONEY
|
|
||||||
| n.CARDINAL_NUMBERS
|
|
||||||
| n.ORDINAL_NUMBERS
|
|
||||||
| np.NUMBERS_PLUS
|
|
||||||
| s.SPELLED
|
|
||||||
| sp.SPOKEN_PUNCT
|
|
||||||
| t.TIME
|
|
||||||
| u.URL) @ util.CLEAN_SPACES
|
|
||||||
]];
|
|
@ -1,3 +0,0 @@
|
|||||||
# Language-universal grammar definitions
|
|
||||||
|
|
||||||
This directory contains various language-universal grammar definitions.
|
|
|
@ -1,126 +0,0 @@
|
|||||||
# Copyright 2017 Google Inc.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
# Specifies common ways of delimiting thousands in digit strings.
|
|
||||||
|
|
||||||
import 'util/byte.grm' as bytelib;
|
|
||||||
import 'util/util.grm' as util;
|
|
||||||
|
|
||||||
killcomma = "," : "";
|
|
||||||
dot2comma = "." : ",";
|
|
||||||
spaces2comma = " "+ : ",";
|
|
||||||
|
|
||||||
zero = "0";
|
|
||||||
|
|
||||||
# no_delimiter = zero | "[1-9][0-9]*";
|
|
||||||
export no_delimiter = zero | (util.d1to9 bytelib.kDigit*);
|
|
||||||
|
|
||||||
# delim_map_dot = ("[0-9]" | ("\." : ","))*;
|
|
||||||
delim_map_dot = (bytelib.kDigit | dot2comma)*;
|
|
||||||
|
|
||||||
# delim_map_space = ("[0-9]" | (" +" : ","))*;
|
|
||||||
delim_map_space = (bytelib.kDigit | spaces2comma)*;
|
|
||||||
|
|
||||||
## Western systems group thousands. Korean goes this way too.
|
|
||||||
|
|
||||||
# comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*);
|
|
||||||
export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*);
|
|
||||||
|
|
||||||
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
|
||||||
# cannot match on input labels (sort?).
|
|
||||||
export dot_thousands = delim_map_dot @ comma_thousands;
|
|
||||||
|
|
||||||
# ComposeFst: 1st argument cannot match on output labels and 2nd argument
|
|
||||||
# cannot match on input labels (sort?).
|
|
||||||
export space_thousands = delim_map_space @ comma_thousands;
|
|
||||||
|
|
||||||
## Chinese prefers grouping by fours (by ten-thousands).
|
|
||||||
|
|
||||||
# chinese_comma =
|
|
||||||
# zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*);
|
|
||||||
export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*);
|
|
||||||
|
|
||||||
## The Indian system is more complex because of the Stravinskian alternation
|
|
||||||
## between lakhs and crores.
|
|
||||||
##
|
|
||||||
## According to Wikipedia:
|
|
||||||
##
|
|
||||||
## Indian English Value
|
|
||||||
## One 1
|
|
||||||
## Ten 10
|
|
||||||
## Hundred 100
|
|
||||||
## Thousand 1,000
|
|
||||||
## Lakh 1,00,000
|
|
||||||
## Crore 1,00,00,000
|
|
||||||
## Arab 1,00,00,00,000
|
|
||||||
## Kharab 1,00,00,00,00,000
|
|
||||||
|
|
||||||
# indian_hundreds = "[1-9][0-9]?[0-9]?";
|
|
||||||
indian_hundreds = util.d1to9 bytelib.kDigit{0,2};
|
|
||||||
|
|
||||||
## Up to 99,999.
|
|
||||||
|
|
||||||
# indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]";
|
|
||||||
indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3};
|
|
||||||
|
|
||||||
## Up to 99,99,999.
|
|
||||||
|
|
||||||
# indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]";
|
|
||||||
indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3};
|
|
||||||
|
|
||||||
## Up to 999,99,99,999
|
|
||||||
|
|
||||||
indian_comma_crores =
|
|
||||||
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
|
|
||||||
(bytelib.kDigit{2} killcomma)?
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{3}
|
|
||||||
;
|
|
||||||
|
|
||||||
## Up to 99,999,99,99,999.
|
|
||||||
|
|
||||||
indian_comma_thousand_crores =
|
|
||||||
util.d1to9 bytelib.kDigit? killcomma
|
|
||||||
bytelib.kDigit{3} killcomma
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{3}
|
|
||||||
;
|
|
||||||
|
|
||||||
## Up to 999,99,999,99,99,999.
|
|
||||||
|
|
||||||
indian_comma_lakh_crores =
|
|
||||||
util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{3} killcomma
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{2} killcomma
|
|
||||||
bytelib.kDigit{3}
|
|
||||||
;
|
|
||||||
|
|
||||||
export indian_comma =
|
|
||||||
zero
|
|
||||||
| indian_hundreds
|
|
||||||
| indian_comma_thousands
|
|
||||||
| indian_comma_lakhs
|
|
||||||
| indian_comma_crores
|
|
||||||
| indian_comma_thousand_crores
|
|
||||||
| indian_comma_lakh_crores
|
|
||||||
;
|
|
||||||
|
|
||||||
# Indian number system with dots.
|
|
||||||
export indian_dot_number = delim_map_dot @ indian_comma;
|
|
||||||
|
|
||||||
# Indian number system with spaces.
|
|
||||||
export indian_space_number = delim_map_space @ indian_comma;
|
|
@ -1,3 +0,0 @@
|
|||||||
# Utility grammar definitions
|
|
||||||
|
|
||||||
This directory contains various utility grammar definitions.
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue