diff --git a/.gitignore b/.gitignore index b81c5d37d..953cc28d2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .DS_Store *.pyc -tools/venv .vscode *.log *.pdmodel @@ -10,3 +9,6 @@ tools/venv *.tar.gz .ipynb_checkpoints *.npz + +tools/venv +tools/kenlm diff --git a/README.md b/README.md index e7019a897..a2d2f9a56 100644 --- a/README.md +++ b/README.md @@ -52,4 +52,4 @@ DeepSpeech is provided under the [Apache-2.0 License](./LICENSE). ## Acknowledgement -We depends on many open source repos. See [References](doc/src/reference.md) for more information. \ No newline at end of file +We depends on many open source repos. See [References](doc/src/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md index b9ad78908..3c1111b5e 100644 --- a/README_cn.md +++ b/README_cn.md @@ -50,4 +50,4 @@ DeepSpeech遵循[Apache-2.0开源协议](./LICENSE)。 ## 感谢 -开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。 \ No newline at end of file +开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。 diff --git a/examples/ngram_lm/.gitignore b/examples/ngram_lm/.gitignore new file mode 100644 index 000000000..fd7c578e1 --- /dev/null +++ b/examples/ngram_lm/.gitignore @@ -0,0 +1 @@ +exp/ diff --git a/examples/ngram_lm/data/README.md b/examples/ngram_lm/data/README.md new file mode 100644 index 000000000..514f17b60 --- /dev/null +++ b/examples/ngram_lm/data/README.md @@ -0,0 +1,2 @@ +text_correct.txt: https://github.com/shibing624/pycorrector/raw/master/tests/test_file.txt +custom_confusion.txt: https://github.com/shibing624/pycorrector/raw/master/tests/custom_confusion.txt diff --git a/examples/ngram_lm/data/custom_confusion.txt b/examples/ngram_lm/data/custom_confusion.txt new file mode 100644 index 000000000..aecabf6aa --- /dev/null +++ b/examples/ngram_lm/data/custom_confusion.txt @@ -0,0 +1,1371 @@ +#变体 本体 本体词词频(可省略) +兴高彩列 兴高采烈 100 +吹唐人 吹糖人 100 +百年家具 百年家居 +泄药 泻药 +称做 称作 100 +化学成份 化学成分 100 +天地无垠 天地无限 100 +欲妄 欲望 +满头大汉 满头大汗 +一阙 一阕 300 +斗音 抖音 100 +人材 人才 100 +微亅信 微信 100 
+微·信 微信 1000 +一毛不陪 一毛不赔 200 +无线大 无限大 1 +正加 增加 +相对得 相对地 +越来越底 越来越低 +个纾 个数 +曾加 增加 +情怳 情况 +持续的 持续地 +不断的 不断地 +成现 呈现 +家重 加重 +中国子 中国字 +计录 记录 +一落千仗 一落千丈 +婴而 婴儿 +负贵 富贵 +旁遍 旁边 +续承 继承 +约来越晚 越来越晚 +提共 提供 +行成 形成 +读哩 独立 +忙绿 忙碌 +年经人 年轻人 +智识 知识 +一但 一旦 +指摘 指责 +懹人民自己选择 让人民自己选择 +紧裤 辛苦 +放心地生活 放心的生活 +但靠一份收入 单靠一份收入 +多付出地几十倍 多付出的几十倍 +化钱 花钱 +依些 一些 +耐心的教育 耐心地教育 +多某些人来说 对某些人来说 +列子 例子 +普便 普遍 +单纯的说 单纯地说 +相对得坏处 相对的坏处 +改很多幼稚园 盖很多幼稚园 +不断的增加 不断地增加 +谈讨 探讨 +小还 小孩 +未普边 未普遍 +考良 考量 +过的不错 过得不错 +手医学治疗 受医学治疗 +数为至少四个人 数位至少四个人 +小子化 少子化 +间直 简直 +每办法 没办法 +青跨 情况 +最总 最终 +僱佣 僱用 +渐渐的形成 渐渐地形成 +渐渐的增加 渐渐地增加 +不断的改变 不断地改变 +不断的提高 不断地提高 +莫些年 某些年 +成工 成功 +线然 显然 +一长争段 一长争短 +在加上 再加上 +一后 以后 +自起 自己 +年轻得夫妇 年轻的夫妇 +真的事对 真的是对 +养小还 养小孩 +不彷 不妨 +疏缓 舒缓 +上叙 上述 +或着市 或者是 +学习地意愿 学习的意愿 +按部就班 按步就班 +全面的规划 全面地规划 +纾缓 舒缓 +年少气胜 年少气盛 +新家坡 新加坡 +变的很轻松 变得很轻松 +我门 我们 +月来越好 越来越好 +发生国 发生过 +以开发国家 已开发国家 +以开发 已开发 +赛翁失马 塞翁失马 +非常的宝贵 非常地宝贵 +列如 例如 +非常的珍贵 非常地珍贵 +二零五龄年 二零五零年 +証明 证明 +一间一间的关必 一间一间地关闭 +缠能收割 才能收割 +朗朗称道 琅琅称道 +缠能拯救 才能拯救 +付其责 负其责 +还而 孩儿 +自然而然的 自然而然地 +现再 现在 +一间一间的在建造 一间一间地在建造 +惰胎 堕胎 +给与 给予 +少字化 少子化 +狂大 广大 +至到 直到 +作些政策 做些政策 +想望 向往 +低免 抵免 +展生 产生 +源料 原料 +好好的准备 好好地准备 +庄况 状况 +因发 引发 +自记 自己 +桃论 讨论 +负单 负担 +而以 而已 +政符 政府 +邀情 邀请 +郑加 增加 +全心全力的 全心全力地 +进一份力 尽一份力 +不断的努力 不断地努力 +厉害关系 利害关系 +相像 现象 +题及 提及 +剥歇 剥削 +下起 下去 +了所学校 各所学校 +很好得一份工作 很好的一份工作 +来的这么快 来得这么快 +言前 眼前 +所担任的是有限 所担任的事有限 +放面 方面 +办发 办法 +有趣的说到 有趣地说到 +坚帧不移 坚贞不移 +心力绞碎 心力交瘁 +开发看经济 开发和经济 +有效的规划 有效地规划 +流守 留守 +生济 生计 +带欠缺 但欠缺 +堆动 推动 +表达的是 表达的事 +所以问题 所有问题 +由其固定的速度 有其固定的速度 +乐与减少 乐于减少 +变的困难 变得困难 +多馀 多于 +长远的看 长远地看 +血浓与水 血浓于水 +表面的看 表面地看 +推称出新 推陈出新 +脱离萍 脱离贫 +时间一天一天的过去 时间一天一天地过去 +提昌 提倡 +续多 许多 +现进 先进 +销费率 消费率 +直接的被感受 直接地被感受 +陪养 培养 +主在乡下 住在乡下 +弄多的很快 弄多得很快 +他门 他们 +持续的下降 持续地下降 +相对的减少 相对地减少 +相对的提高 相对地提高 +适当的生育 适当地生育 +成功的制定 成功地制定 +青少年门 青少年们 +慎重的希望 慎重地希望 +娇傲 骄傲 +亲爱的少年门 亲爱的少年们 +少子华 少子化 +生上 身上 +不断的鼓励 不断地鼓励 +具造 制造 +单初 当初 +发掌 发展 +大幅度的提升 大幅度地提升 +负担地因素 负担的因素 +亘多医疗费用 更多医疗费用 +餵了 为了 +照成 造成 +教育基精 教育基金 +装造性 创造性 +观察关 观察官 +怎么形成地 怎么形成的 +比年长者来的多 比年长者来得多 +不单 不但 +变的更良好 变得更良好 +变的更大 变得更大 +壤大众理解 让大众理解 +上免得分析 上面的分析 +已经再发生了 已经在发生了 +带来得危机 带来的危机 +恶讯循环 恶性循环 
+他得生活费 他的生活费 +以开发地区 已开发地区 +长其 长期 +经额 金额 +在过几十年 再过几十年 +作为探讨 做为探讨 +系根据 是根据 +位于 为于 +份公司 分公司 +要作塑胶袋 要做塑胶袋 +还不过 还不够 +逃战 挑战 +再服务业 在服务业 +招莫 招募 +坐者 坐着 +仔细的观察 仔细地观察 +三个凑皮匠 三个臭皮匠 +个中各样 各种各样 +不啻 不是 +即史 即使 +岂有此里 岂有此理 +组宠物的人 租宠物的人 +浅溥 浅薄 +处里 处理 +他们得不负责任的态度 他们的不负责任的态度 +出租得事 出租的事 +这样以来 这样一来 +题高 提高 +厌力 压力 +事情阿 事情啊 +又意思 有意思 +建慷 健康 +真恭喜妳阿 真恭喜妳啊 +抱见阿 抱歉啊 +幸苦 辛苦 +参家 参加 +回来台湾 会来台湾 +生体 身体 +总视 总是 +一把很大的化 一把很大的花 +出你 祝你 +高心 高兴 +应为 因为 +洗碗 希望 +那理 那里 +根她坐公共汽车 跟她坐公共汽车 +公克 功课 +根林美美去 跟林美美去 +一前 以前 +傍边 旁边 +总明 聪明 +最难得课 最难的课 +在马上去 再马上去 +建在 现在 +事等我女朋友的 是等我女朋友的 +客爱 可爱 +象片 相片 +也雪 也许 +录的 绿的 +纲来 刚来 +让后 然后 +真得是坏人 真的是坏人 +提以 提议 +大的家 他的家 +从是 总是 +有处 有趣 +课目 科目 +看得是美国电影 看的是美国电影 +兴奋得 兴奋地 +控的时间 空的时间 +内容懂不清楚 内容都不清楚 +谊起 一起 +打公车 搭公车 +近去 进去 +林美妹 林美美 +唷意思 有意思 +蔘加 参加 +再开一个好把 再开一个好吧 +兴起不好 心情不好 +庆祝地会 庆祝的会 +漂漂亮亮地风景 漂漂亮亮的风景 +再庆祝一次把 再庆祝一次吧 +找的时间 找个时间 +间妈妈 见妈妈 +无会 舞会 +解我的朋友 请我的朋友 +日字 日子 +但然 当然 +障爱文 张爱文 +已定 一定 +澳是 要是 +长爱文 张爱文 +理物 礼物 +延后 然后 +合问她 和问她 +恨高兴 很高兴 +一让 一样 +帮别的学生的芒 帮别的学生的忙 +沾着 站着 +怒朋友 女朋友 +下明 小明 +老师文学生 老师问学生 +他清她吃饭 他请她吃饭 +还没作 还没做 +座公车 坐公车 +打篹 打算 +沾起来 站起来 +再起床呢 在起床呢 +再黑板写的东西 在黑板写的东西 +不输服 不舒服 +很哩 很累 +警张 紧张 +劳天 聊天 +很来 很累 +多要 都要 +奴朋友 女朋友 +高兴的不得了 高兴得不得了 +里学校很近 离学校很近 +音龠 音乐 +合唱父系语法 和常复习语法 +税觉 睡觉 +觉得贰 觉得饿 +对不气 对不起 +情你去吃饭 请你去吃饭 +不局道 不知道 +你好马 你好吗 +岑么样 怎么样 +可以马 可以吗 +我想情你 我想请你 +火者站 火车站 +聊体 聊天 +最前的子 最前的字 +座捷运 坐捷运 +哪哩 哪里 +十只路口 十字路口 +不智道 不知道 +怎门办 怎么办 +点视机 电视机 +已后 以后 +妳会时后 妳回时候 +漂凉 漂亮 +票凉 漂亮 +又漂亮的衣服 有漂亮的衣服 +清卖给我 请买给我 +红虹 红红 +待你的国家 到你的国家 +太样兵 太阳饼 +不拘导 不知道 +不咀导 不知道 +这哩 这里 +成市 城市 +发山 爬山 +风镜 风景 +他杏林 他姓林 +痕累 很累 +方假 放假 +风京 风景 +座火车 坐火车 +他的华 他的话 +户然 忽然 +理面 里面 +做回去 坐回去 +出们 出门 +愿赖 原来 +吧手 把手 +厅路上 听路上 +定车 停车 +高现 高兴 +排照 拍照 +他的婉 他的腕 +原意 愿意 +没人琣他 没人陪他 +歹去 带去 +自即 自己 +开是 开始 +和一点饮料 喝一点饮料 +录行 旅行 +谈着谈者 谈着谈着 +漂亮的术 漂亮的树 +戴我 载我 +作天 昨天 +很日 很热 +照篇 照片 +我借你们 我接你们 +握着我的朋友 我找我的朋友 +台配种战 台北总站 +来我的家完 来我的家玩 +营为 因为 +她门 她们 +些日会 生日会 +右遍 右边 +开时 开始 +做八八的公车 坐八八的公车 +对们 对门 +学玩了 学完了 +票漂亮亮 漂漂亮亮 +喜暗 喜欢 +括大风 刮大风 +批具 啤酒 +骠了 漂亮 +瞅天 秋天 +难德 南德 +不太施 不太湿 +冬天件 冬天间 +愁天 秋天 +塞太阳 晒太阳 +跟美 很美 +单让 当然 +感在 改在 +有孔 有空 +吃葚么 吃什么 +西欢 喜欢 +泰泰 太太 +词反 吃饭 +其末考 期末考 +台湾采 台湾菜 +他们的采 他们的菜 +设么 什么 +共喜 恭喜 +日本蔡 
日本菜 +餐订 餐厅 +公课 功课 +大们口 大门口 +有没有控 有没有空 +刘信 留信 +六根礼拜日 六跟礼拜日 +永敢 勇敢 +辛亏 幸亏 +估停站 古亭站 +异子 椅子 +胡涂 糊涂 +真巧呕 真巧喔 +奴生 女生 +好朋有 好朋友 +莫斯汉堡 摩斯汉堡 +不可望 不可忘 +中文壳 中文课 +棒他 帮他 +怎嚜走 怎么走 +吃凉 吃惊 +冈来 刚来 +勇敢得 勇敢地 +风友 朋友 +走天 昨天 +湖涂 糊涂 +怎嚜 怎么 +真得 值得 +哪理 哪里 +票亮 漂亮 +又高又受 又高又瘦 +化了半个小时 花了半个小时 +杆快 赶快 +学习的很愉快 学习得很愉快 +怎吗 怎么 +十子路口 十字路口 +开方 开放 +息缓 喜欢 +交相机 照相机 +请办我 请帮我 +爬上 爬山 +录色 绿色 +各我妈妈 给我妈妈 +交马上停车 叫马上停车 +米了 迷了 +交到了 就到了 +号吗 号码 +很葬 很脏 +换给 还给 +清流 请留 +名子 名字 +把输给弄丢 把书给弄丢 +播爱 博爱 +大搂 大楼 +逃论室 讨论室 +坻里 地理 +奇给我 寄给我 +座飞机 坐飞机 +再机场 在机场 +便很帅 变很帅 +腾她 等她 +路行 旅行 +梅天 每天 +她九打电话 她就打电话 +美邦法 没办法 +不好以时 不好意思 +有疑点藤 有一点疼 +里拜四 礼拜四 +李佬时 李老师 +工课 功课 +身提 身体 +可使 可是 +靠事 考试 +重要得东西 重要的东西 +寮解 了解 +情你 请你 +而截 而且 +昭显机 照相机 +够猫 狗猫 +最有名的采 最有名的菜 +波澜 波兰 +西办加话 西班牙话 +东四 东西 +台弯 台湾 +学子 学字 +银董 运动 +而接 而且 +有控 有空 +陆行 旅行 +忍识 认识 +式式看 试试看 +拨兰 波兰 +一支狗 一只狗 +管哩 管理 +将学金 奖学金 +再波兰 在波兰 +下了可 下了课 +回越南得时候 回越南的时候 +台湾得风景 台湾的风景 +朋有 朋友 +气个月 七个月 +从德国莱 从德国来 +文花 文化 +吃部下 吃不下 +清假 请假 +烤试 考试 +血绿 血缘 +我票告诉您 我要告诉您 +候天 后天 +惜来看 借来看 +上果 上课 +吗吗 妈妈 +师什么公车 是什么公车 +岗来 刚来 +挑了 跳了 +座716号的公车 坐716号的公车 +三十分锺 三十分钟 +七点锺 七点钟 +公车佔 公车站 +邓的时候 等的时候 +依服 衣服 +睡过偷 睡过头 +佑对 有对 +乎很大的吸 呼很大的吸 +一整天水乐 一整天睡了 +等以等 等一等 +只有妳 只有你 +可况 何况 +学姣 学校 +跳无 跳舞 +恨情张 很紧张 +恨票亮 很漂亮 +俩个 两个 +悾怕 恐怕 +北老师骂他 怕老师骂他 +十五分种 十五分钟 +必较多 比较多 +睡过头把 睡过头吧 +我要座得巴士 我要坐的巴士 +来不起 来不及 +才陪吵醒 才被吵醒 +撘公车 搭公车 +眼靓 眼睛 +台我无聊 太过无聊 +大明哏他朋友 大明跟他朋友 +跳五 跳舞 +30份钟 30分钟 +座呢个公车 坐那个公车 +热恼 热闹 +手摽 手表 +苏嘉 暑假 +不好亿思 不好意思 +怎么杨 怎么样 +妳怎么了 你怎么了 +没有控 没有空 +根冒 感冒 +卧是李 我是李 +即然 既然 +台杯 台北 +里拜 礼拜 +希冠 希望 +开兴 开心 +烤师 考试 +热心的不得了 热心得不得了 +结昏 结婚 +边请卡 邀请卡 +注在 住在 +密鲁 秘鲁 +过的很开心 过得很开心 +寰麟 婚礼 +我回想妳们 我会想你们 +绍待卡 招待卡 +妳们 你们 +幸福快乐的过 幸福快乐地过 +公司排我 公司派我 +常不一样 穿不一样 +做二九七的公车 坐二九七的公车 +几希 继续 +今天得计画 今天的计画 +开心得 开心地 +钟就 终究 +共车 公车 +兔然 突然 +林雨 淋雨 +做公车 坐公车 +好好地经验 好好的经验 +涂然 突然 +讨润以下 讨论一下 +去万 去玩 +高兴得 高兴地 +考食物 烤食物 +在考 在烤 +美里 美丽 +交他的朋友 教他的朋友 +休葸 休息 +拟越来越漂亮 妳越来越漂亮 +对阿 对啊 +一扁吃 一边吃 +高高兴兴的 高高兴兴地 +修系 休息 +搜已 所以 +早藏 早上 +玩的横高兴 玩得很高兴 +做神马 做什么 +努生 女生 +高行 高兴 +一点鹅 一点饿 +点蔡 点菜 +交弮 交卷 +贵定 规定 +公作 工作 +炽爱 挚爱 +系望 希望 +你道德地旁 你到的地方 +除去完 出去玩 +里拜天 礼拜天 +暱友打算 你有打算 +新家波 新加坡 +考利以下 考虑一下 +一起去把 一起去吧 +寄怪 奇怪 +休息一点 休息一天 +我要会我的国家 我要回我的国家 +四班牙 西班牙 +完五月 
玩五月 +踏青我们 他请我们 +是天两夜 四天两夜 +忆起去 一起去 +好久没间 好久没见 +放便 方便 +玩着玩者 玩着玩着 +裁板 裁判 +清等我 请等我 +座一下 坐一下 +请座 请坐 +板球 棒球 +测天 这天 +馔衣服 穿衣服 +情坐 请坐 +经张 紧张 +辕动比赛 运动比赛 +埤酒 啤酒 +看般球 看棒球 +餲得不得了 渴得不得了 +絣干 饼干 +形像代言人 形象代言人 +化夏子孙 华夏子孙 +华为泡影 化为泡影 +秘密歪斜 秘密外泄 +计画 计划 +忠于等到了 终于等到了 +未日来临 末日来临 +眼晴 眼睛 +游刀有余 游刃有余 +唐僧帅徒 唐僧师徒 +太概 大概 +一揽子货币 一篮子货币 +举足无措 手足无措 +凭藉 凭借 +令人發指 令人髮指 +绅仕 绅士 +粘豆包 黏豆包 +磬竹难书 罄竹难书 +严惩不怠 严惩不贷 +戮力同心 勠力同心 +罚角球 发角球 +综合症 综合征 +单独二胎 单独二孩 +蛰人 蜇人 +泄秘 泄密 +伏法 服法 +羊羯子 羊蝎子 +泻湖 潟湖 +家俱 家具 +精萃 精粹 +兴亡周期率 兴亡周期律 +震憾 震撼 +中华人名共和国 中华人民共和国 +大人常委会 人大常委会 +中国共产常 中国共产党 +科学发展现 科学发展观 +扶贪 扶贫 +严谨公款吃喝 严禁公款吃喝 +按纳 按捺 +案语 按语 +百废具兴 百废俱兴 +百页窗 百叶窗 +班白 斑白 +颁白 斑白 +班驳 斑驳 +胞子 孢子 +保镳 保镖 +保母 保姆 +褓姆 保姆 +辈份 辈分 +本份 本分 +笔划 笔画 +必恭必敬 毕恭毕敬 +编者案 编者按 +萹豆 扁豆 +稨豆 扁豆 +藊豆 扁豆 +标识 标志 +鬓脚 鬓角 +禀承 秉承 +补靪 补丁 +补钉 补丁 +参预 参与 +惨澹 惨淡 +差迟 差池 +搀和 掺和 +搀假 掺假 +搀杂 掺杂 +刬除 铲除 +倘佯 徜徉 +车箱 车厢 +澈底 彻底 +沈思 沉思 +趁心 称心 +成份 成分 +澄彻 澄澈 +侈糜 侈靡 +筹画 筹划 +筹马 筹码 +踌蹰 踌躇 +出谋画策 出谋划策 +喘嘘嘘 喘吁吁 +磁器 瓷器 +赐与 赐予 +粗卤 粗鲁 +搭当 搭档 +搭挡 搭档 +搭赸 搭讪 +答讪 搭讪 +答覆 答复 +带孝 戴孝 +耽心 担心 +耽忧 担忧 +担搁 耽搁 +澹泊 淡泊 +澹然 淡然 +倒楣 倒霉 +低徊 低回 +雕敝 凋敝 +雕弊 凋敝 +雕零 凋零 +雕落 凋落 +雕谢 凋谢 +跌荡 跌宕 +跌交 跌跤 +蹀血 喋血 +丁宁 叮咛 +定单 订单 +定户 订户 +定婚 订婚 +定货 订货 +定阅 订阅 +枓拱 斗拱 +枓栱 斗拱 +逗遛 逗留 +斗趣儿 逗趣儿 +独脚戏 独角戏 +端五 端午 +二簧 二黄 +贰心 二心 +发人深醒 发人深省 +蕃衍 繁衍 +分付 吩咐 +份量 分量 +份内 分内 +份外 分外 +忿忿 愤愤 +丰富多采 丰富多彩 +疯瘫 风瘫 +疯颠 疯癫 +疯疯颠颠 疯疯癫癫 +锋铓 锋芒 +伏侍 服侍 +服事 服侍 +伏输 服输 +伏罪 服罪 +负嵎顽抗 负隅顽抗 +傅会 附会 +覆信 复信 +复辙 覆辙 +干与 干预 +告戒 告诫 +梗直 耿直 +鲠直 耿直 +恭惟 恭维 +勾划 勾画 +勾联 勾连 +孤苦零丁 孤苦伶仃 +孤负 辜负 +骨董 古董 +股分 股份 +骨瘦如豺 骨瘦如柴 +关连 关联 +光采 光彩 +归根结柢 归根结底 +规戒 规诫 +鬼哭狼嗥 鬼哭狼嚎 +过份 过分 +虾蟆 蛤蟆 +含胡 含糊 +涵蓄 含蓄 +寒伧 寒碜 +喝采 喝彩 +喝倒采 喝倒彩 +哄动 轰动 +宏扬 弘扬 +红通通 红彤彤 +弘论 宏论 +弘图 宏图 +鸿图 宏图 +弘愿 宏愿 +弘旨 宏旨 +鸿福 洪福 +胡臭 狐臭 +胡蝶 蝴蝶 +胡涂 糊涂 +虎魄 琥珀 +花着 花招 +豁拳 划拳 +搳拳 划拳 +恍忽 恍惚 +晖映 辉映 +混水摸鱼 浑水摸鱼 +火伴 伙伴 +机伶 机灵 +激忿 激愤 +计画 计划 +记念 纪念 +寄与 寄予 +茄克 夹克 +佳宾 嘉宾 +驾御 驾驭 +架式 架势 +嫁装 嫁妆 +简炼 简练 +骄奢淫佚 骄奢淫逸 +脚门 角门 +狡滑 狡猾 +脚根 脚跟 +叫化子 叫花子 +精采 精彩 +鸠合 纠合 +鸠集 纠集 +脚色 角色 +刻期 克期 +刻日 克日 +刻划 刻画 +阔老 阔佬 +蓝缕 褴褛 +烂缦 烂漫 +烂熳 烂漫 +狼籍 狼藉 +狼头 榔头 +累坠 累赘 +黎黑 黧黑 +联贯 连贯 +联接 连接 +联绵 连绵 +联缀 连缀 +连袂 联袂 +连翩 联翩 +踉蹡 踉跄 +嘹喨 嘹亮 +撩乱 缭乱 +零丁 伶仃 +囹圉 囹圄 +蹓跶 溜达 +留连 流连 +喽罗 喽啰 +卤莽 鲁莽 +录象 录像 +录相 录像 +落腮胡子 络腮胡子 
+落漠 落寞 +落莫 落寞 +痲痹 麻痹 +痲风 麻风 +痲疹 麻疹 +蚂蜂 马蜂 +马糊 马虎 +门坎 门槛 +糜费 靡费 +绵联 绵连 +摹仿 模仿 +模胡 模糊 +摹拟 模拟 +模写 摹写 +磨擦 摩擦 +磨拳擦掌 摩拳擦掌 +魔难 磨难 +眽眽 脉脉 +谋画 谋划 +那末 那么 +内哄 内讧 +凝炼 凝练 +牛崽裤 牛仔裤 +钮扣 纽扣 +掱手 扒手 +蟠根错节 盘根错节 +盘据 盘踞 +蟠踞 盘踞 +蟠据 盘踞 +蟠曲 盘曲 +盘陁 盘陀 +盘石 磐石 +蟠石 磐石 +盘跚 蹒跚 +旁皇 彷徨 +披星带月 披星戴月 +疲塌 疲沓 +飘泊 漂泊 +飘流 漂流 +漂零 飘零 +飘飖 飘摇 +平空 凭空 +牵联 牵连 +蕉萃 憔悴 +清彻 清澈 +情素 情愫 +惓惓 拳拳 +劝戒 劝诫 +热呼呼 热乎乎 +热呼 热乎 +热中 热衷 +人材 人才 +日蚀 日食 +入坐 入座 +色采 色彩 +杀一警百 杀一儆百 +沙鱼 鲨鱼 +山查 山楂 +舢舨 舢板 +梢公 艄公 +奢糜 奢靡 +伸雪 申雪 +神彩 神采 +湿渌渌 湿漉漉 +十锦 什锦 +收伏 收服 +首坐 首座 +书柬 书简 +思惟 思维 +死心踏地 死心塌地 +塌实 踏实 +菾菜 甜菜 +挺而走险 铤而走险 +透澈 透彻 +图象 图像 +推委 推诿 +玩艺儿 玩意儿 +委过 诿过 +污七八糟 乌七八糟 +无动于中 无动于衷 +无宁 毋宁 +无庸 毋庸 +五采缤纷 五彩缤纷 +五痨七伤 五劳七伤 +瘜肉 息肉 +希罕 稀罕 +希奇 稀奇 +希少 稀少 +希世 稀世 +希有 稀有 +噏动 翕动 +洗炼 洗练 +贤慧 贤惠 +香纯 香醇 +香菰 香菇 +像貌 相貌 +萧洒 潇洒 +小题大作 小题大做 +卸傤 卸载 +信口开合 信口开河 +惺松 惺忪 +秀外惠中 秀外慧中 +叙文 序文 +叙言 序言 +训戒 训诫 +压伏 压服 +压韵 押韵 +雅片 鸦片 +洋琴 扬琴 +要末 要么 +夜消 夜宵 +一槌定音 一锤定音 +一古脑儿 一股脑儿 +衣衿 衣襟 +衣著 衣着 +义无返顾 义无反顾 +霪雨 淫雨 +赢余 盈余 +影象 影像 +余辉 余晖 +鱼具 渔具 +鱼网 渔网 +预会 与会 +预闻 与闻 +御手 驭手 +豫备 预备 +元来 原来 +元煤 原煤 +源源本本 原原本本 +元元本本 原原本本 +原故 缘故 +原由 缘由 +月蚀 月食 +月芽 月牙 +云豆 芸豆 +杂遝 杂沓 +再接再砺 再接再厉 +斩新 崭新 +展转 辗转 +颤栗 战栗 +帐本 账本 +折衷 折中 +这末 这么 +正经八摆 正经八百 +脂麻 芝麻 +支解 肢解 +枝解 肢解 +直捷了当 直截了当 +直接了当 直截了当 +指手划脚 指手画脚 +赒济 周济 +转游 转悠 +装璜 装潢 +姿式 姿势 +子细 仔细 +自各儿 自个儿 +左证 佐证 +安份守己 安分守己 +暗度陈仓 暗渡陈仓 +把势 把式 +班配 般配 +棒锤 棒槌 +棒棰 棒槌 +暴光 曝光 +报导 报道 +悲忿 悲愤 +背理 悖理 +比画 比划 +笔心 笔芯 +荜路蓝缕 筚路蓝缕 +辨白 辩白 +辩辞 辩词 +波浪鼓 拨浪鼓 +泼浪鼓 拨浪鼓 +部份 部分 +菜子 菜籽 +仓惶 仓皇 +仓黄 仓皇 +仓遑 仓皇 +策画 策划 +常年累月 长年累月 +唱工 唱功 +潮呼呼 潮乎乎 +潮忽忽 潮乎乎 +撤消 撤销 +承上起下 承上启下 +吃里扒外 吃里爬外 +踟躇 踟蹰 +串连 串联 +辞汇 词汇 +词令 辞令 +搭拉 耷拉 +答理 搭理 +哒哒 嗒嗒 +搭裢 褡裢 +搭连 褡裢 +褡连 褡裢 +褡联 褡裢 +打冷颤 打冷战 +大放厥辞 大放厥词 +铛铛 当当 +当做 当作 +捣腾 倒腾 +悼辞 悼词 +得意扬扬 得意洋洋 +灯心 灯芯 +滴里嘟噜 嘀里嘟噜 +掉包 调包 +钉梢 盯梢 +丢三拉四 丢三落四 +掉换 调换 +东不拉 冬不拉 +遁辞 遁词 +哆唆 哆嗦 +峨嵋山 峨眉山 +发楞 发愣 +翻然醒悟 幡然醒悟 +反覆 反复 +忿恨 愤恨 +忿怒 愤怒 +夫倡妇随 夫唱妇随 +浮图 浮屠 +辐凑 辐辏 +福份 福分 +俯首贴耳 俯首帖耳 +赋与 赋予 +夹肢窝 胳肢窝 +格登 咯噔 +根柢 根底 +梗咽 哽咽 +宫庭 宫廷 +钩勒 勾勒 +勾针 钩针 +够戗 够呛 +孤另另 孤零零 +孤伶伶 孤零零 +轱轳 轱辘 +毂辘 轱辘 +固步自封 故步自封 +故技 故伎 +锢疾 痼疾 +固疾 痼疾 +刮刮叫 呱呱叫 +呵腰 哈腰 +寒颤 寒战 +嚎啕 号啕 +号咷 号啕 +嚎咷 号啕 +好高务远 好高骛远 +和事老 和事佬 +贺辞 贺词 +黑鼓隆咚 黑咕隆咚 +黑古龙冬 黑咕隆咚 +黑鸦鸦 黑压压 +轰堂大笑 哄堂大笑 +轰笑 哄笑 +宏亮 洪亮 
+呼嗤 呼哧 +呼蚩 呼哧 +呼吃 呼哧 +花狸狐哨 花里胡哨 +花梢 花哨 +花稍 花哨 +花消 花销 +黄历 皇历 +混身 浑身 +浑沌 混沌 +辑逸 辑佚 +给与 给予 +记录片 纪录片 +记要 纪要 +趼子 茧子 +交待 交代 +脚鸭子 脚丫子 +脚指 脚趾 +叫真 较真 +菁华 精华 +警省 警醒 +酒钟 酒盅 +倔犟 倔强 +开消 开销 +砍大山 侃大山 +看做 看作 +夸大其辞 夸大其词 +宽洪大量 宽宏大量 +老趼 老茧 +乐和和 乐呵呵 +乐孜孜 乐滋滋 +利害 厉害 +伶牙利齿 伶牙俐齿 +流言飞语 流言蜚语 +蹓弯儿 遛弯儿 +乱烘烘 乱哄哄 +罗纹 螺纹 +慢道 漫道 +慢说 漫说 +毛骨耸然 毛骨悚然 +毛骨竦然 毛骨悚然 +冒然 贸然 +棉子 棉籽 +藐小 渺小 +渺视 藐视 +渺远 邈远 +溟溟 冥冥 +摸棱两可 模棱两可 +秣马利兵 秣马厉兵 +秣马砺兵 秣马厉兵 +木犀 木樨 +闹轰轰 闹哄哄 +闹烘烘 闹哄哄 +粘稠 黏稠 +粘糊 黏糊 +粘土 黏土 +粘性 黏性 +粘液 黏液 +念道 念叨 +暖呼呼 暖乎乎 +扒犁 爬犁 +判辞 判词 +皮簧 皮黄 +慓悍 剽悍 +飘渺 缥缈 +漂渺 缥缈 +飘眇 缥缈 +飘邈 缥缈 +凭白无故 平白无故 +匍伏 匍匐 +起程 启程 +启锚 起锚 +起迄 起讫 +气门心 气门芯 +牵就 迁就 +遣辞 遣词 +枪枝 枪支 +情份 情分 +屈伏 屈服 +取销 取消 +雀瘢 雀斑 +热剌剌 热辣辣 +如雷灌耳 如雷贯耳 +散逸 散佚 +沙锅 砂锅 +沙壶 砂壶 +沙浆 砂浆 +沙糖 砂糖 +杀风景 煞风景 +杀尾 煞尾 +刹时 霎时 +山颠 山巅 +扇风点火 煽风点火 +闪烁其辞 闪烁其词 +上方宝剑 尚方宝剑 +深醒 深省 +甚么 什么 +神甫 神父 +省分 省份 +拾遗补阙 拾遗补缺 +士女画 仕女画 +视阈 视域 +誓辞 誓词 +授与 授予 +摔交 摔跤 +水份 水分 +水长船高 水涨船高 +思辩 思辨 +死气白赖 死乞白赖 +宿愿 夙愿 +夙来 素来 +夙敌 宿敌 +夙儒 宿儒 +夙怨 宿怨 +梯己 体己 +题辞 题词 +俶傥 倜傥 +瞳人 瞳仁 +产玲 产龄 +退色 褪色 +托咐 托付 +顽耍 玩耍 +玩皮 顽皮 +惟独 唯独 +惟恐 唯恐 +惟利是图 唯利是图 +惟命是从 唯命是从 +惟其 唯其 +惟我独尊 唯我独尊 +惟一 唯一 +萎顿 委顿 +委宛 委婉 +委罪 诿罪 +委靡 萎靡 +委谢 萎谢 +文彩 文采 +无精打彩 无精打采 +无尚 无上 +欷歔 唏嘘 +喜孜孜 喜滋滋 +陷井 陷阱 +项练 项链 +销歇 消歇 +消魂 销魂 +兴高彩烈 兴高采烈 +雄纠纠 雄赳赳 +旋涡 漩涡 +薰陶 熏陶 +丫鬟 丫环 +压宝 押宝 +哑吧 哑巴 +哑叭 哑巴 +言不由中 言不由衷 +一倡百和 一唱百和 +一蹋糊涂 一塌糊涂 +一榻糊涂 一塌糊涂 +一相情愿 一厢情愿 +引伸 引申 +硬梆梆 硬邦邦 +硬帮帮 硬邦邦 +渔汛 鱼汛 +鱼鼓 渔鼓 +约莫 约摸 +殒落 陨落 +在坐 在座 +糟踏 糟蹋 +糟塌 糟蹋 +张惶 张皇 +照像 照相 +珍羞 珍馐 +真象 真相 +枝梧 支吾 +枝捂 支吾 +装聋做哑 装聋作哑 +妆束 装束 +装做 装作 +子畜 仔畜 +子猪 仔猪 +子粒 籽粒 +子棉 籽棉 +子实 籽实 +走露 走漏 +做弊 作弊 +做美 作美 +做弄 作弄 +做声 作声 +做秀 作秀 +座落 坐落 +坐次 座次 +坐位 座位 +旁证博引 旁征博引 +谈笑风声 谈笑风生 +美仑美幻 美轮美奂 +坐阵 坐镇 +不径而走 不胫而走 +飘亮 漂亮 +青纯 清纯 +体晾 体谅 +发杨广大 发扬光大 +浪废水 浪费水 +通货膨涨 通货膨胀 +迫不急待 迫不及待 +堵注 赌注 \ No newline at end of file diff --git a/examples/ngram_lm/data/text_correct.txt b/examples/ngram_lm/data/text_correct.txt new file mode 100644 index 000000000..ded8341bb --- /dev/null +++ b/examples/ngram_lm/data/text_correct.txt @@ -0,0 +1,220 @@ +少先队员因该为老人让坐 +祛痘印可以吗?有效果吗? +不知这款牛奶口感怎样? 小孩子喝行吗! +是转基因油? +我家宝宝13斤用多大码的 +会起坨吗? +请问给送上楼吗? +亲是送赁上门吗 +送货时候有外包装没有还是直接发货过来 +会不会有坏的? +这个米煮粥好还煮饭好吃 +有送的马克杯吗? 
+这纸尿裤分男孩女孩使用吗 +买的路由器老是断网,拔了跳过路由器就可以用了 +能泡开不?辣度几 +请问这个米蒸出来是一粒一粒的还是一坨一坨的? +水和其他商品一样送货上门,还是自提呀? +快两个月的孩子 要穿什么码的 +买回来会不会过期? +洗的还干净把吧 +路由器怎么样啊,掉线严重吗? +你好这米是五斤还是十斤 +收安费不 +给送开果器吗 +这纸好用吗?我看有不少的差评 +自用好用吗 +请问袜子穿久了会往下掉吗? +每一卷是独立包装的吗? +这个火龙果口味怎么样?甜不甜? +买这个送红杯吗? +一袋子多少斤 +这款拉拉裤有味道吗?超市买的没有味道,不知道这个怎么样 +我想问下拉拉裤上面那个贴的用来干嘛的,怎么用 +这里边有没有枣核 +玫瑰和薰衣草哪个好闻 +这个冰糖质量怎么样,有杂质吗 +倒水的时候漏吗 +请问大家,这个水壶烧出来的水有异味吗?因为给宝宝用所以很在意,谢谢大家 +这米煮出来糯吗? +这在款子好用吗?有香味吗? +到底是棉花的材质还是化纤的无纺布啊 求问? +我用360手机能充电几次 +亲这纸好用吗?值得买吗? +24瓶?还是12瓶 +是否是真的纸? +适用机洗吗? +好吃不好吃啊 +真的好用吗?我也想买 +你们拿到是什么版本的 +这水和超市一样吗?质量保证吗? +可以丢进马桶冲吗? +纸会不会粗? +这个翠的还不是不催的呀。。没有吃的那种不脆 +这个好用吗 +这纸有香味的吗? +是最近的生产日期吗 +赠品是什么呀 +这是两瓶还是一瓶的价格? +请问这是硬壳还是软壳? +亲,苹果收到后有坏的吗? +适合两人用吗 +这个直接喝好不好喝 还是要热一下 +纸有木有刺鼻气味? +酸不酸??? +这啤好渴吗? +跟安慕希哪个比较好喝? +好用么,主要是带宝宝出去玩的时候用的多? +刚出生的宝宝用什么码? +能当洗手液吗? +是不是很小包的那一种?50块有24包便宜的有点不敢相信 +好用吗,会不会起会不会起坨? +这个口可以直接放饮水机上用吗? +这种纸掉粉末吗 +手机好用吗?会卡吗 +开盖里面是拉环的吗? +这个电池真的需要一直换吗? +好用吗?是不是正品? +请问有尿显吗 +容易发烫吗 +苹果有腊吗 +这油有这么好吗?不是过期的吧 +这个夏天用会不会红屁股?透气性好吗 +你好。 我想问下这个是尿不湿吗 ? +这奶为啥这么便宜? +你们买的酱油会没有颜色吗,像水一样,看着都没胃口 +这个是机诜,还是手洗 +这个卫生巾带香味吗? +这种洗发水好用吗 +有餡嗎?好不好吃 +纸质不会好差吗? +亲们,此米是真空包装吗? +是软毛的吗?!! +请问大家德运牌子的好喝还是安佳的? +这纸好用吗,薄嘛 +这壶保温吗 +这个威露士货到了就是跟图片上的一样吗?只要是图片上显示的都有吗? +你们买的牛奶是最近日期吗 +这个除菌液,是单独放在滚筒洗衣机除菌液格,还是与洗衣液混合放在洗衣液格? +请问你们的三只松鼠寄回来的时候是用袋子装着的吗 +1kg是不是两斤? +洗衣皂怎么样啊,味道重吗,用之后好不好清洗啊。 +我要请问你这个是不是那个拉拉裤吗?这个花纹是不是拉拉裤? +好多人都说小米运动升级后手环就连不上了,你们有没有这种情况? +这部手机运行速度快不快? +新生儿可以用吗 抽一张会带出来很多张吗 +洗后有香味吗 +体验装有多少片 +银装怎么样?会漏尿吗?你们都是多久换一次的??(我家大概2-3个小时左右,宝宝醒一回换一次) +声音大吗?好用不? +抽纸有味吗 +苹果好吃吗?打过蜡吗?是不是坏的很多? +70g和80g得区别是啥? +袋装的和瓶装的洗衣液是一样的么? +噪音很大吗 +烧出来的水会不会很多一块一块的东西 +这个吹风真心好用吗?我今晚下单什么时候到 +请问各位宝妈 这个乳垫的背胶粘吗 +M号的你们给宝宝用到多大啊?几个月?我家宝宝3个月5㎏重,用花王的M号觉得小了。不知道这个怎么样? +这个喝了能找到女朋友吗 +这袜子耐不耐穿 +请问好用么 是正品么 +怎么储藏 我买了两天在常温阴凉处放着下层有些化了 需要放冰箱冷冻吗 +这批苏打水是否有股消毒水的味道? +质量怎么样,看到那么多差评,我不敢买了。 +会不会有烂的 +为什么我买的用完之后没香味 +甜吗???? +我看到评论里的差评说大米里有虫,是真的吗? +要放冰箱冷藏吗 +好不好吃啊 +这油怎么样 炒菜香不香 +这纸擦手时有屑吗? +是正品的吗? +好用吗 +这个特浓的苦不苦 +这个好用吗? +米里真的有虫吗 +是金装的吗? +双内胆有什么区别,两个一样的吗? +请问这款水可以降尿酸吗? +好用吗这个 +购物袋结实吗,能放重东西吗 +你好,请问这款可以剃头发刮光头吗 +这个纸巾质量如何?好用吗? +好用吗?小孩子喜欢吗? 
+亲。煮面时会糊锅不 +包邮吗运费多少 +会一抽就两三张一起抽起来吗? +一箱几桶油呀 +这个吹风机分冷风和热风吗 +发什么快递呢 +请问一下,有些枸杞说是不要洗,你们的是否建议洗呢? +请问纸有异味吗?我以前买过一箱就是这个居然有异味。 +这是6个么 怎么觉得有好多 +我买的荣耀10横滑home键进入后台这个操作成功率特别低,你们也是这样吗? +你们的有塑料味吗,机械的 +小米路由器真心说的有这么差吗 +请问大家这款刮的干净吗?谢谢 +会有塑料味吗 +质量真的很差吗?不敢买 +这纸有气味吗 +我买两箱怎么要运费 +这个标准果好吃吗,酸不酸 +稀吗?是不是有种兑了水的感觉? +威露士和滴露的消毒液哪个更好用呢? +曰期是几月份的 +手机容易折弯吗? +我家宝宝25斤XL会紧吗? +这款200克一箱的纸张和10卷手提的价格相差那么多 质量一样吗? +豆浆可以打吗 +电量有百分比吗 +用快递送过来瓶子会不会打破 +是三相电吗,有空调摇控器吧 +拿它送人,有问题吗?? +安幕希好喝吗? +这款纸尿裤好用吗?和尤妮佳比较哪个好用些? +2层厚吗?是不是一到水就烂了 +为什么我宝宝拉粑粑后面总是漏出来我已经贴的很牢了,10斤的宝宝用S号也不小啊你们用了没这种情况吗? +这个产品好用吗? +刷毛柔软度咋样,这么便宜,会不会是很小个的 +会不会有过敏的情况呀 +请问是辣条吗 +这种米只能煮粥不能煮饭吗 +可以开袋即食吗? +这米好吃吗? +这个充电宝充满电需要多久 +这个奶开了可以保质喝两天吗 +这种薰衣草的洗衣液怎么样 +你们的小米六边框掉漆了吗??? +这个是机洗用还是手洗用的啊 +厚度怎么样、起球吗感谢大哥大姐们 +这个好喝还是康师傅红茶好喝 +这种洁面膏会不会过敏,我上次用的火山岩冰感洁面啫喱对那种过敏,但听别人说那种稀的本来就特别容易过敏,不知道这种洁面膏会不会过敏! +这杯那么多差评,是真的吗,吓得我都不敢买了 +枣是免洗的吗? +这个尿不湿尿过会起坨吗 +感觉和苏菲比哪个更好用呢? +煮出来的饭香吗? +你好!请问这个水壶烧水开了是自动切电吗? +这个跟 原木纯品 那个啥区别?不是原木纸浆做的? +能放冰箱吗 +纸有味道吗? +2016全国高考卷答题模板 +2016全国大考卷答题模板 +2016全国低考卷答题模板 +床前明月光,疑是地上霜 +床前星星光,疑是地上霜 +床前白月光,疑是地上霜 +落霞与孤鹜齐飞,秋水共长天一色 +落霞与孤鹜齐跑,秋水共长天一色 +落霞与孤鹜双飞,秋水共长天一色 +众里寻他千百度,蓦然回首,那人却在,灯火阑珊处 +众里寻她千百度,蓦然回首,那人却在,灯火阑珊处 +众里寻ta千百度,蓦然回首,那人却在,灯火阑珊处 +吸烟的人容*得癌症 +就只听着我*妈所说的话, +就接受环境污*用化肥和农药, +是或者接受环境污染用化肥和农药, +现在的香港比从前的*荣很多。 +现在的香港比*前的饭荣很多。 diff --git a/examples/ngram_lm/local/build_zh_lm.sh b/examples/ngram_lm/local/build_zh_lm.sh new file mode 100644 index 000000000..73eb165ec --- /dev/null +++ b/examples/ngram_lm/local/build_zh_lm.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +stage=0 +stop_stage=100 + +order=5 +mem=80% +prune=0 +a=22 +q=8 +b=8 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "$0 token_type exp/text exp/text.arpa" + echo $@ + exit 1 +fi + +# char or word +type=$1 +text=$2 +arpa=$3 + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then + # text tn & wordseg preprocess + echo "process text." 
+ python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then + # train ngram lm + echo "build lm." + bash ${MAIN_ROOT}/utils/ngram_train.sh --order ${order} --mem ${mem} --prune "${prune}" ${text}.${type}.tn ${arpa} +fi \ No newline at end of file diff --git a/examples/ngram_lm/local/download_lm_zh.sh b/examples/ngram_lm/local/download_lm_zh.sh new file mode 100755 index 000000000..f9e2261fd --- /dev/null +++ b/examples/ngram_lm/local/download_lm_zh.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DIR=data/lm +mkdir -p ${DIR} + +URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm' +MD5="29e02312deb2e59b3c8686c7966d4fe3" +TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm + + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download the language model!" + exit 1 +fi + + +exit 0 diff --git a/examples/ngram_lm/local/kenlm_score_test.py b/examples/ngram_lm/local/kenlm_score_test.py new file mode 100644 index 000000000..30bc1e4b1 --- /dev/null +++ b/examples/ngram_lm/local/kenlm_score_test.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import sys +import time + +import jieba +import kenlm + +language_model_path = sys.argv[1] +assert os.path.exists(language_model_path) + +start = time.time() +model = kenlm.Model(language_model_path) +print(f"load kenLM cost: {time.time() - start}s") + +sentence = '盘点不怕被税的海淘网站❗️海淘向来便宜又保真!' +sentence_char_split = ' '.join(list(sentence)) +sentence_word_split = ' '.join(jieba.lcut(sentence)) + + +def test_score(): + print('Loaded language model: %s' % language_model_path) + + print(sentence) + print(model.score(sentence)) + print(list(model.full_scores(sentence))) + for i, v in enumerate(model.full_scores(sentence)): + print(i, v) + + print(sentence_char_split) + print(model.score(sentence_char_split)) + print(list(model.full_scores(sentence_char_split))) + split_size = 0 + for i, v in enumerate(model.full_scores(sentence_char_split)): + print(i, v) + split_size += 1 + assert split_size == len( + sentence_char_split.split()) + 1, "error split size." + + print(sentence_word_split) + print(model.score(sentence_word_split)) + print(list(model.full_scores(sentence_word_split))) + for i, v in enumerate(model.full_scores(sentence_word_split)): + print(i, v) + + +def test_full_scores_chars(): + print('Loaded language model: %s' % language_model_path) + print(sentence_char_split) + # Show scores and n-gram matches + words = ['<s>'] + list(sentence) + ['</s>'] + for i, (prob, length, + oov) in enumerate(model.full_scores(sentence_char_split)): + print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length: + i + 2]))) + if oov: + print('\t"{0}" is an OOV'.format(words[i + 1])) + + print("-" * 42) + # Find out-of-vocabulary words + oov = [] + for w in words: + if w not in model: + print('"{0}" is an OOV'.format(w)) + oov.append(w) + assert oov == ["❗", "️", "!"], 'error oov' + + +def test_full_scores_words(): + print('Loaded language model: %s' % language_model_path) + print(sentence_word_split) + # Show scores and n-gram matches + words = ['<s>'] + 
sentence_word_split.split() + ['</s>'] + for i, (prob, length, + oov) in enumerate(model.full_scores(sentence_word_split)): + print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i + 2 - length: + i + 2]))) + if oov: + print('\t"{0}" is an OOV'.format(words[i + 1])) + + print("-" * 42) + # Find out-of-vocabulary words + oov = [] + for w in words: + if w not in model: + print('"{0}" is an OOV'.format(w)) + oov.append(w) + # zh_giga.no_cna_cmn.prune01244.klm is a Chinese character-level LM, so the multi-character words below are OOV + assert oov == ["盘点", "不怕", "网站", "❗", "️", "海淘", "向来", "便宜", "保真", + "!"], 'error oov' + + +def test_full_scores_chars_length(): + """Check how the bos/eos flags change the number of entries full_scores yields.""" + print('Loaded language model: %s' % language_model_path) + r = list(model.full_scores(sentence_char_split)) + n = list(model.full_scores(sentence_char_split, bos=False, eos=False)) + print(r) + print(n) + assert len(r) == len(n) + 1 + + # bos=False, eos=False, input len == output len + print(len(n), len(sentence_char_split.split())) + assert len(n) == len(sentence_char_split.split()) + + k = list(model.full_scores(sentence_char_split, bos=False, eos=True)) + print(k, len(k)) + + +def test_ppl_sentence(): + """Test sentence-level perplexity (ppl) scores.""" + sentence_char_split1 = ' '.join('先救挨饿的人,然后治疗病人。') + sentence_char_split2 = ' '.join('先就挨饿的人,然后治疗病人。') + n = model.perplexity(sentence_char_split1) + print('1', n) + n = model.perplexity(sentence_char_split2) + print(n) + + part_char_split1 = ' '.join('先救挨饿的人') + part_char_split2 = ' '.join('先就挨饿的人') + n = model.perplexity(part_char_split1) + print('2', n) + n = model.perplexity(part_char_split2) + print(n) + + part_char_split1 = '先救挨' + part_char_split2 = '先就挨' + n1 = model.perplexity(part_char_split1) + print('3', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + assert n1 == n2 + + part_char_split1 = '先 救 挨' + part_char_split2 = '先 就 挨' + n1 = model.perplexity(part_char_split1) + print('4', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + + part_char_split1 = '先 救 挨 饿 的 人' + 
part_char_split2 = '先 就 挨 饿 的 人' + n1 = model.perplexity(part_char_split1) + print('5', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + + part_char_split1 = '先 救 挨 饿 的 人 ,' + part_char_split2 = '先 就 挨 饿 的 人 ,' + n1 = model.perplexity(part_char_split1) + print('6', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + + part_char_split1 = '先 救 挨 饿 的 人 , 然 后 治 疗 病 人' + part_char_split2 = '先 就 挨 饿 的 人 , 然 后 治 疗 病 人' + n1 = model.perplexity(part_char_split1) + print('7', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + + part_char_split1 = '先 救 挨 饿 的 人 , 然 后 治 疗 病 人 。' + part_char_split2 = '先 就 挨 饿 的 人 , 然 后 治 疗 病 人 。' + n1 = model.perplexity(part_char_split1) + print('8', n1) + n2 = model.perplexity(part_char_split2) + print(n2) + + +if __name__ == '__main__': + test_score() + test_full_scores_chars() + test_full_scores_words() + test_full_scores_chars_length() + test_ppl_sentence() diff --git a/examples/ngram_lm/path.sh b/examples/ngram_lm/path.sh new file mode 100644 index 000000000..84e2de7d0 --- /dev/null +++ b/examples/ngram_lm/path.sh @@ -0,0 +1,10 @@ +export MAIN_ROOT=${PWD}/../../ + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=/usr/local/lib/:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/examples/ngram_lm/requirements.txt b/examples/ngram_lm/requirements.txt new file mode 100644 index 000000000..523cd3e14 --- /dev/null +++ b/examples/ngram_lm/requirements.txt @@ -0,0 +1 @@ +jieba>=0.39 \ No newline at end of file diff --git a/examples/ngram_lm/run.sh b/examples/ngram_lm/run.sh new file mode 100755 index 000000000..4b507e95c --- /dev/null +++ b/examples/ngram_lm/run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e +source path.sh + +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit -1 + +python3 -c 'import 
kenlm;' || { echo "kenlm package not install!"; exit -1; } + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then + # case 1, test kenlm + # download language model + bash local/download_lm_zh.sh + if [ $? -ne 0 ]; then + exit 1 + fi + + # test kenlm `score` and `full_score` + python local/kenlm_score_test.py data/lm/zh_giga.no_cna_cmn.prune01244.klm +fi + +mkdir -p exp +cp data/text_correct.txt exp/text + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then + # case 2, Chinese character ngram lm build + # output: xxx.arpa xxx.kenlm.bin + input=exp/text + token_type=char + lang=zh + order=5 + prune="0 1 2 4 4" + a=22 + q=8 + b=8 + output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa + echo "build ${token_type} lm." + bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${q} --b ${b} ${token_type} ${input} ${output} +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then + # case 3, Chinese word ngram lm build + # output: xxx.arpa xxx.kenlm.bin + input=exp/text + token_type=word + lang=zh + order=3 + prune="0 0 0" + a=22 + q=8 + b=8 + output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa + echo "build ${token_type} lm." + bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${q} --b ${b} ${token_type} ${input} ${output} +fi diff --git a/setup.sh b/setup.sh index 32d252a04..11daa102a 100644 --- a/setup.sh +++ b/setup.sh @@ -57,11 +57,11 @@ if [ $? != 0 ]; then fi -# install kaldi-comptiable feature -pushd third_party/python_kaldi_features/ -python setup.py install +# install third_party +pushd third_party +bash install.sh if [ $? != 0 ]; then - error_msg "Please check why kaldi feature install error!" + error_msg "Please check why third_party install error!" 
exit -1 fi popd diff --git a/third_party/README.md b/third_party/README.md index e17040ef0..655c826e8 100644 --- a/third_party/README.md +++ b/third_party/README.md @@ -1,8 +1,20 @@ - * [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features) commit: fc1bd6240c2008412ab64dc25045cd872f5e126c ref: https://zhuanlan.zhihu.com/p/55371926 +licence: MIT * [python-pinyin](https://github.com/mozillazg/python-pinyin.git) - commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03 - licence: MIT +commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03 +licence: MIT + +* [zhon](https://github.com/tsroten/zhon) +commit: 09bf543696277f71de502506984661a60d24494c +licence: MIT + +* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git) +commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d +licence: MIT + +* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git) +commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c +licence: MIT diff --git a/third_party/chinese_text_normalization/.gitignore b/third_party/chinese_text_normalization/.gitignore new file mode 100644 index 000000000..f50f06f32 --- /dev/null +++ b/third_party/chinese_text_normalization/.gitignore @@ -0,0 +1,2 @@ +*~ +*.far diff --git a/third_party/chinese_text_normalization/LICENSE b/third_party/chinese_text_normalization/LICENSE new file mode 100644 index 000000000..c6be42fba --- /dev/null +++ b/third_party/chinese_text_normalization/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 SpeechIO + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission 
notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/third_party/chinese_text_normalization/README.md b/third_party/chinese_text_normalization/README.md new file mode 100644 index 000000000..105e8fd52 --- /dev/null +++ b/third_party/chinese_text_normalization/README.md @@ -0,0 +1,112 @@ +# Chinese Text Normalization for Speech Processing + +## Problem + +If you search for "Text Normalization" (TN) on Google and GitHub, you can hardly find open-source projects that are "ready-to-use" for text normalization tasks. Instead, you find a bunch of NLP toolkits or frameworks that *support* TN functionality. There is quite some work between "support text normalization" and "do text normalization". + +## Reason + +* TN is language-dependent, more or less. + + Some of TN processing methods are shared across languages, but a good TN module always involves language-specific knowledge and treatments, more or less. + +* TN is task-specific. + + Even for the same language, different applications require quite different TN. + +* TN is "dirty" + + Constructing and maintaining a set of TN rewrite-rules is painful, whatever toolkits and frameworks you choose. Subtle and intrinsic complexities hide inside TN task itself, not in tools or frameworks. 
+ +* mature TN module is an asset + + Since constructing and maintaining TN is hard, it is actually an asset for commercial companies, hence it is unlikely to find a product-level TN in open-source community (correct me if you find any) + +* TN is a less important topic for either academic or commercials. + +## Goal + +This project sets up a ready-to-use TN module for **Chinese**. Since my background is **speech processing**, this project should be able to handle most common TN tasks, in **Chinese ASR** text processing pipelines. + +## Normalizers + +1. supported NSW (Non-Standard-Word) Normalization + + |NSW type|raw|normalized| + |-|-|-| + |cardinal|这块黄金重达324.75克|这块黄金重达三百二十四点七五克| + |date|她出生于86年8月18日,她弟弟出生于1995年3月1日|她出生于八六年八月十八日 她弟弟出生于一九九五年三月一日| + |digit|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九| + |fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票| + |money|随便来几个价格12块5,34.5元,20.1万|随便来几个价格十二块五 三十四点五元 二十点一万| + |percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨| + |telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一| + + acknowledgement: the NSW normalization codes are based on [Zhiyang Zhou's work here](https://github.com/Joee1995/chn_text_norm.git) + +1. punctuation removal + + For Chinese, it removes punctuation list collected in [Zhon](https://github.com/tsroten/zhon) project, containing + * non-stop puncs + ``` + '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' + ``` + * stop puncs + ``` + '!?。。' + ``` + + For English, it removes Python's `string.punctuation` + +1. multilingual English word upper/lower case conversion + since ASR/TTS lexicons usually unify English entries to uppercase or lowercase, the TN module should adapt with lexicon accordingly. + +## Supported text format + +1. plain text, preferably one sentence per line(most common case in ASR processing). + ``` + 今天早饭吃了没 + 没吃回家吃去吧 + ... + ``` + plain text is default format. + +2. 
Kaldi's transcription format + ``` + KALDI_KEY_UTT001 今天早饭吃了没 + KALDI_KEY_UTT002 没吃回家吃去吧 + ... + ``` + TN will skip the first-column key section and normalize the transcription text that follows + + pass `--has_key` option to switch to kaldi format. + +_note: All input text should be UTF-8 encoded._ + +## Run examples + +* TN (python) + +make sure you have **python3**; python2.X won't work correctly. + +`sh run.sh` in `TN` dir, and compare raw text and normalized text. + +* ITN (thrax) + +make sure you have **thrax** installed, and your PATH should be able to find thrax binaries. + +`sh run.sh` in `ITN` dir. check Makefile for grammar dependency. + +## possible future work + +Since TN is a typical "done is better than perfect" module in the context of ASR, and the current state is sufficient for my purpose, I probably won't update this repo frequently. + +There are indeed some things that need to be improved: + +* For TN, the NSW normalizers in the TN dir are based on regular expressions. I've found some unintended matches; those regex patterns need to be refined for more precise TN coverage. + +* For ITN, extend those thrax rewriting grammars to cover more scenarios. + +* Furthermore, commercial systems nowadays are starting to introduce RNN-like models into TN, and a mixed (rule-based & model-based) system is state-of-the-art. For more reading on this, look for Richard Sproat and Kyle Gorman's work at Google. 
+ +END diff --git a/third_party/chinese_text_normalization/python/cn_tn.py b/third_party/chinese_text_normalization/python/cn_tn.py new file mode 100755 index 000000000..bac1c19ea --- /dev/null +++ b/third_party/chinese_text_normalization/python/cn_tn.py @@ -0,0 +1,794 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Authors: +# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git) +# 2019.9 Jiayu DU +# +# requirements: +# - python 3.X +# notes: python 2.X WILL fail or produce misleading results + +import sys, os, argparse, codecs, string, re + +# ================================================================================ # +# basic constant +# ================================================================================ # +CHINESE_DIGIS = u'零一二三四五六七八九' +BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' +BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' +SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' +LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' +LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' +SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' + +ZERO_ALT = u'〇' +ONE_ALT = u'幺' +TWO_ALTS = [u'两', u'兩'] + +POSITIVE = [u'正', u'正'] +NEGATIVE = [u'负', u'負'] +POINT = [u'点', u'點'] +# PLUS = [u'加', u'加'] +# SIL = [u'杠', u'槓'] + +# 中文数字系统类型 +NUMBERING_TYPES = ['low', 'mid', 'high'] + +CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ + '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' +CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ + '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ + '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ + '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ + '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ + 
'纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' + +# punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) +CHINESE_PUNC_STOP = '!?。。' +CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' +CHINESE_PUNC_OTHER = '·〈〉-' +CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER + +# ================================================================================ # +# basic class +# ================================================================================ # +class ChineseChar(object): + """ + 中文字符 + 每个字符对应简体和繁体, + e.g. 简体 = '负', 繁体 = '負' + 转换时可转换为简体或繁体 + """ + + def __init__(self, simplified, traditional): + self.simplified = simplified + self.traditional = traditional + #self.__repr__ = self.__str__ + + def __str__(self): + return self.simplified or self.traditional or None + + def __repr__(self): + return self.__str__() + + +class ChineseNumberUnit(ChineseChar): + """ + 中文数字/数位字符 + 每个字符除繁简体外还有一个额外的大写字符 + e.g. 
'陆' 和 '陸' + """ + + def __init__(self, power, simplified, traditional, big_s, big_t): + super(ChineseNumberUnit, self).__init__(simplified, traditional) + self.power = power + self.big_s = big_s + self.big_t = big_t + + def __str__(self): + return '10^{}'.format(self.power) + + @classmethod + def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False): + + if small_unit: + return ChineseNumberUnit(power=index + 1, + simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[0]: + return ChineseNumberUnit(power=index + 8, + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[1]: + return ChineseNumberUnit(power=(index + 2) * 4, + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[2]: + return ChineseNumberUnit(power=pow(2, index + 3), + simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1]) + else: + raise ValueError( + 'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type)) + + +class ChineseNumberDigit(ChineseChar): + """ + 中文数字字符 + """ + + def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None): + super(ChineseNumberDigit, self).__init__(simplified, traditional) + self.value = value + self.big_s = big_s + self.big_t = big_t + self.alt_s = alt_s + self.alt_t = alt_t + + def __str__(self): + return str(self.value) + + @classmethod + def create(cls, i, v): + return ChineseNumberDigit(i, v[0], v[1], v[2], v[3]) + + +class ChineseMath(ChineseChar): + """ + 中文数位字符 + """ + + def __init__(self, simplified, traditional, symbol, expression=None): + super(ChineseMath, self).__init__(simplified, traditional) + self.symbol = symbol + self.expression = expression + self.big_s = simplified + self.big_t = traditional + + +CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, 
ChineseNumberDigit, ChineseMath + + +class NumberSystem(object): + """ + 中文数字系统 + """ + pass + + +class MathSymbol(object): + """ + 用于中文数字系统的数学符号 (繁/简体), e.g. + positive = ['正', '正'] + negative = ['负', '負'] + point = ['点', '點'] + """ + + def __init__(self, positive, negative, point): + self.positive = positive + self.negative = negative + self.point = point + + def __iter__(self): + for v in self.__dict__.values(): + yield v + + +# class OtherSymbol(object): +# """ +# 其他符号 +# """ +# +# def __init__(self, sil): +# self.sil = sil +# +# def __iter__(self): +# for v in self.__dict__.values(): +# yield v + + +# ================================================================================ # +# basic utils +# ================================================================================ # +def create_system(numbering_type=NUMBERING_TYPES[1]): + """ + 根据数字系统类型返回创建相应的数字系统,默认为 mid + NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型 + low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc. + mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc. + high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc. 
+ 返回对应的数字系统 + """ + + # chinese number units of '亿' and larger + all_larger_units = zip( + LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) + larger_units = [CNU.create(i, v, numbering_type, False) + for i, v in enumerate(all_larger_units)] + # chinese number units of '十, 百, 千, 万' + all_smaller_units = zip( + SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) + smaller_units = [CNU.create(i, v, small_unit=True) + for i, v in enumerate(all_smaller_units)] + # digis + chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, + BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL) + digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)] + digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT + digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT + digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1] + + # symbols + positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x) + negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x) + point_cn = CM(POINT[0], POINT[1], '.', lambda x, + y: float(str(x) + '.' 
+ str(y))) + # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y))) + system = NumberSystem() + system.units = smaller_units + larger_units + system.digits = digits + system.math = MathSymbol(positive_cn, negative_cn, point_cn) + # system.symbols = OtherSymbol(sil_cn) + return system + + +def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]): + + def get_symbol(char, system): + for u in system.units: + if char in [u.traditional, u.simplified, u.big_s, u.big_t]: + return u + for d in system.digits: + if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]: + return d + for m in system.math: + if char in [m.traditional, m.simplified]: + return m + + def string2symbols(chinese_string, system): + int_string, dec_string = chinese_string, '' + for p in [system.math.point.simplified, system.math.point.traditional]: + if p in chinese_string: + int_string, dec_string = chinese_string.split(p) + break + return [get_symbol(c, system) for c in int_string], \ + [get_symbol(c, system) for c in dec_string] + + def correct_symbols(integer_symbols, system): + """ + 一百八 to 一百八十 + 一亿一千三百万 to 一亿 一千万 三百万 + """ + + if integer_symbols and isinstance(integer_symbols[0], CNU): + if integer_symbols[0].power == 1: + integer_symbols = [system.digits[1]] + integer_symbols + + if len(integer_symbols) > 1: + if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU): + integer_symbols.append( + CNU(integer_symbols[-2].power - 1, None, None, None, None)) + + result = [] + unit_count = 0 + for s in integer_symbols: + if isinstance(s, CND): + result.append(s) + unit_count = 0 + elif isinstance(s, CNU): + current_unit = CNU(s.power, None, None, None, None) + unit_count += 1 + + if unit_count == 1: + result.append(current_unit) + elif unit_count > 1: + for i in range(len(result)): + if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power: + result[-i - 1] = CNU(result[-i - 1].power + + 
current_unit.power, None, None, None, None) + return result + + def compute_value(integer_symbols): + """ + Compute the value. + When current unit is larger than previous unit, current unit * all previous units will be used as all previous units. + e.g. '两千万' = 2000 * 10000 not 2000 + 10000 + """ + value = [0] + last_power = 0 + for s in integer_symbols: + if isinstance(s, CND): + value[-1] = s.value + elif isinstance(s, CNU): + value[-1] *= pow(10, s.power) + if s.power > last_power: + value[:-1] = list(map(lambda v: v * + pow(10, s.power), value[:-1])) + last_power = s.power + value.append(0) + return sum(value) + + system = create_system(numbering_type) + int_part, dec_part = string2symbols(chinese_string, system) + int_part = correct_symbols(int_part, system) + int_str = str(compute_value(int_part)) + dec_str = ''.join([str(d.value) for d in dec_part]) + if dec_part: + return '{0}.{1}'.format(int_str, dec_str) + else: + return int_str + + +def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False, + traditional=False, alt_zero=False, alt_one=False, alt_two=True, + use_zeros=True, use_units=True): + + def get_value(value_string, use_zeros=True): + + striped_string = value_string.lstrip('0') + + # record nothing if all zeros + if not striped_string: + return [] + + # record one digits + elif len(striped_string) == 1: + if use_zeros and len(value_string) != len(striped_string): + return [system.digits[0], system.digits[int(striped_string)]] + else: + return [system.digits[int(striped_string)]] + + # recursively record multiple digits + else: + result_unit = next(u for u in reversed( + system.units) if u.power < len(striped_string)) + result_string = value_string[:-result_unit.power] + return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:]) + + system = create_system(numbering_type) + + int_dec = number_string.split('.') + if len(int_dec) == 1: + int_string = int_dec[0] + dec_string = "" + elif len(int_dec) == 
2: + int_string = int_dec[0] + dec_string = int_dec[1] + else: + raise ValueError( + "invalid input num string with more than one dot: {}".format(number_string)) + + if use_units and len(int_string) > 1: + result_symbols = get_value(int_string) + else: + result_symbols = [system.digits[int(c)] for c in int_string] + dec_symbols = [system.digits[int(c)] for c in dec_string] + if dec_string: + result_symbols += [system.math.point] + dec_symbols + + if alt_two: + liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, + system.digits[2].big_s, system.digits[2].big_t) + for i, v in enumerate(result_symbols): + if isinstance(v, CND) and v.value == 2: + next_symbol = result_symbols[i + + 1] if i < len(result_symbols) - 1 else None + previous_symbol = result_symbols[i - 1] if i > 0 else None + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): + result_symbols[i] = liang + + # if big is True, '两' will not be used and `alt_two` has no impact on output + if big: + attr_name = 'big_' + if traditional: + attr_name += 't' + else: + attr_name += 's' + else: + if traditional: + attr_name = 'traditional' + else: + attr_name = 'simplified' + + result = ''.join([getattr(s, attr_name) for s in result_symbols]) + + # if not use_zeros: + # result = result.strip(getattr(system.digits[0], attr_name)) + + if alt_zero: + result = result.replace( + getattr(system.digits[0], attr_name), system.digits[0].alt_s) + + if alt_one: + result = result.replace( + getattr(system.digits[1], attr_name), system.digits[1].alt_s) + + for i, p in enumerate(POINT): + if result.startswith(p): + return CHINESE_DIGIS[0] + result + + # ^10, 11, .., 19 + if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \ + result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], 
BIG_CHINESE_DIGIS_TRADITIONAL[1]]: + result = result[1:] + + return result + + +# ================================================================================ # +# different types of rewriters +# ================================================================================ # +class Cardinal: + """ + CARDINAL类 + """ + + def __init__(self, cardinal=None, chntext=None): + self.cardinal = cardinal + self.chntext = chntext + + def chntext2cardinal(self): + return chn2num(self.chntext) + + def cardinal2chntext(self): + return num2chn(self.cardinal) + +class Digit: + """ + DIGIT类 + """ + + def __init__(self, digit=None, chntext=None): + self.digit = digit + self.chntext = chntext + + # def chntext2digit(self): + # return chn2num(self.chntext) + + def digit2chntext(self): + return num2chn(self.digit, alt_two=False, use_units=False) + + +class TelePhone: + """ + TELEPHONE类 + """ + + def __init__(self, telephone=None, raw_chntext=None, chntext=None): + self.telephone = telephone + self.raw_chntext = raw_chntext + self.chntext = chntext + + # def chntext2telephone(self): + # sil_parts = self.raw_chntext.split('<SIL>') + # self.telephone = '-'.join([ + # str(chn2num(p)) for p in sil_parts + # ]) + # return self.telephone + + def telephone2chntext(self, fixed=False): + + if fixed: + sil_parts = self.telephone.split('-') + self.raw_chntext = '<SIL>'.join([ + num2chn(part, alt_two=False, use_units=False) for part in sil_parts + ]) + self.chntext = self.raw_chntext.replace('<SIL>', '') + else: + sp_parts = self.telephone.strip('+').split() + self.raw_chntext = '<SP>'.join([ + num2chn(part, alt_two=False, use_units=False) for part in sp_parts + ]) + self.chntext = self.raw_chntext.replace('<SP>', '') + return self.chntext + + +class Fraction: + """ + FRACTION类 + """ + + def __init__(self, fraction=None, chntext=None): + self.fraction = fraction + self.chntext = chntext + + def chntext2fraction(self): + denominator, numerator = self.chntext.split('分之') + return 
chn2num(numerator) + '/' + chn2num(denominator) + + def fraction2chntext(self): + numerator, denominator = self.fraction.split('/') + return num2chn(denominator) + '分之' + num2chn(numerator) + + +class Date: + """ + DATE类 + """ + + def __init__(self, date=None, chntext=None): + self.date = date + self.chntext = chntext + + # def chntext2date(self): + # chntext = self.chntext + # try: + # year, other = chntext.strip().split('年', maxsplit=1) + # year = Digit(chntext=year).digit2chntext() + '年' + # except ValueError: + # other = chntext + # year = '' + # if other: + # try: + # month, day = other.strip().split('月', maxsplit=1) + # month = Cardinal(chntext=month).chntext2cardinal() + '月' + # except ValueError: + # day = chntext + # month = '' + # if day: + # day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1] + # else: + # month = '' + # day = '' + # date = year + month + day + # self.date = date + # return self.date + + def date2chntext(self): + date = self.date + try: + year, other = date.strip().split('年', 1) + year = Digit(digit=year).digit2chntext() + '年' + except ValueError: + other = date + year = '' + if other: + try: + month, day = other.strip().split('月', 1) + month = Cardinal(cardinal=month).cardinal2chntext() + '月' + except ValueError: + day = date + month = '' + if day: + day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1] + else: + month = '' + day = '' + chntext = year + month + day + self.chntext = chntext + return self.chntext + + +class Money: + """ + MONEY类 + """ + + def __init__(self, money=None, chntext=None): + self.money = money + self.chntext = chntext + + # def chntext2money(self): + # return self.money + + def money2chntext(self): + money = self.money + pattern = re.compile(r'(\d+(\.\d+)?)') + matchers = pattern.findall(money) + if matchers: + for matcher in matchers: + money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext()) + self.chntext = money + return self.chntext + + +class Percentage: + """ 
+ PERCENTAGE类 + """ + + def __init__(self, percentage=None, chntext=None): + self.percentage = percentage + self.chntext = chntext + + def chntext2percentage(self): + return chn2num(self.chntext.strip().strip('百分之')) + '%' + + def percentage2chntext(self): + return '百分之' + num2chn(self.percentage.strip().strip('%')) + + +# ================================================================================ # +# NSW Normalizer +# ================================================================================ # +class NSWNormalizer: + def __init__(self, raw_text): + self.raw_text = '^' + raw_text + '$' + self.norm_text = '' + + def _particular(self): + text = self.norm_text + pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))") + matchers = pattern.findall(text) + if matchers: + # print('particular') + for matcher in matchers: + text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1) + self.norm_text = text + return self.norm_text + + def normalize(self): + text = self.raw_text + + # 规范化日期 + pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)") + matchers = pattern.findall(text) + if matchers: + #print('date') + for matcher in matchers: + text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1) + + # 规范化金钱 + pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" 
+ CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)") + matchers = pattern.findall(text) + if matchers: + #print('money') + for matcher in matchers: + text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1) + + # 规范化固话/手机号码 + # 手机 + # http://www.jihaoba.com/news/show/13680 + # 移动:139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198 + # 联通:130、131、132、156、155、186、185、176 + # 电信:133、153、189、180、181、177 + pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D") + matchers = pattern.findall(text) + if matchers: + #print('telephone') + for matcher in matchers: + text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1) + # 固话 + pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D") + matchers = pattern.findall(text) + if matchers: + # print('fixed telephone') + for matcher in matchers: + text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1) + + # 规范化分数 + pattern = re.compile(r"(\d+/\d+)") + matchers = pattern.findall(text) + if matchers: + #print('fraction') + for matcher in matchers: + text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1) + + # 规范化百分数 + text = text.replace('%', '%') + pattern = re.compile(r"(\d+(\.\d+)?%)") + matchers = pattern.findall(text) + if matchers: + #print('percentage') + for matcher in matchers: + text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1) + + # 规范化纯数+量词 + pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" 
+ COM_QUANTIFIERS) + matchers = pattern.findall(text) + if matchers: + #print('cardinal+quantifier') + for matcher in matchers: + text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) + + # 规范化数字编号 + pattern = re.compile(r"(\d{4,32})") + matchers = pattern.findall(text) + if matchers: + #print('digit') + for matcher in matchers: + text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1) + + # 规范化纯数 + pattern = re.compile(r"(\d+(\.\d+)?)") + matchers = pattern.findall(text) + if matchers: + #print('cardinal') + for matcher in matchers: + text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1) + + self.norm_text = text + self._particular() + + return self.norm_text.lstrip('^').rstrip('$') + + +def nsw_test_case(raw_text): + print('I:' + raw_text) + print('O:' + NSWNormalizer(raw_text).normalize()) + print('') + + +def nsw_test(): + nsw_test_case('固话:0595-23865596或23880880。') + nsw_test_case('固话:0595-23865596或23880880。') + nsw_test_case('手机:+86 19859213959或15659451527。') + nsw_test_case('分数:32477/76391。') + nsw_test_case('百分数:80.03%。') + nsw_test_case('编号:31520181154418。') + nsw_test_case('纯数:2983.07克或12345.60米。') + nsw_test_case('日期:1999年2月20日或09年3月15号。') + nsw_test_case('金钱:12块5,34.5元,20.1万') + nsw_test_case('特殊:O2O或B2C。') + nsw_test_case('3456万吨') + nsw_test_case('2938个') + nsw_test_case('938') + nsw_test_case('今天吃了115个小笼包231个馒头') + nsw_test_case('有62%的概率') + + +if __name__ == '__main__': + #nsw_test() + + p = argparse.ArgumentParser() + p.add_argument('ifile', help='input filename, assume utf-8 encoding') + p.add_argument('ofile', help='output filename') + p.add_argument('--to_upper', action='store_true', help='convert to upper case') + p.add_argument('--to_lower', action='store_true', help='convert to lower case') + p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.") + p.add_argument('--log_interval', type=int, default=100000, help='log 
interval in number of processed lines') + args = p.parse_args() + + ifile = codecs.open(args.ifile, 'r', 'utf8') + ofile = codecs.open(args.ofile, 'w+', 'utf8') + + n = 0 + for l in ifile: + key = '' + text = '' + if args.has_key: + cols = l.split(maxsplit=1) + key = cols[0] + if len(cols) == 2: + text = cols[1].strip() + else: + text = '' + else: + text = l.strip() + + # cases + if args.to_upper and args.to_lower: + sys.stderr.write('cn_tn.py: to_upper OR to_lower?') + exit(1) + if args.to_upper: + text = text.upper() + if args.to_lower: + text = text.lower() + + # NSW(Non-Standard-Word) normalization + text = NSWNormalizer(text).normalize() + + # Punctuations removal + old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations + new_chars = ' ' * len(old_chars) + del_chars = '' + text = text.translate(str.maketrans(old_chars, new_chars, del_chars)) + + # + if args.has_key: + ofile.write(key + '\t' + text + '\n') + else: + if text.strip() != '': # skip empty line in pure text format(without Kaldi's utt key) + ofile.write(text + '\n') + + n += 1 + if n % args.log_interval == 0: + sys.stderr.write("cn_tn.py: {} lines done.\n".format(n)) + sys.stderr.flush() + + sys.stderr.write("cn_tn.py: {} lines done in total.\n".format(n)) + sys.stderr.flush() + + ifile.close() + ofile.close() diff --git a/third_party/chinese_text_normalization/python/example_kaldi.txt b/third_party/chinese_text_normalization/python/example_kaldi.txt new file mode 100644 index 000000000..07af5674b --- /dev/null +++ b/third_party/chinese_text_normalization/python/example_kaldi.txt @@ -0,0 +1,7 @@ +UTT000 这块黄金重达324.75克 +UTT001 她出生于86年8月18日,她弟弟出生于1995年3月1日 +UTT002 电影中梁朝伟扮演的陈永仁的编号27149 +UTT003 现场有7/12的观众投出了赞成票 +UTT004 随便来几个价格12块5,34.5元,20.1万 +UTT005 明天有62%的概率降雨 +UTT006 这是固话0421-33441122或这是手机+86 18544139121 diff --git a/third_party/chinese_text_normalization/python/example_plain.txt b/third_party/chinese_text_normalization/python/example_plain.txt new file mode 100644 
index 000000000..14e5a09fe --- /dev/null +++ b/third_party/chinese_text_normalization/python/example_plain.txt @@ -0,0 +1,7 @@ +这块黄金重达324.75克 +她出生于86年8月18日,她弟弟出生于1995年3月1日 +电影中梁朝伟扮演的陈永仁的编号27149 +现场有7/12的观众投出了赞成票 +随便来几个价格12块5,34.5元,20.1万 +明天有62%的概率降雨 +这是固话0421-33441122或这是手机+86 18544139121 diff --git a/third_party/chinese_text_normalization/python/run.sh b/third_party/chinese_text_normalization/python/run.sh new file mode 100644 index 000000000..0866d72f0 --- /dev/null +++ b/third_party/chinese_text_normalization/python/run.sh @@ -0,0 +1,8 @@ +# for plain text +python3 cn_tn.py example_plain.txt output_plain.txt +diff example_plain.txt output_plain.txt + +# for Kaldi's trans format +python3 cn_tn.py --has_key example_kaldi.txt output_kaldi.txt +diff example_kaldi.txt output_kaldi.txt + diff --git a/third_party/chinese_text_normalization/thrax/INSTALL.txt b/third_party/chinese_text_normalization/thrax/INSTALL.txt new file mode 100644 index 000000000..dcbd58c50 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/INSTALL.txt @@ -0,0 +1,24 @@ +0. place install_thrax.sh into $KALDI/tools/extras/ + +1. recompile openfst with necessary option "--enable-grm" to support thrax: +* cd $KALDI_ROOT/tools +* make clean +* edit $KALDI_ROOT/tools/Makefile, append "--enable-grm" option to OPENFST_CONFIGURE: +OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts --enable-lookahead-fsts --with-pic --enable-grm +* make -j 10 + +2. install thrax +cd $KALDI_ROOT/tools +sh extras/install_thrax.sh + +3. add thrax binary path into $KALDI_ROOT/tools/env.sh: +export PATH=/path/to/your/kaldi_root/tools/thrax-1.2.9/src/bin:${PATH} + +usage: +before you run anything related to thrax, use: +. $KALDI_ROOT/tools/env.sh +to enable binary finding, like what we always do in kaldi. 
+ +sample usage: +sh run_en.sh +sh run_cn.sh diff --git a/third_party/chinese_text_normalization/thrax/install_thrax.sh b/third_party/chinese_text_normalization/thrax/install_thrax.sh new file mode 100755 index 000000000..20d2757b9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/install_thrax.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## This script should be placed under $KALDI_ROOT/tools/extras/, and see INSTALL.txt for installation guide +if [ ! -f thrax-1.2.9.tar.gz ]; then + wget http://www.openfst.org/twiki/pub/GRM/ThraxDownload/thrax-1.2.9.tar.gz + tar -zxf thrax-1.2.9.tar.gz +fi +cd thrax-1.2.9 +OPENFSTPREFIX=`pwd`/../openfst +LDFLAGS="-L${OPENFSTPREFIX}/lib" CXXFLAGS="-I${OPENFSTPREFIX}/include" ./configure --prefix ${OPENFSTPREFIX} +make -j 10; make install +cd .. + diff --git a/third_party/chinese_text_normalization/thrax/papers/gorman-sproat-2016.pdf b/third_party/chinese_text_normalization/thrax/papers/gorman-sproat-2016.pdf new file mode 100644 index 000000000..14a438c7f Binary files /dev/null and b/third_party/chinese_text_normalization/thrax/papers/gorman-sproat-2016.pdf differ diff --git a/third_party/chinese_text_normalization/thrax/papers/wu-etal-2016.pdf b/third_party/chinese_text_normalization/thrax/papers/wu-etal-2016.pdf new file mode 100644 index 000000000..c7d1068fe Binary files /dev/null and b/third_party/chinese_text_normalization/thrax/papers/wu-etal-2016.pdf differ diff --git a/third_party/chinese_text_normalization/thrax/run_cn.sh b/third_party/chinese_text_normalization/thrax/run_cn.sh new file mode 100644 index 000000000..81bb2893e --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/run_cn.sh @@ -0,0 +1,6 @@ +cd src/cn +thraxmakedep itn.grm +make +#thraxrewrite-tester --far=itn.far --rules=ITN +cat ../../testcase_cn.txt | thraxrewrite-tester --far=itn.far --rules=ITN +cd - diff --git a/third_party/chinese_text_normalization/thrax/run_en.sh b/third_party/chinese_text_normalization/thrax/run_en.sh new file mode 
100644 index 000000000..f8526487d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/run_en.sh @@ -0,0 +1,6 @@ +cd src +thraxmakedep en/verbalizer/podspeech.grm +make +cat ../testcase_en.txt +cat ../testcase_en.txt | thraxrewrite-tester --far=en/verbalizer/podspeech.far --rules=POD_SPEECH_TN +cd - diff --git a/third_party/chinese_text_normalization/thrax/src/LICENSE b/third_party/chinese_text_normalization/thrax/src/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/chinese_text_normalization/thrax/src/Makefile b/third_party/chinese_text_normalization/thrax/src/Makefile new file mode 100644 index 000000000..6937ab5f7 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/Makefile @@ -0,0 +1,65 @@ +en/verbalizer/podspeech.far: en/verbalizer/podspeech.grm util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far + thraxcompiler --input_grammar=$< --output_far=$@ + +util/util.far: util/util.grm util/byte.far util/case.far + thraxcompiler --input_grammar=$< --output_far=$@ + +util/byte.far: util/byte.grm + thraxcompiler --input_grammar=$< --output_far=$@ + +util/case.far: util/case.grm util/byte.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/number_names.far: en/verbalizer/number_names.grm 
util/arithmetic.far en/verbalizer/g.fst en/verbalizer/cardinals.tsv en/verbalizer/ordinals.tsv + thraxcompiler --input_grammar=$< --output_far=$@ + +util/arithmetic.far: util/arithmetic.grm util/byte.far util/germanic.tsv + thraxcompiler --input_grammar=$< --output_far=$@ + +universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far + thraxcompiler --input_grammar=$< --output_far=$@ + +ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm 
en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far + thraxcompiler --input_grammar=$< --output_far=$@ + +en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far + thraxcompiler --input_grammar=$< --output_far=$@ + +clean: + rm -f util/util.far util/case.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far util/byte.far en/verbalizer/number_names.far universal/thousands_punct.far util/arithmetic.far en/verbalizer/factorization.far en/verbalizer/lexical_map.far ru/classifier/cyrillic.far diff --git a/third_party/chinese_text_normalization/thrax/src/README.md b/third_party/chinese_text_normalization/thrax/src/README.md new file mode 100644 index 000000000..878ff18fb --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/README.md @@ -0,0 +1,24 @@ +# Text normalization covering grammars + +This repository provides covering grammars for English and Russian text normalization as +documented in: + + Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization. + _Transactions of the Association for Computational Linguistics_ 4: 507-519. + + Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised + written-to-spoken text normalization. In _ASRU_, pages 665-670. + +If you use these grammars in a publication, we would appreciate if you cite these works. 
+ +## Building + +The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory. + +## License + +See `LICENSE`. + +## Mandatory disclaimer + +This is not an official Google product. diff --git a/third_party/chinese_text_normalization/thrax/src/cn/Makefile b/third_party/chinese_text_normalization/thrax/src/cn/Makefile new file mode 100644 index 000000000..2ff2d74ae --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/Makefile @@ -0,0 +1,23 @@ +itn.far: itn.grm byte.far number.far hotfix.far percentage.far date.far amount.far + thraxcompiler --input_grammar=$< --output_far=$@ + +byte.far: byte.grm + thraxcompiler --input_grammar=$< --output_far=$@ + +number.far: number.grm byte.far + thraxcompiler --input_grammar=$< --output_far=$@ + +hotfix.far: hotfix.grm byte.far hotfix.list + thraxcompiler --input_grammar=$< --output_far=$@ + +percentage.far: percentage.grm byte.far number.far + thraxcompiler --input_grammar=$< --output_far=$@ + +date.far: date.grm byte.far number.far + thraxcompiler --input_grammar=$< --output_far=$@ + +amount.far: amount.grm byte.far number.far + thraxcompiler --input_grammar=$< --output_far=$@ + +clean: + rm -f byte.far number.far hotfix.far percentage.far date.far amount.far diff --git a/third_party/chinese_text_normalization/thrax/src/cn/amount.grm b/third_party/chinese_text_normalization/thrax/src/cn/amount.grm new file mode 100644 index 000000000..a83b3bee2 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/amount.grm @@ -0,0 +1,24 @@ +import 'byte.grm' as b; +import 'number.grm' as n; + +unit = ( + "匹"|"张"|"座"|"回"|"场"|"尾"|"条"|"个"|"首"|"阙"|"阵"|"网"|"炮"| + "顶"|"丘"|"棵"|"只"|"支"|"袭"|"辆"|"挑"|"担"|"颗"|"壳"|"窠"|"曲"| + "墙"|"群"|"腔"|"砣"|"座"|"客"|"贯"|"扎"|"捆"|"刀"|"令"|"打"|"手"| + "罗"|"坡"|"山"|"岭"|"江"|"溪"|"钟"|"队"|"单"|"双"|"对"|"出"|"口"| + "头"|"脚"|"板"|"跳"|"枝"|"件"|"贴"|"针"|"线"|"管"|"名"|"位"|"身"| + 
"堂"|"课"|"本"|"页"|"家"|"户"|"层"|"丝"|"毫"|"厘"|"分"|"钱"|"两"| + "斤"|"担"|"铢"|"石"|"钧"|"锱"|"忽"|"毫"|"厘"|"分"|"寸"|"尺"|"丈"| + "里"|"寻"|"常"|"铺"|"程"|"撮"|"勺"|"合"|"升"|"斗"|"石"|"盘"|"碗"| + "碟"|"叠"|"桶"|"笼"|"盆"|"盒"|"杯"|"钟"|"斛"|"锅"|"簋"|"篮"|"盘"| + "桶"|"罐"|"瓶"|"壶"|"卮"|"盏"|"箩"|"箱"|"煲"|"啖"|"袋"|"钵"|"年"| + "月"|"日"|"季"|"刻"|"时"|"周"|"天"|"秒"|"分"|"旬"|"纪"|"岁"|"世"| + "更"|"夜"|"春"|"夏"|"秋"|"冬"|"代"|"伏"|"辈"|"丸"|"泡"|"粒"|"颗"| + "幢"|"堆"|"条"|"根"|"支"|"道"|"面"|"片"|"张"|"颗"|"块"| + (("千克":"kg")|("毫克":"mg")|("微克":"µg"))| + (("千米":"km")|("厘米":"cm")|("毫米":"mm")|("微米":"µm")|("纳米":"nm")) +); + +amount = n.number unit; +export AMOUNT = CDRewrite[amount, "", "", b.kBytes*]; + diff --git a/third_party/chinese_text_normalization/thrax/src/cn/byte.grm b/third_party/chinese_text_normalization/thrax/src/cn/byte.grm new file mode 100644 index 000000000..f23337344 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/byte.grm @@ -0,0 +1,76 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright 2005-2011 Google, Inc. +# Author: ttai@google.com (Terry Tai) + +# Standard constants for ASCII (byte) based strings. This mirrors the +# functions provided by C/C++'s ctype.h library. + +# Note that [0] is missing. Matching the string-termination character is kinda weird. 
+export kBytes = Optimize[ + "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" | + "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" | + "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" | + "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" | + "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" | + "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" | + "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" | + "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" | + "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" | + "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" | +"[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" | +"[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" | +"[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" | +"[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" | +"[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" | +"[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" | +"[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" | +"[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" | +"[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" | +"[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" | +"[201]" | "[202]" | "[203]" | "[204]" | "[205]" | 
"[206]" | "[207]" | "[208]" | "[209]" | "[210]" | +"[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" | +"[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" | +"[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" | +"[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" | +"[251]" | "[252]" | "[253]" | "[254]" | "[255]" +]; + +export kDigit = Optimize[ + "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" +]; + +export kLower = Optimize[ + "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | + "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" +]; +export kUpper = Optimize[ + "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | + "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" +]; +export kAlpha = Optimize[kLower | kUpper]; + +export kAlnum = Optimize[kDigit | kAlpha]; + +export kSpace = Optimize[ + " " | "\t" | "\n" | "\r" +]; +export kNotSpace = Optimize[kBytes - kSpace]; + +export kPunct = Optimize[ + "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | + "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" 
| "@" | "\[" | "\\" | + "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" +]; + +export kGraph = Optimize[kAlnum | kPunct]; diff --git a/third_party/chinese_text_normalization/thrax/src/cn/date.grm b/third_party/chinese_text_normalization/thrax/src/cn/date.grm new file mode 100644 index 000000000..546937383 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/date.grm @@ -0,0 +1,10 @@ +import 'byte.grm' as b; +import 'number.grm' as n; + +date_day = n.number_1_to_99 ("日"|"号"); +date_month_day = n.number_1_to_99 "月" date_day; +date_year_month_day = ((n.number_0_to_9){2,4} | n.number) "年" date_month_day; + +date = date_year_month_day | date_month_day | date_day; + +export DATE = CDRewrite[date, "", "", b.kBytes*]; diff --git a/third_party/chinese_text_normalization/thrax/src/cn/hotfix.grm b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.grm new file mode 100644 index 000000000..f1a43cdf2 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.grm @@ -0,0 +1,5 @@ +import 'byte.grm' as b; +hotfix = StringFile['hotfix.list']; + +export HOTFIX = CDRewrite[hotfix, "", "", b.kBytes*]; + diff --git a/third_party/chinese_text_normalization/thrax/src/cn/hotfix.list b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.list new file mode 100644 index 000000000..7234996e9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.list @@ -0,0 +1,18 @@ +0头 零头 +10字 十字 +东4环 东4环 -1.0 +东4 东四 -0.5 +4惠 四惠 +3元桥 三元桥 +4平市 四平市 +5台山 五台山 +西2旗 西二旗 +西3旗 西三旗 +4道口 四道口 -1.0 +5道口 五道口 -1.0 +6道口 六道口 -1.0 +6里桥 六里桥 +7里庄 七里庄 +8宝山 八宝山 +9颗松 九棵松 +10里堡 十里堡 diff --git a/third_party/chinese_text_normalization/thrax/src/cn/itn.grm b/third_party/chinese_text_normalization/thrax/src/cn/itn.grm new file mode 100644 index 000000000..709ce6c66 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/itn.grm @@ -0,0 +1,9 @@ +import 'byte.grm' as b; +import 'number.grm' as number; +import 'hotfix.grm' as hotfix; +import 
'percentage.grm' as percentage; +import 'date.grm' as date; +import 'amount.grm' as amount; # seems not useful for now + +export ITN = Optimize[percentage.PERCENTAGE @ (date.DATE <-1>) @ number.NUMBER @ hotfix.HOTFIX]; + diff --git a/third_party/chinese_text_normalization/thrax/src/cn/number.grm b/third_party/chinese_text_normalization/thrax/src/cn/number.grm new file mode 100644 index 000000000..1e9a86545 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/number.grm @@ -0,0 +1,61 @@ +import 'byte.grm' as b; + +number_1_to_9 = ( + ("一":"1") | ("幺":"1") | + ("二":"2") | ("两":"2") | + ("三":"3") | + ("四":"4") | + ("五":"5") | + ("六":"6") | + ("七":"7") | + ("八":"8") | + ("九":"9") +); + +export number_0_to_9 = (("零":"0") | number_1_to_9); + +number_10_to_19 = ( + ("十":"10") | + ("十一":"11") | + ("十二":"12") | + ("十三":"13") | + ("十四":"14") | + ("十五":"15") | + ("十六":"16") | + ("十七":"17") | + ("十八":"18") | + ("十九":"19") +); + +number_10s = (number_1_to_9 ("十":"")); +number_100s = (number_1_to_9 ("百":"")); +number_1000s = (number_1_to_9 ("千":"")); +number_10000s = (number_1_to_9 ("万":"")); + +number_10_to_99 = ( + ((number_10s number_1_to_9)<-0.3>) | + ((number_10s ("":"0"))<-0.2>) | + (number_10_to_19 <-0.1>) +); + +export number_1_to_99 = (number_1_to_9 | number_10_to_99); + +number_100_to_999 = ( + ((number_100s ("零":"0") number_1_to_9)<0.0>)| + ((number_100s number_10_to_99)<0.0>) | + ((number_100s number_1_to_9 ("":"0"))<0.0>) | + ((number_100s ("":"00"))<0.1>) +); + +number_1000_to_9999 = ( + ((number_1000s number_100_to_999)<0.0>) | + ((number_1000s ("零":"0") number_10_to_99)<0.0>)| + ((number_1000s ("零":"00") number_1_to_9)<0.0>)| + ((number_1000s ("":"000"))<1>) | + ((number_1000s number_1_to_9 ("":"00"))<0.0>) +); + +export number = number_1_to_99 | (number_100_to_999 <-1>) | (number_1000_to_9999 <-2>); + +export NUMBER = CDRewrite[number, "", "", b.kBytes*]; + diff --git a/third_party/chinese_text_normalization/thrax/src/cn/percentage.grm 
b/third_party/chinese_text_normalization/thrax/src/cn/percentage.grm new file mode 100644 index 000000000..d9f92a36e --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/cn/percentage.grm @@ -0,0 +1,8 @@ +import 'byte.grm' as b; +import 'number.grm' as n; + +percentage = ( + ("百分之":"") n.number_1_to_99 ("":"%") +); + +export PERCENTAGE = CDRewrite[percentage, "", "", b.kBytes*]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/README.md b/third_party/chinese_text_normalization/thrax/src/en/README.md new file mode 100644 index 000000000..8157e807c --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/README.md @@ -0,0 +1,6 @@ +# English covering grammar definitions + +This directory defines an English text normalization covering grammar. The +primary entry-point is the FST `VERBALIZER`, defined in +`verbalizer/verbalizer.grm` and compiled in the FST archive +`verbalizer/verbalizer.far`. diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/Makefile b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/Makefile new file mode 100644 index 000000000..6318dc546 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/Makefile @@ -0,0 +1,3 @@ +verbalizer.far: verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far + thraxcompiler --input_grammar=$< --output_far=$@ + diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/cardinals.tsv b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/cardinals.tsv new file mode 100644 index 000000000..b4704ff3e --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/cardinals.tsv @@ -0,0 +1,32 @@ +0 zero +1 
one +2 two +3 three +4 four +5 five +6 six +7 seven +8 eight +9 nine +10 ten +11 eleven +12 twelve +13 thirteen +14 fourteen +15 fifteen +16 sixteen +17 seventeen +18 eighteen +19 nineteen +20 twenty +30 thirty +40 forty +50 fifty +60 sixty +70 seventy +80 eighty +90 ninety +100 hundred +1000 thousand +1000000 million +1000000000 billion diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/extra_numbers.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/extra_numbers.grm new file mode 100644 index 000000000..a1fb370c4 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/extra_numbers.grm @@ -0,0 +1,35 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; +import 'en/verbalizer/numbers.grm' as n; + +digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@"); + +export DIGITS = digit (n.I[" "] digit)*; + +# Various common factorizations + +two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS; + +three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS; + +mixed = + (digit n.I[" "] two_digits) + | (two_digits n.I[" "] two_digits) + | (two_digits n.I[" "] three_digits) + | (two_digits n.I[" "] two_digits n.I[" "] two_digits) +; + +export MIXED_NUMBERS = Optimize[mixed]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/factorization.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/factorization.grm new file mode 100644 index 000000000..22ecfa9f4 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/factorization.grm @@ -0,0 +1,40 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; +import 'util/util.grm' as u; +import 'en/verbalizer/numbers.grm' as n; + +func ToNumberName[expr] { + number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*; + return Optimize[expr @ number_name_seq]; +} + +d = b.kDigit; + +leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*]; + +by_ones = d n.I[" "]; +by_twos = (d{2} @ leading_zero) n.I[" "]; +by_threes = (d{3} @ leading_zero) n.I[" "]; + +groupings = by_twos* (by_threes | by_twos | by_ones); + +export FRACTIONAL_PART_UNGROUPED = + Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]] +; +export FRACTIONAL_PART_GROUPED = + Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]] +; +export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/float.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/float.grm new file mode 100644 index 000000000..00b7ea376 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/float.grm @@ -0,0 +1,30 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'en/verbalizer/factorization.grm' as f; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED; +fractional_part_grouped = f.FRACTIONAL_PART_GROUPED; +fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED; + +__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed; +__decimal_marker__ = "."; + +export FLOAT = Optimize[ + (n.CARDINAL_NUMBERS + (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ") + __fractional_part__) @ l.LEXICAL_MAP] +; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/g.fst b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/g.fst new file mode 100644 index 000000000..135da015c Binary files /dev/null and b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/g.fst differ diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.grm new file mode 100644 index 000000000..a9b4ea490 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.grm @@ -0,0 +1,25 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; + +lexical_map = StringFile['en/verbalizer/lexical_map.tsv']; + +sigma_star = b.kBytes*; + +del_null = CDRewrite["__NULL__" : "", "", "", sigma_star]; + +export LEXICAL_MAP = Optimize[ + CDRewrite[lexical_map, "", "", sigma_star] @ del_null] +; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.tsv b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.tsv new file mode 100644 index 000000000..1e17034d8 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.tsv @@ -0,0 +1,74 @@ +@@CONNECTOR_RANGE@@ to +@@CONNECTOR_RATIO@@ to +@@CONNECTOR_BY@@ by +@@CONNECTOR_CONSECUTIVE_YEAR@@ to +@@JANUARY@@ january +@@FEBRUARY@@ february +@@MARCH@@ march +@@APRIL@@ april +@@MAY@@ may +@@JUNE@@ june +@@JULY@@ july +@@AUGUST@@ august +@@SEPTEMBER@@ september +@@OCTOBER@@ october +@@NOVEMBER@@ november +@@DECEMBER@@ december +@@MINUS@@ minus +@@DECIMAL_DOT_EXPRESSION@@ point +@@URL_DOT_EXPRESSION@@ dot +@@DECIMAL_EXPONENT@@ to the +@@DECIMAL_EXPONENT@@ to the power of +@@COLON@@ colon +@@SLASH@@ slash +@@SLASH@@ forward slash +@@DASH@@ dash +@@PASSWORD@@ password +@@AT@@ at +@@PORT@@ port +@@QUESTION_MARK@@ question mark +@@HASH@@ hash +@@HASH@@ hash tag +@@FRACTION_OVER@@ over +@@MONEY_AND@@ and +@@AND@@ and +@@PHONE_PLUS@@ plus +@@PHONE_EXTENSION@@ extension +@@TIME_AM@@ a m +@@TIME_PM@@ p m +@@HOUR@@ o'clock +@@MINUTE@@ minute +@@MINUTE@@ minutes +@@TIME_AFTER@@ after +@@TIME_AFTER@@ past +@@TIME_BEFORE@@ to +@@TIME_BEFORE@@ till +@@TIME_QUARTER@@ quarter +@@TIME_HALF@@ half +@@TIME_ZERO@@ oh +@@TIME_THREE_QUARTER@@ three quarters +@@ARITHMETIC_PLUS@@ plus +@@ARITHMETIC_TIMES@@ times +@@ARITHMETIC_TIMES@@ multiplied by +@@ARITHMETIC_MINUS@@ minus +@@ARITHMETIC_DIVISION@@ divided by +@@ARITHMETIC_DIVISION@@ over +@@ARITHMETIC_EQUALS@@ equals +@@PERCENT@@ percent +@@DEGREE@@ degree +@@DEGREE@@ degrees +@@SQUARE_ROOT@@ square root of +@@SQUARE_ROOT@@ the 
square root of +@@STAR@@ star +@@HYPHEN@@ hyphen +@@AT@@ at +@@PER@@ per +@@PERIOD@@ period +@@PERIOD@@ full stop +@@PERIOD@@ dot +@@EXCLAMATION_MARK@@ exclamation mark +@@EXCLAMATION_MARK@@ exclamation point +@@COMMA@@ comma +@@POSITIVE@@ positive +@@NEGATIVE@@ negative +@@OTHER_ZERO_VERBALIZATIONS@@ oh diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/math.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/math.grm new file mode 100644 index 000000000..764e6e02e --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/math.grm @@ -0,0 +1,34 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'en/verbalizer/float.grm' as f; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +float = f.FLOAT; +card = n.CARDINAL_NUMBERS; +number = card | float; + +plus = "+" : " @@ARITHMETIC_PLUS@@ "; +times = "*" : " @@ARITHMETIC_TIMES@@ "; +minus = "-" : " @@ARITHMETIC_MINUS@@ "; +division = "/" : " @@ARITHMETIC_DIVISION@@ "; + +operator = plus | times | minus | division; + +percent = "%" : " @@PERCENT@@"; + +export ARITHMETIC = + Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP] +; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/miscellaneous.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/miscellaneous.grm new file mode 100644 index 000000000..3a087d95c --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/miscellaneous.grm @@ -0,0 +1,78 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; +import 'ru/classifier/cyrillic.grm' as c; +import 'en/verbalizer/extra_numbers.grm' as e; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; +import 'en/verbalizer/spelled.grm' as s; + +letter = b.kAlpha | c.kCyrillicAlpha; +dash = "-"; +word = letter+; +possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?; + +post_word_symbol = + ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) | + ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) | + ("*" : "@@STAR@@") +; + +pre_word_symbol = + ("@" : "@@AT@@") | + ("/" : "@@SLASH@@") | + ("#" : "@@HASH@@") +; + +post_word = possibly_split_word n.I[" "] post_word_symbol; + +pre_word = pre_word_symbol n.I[" "] possibly_split_word; + +## Number/digit sequence combos, maybe with a dash + +spelled_word = word @ s.SPELLED_NO_LETTER; + +word_number = + (word | spelled_word) + (n.I[" "] | (dash : " ")) + (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) +; + +number_word = + (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) + (n.I[" "] | (dash : " ")) + (word | spelled_word) +; + +## Two-digit year. + +# Note that in this case to be fair we really have to allow ordinals too since +# in some languages that's what you would have. + +two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS)); + +dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com"; + +miscellaneous = Optimize[ + possibly_split_word + | post_word + | pre_word + | word_number + | number_word + | two_digit_year + | dot_com +]; + +export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.grm new file mode 100644 index 000000000..e37a7f7b3 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.grm @@ -0,0 +1,44 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/byte.grm' as b; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +card = n.CARDINAL_NUMBERS; + +__currency__ = StringFile['en/verbalizer/money.tsv']; + +d = b.kDigit; +D = d - "0"; + +cents = ((n.D["0"] | D) d) @ card; + +# Only dollar for the verbalizer tests for English. Will need to add other +# currencies. +usd_maj = Project["usd_maj" @ __currency__, 'output']; +usd_min = Project["usd_min" @ __currency__, 'output']; +and = " @@MONEY_AND@@ " | " "; + +dollar1 = + n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min] +; + +dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"]; + +dollar3 = n.D["$"] card n.I[" " usd_maj]; + +dollar = Optimize[dollar1 | dollar2 | dollar3]; + +export MONEY = Optimize[dollar @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.tsv b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.tsv new file mode 100644 index 000000000..f3965cf41 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.tsv @@ -0,0 +1,4 @@ +usd_maj dollar +usd_maj dollars +usd_min cent +usd_min cents diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/number_names.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/number_names.grm new file mode 100644 index 000000000..3e07532fe --- /dev/null +++ 
b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/number_names.grm @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# English minimally supervised number grammar. +# +# Supports both cardinals and ordinals without overt marking. +# +# The language-specific acceptor G was compiled with digit, teen, and decade +# preterminals. The lexicon transducer L is unambiguous so no LM is used. + +import 'util/arithmetic.grm' as a; + +# Intersects the universal factorization transducer (F) with the +# language-specific acceptor (G). + +d = a.DELTA_STAR; +f = a.IARITHMETIC_RESTRICTED; +g = LoadFst['en/verbalizer/g.fst']; +fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]]; +test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"]; + +# Compiles lexicon transducer (L). + +cardinal_name = StringFile['en/verbalizer/cardinals.tsv']; +cardinal_l = Optimize[(cardinal_name " ")* cardinal_name]; +test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"]; + +ordinal_name = StringFile['en/verbalizer/ordinals.tsv']; +# In English, ordinals have the same syntax as cardinals and all but the final +# element is verbalized using a cardinal number word; e.g., "two hundred +# thirtieth". +ordinal_l = Optimize[(cardinal_name " ")* ordinal_name]; +test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"]; + +# Composes L with the leaf transducer (P), then composes that with FG. 
+ +p = a.LEAVES; + +export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)]; +test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"]; + +export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)]; +test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers.grm new file mode 100644 index 000000000..e158b7a02 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers.grm @@ -0,0 +1,57 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'en/verbalizer/number_names.grm' as n; +import 'util/byte.grm' as bytelib; +import 'universal/thousands_punct.grm' as t; + +cardinal = n.CARDINAL_NUMBER_NAME; +ordinal = n.ORDINAL_NUMBER_NAME; + +# Putting these here since this grammar gets incorporated by all the others. + +func I[expr] { + return "" : expr; +} + +func D[expr] { + return expr : ""; +} + +separators = t.comma_thousands | t.no_delimiter; + +# Language specific endings for ordinals. 
+d = bytelib.kDigit; +endings = "st" | "nd" | "rd" | "th"; + +st = (d* "1") - (d* "11"); +nd = (d* "2") - (d* "12"); +rd = (d* "3") - (d* "13"); +th = Optimize[d* - st - nd - rd]; +first = st ("st" : ""); +second = nd ("nd" : ""); +third = rd ("rd" : ""); +other = th ("th" : ""); +marked_ordinal = Optimize[first | second | third | other]; + +# The separator is a no-op here but will be needed once we replace +# the above targets. + +export CARDINAL_NUMBERS = Optimize[separators @ cardinal]; + +export ORDINAL_NUMBERS = + Optimize[(separators endings) @ marked_ordinal @ ordinal] +; + +export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers_plus.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers_plus.grm new file mode 100644 index 000000000..a152e8133 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers_plus.grm @@ -0,0 +1,133 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Grammar for things built mostly on numbers. + +import 'en/verbalizer/factorization.grm' as f; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +num = n.CARDINAL_NUMBERS; +ord = n.ORDINAL_NUMBERS_UNMARKED; +digits = f.FRACTIONAL_PART_UNGROUPED; + +# Various symbols. 
+ +plus = "+" : "@@ARITHMETIC_PLUS@@"; +minus = "-" : "@@ARITHMETIC_MINUS@@"; +slash = "/" : "@@SLASH@@"; +dot = "." : "@@URL_DOT_EXPRESSION@@"; +dash = "-" : "@@DASH@@"; +equals = "=" : "@@ARITHMETIC_EQUALS@@"; + +degree = "°" : "@@DEGREE@@"; + +division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@"; + +times = ("x" | "*") : "@@ARITHMETIC_TIMES@@"; + +power = "^" : "@@DECIMAL_EXPONENT@@"; + +square_root = "√" : "@@SQUARE_ROOT@@"; + +percent = "%" : "@@PERCENT@@"; + +# Safe roman numbers. + +# NB: Do not change the formatting here. NO_EDIT must be on the same +# line as the path. +rfile = + 'universal/roman_numerals.tsv' # NO_EDIT +; + +roman = StringFile[rfile]; + +## Main categories. + +cat_dot_number = + num + n.I[" "] dot n.I[" "] num + (n.I[" "] dot n.I[" "] num)+ +; + +cat_slash_number = + num + n.I[" "] slash n.I[" "] num + (n.I[" "] slash n.I[" "] num)* +; + +cat_dash_number = + num + n.I[" "] dash n.I[" "] num + (n.I[" "] dash n.I[" "] num)* +; + +cat_signed_number = ((plus | minus) n.I[" "])? num; + +cat_degree = cat_signed_number n.I[" "] degree; + +cat_country_code = plus n.I[" "] (num | digits); + +cat_math_operations = + plus + | minus + | division + | times + | equals + | percent + | power + | square_root +; + +# Roman numbers are often either cardinals or ordinals in various languages. +cat_roman = roman @ (num | ord); + +# Allow +# +# number:number +# number-number +# +# to just be +# +# number number. + +cat_number_number = + num ((":" | "-") : " ") num +; + +# Some additional readings for these symbols. 
+ +cat_additional_readings = + ("/" : "@@PER@@") | + ("+" : "@@AND@@") | + ("-" : ("@@HYPHEN@@" | "@@CONNECTOR_RANGE@@")) | + ("*" : "@@STAR@@") | + ("x" : ("x" | "@@CONNECTOR_BY@@")) | + ("@" : "@@AT@@") +; + +numbers_plus = Optimize[ + cat_dot_number + | cat_slash_number + | cat_dash_number + | cat_signed_number + | cat_degree + | cat_country_code + | cat_math_operations + | cat_roman + | cat_number_number + | cat_additional_readings +]; + +export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/ordinals.tsv b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/ordinals.tsv new file mode 100644 index 000000000..f4d3d37e0 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/ordinals.tsv @@ -0,0 +1,32 @@ +0 zeroth +1 first +2 second +3 third +4 fourth +5 fifth +6 sixth +7 seventh +8 eighth +9 ninth +10 tenth +11 eleventh +12 twelfth +13 thirteenth +14 fourteenth +15 fifteenth +16 sixteenth +17 seventeenth +18 eighteenth +19 nineteenth +20 twentieth +30 thirtieth +40 fortieth +50 fiftieth +60 sixtieth +70 seventieth +80 eightieth +90 ninetieth +100 hundredth +1000 thousandth +1000000 millionth +1000000000 billionth diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/params.tsv b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/params.tsv new file mode 100644 index 000000000..d31a8a4ae --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/params.tsv @@ -0,0 +1,7 @@ +float.grm __fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed; +telephone.grm __grouping__ = f.UNGROUPED; +measure.grm __measure__ = StringFile['en/verbalizer/measures.tsv']; +money.grm __currency__ = StringFile['en/verbalizer/money.tsv']; +time.grm __sep__ = ":"; +time.grm __am__ = "a.m." | "am" | "AM"; +time.grm __pm__ = "p.m."
| "pm" | "PM"; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/podspeech.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/podspeech.grm new file mode 100644 index 000000000..1c67c2e3f --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/podspeech.grm @@ -0,0 +1,46 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/util.grm' as util; +import 'util/case.grm' as case; +import 'en/verbalizer/extra_numbers.grm' as e; +import 'en/verbalizer/float.grm' as f; +import 'en/verbalizer/math.grm' as ma; +import 'en/verbalizer/miscellaneous.grm' as mi; +import 'en/verbalizer/money.grm' as mo; +import 'en/verbalizer/numbers.grm' as n; +import 'en/verbalizer/numbers_plus.grm' as np; +import 'en/verbalizer/spelled.grm' as s; +import 'en/verbalizer/spoken_punct.grm' as sp; +import 'en/verbalizer/time.grm' as t; +import 'en/verbalizer/urls.grm' as u; + +export POD_SPEECH_TN = Optimize[RmWeight[ + (u.URL + | e.MIXED_NUMBERS + | e.DIGITS + | f.FLOAT + | ma.ARITHMETIC + | mo.MONEY + | n.CARDINAL_NUMBERS + | n.ORDINAL_NUMBERS + | np.NUMBERS_PLUS + | s.SPELLED + | sp.SPOKEN_PUNCT + | t.TIME + | u.URL + | u.EMAILS) @ util.CLEAN_SPACES @ case.TOUPPER +]]; + +#export POD_SPEECH_TN = Optimize[RmWeight[(mi.MISCELLANEOUS) @ util.CLEAN_SPACES @ case.TOUPPER]]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spelled.grm 
b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spelled.grm new file mode 100644 index 000000000..b04974d2a --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spelled.grm @@ -0,0 +1,77 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This verbalizer is used whenever there is an LM symbol that consists of +# letters immediately followed by "{spelled}". This strips the "{spelled}" +# suffix. + +import 'util/byte.grm' as b; +import 'ru/classifier/cyrillic.grm' as c; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +digit = b.kDigit @ n.CARDINAL_NUMBERS; + +char_set = (("a" | "A") : "letter-a") + | (("b" | "B") : "letter-b") + | (("c" | "C") : "letter-c") + | (("d" | "D") : "letter-d") + | (("e" | "E") : "letter-e") + | (("f" | "F") : "letter-f") + | (("g" | "G") : "letter-g") + | (("h" | "H") : "letter-h") + | (("i" | "I") : "letter-i") + | (("j" | "J") : "letter-j") + | (("k" | "K") : "letter-k") + | (("l" | "L") : "letter-l") + | (("m" | "M") : "letter-m") + | (("n" | "N") : "letter-n") + | (("o" | "O") : "letter-o") + | (("p" | "P") : "letter-p") + | (("q" | "Q") : "letter-q") + | (("r" | "R") : "letter-r") + | (("s" | "S") : "letter-s") + | (("t" | "T") : "letter-t") + | (("u" | "U") : "letter-u") + | (("v" | "V") : "letter-v") + | (("w" | "W") : "letter-w") + | (("x" | "X") : "letter-x") + | (("y" | "Y") : "letter-y") + | (("z" | "Z") :
"letter-z") + | (digit) + | ("&" : "@@AND@@") + | ("." : "") + | ("-" : "") + | ("_" : "") + | ("/" : "") + | (n.I["letter-"] c.kCyrillicAlpha) + ; + +ins_space = "" : " "; + +suffix = "{spelled}" : ""; + +spelled = Optimize[char_set (ins_space char_set)* suffix]; + +export SPELLED = Optimize[spelled @ l.LEXICAL_MAP]; + +sigma_star = b.kBytes*; + +# Gets rid of the letter- prefix since in some cases we don't want it. + +del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star]; + +spelled_no_tag = Optimize[char_set (ins_space char_set)*]; + +export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spoken_punct.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spoken_punct.grm new file mode 100644 index 000000000..b0db6535b --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spoken_punct.grm @@ -0,0 +1,24 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'en/verbalizer/lexical_map.grm' as l; + +punct = + ("." : "@@PERIOD@@") + | ("," : "@@COMMA@@") + | ("!" : "@@EXCLAMATION_MARK@@") + | ("?" 
: "@@QUESTION_MARK@@") +; + +export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/time.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/time.grm new file mode 100644 index 000000000..0bf92d0ab --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/time.grm @@ -0,0 +1,108 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/byte.grm' as b; +import 'en/verbalizer/lexical_map.grm' as l; +import 'en/verbalizer/numbers.grm' as n; + +# Only handles 24-hour time with quarter-to, half-past and quarter-past. + +increment_hour = + ("0" : "1") + | ("1" : "2") + | ("2" : "3") + | ("3" : "4") + | ("4" : "5") + | ("5" : "6") + | ("6" : "7") + | ("7" : "8") + | ("8" : "9") + | ("9" : "10") + | ("10" : "11") + | ("11" : "12") + | ("12" : "1") # If someone uses 12, we assume 12-hour by default. 
+ | ("13" : "14") + | ("14" : "15") + | ("15" : "16") + | ("16" : "17") + | ("17" : "18") + | ("18" : "19") + | ("19" : "20") + | ("20" : "21") + | ("21" : "22") + | ("22" : "23") + | ("23" : "12") +; + +hours = Project[increment_hour, 'input']; + +d = b.kDigit; +D = d - "0"; + +minutes09 = "0" D; + +minutes = ("1" | "2" | "3" | "4" | "5") d; + +__sep__ = ":"; +sep_space = __sep__ : " "; + +verbalize_hours = hours @ n.CARDINAL_NUMBERS; + +verbalize_minutes = + ("00" : "@@HOUR@@") + | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS)) + | (minutes @ n.CARDINAL_NUMBERS) +; + +time_basic = Optimize[verbalize_hours sep_space verbalize_minutes]; + +# Special cases we handle right now. +# TODO: Need to allow for cases like +# +# half twelve (in the UK English sense) +# half twaalf (in the Dutch sense) + +time_quarter_past = + n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "] + verbalize_hours + n.D[__sep__ "15"]; + +time_half_past = + n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "] + verbalize_hours + n.D[__sep__ "30"]; + +time_quarter_to = + n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "] + (increment_hour @ verbalize_hours) + n.D[__sep__ "45"]; + +time_extra = Optimize[ + time_quarter_past | time_half_past | time_quarter_to] +; + +# Basic time periods which most languages can be expected to have. +__am__ = "a.m." | "am" | "AM"; +__pm__ = "p.m." | "pm" | "PM"; + +period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@"); + +time_variants = time_basic | time_extra; + +time = Optimize[ + (period (" " | n.I[" "]))? time_variants + | time_variants ((" " | n.I[" "]) period)?] +; + +export TIME = Optimize[time @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/urls.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/urls.grm new file mode 100644 index 000000000..a2232f9bc --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/urls.grm @@ -0,0 +1,68 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Rules for URLs and email addresses. + +import 'util/byte.grm' as bytelib; +import 'en/verbalizer/lexical_map.grm' as l; + +ins_space = "" : " "; +dot = "." : "@@URL_DOT_EXPRESSION@@"; +at = "@" : "@@AT@@"; + +url_suffix = + (".com" : dot ins_space "com") | + (".gov" : dot ins_space "gov") | + (".edu" : dot ins_space "e d u") | + (".org" : dot ins_space "org") | + (".net" : dot ins_space "net") +; + +letter_string = (bytelib.kAlnum)* bytelib.kAlnum; + +letter_string_dot = + ((letter_string ins_space dot ins_space)* letter_string) +; + +# Rules for URLs. +export URL = Optimize[ + ((letter_string_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +# Rules for email addresses. 
+letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum); + +letter_by_letter_dot = + ((letter_by_letter ins_space dot ins_space)* + letter_by_letter) +; + +export EMAIL1 = Optimize[ + ((letter_by_letter) (ins_space) + (at) (ins_space) + (letter_by_letter_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +export EMAIL2 = Optimize[ + ((letter_by_letter) (ins_space) + (at) (ins_space) + (letter_string_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +export EMAILS = Optimize[ + EMAIL1 | EMAIL2 +]; diff --git a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/verbalizer.grm b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/verbalizer.grm new file mode 100644 index 000000000..fe6f4e42c --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/verbalizer.grm @@ -0,0 +1,42 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/util.grm' as util; +import 'en/verbalizer/extra_numbers.grm' as e; +import 'en/verbalizer/float.grm' as f; +import 'en/verbalizer/math.grm' as ma; +import 'en/verbalizer/miscellaneous.grm' as mi; +import 'en/verbalizer/money.grm' as mo; +import 'en/verbalizer/numbers.grm' as n; +import 'en/verbalizer/numbers_plus.grm' as np; +import 'en/verbalizer/spelled.grm' as s; +import 'en/verbalizer/spoken_punct.grm' as sp; +import 'en/verbalizer/time.grm' as t; +import 'en/verbalizer/urls.grm' as u; + +export VERBALIZER = Optimize[RmWeight[ + ( e.MIXED_NUMBERS + | e.DIGITS + | f.FLOAT + | ma.ARITHMETIC + | mi.MISCELLANEOUS + | mo.MONEY + | n.CARDINAL_NUMBERS + | n.ORDINAL_NUMBERS + | np.NUMBERS_PLUS + | s.SPELLED + | sp.SPOKEN_PUNCT + | t.TIME + | u.URL) @ util.CLEAN_SPACES +]]; diff --git a/third_party/chinese_text_normalization/thrax/src/number_data/README.md b/third_party/chinese_text_normalization/thrax/src/number_data/README.md new file mode 100644 index 000000000..dd76ad16c --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/number_data/README.md @@ -0,0 +1,17 @@ +This directory contains data used in: + + Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization. + Transactions of the Association for Computational Linguistics 4: 507-519. + +* `minimal.txt`: A list of 300 curated numbers used as the "minimal" training + set. +* `random-trn.txt`: A list of 9000 randomly-generated numbers used as the + "medium" training set. +* `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test + set. + +Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that +a small number of examples occur both in `minimal.txt` and `random-tst.txt`. + +For information about the sampling procedure used to generate the random data +sets, see appendix A of the aforementioned paper.
diff --git a/third_party/chinese_text_normalization/thrax/src/number_data/minimal.txt b/third_party/chinese_text_normalization/thrax/src/number_data/minimal.txt new file mode 100644 index 000000000..dd0704fd9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/number_data/minimal.txt @@ -0,0 +1,300 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +220 +221 +230 +300 +400 +500 +600 +700 +800 +900 +1000 +1001 +1002 +1003 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1020 +1021 +1030 +1200 +2000 +2001 +2002 +2003 +2004 +2005 +2006 +2007 +2008 +2009 +2010 +2011 +2012 +2020 +2021 +2030 +2100 +2200 +5001 +10000 +12000 +20000 +21000 +50001 +100000 +120000 +200000 +210000 +500001 +1000000 +1001000 +1200000 +2000000 +2100000 +5000001 +10000000 +10001000 +12000000 +20000000 +50000001 +100000000 +100001000 +120000000 +200000000 +500000001 +1000000000 +1000001000 +1200000000 +2000000000 +5000000001 +10000000000 +10000001000 +12000000000 +20000000000 +50000000001 +100000000000 +100000001000 +120000000000 +200000000000 +500000000001 diff --git 
a/third_party/chinese_text_normalization/thrax/src/number_data/random-trn.txt b/third_party/chinese_text_normalization/thrax/src/number_data/random-trn.txt new file mode 100644 index 000000000..103a7063d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/number_data/random-trn.txt @@ -0,0 +1,9000 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196 +197 +198 +199 +200 +201 +202 +203 +206 +208 +210 +212 +213 +214 +215 +217 +218 +219 +221 +223 +224 +226 +228 +229 +231 +232 +234 +235 +238 +240 +242 +247 +249 +251 +253 +256 +259 +260 +261 +267 +268 +272 +278 +279 +280 +283 +284 +286 +287 +288 +289 +292 +296 +297 +299 +300 +305 +306 +307 +309 +311 +312 +314 +315 +316 +318 +319 +320 +321 +325 +326 +327 +328 +329 +330 +331 +333 +334 +335 +336 +339 +340 +341 +343 +345 +350 +351 +352 +353 +354 +355 +356 +360 +361 +362 +369 +372 +373 +374 +375 +378 +380 +382 +383 +384 +385 +387 +388 +389 +391 +393 +394 +400 +403 +404 +406 +407 +409 +411 +412 +413 +415 +416 +419 +420 +421 +423 +426 +427 +430 +432 +433 +435 +437 +438 +439 +440 +441 +442 +444 +445 +447 +448 +449 +453 +454 +456 +457 +460 +461 +463 +464 +465 +467 +468 +470 +474 +475 +476 +477 +478 +479 
+480 +481 +483 +485 +486 +487 +488 +490 +491 +492 +493 +496 +497 +498 +499 +500 +501 +502 +504 +506 +508 +509 +511 +512 +514 +515 +517 +518 +521 +522 +529 +532 +533 +534 +536 +539 +542 +543 +544 +545 +546 +549 +550 +551 +552 +553 +557 +559 +560 +561 +562 +564 +565 +567 +568 +570 +571 +572 +575 +576 +577 +579 +581 +582 +583 +584 +586 +588 +590 +591 +592 +597 +598 +600 +602 +603 +606 +608 +609 +610 +611 +613 +616 +617 +618 +623 +626 +628 +630 +631 +632 +634 +635 +637 +638 +639 +641 +643 +644 +645 +647 +649 +651 +659 +660 +661 +663 +669 +670 +672 +673 +674 +676 +683 +684 +686 +687 +690 +691 +695 +696 +697 +698 +699 +700 +702 +703 +705 +706 +707 +709 +712 +714 +717 +718 +719 +720 +721 +722 +724 +727 +728 +731 +733 +734 +735 +736 +739 +743 +744 +745 +746 +748 +749 +752 +753 +754 +755 +756 +758 +759 +762 +764 +765 +767 +769 +771 +772 +773 +774 +777 +778 +779 +780 +782 +783 +784 +785 +786 +787 +789 +791 +794 +795 +796 +798 +799 +800 +801 +802 +805 +806 +807 +808 +809 +811 +812 +813 +816 +817 +819 +820 +822 +823 +824 +825 +826 +827 +828 +830 +832 +833 +834 +835 +841 +842 +844 +846 +848 +850 +852 +853 +855 +856 +857 +858 +859 +861 +862 +863 +864 +866 +867 +869 +870 +871 +874 +876 +877 +880 +882 +885 +886 +887 +888 +891 +894 +895 +896 +897 +901 +905 +906 +907 +908 +913 +914 +915 +916 +919 +920 +923 +925 +926 +929 +931 +933 +935 +936 +937 +938 +939 +941 +942 +946 +951 +952 +953 +954 +955 +957 +958 +961 +962 +967 +971 +976 +977 +978 +979 +980 +981 +982 +983 +988 +990 +991 +992 +993 +996 +997 +998 +1001 +1004 +1005 +1006 +1007 +1008 +1009 +1010 +1011 +1012 +1015 +1019 +1020 +1021 +1024 +1026 +1028 +1030 +1031 +1033 +1034 +1036 +1037 +1039 +1040 +1041 +1042 +1043 +1045 +1046 +1048 +1049 +1051 +1053 +1054 +1056 +1058 +1060 +1061 +1064 +1065 +1067 +1069 +1072 +1073 +1074 +1075 +1078 +1080 +1082 +1084 +1086 +1088 +1089 +1091 +1092 +1093 +1095 +1096 +1099 +1100 +1102 +1103 +1105 +1106 +1109 +1110 +1111 +1112 +1114 +1116 +1117 +1118 +1119 +1120 +1122 +1125 +1126 +1127 +1128 +1130 
+1135 +1136 +1138 +1142 +1143 +1145 +1146 +1147 +1148 +1149 +1150 +1153 +1154 +1157 +1158 +1162 +1163 +1164 +1165 +1166 +1169 +1170 +1171 +1174 +1175 +1178 +1180 +1181 +1183 +1185 +1187 +1188 +1190 +1191 +1193 +1195 +1198 +1200 +1201 +1204 +1206 +1207 +1208 +1209 +1212 +1215 +1216 +1217 +1219 +1220 +1221 +1223 +1224 +1225 +1227 +1228 +1230 +1232 +1233 +1236 +1238 +1239 +1243 +1244 +1245 +1246 +1249 +1250 +1251 +1254 +1256 +1261 +1263 +1264 +1268 +1269 +1270 +1271 +1272 +1274 +1276 +1277 +1283 +1284 +1287 +1288 +1289 +1291 +1292 +1294 +1296 +1298 +1300 +1303 +1304 +1305 +1306 +1309 +1313 +1316 +1317 +1319 +1320 +1321 +1322 +1323 +1324 +1328 +1330 +1331 +1333 +1335 +1336 +1337 +1338 +1341 +1346 +1348 +1349 +1350 +1351 +1353 +1354 +1355 +1356 +1357 +1359 +1360 +1361 +1362 +1363 +1365 +1366 +1367 +1370 +1371 +1373 +1374 +1376 +1377 +1380 +1381 +1382 +1384 +1386 +1387 +1388 +1390 +1392 +1394 +1395 +1396 +1400 +1403 +1405 +1406 +1407 +1408 +1409 +1410 +1412 +1413 +1414 +1415 +1416 +1419 +1421 +1422 +1423 +1424 +1425 +1427 +1429 +1433 +1434 +1435 +1436 +1437 +1438 +1440 +1443 +1445 +1446 +1448 +1454 +1457 +1460 +1461 +1465 +1468 +1469 +1470 +1474 +1475 +1477 +1483 +1485 +1487 +1488 +1489 +1490 +1491 +1493 +1494 +1496 +1497 +1498 +1501 +1502 +1503 +1505 +1506 +1507 +1508 +1510 +1511 +1512 +1513 +1514 +1515 +1518 +1519 +1520 +1522 +1523 +1525 +1526 +1529 +1530 +1531 +1532 +1534 +1536 +1537 +1539 +1540 +1541 +1542 +1543 +1544 +1545 +1546 +1554 +1555 +1556 +1560 +1561 +1562 +1563 +1564 +1565 +1567 +1568 +1569 +1570 +1572 +1573 +1575 +1576 +1577 +1578 +1579 +1580 +1584 +1586 +1588 +1590 +1591 +1595 +1596 +1598 +1600 +1601 +1602 +1603 +1609 +1610 +1611 +1614 +1616 +1617 +1620 +1621 +1622 +1625 +1627 +1631 +1632 +1633 +1636 +1637 +1641 +1644 +1645 +1650 +1652 +1658 +1659 +1663 +1664 +1665 +1666 +1667 +1669 +1670 +1671 +1673 +1677 +1678 +1679 +1680 +1681 +1682 +1683 +1691 +1694 +1697 +1699 +1700 +1701 +1702 +1703 +1704 +1706 +1708 +1712 +1713 +1715 +1718 +1721 +1723 +1724 +1725 
+1726 +1727 +1728 +1730 +1731 +1736 +1740 +1741 +1742 +1744 +1746 +1749 +1750 +1751 +1752 +1753 +1754 +1755 +1756 +1763 +1766 +1767 +1768 +1769 +1772 +1775 +1777 +1778 +1780 +1783 +1784 +1785 +1791 +1793 +1795 +1796 +1798 +1800 +1801 +1803 +1804 +1805 +1810 +1812 +1814 +1816 +1818 +1820 +1822 +1823 +1825 +1826 +1828 +1829 +1830 +1833 +1836 +1837 +1839 +1842 +1844 +1845 +1846 +1848 +1852 +1853 +1854 +1855 +1857 +1858 +1859 +1860 +1861 +1862 +1863 +1864 +1865 +1866 +1868 +1869 +1874 +1876 +1879 +1882 +1884 +1885 +1886 +1887 +1888 +1890 +1892 +1893 +1894 +1895 +1896 +1898 +1899 +1900 +1901 +1902 +1903 +1906 +1907 +1908 +1909 +1910 +1911 +1912 +1913 +1914 +1916 +1917 +1918 +1919 +1920 +1921 +1925 +1927 +1929 +1935 +1936 +1938 +1939 +1940 +1941 +1943 +1946 +1947 +1948 +1953 +1954 +1956 +1957 +1958 +1959 +1960 +1961 +1962 +1963 +1968 +1971 +1973 +1978 +1979 +1980 +1981 +1985 +1988 +1989 +1991 +1993 +1994 +1997 +1998 +1999 +2000 +2003 +2004 +2005 +2007 +2008 +2009 +2011 +2014 +2015 +2016 +2017 +2021 +2022 +2023 +2027 +2028 +2030 +2031 +2032 +2033 +2034 +2036 +2037 +2039 +2042 +2043 +2044 +2045 +2046 +2050 +2051 +2053 +2054 +2057 +2060 +2061 +2063 +2065 +2066 +2068 +2069 +2071 +2074 +2076 +2077 +2078 +2079 +2080 +2081 +2084 +2085 +2086 +2088 +2089 +2090 +2091 +2092 +2093 +2094 +2095 +2096 +2097 +2098 +2100 +2101 +2102 +2105 +2106 +2109 +2111 +2112 +2113 +2114 +2115 +2117 +2118 +2121 +2122 +2125 +2127 +2128 +2131 +2134 +2136 +2137 +2138 +2141 +2144 +2145 +2147 +2148 +2149 +2150 +2153 +2154 +2156 +2157 +2158 +2159 +2160 +2161 +2162 +2163 +2164 +2165 +2169 +2170 +2172 +2173 +2175 +2177 +2178 +2183 +2185 +2189 +2190 +2192 +2195 +2196 +2198 +2199 +2202 +2204 +2207 +2208 +2210 +2211 +2212 +2213 +2214 +2216 +2217 +2218 +2219 +2221 +2222 +2224 +2225 +2227 +2229 +2230 +2231 +2232 +2233 +2235 +2236 +2239 +2240 +2241 +2242 +2243 +2244 +2245 +2247 +2249 +2252 +2253 +2254 +2255 +2257 +2260 +2261 +2263 +2264 +2266 +2267 +2269 +2271 +2272 +2273 +2274 +2278 +2281 +2282 +2284 +2288 +2289 
+2291 +2293 +2296 +2297 +2298 +2299 +2301 +2302 +2304 +2305 +2308 +2310 +2312 +2314 +2316 +2317 +2319 +2320 +2322 +2324 +2325 +2328 +2330 +2331 +2332 +2333 +2335 +2336 +2337 +2338 +2339 +2340 +2343 +2344 +2345 +2346 +2348 +2350 +2351 +2352 +2353 +2354 +2356 +2358 +2359 +2360 +2365 +2366 +2368 +2369 +2370 +2371 +2373 +2374 +2375 +2376 +2377 +2381 +2382 +2383 +2385 +2386 +2387 +2388 +2393 +2394 +2395 +2397 +2398 +2401 +2403 +2404 +2406 +2407 +2408 +2409 +2411 +2412 +2415 +2421 +2422 +2423 +2424 +2425 +2431 +2432 +2434 +2440 +2442 +2443 +2446 +2447 +2453 +2454 +2455 +2457 +2458 +2459 +2460 +2461 +2462 +2465 +2469 +2470 +2471 +2473 +2477 +2478 +2479 +2480 +2481 +2485 +2486 +2487 +2489 +2495 +2501 +2503 +2504 +2506 +2509 +2510 +2512 +2515 +2516 +2517 +2518 +2522 +2525 +2526 +2527 +2528 +2529 +2530 +2533 +2534 +2536 +2538 +2539 +2540 +2543 +2546 +2548 +2551 +2552 +2553 +2555 +2556 +2559 +2561 +2564 +2565 +2566 +2567 +2570 +2572 +2573 +2574 +2575 +2577 +2580 +2581 +2583 +2584 +2585 +2589 +2591 +2592 +2594 +2595 +2596 +2597 +2600 +2603 +2604 +2607 +2608 +2610 +2613 +2614 +2615 +2618 +2621 +2622 +2624 +2625 +2626 +2627 +2630 +2631 +2632 +2634 +2638 +2640 +2641 +2642 +2643 +2644 +2645 +2646 +2648 +2649 +2650 +2653 +2655 +2656 +2657 +2658 +2661 +2662 +2665 +2666 +2667 +2668 +2671 +2672 +2674 +2675 +2676 +2677 +2679 +2681 +2683 +2684 +2686 +2688 +2691 +2694 +2695 +2696 +2701 +2703 +2704 +2705 +2706 +2707 +2708 +2709 +2710 +2711 +2712 +2713 +2714 +2719 +2720 +2721 +2725 +2726 +2728 +2731 +2732 +2735 +2736 +2737 +2738 +2740 +2741 +2743 +2744 +2745 +2748 +2749 +2750 +2751 +2754 +2756 +2758 +2759 +2764 +2767 +2769 +2770 +2771 +2772 +2773 +2775 +2776 +2780 +2781 +2783 +2784 +2786 +2787 +2788 +2789 +2790 +2794 +2798 +2799 +2800 +2802 +2803 +2804 +2806 +2808 +2810 +2811 +2813 +2814 +2815 +2820 +2822 +2824 +2826 +2827 +2829 +2830 +2831 +2832 +2833 +2834 +2835 +2837 +2840 +2841 +2845 +2846 +2847 +2848 +2851 +2852 +2854 +2855 +2856 +2858 +2860 +2861 +2862 +2863 +2867 +2868 +2869 +2870 
+2871 +2872 +2874 +2876 +2877 +2878 +2881 +2882 +2883 +2884 +2886 +2887 +2889 +2890 +2891 +2892 +2893 +2894 +2895 +2897 +2898 +2900 +2903 +2906 +2907 +2911 +2912 +2914 +2915 +2917 +2920 +2924 +2925 +2930 +2931 +2932 +2933 +2934 +2936 +2938 +2939 +2940 +2941 +2942 +2943 +2944 +2945 +2947 +2948 +2949 +2951 +2952 +2953 +2955 +2956 +2957 +2958 +2959 +2962 +2964 +2965 +2966 +2967 +2968 +2972 +2973 +2979 +2981 +2982 +2983 +2985 +2986 +2988 +2990 +2992 +2993 +2996 +2999 +3000 +3001 +3002 +3005 +3006 +3007 +3009 +3012 +3013 +3015 +3016 +3017 +3018 +3019 +3020 +3022 +3023 +3024 +3027 +3028 +3029 +3031 +3033 +3035 +3036 +3037 +3038 +3041 +3044 +3045 +3046 +3048 +3050 +3051 +3052 +3053 +3055 +3056 +3058 +3059 +3062 +3064 +3065 +3067 +3068 +3069 +3071 +3072 +3073 +3074 +3075 +3079 +3083 +3085 +3087 +3088 +3089 +3090 +3092 +3094 +3099 +3100 +3101 +3102 +3105 +3106 +3107 +3109 +3110 +3116 +3117 +3122 +3124 +3125 +3126 +3127 +3129 +3131 +3132 +3133 +3134 +3135 +3138 +3139 +3140 +3143 +3144 +3146 +3148 +3151 +3153 +3154 +3158 +3159 +3160 +3161 +3162 +3163 +3165 +3166 +3167 +3168 +3169 +3170 +3172 +3181 +3184 +3186 +3187 +3188 +3189 +3191 +3192 +3193 +3194 +3195 +3198 +3199 +3201 +3202 +3203 +3204 +3206 +3209 +3211 +3216 +3221 +3224 +3225 +3226 +3228 +3229 +3231 +3232 +3236 +3238 +3239 +3241 +3242 +3246 +3249 +3251 +3252 +3253 +3254 +3256 +3261 +3263 +3264 +3266 +3268 +3269 +3272 +3274 +3275 +3277 +3278 +3281 +3282 +3284 +3285 +3287 +3288 +3289 +3290 +3291 +3292 +3293 +3294 +3295 +3297 +3299 +3300 +3301 +3302 +3306 +3307 +3308 +3309 +3310 +3311 +3313 +3319 +3320 +3321 +3322 +3323 +3325 +3326 +3327 +3328 +3329 +3331 +3332 +3334 +3335 +3336 +3337 +3338 +3340 +3342 +3343 +3345 +3346 +3348 +3351 +3352 +3354 +3358 +3359 +3360 +3362 +3363 +3366 +3369 +3371 +3374 +3375 +3376 +3380 +3381 +3382 +3386 +3389 +3390 +3391 +3395 +3397 +3399 +3402 +3404 +3406 +3407 +3411 +3413 +3414 +3415 +3417 +3418 +3419 +3420 +3421 +3422 +3426 +3428 +3429 +3430 +3431 +3434 +3443 +3444 +3445 +3446 +3447 +3448 
+3450 +3451 +3452 +3455 +3456 +3457 +3458 +3460 +3461 +3462 +3464 +3465 +3466 +3467 +3468 +3469 +3470 +3471 +3472 +3473 +3476 +3478 +3480 +3481 +3483 +3484 +3485 +3487 +3488 +3489 +3491 +3492 +3494 +3496 +3498 +3499 +3503 +3504 +3506 +3507 +3509 +3511 +3512 +3515 +3520 +3521 +3523 +3524 +3527 +3529 +3530 +3532 +3533 +3534 +3536 +3538 +3539 +3540 +3541 +3542 +3544 +3546 +3547 +3554 +3555 +3556 +3560 +3561 +3562 +3563 +3565 +3566 +3567 +3571 +3574 +3575 +3578 +3579 +3580 +3581 +3582 +3583 +3584 +3585 +3587 +3589 +3590 +3591 +3592 +3593 +3594 +3595 +3596 +3599 +3600 +3601 +3604 +3605 +3609 +3610 +3611 +3613 +3614 +3615 +3616 +3619 +3621 +3622 +3623 +3624 +3627 +3628 +3631 +3632 +3633 +3635 +3637 +3640 +3641 +3643 +3644 +3646 +3648 +3650 +3654 +3655 +3656 +3657 +3658 +3660 +3661 +3662 +3663 +3669 +3670 +3671 +3672 +3673 +3677 +3678 +3679 +3682 +3686 +3687 +3688 +3689 +3690 +3693 +3694 +3698 +3699 +3700 +3703 +3704 +3705 +3707 +3709 +3710 +3711 +3714 +3715 +3718 +3723 +3724 +3725 +3726 +3727 +3729 +3730 +3731 +3733 +3734 +3735 +3736 +3737 +3738 +3743 +3744 +3745 +3748 +3749 +3751 +3752 +3753 +3757 +3760 +3762 +3764 +3766 +3767 +3768 +3769 +3770 +3772 +3774 +3775 +3778 +3779 +3780 +3782 +3784 +3786 +3789 +3791 +3792 +3793 +3794 +3795 +3796 +3799 +3800 +3801 +3802 +3805 +3806 +3810 +3812 +3814 +3815 +3817 +3818 +3819 +3820 +3823 +3824 +3830 +3833 +3835 +3837 +3838 +3839 +3842 +3843 +3849 +3851 +3853 +3855 +3856 +3857 +3858 +3859 +3861 +3862 +3863 +3865 +3866 +3867 +3868 +3869 +3872 +3874 +3876 +3877 +3878 +3880 +3884 +3886 +3888 +3891 +3892 +3895 +3898 +3903 +3904 +3905 +3906 +3908 +3909 +3914 +3915 +3916 +3917 +3918 +3919 +3920 +3922 +3923 +3924 +3925 +3930 +3932 +3934 +3936 +3938 +3939 +3940 +3946 +3947 +3948 +3949 +3952 +3954 +3955 +3958 +3959 +3960 +3962 +3965 +3967 +3970 +3974 +3975 +3976 +3977 +3978 +3980 +3981 +3985 +3987 +3988 +3991 +3997 +3998 +3999 +4001 +4002 +4003 +4004 +4005 +4006 +4008 +4009 +4015 +4017 +4021 +4022 +4023 +4026 +4027 +4028 +4029 +4030 +4033 
+4035 +4036 +4037 +4039 +4041 +4044 +4050 +4051 +4053 +4056 +4057 +4060 +4067 +4068 +4073 +4075 +4076 +4077 +4078 +4080 +4081 +4082 +4085 +4090 +4091 +4092 +4094 +4096 +4098 +4101 +4102 +4104 +4107 +4111 +4112 +4113 +4115 +4117 +4118 +4120 +4122 +4124 +4130 +4134 +4136 +4137 +4141 +4142 +4144 +4147 +4150 +4152 +4153 +4154 +4157 +4158 +4159 +4161 +4163 +4164 +4165 +4168 +4169 +4170 +4171 +4174 +4175 +4176 +4181 +4182 +4184 +4188 +4189 +4190 +4191 +4193 +4194 +4195 +4196 +4197 +4198 +4200 +4203 +4204 +4207 +4208 +4209 +4210 +4211 +4212 +4214 +4215 +4216 +4218 +4221 +4222 +4223 +4224 +4225 +4226 +4227 +4228 +4230 +4231 +4233 +4235 +4236 +4237 +4239 +4242 +4244 +4245 +4247 +4250 +4251 +4252 +4253 +4255 +4257 +4258 +4262 +4263 +4264 +4265 +4266 +4268 +4271 +4272 +4273 +4276 +4277 +4278 +4279 +4282 +4283 +4285 +4286 +4289 +4292 +4294 +4295 +4297 +4299 +4301 +4304 +4306 +4308 +4310 +4313 +4319 +4321 +4323 +4324 +4328 +4329 +4330 +4332 +4335 +4336 +4340 +4342 +4355 +4356 +4357 +4360 +4363 +4364 +4365 +4366 +4368 +4370 +4371 +4372 +4373 +4374 +4376 +4377 +4378 +4379 +4380 +4382 +4383 +4384 +4385 +4387 +4390 +4392 +4393 +4395 +4396 +4398 +4399 +4400 +4401 +4402 +4403 +4405 +4406 +4407 +4408 +4411 +4412 +4414 +4415 +4420 +4422 +4426 +4429 +4430 +4431 +4436 +4439 +4440 +4441 +4442 +4443 +4444 +4448 +4449 +4452 +4453 +4455 +4456 +4457 +4458 +4462 +4463 +4465 +4466 +4467 +4468 +4473 +4475 +4476 +4477 +4479 +4480 +4481 +4482 +4483 +4484 +4488 +4491 +4492 +4493 +4495 +4496 +4499 +4500 +4501 +4504 +4505 +4509 +4510 +4512 +4513 +4514 +4516 +4517 +4518 +4519 +4528 +4529 +4530 +4532 +4533 +4534 +4535 +4537 +4541 +4542 +4547 +4549 +4550 +4552 +4555 +4556 +4559 +4561 +4562 +4563 +4564 +4565 +4566 +4568 +4571 +4573 +4574 +4575 +4576 +4578 +4579 +4581 +4582 +4584 +4587 +4589 +4592 +4593 +4594 +4595 +4596 +4597 +4599 +4600 +4604 +4606 +4610 +4612 +4615 +4617 +4618 +4620 +4622 +4623 +4625 +4626 +4627 +4628 +4631 +4635 +4637 +4640 +4641 +4643 +4644 +4646 +4649 +4651 +4653 +4657 +4659 +4660 
+4661 +4662 +4663 +4664 +4667 +4670 +4675 +4678 +4681 +4684 +4688 +4691 +4692 +4696 +4697 +4698 +4700 +4703 +4704 +4705 +4707 +4709 +4710 +4711 +4715 +4719 +4720 +4722 +4725 +4728 +4729 +4732 +4733 +4735 +4739 +4742 +4746 +4749 +4750 +4751 +4752 +4753 +4754 +4755 +4756 +4758 +4760 +4761 +4762 +4763 +4765 +4766 +4767 +4768 +4769 +4770 +4775 +4776 +4777 +4778 +4779 +4780 +4781 +4782 +4784 +4787 +4788 +4789 +4790 +4794 +4799 +4800 +4801 +4804 +4805 +4806 +4808 +4810 +4811 +4812 +4813 +4815 +4816 +4817 +4818 +4819 +4822 +4823 +4825 +4827 +4829 +4831 +4833 +4834 +4837 +4838 +4839 +4840 +4844 +4847 +4848 +4851 +4853 +4855 +4856 +4859 +4860 +4861 +4866 +4867 +4868 +4869 +4871 +4872 +4873 +4875 +4876 +4877 +4880 +4881 +4883 +4885 +4886 +4887 +4888 +4889 +4890 +4891 +4893 +4898 +4904 +4905 +4909 +4910 +4913 +4914 +4915 +4916 +4917 +4920 +4922 +4923 +4924 +4925 +4930 +4931 +4932 +4933 +4935 +4936 +4937 +4938 +4939 +4942 +4944 +4947 +4952 +4955 +4956 +4957 +4958 +4959 +4961 +4963 +4967 +4969 +4970 +4971 +4972 +4973 +4974 +4977 +4981 +4985 +4986 +4989 +4990 +4992 +4993 +4996 +4999 +5000 +5001 +5003 +5012 +5015 +5018 +5019 +5021 +5022 +5023 +5025 +5028 +5029 +5035 +5036 +5041 +5042 +5043 +5046 +5047 +5049 +5050 +5058 +5059 +5063 +5065 +5068 +5069 +5070 +5071 +5072 +5073 +5074 +5077 +5078 +5082 +5083 +5084 +5086 +5087 +5090 +5092 +5093 +5094 +5096 +5097 +5100 +5103 +5104 +5105 +5106 +5110 +5113 +5117 +5118 +5119 +5120 +5122 +5123 +5124 +5125 +5126 +5130 +5131 +5132 +5135 +5138 +5139 +5146 +5147 +5149 +5151 +5152 +5153 +5154 +5155 +5157 +5160 +5161 +5165 +5167 +5170 +5172 +5173 +5174 +5177 +5178 +5179 +5182 +5183 +5185 +5187 +5188 +5189 +5191 +5194 +5195 +5196 +5197 +5199 +5201 +5202 +5203 +5208 +5209 +5211 +5214 +5218 +5219 +5220 +5221 +5222 +5224 +5226 +5227 +5229 +5230 +5232 +5234 +5235 +5238 +5243 +5245 +5246 +5248 +5249 +5250 +5251 +5252 +5253 +5256 +5260 +5261 +5267 +5268 +5269 +5271 +5278 +5279 +5285 +5289 +5291 +5292 +5294 +5296 +5297 +5299 +5300 +5303 +5304 +5309 +5311 
+5312 +5315 +5320 +5321 +5322 +5324 +5326 +5328 +5329 +5332 +5333 +5334 +5337 +5339 +5340 +5341 +5343 +5344 +5347 +5348 +5349 +5350 +5352 +5353 +5355 +5359 +5360 +5361 +5363 +5366 +5367 +5368 +5369 +5371 +5372 +5374 +5375 +5376 +5377 +5385 +5386 +5388 +5392 +5393 +5394 +5395 +5398 +5399 +5400 +5401 +5402 +5404 +5406 +5408 +5409 +5410 +5412 +5413 +5414 +5416 +5417 +5419 +5420 +5421 +5425 +5426 +5427 +5431 +5432 +5433 +5435 +5437 +5440 +5441 +5443 +5448 +5449 +5452 +5453 +5456 +5457 +5459 +5460 +5462 +5463 +5466 +5468 +5470 +5471 +5474 +5475 +5478 +5480 +5485 +5487 +5488 +5490 +5491 +5494 +5496 +5500 +5509 +5510 +5512 +5514 +5517 +5518 +5519 +5520 +5521 +5527 +5530 +5533 +5537 +5538 +5540 +5541 +5544 +5546 +5548 +5549 +5551 +5554 +5557 +5559 +5560 +5562 +5563 +5564 +5573 +5574 +5575 +5576 +5580 +5581 +5583 +5584 +5586 +5589 +5591 +5592 +5598 +5600 +5601 +5605 +5608 +5610 +5615 +5618 +5620 +5621 +5624 +5625 +5626 +5628 +5630 +5632 +5633 +5638 +5640 +5641 +5643 +5645 +5647 +5652 +5655 +5658 +5660 +5661 +5662 +5663 +5664 +5667 +5668 +5669 +5670 +5672 +5673 +5674 +5679 +5681 +5682 +5690 +5693 +5697 +5698 +5702 +5703 +5705 +5712 +5715 +5718 +5719 +5721 +5722 +5726 +5728 +5737 +5739 +5743 +5744 +5745 +5747 +5748 +5750 +5752 +5755 +5756 +5757 +5759 +5760 +5764 +5767 +5768 +5770 +5772 +5773 +5775 +5776 +5781 +5782 +5783 +5785 +5787 +5788 +5790 +5792 +5793 +5795 +5796 +5797 +5799 +5800 +5801 +5802 +5803 +5805 +5806 +5807 +5808 +5810 +5815 +5818 +5821 +5822 +5823 +5827 +5829 +5830 +5835 +5836 +5840 +5842 +5844 +5846 +5849 +5853 +5854 +5857 +5859 +5860 +5866 +5869 +5870 +5872 +5873 +5875 +5876 +5878 +5881 +5882 +5883 +5884 +5886 +5888 +5893 +5894 +5900 +5901 +5902 +5903 +5904 +5906 +5908 +5910 +5911 +5914 +5918 +5920 +5922 +5925 +5926 +5927 +5928 +5932 +5933 +5934 +5935 +5938 +5940 +5942 +5944 +5945 +5947 +5950 +5952 +5954 +5956 +5960 +5961 +5963 +5966 +5970 +5974 +5975 +5983 +5985 +5986 +5990 +5995 +5996 +5997 +5999 +6000 +6003 +6006 +6010 +6011 +6012 +6013 +6015 +6016 +6020 
+6025 +6026 +6028 +6030 +6031 +6033 +6035 +6037 +6038 +6041 +6042 +6044 +6045 +6046 +6048 +6056 +6057 +6058 +6061 +6062 +6064 +6065 +6071 +6074 +6078 +6088 +6095 +6098 +6099 +6100 +6102 +6103 +6105 +6106 +6110 +6112 +6116 +6119 +6120 +6122 +6123 +6125 +6126 +6133 +6136 +6137 +6140 +6142 +6147 +6148 +6149 +6151 +6152 +6154 +6155 +6163 +6166 +6168 +6171 +6172 +6173 +6174 +6175 +6177 +6178 +6179 +6180 +6182 +6183 +6185 +6186 +6188 +6189 +6190 +6193 +6194 +6195 +6197 +6199 +6200 +6204 +6205 +6207 +6208 +6209 +6212 +6216 +6218 +6219 +6222 +6223 +6225 +6226 +6230 +6232 +6235 +6237 +6240 +6241 +6242 +6247 +6251 +6252 +6256 +6260 +6262 +6263 +6264 +6267 +6269 +6270 +6271 +6272 +6275 +6276 +6278 +6280 +6282 +6286 +6290 +6293 +6298 +6299 +6302 +6304 +6306 +6310 +6315 +6316 +6317 +6319 +6321 +6323 +6326 +6327 +6329 +6333 +6334 +6344 +6345 +6346 +6348 +6349 +6350 +6353 +6354 +6355 +6356 +6363 +6366 +6369 +6370 +6374 +6376 +6378 +6383 +6385 +6390 +6392 +6396 +6399 +6400 +6402 +6403 +6404 +6405 +6406 +6408 +6410 +6411 +6412 +6415 +6416 +6418 +6419 +6420 +6421 +6423 +6424 +6425 +6426 +6427 +6428 +6429 +6431 +6433 +6438 +6439 +6440 +6441 +6442 +6443 +6444 +6448 +6450 +6454 +6457 +6460 +6461 +6462 +6467 +6468 +6476 +6479 +6480 +6483 +6484 +6485 +6488 +6495 +6503 +6507 +6515 +6516 +6517 +6518 +6520 +6521 +6530 +6531 +6532 +6537 +6538 +6539 +6546 +6550 +6554 +6557 +6561 +6562 +6563 +6565 +6566 +6570 +6572 +6574 +6578 +6583 +6585 +6586 +6593 +6595 +6596 +6597 +6598 +6600 +6601 +6607 +6608 +6609 +6611 +6613 +6620 +6627 +6630 +6633 +6635 +6636 +6637 +6639 +6640 +6641 +6642 +6644 +6645 +6650 +6651 +6653 +6654 +6657 +6662 +6663 +6664 +6665 +6667 +6671 +6673 +6674 +6678 +6679 +6681 +6684 +6686 +6689 +6690 +6692 +6693 +6694 +6696 +6698 +6701 +6703 +6705 +6707 +6712 +6713 +6714 +6716 +6717 +6718 +6720 +6726 +6728 +6730 +6731 +6732 +6733 +6735 +6742 +6743 +6745 +6746 +6747 +6752 +6755 +6759 +6760 +6761 +6762 +6764 +6768 +6772 +6773 +6774 +6775 +6781 +6784 +6787 +6791 +6792 +6795 +6798 +6800 
+6803 +6806 +6807 +6810 +6811 +6814 +6816 +6817 +6821 +6824 +6828 +6829 +6830 +6832 +6838 +6842 +6843 +6847 +6850 +6853 +6854 +6857 +6858 +6859 +6860 +6862 +6863 +6864 +6866 +6867 +6870 +6871 +6874 +6875 +6876 +6878 +6880 +6883 +6884 +6885 +6888 +6891 +6896 +6900 +6905 +6906 +6907 +6908 +6909 +6910 +6912 +6913 +6914 +6917 +6919 +6923 +6930 +6932 +6934 +6935 +6939 +6940 +6941 +6942 +6944 +6945 +6946 +6948 +6950 +6951 +6953 +6957 +6961 +6962 +6963 +6972 +6974 +6976 +6977 +6978 +6979 +6980 +6981 +6983 +6986 +6990 +6993 +6995 +6997 +7000 +7006 +7011 +7013 +7015 +7018 +7019 +7024 +7025 +7026 +7028 +7031 +7032 +7035 +7038 +7042 +7043 +7044 +7049 +7051 +7054 +7055 +7057 +7058 +7059 +7060 +7061 +7062 +7064 +7068 +7070 +7072 +7073 +7078 +7081 +7084 +7085 +7087 +7090 +7092 +7095 +7096 +7100 +7107 +7108 +7110 +7111 +7114 +7118 +7120 +7122 +7124 +7125 +7132 +7134 +7138 +7139 +7144 +7147 +7148 +7150 +7151 +7153 +7160 +7161 +7162 +7168 +7169 +7170 +7177 +7179 +7182 +7183 +7184 +7186 +7187 +7188 +7190 +7200 +7202 +7203 +7208 +7211 +7212 +7215 +7216 +7217 +7218 +7221 +7224 +7225 +7227 +7229 +7232 +7233 +7235 +7237 +7238 +7247 +7248 +7249 +7250 +7252 +7254 +7258 +7261 +7265 +7268 +7269 +7271 +7272 +7274 +7276 +7278 +7280 +7281 +7284 +7286 +7288 +7293 +7295 +7300 +7301 +7302 +7304 +7308 +7310 +7311 +7319 +7320 +7321 +7323 +7328 +7331 +7332 +7335 +7336 +7339 +7340 +7342 +7347 +7348 +7349 +7350 +7353 +7355 +7356 +7362 +7365 +7369 +7373 +7374 +7377 +7380 +7382 +7385 +7387 +7390 +7395 +7398 +7399 +7402 +7405 +7415 +7417 +7418 +7419 +7420 +7422 +7424 +7432 +7434 +7437 +7438 +7440 +7441 +7445 +7450 +7452 +7454 +7455 +7457 +7460 +7461 +7466 +7467 +7469 +7470 +7471 +7474 +7475 +7478 +7480 +7485 +7488 +7490 +7491 +7492 +7493 +7494 +7495 +7496 +7497 +7498 +7500 +7502 +7508 +7510 +7516 +7518 +7520 +7521 +7522 +7523 +7525 +7533 +7536 +7539 +7540 +7543 +7547 +7548 +7549 +7550 +7553 +7554 +7556 +7559 +7560 +7562 +7564 +7567 +7570 +7574 +7575 +7579 +7580 +7585 +7586 +7587 +7589 +7591 +7594 +7596 
+7598 +7602 +7603 +7607 +7608 +7610 +7611 +7615 +7620 +7621 +7627 +7630 +7632 +7634 +7637 +7639 +7642 +7644 +7646 +7650 +7651 +7660 +7661 +7666 +7668 +7679 +7680 +7683 +7685 +7686 +7690 +7691 +7692 +7696 +7698 +7700 +7702 +7703 +7704 +7705 +7710 +7711 +7712 +7715 +7716 +7718 +7720 +7726 +7727 +7731 +7733 +7735 +7739 +7740 +7741 +7742 +7747 +7750 +7755 +7757 +7758 +7759 +7760 +7763 +7764 +7765 +7766 +7768 +7769 +7771 +7779 +7784 +7786 +7789 +7790 +7793 +7797 +7800 +7801 +7802 +7805 +7808 +7810 +7814 +7816 +7817 +7820 +7821 +7823 +7824 +7827 +7828 +7829 +7834 +7835 +7839 +7840 +7842 +7844 +7846 +7849 +7850 +7856 +7859 +7864 +7865 +7866 +7868 +7870 +7877 +7879 +7880 +7881 +7891 +7895 +7900 +7905 +7907 +7910 +7913 +7915 +7916 +7918 +7919 +7920 +7922 +7924 +7925 +7932 +7934 +7936 +7940 +7942 +7943 +7946 +7950 +7951 +7952 +7959 +7964 +7968 +7971 +7978 +7980 +7983 +7984 +7986 +7988 +7989 +7990 +7993 +8000 +8002 +8004 +8005 +8010 +8015 +8017 +8028 +8030 +8032 +8038 +8040 +8043 +8046 +8047 +8050 +8052 +8053 +8054 +8057 +8058 +8059 +8060 +8061 +8062 +8064 +8070 +8071 +8072 +8076 +8077 +8081 +8082 +8086 +8087 +8090 +8093 +8107 +8108 +8109 +8110 +8111 +8113 +8117 +8118 +8120 +8121 +8124 +8126 +8128 +8130 +8131 +8132 +8134 +8136 +8139 +8140 +8141 +8142 +8143 +8144 +8146 +8149 +8150 +8152 +8155 +8156 +8157 +8166 +8170 +8173 +8174 +8178 +8180 +8182 +8188 +8189 +8190 +8191 +8199 +8201 +8204 +8210 +8213 +8217 +8220 +8224 +8227 +8230 +8234 +8235 +8237 +8242 +8243 +8245 +8246 +8250 +8251 +8253 +8254 +8255 +8260 +8263 +8265 +8266 +8267 +8275 +8279 +8281 +8282 +8283 +8284 +8285 +8286 +8292 +8293 +8294 +8296 +8300 +8302 +8305 +8307 +8310 +8317 +8318 +8324 +8325 +8326 +8332 +8336 +8341 +8344 +8345 +8346 +8349 +8352 +8355 +8360 +8361 +8362 +8364 +8365 +8370 +8372 +8374 +8376 +8377 +8380 +8383 +8388 +8390 +8395 +8399 +8401 +8404 +8406 +8407 +8410 +8411 +8420 +8421 +8423 +8427 +8430 +8432 +8435 +8437 +8441 +8442 +8450 +8451 +8459 +8460 +8461 +8463 +8468 +8471 +8480 +8481 +8483 +8489 +8495 
+8496 +8499 +8500 +8501 +8512 +8516 +8520 +8521 +8522 +8528 +8529 +8530 +8535 +8536 +8542 +8548 +8549 +8553 +8555 +8557 +8563 +8566 +8570 +8573 +8576 +8580 +8584 +8586 +8587 +8588 +8590 +8592 +8596 +8597 +8598 +8600 +8601 +8603 +8608 +8610 +8613 +8620 +8621 +8622 +8624 +8625 +8630 +8633 +8635 +8640 +8643 +8644 +8646 +8653 +8659 +8661 +8662 +8665 +8666 +8670 +8673 +8678 +8681 +8683 +8685 +8694 +8705 +8706 +8709 +8724 +8725 +8730 +8732 +8734 +8741 +8745 +8748 +8749 +8750 +8753 +8754 +8756 +8757 +8758 +8761 +8762 +8764 +8777 +8780 +8781 +8782 +8787 +8788 +8791 +8795 +8796 +8798 +8800 +8810 +8812 +8815 +8818 +8820 +8821 +8822 +8825 +8827 +8828 +8830 +8832 +8835 +8838 +8840 +8841 +8845 +8846 +8849 +8850 +8851 +8855 +8859 +8860 +8866 +8869 +8870 +8871 +8874 +8877 +8878 +8879 +8883 +8887 +8888 +8889 +8890 +8891 +8892 +8893 +8898 +8904 +8906 +8907 +8910 +8911 +8916 +8919 +8920 +8921 +8922 +8924 +8926 +8930 +8933 +8936 +8940 +8947 +8954 +8960 +8963 +8964 +8968 +8969 +8970 +8974 +8977 +8978 +8979 +8980 +8985 +8989 +8995 +8996 +9000 +9001 +9010 +9014 +9015 +9027 +9028 +9030 +9031 +9044 +9049 +9051 +9054 +9060 +9064 +9066 +9069 +9070 +9071 +9072 +9073 +9075 +9079 +9080 +9085 +9090 +9091 +9092 +9097 +9098 +9105 +9107 +9110 +9112 +9117 +9125 +9126 +9129 +9130 +9137 +9140 +9142 +9150 +9155 +9158 +9160 +9161 +9163 +9167 +9169 +9170 +9176 +9178 +9180 +9190 +9193 +9198 +9200 +9206 +9207 +9209 +9212 +9215 +9217 +9226 +9227 +9228 +9229 +9230 +9233 +9234 +9236 +9242 +9248 +9249 +9252 +9260 +9269 +9273 +9275 +9279 +9280 +9281 +9282 +9284 +9285 +9290 +9299 +9300 +9305 +9314 +9323 +9324 +9325 +9329 +9335 +9336 +9340 +9342 +9343 +9347 +9348 +9350 +9352 +9353 +9356 +9360 +9362 +9363 +9367 +9370 +9380 +9383 +9384 +9391 +9395 +9396 +9398 +9400 +9403 +9405 +9406 +9409 +9410 +9414 +9419 +9420 +9428 +9437 +9438 +9440 +9452 +9453 +9456 +9457 +9459 +9461 +9470 +9477 +9480 +9481 +9485 +9487 +9491 +9496 +9499 +9500 +9501 +9504 +9508 +9510 +9519 +9520 +9527 +9536 +9539 +9543 +9546 +9547 +9550 +9555 
+9556 +9565 +9566 +9570 +9572 +9575 +9577 +9578 +9579 +9581 +9586 +9587 +9591 +9600 +9605 +9607 +9610 +9611 +9614 +9619 +9620 +9625 +9628 +9630 +9635 +9644 +9646 +9647 +9650 +9652 +9656 +9666 +9670 +9673 +9676 +9680 +9683 +9684 +9686 +9688 +9692 +9694 +9696 +9697 +9700 +9701 +9702 +9704 +9706 +9709 +9710 +9717 +9718 +9719 +9720 +9730 +9734 +9744 +9748 +9750 +9767 +9769 +9777 +9778 +9790 +9810 +9820 +9822 +9824 +9827 +9830 +9832 +9834 +9840 +9844 +9850 +9851 +9852 +9860 +9864 +9865 +9870 +9873 +9882 +9886 +9890 +9896 +9900 +9903 +9904 +9914 +9917 +9918 +9919 +9930 +9932 +9934 +9938 +9939 +9940 +9943 +9950 +9952 +9956 +9960 +9972 +9977 +9980 +9993 +9995 +10000 +10004 +10005 +10007 +10020 +10021 +10023 +10030 +10031 +10032 +10033 +10034 +10038 +10040 +10049 +10051 +10058 +10075 +10080 +10081 +10085 +10103 +10106 +10111 +10112 +10113 +10115 +10116 +10120 +10122 +10128 +10140 +10145 +10148 +10150 +10151 +10155 +10156 +10166 +10167 +10169 +10170 +10174 +10188 +10193 +10200 +10203 +10208 +10210 +10211 +10217 +10220 +10223 +10230 +10233 +10240 +10243 +10245 +10246 +10250 +10255 +10260 +10269 +10270 +10272 +10279 +10280 +10282 +10284 +10285 +10289 +10291 +10299 +10300 +10302 +10303 +10323 +10324 +10325 +10326 +10327 +10333 +10338 +10340 +10344 +10349 +10350 +10352 +10360 +10370 +10373 +10374 +10376 +10388 +10393 +10396 +10398 +10400 +10407 +10411 +10420 +10428 +10435 +10447 +10450 +10461 +10465 +10468 +10473 +10474 +10478 +10480 +10485 +10490 +10492 +10498 +10502 +10503 +10504 +10507 +10510 +10514 +10527 +10528 +10540 +10550 +10558 +10560 +10562 +10564 +10570 +10572 +10580 +10585 +10600 +10604 +10608 +10610 +10616 +10619 +10644 +10660 +10670 +10672 +10674 +10690 +10692 +10695 +10699 +10700 +10701 +10707 +10710 +10712 +10718 +10719 +10722 +10730 +10732 +10738 +10740 +10741 +10743 +10744 +10746 +10752 +10770 +10771 +10777 +10780 +10788 +10790 +10794 +10810 +10811 +10814 +10823 +10829 +10833 +10835 +10848 +10852 +10860 +10865 +10867 +10870 +10875 +10877 +10878 +10882 +10888 
+10906 +10908 +10909 +10910 +10915 +10922 +10930 +10933 +10950 +10958 +10960 +10964 +10974 +10980 +10984 +10989 +10990 +10992 +10993 +10996 +11005 +11017 +11018 +11023 +11025 +11027 +11029 +11030 +11039 +11040 +11048 +11049 +11050 +11063 +11073 +11076 +11079 +11080 +11091 +11094 +11102 +11106 +11108 +11109 +11110 +11115 +11121 +11124 +11128 +11146 +11147 +11160 +11170 +11177 +11180 +11181 +11185 +11187 +11190 +11195 +11200 +11205 +11219 +11220 +11229 +11250 +11253 +11256 +11258 +11260 +11270 +11277 +11279 +11280 +11283 +11286 +11289 +11296 +11297 +11299 +11301 +11309 +11312 +11328 +11330 +11339 +11340 +11342 +11346 +11350 +11356 +11358 +11359 +11360 +11366 +11370 +11372 +11378 +11382 +11383 +11390 +11400 +11403 +11404 +11410 +11419 +11428 +11430 +11439 +11441 +11446 +11447 +11450 +11458 +11463 +11464 +11470 +11475 +11487 +11489 +11490 +11499 +11500 +11515 +11518 +11520 +11524 +11537 +11540 +11546 +11550 +11553 +11555 +11560 +11570 +11571 +11572 +11577 +11580 +11583 +11589 +11590 +11595 +11600 +11601 +11610 +11611 +11620 +11622 +11630 +11631 +11640 +11643 +11644 +11645 +11650 +11660 +11664 +11669 +11670 +11681 +11683 +11686 +11687 +11690 +11694 +11696 +11700 +11710 +11719 +11720 +11723 +11730 +11742 +11744 +11750 +11760 +11770 +11772 +11773 +11780 +11781 +11784 +11791 +11800 +11801 +11802 +11813 +11815 +11816 +11820 +11839 +11840 +11849 +11855 +11856 +11857 +11858 +11860 +11862 +11869 +11880 +11890 +11899 +11900 +11905 +11910 +11920 +11930 +11932 +11939 +11940 +11946 +11959 +11960 +11980 +11987 +12000 +12006 +12009 +12027 +12030 +12035 +12040 +12047 +12050 +12070 +12080 +12087 +12088 +12091 +12093 +12115 +12120 +12125 +12130 +12139 +12140 +12146 +12148 +12160 +12162 +12166 +12168 +12170 +12172 +12177 +12180 +12186 +12188 +12189 +12192 +12194 +12200 +12201 +12204 +12218 +12219 +12220 +12221 +12225 +12228 +12230 +12237 +12239 +12240 +12242 +12244 +12250 +12256 +12260 +12261 +12266 +12270 +12282 +12294 +12295 +12296 +12300 +12301 +12306 +12317 +12320 +12321 +12322 
+12323 +12326 +12327 +12330 +12340 +12343 +12371 +12380 +12385 +12390 +12391 +12392 +12400 +12408 +12412 +12419 +12422 +12423 +12427 +12430 +12440 +12441 +12446 +12450 +12452 +12455 +12468 +12477 +12480 +12486 +12497 +12499 +12500 +12503 +12510 +12526 +12528 +12530 +12540 +12550 +12559 +12560 +12570 +12578 +12592 +12601 +12608 +12610 +12616 +12619 +12620 +12640 +12644 +12650 +12660 +12663 +12670 +12677 +12680 +12681 +12684 +12691 +12695 +12701 +12720 +12721 +12730 +12740 +12744 +12750 +12760 +12762 +12767 +12769 +12770 +12773 +12779 +12780 +12784 +12797 +12801 +12803 +12819 +12820 +12830 +12840 +12849 +12850 +12860 +12879 +12882 +12885 +12896 +12903 +12916 +12918 +12925 +12930 +12934 +12940 +12942 +12943 +12950 +12953 +12960 +12966 +12971 +12990 +12997 +13000 +13020 +13025 +13026 +13030 +13040 +13041 +13043 +13049 +13053 +13059 +13060 +13061 +13067 +13070 +13072 +13100 +13111 +13120 +13124 +13126 +13139 +13140 +13160 +13171 +13178 +13179 +13180 +13191 +13198 +13200 +13215 +13226 +13230 +13231 +13244 +13246 +13260 +13269 +13270 +13272 +13274 +13276 +13280 +13288 +13300 +13305 +13309 +13312 +13324 +13330 +13333 +13340 +13350 +13351 +13360 +13361 +13367 +13368 +13370 +13372 +13380 +13382 +13388 +13410 +13422 +13427 +13430 +13440 +13450 +13466 +13470 +13475 +13480 +13490 +13491 +13495 +13504 +13508 +13509 +13517 +13520 +13539 +13540 +13546 +13548 +13559 +13560 +13562 +13570 +13574 +13580 +13587 +13590 +13591 +13592 +13594 +13596 +13598 +13599 +13610 +13620 +13622 +13626 +13632 +13635 +13639 +13641 +13650 +13660 +13674 +13680 +13683 +13688 +13690 +13700 +13704 +13709 +13710 +13720 +13722 +13730 +13733 +13740 +13744 +13750 +13758 +13760 +13782 +13786 +13792 +13800 +13813 +13820 +13821 +13832 +13833 +13841 +13849 +13860 +13866 +13870 +13873 +13877 +13880 +13882 +13890 +13900 +13905 +13907 +13929 +13936 +13954 +13970 +13980 +14010 +14038 +14040 +14041 +14043 +14050 +14056 +14060 +14068 +14080 +14087 +14090 +14091 +14092 +14100 +14106 +14110 +14114 +14120 +14130 +14132 
+14146 +14154 +14157 +14168 +14172 +14173 +14177 +14190 +14193 +14205 +14215 +14223 +14230 +14234 +14238 +14250 +14253 +14255 +14260 +14278 +14285 +14300 +14311 +14318 +14320 +14321 +14324 +14331 +14342 +14343 +14358 +14364 +14369 +14370 +14382 +14390 +14396 +14400 +14402 +14410 +14420 +14430 +14440 +14442 +14444 +14460 +14480 +14491 +14499 +14504 +14510 +14513 +14520 +14530 +14540 +14545 +14550 +14560 +14570 +14578 +14580 +14591 +14597 +14600 +14605 +14607 +14610 +14618 +14622 +14625 +14629 +14630 +14650 +14654 +14675 +14679 +14689 +14690 +14698 +14700 +14705 +14710 +14717 +14720 +14729 +14746 +14750 +14757 +14759 +14760 +14773 +14780 +14789 +14790 +14795 +14800 +14803 +14811 +14819 +14820 +14821 +14823 +14828 +14850 +14860 +14894 +14895 +14898 +14900 +14901 +14905 +14910 +14916 +14938 +14940 +14960 +14970 +14997 +14998 +15007 +15025 +15030 +15038 +15040 +15052 +15053 +15060 +15070 +15074 +15077 +15080 +15090 +15110 +15133 +15136 +15137 +15144 +15147 +15150 +15157 +15160 +15170 +15175 +15178 +15190 +15200 +15201 +15210 +15213 +15220 +15249 +15250 +15260 +15261 +15267 +15278 +15283 +15289 +15292 +15296 +15300 +15311 +15321 +15331 +15340 +15355 +15365 +15375 +15380 +15381 +15400 +15408 +15409 +15418 +15424 +15425 +15430 +15434 +15440 +15445 +15460 +15470 +15479 +15480 +15484 +15490 +15492 +15502 +15505 +15507 +15520 +15527 +15530 +15549 +15554 +15563 +15567 +15570 +15579 +15582 +15584 +15600 +15610 +15630 +15645 +15649 +15650 +15652 +15655 +15665 +15670 +15684 +15685 +15688 +15690 +15695 +15730 +15734 +15747 +15756 +15765 +15779 +15782 +15792 +15817 +15823 +15828 +15830 +15836 +15840 +15845 +15849 +15860 +15870 +15871 +15876 +15877 +15880 +15892 +15901 +15907 +15930 +15954 +16000 +16003 +16004 +16010 +16040 +16050 +16054 +16072 +16080 +16090 +16100 +16109 +16110 +16119 +16125 +16150 +16162 +16184 +16190 +16198 +16210 +16214 +16230 +16234 +16240 +16245 +16246 +16258 +16273 +16280 +16282 +16290 +16300 +16310 +16311 +16321 +16329 +16330 +16332 +16343 +16362 +16369 
+16377 +16379 +16382 +16383 +16386 +16387 +16390 +16399 +16400 +16432 +16433 +16440 +16450 +16453 +16463 +16470 +16490 +16493 +16495 +16500 +16506 +16510 +16511 +16520 +16540 +16543 +16557 +16560 +16570 +16573 +16579 +16599 +16610 +16617 +16628 +16645 +16647 +16648 +16660 +16661 +16664 +16668 +16670 +16678 +16680 +16683 +16687 +16718 +16720 +16723 +16737 +16762 +16772 +16777 +16780 +16783 +16784 +16792 +16797 +16800 +16805 +16814 +16820 +16831 +16837 +16840 +16853 +16857 +16858 +16860 +16865 +16867 +16869 +16892 +16894 +16900 +16916 +16920 +16923 +16925 +16927 +16930 +16948 +16950 +16980 +16990 +17000 +17010 +17020 +17035 +17045 +17071 +17080 +17085 +17087 +17090 +17091 +17100 +17129 +17130 +17139 +17142 +17148 +17160 +17166 +17196 +17200 +17203 +17218 +17220 +17230 +17232 +17240 +17248 +17260 +17267 +17271 +17303 +17310 +17317 +17319 +17330 +17346 +17350 +17360 +17370 +17380 +17390 +17402 +17406 +17410 +17411 +17412 +17440 +17449 +17464 +17465 +17466 +17470 +17480 +17490 +17500 +17520 +17525 +17536 +17540 +17542 +17550 +17570 +17579 +17580 +17585 +17600 +17610 +17615 +17617 +17620 +17626 +17629 +17640 +17644 +17650 +17666 +17673 +17680 +17690 +17700 +17722 +17740 +17750 +17760 +17770 +17780 +17790 +17798 +17800 +17825 +17867 +17880 +17900 +17928 +17930 +17932 +17940 +17950 +17955 +17957 +17960 +17977 +17987 +17990 +18000 +18012 +18022 +18030 +18050 +18054 +18060 +18080 +18081 +18086 +18090 +18099 +18110 +18130 +18134 +18144 +18150 +18151 +18157 +18162 +18170 +18171 +18204 +18220 +18223 +18242 +18252 +18260 +18283 +18286 +18288 +18289 +18300 +18302 +18320 +18330 +18336 +18337 +18339 +18356 +18360 +18380 +18390 +18400 +18420 +18427 +18428 +18430 +18451 +18456 +18468 +18469 +18487 +18490 +18510 +18513 +18515 +18517 +18522 +18530 +18546 +18582 +18591 +18599 +18600 +18616 +18620 +18627 +18643 +18650 +18670 +18672 +18720 +18731 +18732 +18740 +18770 +18774 +18778 +18780 +18782 +18790 +18797 +18810 +18828 +18839 +18850 +18860 +18891 +18900 +18908 +18910 +18919 +18920 
+18921 +18924 +18930 +18939 +18960 +18975 +18980 +18990 +19000 +19004 +19016 +19028 +19029 +19030 +19032 +19040 +19050 +19062 +19090 +19095 +19100 +19110 +19139 +19149 +19167 +19170 +19176 +19180 +19182 +19189 +19191 +19200 +19208 +19220 +19230 +19260 +19261 +19270 +19290 +19295 +19300 +19306 +19308 +19310 +19312 +19317 +19318 +19340 +19341 +19346 +19360 +19370 +19371 +19400 +19405 +19410 +19413 +19420 +19442 +19446 +19450 +19465 +19470 +19480 +19490 +19494 +19500 +19510 +19520 +19525 +19542 +19554 +19570 +19595 +19609 +19610 +19612 +19620 +19622 +19644 +19658 +19667 +19680 +19696 +19709 +19720 +19740 +19743 +19770 +19782 +19795 +19798 +19807 +19830 +19836 +19840 +19841 +19850 +19883 +19885 +19896 +19900 +19910 +19913 +19920 +19971 +19989 +19990 +20000 +20008 +20042 +20050 +20065 +20090 +20105 +20110 +20114 +20117 +20119 +20130 +20167 +20180 +20184 +20190 +20195 +20198 +20200 +20210 +20233 +20244 +20250 +20251 +20285 +20299 +20305 +20320 +20328 +20330 +20360 +20381 +20422 +20428 +20442 +20448 +20450 +20453 +20460 +20470 +20480 +20490 +20504 +20520 +20561 +20571 +20580 +20610 +20620 +20627 +20633 +20634 +20640 +20670 +20683 +20687 +20690 +20730 +20731 +20750 +20760 +20763 +20778 +20800 +20820 +20830 +20840 +20841 +20850 +20851 +20853 +20870 +20883 +20906 +20927 +20931 +20935 +20940 +20958 +20970 +20980 +20987 +20993 +21005 +21017 +21039 +21040 +21043 +21062 +21064 +21066 +21070 +21100 +21108 +21118 +21119 +21120 +21130 +21133 +21142 +21155 +21168 +21170 +21180 +21188 +21200 +21221 +21245 +21248 +21270 +21280 +21287 +21297 +21305 +21309 +21312 +21320 +21330 +21369 +21372 +21376 +21380 +21390 +21415 +21440 +21455 +21468 +21470 +21480 +21513 +21530 +21550 +21560 +21599 +21600 +21621 +21638 +21672 +21687 +21688 +21690 +21702 +21710 +21726 +21767 +21800 +21803 +21815 +21820 +21824 +21827 +21830 +21837 +21846 +21870 +21883 +21904 +21909 +21936 +21940 +21972 +21991 +21995 +22000 +22001 +22010 +22038 +22050 +22054 +22080 +22098 +22100 +22105 +22108 +22130 +22136 +22160 
+22180 +22192 +22212 +22215 +22240 +22250 +22260 +22262 +22267 +22269 +22274 +22277 +22280 +22290 +22293 +22317 +22320 +22337 +22341 +22420 +22421 +22500 +22510 +22516 +22530 +22532 +22571 +22580 +22600 +22608 +22648 +22654 +22657 +22659 +22660 +22690 +22700 +22710 +22716 +22720 +22730 +22758 +22760 +22764 +22765 +22766 +22770 +22780 +22789 +22800 +22801 +22820 +22830 +22844 +22848 +22875 +22881 +22888 +22890 +22900 +22903 +22920 +22927 +22960 +22970 +22989 +22990 +23000 +23045 +23050 +23100 +23118 +23119 +23120 +23121 +23123 +23124 +23133 +23180 +23200 +23220 +23221 +23238 +23240 +23246 +23260 +23280 +23320 +23330 +23334 +23350 +23395 +23398 +23410 +23427 +23450 +23480 +23500 +23501 +23505 +23510 +23517 +23540 +23545 +23556 +23561 +23579 +23600 +23602 +23606 +23620 +23630 +23694 +23696 +23699 +23710 +23740 +23742 +23750 +23760 +23767 +23779 +23804 +23822 +23837 +23841 +23862 +23870 +23878 +23880 +23900 +23906 +23920 +23930 +24000 +24016 +24020 +24047 +24079 +24093 +24120 +24139 +24140 +24141 +24145 +24160 +24161 +24175 +24180 +24200 +24210 +24220 +24227 +24250 +24278 +24280 +24298 +24300 +24306 +24317 +24353 +24356 +24386 +24400 +24410 +24420 +24457 +24461 +24465 +24500 +24501 +24510 +24528 +24583 +24610 +24637 +24657 +24690 +24740 +24761 +24770 +24796 +24798 +24800 +24809 +24826 +24830 +24840 +24841 +24843 +24855 +24869 +24880 +24890 +24893 +24900 +24906 +24911 +24915 +24958 +24960 +24990 +25038 +25040 +25057 +25060 +25090 +25092 +25160 +25168 +25180 +25187 +25200 +25233 +25250 +25260 +25286 +25300 +25306 +25310 +25320 +25330 +25343 +25380 +25430 +25432 +25440 +25460 +25470 +25486 +25500 +25517 +25519 +25533 +25540 +25544 +25560 +25575 +25580 +25584 +25590 +25596 +25630 +25640 +25650 +25690 +25700 +25710 +25720 +25740 +25746 +25770 +25780 +25800 +25806 +25810 +25834 +25840 +25858 +25860 +25863 +25868 +25877 +25878 +25880 +25910 +25930 +25931 +25950 +25962 +25964 +25980 +26034 +26078 +26110 +26120 +26127 +26134 +26140 +26166 +26167 +26188 +26190 +26210 +26251 
+26302 +26330 +26338 +26350 +26421 +26453 +26469 +26470 +26490 +26500 +26528 +26546 +26550 +26554 +26600 +26637 +26638 +26650 +26660 +26664 +26670 +26680 +26690 +26700 +26770 +26772 +26782 +26790 +26800 +26808 +26820 +26822 +26840 +26860 +26861 +26870 +26940 +26980 +26989 +27008 +27030 +27071 +27080 +27081 +27091 +27100 +27106 +27110 +27143 +27152 +27159 +27176 +27180 +27188 +27200 +27212 +27226 +27251 +27269 +27280 +27290 +27307 +27317 +27330 +27350 +27360 +27381 +27386 +27389 +27390 +27391 +27410 +27432 +27440 +27442 +27450 +27463 +27500 +27520 +27527 +27548 +27557 +27590 +27624 +27627 +27647 +27658 +27667 +27670 +27680 +27700 +27730 +27780 +27796 +27800 +27804 +27809 +27840 +27850 +27852 +27861 +27894 +27920 +27922 +27923 +27948 +27980 +27983 +27988 +27990 +28000 +28019 +28020 +28060 +28087 +28090 +28100 +28110 +28140 +28148 +28160 +28163 +28164 +28229 +28251 +28261 +28273 +28290 +28300 +28305 +28351 +28353 +28389 +28400 +28425 +28430 +28500 +28510 +28520 +28530 +28531 +28550 +28580 +28658 +28700 +28780 +28783 +28791 +28823 +28832 +28856 +28860 +28872 +28900 +28928 +28929 +28947 +28953 +28980 +29000 +29030 +29067 +29080 +29090 +29100 +29130 +29144 +29150 +29160 +29175 +29180 +29192 +29200 +29203 +29240 +29280 +29290 +29300 +29320 +29360 +29385 +29455 +29456 +29459 +29470 +29480 +29492 +29500 +29519 +29540 +29580 +29585 +29590 +29619 +29648 +29652 +29660 +29666 +29699 +29710 +29736 +29740 +29761 +29776 +29831 +29848 +29852 +29860 +29870 +29900 +29930 +29961 +30012 +30040 +30070 +30100 +30109 +30111 +30150 +30200 +30252 +30261 +30271 +30280 +30290 +30300 +30312 +30320 +30340 +30354 +30400 +30417 +30420 +30433 +30452 +30472 +30490 +30495 +30510 +30523 +30563 +30590 +30592 +30600 +30660 +30680 +30690 +30710 +30720 +30722 +30736 +30738 +30750 +30770 +30821 +30894 +30900 +30912 +30920 +30930 +30970 +30998 +31000 +31013 +31028 +31048 +31060 +31080 +31087 +31090 +31092 +31100 +31114 +31118 +31160 +31169 +31197 +31200 +31222 +31230 +31240 +31248 +31282 +31300 +31316 
+31320 +31330 +31352 +31355 +31370 +31394 +31400 +31420 +31430 +31440 +31475 +31476 +31500 +31510 +31542 +31549 +31550 +31580 +31600 +31612 +31614 +31629 +31655 +31682 +31694 +31700 +31740 +31780 +31830 +31835 +31848 +31850 +31860 +31870 +31900 +31903 +31952 +32010 +32050 +32099 +32100 +32117 +32130 +32173 +32200 +32210 +32223 +32250 +32256 +32298 +32300 +32310 +32330 +32335 +32360 +32370 +32380 +32390 +32400 +32440 +32460 +32470 +32480 +32537 +32630 +32656 +32680 +32695 +32764 +32800 +32808 +32812 +32820 +32840 +32867 +32870 +32900 +32940 +33050 +33066 +33075 +33100 +33130 +33144 +33150 +33160 +33170 +33220 +33270 +33280 +33300 +33303 +33380 +33390 +33400 +33410 +33480 +33485 +33496 +33500 +33530 +33590 +33600 +33650 +33700 +33736 +33770 +33797 +33810 +33900 +33951 +33960 +34020 +34027 +34040 +34058 +34059 +34074 +34088 +34100 +34132 +34140 +34177 +34190 +34194 +34200 +34207 +34208 +34236 +34240 +34270 +34280 +34284 +34300 +34310 +34381 +34400 +34414 +34453 +34481 +34485 +34500 +34520 +34544 +34546 +34631 +34700 +34748 +34773 +34780 +34820 +34857 +34910 +34956 +34962 +35000 +35010 +35067 +35070 +35080 +35087 +35100 +35117 +35132 +35140 +35204 +35277 +35281 +35283 +35289 +35300 +35310 +35320 +35353 +35370 +35410 +35440 +35470 +35512 +35520 +35548 +35569 +35579 +35600 +35610 +35616 +35650 +35680 +35700 +35760 +35771 +35785 +35820 +35833 +35849 +35870 +35880 +35900 +35910 +36000 +36061 +36080 +36103 +36122 +36180 +36200 +36212 +36277 +36290 +36300 +36393 +36400 +36500 +36559 +36560 +36570 +36585 +36630 +36670 +36690 +36710 +36717 +36750 +36870 +36890 +36900 +36940 +36946 +36956 +36980 +37000 +37030 +37062 +37113 +37120 +37122 +37130 +37170 +37294 +37297 +37300 +37340 +37385 +37400 +37410 +37440 +37466 +37480 +37530 +37549 +37550 +37600 +37613 +37670 +37750 +37754 +37760 +37770 +37830 +37865 +37883 +37885 +37890 +37905 +37940 +37950 +38090 +38120 +38124 +38160 +38172 +38192 +38194 +38200 +38210 +38216 +38236 +38249 +38250 +38260 +38272 +38280 +38290 +38300 +38320 
+38360 +38362 +38419 +38451 +38492 +38523 +38548 +38575 +38580 +38595 +38620 +38640 +38661 +38680 +38717 +38744 +38841 +39055 +39070 +39125 +39140 +39220 +39258 +39300 +39322 +39331 +39365 +39405 +39590 +39593 +39615 +39640 +39662 +39673 +39722 +39740 +39743 +39750 +39760 +39770 +39794 +39799 +39800 +39863 +39900 +39937 +39940 +39943 +39960 +39989 +40000 +40010 +40040 +40096 +40100 +40180 +40200 +40211 +40217 +40220 +40280 +40330 +40352 +40360 +40400 +40500 +40550 +40597 +40606 +40617 +40660 +40675 +40730 +40746 +40800 +40820 +40830 +40998 +41000 +41010 +41020 +41040 +41067 +41077 +41105 +41131 +41138 +41218 +41239 +41240 +41389 +41390 +41420 +41432 +41491 +41500 +41550 +41600 +41620 +41628 +41630 +41664 +41690 +41710 +41740 +41773 +41800 +41816 +41838 +41980 +41987 +41990 +42047 +42121 +42134 +42200 +42253 +42350 +42383 +42410 +42434 +42440 +42510 +42520 +42550 +42570 +42611 +42636 +42640 +42682 +42700 +42800 +42861 +42900 +42940 +42980 +43060 +43080 +43090 +43100 +43140 +43170 +43181 +43231 +43240 +43265 +43351 +43376 +43400 +43411 +43423 +43520 +43540 +43549 +43600 +43627 +43670 +43790 +43803 +43810 +43844 +43917 +43920 +44000 +44050 +44100 +44110 +44160 +44170 +44240 +44290 +44300 +44330 +44360 +44370 +44409 +44420 +44426 +44470 +44600 +44620 +44627 +44630 +44640 +44691 +44810 +44860 +44870 +44900 +44940 +45020 +45110 +45191 +45210 +45310 +45400 +45450 +45454 +45488 +45490 +45618 +45633 +45680 +45700 +45710 +45715 +45743 +45832 +45854 +45855 +45966 +46080 +46111 +46142 +46200 +46222 +46236 +46303 +46317 +46340 +46390 +46400 +46484 +46500 +46510 +46600 +46640 +46667 +46750 +46800 +46890 +46900 +47000 +47087 +47090 +47100 +47150 +47154 +47206 +47400 +47597 +47600 +47620 +47640 +47647 +47665 +47701 +47760 +47800 +47840 +47880 +48000 +48100 +48150 +48180 +48190 +48200 +48360 +48406 +48440 +48460 +48520 +48540 +48600 +48700 +48779 +48877 +48970 +48980 +49000 +49040 +49060 +49090 +49097 +49160 +49450 +49521 +49522 +49527 +49600 +49660 +49663 +49700 +49800 +49810 
+49869 +49875 +49910 +50000 +50032 +50047 +50050 +50065 +50123 +50198 +50200 +50240 +50400 +50407 +50420 +50498 +50540 +50626 +50640 +50687 +50700 +50709 +50785 +50900 +50980 +50990 +51020 +51100 +51166 +51170 +51173 +51247 +51257 +51300 +51320 +51410 +51480 +51557 +51563 +51600 +51680 +51710 +51722 +51740 +51753 +51770 +51780 +51900 +51970 +52000 +52065 +52200 +52254 +52320 +52400 +52500 +52530 +52600 +52601 +52640 +52720 +52750 +52800 +52864 +52874 +52899 +52900 +53000 +53070 +53128 +53180 +53213 +53240 +53550 +53670 +53695 +53745 +53800 +53806 +53810 +53812 +53838 +53946 +54000 +54010 +54100 +54140 +54200 +54299 +54310 +54550 +54570 +54750 +54782 +54822 +54867 +54888 +54910 +54980 +54990 +55100 +55200 +55241 +55300 +55310 +55340 +55405 +55460 +55600 +55690 +55750 +55790 +55800 +55900 +56000 +56037 +56052 +56130 +56200 +56255 +56300 +56363 +56500 +56600 +56605 +56629 +56630 +56800 +56854 +56890 +57000 +57170 +57183 +57200 +57310 +57351 +57352 +57470 +57492 +57540 +57550 +57600 +57603 +57668 +57693 +57711 +57940 +57950 +58014 +58047 +58080 +58100 +58200 +58250 +58300 +58364 +58460 +58560 +58600 +58630 +58646 +58700 +58760 +58970 +59000 +59081 +59100 +59170 +59300 +59408 +59443 +59460 +59472 +59546 +59580 +59600 +59680 +59760 +59800 +59810 +59820 +59914 +59917 +59934 +59992 +60000 +60114 +60149 +60250 +60280 +60330 +60335 +60400 +60420 +60450 +60455 +60520 +60522 +60610 +60720 +60756 +60776 +60890 +60961 +61000 +61015 +61020 +61040 +61119 +61220 +61250 +61260 +61269 +61270 +61292 +61400 +61476 +61540 +61568 +61640 +61648 +61690 +61700 +61780 +61787 +61851 +61852 +61890 +62090 +62100 +62160 +62240 +62320 +62440 +62500 +62610 +62750 +62800 +62900 +63000 +63140 +63200 +63300 +63424 +63430 +63600 +63710 +63760 +63880 +63890 +63940 +64020 +64025 +64090 +64161 +64210 +64300 +64400 +64500 +64581 +64600 +64604 +64628 +64648 +64680 +64708 +64736 +64740 +64779 +64782 +64890 +64910 +64994 +65000 +65013 +65221 +65240 +65250 +65265 +65300 +65330 +65440 +65496 +65600 +65700 
+65790 +65800 +65893 +65900 +65979 +66070 +66100 +66140 +66190 +66200 +66231 +66265 +66327 +66430 +66490 +66500 +66510 +66647 +66689 +66780 +66800 +66847 +66850 +66900 +67000 +67170 +67175 +67180 +67200 +67251 +67300 +67370 +67400 +67500 +67649 +67700 +67723 +67780 +67800 +67936 +68160 +68189 +68200 +68302 +68470 +68517 +68570 +68640 +68700 +68710 +68763 +68776 +68840 +68894 +68900 +68950 +69000 +69131 +69180 +69306 +69406 +69430 +69468 +69520 +69540 +69553 +69600 +69608 +69610 +69880 +69890 +70100 +70143 +70190 +70200 +70220 +70300 +70390 +70400 +70500 +70659 +70760 +70791 +71019 +71300 +71450 +71474 +71600 +71700 +71749 +71800 +71870 +71900 +71930 +72040 +72188 +72213 +72270 +72416 +72440 +72453 +72475 +72500 +72580 +72710 +72800 +73045 +73101 +73134 +73219 +73326 +73354 +73430 +73451 +73710 +73718 +73790 +73803 +73890 +73900 +73950 +74200 +74349 +74388 +74565 +74580 +74597 +74900 +75000 +75025 +75100 +75152 +75210 +75230 +75295 +75322 +75477 +75500 +75700 +75779 +75780 +75810 +75991 +76020 +76040 +76150 +76400 +76430 +76700 +76769 +77000 +77100 +77195 +77200 +77252 +77330 +77342 +77542 +77600 +77700 +78000 +78003 +78100 +78160 +78207 +78300 +78340 +78400 +78596 +78782 +78870 +79038 +79091 +79100 +79180 +79210 +79300 +79500 +79620 +79622 +79862 +79900 +79960 +80000 +80500 +80570 +80580 +80600 +80622 +80720 +80781 +81000 +81200 +81230 +81280 +81454 +81600 +81700 +81740 +81832 +81855 +81900 +81930 +82000 +82090 +82400 +82500 +82560 +82600 +82650 +82700 +82720 +83013 +83130 +83214 +83370 +83500 +83570 +83580 +83590 +83620 +83720 +83765 +83845 +83919 +84000 +84008 +84052 +84124 +84343 +84490 +84570 +84979 +85000 +85320 +85540 +85560 +85682 +86040 +86175 +86200 +86250 +86570 +86599 +86907 +86920 +87000 +87050 +87198 +87305 +87400 +87420 +87500 +87840 +87900 +88000 +88100 +88150 +88170 +88200 +89089 +89400 +89427 +89500 +89700 +89751 +89830 +89860 +90000 +90105 +90300 +90419 +90500 +90580 +90600 +91022 +91114 +91168 +91200 +91248 +91280 +91400 +91850 +91865 +91900 
+92000 +92200 +92400 +92624 +92643 +92800 +92813 +92870 +92980 +93000 +93080 +93200 +93389 +93499 +93600 +93769 +93790 +94000 +94070 +94130 +94300 +94560 +94569 +94600 +94700 +94930 +95052 +95100 +95160 +95200 +95253 +95300 +95390 +95450 +95580 +95780 +95800 +96000 +96100 +96138 +96200 +96300 +96430 +96495 +96660 +96969 +97150 +97216 +97300 +97400 +97440 +97630 +97640 +97735 +97800 +97832 +97930 +98000 +98100 +98184 +98200 +98660 +98700 +98759 +99000 +99177 +99266 +99297 +99453 +99486 +99851 +99940 +100000 +100100 +100400 +100500 +100880 +100900 +100933 +101200 +101300 +101400 +101480 +101652 +101666 +101685 +101989 +102000 +102200 +102300 +102652 +102787 +103000 +103200 +103240 +103990 +104642 +104871 +105000 +105075 +105200 +105399 +105535 +105740 +105950 +106000 +106160 +106167 +106500 +106772 +107370 +107400 +107401 +107414 +107603 +107657 +107700 +107701 +107986 +108000 +108190 +108200 +108216 +108278 +108490 +108761 +109000 +109020 +109120 +109130 +109361 +110500 +110691 +110940 +110950 +110970 +111460 +111500 +111766 +111800 +111919 +112000 +112450 +112855 +112890 +113170 +113233 +113260 +113689 +113709 +114000 +114030 +114050 +114200 +114446 +114500 +114580 +114726 +114770 +114800 +114902 +115400 +115470 +116000 +117000 +117500 +117605 +117690 +117692 +118000 +118228 +119000 +119782 +119830 +119960 +120151 +120325 +120400 +120600 +120814 +120980 +121246 +121300 +121482 +121813 +121911 +122100 +122120 +122490 +122848 +123360 +123924 +125000 +125294 +125853 +126000 +126100 +126400 +126510 +126581 +126600 +126699 +127400 +128000 +128700 +128760 +128910 +129000 +129100 +129300 +129500 +129600 +129640 +129700 +129930 +130000 +130180 +130200 +130494 +130700 +131000 +131461 +131478 +131491 +131500 +131600 +132000 +132100 +132500 +132900 +134000 +134400 +134900 +135000 +135050 +136000 +136520 +136600 +136790 +137000 +137100 +137700 +138000 +138700 +139000 +139437 +139704 +140000 +140440 +140458 +140540 +140751 +141100 +141119 +141458 +141810 +142000 +142300 +142551 
+142842 +143600 +143884 +143900 +144040 +144995 +145345 +146517 +147404 +147705 +148230 +148303 +148810 +149000 +149073 +149900 +150000 +150551 +150564 +150700 +151000 +151691 +152000 +152023 +153000 +153200 +154000 +154200 +155310 +155909 +156000 +156360 +156490 +156530 +156780 +156900 +157540 +157560 +158000 +158425 +159085 +159352 +160410 +161610 +161700 +161800 +162000 +162100 +162200 +162893 +163700 +163830 +164000 +164068 +164200 +164684 +165000 +166030 +166165 +166400 +167000 +167178 +168146 +168700 +169636 +169910 +170000 +170010 +170701 +172000 +172075 +172189 +172235 +173479 +173975 +174200 +174390 +174530 +175120 +175591 +175640 +175900 +176431 +177000 +177500 +178082 +178400 +179000 +179390 +179900 +180000 +180107 +180590 +180610 +180900 +181000 +181300 +181490 +182000 +182990 +183000 +183050 +183100 +183940 +185554 +186150 +186800 +189000 +189100 +189670 +190000 +191538 +191600 +191781 +192500 +192740 +193200 +193820 +194040 +194309 +194790 +195504 +195824 +195937 +196000 +196110 +196687 +196775 +197000 +197200 +198000 +199950 +200000 +200222 +201270 +201900 +202000 +202562 +203432 +204020 +204390 +204800 +205000 +206000 +206110 +206900 +207000 +207148 +207990 +208150 +209000 +209400 +209678 +210000 +210400 +210684 +211900 +212000 +212200 +212700 +214290 +214330 +214700 +215227 +215500 +216000 +216200 +217000 +217400 +217410 +217700 +218000 +219070 +219552 +220000 +220970 +221000 +221690 +222913 +224000 +224900 +224918 +225000 +225490 +226712 +226833 +227000 +228780 +230000 +230400 +234150 +234410 +236205 +236804 +237300 +237700 +238800 +238860 +239003 +240431 +241048 +242700 +243794 +244794 +245700 +246000 +246190 +247000 +248000 +248240 +249500 +250000 +250300 +250400 +250929 +251579 +253200 +253750 +257100 +257600 +258000 +258123 +258681 +259000 +259570 +260160 +261200 +261750 +261800 +263813 +264600 +264800 +265240 +265500 +265900 +266757 +267120 +267346 +268500 +268742 +269240 +270000 +270132 +271000 +274000 +275000 +275100 +276500 +276620 +279101 
+279140 +280000 +281490 +283570 +284702 +287300 +288440 +290000 +292304 +294000 +294862 +296480 +296600 +299800 +300000 +300500 +301120 +301232 +301868 +302000 +302114 +302270 +306000 +307130 +310000 +310240 +310396 +313591 +316000 +317500 +317741 +318000 +318731 +318999 +319531 +319800 +320000 +320300 +321000 +322520 +323214 +327300 +330000 +335480 +338740 +339000 +339160 +340000 +341930 +342000 +342300 +345410 +346030 +349000 +349700 +350000 +350530 +352000 +353170 +353220 +356000 +357300 +358000 +358170 +362920 +363000 +365000 +370000 +373400 +380000 +382310 +383500 +384220 +385685 +394610 +400000 +402000 +403900 +403942 +404987 +407139 +412130 +412200 +413443 +420400 +423057 +429600 +430480 +431780 +433000 +433150 +433530 +440000 +440700 +442480 +445190 +449046 +450000 +452000 +455509 +463000 +464000 +464730 +468200 +471840 +472850 +472900 +474310 +476780 +477600 +478552 +480000 +485000 +491000 +491500 +493700 +499598 +499700 +500000 +503100 +506005 +506270 +507000 +511700 +514592 +521640 +527620 +530000 +531898 +532400 +539900 +540000 +540126 +545400 +545586 +547638 +550000 +552171 +553500 +555717 +561127 +566690 +571000 +572210 +580000 +583380 +587000 +587860 +589400 +590400 +597000 +600000 +614000 +616000 +617290 +620000 +625000 +630000 +634000 +637422 +643030 +647200 +662200 +662600 +666200 +667016 +670000 +671279 +673600 +674000 +679700 +680000 +682897 +686220 +686730 +690100 +697000 +699030 +700000 +705000 +710000 +718440 +719000 +719570 +731452 +739000 +739640 +747200 +749600 +750000 +755470 +756719 +766900 +777520 +800000 +803800 +810000 +825212 +833300 +838700 +867200 +873134 +879350 +885550 +903000 +910000 +938079 +952000 +956700 +959670 +964000 +993300 +994000 +996900 +1000000 +1010000 +1022000 +1030000 +1050860 +1055492 +1060890 +1097432 +1099300 +1100000 +1102700 +1117000 +1141236 +1170000 +1185900 +1200000 +1207000 +1226450 +1242000 +1253840 +1300000 +1300780 +1332804 +1343800 +1360000 +1400000 +1435225 +1446280 +1459558 +1462100 +1465890 +1530000 
+1556805 +1574400 +1580000 +1605000 +1613800 +1651931 +1668600 +1700000 +1800000 +1816096 +1920362 +1950000 +2000000 +2040100 +2050000 +2190000 +2308000 +2370210 +2400000 +2418000 +2491276 +2500000 +2521000 +2553560 +2610000 +2640969 +2700000 +2893870 +2962000 +3000000 +3053100 +3090000 +3129707 +3179920 +3321107 +3336106 +3406660 +3739270 +3817000 +3835000 +3900000 +4000000 +4032220 +4340000 +4530000 +4554000 +4714500 +5240500 +5338800 +5585700 +5600000 +5655380 +5799000 +6000000 +6249000 +6400000 +6598100 +6700000 +6940000 +7000000 +7400000 +7520000 +7890000 +8000000 +9000000 +9089710 +10000000 +11000000 +17400000 +20200000 +44353022 +75889000 diff --git a/third_party/chinese_text_normalization/thrax/src/number_data/random-tst.txt b/third_party/chinese_text_normalization/thrax/src/number_data/random-tst.txt new file mode 100644 index 000000000..efce19a97 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/number_data/random-tst.txt @@ -0,0 +1,1000 @@ +209 +220 +250 +254 +263 +266 +276 +303 +310 +317 +322 +364 +386 +405 +414 +424 +429 +489 +505 +520 +523 +525 +554 +624 +627 +640 +665 +680 +704 +715 +723 +741 +742 +775 +776 +845 +847 +851 +868 +898 +921 +927 +972 +973 +984 +986 +994 +1038 +1055 +1077 +1079 +1083 +1090 +1123 +1137 +1161 +1184 +1186 +1235 +1257 +1258 +1285 +1302 +1307 +1311 +1358 +1369 +1372 +1383 +1391 +1418 +1441 +1442 +1447 +1476 +1478 +1509 +1535 +1548 +1550 +1571 +1581 +1593 +1615 +1623 +1639 +1660 +1686 +1688 +1717 +1735 +1782 +1813 +1815 +1824 +1831 +1875 +1881 +1924 +1931 +1949 +1951 +1966 +1970 +1984 +1990 +1992 +2012 +2013 +2024 +2040 +2058 +2062 +2064 +2067 +2075 +2116 +2130 +2135 +2171 +2197 +2200 +2215 +2220 +2226 +2246 +2259 +2277 +2294 +2303 +2318 +2342 +2347 +2349 +2355 +2364 +2413 +2419 +2420 +2433 +2441 +2445 +2451 +2468 +2488 +2498 +2499 +2500 +2502 +2514 +2523 +2524 +2557 +2568 +2598 +2609 +2612 +2629 +2685 +2697 +2718 +2724 +2734 +2739 +2760 +2763 +2779 +2796 +2797 +2809 +2818 +2828 +2839 +2842 +2850 +2857 +2864 
+2916 +2923 +2984 +2987 +2991 +2994 +3021 +3025 +3026 +3054 +3070 +3080 +3086 +3098 +3114 +3121 +3130 +3136 +3137 +3157 +3175 +3182 +3200 +3233 +3245 +3250 +3270 +3298 +3303 +3330 +3341 +3347 +3368 +3392 +3394 +3398 +3400 +3427 +3435 +3441 +3449 +3474 +3477 +3497 +3501 +3525 +3526 +3551 +3570 +3576 +3597 +3612 +3630 +3636 +3639 +3649 +3651 +3675 +3692 +3719 +3742 +3773 +3785 +3790 +3850 +3870 +3873 +3875 +3885 +3910 +3926 +3927 +3928 +3941 +3943 +3945 +3950 +3961 +3971 +3990 +3992 +3996 +4010 +4013 +4018 +4024 +4032 +4047 +4065 +4069 +4079 +4089 +4097 +4114 +4125 +4127 +4148 +4155 +4173 +4180 +4206 +4249 +4256 +4284 +4298 +4303 +4305 +4345 +4354 +4409 +4417 +4433 +4437 +4470 +4474 +4486 +4494 +4527 +4538 +4544 +4572 +4629 +4630 +4634 +4647 +4652 +4654 +4658 +4680 +4699 +4747 +4748 +4773 +4791 +4852 +4863 +4884 +4907 +4927 +4943 +4953 +5027 +5032 +5037 +5080 +5095 +5108 +5134 +5163 +5168 +5186 +5210 +5236 +5237 +5265 +5273 +5283 +5330 +5351 +5362 +5396 +5438 +5446 +5465 +5495 +5511 +5526 +5534 +5556 +5567 +5611 +5639 +5642 +5725 +5738 +5751 +5774 +5777 +5786 +5813 +5837 +5864 +5879 +5885 +5889 +5898 +5921 +5924 +5946 +5955 +5959 +5968 +5976 +5981 +6021 +6047 +6049 +6080 +6158 +6162 +6170 +6176 +6206 +6214 +6220 +6243 +6253 +6261 +6284 +6307 +6322 +6330 +6338 +6367 +6413 +6430 +6434 +6437 +6470 +6492 +6499 +6504 +6512 +6660 +6670 +6680 +6699 +6710 +6737 +6741 +6751 +6776 +6779 +6802 +6819 +6890 +6892 +6969 +6970 +7040 +7045 +7052 +7063 +7065 +7088 +7128 +7129 +7133 +7155 +7164 +7166 +7181 +7210 +7219 +7234 +7236 +7256 +7266 +7270 +7303 +7364 +7370 +7378 +7499 +7593 +7629 +7633 +7640 +7675 +7709 +7753 +7791 +7792 +7812 +7838 +7860 +7890 +7972 +8014 +8025 +8096 +8106 +8123 +8154 +8159 +8200 +8228 +8343 +8381 +8429 +8490 +8515 +8526 +8560 +8568 +8579 +8658 +8668 +8672 +8688 +8710 +8731 +8739 +8752 +8771 +8790 +8833 +8900 +8917 +8929 +9002 +9035 +9043 +9067 +9078 +9122 +9138 +9144 +9183 +9199 +9211 +9235 +9240 +9257 +9330 +9385 +9390 +9450 +9512 +9523 +9530 +9535 +9564 
+9596 +9601 +9602 +9603 +9626 +9655 +9691 +9695 +9772 +9780 +9808 +9849 +9881 +9911 +9923 +9946 +9970 +9986 +10009 +10019 +10168 +10178 +10180 +10190 +10290 +10348 +10470 +10520 +10525 +10535 +10545 +10627 +10675 +10715 +10757 +10772 +10786 +10896 +10940 +10970 +11000 +11101 +11120 +11132 +11192 +11201 +11209 +11265 +11337 +11392 +11549 +11557 +11567 +11736 +11767 +11807 +11814 +11866 +11881 +11913 +12073 +12098 +12111 +12137 +12291 +12370 +12376 +12397 +12435 +12439 +12443 +12511 +12520 +12567 +12575 +12615 +12700 +12710 +12726 +12729 +12814 +12822 +12883 +12890 +12910 +12915 +12980 +13069 +13075 +13127 +13193 +13209 +13386 +13390 +13393 +13511 +13586 +13607 +13625 +13630 +13647 +13656 +13763 +13810 +13910 +13979 +13991 +14073 +14096 +14111 +14170 +14210 +14259 +14306 +14350 +14351 +14360 +14479 +14587 +14613 +14736 +14745 +14797 +14810 +14822 +14824 +14830 +15020 +15068 +15118 +15197 +15230 +15270 +15310 +15404 +15510 +15603 +15680 +15700 +15721 +15820 +15928 +15990 +16012 +16018 +16030 +16073 +16123 +16243 +16275 +16501 +16690 +16710 +16765 +16870 +16958 +17014 +17030 +17138 +17190 +17272 +17409 +17424 +17430 +17477 +17678 +17684 +17687 +17820 +17840 +17898 +18097 +18219 +18284 +18349 +18525 +18634 +18680 +19042 +19070 +19084 +19120 +19151 +19250 +19389 +19679 +19932 +20080 +20100 +20133 +20321 +20440 +20801 +20819 +20969 +21190 +21300 +21340 +21350 +21360 +21490 +21531 +21640 +21728 +21796 +21831 +21860 +22040 +22208 +22282 +22410 +22566 +22850 +23060 +23196 +23380 +24190 +24350 +24360 +24380 +24475 +24480 +24491 +24521 +24644 +24695 +24747 +24760 +24945 +25000 +25510 +25754 +25870 +26200 +26300 +26410 +26447 +26472 +26510 +27000 +27017 +27400 +27430 +27531 +27600 +27740 +27870 +28200 +28544 +28570 +28618 +28629 +28716 +28753 +28850 +29027 +29040 +29045 +29129 +29190 +29404 +29600 +29970 +30030 +30050 +30190 +30375 +30500 +30700 +30778 +30790 +30838 +31310 +31379 +31480 +31547 +31698 +31986 +32600 +32991 +33417 +33603 +34751 +34900 +34980 +35059 +35101 +35190 
+35496 +35500 +35707 +35761 +36320 +36496 +36893 +37200 +37520 +37780 +38370 +38500 +38600 +39200 +39575 +39580 +40324 +40560 +41222 +41300 +41485 +41973 +43110 +43229 +44097 +44550 +44666 +45078 +45085 +45090 +45600 +46170 +46772 +47060 +48280 +48500 +48518 +49400 +49430 +50100 +50167 +50359 +50800 +51386 +51390 +51531 +51800 +52092 +52100 +52590 +52663 +52670 +52738 +52990 +53025 +53450 +53600 +53620 +54070 +54505 +56160 +56165 +57100 +57730 +58825 +58900 +60151 +60500 +61306 +61710 +62250 +62270 +62400 +63310 +63960 +64235 +64760 +65200 +65654 +66240 +66400 +66600 +68670 +68920 +71000 +71400 +72630 +72700 +72860 +73700 +75841 +76108 +77122 +79220 +79400 +79670 +81110 +83574 +84100 +84500 +86090 +87078 +87300 +87860 +88340 +88880 +89154 +89950 +92600 +96220 +96870 +97503 +99600 +101000 +104000 +105100 +105570 +106900 +108290 +108400 +110840 +110975 +113773 +115000 +116500 +119200 +124720 +127000 +127780 +128200 +128966 +138900 +140900 +141000 +141228 +144000 +145000 +145061 +147245 +147562 +148450 +152218 +154990 +158775 +159940 +161000 +161300 +163500 +165500 +170559 +176000 +178000 +184000 +188800 +196100 +204400 +204880 +210900 +216616 +220930 +238000 +239740 +257226 +265000 +271590 +273200 +285810 +309620 +315612 +320959 +321500 +341400 +348697 +350260 +359030 +360000 +360600 +376500 +378265 +383070 +394740 +410000 +446000 +471750 +497384 +510600 +560000 +590000 +608400 +696900 +704000 +1448374 +2256800 +3275000 +3980000 +4500000 +5066940 +5166299 +7113500 +9842447 +13020696 +70477170 diff --git a/third_party/chinese_text_normalization/thrax/src/ru/README.md b/third_party/chinese_text_normalization/thrax/src/ru/README.md new file mode 100644 index 000000000..c02d2935d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/README.md @@ -0,0 +1,6 @@ +# Russian covering grammar definitions + +This directory defines a Russian text normalization covering grammar. 
The +primary entry-point is the FST `VERBALIZER`, defined in +`verbalizer/verbalizer.grm` and compiled in the FST archive +`verbalizer/verbalizer.far`. diff --git a/third_party/chinese_text_normalization/thrax/src/ru/classifier/cyrillic.grm b/third_party/chinese_text_normalization/thrax/src/ru/classifier/cyrillic.grm new file mode 100644 index 000000000..0672e45a1 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/classifier/cyrillic.grm @@ -0,0 +1,58 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +export kRussianLowerAlpha = Optimize[ + "а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" | "и" | "й" | + "к" | "л" | "м" | "н" | "о" | "п" | "р" | "с" | "т" | "у" | "ф" | + "х" | "ц" | "ч" | "ш" | "щ" | "ъ" | "ы" | "ь" | "э" | "ю" | "я" ]; + +export kRussianUpperAlpha = Optimize[ + "А" | "Б" | "В" | "Г" | "Д" | "Е" | "Ё" | "Ж" | "З" | "И" | "Й" | + "К" | "Л" | "М" | "Н" | "О" | "П" | "Р" | "С" | "Т" | "У" | "Ф" | + "Х" | "Ц" | "Ч" | "Ш" | "Щ" | "Ъ" | "Ы" | "Ь" | "Э" | "Ю" | "Я" ]; + +export kRussianLowerAlphaStressed = Optimize[ + "а́" | "е́" | "ё́" | "и́" | "о́" | "у́" | "ы́" | "э́" | "ю́" | "я́" ]; + +export kRussianUpperAlphaStressed = Optimize[ + "А́" | "Е́" | "Ё́" | "И́" | "О́" | "У́" | "Ы́" | "Э́" | "Ю́" | "Я́" ]; + +export kRussianRewriteStress = Optimize[ + ("А́" : "А'") | ("Е́" : "Е'") | ("Ё́" : "Ё'") | ("И́" : "И'") | + ("О́" : "О'") | ("У́" : "У'") | ("Ы́" : "Ы'") | ("Э́" : "Э'") | + ("Ю́" : "Ю'") | ("Я́" : "Я'") | + ("а́" : "а'") | ("е́" : "е'") | ("ё́" : "ё'") | ("и́" : "и'") | + ("о́" : "о'") | ("у́" : "у'") | ("ы́" : "ы'") | ("э́" : "э'") | + ("ю́" : "ю'") | ("я́" : "я'") +]; + +export kRussianRemoveStress = Optimize[ + ("А́" : "А") | ("Е́" : "Е") | ("Ё́" : "Ё") | ("И́" : "И") | ("О́" : "О") | + ("У́" : "У") | ("Ы́" : "Ы") | ("Э́" : "Э") | ("Ю́" : "Ю") | ("Я́" : "Я") | + ("а́" : "а") | ("е́" : "е") | ("ё́" : "ё") | ("и́" : "и") | ("о́" : "о") | + ("у́" : "у") | ("ы́" : "ы") | ("э́" : "э") | ("ю́" : "ю") | ("я́" : "я") +]; + +# Pre-reform characters, just in case. 
+export kRussianPreReform = Optimize[ + "ѣ" | "Ѣ" # http://en.wikipedia.org/wiki/Yat +]; + +export kCyrillicAlphaStressed = Optimize[ + kRussianLowerAlphaStressed | kRussianUpperAlphaStressed +]; + +export kCyrillicAlpha = Optimize[ + kRussianLowerAlpha | kRussianUpperAlpha | kRussianPreReform +]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals-lex.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals-lex.grm new file mode 100644 index 000000000..c07a7ae1c --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals-lex.grm @@ -0,0 +1,338 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# AUTOMATICALLY GENERATED: DO NOT EDIT. +import 'util/byte.grm' as b; + +# Utilities for insertion and deletion. + +func I[expr] { + return "" : expr; +} + +func D[expr] { + return expr : ""; +} + +# Powers of base 10. 
+export POWERS = + "[E15]" + | "[E14]" + | "[E13]" + | "[E12]" + | "[E11]" + | "[E10]" + | "[E9]" + | "[E8]" + | "[E7]" + | "[E6]" + | "[E5]" + | "[E4]" + | "[E3]" + | "[E2]" + | "[E1]" +; + +export SIGMA = b.kBytes | POWERS; + +export SIGMA_STAR = SIGMA*; + +export SIGMA_PLUS = SIGMA+; + +################################################################################ +# BEGIN LANGUAGE SPECIFIC DATA +revaluations = + ("[E4]" : "[E1]") + | ("[E5]" : "[E2]") + | ("[E7]" : "[E1]") + | ("[E8]" : "[E2]") +; + +Ms = "[E3]" | "[E6]" | "[E9]"; + + +func Zero[expr] { + return expr : (""); +} + +space = " "; + +lexset3 = Optimize[ + ("1[E1]+1" : "одиннадцати") + | ("1[E1]+1" : "одиннадцать") + | ("1[E1]+1" : "одиннадцатью") + | ("1[E1]+2" : "двенадцати") + | ("1[E1]+2" : "двенадцать") + | ("1[E1]+2" : "двенадцатью") + | ("1[E1]+3" : "тринадцати") + | ("1[E1]+3" : "тринадцать") + | ("1[E1]+3" : "тринадцатью") + | ("1[E1]+4" : "четырнадцати") + | ("1[E1]+4" : "четырнадцать") + | ("1[E1]+4" : "четырнадцатью") + | ("1[E1]+5" : "пятнадцати") + | ("1[E1]+5" : "пятнадцать") + | ("1[E1]+5" : "пятнадцатью") + | ("1[E1]+6" : "шестнадцати") + | ("1[E1]+6" : "шестнадцать") + | ("1[E1]+6" : "шестнадцатью") + | ("1[E1]+7" : "семнадцати") + | ("1[E1]+7" : "семнадцать") + | ("1[E1]+7" : "семнадцатью") + | ("1[E1]+8" : "восемнадцати") + | ("1[E1]+8" : "восемнадцать") + | ("1[E1]+8" : "восемнадцатью") + | ("1[E1]+9" : "девятнадцати") + | ("1[E1]+9" : "девятнадцать") + | ("1[E1]+9" : "девятнадцатью")] +; + +lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR]; + +lexset2 = Optimize[ + ("1[E1]" : "десяти") + | ("1[E1]" : "десять") + | ("1[E1]" : "десятью") + | ("1[E2]" : "ста") + | ("1[E2]" : "сто") + | ("2[E1]" : "двадцати") + | ("2[E1]" : "двадцать") + | ("2[E1]" : "двадцатью") + | ("2[E2]" : "двести") + | ("2[E2]" : "двумстам") + | ("2[E2]" : "двумястами") + | ("2[E2]" : "двухсот") + | ("2[E2]" : "двухстах") + | ("3[E1]" : "тридцати") + | ("3[E1]" : "тридцать") + | ("3[E1]" : "тридцатью") 
+ | ("3[E2]" : "тремстам") + | ("3[E2]" : "тремястами") + | ("3[E2]" : "трехсот") + | ("3[E2]" : "трехстах") + | ("3[E2]" : "триста") + | ("4[E1]" : "сорок") + | ("4[E1]" : "сорока") + | ("4[E2]" : "четыремстам") + | ("4[E2]" : "четыреста") + | ("4[E2]" : "четырехсот") + | ("4[E2]" : "четырехстах") + | ("4[E2]" : "четырьмястами") + | ("5[E1]" : "пятидесяти") + | ("5[E1]" : "пятьдесят") + | ("5[E1]" : "пятьюдесятью") + | ("5[E2]" : "пятисот") + | ("5[E2]" : "пятистам") + | ("5[E2]" : "пятистах") + | ("5[E2]" : "пятьсот") + | ("5[E2]" : "пятьюстами") + | ("6[E1]" : "шестидесяти") + | ("6[E1]" : "шестьдесят") + | ("6[E1]" : "шестьюдесятью") + | ("6[E2]" : "шестисот") + | ("6[E2]" : "шестистам") + | ("6[E2]" : "шестистах") + | ("6[E2]" : "шестьсот") + | ("6[E2]" : "шестьюстами") + | ("7[E1]" : "семидесяти") + | ("7[E1]" : "семьдесят") + | ("7[E1]" : "семьюдесятью") + | ("7[E2]" : "семисот") + | ("7[E2]" : "семистам") + | ("7[E2]" : "семистах") + | ("7[E2]" : "семьсот") + | ("7[E2]" : "семьюстами") + | ("8[E1]" : "восемьдесят") + | ("8[E1]" : "восьмидесяти") + | ("8[E1]" : "восьмьюдесятью") + | ("8[E2]" : "восемьсот") + | ("8[E2]" : "восемьюстами") + | ("8[E2]" : "восьмисот") + | ("8[E2]" : "восьмистам") + | ("8[E2]" : "восьмистах") + | ("8[E2]" : "восьмьюстами") + | ("9[E1]" : "девяноста") + | ("9[E1]" : "девяносто") + | ("9[E2]" : "девятисот") + | ("9[E2]" : "девятистам") + | ("9[E2]" : "девятистах") + | ("9[E2]" : "девятьсот") + | ("9[E2]" : "девятьюстами")] +; + +lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR]; + +lexset1 = Optimize[ + ("+" : "") + | ("1" : "один") + | ("1" : "одна") + | ("1" : "одни") + | ("1" : "одним") + | ("1" : "одними") + | ("1" : "одних") + | ("1" : "одно") + | ("1" : "одного") + | ("1" : "одной") + | ("1" : "одном") + | ("1" : "одному") + | ("1" : "одною") + | ("1" : "одну") + | ("2" : "два") + | ("2" : "две") + | ("2" : "двум") + | ("2" : "двумя") + | ("2" : "двух") + | ("3" : "трем") + | ("3" : "тремя") + | ("3" : "трех") + | ("3" : 
"три") + | ("4" : "четыре") + | ("4" : "четырем") + | ("4" : "четырех") + | ("4" : "четырьмя") + | ("5" : "пяти") + | ("5" : "пять") + | ("5" : "пятью") + | ("6" : "шести") + | ("6" : "шесть") + | ("6" : "шестью") + | ("7" : "семи") + | ("7" : "семь") + | ("7" : "семью") + | ("8" : "восемь") + | ("8" : "восьми") + | ("8" : "восьмью") + | ("9" : "девяти") + | ("9" : "девять") + | ("9" : "девятью") + | ("[E3]" : "тысяч") + | ("[E3]" : "тысяча") + | ("[E3]" : "тысячам") + | ("[E3]" : "тысячами") + | ("[E3]" : "тысячах") + | ("[E3]" : "тысяче") + | ("[E3]" : "тысячей") + | ("[E3]" : "тысячи") + | ("[E3]" : "тысячу") + | ("[E3]" : "тысячью") + | ("[E6]" : "миллион") + | ("[E6]" : "миллиона") + | ("[E6]" : "миллионам") + | ("[E6]" : "миллионами") + | ("[E6]" : "миллионах") + | ("[E6]" : "миллионе") + | ("[E6]" : "миллионов") + | ("[E6]" : "миллионом") + | ("[E6]" : "миллиону") + | ("[E6]" : "миллионы") + | ("[E9]" : "миллиард") + | ("[E9]" : "миллиарда") + | ("[E9]" : "миллиардам") + | ("[E9]" : "миллиардами") + | ("[E9]" : "миллиардах") + | ("[E9]" : "миллиарде") + | ("[E9]" : "миллиардов") + | ("[E9]" : "миллиардом") + | ("[E9]" : "миллиарду") + | ("[E9]" : "миллиарды") + | ("|0|" : "ноле") + | ("|0|" : "нолем") + | ("|0|" : "ноль") + | ("|0|" : "нолю") + | ("|0|" : "ноля") + | ("|0|" : "нуле") + | ("|0|" : "нулем") + | ("|0|" : "нуль") + | ("|0|" : "нулю") + | ("|0|" : "нуля")] +; + +lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR]; + +export LEX = Optimize[lex3 @ lex2 @ lex1]; + +export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]"; + +# END LANGUAGE SPECIFIC DATA +################################################################################ +# Inserts a marker after the Ms. +export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR]; + +# Deletes all powers and "+". 
+export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR]; + +# Deletes leading zeros at the beginning of a number, so that "0003" does not +# get treated as an ordinary number. +export DELETE_INITIAL_ZEROS = + CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR] +; + +NonMs = Optimize[POWERS - Ms]; + +# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted. +export DELETE_INTERMEDIATE_ZEROS1 = + CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR] +; + +# Deletes (usually) zeros before an M, if there is no non-zero element between +# that and the previous boundary. Thus, if after the result of the rule above we +# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final +# zero. +export DELETE_INTERMEDIATE_ZEROS2 = Optimize[ + CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR] + @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]] +; + +# Final clean up of stray zeros. +export DELETE_REMAINING_ZEROS = Optimize[ + CDRewrite[Zero["+0"], "", "", SIGMA_STAR] + @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]] +; + +# Applies the revaluation map. For example in English, changes [E4] to [E1] as a +# modifier of [E3]. +export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR]; + +# Deletes the various marks and powers in the input and output. +export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR]; + +export CLEAN_SPACES = Optimize[ + CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR] + @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR] + @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]] +; + +d = b.kDigit; + +# Germanic inversion rule. 
+germanic = + (I["1+"] d "[E1]" D["+1"]) + | (I["2+"] d "[E1]" D["+2"]) + | (I["3+"] d "[E1]" D["+3"]) + | (I["4+"] d "[E1]" D["+4"]) + | (I["5+"] d "[E1]" D["+5"]) + | (I["6+"] d "[E1]" D["+6"]) + | (I["7+"] d "[E1]" D["+7"]) + | (I["8+"] d "[E1]" D["+8"]) + | (I["9+"] d "[E1]" D["+9"]) +; + +germanic_inversion = + CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt'] +; + +export GERMANIC_INVERSION = SIGMA_STAR; +export ORDINAL_RESTRICTION = SIGMA_STAR; +nondigits = b.kBytes - b.kDigit; +export ORDINAL_SUFFIX = D[nondigits*]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals.tsv new file mode 100644 index 000000000..484a5c8a7 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals.tsv @@ -0,0 +1,177 @@ +0 ноле +0 ноль +0 нолю +0 ноля +0 нолём +0 нуле +0 нуль +0 нулю +0 нуля +0 нулём +1 один +1 одна +1 одни +1 одним +1 одними +1 одних +1 одно +1 одного +1 одной +1 одном +1 одному +1 одною +1 раз +1 одну +2 два +2 две +2 двум +2 двумя +2 двух +3 тремя +3 три +3 трём +3 трёх +4 четыре +4 четырьмя +4 четырём +4 четырёх +5 пяти +5 пять +5 пятью +6 шести +6 шесть +6 шестью +7 семи +7 семь +7 семью +8 восемь +8 восьми +8 восьмью +9 девяти +9 девять +9 девятью +10 десяти +10 десять +10 десятью +11 одиннадцати +11 одиннадцать +11 одиннадцатью +12 двенадцати +12 двенадцать +12 двенадцатью +13 тринадцати +13 тринадцать +13 тринадцатью +14 четырнадцати +14 четырнадцать +14 четырнадцатью +15 пятнадцати +15 пятнадцать +15 пятнадцатью +16 шестнадцати +16 шестнадцать +16 шестнадцатью +17 семнадцати +17 семнадцать +17 семнадцатью +18 восемнадцати +18 восемнадцать +18 восемнадцатью +19 девятнадцати +19 девятнадцать +19 девятнадцатью +20 двадцати +20 двадцать +20 двадцатью +30 тридцати +30 тридцать +30 тридцатью +40 сорок +40 сорока +50 пятидесяти +50 пятьдесят +50 пятьюдесятью +60 шестидесяти +60 шестьдесят +60 шестьюдесятью +70 
семидесяти +70 семьдесят +70 семьюдесятью +80 восемьдесят +80 восьмидесяти +80 восьмьюдесятью +90 девяноста +90 девяносто +100 ста +100 сто +200 двести +200 двумстам +200 двумястами +200 двухсот +200 двухстах +300 тремястами +300 трехсот +300 триста +300 трёмстам +300 трёхстах +400 четыреста +400 четырьмястами +400 четырёмстам +400 четырёхсот +400 четырёхстах +500 пятисот +500 пятистам +500 пятистах +500 пятьсот +500 пятьюстами +600 шестисот +600 шестистам +600 шестистах +600 шестьсот +600 шестьюстами +700 семисот +700 семистам +700 семистах +700 семьсот +700 семьюстами +800 восемьсот +800 восемьюстами +800 восьмисот +800 восьмистам +800 восьмистах +800 восьмьюстами +900 девятисот +900 девятистам +900 девятистах +900 девятьсот +900 девятьюстами +1000 тысяч +1000 тысяча +1000 тысячам +1000 тысячами +1000 тысячах +1000 тысяче +1000 тысячей +1000 тысячи +1000 тысячу +1000 тысячью +1000000 миллион +1000000 миллиона +1000000 миллионам +1000000 миллионами +1000000 миллионах +1000000 миллионе +1000000 миллионов +1000000 миллионом +1000000 миллиону +1000000 миллионы +1000000000 миллиард +1000000000 миллиарда +1000000000 миллиардам +1000000000 миллиардами +1000000000 миллиардах +1000000000 миллиарде +1000000000 миллиардов +1000000000 миллиардом +1000000000 миллиарду +1000000000 миллиарды diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/extra_numbers.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/extra_numbers.grm new file mode 100644 index 000000000..644f30dff --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/extra_numbers.grm @@ -0,0 +1,35 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/byte.grm' as b; +import 'ru/verbalizer/numbers.grm' as n; + +digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@"); + +export DIGITS = digit (n.I[" "] digit)*; + +# Various common factorizations + +two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS; + +three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS; + +mixed = + (digit n.I[" "] two_digits) + | (two_digits n.I[" "] two_digits) + | (two_digits n.I[" "] three_digits) + | (two_digits n.I[" "] two_digits n.I[" "] two_digits) +; + +export MIXED_NUMBERS = Optimize[mixed]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/factorization.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/factorization.grm new file mode 100644 index 000000000..860161463 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/factorization.grm @@ -0,0 +1,40 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; +import 'util/util.grm' as u; +import 'ru/verbalizer/numbers.grm' as n; + +func ToNumberName[expr] { + number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*; + return Optimize[expr @ number_name_seq]; +} + +d = b.kDigit; + +leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*]; + +by_ones = d n.I[" "]; +by_twos = (d{2} @ leading_zero) n.I[" "]; +by_threes = (d{3} @ leading_zero) n.I[" "]; + +groupings = by_twos* (by_threes | by_twos | by_ones); + +export FRACTIONAL_PART_UNGROUPED = + Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]] +; +export FRACTIONAL_PART_GROUPED = + Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]] +; +export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/float.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/float.grm new file mode 100644 index 000000000..c608507a7 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/float.grm @@ -0,0 +1,30 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'ru/verbalizer/factorization.grm' as f; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED; +fractional_part_grouped = f.FRACTIONAL_PART_GROUPED; +fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED; + +__fractional_part__ = fractional_part_unparsed; +__decimal_marker__ = ","; + +export FLOAT = Optimize[ + (n.CARDINAL_NUMBERS + (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ") + __fractional_part__) @ l.LEXICAL_MAP] +; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/g.fst b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/g.fst new file mode 100644 index 000000000..66665f390 Binary files /dev/null and b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/g.fst differ diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.grm new file mode 100644 index 000000000..e7bb32b0b --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.grm @@ -0,0 +1,25 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; + +lexical_map = StringFile['ru/verbalizer/lexical_map.tsv']; + +sigma_star = b.kBytes*; + +del_null = CDRewrite["__NULL__" : "", "", "", sigma_star]; + +export LEXICAL_MAP = Optimize[ + CDRewrite[lexical_map, "", "", sigma_star] @ del_null] +; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.tsv new file mode 100644 index 000000000..b78cf73df --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.tsv @@ -0,0 +1,221 @@ +@@CONNECTOR_RANGE@@ до +@@CONNECTOR_RATIO@@ к +@@CONNECTOR_BY@@ на +@@CONNECTOR_CONSECUTIVE_YEAR@@ до +@@JANUARY@@ январь +@@JANUARY@@ январи +@@JANUARY@@ января +@@JANUARY@@ январей +@@JANUARY@@ январю +@@JANUARY@@ январям +@@JANUARY@@ январь +@@JANUARY@@ январи +@@JANUARY@@ январём +@@JANUARY@@ январями +@@JANUARY@@ январе +@@JANUARY@@ январях +@@FEBRUARY@@ февраль +@@FEBRUARY@@ феврали +@@FEBRUARY@@ февраля +@@FEBRUARY@@ февралей +@@FEBRUARY@@ февралю +@@FEBRUARY@@ февралям +@@FEBRUARY@@ февраль +@@FEBRUARY@@ феврали +@@FEBRUARY@@ февралём +@@FEBRUARY@@ февралями +@@FEBRUARY@@ феврале +@@FEBRUARY@@ февралях +@@MARCH@@ март +@@MARCH@@ марты +@@MARCH@@ марта +@@MARCH@@ мартов +@@MARCH@@ марту +@@MARCH@@ мартам +@@MARCH@@ март +@@MARCH@@ марты +@@MARCH@@ мартом +@@MARCH@@ мартами +@@MARCH@@ марте +@@MARCH@@ мартах +@@APRIL@@ апрель +@@APRIL@@ апрели +@@APRIL@@ апреля +@@APRIL@@ апрелей +@@APRIL@@ апрелю +@@APRIL@@ апрелям +@@APRIL@@ апрель +@@APRIL@@ апрели +@@APRIL@@ апрелем +@@APRIL@@ апрелями +@@APRIL@@ апреле +@@APRIL@@ апрелях +@@MAY@@ май +@@MAY@@ маи +@@MAY@@ мая +@@MAY@@ маев +@@MAY@@ маю +@@MAY@@ маям +@@MAY@@ май +@@MAY@@ маи +@@MAY@@ маем +@@MAY@@ маями +@@MAY@@ мае +@@MAY@@ маях +@@JUN@@ июнь +@@JUN@@ июни +@@JUN@@ июня +@@JUN@@ июней +@@JUN@@ июню +@@JUN@@ июням +@@JUN@@ июнь +@@JUN@@ июни +@@JUN@@ июнем +@@JUN@@ июнями +@@JUN@@ июне +@@JUN@@ июнях 
+@@JUL@@ июль +@@JUL@@ июли +@@JUL@@ июля +@@JUL@@ июлей +@@JUL@@ июлю +@@JUL@@ июлям +@@JUL@@ июль +@@JUL@@ июли +@@JUL@@ июлем +@@JUL@@ июлями +@@JUL@@ июле +@@JUL@@ июлях +@@AUGUST@@ август +@@AUGUST@@ августы +@@AUGUST@@ августа +@@AUGUST@@ августов +@@AUGUST@@ августу +@@AUGUST@@ августам +@@AUGUST@@ август +@@AUGUST@@ августы +@@AUGUST@@ августом +@@AUGUST@@ августами +@@AUGUST@@ августе +@@AUGUST@@ августах +@@SEPTEMBER@@ сентябрь +@@SEPTEMBER@@ сентябри +@@SEPTEMBER@@ сентября +@@SEPTEMBER@@ сентябрей +@@SEPTEMBER@@ сентябрю +@@SEPTEMBER@@ сентябрям +@@SEPTEMBER@@ сентябрь +@@SEPTEMBER@@ сентябри +@@SEPTEMBER@@ сентябрём +@@SEPTEMBER@@ сентябрями +@@SEPTEMBER@@ сентябре +@@SEPTEMBER@@ сентябрях +@@OCTOBER@@ октябрь +@@OCTOBER@@ октябри +@@OCTOBER@@ октября +@@OCTOBER@@ октябрей +@@OCTOBER@@ октябрю +@@OCTOBER@@ октябрям +@@OCTOBER@@ октябрь +@@OCTOBER@@ октябри +@@OCTOBER@@ октябрём +@@OCTOBER@@ октябрями +@@OCTOBER@@ октябре +@@OCTOBER@@ октябрях +@@NOVEMBER@@ ноябрь +@@NOVEMBER@@ ноябри +@@NOVEMBER@@ ноября +@@NOVEMBER@@ ноябрей +@@NOVEMBER@@ ноябрю +@@NOVEMBER@@ ноябрям +@@NOVEMBER@@ ноябрь +@@NOVEMBER@@ ноябри +@@NOVEMBER@@ ноябрём +@@NOVEMBER@@ ноябрями +@@NOVEMBER@@ ноябре +@@NOVEMBER@@ ноябрях +@@DECEMBER@@ декабрь +@@DECEMBER@@ декабри +@@DECEMBER@@ декабря +@@DECEMBER@@ декабрей +@@DECEMBER@@ декабрю +@@DECEMBER@@ декабрям +@@DECEMBER@@ декабрь +@@DECEMBER@@ декабри +@@DECEMBER@@ декабрём +@@DECEMBER@@ декабрями +@@DECEMBER@@ декабре +@@DECEMBER@@ декабрях +@@MINUS@@ минус +@@DECIMAL_DOT_EXPRESSION@@ целая +@@DECIMAL_DOT_EXPRESSION@@ целой +@@DECIMAL_DOT_EXPRESSION@@ целой +@@DECIMAL_DOT_EXPRESSION@@ целую +@@DECIMAL_DOT_EXPRESSION@@ целой +@@DECIMAL_DOT_EXPRESSION@@ целой +@@DECIMAL_DOT_EXPRESSION@@ целым +@@DECIMAL_DOT_EXPRESSION@@ целыми +@@DECIMAL_DOT_EXPRESSION@@ целых +@@DECIMAL_DOT_EXPRESSION@@ целых +@@URL_DOT_EXPRESSION@@ точка +@@PERIOD@@ точка +@@DECIMAL_EXPONENT@@ умножить на десять в степени +@@COLON@@ двоеточие +@@SLASH@@ косая черта 
+@@PASSWORD@@ пароль +@@AT@@ собака +@@PORT@@ порт +@@QUESTION_MARK@@ вопросительный знак +@@HASH@@ решётка +@@HASH@@ решетка +@@MONEY_AND@@ и +@@AND@@ и +@@PHONE_PLUS@@ плюс +@@ARITHMETIC_PLUS@@ плюс +@@PHONE_EXTENSION@@ добавочный номер +@@TIME_AM@@ утра +@@TIME_PM@@ вечера +@@HOUR@@ час +@@HOUR@@ часа +@@HOUR@@ часам +@@HOUR@@ часами +@@HOUR@@ часах +@@HOUR@@ часе +@@HOUR@@ часов +@@HOUR@@ часом +@@HOUR@@ часу +@@HOUR@@ часы +@@MINUTE@@ минут +@@MINUTE@@ минута +@@MINUTE@@ минутам +@@MINUTE@@ минутами +@@MINUTE@@ минутах +@@MINUTE@@ минуте +@@MINUTE@@ минутой +@@MINUTE@@ минутою +@@MINUTE@@ минуту +@@MINUTE@@ минуты +@@TIME_AFTER@@ __NULL__ +@@TIME_BEFORE_PRE@@ без +@@TIME_QUARTER@@ четверть +@@TIME_QUARTER@@ четверти +@@TIME_HALF@@ половина +@@TIME_HALF@@ половины +@@TIME_HALF@@ половину +@@TIME_HALF@@ половин +@@TIME_HALF@@ половине +@@TIME_HALF@@ половинам +@@TIME_HALF@@ половиной +@@TIME_HALF@@ половинами +@@TIME_HALF@@ половинах +@@PERCENT@@ процент +@@PERCENT@@ процента +@@PERCENT@@ процентам +@@PERCENT@@ процентами +@@PERCENT@@ процентах +@@PERCENT@@ проценте +@@PERCENT@@ процентов +@@PERCENT@@ процентом +@@PERCENT@@ проценту +@@PERCENT@@ проценты +@@PERCENT@@ проценты diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/math.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/math.grm new file mode 100644 index 000000000..061de4a78 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/math.grm @@ -0,0 +1,34 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import 'ru/verbalizer/float.grm' as f; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +float = f.FLOAT; +card = n.CARDINAL_NUMBERS; +number = card | float; + +plus = "+" : " @@ARITHMETIC_PLUS@@ "; +times = "*" : " @@ARITHMETIC_TIMES@@ "; +minus = "-" : " @@ARITHMETIC_MINUS@@ "; +division = "/" : " @@ARITHMETIC_DIVISION@@ "; + +operator = plus | times | minus | division; + +percent = "%" : " @@PERCENT@@"; + +export ARITHMETIC = + Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP] +; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/miscellaneous.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/miscellaneous.grm new file mode 100644 index 000000000..352363106 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/miscellaneous.grm @@ -0,0 +1,78 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/byte.grm' as b; +import 'ru/classifier/cyrillic.grm' as c; +import 'ru/verbalizer/extra_numbers.grm' as e; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; +import 'ru/verbalizer/spelled.grm' as s; + +letter = b.kAlpha | c.kCyrillicAlpha; +dash = "-"; +word = letter+; +possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?; + +post_word_symbol = + ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) | + ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) | + ("*" : "@@STAR@@") +; + +pre_word_symbol = + ("@" : "@@AT@@") | + ("/" : "@@SLASH@@") | + ("#" : "@@HASH@@") +; + +post_word = possibly_split_word n.I[" "] post_word_symbol; + +pre_word = pre_word_symbol n.I[" "] possibly_split_word; + +## Number/digit sequence combos, maybe with a dash + +spelled_word = word @ s.SPELLED_NO_LETTER; + +word_number = + (word | spelled_word) + (n.I[" "] | (dash : " ")) + (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) +; + +number_word = + (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) + (n.I[" "] | (dash : " ")) + (word | spelled_word) +; + +## Two-digit year. + +# Note that in this case to be fair we really have to allow ordinals too since +# in some languages that's what you would have. + +two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS)); + +dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com"; + +miscellaneous = Optimize[ + possibly_split_word + | post_word + | pre_word + | word_number + | number_word + | two_digit_year + | dot_com +]; + +export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.grm new file mode 100644 index 000000000..ddea02431 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.grm @@ -0,0 +1,44 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/byte.grm' as b; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +card = n.CARDINAL_NUMBERS; + +__currency__ = StringFile['ru/verbalizer/money.tsv']; + +d = b.kDigit; +D = d - "0"; + +cents = ((n.D["0"] | D) d) @ card; + +# Only dollar for the verbalizer tests for English. Will need to add other +# currencies. +usd_maj = Project["usd_maj" @ __currency__, 'output']; +usd_min = Project["usd_min" @ __currency__, 'output']; +and = " @@MONEY_AND@@ " | " "; + +dollar1 = + n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min] +; + +dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"]; + +dollar3 = n.D["$"] card n.I[" " usd_maj]; + +dollar = Optimize[dollar1 | dollar2 | dollar3]; + +export MONEY = Optimize[dollar @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.tsv new file mode 100644 index 000000000..184ea8fe7 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.tsv @@ -0,0 +1,24 @@ +usd_maj доллара +usd_maj долларами +usd_maj долларам +usd_maj долларах +usd_maj долларе +usd_maj долларов +usd_maj долларом +usd_maj доллар +usd_maj доллар +usd_maj доллару +usd_maj доллары +usd_maj доллары +usd_min цент +usd_min цент +usd_min цента +usd_min центам +usd_min центами +usd_min центах +usd_min 
центе +usd_min центов +usd_min центом +usd_min центу +usd_min центы +usd_min центы diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/nominatives.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/nominatives.tsv new file mode 100644 index 000000000..fdfb61038 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/nominatives.tsv @@ -0,0 +1,166 @@ +нуль +ноль +один +два +две +три +четыре +пять +шесть +семь +восемь +девять +десять +одиннадцать +двенадцать +тринадцать +четырнадцать +пятнадцать +шестнадцать +семнадцать +восемнадцать +девятнадцать +двадцать +тридцать +сорок +пятьдесят +шестьдесят +семьдесят +восемьдесят +девяносто +сто +двести +триста +четыреста +пятьсот +шестьсот +семьсот +восемьсот +девятьсот +тысячи +тысяч +тысяча +миллионов +миллион +миллиона +миллиардов +миллиард +миллиарда +первая +первого +первое +первый +вторая +второе +второй +третий +третье +третья +четвертая +четвертое +четвертой +пятая +пятое +пятой +шестая +шестое +шестой +седьмая +седьмое +седьмой +восьмая +восьмое +восьмой +девятая +девятое +девятой +десятая +десятое +десятой +одиннадцатая +одиннадцатое +одиннадцатой +двенадцатая +двенадцатое +двенадцатой +тринадцатая +тринадцатое +тринадцатой +четырнадцатая +четырнадцатое +четырнадцатой +пятнадцатая +пятнадцатое +пятнадцатой +шестнадцатая +шестнадцатое +шестнадцатой +семнадцатая +семнадцатое +семнадцатой +восемнадцатая +восемнадцатое +восемнадцатой +девятнадцатая +девятнадцатое +девятнадцатой +двадцатая +двадцатое +двадцатой +тридцатая +тридцатое +тридцатой +сороковая +сороковое +сороковой +пятидесятая +пятидесятое +пятидесятой +шестидесятая +шестидесятое +шестидесятой +семидесятая +семидесятое +семидесятой +восьмидесятая +восьмидесятое +восьмидесятой +девяностая +девяностое +девяностой +сотая +сотое +сотой +двухсотая +двухсотое +двухсотой +трехсотая +трехсотое +трехсотой +четырехсотая +четырехсотое +четырехсотой +пятисотая +пятисотое +пятисотой +шестисотая 
+шестисотое +шестисотой +семисотая +семисотое +семисотой +восьмисотая +восьмисотое +восьмисотой +девятисотая +девятисотое +девятисотой +тысячная +тысячное +тысячной +миллионная +миллионное +миллионной +миллиардная +миллиардное +миллиардной diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/number_names.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/number_names.grm new file mode 100644 index 000000000..84ac15a25 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/number_names.grm @@ -0,0 +1,48 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Russian minimally supervised number grammar. +# +# Supports cardinals and ordinals in all inflected forms. +# +# The language-specific acceptor G was compiled with digit, teen, decade, +# century, and big power-of-ten preterminals. The lexicon transducer is +# highly ambiguous, but no LM is used. + +import 'util/arithmetic.grm' as a; + +# Intersects the universal factorization transducer (F) with language-specific +# acceptor (G). + +d = a.DELTA_STAR; +f = a.IARITHMETIC_RESTRICTED; +g = LoadFst['ru/verbalizer/g.fst']; +fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]]; +test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"]; + +# Compiles lexicon transducers (L). 
+ +cardinal_name = StringFile['ru/verbalizer/cardinals.tsv']; +cardinal_l = Optimize[(cardinal_name " ")* cardinal_name]; + +ordinal_name = StringFile['ru/verbalizer/ordinals.tsv']; +ordinal_l = Optimize[(cardinal_name " ")* ordinal_name]; + +# Composes L with the leaf transducer (P), then composes that with FG. + +p = a.LEAVES; + +export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)]; + +export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers.grm new file mode 100644 index 000000000..b25f1fb67 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers.grm @@ -0,0 +1,68 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'ru/verbalizer/number_names.grm' as n; +import 'universal/thousands_punct.grm' as t; +import 'util/byte.grm' as b; + +nominatives = StringFile['ru/verbalizer/nominatives.tsv']; + +sigma_star = b.kBytes*; + +nominative_filter = + CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star] +; + +cardinal = n.CARDINAL_NUMBER_NAME; +ordinal = n.ORDINAL_NUMBER_NAME; + +# Putting these here since this grammar gets incorporated by all the others. 
+ +func I[expr] { + return "" : expr; +} + +func D[expr] { + return expr : ""; +} + +# Since we know this is the default for Russian, it's fair game to set it. +separators = t.dot_thousands | t.no_delimiter; + +export CARDINAL_NUMBERS = Optimize[ + separators + @ cardinal +]; + +export ORDINAL_NUMBERS_UNMARKED = Optimize[ + separators + @ ordinal +]; + + +endings = StringFile['ru/verbalizer/ordinal_endings.tsv']; + +not_dash = (b.kBytes - "-")+; +del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star]; + +# Needs nominative_filter here if we take out Kyle's models. +export ORDINAL_NUMBERS_MARKED = Optimize[ + Optimize[Optimize[separators @ ordinal] "-" not_dash] + @ Optimize[sigma_star endings] + @ del_ending] +; + +export ORDINAL_NUMBERS = + Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED] +; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers_plus.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers_plus.grm new file mode 100644 index 000000000..dd000b3b9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers_plus.grm @@ -0,0 +1,133 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Grammar for things built mostly on numbers. 
+ +import 'ru/verbalizer/factorization.grm' as f; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +num = n.CARDINAL_NUMBERS; +ord = n.ORDINAL_NUMBERS_UNMARKED; +digits = f.FRACTIONAL_PART_UNGROUPED; + +# Various symbols. + +plus = "+" : "@@ARITHMETIC_PLUS@@"; +minus = "-" : "@@ARITHMETIC_MINUS@@"; +slash = "/" : "@@SLASH@@"; +dot = "." : "@@URL_DOT_EXPRESSION@@"; +dash = "-" : "@@DASH@@"; +equals = "=" : "@@ARITHMETIC_EQUALS@@"; + +degree = "°" : "@@DEGREE@@"; + +division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@"; + +times = ("x" | "*") : "@@ARITHMETIC_TIMES@@"; + +power = "^" : "@@DECIMAL_EXPONENT@@"; + +square_root = "√" : "@@SQUARE_ROOT@@"; + +percent = "%" : "@@PERCENT@@"; + +# Safe roman numbers. + +# NB: Do not change the formatting here. NO_EDIT must be on the same +# line as the path. +rfile = + 'universal/roman_numerals.tsv' # NO_EDIT +; + +roman = StringFile[rfile]; + +## Main categories. + +cat_dot_number = + num + n.I[" "] dot n.I[" "] num + (n.I[" "] dot n.I[" "] num)+ +; + +cat_slash_number = + num + n.I[" "] slash n.I[" "] num + (n.I[" "] slash n.I[" "] num)* +; + +cat_dash_number = + num + n.I[" "] dash n.I[" "] num + (n.I[" "] dash n.I[" "] num)* +; + +cat_signed_number = ((plus | minus) n.I[" "])? num; + +cat_degree = cat_signed_number n.I[" "] degree; + +cat_country_code = plus n.I[" "] (num | digits); + +cat_math_operations = + plus + | minus + | division + | times + | equals + | percent + | power + | square_root +; + +# Roman numbers are often either cardinals or ordinals in various languages. +cat_roman = roman @ (num | ord); + +# Allow +# +# number:number +# number-number +# +# to just be +# +# number number. + +cat_number_number = + num ((":" | "-") : " ") num +; + +# Some additional readings for these symbols. 
+ +cat_additional_readings = + ("/" : "@@PER@@") | + ("+" : "@@AND@@") | + ("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) | + ("*" : "@@STAR@@") | + ("x" : ("x" | "@@CONNECTOR_BY@@")) | + ("@" : "@@AT@@") +; + +numbers_plus = Optimize[ + cat_dot_number + | cat_slash_number + | cat_dash_number + | cat_signed_number + | cat_degree + | cat_country_code + | cat_math_operations + | cat_roman + | cat_number_number + | cat_additional_readings +]; + +export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinal_endings.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinal_endings.tsv new file mode 100644 index 000000000..6db35e26d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinal_endings.tsv @@ -0,0 +1,39 @@ +ая-ая +ого-го +ьего-го +ьего-его +ьей-ей +ьему-ему +ьем-ем +ое-е +ые-е +ье-е +ий-ий +ьими-ими +ьим-им +ьих-их +ьи-и +ий-й +ой-й +ый-й +ыми-ми +ьими-ми +ому-му +ьему-му +ого-ого +ое-ое +ой-ой +ом-ом +ому-ому +ую-ую +ых-х +ьих-х +ые-ые +ый-ый +ыми-ыми +ым-ым +ых-ых +ую-ю +ью-ю +ая-я +ья-я diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals-lex.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals-lex.grm new file mode 100644 index 000000000..ca4d86d07 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals-lex.grm @@ -0,0 +1,804 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# AUTOMATICALLY GENERATED: DO NOT EDIT. +import 'util/byte.grm' as b; + +# Utilities for insertion and deletion. + +func I[expr] { + return "" : expr; +} + +func D[expr] { + return expr : ""; +} + +# Powers of base 10. +export POWERS = + "[E15]" + | "[E14]" + | "[E13]" + | "[E12]" + | "[E11]" + | "[E10]" + | "[E9]" + | "[E8]" + | "[E7]" + | "[E6]" + | "[E5]" + | "[E4]" + | "[E3]" + | "[E2]" + | "[E1]" +; + +export SIGMA = b.kBytes | POWERS; + +export SIGMA_STAR = SIGMA*; + +export SIGMA_PLUS = SIGMA+; + +################################################################################ +# BEGIN LANGUAGE SPECIFIC DATA +revaluations = + ("[E4]" : "[E1]") + | ("[E5]" : "[E2]") + | ("[E7]" : "[E1]") + | ("[E8]" : "[E2]") +; + +Ms = "[E3]" | "[E6]" | "[E9]"; + + +func Zero[expr] { + return expr : (""); +} + +space = " "; + +lexset3 = Optimize[ + ("1[E1]+1" : "одиннадцатая@") + | ("1[E1]+1" : "одиннадцати") + | ("1[E1]+1" : "одиннадцатого@") + | ("1[E1]+1" : "одиннадцатое@") + | ("1[E1]+1" : "одиннадцатой@") + | ("1[E1]+1" : "одиннадцатом@") + | ("1[E1]+1" : "одиннадцатому@") + | ("1[E1]+1" : "одиннадцатую@") + | ("1[E1]+1" : "одиннадцатые@") + | ("1[E1]+1" : "одиннадцатый@") + | ("1[E1]+1" : "одиннадцатым@") + | ("1[E1]+1" : "одиннадцатыми@") + | ("1[E1]+1" : "одиннадцатых@") + | ("1[E1]+1" : "одиннадцать") + | ("1[E1]+1" : "одиннадцатью") + | ("1[E1]+2" : "двенадцатая@") + | ("1[E1]+2" : "двенадцати") + | ("1[E1]+2" : "двенадцатого@") + | ("1[E1]+2" : "двенадцатое@") + | ("1[E1]+2" : "двенадцатой@") + | ("1[E1]+2" : "двенадцатом@") + | ("1[E1]+2" : "двенадцатому@") + | ("1[E1]+2" : "двенадцатую@") + | ("1[E1]+2" : "двенадцатые@") + | ("1[E1]+2" : "двенадцатый@") + | ("1[E1]+2" : "двенадцатым@") + | ("1[E1]+2" : "двенадцатыми@") + | ("1[E1]+2" : "двенадцатых@") + | ("1[E1]+2" : "двенадцать") + | ("1[E1]+2" : "двенадцатью") + | ("1[E1]+3" : "тринадцатая@") + | 
("1[E1]+3" : "тринадцати") + | ("1[E1]+3" : "тринадцатого@") + | ("1[E1]+3" : "тринадцатое@") + | ("1[E1]+3" : "тринадцатой@") + | ("1[E1]+3" : "тринадцатом@") + | ("1[E1]+3" : "тринадцатому@") + | ("1[E1]+3" : "тринадцатую@") + | ("1[E1]+3" : "тринадцатые@") + | ("1[E1]+3" : "тринадцатый@") + | ("1[E1]+3" : "тринадцатым@") + | ("1[E1]+3" : "тринадцатыми@") + | ("1[E1]+3" : "тринадцатых@") + | ("1[E1]+3" : "тринадцать") + | ("1[E1]+3" : "тринадцатью") + | ("1[E1]+4" : "четырнадцатая@") + | ("1[E1]+4" : "четырнадцати") + | ("1[E1]+4" : "четырнадцатого@") + | ("1[E1]+4" : "четырнадцатое@") + | ("1[E1]+4" : "четырнадцатой@") + | ("1[E1]+4" : "четырнадцатом@") + | ("1[E1]+4" : "четырнадцатому@") + | ("1[E1]+4" : "четырнадцатую@") + | ("1[E1]+4" : "четырнадцатые@") + | ("1[E1]+4" : "четырнадцатый@") + | ("1[E1]+4" : "четырнадцатым@") + | ("1[E1]+4" : "четырнадцатыми@") + | ("1[E1]+4" : "четырнадцатых@") + | ("1[E1]+4" : "четырнадцать") + | ("1[E1]+4" : "четырнадцатью") + | ("1[E1]+5" : "пятнадцатая@") + | ("1[E1]+5" : "пятнадцати") + | ("1[E1]+5" : "пятнадцатого@") + | ("1[E1]+5" : "пятнадцатое@") + | ("1[E1]+5" : "пятнадцатой@") + | ("1[E1]+5" : "пятнадцатом@") + | ("1[E1]+5" : "пятнадцатому@") + | ("1[E1]+5" : "пятнадцатую@") + | ("1[E1]+5" : "пятнадцатые@") + | ("1[E1]+5" : "пятнадцатый@") + | ("1[E1]+5" : "пятнадцатым@") + | ("1[E1]+5" : "пятнадцатыми@") + | ("1[E1]+5" : "пятнадцатых@") + | ("1[E1]+5" : "пятнадцать") + | ("1[E1]+5" : "пятнадцатью") + | ("1[E1]+6" : "шестнадцатая@") + | ("1[E1]+6" : "шестнадцати") + | ("1[E1]+6" : "шестнадцатого@") + | ("1[E1]+6" : "шестнадцатое@") + | ("1[E1]+6" : "шестнадцатой@") + | ("1[E1]+6" : "шестнадцатом@") + | ("1[E1]+6" : "шестнадцатому@") + | ("1[E1]+6" : "шестнадцатую@") + | ("1[E1]+6" : "шестнадцатые@") + | ("1[E1]+6" : "шестнадцатый@") + | ("1[E1]+6" : "шестнадцатым@") + | ("1[E1]+6" : "шестнадцатыми@") + | ("1[E1]+6" : "шестнадцатых@") + | ("1[E1]+6" : "шестнадцать") + | ("1[E1]+6" : "шестнадцатью") + | ("1[E1]+7" : 
"семнадцатая@") + | ("1[E1]+7" : "семнадцати") + | ("1[E1]+7" : "семнадцатого@") + | ("1[E1]+7" : "семнадцатое@") + | ("1[E1]+7" : "семнадцатой@") + | ("1[E1]+7" : "семнадцатом@") + | ("1[E1]+7" : "семнадцатому@") + | ("1[E1]+7" : "семнадцатую@") + | ("1[E1]+7" : "семнадцатые@") + | ("1[E1]+7" : "семнадцатый@") + | ("1[E1]+7" : "семнадцатым@") + | ("1[E1]+7" : "семнадцатыми@") + | ("1[E1]+7" : "семнадцатых@") + | ("1[E1]+7" : "семнадцать") + | ("1[E1]+7" : "семнадцатью") + | ("1[E1]+8" : "восемнадцатая@") + | ("1[E1]+8" : "восемнадцати") + | ("1[E1]+8" : "восемнадцатого@") + | ("1[E1]+8" : "восемнадцатое@") + | ("1[E1]+8" : "восемнадцатой@") + | ("1[E1]+8" : "восемнадцатом@") + | ("1[E1]+8" : "восемнадцатому@") + | ("1[E1]+8" : "восемнадцатую@") + | ("1[E1]+8" : "восемнадцатые@") + | ("1[E1]+8" : "восемнадцатый@") + | ("1[E1]+8" : "восемнадцатым@") + | ("1[E1]+8" : "восемнадцатыми@") + | ("1[E1]+8" : "восемнадцатых@") + | ("1[E1]+8" : "восемнадцать") + | ("1[E1]+8" : "восемнадцатью") + | ("1[E1]+9" : "девятнадцатая@") + | ("1[E1]+9" : "девятнадцати") + | ("1[E1]+9" : "девятнадцатого@") + | ("1[E1]+9" : "девятнадцатое@") + | ("1[E1]+9" : "девятнадцатой@") + | ("1[E1]+9" : "девятнадцатом@") + | ("1[E1]+9" : "девятнадцатому@") + | ("1[E1]+9" : "девятнадцатую@") + | ("1[E1]+9" : "девятнадцатые@") + | ("1[E1]+9" : "девятнадцатый@") + | ("1[E1]+9" : "девятнадцатым@") + | ("1[E1]+9" : "девятнадцатыми@") + | ("1[E1]+9" : "девятнадцатых@") + | ("1[E1]+9" : "девятнадцать") + | ("1[E1]+9" : "девятнадцатью")] +; + +lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR]; + +lexset2 = Optimize[ + ("1[E1]" : "десятая@") + | ("1[E1]" : "десяти") + | ("1[E1]" : "десятого@") + | ("1[E1]" : "десятое@") + | ("1[E1]" : "десятой@") + | ("1[E1]" : "десятом@") + | ("1[E1]" : "десятому@") + | ("1[E1]" : "десятую@") + | ("1[E1]" : "десятые@") + | ("1[E1]" : "десятый@") + | ("1[E1]" : "десятым@") + | ("1[E1]" : "десятыми@") + | ("1[E1]" : "десятых@") + | ("1[E1]" : "десять") + | ("1[E1]" : 
"десятью") + | ("1[E2]" : "сотая@") + | ("1[E2]" : "сотого@") + | ("1[E2]" : "сотое@") + | ("1[E2]" : "сотой@") + | ("1[E2]" : "сотом@") + | ("1[E2]" : "сотому@") + | ("1[E2]" : "сотую@") + | ("1[E2]" : "сотые@") + | ("1[E2]" : "сотый@") + | ("1[E2]" : "сотым@") + | ("1[E2]" : "сотыми@") + | ("1[E2]" : "сотых@") + | ("1[E2]" : "ста") + | ("1[E2]" : "сто") + | ("1[E3]" : "тысячная@") + | ("1[E3]" : "тысячного@") + | ("1[E3]" : "тысячное@") + | ("1[E3]" : "тысячной@") + | ("1[E3]" : "тысячном@") + | ("1[E3]" : "тысячному@") + | ("1[E3]" : "тысячную@") + | ("1[E3]" : "тысячные@") + | ("1[E3]" : "тысячный@") + | ("1[E3]" : "тысячным@") + | ("1[E3]" : "тысячными@") + | ("1[E3]" : "тысячных@") + | ("1[E6]" : "миллионная@") + | ("1[E6]" : "миллионного@") + | ("1[E6]" : "миллионное@") + | ("1[E6]" : "миллионной@") + | ("1[E6]" : "миллионном@") + | ("1[E6]" : "миллионному@") + | ("1[E6]" : "миллионную@") + | ("1[E6]" : "миллионные@") + | ("1[E6]" : "миллионный@") + | ("1[E6]" : "миллионным@") + | ("1[E6]" : "миллионными@") + | ("1[E6]" : "миллионных@") + | ("1[E9]" : "миллиардная@") + | ("1[E9]" : "миллиардного@") + | ("1[E9]" : "миллиардное@") + | ("1[E9]" : "миллиардной@") + | ("1[E9]" : "миллиардном@") + | ("1[E9]" : "миллиардному@") + | ("1[E9]" : "миллиардную@") + | ("1[E9]" : "миллиардные@") + | ("1[E9]" : "миллиардный@") + | ("1[E9]" : "миллиардным@") + | ("1[E9]" : "миллиардными@") + | ("1[E9]" : "миллиардных@") + | ("2[E1]" : "двадцатая@") + | ("2[E1]" : "двадцати") + | ("2[E1]" : "двадцатого@") + | ("2[E1]" : "двадцатое@") + | ("2[E1]" : "двадцатой@") + | ("2[E1]" : "двадцатом@") + | ("2[E1]" : "двадцатому@") + | ("2[E1]" : "двадцатую@") + | ("2[E1]" : "двадцатые@") + | ("2[E1]" : "двадцатый@") + | ("2[E1]" : "двадцатым@") + | ("2[E1]" : "двадцатыми@") + | ("2[E1]" : "двадцатых@") + | ("2[E1]" : "двадцать") + | ("2[E1]" : "двадцатью") + | ("2[E2]" : "двести") + | ("2[E2]" : "двумстам") + | ("2[E2]" : "двумястами") + | ("2[E2]" : "двухсот") + | ("2[E2]" : 
"двухсотая@") + | ("2[E2]" : "двухсотого@") + | ("2[E2]" : "двухсотое@") + | ("2[E2]" : "двухсотой@") + | ("2[E2]" : "двухсотом@") + | ("2[E2]" : "двухсотому@") + | ("2[E2]" : "двухсотую@") + | ("2[E2]" : "двухсотые@") + | ("2[E2]" : "двухсотый@") + | ("2[E2]" : "двухсотым@") + | ("2[E2]" : "двухсотыми@") + | ("2[E2]" : "двухсотых@") + | ("2[E2]" : "двухстах") + | ("3[E1]" : "тридцатая@") + | ("3[E1]" : "тридцати") + | ("3[E1]" : "тридцатого@") + | ("3[E1]" : "тридцатое@") + | ("3[E1]" : "тридцатой@") + | ("3[E1]" : "тридцатом@") + | ("3[E1]" : "тридцатому@") + | ("3[E1]" : "тридцатую@") + | ("3[E1]" : "тридцатые@") + | ("3[E1]" : "тридцатый@") + | ("3[E1]" : "тридцатым@") + | ("3[E1]" : "тридцатыми@") + | ("3[E1]" : "тридцатых@") + | ("3[E1]" : "тридцать") + | ("3[E1]" : "тридцатью") + | ("3[E2]" : "тремстам") + | ("3[E2]" : "тремястами") + | ("3[E2]" : "трехсот") + | ("3[E2]" : "трехсотая@") + | ("3[E2]" : "трехсотого@") + | ("3[E2]" : "трехсотое@") + | ("3[E2]" : "трехсотой@") + | ("3[E2]" : "трехсотом@") + | ("3[E2]" : "трехсотому@") + | ("3[E2]" : "трехсотую@") + | ("3[E2]" : "трехсотые@") + | ("3[E2]" : "трехсотый@") + | ("3[E2]" : "трехсотым@") + | ("3[E2]" : "трехсотыми@") + | ("3[E2]" : "трехсотых@") + | ("3[E2]" : "трехстах") + | ("3[E2]" : "триста") + | ("4[E1]" : "сорок") + | ("4[E1]" : "сорока") + | ("4[E1]" : "сороковая@") + | ("4[E1]" : "сорокового@") + | ("4[E1]" : "сороковое@") + | ("4[E1]" : "сороковой@") + | ("4[E1]" : "сороковом@") + | ("4[E1]" : "сороковому@") + | ("4[E1]" : "сороковую@") + | ("4[E1]" : "сороковые@") + | ("4[E1]" : "сороковым@") + | ("4[E1]" : "сороковыми@") + | ("4[E1]" : "сороковых@") + | ("4[E2]" : "четыремстам") + | ("4[E2]" : "четыреста") + | ("4[E2]" : "четырехсот") + | ("4[E2]" : "четырехсотая@") + | ("4[E2]" : "четырехсотого@") + | ("4[E2]" : "четырехсотое@") + | ("4[E2]" : "четырехсотой@") + | ("4[E2]" : "четырехсотом@") + | ("4[E2]" : "четырехсотому@") + | ("4[E2]" : "четырехсотую@") + | ("4[E2]" : "четырехсотые@") + 
| ("4[E2]" : "четырехсотый@") + | ("4[E2]" : "четырехсотым@") + | ("4[E2]" : "четырехсотыми@") + | ("4[E2]" : "четырехсотых@") + | ("4[E2]" : "четырехстах") + | ("4[E2]" : "четырьмястами") + | ("5[E1]" : "пятидесятая@") + | ("5[E1]" : "пятидесяти") + | ("5[E1]" : "пятидесятого@") + | ("5[E1]" : "пятидесятое@") + | ("5[E1]" : "пятидесятой@") + | ("5[E1]" : "пятидесятом@") + | ("5[E1]" : "пятидесятому@") + | ("5[E1]" : "пятидесятую@") + | ("5[E1]" : "пятидесятые@") + | ("5[E1]" : "пятидесятый@") + | ("5[E1]" : "пятидесятым@") + | ("5[E1]" : "пятидесятыми@") + | ("5[E1]" : "пятидесятых@") + | ("5[E1]" : "пятьдесят") + | ("5[E1]" : "пятьюдесятью") + | ("5[E2]" : "пятисот") + | ("5[E2]" : "пятисотая@") + | ("5[E2]" : "пятисотого@") + | ("5[E2]" : "пятисотое@") + | ("5[E2]" : "пятисотой@") + | ("5[E2]" : "пятисотом@") + | ("5[E2]" : "пятисотому@") + | ("5[E2]" : "пятисотую@") + | ("5[E2]" : "пятисотые@") + | ("5[E2]" : "пятисотый@") + | ("5[E2]" : "пятисотым@") + | ("5[E2]" : "пятисотыми@") + | ("5[E2]" : "пятисотых@") + | ("5[E2]" : "пятистам") + | ("5[E2]" : "пятистах") + | ("5[E2]" : "пятьсот") + | ("5[E2]" : "пятьюстами") + | ("6[E1]" : "шестидесятая@") + | ("6[E1]" : "шестидесяти") + | ("6[E1]" : "шестидесятого@") + | ("6[E1]" : "шестидесятое@") + | ("6[E1]" : "шестидесятой@") + | ("6[E1]" : "шестидесятом@") + | ("6[E1]" : "шестидесятому@") + | ("6[E1]" : "шестидесятую@") + | ("6[E1]" : "шестидесятые@") + | ("6[E1]" : "шестидесятый@") + | ("6[E1]" : "шестидесятым@") + | ("6[E1]" : "шестидесятыми@") + | ("6[E1]" : "шестидесятых@") + | ("6[E1]" : "шестьдесят") + | ("6[E1]" : "шестьюдесятью") + | ("6[E2]" : "шестисот") + | ("6[E2]" : "шестисотая@") + | ("6[E2]" : "шестисотого@") + | ("6[E2]" : "шестисотое@") + | ("6[E2]" : "шестисотой@") + | ("6[E2]" : "шестисотом@") + | ("6[E2]" : "шестисотому@") + | ("6[E2]" : "шестисотую@") + | ("6[E2]" : "шестисотые@") + | ("6[E2]" : "шестисотый@") + | ("6[E2]" : "шестисотым@") + | ("6[E2]" : "шестисотыми@") + | ("6[E2]" : 
"шестисотых@") + | ("6[E2]" : "шестистам") + | ("6[E2]" : "шестистах") + | ("6[E2]" : "шестьсот") + | ("6[E2]" : "шестьюстами") + | ("7[E1]" : "семидесятая@") + | ("7[E1]" : "семидесяти") + | ("7[E1]" : "семидесятого@") + | ("7[E1]" : "семидесятое@") + | ("7[E1]" : "семидесятой@") + | ("7[E1]" : "семидесятом@") + | ("7[E1]" : "семидесятому@") + | ("7[E1]" : "семидесятую@") + | ("7[E1]" : "семидесятые@") + | ("7[E1]" : "семидесятый@") + | ("7[E1]" : "семидесятым@") + | ("7[E1]" : "семидесятыми@") + | ("7[E1]" : "семидесятых@") + | ("7[E1]" : "семьдесят") + | ("7[E1]" : "семьюдесятью") + | ("7[E2]" : "семисот") + | ("7[E2]" : "семисотая@") + | ("7[E2]" : "семисотого@") + | ("7[E2]" : "семисотое@") + | ("7[E2]" : "семисотой@") + | ("7[E2]" : "семисотом@") + | ("7[E2]" : "семисотому@") + | ("7[E2]" : "семисотую@") + | ("7[E2]" : "семисотые@") + | ("7[E2]" : "семисотый@") + | ("7[E2]" : "семисотым@") + | ("7[E2]" : "семисотыми@") + | ("7[E2]" : "семисотых@") + | ("7[E2]" : "семистам") + | ("7[E2]" : "семистах") + | ("7[E2]" : "семьсот") + | ("7[E2]" : "семьюстами") + | ("8[E1]" : "восемьдесят") + | ("8[E1]" : "восьмидесятая@") + | ("8[E1]" : "восьмидесяти") + | ("8[E1]" : "восьмидесятого@") + | ("8[E1]" : "восьмидесятое@") + | ("8[E1]" : "восьмидесятой@") + | ("8[E1]" : "восьмидесятом@") + | ("8[E1]" : "восьмидесятому@") + | ("8[E1]" : "восьмидесятую@") + | ("8[E1]" : "восьмидесятые@") + | ("8[E1]" : "восьмидесятый@") + | ("8[E1]" : "восьмидесятым@") + | ("8[E1]" : "восьмидесятыми@") + | ("8[E1]" : "восьмидесятых@") + | ("8[E1]" : "восьмьюдесятью") + | ("8[E2]" : "восемьсот") + | ("8[E2]" : "восемьюстами") + | ("8[E2]" : "восьмисот") + | ("8[E2]" : "восьмисотая@") + | ("8[E2]" : "восьмисотого@") + | ("8[E2]" : "восьмисотое@") + | ("8[E2]" : "восьмисотой@") + | ("8[E2]" : "восьмисотом@") + | ("8[E2]" : "восьмисотому@") + | ("8[E2]" : "восьмисотую@") + | ("8[E2]" : "восьмисотые@") + | ("8[E2]" : "восьмисотый@") + | ("8[E2]" : "восьмисотым@") + | ("8[E2]" : 
"восьмисотыми@") + | ("8[E2]" : "восьмисотых@") + | ("8[E2]" : "восьмистам") + | ("8[E2]" : "восьмистах") + | ("8[E2]" : "восьмьюстами") + | ("9[E1]" : "девяноста") + | ("9[E1]" : "девяностая@") + | ("9[E1]" : "девяносто") + | ("9[E1]" : "девяностого@") + | ("9[E1]" : "девяностое@") + | ("9[E1]" : "девяностой@") + | ("9[E1]" : "девяностом@") + | ("9[E1]" : "девяностому@") + | ("9[E1]" : "девяностую@") + | ("9[E1]" : "девяностые@") + | ("9[E1]" : "девяностый@") + | ("9[E1]" : "девяностым@") + | ("9[E1]" : "девяностыми@") + | ("9[E1]" : "девяностых@") + | ("9[E2]" : "девятисот") + | ("9[E2]" : "девятисотая@") + | ("9[E2]" : "девятисотого@") + | ("9[E2]" : "девятисотое@") + | ("9[E2]" : "девятисотой@") + | ("9[E2]" : "девятисотом@") + | ("9[E2]" : "девятисотому@") + | ("9[E2]" : "девятисотую@") + | ("9[E2]" : "девятисотые@") + | ("9[E2]" : "девятисотый@") + | ("9[E2]" : "девятисотым@") + | ("9[E2]" : "девятисотыми@") + | ("9[E2]" : "девятисотых@") + | ("9[E2]" : "девятистам") + | ("9[E2]" : "девятистах") + | ("9[E2]" : "девятьсот") + | ("9[E2]" : "девятьюстами")] +; + +lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR]; + +lexset1 = Optimize[ + ("+" : "") + | ("1" : "один") + | ("1" : "одна") + | ("1" : "одни") + | ("1" : "одним") + | ("1" : "одними") + | ("1" : "одних") + | ("1" : "одно") + | ("1" : "одного") + | ("1" : "одной") + | ("1" : "одном") + | ("1" : "одному") + | ("1" : "одною") + | ("1" : "одну") + | ("1" : "первая@") + | ("1" : "первого@") + | ("1" : "первое@") + | ("1" : "первой@") + | ("1" : "первом@") + | ("1" : "первому@") + | ("1" : "первую@") + | ("1" : "первые@") + | ("1" : "первый@") + | ("1" : "первым@") + | ("1" : "первыми@") + | ("1" : "первых@") + | ("2" : "вторая@") + | ("2" : "второго@") + | ("2" : "второе@") + | ("2" : "второй@") + | ("2" : "втором@") + | ("2" : "второму@") + | ("2" : "вторую@") + | ("2" : "вторые@") + | ("2" : "вторым@") + | ("2" : "вторыми@") + | ("2" : "вторых@") + | ("2" : "два") + | ("2" : "две") + | ("2" : "двум") 
+ | ("2" : "двумя") + | ("2" : "двух") + | ("3" : "трем") + | ("3" : "тремя") + | ("3" : "третий@") + | ("3" : "третье@") + | ("3" : "третьего@") + | ("3" : "третьей@") + | ("3" : "третьем@") + | ("3" : "третьему@") + | ("3" : "третьи@") + | ("3" : "третьим@") + | ("3" : "третьими@") + | ("3" : "третьих@") + | ("3" : "третью@") + | ("3" : "третья@") + | ("3" : "трех") + | ("3" : "три") + | ("4" : "четвертая@") + | ("4" : "четвертого@") + | ("4" : "четвертое@") + | ("4" : "четвертой@") + | ("4" : "четвертом@") + | ("4" : "четвертому@") + | ("4" : "четвертую@") + | ("4" : "четвертые@") + | ("4" : "четвертый@") + | ("4" : "четвертым@") + | ("4" : "четвертыми@") + | ("4" : "четвертых@") + | ("4" : "четыре") + | ("4" : "четырем") + | ("4" : "четырех") + | ("4" : "четырьмя") + | ("5" : "пятая@") + | ("5" : "пяти") + | ("5" : "пятого@") + | ("5" : "пятое@") + | ("5" : "пятой@") + | ("5" : "пятом@") + | ("5" : "пятому@") + | ("5" : "пятую@") + | ("5" : "пятые@") + | ("5" : "пятый@") + | ("5" : "пятым@") + | ("5" : "пятыми@") + | ("5" : "пятых@") + | ("5" : "пять") + | ("5" : "пятью") + | ("6" : "шестая@") + | ("6" : "шести") + | ("6" : "шестого@") + | ("6" : "шестое@") + | ("6" : "шестой@") + | ("6" : "шестом@") + | ("6" : "шестому@") + | ("6" : "шестую@") + | ("6" : "шестые@") + | ("6" : "шестым@") + | ("6" : "шестыми@") + | ("6" : "шестых@") + | ("6" : "шесть") + | ("6" : "шестью") + | ("7" : "седьмая@") + | ("7" : "седьмого@") + | ("7" : "седьмое@") + | ("7" : "седьмой@") + | ("7" : "седьмом@") + | ("7" : "седьмому@") + | ("7" : "седьмую@") + | ("7" : "седьмые@") + | ("7" : "седьмым@") + | ("7" : "седьмыми@") + | ("7" : "седьмых@") + | ("7" : "семи") + | ("7" : "семь") + | ("7" : "семью") + | ("8" : "восемь") + | ("8" : "восьмая@") + | ("8" : "восьми") + | ("8" : "восьмого@") + | ("8" : "восьмое@") + | ("8" : "восьмой@") + | ("8" : "восьмом@") + | ("8" : "восьмому@") + | ("8" : "восьмую@") + | ("8" : "восьмые@") + | ("8" : "восьмым@") + | ("8" : "восьмыми@") + | ("8" : 
"восьмых@") + | ("8" : "восьмью") + | ("9" : "девятая@") + | ("9" : "девяти") + | ("9" : "девятого@") + | ("9" : "девятое@") + | ("9" : "девятой@") + | ("9" : "девятом@") + | ("9" : "девятому@") + | ("9" : "девятую@") + | ("9" : "девятые@") + | ("9" : "девятый@") + | ("9" : "девятым@") + | ("9" : "девятыми@") + | ("9" : "девятых@") + | ("9" : "девять") + | ("9" : "девятью") + | ("[E3]" : "тысяч") + | ("[E3]" : "тысяча") + | ("[E3]" : "тысячам") + | ("[E3]" : "тысячами") + | ("[E3]" : "тысячах") + | ("[E3]" : "тысяче") + | ("[E3]" : "тысячей") + | ("[E3]" : "тысячи") + | ("[E3]" : "тысячу") + | ("[E3]" : "тысячью") + | ("[E6]" : "миллион") + | ("[E6]" : "миллиона") + | ("[E6]" : "миллионам") + | ("[E6]" : "миллионами") + | ("[E6]" : "миллионах") + | ("[E6]" : "миллионе") + | ("[E6]" : "миллионов") + | ("[E6]" : "миллионом") + | ("[E6]" : "миллиону") + | ("[E6]" : "миллионы") + | ("[E9]" : "миллиард") + | ("[E9]" : "миллиарда") + | ("[E9]" : "миллиардам") + | ("[E9]" : "миллиардами") + | ("[E9]" : "миллиардах") + | ("[E9]" : "миллиарде") + | ("[E9]" : "миллиардов") + | ("[E9]" : "миллиардом") + | ("[E9]" : "миллиарду") + | ("[E9]" : "миллиарды") + | ("|0|" : "ноле") + | ("|0|" : "нолем") + | ("|0|" : "ноль") + | ("|0|" : "нолю") + | ("|0|" : "ноля") + | ("|0|" : "нуле") + | ("|0|" : "нулем") + | ("|0|" : "нуль") + | ("|0|" : "нулю") + | ("|0|" : "нуля")] +; + +lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR]; + +export LEX = Optimize[lex3 @ lex2 @ lex1]; + +export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]"; + +# END LANGUAGE SPECIFIC DATA +################################################################################ +# Inserts a marker after the Ms. +export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR]; + +# Deletes all powers and "+". +export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR]; + +# Deletes trailing zeros at the beginning of a number, so that "0003" does not +# get treated as an ordinary number. 
+export DELETE_INITIAL_ZEROS = + CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR] +; + +NonMs = Optimize[POWERS - Ms]; + +# Deletes (usually) zeros before a non-M. E.g., +0[E1] should be +# deleted +export DELETE_INTERMEDIATE_ZEROS1 = + CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR] +; + +# Deletes (usually) zeros before an M, if there is no non-zero element between +# that and the previous boundary. Thus, if after the result of the rule above we +# end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final +# zero. +export DELETE_INTERMEDIATE_ZEROS2 = Optimize[ + CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR] + @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]] +; + +# Final clean up of stray zeros. +export DELETE_REMAINING_ZEROS = Optimize[ + CDRewrite[Zero["+0"], "", "", SIGMA_STAR] + @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]] +; + +# Applies the revaluation map. For example in English, change [E4] to [E1] as a +# modifier of [E3] +export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR]; + +# Deletes the various marks and powers in the input and output. +export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR]; + +export CLEAN_SPACES = Optimize[ + CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR] + @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR] + @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]] +; + +d = b.kDigit; + +# Germanic inversion rule. 
+germanic = + (I["1+"] d "[E1]" D["+1"]) + | (I["2+"] d "[E1]" D["+2"]) + | (I["3+"] d "[E1]" D["+3"]) + | (I["4+"] d "[E1]" D["+4"]) + | (I["5+"] d "[E1]" D["+5"]) + | (I["6+"] d "[E1]" D["+6"]) + | (I["7+"] d "[E1]" D["+7"]) + | (I["8+"] d "[E1]" D["+8"]) + | (I["9+"] d "[E1]" D["+9"]) +; + +germanic_inversion = + CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt'] +; + +export GERMANIC_INVERSION = SIGMA_STAR; +export ORDINAL_RESTRICTION = + Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]] +; +nondigits = b.kBytes - b.kDigit; +export ORDINAL_SUFFIX = D[nondigits*]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals.tsv b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals.tsv new file mode 100644 index 000000000..367e14b11 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals.tsv @@ -0,0 +1,527 @@ +0 нулевая +0 нулевого +0 нулевое +0 нулевой +0 нулевом +0 нулевому +0 нулевую +0 нулевые +0 нулевым +0 нулевым +0 нулевыми +0 нулевых +1 первая +1 первого +1 первое +1 первой +1 первом +1 первому +1 первую +1 первые +1 первый +1 первым +1 первым +1 первыми +1 первых +2 вторая +2 второго +2 второе +2 второй +2 втором +2 второму +2 вторую +2 вторые +2 вторым +2 вторым +2 вторыми +2 вторых +3 третий +3 третье +3 третьего +3 третьей +3 третьем +3 третьему +3 третьи +3 третьим +3 третьим +3 третьими +3 третьих +3 третью +3 третья +4 четвертая +4 четвертого +4 четвертое +4 четвертой +4 четвертом +4 четвертому +4 четвертую +4 четвертые +4 четвертый +4 четвертым +4 четвертым +4 четвертыми +4 четвертых +4 четвёртая +4 четвёртого +4 четвёртое +4 четвёртой +4 четвёртом +4 четвёртому +4 четвёртую +4 четвёртые +4 четвёртый +4 четвёртым +4 четвёртым +4 четвёртыми +4 четвёртых +5 пятая +5 пятого +5 пятое +5 пятой +5 пятом +5 пятому +5 пятую +5 пятые +5 пятый +5 пятым +5 пятым +5 пятыми +5 пятых +6 шестая +6 шестого +6 шестое +6 шестой +6 шестом +6 шестому +6 шестую +6 
шестые +6 шестым +6 шестым +6 шестыми +6 шестых +7 седьмая +7 седьмого +7 седьмое +7 седьмой +7 седьмом +7 седьмому +7 седьмую +7 седьмые +7 седьмым +7 седьмым +7 седьмыми +7 седьмых +8 восьмая +8 восьмого +8 восьмое +8 восьмой +8 восьмом +8 восьмому +8 восьмую +8 восьмые +8 восьмым +8 восьмым +8 восьмыми +8 восьмых +9 девятая +9 девятого +9 девятое +9 девятой +9 девятом +9 девятому +9 девятую +9 девятые +9 девятый +9 девятым +9 девятым +9 девятыми +9 девятых +10 десятая +10 десятого +10 десятое +10 десятой +10 десятом +10 десятому +10 десятую +10 десятые +10 десятый +10 десятым +10 десятым +10 десятыми +10 десятых +11 одиннадцатая +11 одиннадцатого +11 одиннадцатое +11 одиннадцатой +11 одиннадцатом +11 одиннадцатому +11 одиннадцатую +11 одиннадцатые +11 одиннадцатый +11 одиннадцатым +11 одиннадцатым +11 одиннадцатыми +11 одиннадцатых +12 двенадцатая +12 двенадцатого +12 двенадцатое +12 двенадцатой +12 двенадцатом +12 двенадцатому +12 двенадцатую +12 двенадцатые +12 двенадцатый +12 двенадцатым +12 двенадцатым +12 двенадцатыми +12 двенадцатых +13 тринадцатая +13 тринадцатого +13 тринадцатое +13 тринадцатой +13 тринадцатом +13 тринадцатому +13 тринадцатую +13 тринадцатые +13 тринадцатый +13 тринадцатым +13 тринадцатым +13 тринадцатыми +13 тринадцатых +14 четырнадцатая +14 четырнадцатого +14 четырнадцатое +14 четырнадцатой +14 четырнадцатом +14 четырнадцатому +14 четырнадцатую +14 четырнадцатые +14 четырнадцатый +14 четырнадцатым +14 четырнадцатым +14 четырнадцатыми +14 четырнадцатых +15 пятнадцатая +15 пятнадцатого +15 пятнадцатое +15 пятнадцатой +15 пятнадцатом +15 пятнадцатому +15 пятнадцатую +15 пятнадцатые +15 пятнадцатый +15 пятнадцатым +15 пятнадцатым +15 пятнадцатыми +15 пятнадцатых +16 шестнадцатая +16 шестнадцатого +16 шестнадцатое +16 шестнадцатой +16 шестнадцатом +16 шестнадцатому +16 шестнадцатую +16 шестнадцатые +16 шестнадцатый +16 шестнадцатым +16 шестнадцатым +16 шестнадцатыми +16 шестнадцатых +17 семнадцатая +17 семнадцатого +17 семнадцатое +17 
семнадцатой +17 семнадцатом +17 семнадцатому +17 семнадцатую +17 семнадцатые +17 семнадцатый +17 семнадцатым +17 семнадцатым +17 семнадцатыми +17 семнадцатых +18 восемнадцатая +18 восемнадцатого +18 восемнадцатое +18 восемнадцатой +18 восемнадцатом +18 восемнадцатому +18 восемнадцатую +18 восемнадцатые +18 восемнадцатый +18 восемнадцатым +18 восемнадцатым +18 восемнадцатыми +18 восемнадцатых +19 девятнадцатая +19 девятнадцатого +19 девятнадцатое +19 девятнадцатой +19 девятнадцатом +19 девятнадцатому +19 девятнадцатую +19 девятнадцатые +19 девятнадцатый +19 девятнадцатым +19 девятнадцатым +19 девятнадцатыми +19 девятнадцатых +20 двадцатая +20 двадцатого +20 двадцатое +20 двадцатой +20 двадцатом +20 двадцатому +20 двадцатую +20 двадцатые +20 двадцатый +20 двадцатым +20 двадцатым +20 двадцатыми +20 двадцатых +30 тридцатая +30 тридцатого +30 тридцатое +30 тридцатой +30 тридцатом +30 тридцатому +30 тридцатую +30 тридцатые +30 тридцатый +30 тридцатым +30 тридцатым +30 тридцатыми +30 тридцатых +40 сороковая +40 сорокового +40 сороковое +40 сороковой +40 сороковом +40 сороковому +40 сороковую +40 сороковые +40 сороковым +40 сороковым +40 сороковыми +40 сороковых +50 пятидесятая +50 пятидесятого +50 пятидесятое +50 пятидесятой +50 пятидесятом +50 пятидесятому +50 пятидесятую +50 пятидесятые +50 пятидесятый +50 пятидесятым +50 пятидесятым +50 пятидесятыми +50 пятидесятых +60 шестидесятая +60 шестидесятого +60 шестидесятое +60 шестидесятой +60 шестидесятом +60 шестидесятому +60 шестидесятую +60 шестидесятые +60 шестидесятый +60 шестидесятым +60 шестидесятым +60 шестидесятыми +60 шестидесятых +70 семидесятая +70 семидесятого +70 семидесятое +70 семидесятой +70 семидесятом +70 семидесятому +70 семидесятую +70 семидесятые +70 семидесятый +70 семидесятым +70 семидесятым +70 семидесятыми +70 семидесятых +80 восьмидесятая +80 восьмидесятого +80 восьмидесятое +80 восьмидесятой +80 восьмидесятом +80 восьмидесятому +80 восьмидесятую +80 восьмидесятые +80 восьмидесятый +80 
восьмидесятым +80 восьмидесятым +80 восьмидесятыми +80 восьмидесятых +90 девяностая +90 девяностого +90 девяностое +90 девяностой +90 девяностом +90 девяностому +90 девяностую +90 девяностые +90 девяностый +90 девяностым +90 девяностым +90 девяностыми +90 девяностых +100 сотая +100 сотого +100 сотое +100 сотой +100 сотом +100 сотому +100 сотую +100 сотые +100 сотый +100 сотым +100 сотым +100 сотыми +100 сотых +200 двухсотая +200 двухсотого +200 двухсотое +200 двухсотой +200 двухсотом +200 двухсотому +200 двухсотую +200 двухсотые +200 двухсотый +200 двухсотым +200 двухсотым +200 двухсотыми +200 двухсотых +300 трехсотая +300 трехсотого +300 трехсотое +300 трехсотой +300 трехсотом +300 трехсотому +300 трехсотую +300 трехсотые +300 трехсотый +300 трехсотым +300 трехсотым +300 трехсотыми +300 трехсотых +400 четырехсотая +400 четырехсотого +400 четырехсотое +400 четырехсотой +400 четырехсотом +400 четырехсотому +400 четырехсотую +400 четырехсотые +400 четырехсотый +400 четырехсотым +400 четырехсотым +400 четырехсотыми +400 четырехсотых +500 пятисотая +500 пятисотого +500 пятисотое +500 пятисотой +500 пятисотом +500 пятисотому +500 пятисотую +500 пятисотые +500 пятисотый +500 пятисотым +500 пятисотым +500 пятисотыми +500 пятисотых +600 шестисотая +600 шестисотого +600 шестисотое +600 шестисотой +600 шестисотом +600 шестисотому +600 шестисотую +600 шестисотые +600 шестисотый +600 шестисотым +600 шестисотым +600 шестисотыми +600 шестисотых +700 семисотая +700 семисотого +700 семисотое +700 семисотой +700 семисотом +700 семисотому +700 семисотую +700 семисотые +700 семисотый +700 семисотым +700 семисотым +700 семисотыми +700 семисотых +800 восьмисотая +800 восьмисотого +800 восьмисотое +800 восьмисотой +800 восьмисотом +800 восьмисотому +800 восьмисотую +800 восьмисотые +800 восьмисотый +800 восьмисотым +800 восьмисотым +800 восьмисотыми +800 восьмисотых +900 девятисотая +900 девятисотого +900 девятисотое +900 девятисотой +900 девятисотом +900 девятисотому +900 девятисотую 
+900 девятисотые +900 девятисотый +900 девятисотым +900 девятисотым +900 девятисотыми +900 девятисотых +1000 тысячная +1000 тысячного +1000 тысячное +1000 тысячной +1000 тысячном +1000 тысячному +1000 тысячную +1000 тысячные +1000 тысячный +1000 тысячным +1000 тысячным +1000 тысячными +1000 тысячных +1000000 миллионная +1000000 миллионного +1000000 миллионное +1000000 миллионной +1000000 миллионном +1000000 миллионному +1000000 миллионную +1000000 миллионные +1000000 миллионный +1000000 миллионным +1000000 миллионным +1000000 миллионными +1000000 миллионных +1000000000 миллиардная +1000000000 миллиардного +1000000000 миллиардное +1000000000 миллиардной +1000000000 миллиардном +1000000000 миллиардному +1000000000 миллиардную +1000000000 миллиардные +1000000000 миллиардный +1000000000 миллиардным +1000000000 миллиардным +1000000000 миллиардными +1000000000 миллиардных diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spelled.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spelled.grm new file mode 100644 index 000000000..123759ba9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spelled.grm @@ -0,0 +1,77 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This verbalizer is used whenever there is an LM symbol that consists of +# letters immediately followed by "{spelled}". This strips the "{spelled}" +# suffix. 
+ +import 'util/byte.grm' as b; +import 'ru/classifier/cyrillic.grm' as c; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +digit = b.kDigit @ n.CARDINAL_NUMBERS; + +char_set = (("a" | "A") : "letter-a") + | (("b" | "B") : "letter-b") + | (("c" | "C") : "letter-c") + | (("d" | "D") : "letter-d") + | (("e" | "E") : "letter-e") + | (("f" | "F") : "letter-f") + | (("g" | "G") : "letter-g") + | (("h" | "H") : "letter-h") + | (("i" | "I") : "letter-i") + | (("j" | "J") : "letter-j") + | (("k" | "K") : "letter-k") + | (("l" | "L") : "letter-l") + | (("m" | "M") : "letter-m") + | (("n" | "N") : "letter-n") + | (("o" | "O") : "letter-o") + | (("p" | "P") : "letter-p") + | (("q" | "Q") : "letter-q") + | (("r" | "R") : "letter-r") + | (("s" | "S") : "letter-s") + | (("t" | "T") : "letter-t") + | (("u" | "U") : "letter-u") + | (("v" | "V") : "letter-v") + | (("w" | "W") : "letter-w") + | (("x" | "X") : "letter-x") + | (("y" | "Y") : "letter-y") + | (("z" | "Z") : "letter-z") + | (digit) + | ("&" : "@@AND@@") + | ("." : "") + | ("-" : "") + | ("_" : "") + | ("/" : "") + | (n.I["letter-"] c.kCyrillicAlpha) + ; + +ins_space = "" : " "; + +suffix = "{spelled}" : ""; + +spelled = Optimize[char_set (ins_space char_set)* suffix]; + +export SPELLED = Optimize[spelled @ l.LEXICAL_MAP]; + +sigma_star = b.kBytes*; + +# Gets rid of the letter- prefix since in some cases we don't want it. + +del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star]; + +spelled_no_tag = Optimize[char_set (ins_space char_set)*]; + +export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spoken_punct.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spoken_punct.grm new file mode 100644 index 000000000..26a1bf27f --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spoken_punct.grm @@ -0,0 +1,24 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'ru/verbalizer/lexical_map.grm' as l; + +punct = + ("." : "@@PERIOD@@") + | ("," : "@@COMMA@@") + | ("!" : "@@EXCLAMATION_MARK@@") + | ("?" : "@@QUESTION_MARK@@") +; + +export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/time.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/time.grm new file mode 100644 index 000000000..a416aba7d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/time.grm @@ -0,0 +1,108 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import 'util/byte.grm' as b; +import 'ru/verbalizer/lexical_map.grm' as l; +import 'ru/verbalizer/numbers.grm' as n; + +# Only handles 24-hour time with quarter-to, half-past and quarter-past. 
+ +increment_hour = + ("0" : "1") + | ("1" : "2") + | ("2" : "3") + | ("3" : "4") + | ("4" : "5") + | ("5" : "6") + | ("6" : "7") + | ("7" : "8") + | ("8" : "9") + | ("9" : "10") + | ("10" : "11") + | ("11" : "12") + | ("12" : "1") # If someone uses 12, we assume 12-hour by default. + | ("13" : "14") + | ("14" : "15") + | ("15" : "16") + | ("16" : "17") + | ("17" : "18") + | ("18" : "19") + | ("19" : "20") + | ("20" : "21") + | ("21" : "22") + | ("22" : "23") + | ("23" : "12") +; + +hours = Project[increment_hour, 'input']; + +d = b.kDigit; +D = d - "0"; + +minutes09 = "0" D; + +minutes = ("1" | "2" | "3" | "4" | "5") d; + +__sep__ = ":"; +sep_space = __sep__ : " "; + +verbalize_hours = hours @ n.CARDINAL_NUMBERS; + +verbalize_minutes = + ("00" : "@@HOUR@@") + | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS)) + | (minutes @ n.CARDINAL_NUMBERS) +; + +time_basic = Optimize[verbalize_hours sep_space verbalize_minutes]; + +# Special cases we handle right now. +# TODO: Need to allow for cases like +# +# half twelve (in the UK English sense) +# half twaalf (in the Dutch sense) + +time_quarter_past = + n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "] + verbalize_hours + n.D[__sep__ "15"]; + +time_half_past = + n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "] + verbalize_hours + n.D[__sep__ "30"]; + +time_quarter_to = + n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "] + (increment_hour @ verbalize_hours) + n.D[__sep__ "45"]; + +time_extra = Optimize[ + time_quarter_past | time_half_past | time_quarter_to] +; + +# Basic time periods which most languages can be expected to have. +__am__ = "a.m." | "am" | "AM" | "утра"; +__pm__ = "p.m." | "pm" | "PM" | "вечера"; + +period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@"); + +time_variants = time_basic | time_extra; + +time = Optimize[ + (period (" " | n.I[" "]))? time_variants + | time_variants ((" " | n.I[" "]) period)?] 
+; + +export TIME = Optimize[time @ l.LEXICAL_MAP]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/urls.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/urls.grm new file mode 100644 index 000000000..3039b6521 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/urls.grm @@ -0,0 +1,68 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Rules for URLs and email addresses. + +import 'util/byte.grm' as bytelib; +import 'ru/verbalizer/lexical_map.grm' as l; + +ins_space = "" : " "; +dot = "." : "@@URL_DOT_EXPRESSION@@"; +at = "@" : "@@AT@@"; + +url_suffix = + (".com" : dot ins_space "com") | + (".gov" : dot ins_space "gov") | + (".edu" : dot ins_space "e d u") | + (".org" : dot ins_space "org") | + (".net" : dot ins_space "net") +; + +letter_string = (bytelib.kAlnum)* bytelib.kAlnum; + +letter_string_dot = + ((letter_string ins_space dot ins_space)* letter_string) +; + +# Rules for URLs. +export URL = Optimize[ + ((letter_string_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +# Rules for email addresses. 
+letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum); + +letter_by_letter_dot = + ((letter_by_letter ins_space dot ins_space)* + letter_by_letter) +; + +export EMAIL1 = Optimize[ + ((letter_by_letter) (ins_space) + (at) (ins_space) + (letter_by_letter_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +export EMAIL2 = Optimize[ + ((letter_by_letter) (ins_space) + (at) (ins_space) + (letter_string_dot) (ins_space) + (url_suffix)) @ l.LEXICAL_MAP +]; + +export EMAILS = Optimize[ + EMAIL1 | EMAIL2 +]; diff --git a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/verbalizer.grm b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/verbalizer.grm new file mode 100644 index 000000000..ddd469685 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/verbalizer.grm @@ -0,0 +1,42 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import 'util/util.grm' as util; +import 'ru/verbalizer/extra_numbers.grm' as e; +import 'ru/verbalizer/float.grm' as f; +import 'ru/verbalizer/math.grm' as ma; +import 'ru/verbalizer/miscellaneous.grm' as mi; +import 'ru/verbalizer/money.grm' as mo; +import 'ru/verbalizer/numbers.grm' as n; +import 'ru/verbalizer/numbers_plus.grm' as np; +import 'ru/verbalizer/spelled.grm' as s; +import 'ru/verbalizer/spoken_punct.grm' as sp; +import 'ru/verbalizer/time.grm' as t; +import 'ru/verbalizer/urls.grm' as u; + +export VERBALIZER = Optimize[RmWeight[ + ( e.MIXED_NUMBERS + | e.DIGITS + | f.FLOAT + | ma.ARITHMETIC + | mi.MISCELLANEOUS + | mo.MONEY + | n.CARDINAL_NUMBERS + | n.ORDINAL_NUMBERS + | np.NUMBERS_PLUS + | s.SPELLED + | sp.SPOKEN_PUNCT + | t.TIME + | u.URL) @ util.CLEAN_SPACES +]]; diff --git a/third_party/chinese_text_normalization/thrax/src/universal/README.md b/third_party/chinese_text_normalization/thrax/src/universal/README.md new file mode 100644 index 000000000..33225f6da --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/universal/README.md @@ -0,0 +1,3 @@ +# Language-universal grammar definitions + +This directory contains various language-universal grammar definitions. 
diff --git a/third_party/chinese_text_normalization/thrax/src/universal/roman_numerals.tsv b/third_party/chinese_text_normalization/thrax/src/universal/roman_numerals.tsv new file mode 100644 index 000000000..98a8d97d9 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/universal/roman_numerals.tsv @@ -0,0 +1,91 @@ +i 1 +ii 2 +iii 3 +iv 4 +v 5 +vi 6 +vii 7 +viii 8 +ix 9 +x 10 +xi 11 +xii 12 +xiii 13 +xiv 14 +xv 15 +xvi 16 +xvii 17 +xviii 18 +xix 19 +xx 20 +xxi 21 +xxii 22 +xxiii 23 +xxiv 24 +xxv 25 +xxvi 26 +xxvii 27 +xxviii 28 +xxix 29 +xxx 30 +xxxi 31 +xxxii 32 +xxxiii 33 +xxxiv 34 +xxxv 35 +xxxvi 36 +xxxvii 37 +xxxviii 38 +xxxix 39 +xl 40 +xli 41 +xlii 42 +xliii 43 +xliv 44 +xlv 45 +xlvi 46 +xlvii 47 +xlviii 48 +xlix 49 +mcmxciv 1994 +mcmxcv 1995 +mcmxcvi 1996 +mcmxcvii 1997 +mcmxcviii 1998 +mcmxcix 1999 +mm 2000 +mmi 2001 +mmii 2002 +mmiii 2003 +mmiv 2004 +mmv 2005 +mmvi 2006 +mmvii 2007 +mmviii 2008 +mmix 2009 +mmx 2010 +mmxi 2011 +mmxii 2012 +mmxiii 2013 +mmxiv 2014 +mmxv 2015 +mmxvi 2016 +mmxvii 2017 +mmxviii 2018 +mmxix 2019 +mmxx 2020 +mmxxi 2021 +mmxxii 2022 +mmxxiii 2023 +mmxxiv 2024 +mmxxv 2025 +mmxxvi 2026 +mmxxvii 2027 +mmxxviii 2028 +mmxxix 2029 +mmxxx 2030 +mmxxxi 2031 +mmxxxii 2032 +mmxxxiii 2033 +mmxxxiv 2034 +mmxxxv 2035 diff --git a/third_party/chinese_text_normalization/thrax/src/universal/thousands_punct.grm b/third_party/chinese_text_normalization/thrax/src/universal/thousands_punct.grm new file mode 100644 index 000000000..90ce4a115 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/universal/thousands_punct.grm @@ -0,0 +1,126 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Specifies common ways of delimiting thousands in digit strings. + +import 'util/byte.grm' as bytelib; +import 'util/util.grm' as util; + +killcomma = "," : ""; +dot2comma = "." : ","; +spaces2comma = " "+ : ","; + +zero = "0"; + +# no_delimiter = zero | "[1-9][0-9]*"; +export no_delimiter = zero | (util.d1to9 bytelib.kDigit*); + +# delim_map_dot = ("[0-9]" | ("\." : ","))*; +delim_map_dot = (bytelib.kDigit | dot2comma)*; + +# delim_map_space = ("[0-9]" | (" +" : ","))*; +delim_map_space = (bytelib.kDigit | spaces2comma)*; + +## Western systems group thousands. Korean goes this way too. + +# comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*); +export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*); + +# ComposeFst: 1st argument cannot match on output labels and 2nd argument +# cannot match on input labels (sort?). +export dot_thousands = delim_map_dot @ comma_thousands; + +# ComposeFst: 1st argument cannot match on output labels and 2nd argument +# cannot match on input labels (sort?). +export space_thousands = delim_map_space @ comma_thousands; + +## Chinese prefers grouping by fours (by ten-thousands). + +# chinese_comma = +# zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*); +export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*); + +## The Indian system is more complex because of the Stravinskian alternation +## between lakhs and crores. 
+## +## According to Wikipedia: +## +## Indian English Value +## One 1 +## Ten 10 +## Hundred 100 +## Thousand 1,000 +## Lakh 1,00,000 +## Crore 1,00,00,000 +## Arab 1,00,00,00,000 +## Kharab 1,00,00,00,00,000 + +# indian_hundreds = "[1-9][0-9]?[0-9]?"; +indian_hundreds = util.d1to9 bytelib.kDigit{0,2}; + +## Up to 99,999. + +# indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]"; +indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3}; + +## Up to 99,99,999. + +# indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]"; +indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3}; + +## Up to 999,99,99,999 + +indian_comma_crores = + util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma + (bytelib.kDigit{2} killcomma)? + bytelib.kDigit{2} killcomma + bytelib.kDigit{3} +; + +## Up to 99,999,99,99,999. + +indian_comma_thousand_crores = + util.d1to9 bytelib.kDigit? killcomma + bytelib.kDigit{3} killcomma + bytelib.kDigit{2} killcomma + bytelib.kDigit{2} killcomma + bytelib.kDigit{3} +; + +## Up to 999,99,999,99,99,999. + +indian_comma_lakh_crores = + util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma + bytelib.kDigit{2} killcomma + bytelib.kDigit{3} killcomma + bytelib.kDigit{2} killcomma + bytelib.kDigit{2} killcomma + bytelib.kDigit{3} +; + +export indian_comma = + zero + | indian_hundreds + | indian_comma_thousands + | indian_comma_lakhs + | indian_comma_crores + | indian_comma_thousand_crores + | indian_comma_lakh_crores +; + +# Indian number system with dots. +export indian_dot_number = delim_map_dot @ indian_comma; + +# Indian number system with spaces. 
+export indian_space_number = delim_map_space @ indian_comma; diff --git a/third_party/chinese_text_normalization/thrax/src/util/README.md b/third_party/chinese_text_normalization/thrax/src/util/README.md new file mode 100644 index 000000000..9df3c8035 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/util/README.md @@ -0,0 +1,3 @@ +# Utility grammar definitions + +This directory contains various utility grammar definitions. diff --git a/third_party/chinese_text_normalization/thrax/src/util/arithmetic.grm b/third_party/chinese_text_normalization/thrax/src/util/arithmetic.grm new file mode 100644 index 000000000..b1396db8d --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/util/arithmetic.grm @@ -0,0 +1,326 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Basic arithmetic on S-expressions. Exported arithmetic transducers may either: +# +# * Support weak vigesimal addition and multiplication... +# +# (+ 20 17 +) -> 37 +# (+ 20 10 7 +) -> 37 +# (* 4 20 *) -> 80 +# +# ...or not. +# +# * Support "Germanic decade flop" addition.... +# +# (+ 8 20 +) -> 28 +# (+ 4 60 +) -> 64 +# +# ...or not. +# +# * Support multiplication where the left-hand side multiplicand is of a higher +# order than the right-hand side multiplicand. +# +# (* 1000 100) -> 100000 +# +# ...or not. 
+# +# However, modulo these exceptions, arithmetic transducers do not support +# addition that requires "carrying", or multiplication where the right-hand +# side multiplicand is not a power of ten. So this is not a *generic* +# S-expression evaluator. +# +# LEAVES is a transducer that accepts symbols in delta but deletes symbols +# in sigma - delta. So it essentially removes markup. +# +# REPEAT_FILTER is an acceptor which blocks derivations of the form +# +# (+ (* 50 1000 *) (* 4 1000 *) ...) "fifty thousand four thousand..." +# +# in languages where that is not licensed. + +import 'util/byte.grm' as b; + +# Deleter FST. +func D[expr] { + return expr : ""; +} + +delta = b.kDigit; +sigma = delta | " " | "(" | ")" | "+" | "*"; + +sigmastar = sigma*; +deltastar = delta*; + +rparen = Optimize["+)" | "*)"]; +space_or_rparen = Optimize[" " | rparen]; + +## Multiplication. + +# Generic multiplication where the RHS is a power of ten. + +del_one = Optimize[delta+ D[" 1"] "0"+]; + +test1_1 = AssertEqual["2 10" @ del_one, "20"]; +test1_2 = AssertEqual["20 10" @ del_one, "200"]; +test1_3 = AssertEqual["2 100" @ del_one, "200"]; +test1_4 = AssertEqual["20 100" @ del_one, "2000"]; +test1_5 = AssertEqual["200 100" @ del_one, "20000"]; +test1_6 = AssertEqual["2 1000" @ del_one, "2000"]; +test1_7 = AssertEqual["20 1000" @ del_one, "20000"]; +test1_8 = AssertEqual["200 1000" @ del_one, "200000"]; +test1_9 = AssertEqual["2000 1000" @ del_one, "2000000"]; + +# Generic multiplication where the RHS is a power of ten and the LHS has fewer +# trailing zeros than the RHS. +del_one_restricted = Optimize[ # e.g., "2 x 10", "2 x 100", etc. + delta D[" 1"] "0"+ | + # e.g., "20 x 100", etc. + delta{1,2} D[" 1"] "0" "0"+ | + # e.g., "200 x 1000", etc.
+ delta{2,3} D[" 1"] "0"{2} "0"+ | + delta{3,4} D[" 1"] "0"{3} "0"+ | + delta{4,5} D[" 1"] "0"{4} "0"+]; + +test2_01 = AssertEqual["2 10" @ del_one_restricted, "20"]; +test2_02 = AssertNull["20 10" @ del_one_restricted]; +test2_03 = AssertEqual["2 100" @ del_one_restricted, "200"]; +test2_04 = AssertEqual["20 100" @ del_one_restricted, "2000"]; +test2_05 = AssertNull[ "200 100" @ del_one_restricted]; +test2_06 = AssertEqual["2 1000" @ del_one_restricted, "2000"]; +test2_07 = AssertEqual["20 1000" @ del_one_restricted, "20000"]; +test2_08 = AssertEqual["200 1000" @ del_one_restricted, "200000"]; +test2_09 = AssertNull["2000 1000" @ del_one_restricted]; +test2_10 = AssertEqual["1000 10000000" @ del_one_restricted, "10000000000"]; + +# Multiplication of vigesimal base for weak vigesimal systems + +vigesimal_times_map = ("1" : "2") | ("2" : "4") | ("3" : "6") | ("4" : "8"); + +del_two = Optimize[vigesimal_times_map D[" 2"] "0"+]; + +test3_1 = AssertEqual["1 20" @ del_two, "20"]; +test3_2 = AssertEqual["2 20" @ del_two, "40"]; +test3_3 = AssertEqual["3 20" @ del_two, "60"]; +test3_4 = AssertEqual["4 20" @ del_two, "80"]; + +# Multiplication of vigesimal base restricted to cases where the LHS is [1-4] +# and the RHS is a power of ten. 
+ +del_two_restricted = Optimize[vigesimal_times_map D[" 2"] "0"+]; + +test4_1 = AssertEqual["1 20" @ del_two_restricted, "20"]; +test4_2 = AssertEqual["2 20" @ del_two_restricted, "40"]; +test4_3 = AssertEqual["3 20" @ del_two_restricted, "60"]; +test4_4 = AssertEqual["4 20" @ del_two_restricted, "80"]; +test4_5 = AssertNull["5 20" @ del_two_restricted]; +test4_6 = AssertNull["10 20" @ del_two_restricted]; + +products = del_one | del_two; +products_restricted = del_one_restricted | del_two_restricted; + +multiplication = CDRewrite[D["(* "] products D[" *)"], "", "", sigmastar]; +multiplication_restricted = CDRewrite[D["(* "] products_restricted D[" *)"], + "", "", sigmastar]; + +test5_1 = AssertEqual["(* 8 100 *)" @ multiplication, "800"]; +test5_2 = AssertEqual["(* 1 100 *)" @ multiplication, "100"]; +test5_3 = AssertEqual["(* 4 20 *)" @ multiplication, "80"]; +test5_4 = AssertEqual["(* 13 1000 *)" @ multiplication, "13000"]; +test5_5 = AssertEqual["(* 13000 10 *)" @ multiplication, "130000"]; +test5_6 = AssertEqual["(* 13000 10 *)" @ multiplication_restricted, + "(* 13000 10 *)"]; # Can't reduce this. + +## Addition. + +insum = "+" (sigma - "(")*; +rcon = insum deltastar; + +# Generic zero deletion up to 12. +del_zero = Optimize[ + # Handles lone zero inside a plus statement. + CDRewrite[D[" 0"], rcon, space_or_rparen, sigmastar] @ + # If we need to go any larger, we probably should switch to a PDT. 
+ CDRewrite[D["0"{12} " "] delta{12}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{11} " "] delta{11}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{10} " "] delta{10}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{9} " "] delta{9}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{8} " "] delta{8}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{7} " "] delta{7}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{6} " "] delta{6}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{5} " "] delta{5}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{4} " "] delta{4}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{3} " "] delta{3}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0"{2} " "] delta{2}, rcon, space_or_rparen, sigmastar] @ + CDRewrite[D["0" " "] delta, rcon, space_or_rparen, sigmastar]]; + +## Weak vigesimal cases involving scores and teens. + +vigesimal_plus_map = Optimize[("20 1" : "3") delta | + ("40 1" : "5") delta | + ("60 1" : "7") delta | + ("80 1" : "9") delta]; + +vigesimal = CDRewrite[vigesimal_plus_map, insum, space_or_rparen, sigmastar]; + +## Germanic decade flop. + +germanic_map = StringFile['util/germanic.tsv']; + +germanic = CDRewrite[germanic_map, insum, space_or_rparen, sigmastar]; + +sums = Optimize[germanic @ vigesimal @ del_zero]; + +# Deletes the surrounding "(+ +)" around a successful reduction. 
+ +del_plus = CDRewrite[D["(+ "] delta+ D[" +)"], "", "", sigmastar]; + +addition = Optimize[sums @ del_plus]; + +test6_1 = AssertEqual["(+ 30 2 +)" @ addition, "32"]; +test6_2 = AssertEqual["(+ 300 20 1 +)" @ addition, "321"]; +test6_3 = AssertEqual["(+ 80 17 +)" @ addition, "97"]; +test6_4 = AssertEqual["(+ 4 50 +)" @ addition, "54"]; +test6_5 = AssertEqual["(+ 3000 80 17 +)" @ addition, "3097"]; +test6_6 = AssertEqual["(+ 3000 4 50 +)" @ addition, "3054"]; +test6_7 = AssertEqual["(+ 0 10 +)" @ addition, "10"]; +test6_8 = AssertEqual["(+ 0 20 +)" @ addition, "20"]; +test6_9 = AssertEqual["(+ 200 (+ 0 20 +) +)" @ addition @ addition, "220"]; + +## Export statements. + +export ARITHMETIC = Optimize[multiplication @ addition]; +export ARITHMETIC_RESTRICTED = Optimize[multiplication_restricted @ addition]; + +# Lightweight versions that lack the vigesimal /vɪˈdʒɛsɪməl/ or Germanic decade +# flop, or both. + +export ARITHMETIC_BASIC = Optimize[multiplication @ del_zero @ del_plus]; +export ARITHMETIC_BASIC_RESTRICTED = Optimize[multiplication_restricted @ + del_zero @ del_plus]; + +export ARITHMETIC_GERMANIC = Optimize[multiplication @ germanic @ del_zero @ + del_plus]; + +export ARITHMETIC_GERMANIC_RESTRICTED = Optimize[multiplication_restricted @ + germanic @ del_zero @ + del_plus]; + +export ARITHMETIC_VIGESIMAL = Optimize[multiplication @ vigesimal @ del_zero @ + del_plus]; +export ARITHMETIC_VIGESIMAL_RESTRICTED = Optimize[multiplication_restricted @ + vigesimal @ del_zero @ + del_plus]; + +## LEAVES transducer. + +nonterm = "+" | "*"; +export LEAVES = Optimize[CDRewrite["(" nonterm " " | " " nonterm ")" : "", + "", "", sigmastar]]; + +test7 = AssertEqual["(* (+ (* 4 20 *) 10 7 +) 1000 *)" @ LEAVES, + "4 20 10 7 1000"]; + +## Optional filter for repeated large powers of ten, to be applied to leaves. 
+
+# Returns the acceptor for every string of sigstar that does NOT contain expr
+# as a substring.
+func Filter[expr, sigstar] {
+  return Optimize[sigstar - (sigstar expr sigstar)];
+}
+
+# Returns the acceptor for every string of sigstar that does not contain two
+# occurrences of expr where the first is followed by a space and the second
+# comes either directly after that space or after further material ending in
+# a space.
+func FilterMoreThanOne[expr, sigstar] {
+  return Filter[expr " " (sigstar " ")? expr, sigstar];
+}
+
+# Alphabet of the leaf strings: delta symbols and spaces.
+filter_sigstar = (delta | " ")*;
+
+# Rejects leaf strings in which any one of the listed powers of ten occurs
+# more than once.
+export REPEAT_FILTER =
+  Optimize[FilterMoreThanOne["1000", filter_sigstar] @
+           FilterMoreThanOne["10000", filter_sigstar] @
+           FilterMoreThanOne["100000", filter_sigstar] @
+           FilterMoreThanOne["1000000", filter_sigstar] @
+           FilterMoreThanOne["1000000000", filter_sigstar] @
+           FilterMoreThanOne["1000000000000", filter_sigstar]];
+
+test8_1 = AssertNull["50 1000 4 1000" @ REPEAT_FILTER];
+test8_2 = AssertNull["50 1000000 4 1000000" @ REPEAT_FILTER];
+test8_3 = AssertEqual["50 100 1000" @ REPEAT_FILTER, "50 100 1000"];
+test8_4 = AssertNull["20 1000 1000 20" @ REPEAT_FILTER];
+# The expected side is the literal input string, as in test8_3. Comparing
+# the filtered input against itself would not detect the filter wrongly
+# rejecting this string, in which "1000" and "1000000" each occur only once.
+test8_5 = AssertEqual[
+  "70 1000000 400 0 70 0 7 1000 100 0 70" @ REPEAT_FILTER,
+  "70 1000000 400 0 70 0 7 1000 100 0 70"];
+test8_6 = AssertNull[
+  "70 1000000 400 0 70 1000 0 7 1000 100 0 70" @ REPEAT_FILTER];
+
+# Filters to force the output of *inverting* the arithmetic as applied to a
+# digit string to be a well-formed sexpr:
+
+not_space = b.kNotSpace;
+
+# Things like (+ 1 +)(+ 9 +).
+
+# Any string in which ")" is directly followed by a non-space symbol, or "("
+# is directly preceded by one.
+bad_parens =
+    sigmastar ")" not_space sigmastar
+  | sigmastar not_space "(" sigmastar
+;
+
+# Complement of bad_parens within sigmastar.
+no_bad_parens = sigmastar - bad_parens;
+
+# Things like (+ 1 +) or (* 3 *).
+
+# Any string containing a "+" or "*" node whose entire content is a single run
+# of delta symbols, i.e. an operator applied to just one operand.
+spurious_operators =
+    sigmastar "(+ " delta+ " +)" sigmastar
+  | sigmastar "(* " delta+ " *)" sigmastar
+;
+
+# Complement of spurious_operators within sigmastar.
+no_spurious_operators = sigmastar - spurious_operators;
+
+# Rejects strings containing two consecutive space-delimited runs of zeros.
+no_strings_of_zeros =
+  sigmastar - (sigmastar " " "0"+ " " "0"+ " " sigmastar)
+;
+
+# Both well-formedness constraints on the body of an s-expression.
+no_bad_sequences =
+  Optimize[no_bad_parens @ no_strings_of_zeros]
+;
+
+# Accepts either a bare run of delta symbols or a single outer "(* ... *)" or
+# "(+ ... +)" whose body passes no_bad_sequences; the result is then
+# restricted by no_spurious_operators.
+export SEXP_FILTER = Optimize[
+  ( delta+
+  | "(* " no_bad_sequences " *)"
+  | "(+ " no_bad_sequences " +)") @ no_spurious_operators]
+;
+
+# For convenience adds inverses of the arithmetic rules:
+
+export IARITHMETIC = Invert[ARITHMETIC];
+
+export IARITHMETIC_RESTRICTED = Invert[ARITHMETIC_RESTRICTED];
+
+export IARITHMETIC_BASIC = Invert[ARITHMETIC_BASIC];
+
+export IARITHMETIC_BASIC_RESTRICTED = Invert[ARITHMETIC_BASIC_RESTRICTED];
+
+export IARITHMETIC_GERMANIC = Invert[ARITHMETIC_GERMANIC];
+
+export IARITHMETIC_GERMANIC_RESTRICTED =
+  Invert[ARITHMETIC_GERMANIC_RESTRICTED]
+;
+
+export IARITHMETIC_VIGESIMAL = Invert[ARITHMETIC_VIGESIMAL];
+
+export IARITHMETIC_VIGESIMAL_RESTRICTED =
+  Invert[ARITHMETIC_VIGESIMAL_RESTRICTED]
+;
+
+## This should be applied on the lefthand side of FG to ensure that only
+## digit input is permitted.
+export DELTA_STAR = deltastar;
diff --git a/third_party/chinese_text_normalization/thrax/src/util/byte.grm b/third_party/chinese_text_normalization/thrax/src/util/byte.grm
new file mode 100644
index 000000000..32e6ead75
--- /dev/null
+++ b/third_party/chinese_text_normalization/thrax/src/util/byte.grm
@@ -0,0 +1,75 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Standard constants for ASCII (byte) based strings. This mirrors the
+# functions provided by C/C++'s ctype.h library.
+
+# Every single byte value from 1 to 255, written in bracketed numeric form.
+# Note that [0] is missing; matching the string-termination character is kinda weird.
+export kBytes = Optimize[
+  "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" |
+  "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" |
+  "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" |
+  "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" |
+  "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" |
+  "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" |
+  "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" |
+  "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" |
+  "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" |
+  "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" |
+"[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
+"[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
+"[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
+"[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
+"[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
+"[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
+"[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
+"[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
+"[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
+"[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
+"[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
+"[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
+"[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
+"[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
+"[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
+"[251]" | "[252]" | "[253]" | "[254]" | "[255]"
+];
+
+# The ten ASCII decimal digits.
+export kDigit = Optimize[
+  "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
+];
+
+# The 26 ASCII lowercase letters.
+export kLower = Optimize[
+  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
+  "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
+];
+# The 26 ASCII uppercase letters.
+export kUpper = Optimize[
+  "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
+  "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
+];
+# Any ASCII letter, lower or upper case.
+export kAlpha = Optimize[kLower | kUpper];
+
+# Any ASCII letter or digit.
+export kAlnum = Optimize[kDigit | kAlpha];
+
+# Space, tab, newline and carriage return.
+export kSpace = Optimize[
+  " " | "\t" | "\n" | "\r"
+];
+# Any byte of kBytes that is not one of the kSpace characters.
+export kNotSpace = Optimize[kBytes - kSpace];
+
+# ASCII punctuation. The double quote, square brackets and backslash are
+# written with a backslash escape.
+export kPunct = Optimize[
+  "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
+  "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
+  "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
+];
+
+# Alphanumerics plus punctuation.
+export kGraph = Optimize[kAlnum | kPunct];
diff --git a/third_party/chinese_text_normalization/thrax/src/util/case.grm b/third_party/chinese_text_normalization/thrax/src/util/case.grm
new file mode 100644
index 000000000..ff10354b7
--- /dev/null
+++ b/third_party/chinese_text_normalization/thrax/src/util/case.grm
@@ -0,0 +1,3383 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Case-conversion functions.
+ +import 'util/byte.grm' as bytelib; + +export UPPER = + "A" + | "B" + | "C" + | "D" + | "E" + | "F" + | "G" + | "H" + | "I" + | "J" + | "K" + | "L" + | "M" + | "N" + | "O" + | "P" + | "Q" + | "R" + | "S" + | "T" + | "U" + | "V" + | "W" + | "X" + | "Y" + | "Z" + | "À" + | "Á" + | "Â" + | "Ã" + | "Ä" + | "Å" + | "Æ" + | "Ç" + | "È" + | "É" + | "Ê" + | "Ë" + | "Ì" + | "Í" + | "Î" + | "Ï" + | "Ð" + | "Ñ" + | "Ò" + | "Ó" + | "Ô" + | "Õ" + | "Ö" + | "Ø" + | "Ù" + | "Ú" + | "Û" + | "Ü" + | "Ý" + | "Þ" + | "Ā" + | "Ă" + | "Ą" + | "Ć" + | "Ĉ" + | "Ċ" + | "Č" + | "Ď" + | "Đ" + | "Ē" + | "Ĕ" + | "Ė" + | "Ę" + | "Ě" + | "Ĝ" + | "Ğ" + | "Ġ" + | "Ģ" + | "Ĥ" + | "Ħ" + | "Ĩ" + | "Ī" + | "Ĭ" + | "Į" + | "İ" + | "IJ" + | "Ĵ" + | "Ķ" + | "Ĺ" + | "Ļ" + | "Ľ" + | "Ŀ" + | "Ł" + | "Ń" + | "Ņ" + | "Ň" + | "Ŋ" + | "Ō" + | "Ŏ" + | "Ő" + | "Œ" + | "Ŕ" + | "Ŗ" + | "Ř" + | "Ś" + | "Ŝ" + | "Ş" + | "Š" + | "Ţ" + | "Ť" + | "Ŧ" + | "Ũ" + | "Ū" + | "Ŭ" + | "Ů" + | "Ű" + | "Ų" + | "Ŵ" + | "Ŷ" + | "Ÿ" + | "Ź" + | "Ż" + | "Ž" + | "Ɓ" + | "Ƃ" + | "Ƅ" + | "Ɔ" + | "Ƈ" + | "Ɖ" + | "Ɗ" + | "Ƌ" + | "Ǝ" + | "Ə" + | "Ɛ" + | "Ƒ" + | "Ɠ" + | "Ɣ" + | "Ɩ" + | "Ɨ" + | "Ƙ" + | "Ɯ" + | "Ɲ" + | "Ɵ" + | "Ơ" + | "Ƣ" + | "Ƥ" + | "Ƨ" + | "Ʃ" + | "Ƭ" + | "Ʈ" + | "Ư" + | "Ʊ" + | "Ʋ" + | "Ƴ" + | "Ƶ" + | "Ʒ" + | "Ƹ" + | "Ƽ" + | "DŽ" + | "Dž" + | "LJ" + | "Lj" + | "NJ" + | "Nj" + | "Ǎ" + | "Ǐ" + | "Ǒ" + | "Ǔ" + | "Ǖ" + | "Ǘ" + | "Ǚ" + | "Ǜ" + | "Ǟ" + | "Ǡ" + | "Ǣ" + | "Ǥ" + | "Ǧ" + | "Ǩ" + | "Ǫ" + | "Ǭ" + | "Ǯ" + | "DZ" + | "Dz" + | "Ǵ" + | "Ƕ" + | "Ƿ" + | "Ǹ" + | "Ǻ" + | "Ǽ" + | "Ǿ" + | "Ȁ" + | "Ȃ" + | "Ȅ" + | "Ȇ" + | "Ȉ" + | "Ȋ" + | "Ȍ" + | "Ȏ" + | "Ȑ" + | "Ȓ" + | "Ȕ" + | "Ȗ" + | "Ș" + | "Ț" + | "Ȝ" + | "Ȟ" + | "Ƞ" + | "Ȣ" + | "Ȥ" + | "Ȧ" + | "Ȩ" + | "Ȫ" + | "Ȭ" + | "Ȯ" + | "Ȱ" + | "Ȳ" + | "Ȼ" + | "Ƚ" + | "Ɂ" + | "Ά" + | "Έ" + | "Ή" + | "Ί" + | "Ό" + | "Ύ" + | "Ώ" + | "Α" + | "Β" + | "Γ" + | "Δ" + | "Ε" + | "Ζ" + | "Η" + | "Θ" + | "Ι" + | "Κ" + | "Λ" + | "Μ" + | "Ν" + | "Ξ" + | "Ο" + | "Π" + | "Ρ" + | "Σ" + | "Τ" + | "Υ" 
+ | "Φ" + | "Χ" + | "Ψ" + | "Ω" + | "Ϊ" + | "Ϋ" + | "Ϣ" + | "Ϥ" + | "Ϧ" + | "Ϩ" + | "Ϫ" + | "Ϭ" + | "Ϯ" + | "ϴ" + | "Ϸ" + | "Ϲ" + | "Ϻ" + | "Ѐ" + | "Ё" + | "Ђ" + | "Ѓ" + | "Є" + | "Ѕ" + | "І" + | "Ї" + | "Ј" + | "Љ" + | "Њ" + | "Ћ" + | "Ќ" + | "Ѝ" + | "Ў" + | "Џ" + | "А" + | "Б" + | "В" + | "Г" + | "Д" + | "Е" + | "Ж" + | "З" + | "И" + | "Й" + | "К" + | "Л" + | "М" + | "Н" + | "О" + | "П" + | "Р" + | "С" + | "Т" + | "У" + | "Ф" + | "Х" + | "Ц" + | "Ч" + | "Ш" + | "Щ" + | "Ъ" + | "Ы" + | "Ь" + | "Э" + | "Ю" + | "Я" + | "Ѡ" + | "Ѣ" + | "Ѥ" + | "Ѧ" + | "Ѩ" + | "Ѫ" + | "Ѭ" + | "Ѯ" + | "Ѱ" + | "Ѳ" + | "Ѵ" + | "Ѷ" + | "Ѹ" + | "Ѻ" + | "Ѽ" + | "Ѿ" + | "Ҁ" + | "Ҋ" + | "Ҍ" + | "Ҏ" + | "Ґ" + | "Ғ" + | "Ҕ" + | "Җ" + | "Ҙ" + | "Қ" + | "Ҝ" + | "Ҟ" + | "Ҡ" + | "Ң" + | "Ҥ" + | "Ҧ" + | "Ҩ" + | "Ҫ" + | "Ҭ" + | "Ү" + | "Ұ" + | "Ҳ" + | "Ҵ" + | "Ҷ" + | "Ҹ" + | "Һ" + | "Ҽ" + | "Ҿ" + | "Ӂ" + | "Ӄ" + | "Ӆ" + | "Ӈ" + | "Ӊ" + | "Ӌ" + | "Ӎ" + | "Ӑ" + | "Ӓ" + | "Ӕ" + | "Ӗ" + | "Ә" + | "Ӛ" + | "Ӝ" + | "Ӟ" + | "Ӡ" + | "Ӣ" + | "Ӥ" + | "Ӧ" + | "Ө" + | "Ӫ" + | "Ӭ" + | "Ӯ" + | "Ӱ" + | "Ӳ" + | "Ӵ" + | "Ӷ" + | "Ӹ" + | "Ԁ" + | "Ԃ" + | "Ԅ" + | "Ԇ" + | "Ԉ" + | "Ԋ" + | "Ԍ" + | "Ԏ" + | "Ա" + | "Բ" + | "Գ" + | "Դ" + | "Ե" + | "Զ" + | "Է" + | "Ը" + | "Թ" + | "Ժ" + | "Ի" + | "Լ" + | "Խ" + | "Ծ" + | "Կ" + | "Հ" + | "Ձ" + | "Ղ" + | "Ճ" + | "Մ" + | "Յ" + | "Ն" + | "Շ" + | "Ո" + | "Չ" + | "Պ" + | "Ջ" + | "Ռ" + | "Ս" + | "Վ" + | "Տ" + | "Ր" + | "Ց" + | "Ւ" + | "Փ" + | "Ք" + | "Օ" + | "Ֆ" + | "Ⴀ" + | "Ⴁ" + | "Ⴂ" + | "Ⴃ" + | "Ⴄ" + | "Ⴅ" + | "Ⴆ" + | "Ⴇ" + | "Ⴈ" + | "Ⴉ" + | "Ⴊ" + | "Ⴋ" + | "Ⴌ" + | "Ⴍ" + | "Ⴎ" + | "Ⴏ" + | "Ⴐ" + | "Ⴑ" + | "Ⴒ" + | "Ⴓ" + | "Ⴔ" + | "Ⴕ" + | "Ⴖ" + | "Ⴗ" + | "Ⴘ" + | "Ⴙ" + | "Ⴚ" + | "Ⴛ" + | "Ⴜ" + | "Ⴝ" + | "Ⴞ" + | "Ⴟ" + | "Ⴠ" + | "Ⴡ" + | "Ⴢ" + | "Ⴣ" + | "Ⴤ" + | "Ⴥ" + | "Ḁ" + | "Ḃ" + | "Ḅ" + | "Ḇ" + | "Ḉ" + | "Ḋ" + | "Ḍ" + | "Ḏ" + | "Ḑ" + | "Ḓ" + | "Ḕ" + | "Ḗ" + | "Ḙ" + | "Ḛ" + | "Ḝ" + | "Ḟ" + | "Ḡ" + | "Ḣ" + | "Ḥ" + | "Ḧ" + | "Ḩ" + | "Ḫ" + | "Ḭ" + | "Ḯ" + | "Ḱ" + | "Ḳ" + | "Ḵ" + | "Ḷ" + | "Ḹ" 
+ | "Ḻ" + | "Ḽ" + | "Ḿ" + | "Ṁ" + | "Ṃ" + | "Ṅ" + | "Ṇ" + | "Ṉ" + | "Ṋ" + | "Ṍ" + | "Ṏ" + | "Ṑ" + | "Ṓ" + | "Ṕ" + | "Ṗ" + | "Ṙ" + | "Ṛ" + | "Ṝ" + | "Ṟ" + | "Ṡ" + | "Ṣ" + | "Ṥ" + | "Ṧ" + | "Ṩ" + | "Ṫ" + | "Ṭ" + | "Ṯ" + | "Ṱ" + | "Ṳ" + | "Ṵ" + | "Ṷ" + | "Ṹ" + | "Ṻ" + | "Ṽ" + | "Ṿ" + | "Ẁ" + | "Ẃ" + | "Ẅ" + | "Ẇ" + | "Ẉ" + | "Ẋ" + | "Ẍ" + | "Ẏ" + | "Ẑ" + | "Ẓ" + | "Ẕ" + | "Ạ" + | "Ả" + | "Ấ" + | "Ầ" + | "Ẩ" + | "Ẫ" + | "Ậ" + | "Ắ" + | "Ằ" + | "Ẳ" + | "Ẵ" + | "Ặ" + | "Ẹ" + | "Ẻ" + | "Ẽ" + | "Ế" + | "Ề" + | "Ể" + | "Ễ" + | "Ệ" + | "Ỉ" + | "Ị" + | "Ọ" + | "Ỏ" + | "Ố" + | "Ồ" + | "Ổ" + | "Ỗ" + | "Ộ" + | "Ớ" + | "Ờ" + | "Ở" + | "Ỡ" + | "Ợ" + | "Ụ" + | "Ủ" + | "Ứ" + | "Ừ" + | "Ử" + | "Ữ" + | "Ự" + | "Ỳ" + | "Ỵ" + | "Ỷ" + | "Ỹ" + | "Ἀ" + | "Ἁ" + | "Ἂ" + | "Ἃ" + | "Ἄ" + | "Ἅ" + | "Ἆ" + | "Ἇ" + | "Ἐ" + | "Ἑ" + | "Ἒ" + | "Ἓ" + | "Ἔ" + | "Ἕ" + | "Ἠ" + | "Ἡ" + | "Ἢ" + | "Ἣ" + | "Ἤ" + | "Ἥ" + | "Ἦ" + | "Ἧ" + | "Ἰ" + | "Ἱ" + | "Ἲ" + | "Ἳ" + | "Ἴ" + | "Ἵ" + | "Ἶ" + | "Ἷ" + | "Ὀ" + | "Ὁ" + | "Ὂ" + | "Ὃ" + | "Ὄ" + | "Ὅ" + | "Ὑ" + | "Ὓ" + | "Ὕ" + | "Ὗ" + | "Ὠ" + | "Ὡ" + | "Ὢ" + | "Ὣ" + | "Ὤ" + | "Ὥ" + | "Ὦ" + | "Ὧ" + | "ᾈ" + | "ᾉ" + | "ᾊ" + | "ᾋ" + | "ᾌ" + | "ᾍ" + | "ᾎ" + | "ᾏ" + | "ᾘ" + | "ᾙ" + | "ᾚ" + | "ᾛ" + | "ᾜ" + | "ᾝ" + | "ᾞ" + | "ᾟ" + | "ᾨ" + | "ᾩ" + | "ᾪ" + | "ᾫ" + | "ᾬ" + | "ᾭ" + | "ᾮ" + | "ᾯ" + | "Ᾰ" + | "Ᾱ" + | "Ὰ" + | "Ά" + | "ᾼ" + | "Ὲ" + | "Έ" + | "Ὴ" + | "Ή" + | "ῌ" + | "Ῐ" + | "Ῑ" + | "Ὶ" + | "Ί" + | "Ῠ" + | "Ῡ" + | "Ὺ" + | "Ύ" + | "Ῥ" + | "Ὸ" + | "Ό" + | "Ὼ" + | "Ώ" + | "ῼ" + | "Ⓐ" + | "Ⓑ" + | "Ⓒ" + | "Ⓓ" + | "Ⓔ" + | "Ⓕ" + | "Ⓖ" + | "Ⓗ" + | "Ⓘ" + | "Ⓙ" + | "Ⓚ" + | "Ⓛ" + | "Ⓜ" + | "Ⓝ" + | "Ⓞ" + | "Ⓟ" + | "Ⓠ" + | "Ⓡ" + | "Ⓢ" + | "Ⓣ" + | "Ⓤ" + | "Ⓥ" + | "Ⓦ" + | "Ⓧ" + | "Ⓨ" + | "Ⓩ" + | "Ⰰ" + | "Ⰱ" + | "Ⰲ" + | "Ⰳ" + | "Ⰴ" + | "Ⰵ" + | "Ⰶ" + | "Ⰷ" + | "Ⰸ" + | "Ⰹ" + | "Ⰺ" + | "Ⰻ" + | "Ⰼ" + | "Ⰽ" + | "Ⰾ" + | "Ⰿ" + | "Ⱀ" + | "Ⱁ" + | "Ⱂ" + | "Ⱃ" + | "Ⱄ" + | "Ⱅ" + | "Ⱆ" + | "Ⱇ" + | "Ⱈ" + | "Ⱉ" + | "Ⱊ" + | "Ⱋ" + | "Ⱌ" + | "Ⱍ" + | "Ⱎ" + | "Ⱏ" + | "Ⱐ" + | "Ⱑ" + | "Ⱒ" + | "Ⱓ" + | "Ⱔ" 
+ | "Ⱕ" + | "Ⱖ" + | "Ⱗ" + | "Ⱘ" + | "Ⱙ" + | "Ⱚ" + | "Ⱛ" + | "Ⱜ" + | "Ⱝ" + | "Ⱞ" + | "Ⲁ" + | "Ⲃ" + | "Ⲅ" + | "Ⲇ" + | "Ⲉ" + | "Ⲋ" + | "Ⲍ" + | "Ⲏ" + | "Ⲑ" + | "Ⲓ" + | "Ⲕ" + | "Ⲗ" + | "Ⲙ" + | "Ⲛ" + | "Ⲝ" + | "Ⲟ" + | "Ⲡ" + | "Ⲣ" + | "Ⲥ" + | "Ⲧ" + | "Ⲩ" + | "Ⲫ" + | "Ⲭ" + | "Ⲯ" + | "Ⲱ" + | "Ⲳ" + | "Ⲵ" + | "Ⲷ" + | "Ⲹ" + | "Ⲻ" + | "Ⲽ" + | "Ⲿ" + | "Ⳁ" + | "Ⳃ" + | "Ⳅ" + | "Ⳇ" + | "Ⳉ" + | "Ⳋ" + | "Ⳍ" + | "Ⳏ" + | "Ⳑ" + | "Ⳓ" + | "Ⳕ" + | "Ⳗ" + | "Ⳙ" + | "Ⳛ" + | "Ⳝ" + | "Ⳟ" + | "Ⳡ" + | "Ⳣ" + | "A" + | "B" + | "C" + | "D" + | "E" + | "F" + | "G" + | "H" + | "I" + | "J" + | "K" + | "L" + | "M" + | "N" + | "O" + | "P" + | "Q" + | "R" + | "S" + | "T" + | "U" + | "V" + | "W" + | "X" + | "Y" + | "Z" +; + +export LOWER = + "a" + | "b" + | "c" + | "d" + | "e" + | "f" + | "g" + | "h" + | "i" + | "j" + | "k" + | "l" + | "m" + | "n" + | "o" + | "p" + | "q" + | "r" + | "s" + | "t" + | "u" + | "v" + | "w" + | "x" + | "y" + | "z" + | "à" + | "á" + | "â" + | "ã" + | "ä" + | "å" + | "æ" + | "ç" + | "è" + | "é" + | "ê" + | "ë" + | "ì" + | "í" + | "î" + | "ï" + | "ð" + | "ñ" + | "ò" + | "ó" + | "ô" + | "õ" + | "ö" + | "ø" + | "ù" + | "ú" + | "û" + | "ü" + | "ý" + | "þ" + | "ā" + | "ă" + | "ą" + | "ć" + | "ĉ" + | "ċ" + | "č" + | "ď" + | "đ" + | "ē" + | "ĕ" + | "ė" + | "ę" + | "ě" + | "ĝ" + | "ğ" + | "ġ" + | "ģ" + | "ĥ" + | "ħ" + | "ĩ" + | "ī" + | "ĭ" + | "į" + | "i" + | "ij" + | "ĵ" + | "ķ" + | "ĺ" + | "ļ" + | "ľ" + | "ŀ" + | "ł" + | "ń" + | "ņ" + | "ň" + | "ŋ" + | "ō" + | "ŏ" + | "ő" + | "œ" + | "ŕ" + | "ŗ" + | "ř" + | "ś" + | "ŝ" + | "ş" + | "ß" + | "š" + | "ţ" + | "ť" + | "ŧ" + | "ũ" + | "ū" + | "ŭ" + | "ů" + | "ű" + | "ų" + | "ŵ" + | "ŷ" + | "ÿ" + | "ź" + | "ż" + | "ž" + | "ɓ" + | "ƃ" + | "ƅ" + | "ɔ" + | "ƈ" + | "ɖ" + | "ɗ" + | "ƌ" + | "ǝ" + | "ə" + | "ɛ" + | "ƒ" + | "ɠ" + | "ɣ" + | "ɩ" + | "ɨ" + | "ƙ" + | "ɯ" + | "ɲ" + | "ɵ" + | "ơ" + | "ƣ" + | "ƥ" + | "ƨ" + | "ʃ" + | "ƭ" + | "ʈ" + | "ư" + | "ʊ" + | "ʋ" + | "ƴ" + | "ƶ" + | "ʒ" + | "ƹ" + | "ƽ" + | "dž" + | "dž" + | "lj" + | "lj" + | "nj" + | 
"nj" + | "ǎ" + | "ǐ" + | "ǒ" + | "ǔ" + | "ǖ" + | "ǘ" + | "ǚ" + | "ǜ" + | "ǟ" + | "ǡ" + | "ǣ" + | "ǥ" + | "ǧ" + | "ǩ" + | "ǫ" + | "ǭ" + | "ǯ" + | "dz" + | "dz" + | "ǵ" + | "ƕ" + | "ƿ" + | "ǹ" + | "ǻ" + | "ǽ" + | "ǿ" + | "ȁ" + | "ȃ" + | "ȅ" + | "ȇ" + | "ȉ" + | "ȋ" + | "ȍ" + | "ȏ" + | "ȑ" + | "ȓ" + | "ȕ" + | "ȗ" + | "ș" + | "ț" + | "ȝ" + | "ȟ" + | "ƞ" + | "ȣ" + | "ȥ" + | "ȧ" + | "ȩ" + | "ȫ" + | "ȭ" + | "ȯ" + | "ȱ" + | "ȳ" + | "ȼ" + | "ƚ" + | "ʔ" + | "ά" + | "έ" + | "ή" + | "ί" + | "ό" + | "ύ" + | "ώ" + | "α" + | "β" + | "γ" + | "δ" + | "ε" + | "ζ" + | "η" + | "θ" + | "ι" + | "κ" + | "λ" + | "μ" + | "ν" + | "ξ" + | "ο" + | "π" + | "ρ" + | "σ" + | "ς" + | "τ" + | "υ" + | "φ" + | "χ" + | "ψ" + | "ω" + | "ϊ" + | "ϋ" + | "ϣ" + | "ϥ" + | "ϧ" + | "ϩ" + | "ϫ" + | "ϭ" + | "ϯ" + | "θ" + | "ϸ" + | "ϲ" + | "ϻ" + | "ѐ" + | "ё" + | "ђ" + | "ѓ" + | "є" + | "ѕ" + | "і" + | "ї" + | "ј" + | "љ" + | "њ" + | "ћ" + | "ќ" + | "ѝ" + | "ў" + | "џ" + | "а" + | "б" + | "в" + | "г" + | "д" + | "е" + | "ж" + | "з" + | "и" + | "й" + | "к" + | "л" + | "м" + | "н" + | "о" + | "п" + | "р" + | "с" + | "т" + | "у" + | "ф" + | "х" + | "ц" + | "ч" + | "ш" + | "щ" + | "ъ" + | "ы" + | "ь" + | "э" + | "ю" + | "я" + | "ѡ" + | "ѣ" + | "ѥ" + | "ѧ" + | "ѩ" + | "ѫ" + | "ѭ" + | "ѯ" + | "ѱ" + | "ѳ" + | "ѵ" + | "ѷ" + | "ѹ" + | "ѻ" + | "ѽ" + | "ѿ" + | "ҁ" + | "ҋ" + | "ҍ" + | "ҏ" + | "ґ" + | "ғ" + | "ҕ" + | "җ" + | "ҙ" + | "қ" + | "ҝ" + | "ҟ" + | "ҡ" + | "ң" + | "ҥ" + | "ҧ" + | "ҩ" + | "ҫ" + | "ҭ" + | "ү" + | "ұ" + | "ҳ" + | "ҵ" + | "ҷ" + | "ҹ" + | "һ" + | "ҽ" + | "ҿ" + | "ӂ" + | "ӄ" + | "ӆ" + | "ӈ" + | "ӊ" + | "ӌ" + | "ӎ" + | "ӑ" + | "ӓ" + | "ӕ" + | "ӗ" + | "ә" + | "ӛ" + | "ӝ" + | "ӟ" + | "ӡ" + | "ӣ" + | "ӥ" + | "ӧ" + | "ө" + | "ӫ" + | "ӭ" + | "ӯ" + | "ӱ" + | "ӳ" + | "ӵ" + | "ӷ" + | "ӹ" + | "ԁ" + | "ԃ" + | "ԅ" + | "ԇ" + | "ԉ" + | "ԋ" + | "ԍ" + | "ԏ" + | "ա" + | "բ" + | "գ" + | "դ" + | "ե" + | "զ" + | "է" + | "ը" + | "թ" + | "ժ" + | "ի" + | "լ" + | "խ" + | "ծ" + | "կ" + | "հ" + | "ձ" + | "ղ" + | "ճ" + | "մ" + | "յ" 
+ | "ն" + | "շ" + | "ո" + | "չ" + | "պ" + | "ջ" + | "ռ" + | "ս" + | "վ" + | "տ" + | "ր" + | "ց" + | "ւ" + | "փ" + | "ք" + | "օ" + | "ֆ" + | "ⴀ" + | "ⴁ" + | "ⴂ" + | "ⴃ" + | "ⴄ" + | "ⴅ" + | "ⴆ" + | "ⴇ" + | "ⴈ" + | "ⴉ" + | "ⴊ" + | "ⴋ" + | "ⴌ" + | "ⴍ" + | "ⴎ" + | "ⴏ" + | "ⴐ" + | "ⴑ" + | "ⴒ" + | "ⴓ" + | "ⴔ" + | "ⴕ" + | "ⴖ" + | "ⴗ" + | "ⴘ" + | "ⴙ" + | "ⴚ" + | "ⴛ" + | "ⴜ" + | "ⴝ" + | "ⴞ" + | "ⴟ" + | "ⴠ" + | "ⴡ" + | "ⴢ" + | "ⴣ" + | "ⴤ" + | "ⴥ" + | "ḁ" + | "ḃ" + | "ḅ" + | "ḇ" + | "ḉ" + | "ḋ" + | "ḍ" + | "ḏ" + | "ḑ" + | "ḓ" + | "ḕ" + | "ḗ" + | "ḙ" + | "ḛ" + | "ḝ" + | "ḟ" + | "ḡ" + | "ḣ" + | "ḥ" + | "ḧ" + | "ḩ" + | "ḫ" + | "ḭ" + | "ḯ" + | "ḱ" + | "ḳ" + | "ḵ" + | "ḷ" + | "ḹ" + | "ḻ" + | "ḽ" + | "ḿ" + | "ṁ" + | "ṃ" + | "ṅ" + | "ṇ" + | "ṉ" + | "ṋ" + | "ṍ" + | "ṏ" + | "ṑ" + | "ṓ" + | "ṕ" + | "ṗ" + | "ṙ" + | "ṛ" + | "ṝ" + | "ṟ" + | "ṡ" + | "ṣ" + | "ṥ" + | "ṧ" + | "ṩ" + | "ṫ" + | "ṭ" + | "ṯ" + | "ṱ" + | "ṳ" + | "ṵ" + | "ṷ" + | "ṹ" + | "ṻ" + | "ṽ" + | "ṿ" + | "ẁ" + | "ẃ" + | "ẅ" + | "ẇ" + | "ẉ" + | "ẋ" + | "ẍ" + | "ẏ" + | "ẑ" + | "ẓ" + | "ẕ" + | "ạ" + | "ả" + | "ấ" + | "ầ" + | "ẩ" + | "ẫ" + | "ậ" + | "ắ" + | "ằ" + | "ẳ" + | "ẵ" + | "ặ" + | "ẹ" + | "ẻ" + | "ẽ" + | "ế" + | "ề" + | "ể" + | "ễ" + | "ệ" + | "ỉ" + | "ị" + | "ọ" + | "ỏ" + | "ố" + | "ồ" + | "ổ" + | "ỗ" + | "ộ" + | "ớ" + | "ờ" + | "ở" + | "ỡ" + | "ợ" + | "ụ" + | "ủ" + | "ứ" + | "ừ" + | "ử" + | "ữ" + | "ự" + | "ỳ" + | "ỵ" + | "ỷ" + | "ỹ" + | "ἀ" + | "ἁ" + | "ἂ" + | "ἃ" + | "ἄ" + | "ἅ" + | "ἆ" + | "ἇ" + | "ἐ" + | "ἑ" + | "ἒ" + | "ἓ" + | "ἔ" + | "ἕ" + | "ἠ" + | "ἡ" + | "ἢ" + | "ἣ" + | "ἤ" + | "ἥ" + | "ἦ" + | "ἧ" + | "ἰ" + | "ἱ" + | "ἲ" + | "ἳ" + | "ἴ" + | "ἵ" + | "ἶ" + | "ἷ" + | "ὀ" + | "ὁ" + | "ὂ" + | "ὃ" + | "ὄ" + | "ὅ" + | "ὑ" + | "ὓ" + | "ὕ" + | "ὗ" + | "ὠ" + | "ὡ" + | "ὢ" + | "ὣ" + | "ὤ" + | "ὥ" + | "ὦ" + | "ὧ" + | "ᾀ" + | "ᾁ" + | "ᾂ" + | "ᾃ" + | "ᾄ" + | "ᾅ" + | "ᾆ" + | "ᾇ" + | "ᾐ" + | "ᾑ" + | "ᾒ" + | "ᾓ" + | "ᾔ" + | "ᾕ" + | "ᾖ" + | "ᾗ" + | "ᾠ" + | "ᾡ" + | "ᾢ" + | "ᾣ" + | "ᾤ" + | "ᾥ" + | "ᾦ" + | "ᾧ" + | "ᾰ" + | "ᾱ" + | "ὰ" 
+ | "ά" + | "ᾳ" + | "ὲ" + | "έ" + | "ὴ" + | "ή" + | "ῃ" + | "ῐ" + | "ῑ" + | "ὶ" + | "ί" + | "ῠ" + | "ῡ" + | "ὺ" + | "ύ" + | "ῥ" + | "ὸ" + | "ό" + | "ὼ" + | "ώ" + | "ῳ" + | "ⓐ" + | "ⓑ" + | "ⓒ" + | "ⓓ" + | "ⓔ" + | "ⓕ" + | "ⓖ" + | "ⓗ" + | "ⓘ" + | "ⓙ" + | "ⓚ" + | "ⓛ" + | "ⓜ" + | "ⓝ" + | "ⓞ" + | "ⓟ" + | "ⓠ" + | "ⓡ" + | "ⓢ" + | "ⓣ" + | "ⓤ" + | "ⓥ" + | "ⓦ" + | "ⓧ" + | "ⓨ" + | "ⓩ" + | "ⰰ" + | "ⰱ" + | "ⰲ" + | "ⰳ" + | "ⰴ" + | "ⰵ" + | "ⰶ" + | "ⰷ" + | "ⰸ" + | "ⰹ" + | "ⰺ" + | "ⰻ" + | "ⰼ" + | "ⰽ" + | "ⰾ" + | "ⰿ" + | "ⱀ" + | "ⱁ" + | "ⱂ" + | "ⱃ" + | "ⱄ" + | "ⱅ" + | "ⱆ" + | "ⱇ" + | "ⱈ" + | "ⱉ" + | "ⱊ" + | "ⱋ" + | "ⱌ" + | "ⱍ" + | "ⱎ" + | "ⱏ" + | "ⱐ" + | "ⱑ" + | "ⱒ" + | "ⱓ" + | "ⱔ" + | "ⱕ" + | "ⱖ" + | "ⱗ" + | "ⱘ" + | "ⱙ" + | "ⱚ" + | "ⱛ" + | "ⱜ" + | "ⱝ" + | "ⱞ" + | "ⲁ" + | "ⲃ" + | "ⲅ" + | "ⲇ" + | "ⲉ" + | "ⲋ" + | "ⲍ" + | "ⲏ" + | "ⲑ" + | "ⲓ" + | "ⲕ" + | "ⲗ" + | "ⲙ" + | "ⲛ" + | "ⲝ" + | "ⲟ" + | "ⲡ" + | "ⲣ" + | "ⲥ" + | "ⲧ" + | "ⲩ" + | "ⲫ" + | "ⲭ" + | "ⲯ" + | "ⲱ" + | "ⲳ" + | "ⲵ" + | "ⲷ" + | "ⲹ" + | "ⲻ" + | "ⲽ" + | "ⲿ" + | "ⳁ" + | "ⳃ" + | "ⳅ" + | "ⳇ" + | "ⳉ" + | "ⳋ" + | "ⳍ" + | "ⳏ" + | "ⳑ" + | "ⳓ" + | "ⳕ" + | "ⳗ" + | "ⳙ" + | "ⳛ" + | "ⳝ" + | "ⳟ" + | "ⳡ" + | "ⳣ" + | "a" + | "b" + | "c" + | "d" + | "e" + | "f" + | "g" + | "h" + | "i" + | "j" + | "k" + | "l" + | "m" + | "n" + | "o" + | "p" + | "q" + | "r" + | "s" + | "t" + | "u" + | "v" + | "w" + | "x" + | "y" + | "z" +; + +export toupper_deterministic = Determinize[ + ("a" : "A") + | ("b" : "B") + | ("c" : "C") + | ("d" : "D") + | ("e" : "E") + | ("f" : "F") + | ("g" : "G") + | ("h" : "H") + | ("i" : "I") + | ("j" : "J") + | ("k" : "K") + | ("l" : "L") + | ("m" : "M") + | ("n" : "N") + | ("o" : "O") + | ("p" : "P") + | ("q" : "Q") + | ("r" : "R") + | ("s" : "S") + | ("t" : "T") + | ("u" : "U") + | ("v" : "V") + | ("w" : "W") + | ("x" : "X") + | ("y" : "Y") + | ("z" : "Z") + | ("à" : "À") + | ("á" : "Á") + | ("â" : "Â") + | ("ã" : "Ã") + | ("ä" : "Ä") + | ("å" : "Å") + | ("æ" : "Æ") + | ("ç" : "Ç") + | ("è" : "È") + | ("é" : "É") + | ("ê" : "Ê") 
+ | ("ë" : "Ë") + | ("ì" : "Ì") + | ("í" : "Í") + | ("î" : "Î") + | ("ï" : "Ï") + | ("ð" : "Ð") + | ("ñ" : "Ñ") + | ("ò" : "Ò") + | ("ó" : "Ó") + | ("ô" : "Ô") + | ("õ" : "Õ") + | ("ö" : "Ö") + | ("ø" : "Ø") + | ("ù" : "Ù") + | ("ú" : "Ú") + | ("û" : "Û") + | ("ü" : "Ü") + | ("ý" : "Ý") + | ("þ" : "Þ") + | ("ā" : "Ā") + | ("ă" : "Ă") + | ("ą" : "Ą") + | ("ć" : "Ć") + | ("ĉ" : "Ĉ") + | ("ċ" : "Ċ") + | ("č" : "Č") + | ("ď" : "Ď") + | ("đ" : "Đ") + | ("ē" : "Ē") + | ("ĕ" : "Ĕ") + | ("ė" : "Ė") + | ("ę" : "Ę") + | ("ě" : "Ě") + | ("ĝ" : "Ĝ") + | ("ğ" : "Ğ") + | ("ġ" : "Ġ") + | ("ģ" : "Ģ") + | ("ĥ" : "Ĥ") + | ("ħ" : "Ħ") + | ("ĩ" : "Ĩ") + | ("ī" : "Ī") + | ("ĭ" : "Ĭ") + | ("į" : "Į") + | ("ij" : "IJ") + | ("ĵ" : "Ĵ") + | ("ķ" : "Ķ") + | ("ĺ" : "Ĺ") + | ("ļ" : "Ļ") + | ("ľ" : "Ľ") + | ("ŀ" : "Ŀ") + | ("ł" : "Ł") + | ("ń" : "Ń") + | ("ņ" : "Ņ") + | ("ň" : "Ň") + | ("ŋ" : "Ŋ") + | ("ō" : "Ō") + | ("ŏ" : "Ŏ") + | ("ő" : "Ő") + | ("œ" : "Œ") + | ("ŕ" : "Ŕ") + | ("ŗ" : "Ŗ") + | ("ř" : "Ř") + | ("ś" : "Ś") + | ("ŝ" : "Ŝ") + | ("ş" : "Ş") + | ("š" : "Š") + | ("ţ" : "Ţ") + | ("ť" : "Ť") + | ("ŧ" : "Ŧ") + | ("ũ" : "Ũ") + | ("ū" : "Ū") + | ("ŭ" : "Ŭ") + | ("ů" : "Ů") + | ("ű" : "Ű") + | ("ų" : "Ų") + | ("ŵ" : "Ŵ") + | ("ŷ" : "Ŷ") + | ("ÿ" : "Ÿ") + | ("ź" : "Ź") + | ("ż" : "Ż") + | ("ž" : "Ž") + | ("ɓ" : "Ɓ") + | ("ƃ" : "Ƃ") + | ("ƅ" : "Ƅ") + | ("ɔ" : "Ɔ") + | ("ƈ" : "Ƈ") + | ("ɖ" : "Ɖ") + | ("ɗ" : "Ɗ") + | ("ƌ" : "Ƌ") + | ("ǝ" : "Ǝ") + | ("ə" : "Ə") + | ("ɛ" : "Ɛ") + | ("ƒ" : "Ƒ") + | ("ɠ" : "Ɠ") + | ("ɣ" : "Ɣ") + | ("ɩ" : "Ɩ") + | ("ɨ" : "Ɨ") + | ("ƙ" : "Ƙ") + | ("ɯ" : "Ɯ") + | ("ɲ" : "Ɲ") + | ("ɵ" : "Ɵ") + | ("ơ" : "Ơ") + | ("ƣ" : "Ƣ") + | ("ƥ" : "Ƥ") + | ("ƨ" : "Ƨ") + | ("ʃ" : "Ʃ") + | ("ƭ" : "Ƭ") + | ("ʈ" : "Ʈ") + | ("ư" : "Ư") + | ("ʊ" : "Ʊ") + | ("ʋ" : "Ʋ") + | ("ƴ" : "Ƴ") + | ("ƶ" : "Ƶ") + | ("ʒ" : "Ʒ") + | ("ƹ" : "Ƹ") + | ("ƽ" : "Ƽ") + | ("dž" : "DŽ") + | ("lj" : "LJ") + | ("nj" : "NJ") + | ("ǎ" : "Ǎ") + | ("ǐ" : "Ǐ") + | ("ǒ" : "Ǒ") + | ("ǔ" : "Ǔ") + | ("ǖ" : "Ǖ") + | 
("ǘ" : "Ǘ") + | ("ǚ" : "Ǚ") + | ("ǜ" : "Ǜ") + | ("ǟ" : "Ǟ") + | ("ǡ" : "Ǡ") + | ("ǣ" : "Ǣ") + | ("ǥ" : "Ǥ") + | ("ǧ" : "Ǧ") + | ("ǩ" : "Ǩ") + | ("ǫ" : "Ǫ") + | ("ǭ" : "Ǭ") + | ("ǯ" : "Ǯ") + | ("dz" : "DZ") + | ("ǵ" : "Ǵ") + | ("ƕ" : "Ƕ") + | ("ƿ" : "Ƿ") + | ("ǹ" : "Ǹ") + | ("ǻ" : "Ǻ") + | ("ǽ" : "Ǽ") + | ("ǿ" : "Ǿ") + | ("ȁ" : "Ȁ") + | ("ȃ" : "Ȃ") + | ("ȅ" : "Ȅ") + | ("ȇ" : "Ȇ") + | ("ȉ" : "Ȉ") + | ("ȋ" : "Ȋ") + | ("ȍ" : "Ȍ") + | ("ȏ" : "Ȏ") + | ("ȑ" : "Ȑ") + | ("ȓ" : "Ȓ") + | ("ȕ" : "Ȕ") + | ("ȗ" : "Ȗ") + | ("ș" : "Ș") + | ("ț" : "Ț") + | ("ȝ" : "Ȝ") + | ("ȟ" : "Ȟ") + | ("ƞ" : "Ƞ") + | ("ȣ" : "Ȣ") + | ("ȥ" : "Ȥ") + | ("ȧ" : "Ȧ") + | ("ȩ" : "Ȩ") + | ("ȫ" : "Ȫ") + | ("ȭ" : "Ȭ") + | ("ȯ" : "Ȯ") + | ("ȱ" : "Ȱ") + | ("ȳ" : "Ȳ") + | ("ȼ" : "Ȼ") + | ("ƚ" : "Ƚ") + | ("ʔ" : "Ɂ") + | ("ά" : "Ά") + | ("έ" : "Έ") + | ("ή" : "Ή") + | ("ί" : "Ί") + | ("ό" : "Ό") + | ("ύ" : "Ύ") + | ("ώ" : "Ώ") + | ("α" : "Α") + | ("β" : "Β") + | ("γ" : "Γ") + | ("δ" : "Δ") + | ("ε" : "Ε") + | ("ζ" : "Ζ") + | ("η" : "Η") + | ("θ" : "Θ") + | ("ι" : "Ι") + | ("κ" : "Κ") + | ("λ" : "Λ") + | ("μ" : "Μ") + | ("ν" : "Ν") + | ("ξ" : "Ξ") + | ("ο" : "Ο") + | ("π" : "Π") + | ("ρ" : "Ρ") + | ("σ" : "Σ") + | ("τ" : "Τ") + | ("υ" : "Υ") + | ("φ" : "Φ") + | ("χ" : "Χ") + | ("ψ" : "Ψ") + | ("ω" : "Ω") + | ("ϊ" : "Ϊ") + | ("ϋ" : "Ϋ") + | ("ϣ" : "Ϣ") + | ("ϥ" : "Ϥ") + | ("ϧ" : "Ϧ") + | ("ϩ" : "Ϩ") + | ("ϫ" : "Ϫ") + | ("ϭ" : "Ϭ") + | ("ϯ" : "Ϯ") + | ("ϸ" : "Ϸ") + | ("ϲ" : "Ϲ") + | ("ϻ" : "Ϻ") + | ("ѐ" : "Ѐ") + | ("ё" : "Ё") + | ("ђ" : "Ђ") + | ("ѓ" : "Ѓ") + | ("є" : "Є") + | ("ѕ" : "Ѕ") + | ("і" : "І") + | ("ї" : "Ї") + | ("ј" : "Ј") + | ("љ" : "Љ") + | ("њ" : "Њ") + | ("ћ" : "Ћ") + | ("ќ" : "Ќ") + | ("ѝ" : "Ѝ") + | ("ў" : "Ў") + | ("џ" : "Џ") + | ("а" : "А") + | ("б" : "Б") + | ("в" : "В") + | ("г" : "Г") + | ("д" : "Д") + | ("е" : "Е") + | ("ж" : "Ж") + | ("з" : "З") + | ("и" : "И") + | ("й" : "Й") + | ("к" : "К") + | ("л" : "Л") + | ("м" : "М") + | ("н" : "Н") + | ("о" : "О") + | ("п" : "П") + | ("р" : "Р") + 
| ("с" : "С") + | ("т" : "Т") + | ("у" : "У") + | ("ф" : "Ф") + | ("х" : "Х") + | ("ц" : "Ц") + | ("ч" : "Ч") + | ("ш" : "Ш") + | ("щ" : "Щ") + | ("ъ" : "Ъ") + | ("ы" : "Ы") + | ("ь" : "Ь") + | ("э" : "Э") + | ("ю" : "Ю") + | ("я" : "Я") + | ("ѡ" : "Ѡ") + | ("ѣ" : "Ѣ") + | ("ѥ" : "Ѥ") + | ("ѧ" : "Ѧ") + | ("ѩ" : "Ѩ") + | ("ѫ" : "Ѫ") + | ("ѭ" : "Ѭ") + | ("ѯ" : "Ѯ") + | ("ѱ" : "Ѱ") + | ("ѳ" : "Ѳ") + | ("ѵ" : "Ѵ") + | ("ѷ" : "Ѷ") + | ("ѹ" : "Ѹ") + | ("ѻ" : "Ѻ") + | ("ѽ" : "Ѽ") + | ("ѿ" : "Ѿ") + | ("ҁ" : "Ҁ") + | ("ҋ" : "Ҋ") + | ("ҍ" : "Ҍ") + | ("ҏ" : "Ҏ") + | ("ґ" : "Ґ") + | ("ғ" : "Ғ") + | ("ҕ" : "Ҕ") + | ("җ" : "Җ") + | ("ҙ" : "Ҙ") + | ("қ" : "Қ") + | ("ҝ" : "Ҝ") + | ("ҟ" : "Ҟ") + | ("ҡ" : "Ҡ") + | ("ң" : "Ң") + | ("ҥ" : "Ҥ") + | ("ҧ" : "Ҧ") + | ("ҩ" : "Ҩ") + | ("ҫ" : "Ҫ") + | ("ҭ" : "Ҭ") + | ("ү" : "Ү") + | ("ұ" : "Ұ") + | ("ҳ" : "Ҳ") + | ("ҵ" : "Ҵ") + | ("ҷ" : "Ҷ") + | ("ҹ" : "Ҹ") + | ("һ" : "Һ") + | ("ҽ" : "Ҽ") + | ("ҿ" : "Ҿ") + | ("ӂ" : "Ӂ") + | ("ӄ" : "Ӄ") + | ("ӆ" : "Ӆ") + | ("ӈ" : "Ӈ") + | ("ӊ" : "Ӊ") + | ("ӌ" : "Ӌ") + | ("ӎ" : "Ӎ") + | ("ӑ" : "Ӑ") + | ("ӓ" : "Ӓ") + | ("ӕ" : "Ӕ") + | ("ӗ" : "Ӗ") + | ("ә" : "Ә") + | ("ӛ" : "Ӛ") + | ("ӝ" : "Ӝ") + | ("ӟ" : "Ӟ") + | ("ӡ" : "Ӡ") + | ("ӣ" : "Ӣ") + | ("ӥ" : "Ӥ") + | ("ӧ" : "Ӧ") + | ("ө" : "Ө") + | ("ӫ" : "Ӫ") + | ("ӭ" : "Ӭ") + | ("ӯ" : "Ӯ") + | ("ӱ" : "Ӱ") + | ("ӳ" : "Ӳ") + | ("ӵ" : "Ӵ") + | ("ӷ" : "Ӷ") + | ("ӹ" : "Ӹ") + | ("ԁ" : "Ԁ") + | ("ԃ" : "Ԃ") + | ("ԅ" : "Ԅ") + | ("ԇ" : "Ԇ") + | ("ԉ" : "Ԉ") + | ("ԋ" : "Ԋ") + | ("ԍ" : "Ԍ") + | ("ԏ" : "Ԏ") + | ("ա" : "Ա") + | ("բ" : "Բ") + | ("գ" : "Գ") + | ("դ" : "Դ") + | ("ե" : "Ե") + | ("զ" : "Զ") + | ("է" : "Է") + | ("ը" : "Ը") + | ("թ" : "Թ") + | ("ժ" : "Ժ") + | ("ի" : "Ի") + | ("լ" : "Լ") + | ("խ" : "Խ") + | ("ծ" : "Ծ") + | ("կ" : "Կ") + | ("հ" : "Հ") + | ("ձ" : "Ձ") + | ("ղ" : "Ղ") + | ("ճ" : "Ճ") + | ("մ" : "Մ") + | ("յ" : "Յ") + | ("ն" : "Ն") + | ("շ" : "Շ") + | ("ո" : "Ո") + | ("չ" : "Չ") + | ("պ" : "Պ") + | ("ջ" : "Ջ") + | ("ռ" : "Ռ") + | ("ս" : "Ս") + | ("վ" : "Վ") + 
| ("տ" : "Տ") + | ("ր" : "Ր") + | ("ց" : "Ց") + | ("ւ" : "Ւ") + | ("փ" : "Փ") + | ("ք" : "Ք") + | ("օ" : "Օ") + | ("ֆ" : "Ֆ") + | ("ⴀ" : "Ⴀ") + | ("ⴁ" : "Ⴁ") + | ("ⴂ" : "Ⴂ") + | ("ⴃ" : "Ⴃ") + | ("ⴄ" : "Ⴄ") + | ("ⴅ" : "Ⴅ") + | ("ⴆ" : "Ⴆ") + | ("ⴇ" : "Ⴇ") + | ("ⴈ" : "Ⴈ") + | ("ⴉ" : "Ⴉ") + | ("ⴊ" : "Ⴊ") + | ("ⴋ" : "Ⴋ") + | ("ⴌ" : "Ⴌ") + | ("ⴍ" : "Ⴍ") + | ("ⴎ" : "Ⴎ") + | ("ⴏ" : "Ⴏ") + | ("ⴐ" : "Ⴐ") + | ("ⴑ" : "Ⴑ") + | ("ⴒ" : "Ⴒ") + | ("ⴓ" : "Ⴓ") + | ("ⴔ" : "Ⴔ") + | ("ⴕ" : "Ⴕ") + | ("ⴖ" : "Ⴖ") + | ("ⴗ" : "Ⴗ") + | ("ⴘ" : "Ⴘ") + | ("ⴙ" : "Ⴙ") + | ("ⴚ" : "Ⴚ") + | ("ⴛ" : "Ⴛ") + | ("ⴜ" : "Ⴜ") + | ("ⴝ" : "Ⴝ") + | ("ⴞ" : "Ⴞ") + | ("ⴟ" : "Ⴟ") + | ("ⴠ" : "Ⴠ") + | ("ⴡ" : "Ⴡ") + | ("ⴢ" : "Ⴢ") + | ("ⴣ" : "Ⴣ") + | ("ⴤ" : "Ⴤ") + | ("ⴥ" : "Ⴥ") + | ("ḁ" : "Ḁ") + | ("ḃ" : "Ḃ") + | ("ḅ" : "Ḅ") + | ("ḇ" : "Ḇ") + | ("ḉ" : "Ḉ") + | ("ḋ" : "Ḋ") + | ("ḍ" : "Ḍ") + | ("ḏ" : "Ḏ") + | ("ḑ" : "Ḑ") + | ("ḓ" : "Ḓ") + | ("ḕ" : "Ḕ") + | ("ḗ" : "Ḗ") + | ("ḙ" : "Ḙ") + | ("ḛ" : "Ḛ") + | ("ḝ" : "Ḝ") + | ("ḟ" : "Ḟ") + | ("ḡ" : "Ḡ") + | ("ḣ" : "Ḣ") + | ("ḥ" : "Ḥ") + | ("ḧ" : "Ḧ") + | ("ḩ" : "Ḩ") + | ("ḫ" : "Ḫ") + | ("ḭ" : "Ḭ") + | ("ḯ" : "Ḯ") + | ("ḱ" : "Ḱ") + | ("ḳ" : "Ḳ") + | ("ḵ" : "Ḵ") + | ("ḷ" : "Ḷ") + | ("ḹ" : "Ḹ") + | ("ḻ" : "Ḻ") + | ("ḽ" : "Ḽ") + | ("ḿ" : "Ḿ") + | ("ṁ" : "Ṁ") + | ("ṃ" : "Ṃ") + | ("ṅ" : "Ṅ") + | ("ṇ" : "Ṇ") + | ("ṉ" : "Ṉ") + | ("ṋ" : "Ṋ") + | ("ṍ" : "Ṍ") + | ("ṏ" : "Ṏ") + | ("ṑ" : "Ṑ") + | ("ṓ" : "Ṓ") + | ("ṕ" : "Ṕ") + | ("ṗ" : "Ṗ") + | ("ṙ" : "Ṙ") + | ("ṛ" : "Ṛ") + | ("ṝ" : "Ṝ") + | ("ṟ" : "Ṟ") + | ("ṡ" : "Ṡ") + | ("ṣ" : "Ṣ") + | ("ṥ" : "Ṥ") + | ("ṧ" : "Ṧ") + | ("ṩ" : "Ṩ") + | ("ṫ" : "Ṫ") + | ("ṭ" : "Ṭ") + | ("ṯ" : "Ṯ") + | ("ṱ" : "Ṱ") + | ("ṳ" : "Ṳ") + | ("ṵ" : "Ṵ") + | ("ṷ" : "Ṷ") + | ("ṹ" : "Ṹ") + | ("ṻ" : "Ṻ") + | ("ṽ" : "Ṽ") + | ("ṿ" : "Ṿ") + | ("ẁ" : "Ẁ") + | ("ẃ" : "Ẃ") + | ("ẅ" : "Ẅ") + | ("ẇ" : "Ẇ") + | ("ẉ" : "Ẉ") + | ("ẋ" : "Ẋ") + | ("ẍ" : "Ẍ") + | ("ẏ" : "Ẏ") + | ("ẑ" : "Ẑ") + | ("ẓ" : "Ẓ") + | ("ẕ" : "Ẕ") + | ("ạ" : "Ạ") + | ("ả" : "Ả") + | ("ấ" : "Ấ") + | ("ầ" : "Ầ") + 
| ("ẩ" : "Ẩ") + | ("ẫ" : "Ẫ") + | ("ậ" : "Ậ") + | ("ắ" : "Ắ") + | ("ằ" : "Ằ") + | ("ẳ" : "Ẳ") + | ("ẵ" : "Ẵ") + | ("ặ" : "Ặ") + | ("ẹ" : "Ẹ") + | ("ẻ" : "Ẻ") + | ("ẽ" : "Ẽ") + | ("ế" : "Ế") + | ("ề" : "Ề") + | ("ể" : "Ể") + | ("ễ" : "Ễ") + | ("ệ" : "Ệ") + | ("ỉ" : "Ỉ") + | ("ị" : "Ị") + | ("ọ" : "Ọ") + | ("ỏ" : "Ỏ") + | ("ố" : "Ố") + | ("ồ" : "Ồ") + | ("ổ" : "Ổ") + | ("ỗ" : "Ỗ") + | ("ộ" : "Ộ") + | ("ớ" : "Ớ") + | ("ờ" : "Ờ") + | ("ở" : "Ở") + | ("ỡ" : "Ỡ") + | ("ợ" : "Ợ") + | ("ụ" : "Ụ") + | ("ủ" : "Ủ") + | ("ứ" : "Ứ") + | ("ừ" : "Ừ") + | ("ử" : "Ử") + | ("ữ" : "Ữ") + | ("ự" : "Ự") + | ("ỳ" : "Ỳ") + | ("ỵ" : "Ỵ") + | ("ỷ" : "Ỷ") + | ("ỹ" : "Ỹ") + | ("ἀ" : "Ἀ") + | ("ἁ" : "Ἁ") + | ("ἂ" : "Ἂ") + | ("ἃ" : "Ἃ") + | ("ἄ" : "Ἄ") + | ("ἅ" : "Ἅ") + | ("ἆ" : "Ἆ") + | ("ἇ" : "Ἇ") + | ("ἐ" : "Ἐ") + | ("ἑ" : "Ἑ") + | ("ἒ" : "Ἒ") + | ("ἓ" : "Ἓ") + | ("ἔ" : "Ἔ") + | ("ἕ" : "Ἕ") + | ("ἠ" : "Ἠ") + | ("ἡ" : "Ἡ") + | ("ἢ" : "Ἢ") + | ("ἣ" : "Ἣ") + | ("ἤ" : "Ἤ") + | ("ἥ" : "Ἥ") + | ("ἦ" : "Ἦ") + | ("ἧ" : "Ἧ") + | ("ἰ" : "Ἰ") + | ("ἱ" : "Ἱ") + | ("ἲ" : "Ἲ") + | ("ἳ" : "Ἳ") + | ("ἴ" : "Ἴ") + | ("ἵ" : "Ἵ") + | ("ἶ" : "Ἶ") + | ("ἷ" : "Ἷ") + | ("ὀ" : "Ὀ") + | ("ὁ" : "Ὁ") + | ("ὂ" : "Ὂ") + | ("ὃ" : "Ὃ") + | ("ὄ" : "Ὄ") + | ("ὅ" : "Ὅ") + | ("ὑ" : "Ὑ") + | ("ὓ" : "Ὓ") + | ("ὕ" : "Ὕ") + | ("ὗ" : "Ὗ") + | ("ὠ" : "Ὠ") + | ("ὡ" : "Ὡ") + | ("ὢ" : "Ὢ") + | ("ὣ" : "Ὣ") + | ("ὤ" : "Ὤ") + | ("ὥ" : "Ὥ") + | ("ὦ" : "Ὦ") + | ("ὧ" : "Ὧ") + | ("ᾀ" : "ᾈ") + | ("ᾁ" : "ᾉ") + | ("ᾂ" : "ᾊ") + | ("ᾃ" : "ᾋ") + | ("ᾄ" : "ᾌ") + | ("ᾅ" : "ᾍ") + | ("ᾆ" : "ᾎ") + | ("ᾇ" : "ᾏ") + | ("ᾐ" : "ᾘ") + | ("ᾑ" : "ᾙ") + | ("ᾒ" : "ᾚ") + | ("ᾓ" : "ᾛ") + | ("ᾔ" : "ᾜ") + | ("ᾕ" : "ᾝ") + | ("ᾖ" : "ᾞ") + | ("ᾗ" : "ᾟ") + | ("ᾠ" : "ᾨ") + | ("ᾡ" : "ᾩ") + | ("ᾢ" : "ᾪ") + | ("ᾣ" : "ᾫ") + | ("ᾤ" : "ᾬ") + | ("ᾥ" : "ᾭ") + | ("ᾦ" : "ᾮ") + | ("ᾧ" : "ᾯ") + | ("ᾰ" : "Ᾰ") + | ("ᾱ" : "Ᾱ") + | ("ὰ" : "Ὰ") + | ("ά" : "Ά") + | ("ᾳ" : "ᾼ") + | ("ὲ" : "Ὲ") + | ("έ" : "Έ") + | ("ὴ" : "Ὴ") + | ("ή" : "Ή") + | ("ῃ" : "ῌ") + | ("ῐ" : "Ῐ") + | ("ῑ" : "Ῑ") + 
| ("ὶ" : "Ὶ") + | ("ί" : "Ί") + | ("ῠ" : "Ῠ") + | ("ῡ" : "Ῡ") + | ("ὺ" : "Ὺ") + | ("ύ" : "Ύ") + | ("ῥ" : "Ῥ") + | ("ὸ" : "Ὸ") + | ("ό" : "Ό") + | ("ὼ" : "Ὼ") + | ("ώ" : "Ώ") + | ("ῳ" : "ῼ") + | ("ⓐ" : "Ⓐ") + | ("ⓑ" : "Ⓑ") + | ("ⓒ" : "Ⓒ") + | ("ⓓ" : "Ⓓ") + | ("ⓔ" : "Ⓔ") + | ("ⓕ" : "Ⓕ") + | ("ⓖ" : "Ⓖ") + | ("ⓗ" : "Ⓗ") + | ("ⓘ" : "Ⓘ") + | ("ⓙ" : "Ⓙ") + | ("ⓚ" : "Ⓚ") + | ("ⓛ" : "Ⓛ") + | ("ⓜ" : "Ⓜ") + | ("ⓝ" : "Ⓝ") + | ("ⓞ" : "Ⓞ") + | ("ⓟ" : "Ⓟ") + | ("ⓠ" : "Ⓠ") + | ("ⓡ" : "Ⓡ") + | ("ⓢ" : "Ⓢ") + | ("ⓣ" : "Ⓣ") + | ("ⓤ" : "Ⓤ") + | ("ⓥ" : "Ⓥ") + | ("ⓦ" : "Ⓦ") + | ("ⓧ" : "Ⓧ") + | ("ⓨ" : "Ⓨ") + | ("ⓩ" : "Ⓩ") + | ("ⰰ" : "Ⰰ") + | ("ⰱ" : "Ⰱ") + | ("ⰲ" : "Ⰲ") + | ("ⰳ" : "Ⰳ") + | ("ⰴ" : "Ⰴ") + | ("ⰵ" : "Ⰵ") + | ("ⰶ" : "Ⰶ") + | ("ⰷ" : "Ⰷ") + | ("ⰸ" : "Ⰸ") + | ("ⰹ" : "Ⰹ") + | ("ⰺ" : "Ⰺ") + | ("ⰻ" : "Ⰻ") + | ("ⰼ" : "Ⰼ") + | ("ⰽ" : "Ⰽ") + | ("ⰾ" : "Ⰾ") + | ("ⰿ" : "Ⰿ") + | ("ⱀ" : "Ⱀ") + | ("ⱁ" : "Ⱁ") + | ("ⱂ" : "Ⱂ") + | ("ⱃ" : "Ⱃ") + | ("ⱄ" : "Ⱄ") + | ("ⱅ" : "Ⱅ") + | ("ⱆ" : "Ⱆ") + | ("ⱇ" : "Ⱇ") + | ("ⱈ" : "Ⱈ") + | ("ⱉ" : "Ⱉ") + | ("ⱊ" : "Ⱊ") + | ("ⱋ" : "Ⱋ") + | ("ⱌ" : "Ⱌ") + | ("ⱍ" : "Ⱍ") + | ("ⱎ" : "Ⱎ") + | ("ⱏ" : "Ⱏ") + | ("ⱐ" : "Ⱐ") + | ("ⱑ" : "Ⱑ") + | ("ⱒ" : "Ⱒ") + | ("ⱓ" : "Ⱓ") + | ("ⱔ" : "Ⱔ") + | ("ⱕ" : "Ⱕ") + | ("ⱖ" : "Ⱖ") + | ("ⱗ" : "Ⱗ") + | ("ⱘ" : "Ⱘ") + | ("ⱙ" : "Ⱙ") + | ("ⱚ" : "Ⱚ") + | ("ⱛ" : "Ⱛ") + | ("ⱜ" : "Ⱜ") + | ("ⱝ" : "Ⱝ") + | ("ⱞ" : "Ⱞ") + | ("ⲁ" : "Ⲁ") + | ("ⲃ" : "Ⲃ") + | ("ⲅ" : "Ⲅ") + | ("ⲇ" : "Ⲇ") + | ("ⲉ" : "Ⲉ") + | ("ⲋ" : "Ⲋ") + | ("ⲍ" : "Ⲍ") + | ("ⲏ" : "Ⲏ") + | ("ⲑ" : "Ⲑ") + | ("ⲓ" : "Ⲓ") + | ("ⲕ" : "Ⲕ") + | ("ⲗ" : "Ⲗ") + | ("ⲙ" : "Ⲙ") + | ("ⲛ" : "Ⲛ") + | ("ⲝ" : "Ⲝ") + | ("ⲟ" : "Ⲟ") + | ("ⲡ" : "Ⲡ") + | ("ⲣ" : "Ⲣ") + | ("ⲥ" : "Ⲥ") + | ("ⲧ" : "Ⲧ") + | ("ⲩ" : "Ⲩ") + | ("ⲫ" : "Ⲫ") + | ("ⲭ" : "Ⲭ") + | ("ⲯ" : "Ⲯ") + | ("ⲱ" : "Ⲱ") + | ("ⲳ" : "Ⲳ") + | ("ⲵ" : "Ⲵ") + | ("ⲷ" : "Ⲷ") + | ("ⲹ" : "Ⲹ") + | ("ⲻ" : "Ⲻ") + | ("ⲽ" : "Ⲽ") + | ("ⲿ" : "Ⲿ") + | ("ⳁ" : "Ⳁ") + | ("ⳃ" : "Ⳃ") + | ("ⳅ" : "Ⳅ") + | ("ⳇ" : "Ⳇ") + | ("ⳉ" : "Ⳉ") + | ("ⳋ" : "Ⳋ") + | ("ⳍ" : "Ⳍ") + | ("ⳏ" : "Ⳏ") + 
| ("ⳑ" : "Ⳑ") + | ("ⳓ" : "Ⳓ") + | ("ⳕ" : "Ⳕ") + | ("ⳗ" : "Ⳗ") + | ("ⳙ" : "Ⳙ") + | ("ⳛ" : "Ⳛ") + | ("ⳝ" : "Ⳝ") + | ("ⳟ" : "Ⳟ") + | ("ⳡ" : "Ⳡ") + | ("ⳣ" : "Ⳣ") + | ("a" : "A") + | ("b" : "B") + | ("c" : "C") + | ("d" : "D") + | ("e" : "E") + | ("f" : "F") + | ("g" : "G") + | ("h" : "H") + | ("i" : "I") + | ("j" : "J") + | ("k" : "K") + | ("l" : "L") + | ("m" : "M") + | ("n" : "N") + | ("o" : "O") + | ("p" : "P") + | ("q" : "Q") + | ("r" : "R") + | ("s" : "S") + | ("t" : "T") + | ("u" : "U") + | ("v" : "V") + | ("w" : "W") + | ("x" : "X") + | ("y" : "Y") + | ("z" : "Z") +]; + +export toupper = toupper_deterministic + | ("i" : "İ") + | ("dž" : "Dž") + | ("lj" : "Lj") + | ("nj" : "Nj") + | ("dz" : "Dz") + | ("θ" : "ϴ"); + +export tolower = + ("A" : "a") + | ("B" : "b") + | ("C" : "c") + | ("D" : "d") + | ("E" : "e") + | ("F" : "f") + | ("G" : "g") + | ("H" : "h") + | ("I" : "i") + | ("J" : "j") + | ("K" : "k") + | ("L" : "l") + | ("M" : "m") + | ("N" : "n") + | ("O" : "o") + | ("P" : "p") + | ("Q" : "q") + | ("R" : "r") + | ("S" : "s") + | ("T" : "t") + | ("U" : "u") + | ("V" : "v") + | ("W" : "w") + | ("X" : "x") + | ("Y" : "y") + | ("Z" : "z") + | ("À" : "à") + | ("Á" : "á") + | ("Â" : "â") + | ("Ã" : "ã") + | ("Ä" : "ä") + | ("Å" : "å") + | ("Æ" : "æ") + | ("Ç" : "ç") + | ("È" : "è") + | ("É" : "é") + | ("Ê" : "ê") + | ("Ë" : "ë") + | ("Ì" : "ì") + | ("Í" : "í") + | ("Î" : "î") + | ("Ï" : "ï") + | ("Ð" : "ð") + | ("Ñ" : "ñ") + | ("Ò" : "ò") + | ("Ó" : "ó") + | ("Ô" : "ô") + | ("Õ" : "õ") + | ("Ö" : "ö") + | ("Ø" : "ø") + | ("Ù" : "ù") + | ("Ú" : "ú") + | ("Û" : "û") + | ("Ü" : "ü") + | ("Ý" : "ý") + | ("Þ" : "þ") + | ("Ā" : "ā") + | ("Ă" : "ă") + | ("Ą" : "ą") + | ("Ć" : "ć") + | ("Ĉ" : "ĉ") + | ("Ċ" : "ċ") + | ("Č" : "č") + | ("Ď" : "ď") + | ("Đ" : "đ") + | ("Ē" : "ē") + | ("Ĕ" : "ĕ") + | ("Ė" : "ė") + | ("Ę" : "ę") + | ("Ě" : "ě") + | ("Ĝ" : "ĝ") + | ("Ğ" : "ğ") + | ("Ġ" : "ġ") + | ("Ģ" : "ģ") + | ("Ĥ" : "ĥ") + | ("Ħ" : "ħ") + | ("Ĩ" : "ĩ") + | ("Ī" : "ī") + | ("Ĭ" 
: "ĭ") + | ("Į" : "į") + | ("İ" : "i") + | ("IJ" : "ij") + | ("Ĵ" : "ĵ") + | ("Ķ" : "ķ") + | ("Ĺ" : "ĺ") + | ("Ļ" : "ļ") + | ("Ľ" : "ľ") + | ("Ŀ" : "ŀ") + | ("Ł" : "ł") + | ("Ń" : "ń") + | ("Ņ" : "ņ") + | ("Ň" : "ň") + | ("Ŋ" : "ŋ") + | ("Ō" : "ō") + | ("Ŏ" : "ŏ") + | ("Ő" : "ő") + | ("Œ" : "œ") + | ("Ŕ" : "ŕ") + | ("Ŗ" : "ŗ") + | ("Ř" : "ř") + | ("Ś" : "ś") + | ("Ŝ" : "ŝ") + | ("Ş" : "ş") + | ("Š" : "š") + | ("Ţ" : "ţ") + | ("Ť" : "ť") + | ("Ŧ" : "ŧ") + | ("Ũ" : "ũ") + | ("Ū" : "ū") + | ("Ŭ" : "ŭ") + | ("Ů" : "ů") + | ("Ű" : "ű") + | ("Ų" : "ų") + | ("Ŵ" : "ŵ") + | ("Ŷ" : "ŷ") + | ("Ÿ" : "ÿ") + | ("Ź" : "ź") + | ("Ż" : "ż") + | ("Ž" : "ž") + | ("Ɓ" : "ɓ") + | ("Ƃ" : "ƃ") + | ("Ƅ" : "ƅ") + | ("Ɔ" : "ɔ") + | ("Ƈ" : "ƈ") + | ("Ɖ" : "ɖ") + | ("Ɗ" : "ɗ") + | ("Ƌ" : "ƌ") + | ("Ǝ" : "ǝ") + | ("Ə" : "ə") + | ("Ɛ" : "ɛ") + | ("Ƒ" : "ƒ") + | ("Ɠ" : "ɠ") + | ("Ɣ" : "ɣ") + | ("Ɩ" : "ɩ") + | ("Ɨ" : "ɨ") + | ("Ƙ" : "ƙ") + | ("Ɯ" : "ɯ") + | ("Ɲ" : "ɲ") + | ("Ɵ" : "ɵ") + | ("Ơ" : "ơ") + | ("Ƣ" : "ƣ") + | ("Ƥ" : "ƥ") + | ("Ƨ" : "ƨ") + | ("Ʃ" : "ʃ") + | ("Ƭ" : "ƭ") + | ("Ʈ" : "ʈ") + | ("Ư" : "ư") + | ("Ʊ" : "ʊ") + | ("Ʋ" : "ʋ") + | ("Ƴ" : "ƴ") + | ("Ƶ" : "ƶ") + | ("Ʒ" : "ʒ") + | ("Ƹ" : "ƹ") + | ("Ƽ" : "ƽ") + | ("DŽ" : "dž") + | ("Dž" : "dž") + | ("LJ" : "lj") + | ("Lj" : "lj") + | ("NJ" : "nj") + | ("Nj" : "nj") + | ("Ǎ" : "ǎ") + | ("Ǐ" : "ǐ") + | ("Ǒ" : "ǒ") + | ("Ǔ" : "ǔ") + | ("Ǖ" : "ǖ") + | ("Ǘ" : "ǘ") + | ("Ǚ" : "ǚ") + | ("Ǜ" : "ǜ") + | ("Ǟ" : "ǟ") + | ("Ǡ" : "ǡ") + | ("Ǣ" : "ǣ") + | ("Ǥ" : "ǥ") + | ("Ǧ" : "ǧ") + | ("Ǩ" : "ǩ") + | ("Ǫ" : "ǫ") + | ("Ǭ" : "ǭ") + | ("Ǯ" : "ǯ") + | ("DZ" : "dz") + | ("Dz" : "dz") + | ("Ǵ" : "ǵ") + | ("Ƕ" : "ƕ") + | ("Ƿ" : "ƿ") + | ("Ǹ" : "ǹ") + | ("Ǻ" : "ǻ") + | ("Ǽ" : "ǽ") + | ("Ǿ" : "ǿ") + | ("Ȁ" : "ȁ") + | ("Ȃ" : "ȃ") + | ("Ȅ" : "ȅ") + | ("Ȇ" : "ȇ") + | ("Ȉ" : "ȉ") + | ("Ȋ" : "ȋ") + | ("Ȍ" : "ȍ") + | ("Ȏ" : "ȏ") + | ("Ȑ" : "ȑ") + | ("Ȓ" : "ȓ") + | ("Ȕ" : "ȕ") + | ("Ȗ" : "ȗ") + | ("Ș" : "ș") + | ("Ț" : "ț") + | ("Ȝ" : "ȝ") + | ("Ȟ" : "ȟ") + | 
("Ƞ" : "ƞ") + | ("Ȣ" : "ȣ") + | ("Ȥ" : "ȥ") + | ("Ȧ" : "ȧ") + | ("Ȩ" : "ȩ") + | ("Ȫ" : "ȫ") + | ("Ȭ" : "ȭ") + | ("Ȯ" : "ȯ") + | ("Ȱ" : "ȱ") + | ("Ȳ" : "ȳ") + | ("Ȼ" : "ȼ") + | ("Ƚ" : "ƚ") + | ("Ɂ" : "ʔ") + | ("Ά" : "ά") + | ("Έ" : "έ") + | ("Ή" : "ή") + | ("Ί" : "ί") + | ("Ό" : "ό") + | ("Ύ" : "ύ") + | ("Ώ" : "ώ") + | ("Α" : "α") + | ("Β" : "β") + | ("Γ" : "γ") + | ("Δ" : "δ") + | ("Ε" : "ε") + | ("Ζ" : "ζ") + | ("Η" : "η") + | ("Θ" : "θ") + | ("Ι" : "ι") + | ("Κ" : "κ") + | ("Λ" : "λ") + | ("Μ" : "μ") + | ("Ν" : "ν") + | ("Ξ" : "ξ") + | ("Ο" : "ο") + | ("Π" : "π") + | ("Ρ" : "ρ") + | ("Σ" : "σ") + | ("Τ" : "τ") + | ("Υ" : "υ") + | ("Φ" : "φ") + | ("Χ" : "χ") + | ("Ψ" : "ψ") + | ("Ω" : "ω") + | ("Ϊ" : "ϊ") + | ("Ϋ" : "ϋ") + | ("Ϣ" : "ϣ") + | ("Ϥ" : "ϥ") + | ("Ϧ" : "ϧ") + | ("Ϩ" : "ϩ") + | ("Ϫ" : "ϫ") + | ("Ϭ" : "ϭ") + | ("Ϯ" : "ϯ") + | ("ϴ" : "θ") + | ("Ϸ" : "ϸ") + | ("Ϲ" : "ϲ") + | ("Ϻ" : "ϻ") + | ("Ѐ" : "ѐ") + | ("Ё" : "ё") + | ("Ђ" : "ђ") + | ("Ѓ" : "ѓ") + | ("Є" : "є") + | ("Ѕ" : "ѕ") + | ("І" : "і") + | ("Ї" : "ї") + | ("Ј" : "ј") + | ("Љ" : "љ") + | ("Њ" : "њ") + | ("Ћ" : "ћ") + | ("Ќ" : "ќ") + | ("Ѝ" : "ѝ") + | ("Ў" : "ў") + | ("Џ" : "џ") + | ("А" : "а") + | ("Б" : "б") + | ("В" : "в") + | ("Г" : "г") + | ("Д" : "д") + | ("Е" : "е") + | ("Ж" : "ж") + | ("З" : "з") + | ("И" : "и") + | ("Й" : "й") + | ("К" : "к") + | ("Л" : "л") + | ("М" : "м") + | ("Н" : "н") + | ("О" : "о") + | ("П" : "п") + | ("Р" : "р") + | ("С" : "с") + | ("Т" : "т") + | ("У" : "у") + | ("Ф" : "ф") + | ("Х" : "х") + | ("Ц" : "ц") + | ("Ч" : "ч") + | ("Ш" : "ш") + | ("Щ" : "щ") + | ("Ъ" : "ъ") + | ("Ы" : "ы") + | ("Ь" : "ь") + | ("Э" : "э") + | ("Ю" : "ю") + | ("Я" : "я") + | ("Ѡ" : "ѡ") + | ("Ѣ" : "ѣ") + | ("Ѥ" : "ѥ") + | ("Ѧ" : "ѧ") + | ("Ѩ" : "ѩ") + | ("Ѫ" : "ѫ") + | ("Ѭ" : "ѭ") + | ("Ѯ" : "ѯ") + | ("Ѱ" : "ѱ") + | ("Ѳ" : "ѳ") + | ("Ѵ" : "ѵ") + | ("Ѷ" : "ѷ") + | ("Ѹ" : "ѹ") + | ("Ѻ" : "ѻ") + | ("Ѽ" : "ѽ") + | ("Ѿ" : "ѿ") + | ("Ҁ" : "ҁ") + | ("Ҋ" : "ҋ") + | ("Ҍ" : "ҍ") + | ("Ҏ" : "ҏ") + | 
("Ґ" : "ґ") + | ("Ғ" : "ғ") + | ("Ҕ" : "ҕ") + | ("Җ" : "җ") + | ("Ҙ" : "ҙ") + | ("Қ" : "қ") + | ("Ҝ" : "ҝ") + | ("Ҟ" : "ҟ") + | ("Ҡ" : "ҡ") + | ("Ң" : "ң") + | ("Ҥ" : "ҥ") + | ("Ҧ" : "ҧ") + | ("Ҩ" : "ҩ") + | ("Ҫ" : "ҫ") + | ("Ҭ" : "ҭ") + | ("Ү" : "ү") + | ("Ұ" : "ұ") + | ("Ҳ" : "ҳ") + | ("Ҵ" : "ҵ") + | ("Ҷ" : "ҷ") + | ("Ҹ" : "ҹ") + | ("Һ" : "һ") + | ("Ҽ" : "ҽ") + | ("Ҿ" : "ҿ") + | ("Ӂ" : "ӂ") + | ("Ӄ" : "ӄ") + | ("Ӆ" : "ӆ") + | ("Ӈ" : "ӈ") + | ("Ӊ" : "ӊ") + | ("Ӌ" : "ӌ") + | ("Ӎ" : "ӎ") + | ("Ӑ" : "ӑ") + | ("Ӓ" : "ӓ") + | ("Ӕ" : "ӕ") + | ("Ӗ" : "ӗ") + | ("Ә" : "ә") + | ("Ӛ" : "ӛ") + | ("Ӝ" : "ӝ") + | ("Ӟ" : "ӟ") + | ("Ӡ" : "ӡ") + | ("Ӣ" : "ӣ") + | ("Ӥ" : "ӥ") + | ("Ӧ" : "ӧ") + | ("Ө" : "ө") + | ("Ӫ" : "ӫ") + | ("Ӭ" : "ӭ") + | ("Ӯ" : "ӯ") + | ("Ӱ" : "ӱ") + | ("Ӳ" : "ӳ") + | ("Ӵ" : "ӵ") + | ("Ӷ" : "ӷ") + | ("Ӹ" : "ӹ") + | ("Ԁ" : "ԁ") + | ("Ԃ" : "ԃ") + | ("Ԅ" : "ԅ") + | ("Ԇ" : "ԇ") + | ("Ԉ" : "ԉ") + | ("Ԋ" : "ԋ") + | ("Ԍ" : "ԍ") + | ("Ԏ" : "ԏ") + | ("Ա" : "ա") + | ("Բ" : "բ") + | ("Գ" : "գ") + | ("Դ" : "դ") + | ("Ե" : "ե") + | ("Զ" : "զ") + | ("Է" : "է") + | ("Ը" : "ը") + | ("Թ" : "թ") + | ("Ժ" : "ժ") + | ("Ի" : "ի") + | ("Լ" : "լ") + | ("Խ" : "խ") + | ("Ծ" : "ծ") + | ("Կ" : "կ") + | ("Հ" : "հ") + | ("Ձ" : "ձ") + | ("Ղ" : "ղ") + | ("Ճ" : "ճ") + | ("Մ" : "մ") + | ("Յ" : "յ") + | ("Ն" : "ն") + | ("Շ" : "շ") + | ("Ո" : "ո") + | ("Չ" : "չ") + | ("Պ" : "պ") + | ("Ջ" : "ջ") + | ("Ռ" : "ռ") + | ("Ս" : "ս") + | ("Վ" : "վ") + | ("Տ" : "տ") + | ("Ր" : "ր") + | ("Ց" : "ց") + | ("Ւ" : "ւ") + | ("Փ" : "փ") + | ("Ք" : "ք") + | ("Օ" : "օ") + | ("Ֆ" : "ֆ") + | ("Ⴀ" : "ⴀ") + | ("Ⴁ" : "ⴁ") + | ("Ⴂ" : "ⴂ") + | ("Ⴃ" : "ⴃ") + | ("Ⴄ" : "ⴄ") + | ("Ⴅ" : "ⴅ") + | ("Ⴆ" : "ⴆ") + | ("Ⴇ" : "ⴇ") + | ("Ⴈ" : "ⴈ") + | ("Ⴉ" : "ⴉ") + | ("Ⴊ" : "ⴊ") + | ("Ⴋ" : "ⴋ") + | ("Ⴌ" : "ⴌ") + | ("Ⴍ" : "ⴍ") + | ("Ⴎ" : "ⴎ") + | ("Ⴏ" : "ⴏ") + | ("Ⴐ" : "ⴐ") + | ("Ⴑ" : "ⴑ") + | ("Ⴒ" : "ⴒ") + | ("Ⴓ" : "ⴓ") + | ("Ⴔ" : "ⴔ") + | ("Ⴕ" : "ⴕ") + | ("Ⴖ" : "ⴖ") + | ("Ⴗ" : "ⴗ") + | ("Ⴘ" : "ⴘ") + | ("Ⴙ" : "ⴙ") + | ("Ⴚ" : "ⴚ") + | 
("Ⴛ" : "ⴛ") + | ("Ⴜ" : "ⴜ") + | ("Ⴝ" : "ⴝ") + | ("Ⴞ" : "ⴞ") + | ("Ⴟ" : "ⴟ") + | ("Ⴠ" : "ⴠ") + | ("Ⴡ" : "ⴡ") + | ("Ⴢ" : "ⴢ") + | ("Ⴣ" : "ⴣ") + | ("Ⴤ" : "ⴤ") + | ("Ⴥ" : "ⴥ") + | ("Ḁ" : "ḁ") + | ("Ḃ" : "ḃ") + | ("Ḅ" : "ḅ") + | ("Ḇ" : "ḇ") + | ("Ḉ" : "ḉ") + | ("Ḋ" : "ḋ") + | ("Ḍ" : "ḍ") + | ("Ḏ" : "ḏ") + | ("Ḑ" : "ḑ") + | ("Ḓ" : "ḓ") + | ("Ḕ" : "ḕ") + | ("Ḗ" : "ḗ") + | ("Ḙ" : "ḙ") + | ("Ḛ" : "ḛ") + | ("Ḝ" : "ḝ") + | ("Ḟ" : "ḟ") + | ("Ḡ" : "ḡ") + | ("Ḣ" : "ḣ") + | ("Ḥ" : "ḥ") + | ("Ḧ" : "ḧ") + | ("Ḩ" : "ḩ") + | ("Ḫ" : "ḫ") + | ("Ḭ" : "ḭ") + | ("Ḯ" : "ḯ") + | ("Ḱ" : "ḱ") + | ("Ḳ" : "ḳ") + | ("Ḵ" : "ḵ") + | ("Ḷ" : "ḷ") + | ("Ḹ" : "ḹ") + | ("Ḻ" : "ḻ") + | ("Ḽ" : "ḽ") + | ("Ḿ" : "ḿ") + | ("Ṁ" : "ṁ") + | ("Ṃ" : "ṃ") + | ("Ṅ" : "ṅ") + | ("Ṇ" : "ṇ") + | ("Ṉ" : "ṉ") + | ("Ṋ" : "ṋ") + | ("Ṍ" : "ṍ") + | ("Ṏ" : "ṏ") + | ("Ṑ" : "ṑ") + | ("Ṓ" : "ṓ") + | ("Ṕ" : "ṕ") + | ("Ṗ" : "ṗ") + | ("Ṙ" : "ṙ") + | ("Ṛ" : "ṛ") + | ("Ṝ" : "ṝ") + | ("Ṟ" : "ṟ") + | ("Ṡ" : "ṡ") + | ("Ṣ" : "ṣ") + | ("Ṥ" : "ṥ") + | ("Ṧ" : "ṧ") + | ("Ṩ" : "ṩ") + | ("Ṫ" : "ṫ") + | ("Ṭ" : "ṭ") + | ("Ṯ" : "ṯ") + | ("Ṱ" : "ṱ") + | ("Ṳ" : "ṳ") + | ("Ṵ" : "ṵ") + | ("Ṷ" : "ṷ") + | ("Ṹ" : "ṹ") + | ("Ṻ" : "ṻ") + | ("Ṽ" : "ṽ") + | ("Ṿ" : "ṿ") + | ("Ẁ" : "ẁ") + | ("Ẃ" : "ẃ") + | ("Ẅ" : "ẅ") + | ("Ẇ" : "ẇ") + | ("Ẉ" : "ẉ") + | ("Ẋ" : "ẋ") + | ("Ẍ" : "ẍ") + | ("Ẏ" : "ẏ") + | ("Ẑ" : "ẑ") + | ("Ẓ" : "ẓ") + | ("Ẕ" : "ẕ") + | ("Ạ" : "ạ") + | ("Ả" : "ả") + | ("Ấ" : "ấ") + | ("Ầ" : "ầ") + | ("Ẩ" : "ẩ") + | ("Ẫ" : "ẫ") + | ("Ậ" : "ậ") + | ("Ắ" : "ắ") + | ("Ằ" : "ằ") + | ("Ẳ" : "ẳ") + | ("Ẵ" : "ẵ") + | ("Ặ" : "ặ") + | ("Ẹ" : "ẹ") + | ("Ẻ" : "ẻ") + | ("Ẽ" : "ẽ") + | ("Ế" : "ế") + | ("Ề" : "ề") + | ("Ể" : "ể") + | ("Ễ" : "ễ") + | ("Ệ" : "ệ") + | ("Ỉ" : "ỉ") + | ("Ị" : "ị") + | ("Ọ" : "ọ") + | ("Ỏ" : "ỏ") + | ("Ố" : "ố") + | ("Ồ" : "ồ") + | ("Ổ" : "ổ") + | ("Ỗ" : "ỗ") + | ("Ộ" : "ộ") + | ("Ớ" : "ớ") + | ("Ờ" : "ờ") + | ("Ở" : "ở") + | ("Ỡ" : "ỡ") + | ("Ợ" : "ợ") + | ("Ụ" : "ụ") + | ("Ủ" : "ủ") + | ("Ứ" : "ứ") + | ("Ừ" : "ừ") + | ("Ử" : "ử") + | 
("Ữ" : "ữ") + | ("Ự" : "ự") + | ("Ỳ" : "ỳ") + | ("Ỵ" : "ỵ") + | ("Ỷ" : "ỷ") + | ("Ỹ" : "ỹ") + | ("Ἀ" : "ἀ") + | ("Ἁ" : "ἁ") + | ("Ἂ" : "ἂ") + | ("Ἃ" : "ἃ") + | ("Ἄ" : "ἄ") + | ("Ἅ" : "ἅ") + | ("Ἆ" : "ἆ") + | ("Ἇ" : "ἇ") + | ("Ἐ" : "ἐ") + | ("Ἑ" : "ἑ") + | ("Ἒ" : "ἒ") + | ("Ἓ" : "ἓ") + | ("Ἔ" : "ἔ") + | ("Ἕ" : "ἕ") + | ("Ἠ" : "ἠ") + | ("Ἡ" : "ἡ") + | ("Ἢ" : "ἢ") + | ("Ἣ" : "ἣ") + | ("Ἤ" : "ἤ") + | ("Ἥ" : "ἥ") + | ("Ἦ" : "ἦ") + | ("Ἧ" : "ἧ") + | ("Ἰ" : "ἰ") + | ("Ἱ" : "ἱ") + | ("Ἲ" : "ἲ") + | ("Ἳ" : "ἳ") + | ("Ἴ" : "ἴ") + | ("Ἵ" : "ἵ") + | ("Ἶ" : "ἶ") + | ("Ἷ" : "ἷ") + | ("Ὀ" : "ὀ") + | ("Ὁ" : "ὁ") + | ("Ὂ" : "ὂ") + | ("Ὃ" : "ὃ") + | ("Ὄ" : "ὄ") + | ("Ὅ" : "ὅ") + | ("Ὑ" : "ὑ") + | ("Ὓ" : "ὓ") + | ("Ὕ" : "ὕ") + | ("Ὗ" : "ὗ") + | ("Ὠ" : "ὠ") + | ("Ὡ" : "ὡ") + | ("Ὢ" : "ὢ") + | ("Ὣ" : "ὣ") + | ("Ὤ" : "ὤ") + | ("Ὥ" : "ὥ") + | ("Ὦ" : "ὦ") + | ("Ὧ" : "ὧ") + | ("ᾈ" : "ᾀ") + | ("ᾉ" : "ᾁ") + | ("ᾊ" : "ᾂ") + | ("ᾋ" : "ᾃ") + | ("ᾌ" : "ᾄ") + | ("ᾍ" : "ᾅ") + | ("ᾎ" : "ᾆ") + | ("ᾏ" : "ᾇ") + | ("ᾘ" : "ᾐ") + | ("ᾙ" : "ᾑ") + | ("ᾚ" : "ᾒ") + | ("ᾛ" : "ᾓ") + | ("ᾜ" : "ᾔ") + | ("ᾝ" : "ᾕ") + | ("ᾞ" : "ᾖ") + | ("ᾟ" : "ᾗ") + | ("ᾨ" : "ᾠ") + | ("ᾩ" : "ᾡ") + | ("ᾪ" : "ᾢ") + | ("ᾫ" : "ᾣ") + | ("ᾬ" : "ᾤ") + | ("ᾭ" : "ᾥ") + | ("ᾮ" : "ᾦ") + | ("ᾯ" : "ᾧ") + | ("Ᾰ" : "ᾰ") + | ("Ᾱ" : "ᾱ") + | ("Ὰ" : "ὰ") + | ("Ά" : "ά") + | ("ᾼ" : "ᾳ") + | ("Ὲ" : "ὲ") + | ("Έ" : "έ") + | ("Ὴ" : "ὴ") + | ("Ή" : "ή") + | ("ῌ" : "ῃ") + | ("Ῐ" : "ῐ") + | ("Ῑ" : "ῑ") + | ("Ὶ" : "ὶ") + | ("Ί" : "ί") + | ("Ῠ" : "ῠ") + | ("Ῡ" : "ῡ") + | ("Ὺ" : "ὺ") + | ("Ύ" : "ύ") + | ("Ῥ" : "ῥ") + | ("Ὸ" : "ὸ") + | ("Ό" : "ό") + | ("Ὼ" : "ὼ") + | ("Ώ" : "ώ") + | ("ῼ" : "ῳ") + | ("Ⓐ" : "ⓐ") + | ("Ⓑ" : "ⓑ") + | ("Ⓒ" : "ⓒ") + | ("Ⓓ" : "ⓓ") + | ("Ⓔ" : "ⓔ") + | ("Ⓕ" : "ⓕ") + | ("Ⓖ" : "ⓖ") + | ("Ⓗ" : "ⓗ") + | ("Ⓘ" : "ⓘ") + | ("Ⓙ" : "ⓙ") + | ("Ⓚ" : "ⓚ") + | ("Ⓛ" : "ⓛ") + | ("Ⓜ" : "ⓜ") + | ("Ⓝ" : "ⓝ") + | ("Ⓞ" : "ⓞ") + | ("Ⓟ" : "ⓟ") + | ("Ⓠ" : "ⓠ") + | ("Ⓡ" : "ⓡ") + | ("Ⓢ" : "ⓢ") + | ("Ⓣ" : "ⓣ") + | ("Ⓤ" : "ⓤ") + | ("Ⓥ" : "ⓥ") + | ("Ⓦ" : "ⓦ") + | 
("Ⓧ" : "ⓧ") + | ("Ⓨ" : "ⓨ") + | ("Ⓩ" : "ⓩ") + | ("Ⰰ" : "ⰰ") + | ("Ⰱ" : "ⰱ") + | ("Ⰲ" : "ⰲ") + | ("Ⰳ" : "ⰳ") + | ("Ⰴ" : "ⰴ") + | ("Ⰵ" : "ⰵ") + | ("Ⰶ" : "ⰶ") + | ("Ⰷ" : "ⰷ") + | ("Ⰸ" : "ⰸ") + | ("Ⰹ" : "ⰹ") + | ("Ⰺ" : "ⰺ") + | ("Ⰻ" : "ⰻ") + | ("Ⰼ" : "ⰼ") + | ("Ⰽ" : "ⰽ") + | ("Ⰾ" : "ⰾ") + | ("Ⰿ" : "ⰿ") + | ("Ⱀ" : "ⱀ") + | ("Ⱁ" : "ⱁ") + | ("Ⱂ" : "ⱂ") + | ("Ⱃ" : "ⱃ") + | ("Ⱄ" : "ⱄ") + | ("Ⱅ" : "ⱅ") + | ("Ⱆ" : "ⱆ") + | ("Ⱇ" : "ⱇ") + | ("Ⱈ" : "ⱈ") + | ("Ⱉ" : "ⱉ") + | ("Ⱊ" : "ⱊ") + | ("Ⱋ" : "ⱋ") + | ("Ⱌ" : "ⱌ") + | ("Ⱍ" : "ⱍ") + | ("Ⱎ" : "ⱎ") + | ("Ⱏ" : "ⱏ") + | ("Ⱐ" : "ⱐ") + | ("Ⱑ" : "ⱑ") + | ("Ⱒ" : "ⱒ") + | ("Ⱓ" : "ⱓ") + | ("Ⱔ" : "ⱔ") + | ("Ⱕ" : "ⱕ") + | ("Ⱖ" : "ⱖ") + | ("Ⱗ" : "ⱗ") + | ("Ⱘ" : "ⱘ") + | ("Ⱙ" : "ⱙ") + | ("Ⱚ" : "ⱚ") + | ("Ⱛ" : "ⱛ") + | ("Ⱜ" : "ⱜ") + | ("Ⱝ" : "ⱝ") + | ("Ⱞ" : "ⱞ") + | ("Ⲁ" : "ⲁ") + | ("Ⲃ" : "ⲃ") + | ("Ⲅ" : "ⲅ") + | ("Ⲇ" : "ⲇ") + | ("Ⲉ" : "ⲉ") + | ("Ⲋ" : "ⲋ") + | ("Ⲍ" : "ⲍ") + | ("Ⲏ" : "ⲏ") + | ("Ⲑ" : "ⲑ") + | ("Ⲓ" : "ⲓ") + | ("Ⲕ" : "ⲕ") + | ("Ⲗ" : "ⲗ") + | ("Ⲙ" : "ⲙ") + | ("Ⲛ" : "ⲛ") + | ("Ⲝ" : "ⲝ") + | ("Ⲟ" : "ⲟ") + | ("Ⲡ" : "ⲡ") + | ("Ⲣ" : "ⲣ") + | ("Ⲥ" : "ⲥ") + | ("Ⲧ" : "ⲧ") + | ("Ⲩ" : "ⲩ") + | ("Ⲫ" : "ⲫ") + | ("Ⲭ" : "ⲭ") + | ("Ⲯ" : "ⲯ") + | ("Ⲱ" : "ⲱ") + | ("Ⲳ" : "ⲳ") + | ("Ⲵ" : "ⲵ") + | ("Ⲷ" : "ⲷ") + | ("Ⲹ" : "ⲹ") + | ("Ⲻ" : "ⲻ") + | ("Ⲽ" : "ⲽ") + | ("Ⲿ" : "ⲿ") + | ("Ⳁ" : "ⳁ") + | ("Ⳃ" : "ⳃ") + | ("Ⳅ" : "ⳅ") + | ("Ⳇ" : "ⳇ") + | ("Ⳉ" : "ⳉ") + | ("Ⳋ" : "ⳋ") + | ("Ⳍ" : "ⳍ") + | ("Ⳏ" : "ⳏ") + | ("Ⳑ" : "ⳑ") + | ("Ⳓ" : "ⳓ") + | ("Ⳕ" : "ⳕ") + | ("Ⳗ" : "ⳗ") + | ("Ⳙ" : "ⳙ") + | ("Ⳛ" : "ⳛ") + | ("Ⳝ" : "ⳝ") + | ("Ⳟ" : "ⳟ") + | ("Ⳡ" : "ⳡ") + | ("Ⳣ" : "ⳣ") + | ("A" : "a") + | ("B" : "b") + | ("C" : "c") + | ("D" : "d") + | ("E" : "e") + | ("F" : "f") + | ("G" : "g") + | ("H" : "h") + | ("I" : "i") + | ("J" : "j") + | ("K" : "k") + | ("L" : "l") + | ("M" : "m") + | ("N" : "n") + | ("O" : "o") + | ("P" : "p") + | ("Q" : "q") + | ("R" : "r") + | ("S" : "s") + | ("T" : "t") + | ("U" : "u") + | ("V" : "v") + | ("W" : "w") + | ("X" : "x") + | ("Y" : "y") + | 
("Z" : "z") +; + +sigma_star = Optimize[bytelib.kBytes*] ; + +export TOUPPER_DETERMINISTIC = + CDRewrite[toupper_deterministic, "", "", sigma_star, 'ltr', 'obl'] ; + +export TOUPPER = CDRewrite[toupper, "", "", sigma_star, 'ltr', 'obl'] ; + +export TOLOWER = CDRewrite[tolower, "", "", sigma_star, 'ltr', 'obl'] ; + +export a_through_z_toupper = Optimize[ + ("a" : "A") + | ("b" : "B") + | ("c" : "C") + | ("d" : "D") + | ("e" : "E") + | ("f" : "F") + | ("g" : "G") + | ("h" : "H") + | ("i" : "I") + | ("j" : "J") + | ("k" : "K") + | ("l" : "L") + | ("m" : "M") + | ("n" : "N") + | ("o" : "O") + | ("p" : "P") + | ("q" : "Q") + | ("r" : "R") + | ("s" : "S") + | ("t" : "T") + | ("u" : "U") + | ("v" : "V") + | ("w" : "W") + | ("x" : "X") + | ("y" : "Y") + | ("z" : "Z") +]; diff --git a/third_party/chinese_text_normalization/thrax/src/util/germanic.tsv b/third_party/chinese_text_normalization/thrax/src/util/germanic.tsv new file mode 100644 index 000000000..6285e0106 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/src/util/germanic.tsv @@ -0,0 +1,81 @@ +1 10 11 +2 10 12 +3 10 13 +4 10 14 +5 10 15 +6 10 16 +7 10 17 +8 10 18 +9 10 19 +1 20 21 +2 20 22 +3 20 23 +4 20 24 +5 20 25 +6 20 26 +7 20 27 +8 20 28 +9 20 29 +1 30 31 +2 30 32 +3 30 33 +4 30 34 +5 30 35 +6 30 36 +7 30 37 +8 30 38 +9 30 39 +1 40 41 +2 40 42 +3 40 43 +4 40 44 +5 40 45 +6 40 46 +7 40 47 +8 40 48 +9 40 49 +1 50 51 +2 50 52 +3 50 53 +4 50 54 +5 50 55 +6 50 56 +7 50 57 +8 50 58 +9 50 59 +1 60 61 +2 60 62 +3 60 63 +4 60 64 +5 60 65 +6 60 66 +7 60 67 +8 60 68 +9 60 69 +1 70 71 +2 70 72 +3 70 73 +4 70 74 +5 70 75 +6 70 76 +7 70 77 +8 70 78 +9 70 79 +1 80 81 +2 80 82 +3 80 83 +4 80 84 +5 80 85 +6 80 86 +7 80 87 +8 80 88 +9 80 89 +1 90 91 +2 90 92 +3 90 93 +4 90 94 +5 90 95 +6 90 96 +7 90 97 +8 90 98 +9 90 99 diff --git a/third_party/chinese_text_normalization/thrax/src/util/util.grm b/third_party/chinese_text_normalization/thrax/src/util/util.grm new file mode 100644 index 000000000..bb559235f --- 
/dev/null +++ b/third_party/chinese_text_normalization/thrax/src/util/util.grm @@ -0,0 +1,528 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Utility functions. + +import 'util/byte.grm' as bytelib; +import 'util/case.grm' as case; + +# A simplification helper function that encapsulates the left-to-right and +# obligatory options. +func CDR[t, l, r, s] { + return CDRewrite[t, l, r, s, 'ltr', 'obl']; +} + +# Useful insertion and deletion functions. + +func I[expr] { + return "" : expr; +} + +func D[expr] { + return expr : ""; +} + +# A machine that accepts nothing. +export NULL = Optimize["" - ""]; + +export d1to9 = Optimize[bytelib.kDigit - "0"]; +export d02to9 = Optimize[bytelib.kDigit - "1"]; +export d2to9 = Optimize[d02to9 - "0"]; +# Any number that isn't zero. May have leading zeroes. +export non_zero_number = Optimize["0"* d1to9 bytelib.kDigit*]; +# Any number, allowing for factorization markers. +export factorized_number = Optimize[(bytelib.kDigit | "\[" | "E" | "\]")*]; +export non_zero_factorized_number = Optimize["0"* d1to9 factorized_number]; + +export ins_space = "" : " "; +export ins_sil = "" : " sil "; +export ins_short_sil = "" : " sil|short "; +export ins_quote = "" : "\""; + +# Caveat: pass_anything does not pass stuff like "[~~]". 
+export pass_anything = bytelib.kBytes*; +export pass_any_word = bytelib.kNotSpace+; + +export pass_space_plus = bytelib.kSpace+; +export pass_space_star = bytelib.kSpace*; + +export clear_space = bytelib.kSpace : ""; +export clear_space_plus = bytelib.kSpace+ : ""; +export clear_space_star = bytelib.kSpace* : ""; + +export space_to_underscore = (bytelib.kAlnum | (" " : "_"))*; +export one_space = clear_space_star ins_space; + +export CLEAN_SPACES = Optimize[ + "" | (clear_space_star + (pass_any_word (bytelib.kSpace+ : " "))* + pass_any_word clear_space_star)] +; + +export del_space_star = " "* : ""; +export del_space_plus = " "+ : ""; + +export sigma_star = Optimize[pass_anything]; + +export DELETE_SPACES = + CDRewrite[clear_space_plus, "", "", sigma_star]; + +export REMOVE_LEADING_SPACES = + CDRewrite[clear_space_plus, "[BOS]", "", sigma_star]; + +export REMOVE_FINAL_SPACES = + CDRewrite[clear_space_plus, "", "[EOS]", sigma_star]; + +export REMOVE_BOUNDARY_SPACES = REMOVE_LEADING_SPACES @ REMOVE_FINAL_SPACES; + +export delete_initial_zero = + CDRewrite["0" : "", "[BOS]", bytelib.kDigit, sigma_star]; + +export lower_case_letter = Optimize[case.tolower | case.LOWER | bytelib.kLower]; +export lower_case = Optimize[lower_case_letter+]; +export lower_case_anything = case.TOLOWER; + +export upper_case_letter = Optimize[case.toupper | case.UPPER | bytelib.kUpper]; +export upper_case = Optimize[upper_case_letter+]; +export upper_case_anything = case.TOUPPER; + +export opening_brace = del_space_star ("{" : "") del_space_star; +export closing_brace = del_space_star ("}" : "") del_space_star; + +export quote = del_space_star ("\"" : "") del_space_star; +export double_quote = del_space_star ("\"\"" : "") del_space_star; + +export VOWELS = Optimize["a" | "e" | "i" | "o" | "u"]; +export VOWELS_Y = Optimize["a" | "e" | "i" | "o" | "u" | "y"]; +export VOWELS_INSENSITIVE = Optimize[VOWELS_Y | "A" | "E" | "I" + | "O" | "U" | "Y"]; +export CONSONANTS = Optimize[bytelib.kLower - 
VOWELS]; +export CONSONANTS_INSENSITIVE = Optimize[bytelib.kAlpha - VOWELS_INSENSITIVE]; + +# LSEQs that can be used for URL verbalization for all languages; +# mainly protocol names & file extensions. +export URL_LSEQS = Optimize["www" | "edu" | "ftp" | "htm" | "html" | "imdb" | + "php" | "asp" | "aspx" | "bbc" | "cgi" | "xhtml" | + "shtml" | "jsp"]; + +# Rule for swapping cardinal to decimal; useful for measures where +# both can appear in the proto but may be handled similarly. +export CARDINAL_TO_DECIMAL = Optimize[ + CDRewrite["cardinal" : "decimal", "", "", sigma_star] @ + CDRewrite["integer:" : "integer_part:", "", "", sigma_star] +]; + +export escape_quotes_and_backslashes = + ((bytelib.kBytes - "\"" - "\\") | ("\"" : "\\\"") | ("\\" : "\\\\"))* +; + +## Generally useful definition: + +export hours = + "0" + | "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + | "10" + | "11" + | "12" + | "13" + | "14" + | "15" + | "16" + | "17" + | "18" + | "19" + | "20" + | "21" + | "22" + | "23" + | "24" +; + +export hours_shift = + ("0" : "1") + | ("1" : "2") + | ("2" : "3") + | ("3" : "4") + | ("4" : "5") + | ("5" : "6") + | ("6" : "7") + | ("7" : "8") + | ("8" : "9") + | ("9" : "10") + | ("10" : "11") + | ("11" : "12") + | ("12" : "13") + | ("13" : "14") + | ("14" : "15") + | ("15" : "16") + | ("16" : "17") + | ("17" : "18") + | ("18" : "19") + | ("19" : "20") + | ("20" : "21") + | ("21" : "22") + | ("22" : "23") + | ("23" : "24") + | ("24" : "1") +; + +export hours_24_to_12 = + ("0" : "12") + | "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + | "10" + | "11" + | "12" + | ("13" : "1") + | ("14" : "2") + | ("15" : "3") + | ("16" : "4") + | ("17" : "5") + | ("18" : "6") + | ("19" : "7") + | ("20" : "8") + | ("21" : "9") + | ("22" : "10") + | ("23" : "11") + | ("24" : "12") +; + +export hours_24_to_12_next = + ("0" : "1") + | ("1" : "2") + | ("2" : "3") + | ("3" : "4") + | ("4" : "5") + | ("5" : "6") + | ("6" : "7") + | ("7" : "8") 
+ | ("8" : "9") + | ("9" : "10") + | ("10" : "11") + | ("11" : "12") + | ("12" : "1") + | ("13" : "2") + | ("14" : "3") + | ("15" : "4") + | ("16" : "5") + | ("17" : "6") + | ("18" : "7") + | ("19" : "8") + | ("20" : "9") + | ("21" : "10") + | ("22" : "11") + | ("23" : "12") + | ("24" : "1") +; + +export minutes = + "0" + | "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + | "10" + | "11" + | "12" + | "13" + | "14" + | "15" + | "16" + | "17" + | "18" + | "19" + | "20" + | "21" + | "22" + | "23" + | "24" + | "25" + | "26" + | "27" + | "28" + | "29" + | "30" + | "31" + | "32" + | "33" + | "34" + | "35" + | "36" + | "37" + | "38" + | "39" + | "40" + | "41" + | "42" + | "43" + | "44" + | "45" + | "46" + | "47" + | "48" + | "49" + | "50" + | "51" + | "52" + | "53" + | "54" + | "55" + | "56" + | "57" + | "58" + | "59" +; + +export round_minutes = + ("1" : "0") + | ("2" : "0") + | ("3" : "5") + | ("4" : "5") + | ("6" : "5") + | ("7" : "5") + | ("8" : "10") + | ("9" : "10") + | ("11" : "10") + | ("12" : "10") + | ("13" : "15") + | ("14" : "15") + | ("16" : "15") + | ("17" : "15") + | ("18" : "20") + | ("19" : "20") + | ("21" : "20") + | ("22" : "20") + | ("23" : "25") + | ("24" : "25") + | ("26" : "25") + | ("27" : "25") + | ("28" : "30") + | ("29" : "30") + | ("31" : "30") + | ("32" : "30") + | ("33" : "35") + | ("34" : "35") + | ("36" : "35") + | ("37" : "35") + | ("38" : "40") + | ("39" : "40") + | ("41" : "40") + | ("42" : "40") + | ("43" : "45") + | ("44" : "45") + | ("46" : "45") + | ("47" : "45") + | ("48" : "50") + | ("49" : "50") + | ("51" : "50") + | ("52" : "50") + | ("53" : "55") + | ("54" : "55") + | ("56" : "55") + | ("57" : "55") +; + +export unrounded_minutes = + ("0" : "0") + | ("5" : "5") + | ("10" : "10") + | ("15" : "15") + | ("20" : "20") + | ("25" : "25") + | ("30" : "30") + | ("35" : "35") + | ("40" : "40") + | ("45" : "45") + | ("50" : "50") + | ("55" : "55") +; + +export round_minutes_next_hour = + ("58" : "0") + | ("59" : "0") 
+; + +export subtract_from_60 = + "30" + | ("31" : "29" ) + | ("32" : "28" ) + | ("33" : "27" ) + | ("34" : "26" ) + | ("35" : "25" ) + | ("36" : "24" ) + | ("37" : "23" ) + | ("38" : "22" ) + | ("39" : "21" ) + | ("40" : "20" ) + | ("41" : "19" ) + | ("42" : "18" ) + | ("43" : "17" ) + | ("44" : "16" ) + | ("45" : "15" ) + | ("46" : "14" ) + | ("47" : "13" ) + | ("48" : "12" ) + | ("49" : "11" ) + | ("50" : "10" ) + | ("51" : "9" ) + | ("52" : "8" ) + | ("53" : "7" ) + | ("54" : "6" ) + | ("55" : "5" ) + | ("56" : "4" ) + | ("57" : "3" ) + | ("58" : "2" ) + | ("59" : "1" ) +; + +export any_month = + (("0" : "")? + ( + "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + )) + | "10" + | "11" + | "12" +; + +export any_day = + (("0" : "")? + ( + "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + )) + | "10" + | "11" + | "12" + | "13" + | "14" + | "15" + | "16" + | "17" + | "18" + | "19" + | "20" + | "21" + | "22" + | "23" + | "24" + | "25" + | "26" + | "27" + | "28" + | "29" + | "30" + | "31" +; + +## TODO: These rules need to be coordinated with the markup since that may +## change. + +export approximately = "[~~]"; + +## Rounded: say "approximately". + +approx1 = Optimize[ + "minutes:" + ("" : approximately) (minutes @ round_minutes) + "|" + "hours:" + hours + "|" + pass_anything] +; + +## Rounded to next hour. + +approx2 = Optimize[ + "minutes:" + ("" : approximately) round_minutes_next_hour + "|" + "hours:" + hours_shift + "|" + pass_anything] +; + +## Not rounded: don't say "approximately". + +approx3 = Optimize[ + "minutes:" + (minutes @ unrounded_minutes) + "|" + "hours:" + hours + "|" + pass_anything] +; + +export approx = Optimize[ + approx1 | approx2 | approx3 +]; + +# "|" and "\" are escaped in the new serialization scheme using a backslash, so +# we need to adjust these in the verbatim mappings. 
+ +func EscapedMappings[raw_mappings] { + escapes = ("\\\\" : "\\") | ("\\|" : "|"); + return Optimize[ + ((Project[raw_mappings, 'input'] - Project[escapes, 'output']) | escapes) + @ raw_mappings + ]; +} + +# Allows verbatim grammars to be more permissive by accepting all inputs, it +# simply consumes the input if it is not present in the raw mappings. + +func ConsumeUnmapped[raw_mappings] { + unmapped = bytelib.kBytes - Project[raw_mappings, 'input']; + return Optimize[ + D[unmapped]<20> + ]; +} diff --git a/third_party/chinese_text_normalization/thrax/testcase_cn.txt b/third_party/chinese_text_normalization/thrax/testcase_cn.txt new file mode 100644 index 000000000..688456e86 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/testcase_cn.txt @@ -0,0 +1,54 @@ +一 +二 +三 +四 +五 +六 +七 +八 +九 +十 +十九 +二十 +二十八 +三十 +三十七 +四十 +四十六 +五十 +五十五 +六十 +六十四 +七十 +七十三 +八十 +八十二 +九十 +九十一 +一百 +一百零一 +一百一 +一百一十二 +两百二 +二百二十三 +三百三 +三百三十四 +四百四 +四百四十五 +一千五百五 +两千五百五十六 +三千六百六 +四千六百六十七 +五千七百七 +六千七百七十八 +七千八百八 +八千八百八十九 +九千九百九 +九千九百九十一 +二零一九年九月十二日 +两千零五年八月五号 +八五年二月二十七日 +公元一六三年 +零六年一月二号 +有百分之六十二的人认为 + diff --git a/third_party/chinese_text_normalization/thrax/testcase_en.txt b/third_party/chinese_text_normalization/thrax/testcase_en.txt new file mode 100644 index 000000000..b5c1312b3 --- /dev/null +++ b/third_party/chinese_text_normalization/thrax/testcase_en.txt @@ -0,0 +1,14 @@ +23,000 +1980 +8:35 +14.5 +1/4 +2/15 +5% +$10086 +www.google.com +5:50 a.m. +4:30 PM +www.interspeech.edu +www.iccasp.net +jiayu@gmail.com diff --git a/third_party/install.sh b/third_party/install.sh new file mode 100644 index 000000000..b3d5197bd --- /dev/null +++ b/third_party/install.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# install kaldi-compatible feature +pushd python_kaldi_features +python3 setup.py install +if [ $? != 0 ]; then + echo "Please check why kaldi feature install error!" >&2 + exit -1 +fi +popd + +# install zhon +pushd zhon +python3 setup.py install +if [ $?
!= 0 ]; then + echo "Please check why zhon install error!" >&2 + exit -1 +fi +popd + +# install pypinyin +pushd python-pinyin +python3 setup.py install +if [ $? != 0 ]; then + echo "Please check why pypinyin install error!" >&2 + exit -1 +fi +popd + +# install mmseg +pushd pymmseg-cpp/ +python3 setup.py install +if [ $? != 0 ]; then + echo "Please check why pymmseg install error!" >&2 + exit -1 +fi +popd + diff --git a/third_party/zhon/.gitignore b/third_party/zhon/.gitignore new file mode 100644 index 000000000..b5a8ab121 --- /dev/null +++ b/third_party/zhon/.gitignore @@ -0,0 +1,3 @@ +build +dist +*egg-info diff --git a/third_party/zhon/.travis.yml b/third_party/zhon/.travis.yml new file mode 100644 index 000000000..d426bf49b --- /dev/null +++ b/third_party/zhon/.travis.yml @@ -0,0 +1,21 @@ +sudo: false +language: python +install: pip install tox +script: tox + +matrix: + include: + - python: 2.7 + env: TOXENV=py27 + - python: 3.4 + env: TOXENV=py34 + - python: 3.5 + env: TOXENV=py35 + - python: 3.6 + env: TOXENV=py36 + - python: 3.6 + env: TOXENV=pep8 + - python: 3.6 + env: TOXENV=docs + - python: 3.6 + env: TOXENV=packaging diff --git a/third_party/zhon/AUTHORS.rst b/third_party/zhon/AUTHORS.rst new file mode 100644 index 000000000..08a4360ed --- /dev/null +++ b/third_party/zhon/AUTHORS.rst @@ -0,0 +1,14 @@ +======= +Credits +======= + +Author and Maintainer +--------------------- + +* Thomas Roten <https://github.com/tsroten> + +Contributors +------------ + +None yet. Why not be the first? + diff --git a/third_party/zhon/CHANGES.rst b/third_party/zhon/CHANGES.rst new file mode 100644 index 000000000..75bb91302 --- /dev/null +++ b/third_party/zhon/CHANGES.rst @@ -0,0 +1,88 @@ +Changes +======= + +v0.1.0 (2013-05-05) +------------------- + +* Initial release + +v0.1.1 (2013-05-05) +------------------- + +* Adds zhon.cedict package to setup.py + +v0.2.0 (2013-05-07) +------------------- + +* Allows for mapping between simplified and traditional.
+* Adds logging to build_string(). +* Adds constants for numbered Pinyin and accented Pinyin. + +v0.2.1 (2013-05-07) +------------------- + +* Fixes typo in README.rst. + +v.1.0.0 (2014-01-25) +-------------------- + +* Complete rewrite that refactors code, renames constants, and improves Pinyin + support. + +v.1.1.0 (2014-01-28) +-------------------- + +* Adds ``zhon.pinyin.punctuation`` constant. +* Adds ``zhon.pinyin.accented_syllable``, ``zhon.pinyin.accented_word``, and + ``zhon.pinyin.accented_sentence`` constants. +* Adds ``zhon.pinyin.numbered_syllable``, ``zhon.pinyin.numbered_word``, and + ``zhon.pinyin.numbered_sentence`` constants. +* Fixes some README.rst typos. +* Clarifies information regarding Traditional and Simplified character + constants in README.rst. +* Adds constant short names to README.rst. + +v.1.1.1 (2014-01-29) +-------------------- + +* Adds documentation. +* Adds ``zhon.cedict.all`` constant. +* Removes duplicate code ranges from ``zhon.hanzi.characters``. +* Makes ``zhon.hanzi.non_stops`` a string containing all non-stops instead of + a string containing code ranges. +* Removes duplicate letters in ``zhon.pinyin.consonants``. +* Refactors Pinyin vowels/consonant code. +* Removes the Latin alpha from ``zhon.pinyin.vowels``. Fixes #16. +* Adds ``cjk_ideographs`` alias for ``zhon.hanzi.characters``. +* Fixes various typos. +* Removes numbers from Pinyin word constants. Fixes #15. +* Adds lowercase and uppercase constants to ``zhon.pinyin``. +* Fixes a bug with ``zhon.pinyin.sentence``. +* Adds ``sent`` alias for ``zhon.pinyin.sentence``. + +v.1.1.2 (2014-01-31) +-------------------- + +* Fixes bug with ``zhon.cedict.all``. + +v.1.1.3 (2014-02-12) +-------------------- + +* Adds Ideographic number zero to ``zhon.hanzi.characters``. Fixes #17. +* Fixes r-suffix bug. Fixes #18. + +v.1.1.4 (2015-01-25) +-------------------- + +* Removes duplicate module declarations in documentation. +* Moves tests inside zhon package. 
+* Adds travis config file. +* Adds Python 3.4 tests to travis and tox. +* Fixes flake8 warnings. +* Adds distutils fallback import statement to setup.py. +* Adds missing hanzi punctuation. Fixes #19. + +v.1.1.5 (2016-05-23) +-------------------- + +* Add missing Zhuyin characters. Fixes #23. diff --git a/third_party/zhon/CONTRIBUTING.rst b/third_party/zhon/CONTRIBUTING.rst new file mode 100644 index 000000000..e6a50bdf4 --- /dev/null +++ b/third_party/zhon/CONTRIBUTING.rst @@ -0,0 +1,107 @@ +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/tsroten/zhon/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" +is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "feature" +is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +Zhon could always use more documentation, whether as part of the +official Zhon docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/tsroten/zhon/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute?
Here's how to set up `zhon` for local development. + +1. Fork the `zhon` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/zhon.git + +3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + + $ mkvirtualenv zhon + $ cd zhon/ + $ python setup.py develop + +4. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: + + $ flake8 zhon + $ python setup.py test + $ tox + + To get flake8 and tox, just pip install them into your virtualenv. + + You can ignore the flake8 errors regarding `zhon.cedict` files. Rather than include hundreds of newline characters in each file, we are ignoring those errors. + +6. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +7. Submit a pull request through the GitHub website. + +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 2.7, 3.3, and 3.4. Check + https://travis-ci.org/tsroten/zhon/pull_requests + and make sure that the tests pass for all supported Python versions. +4. If you want to receive credit, add your name to `AUTHORS.rst`. 
diff --git a/third_party/zhon/LICENSE.txt b/third_party/zhon/LICENSE.txt new file mode 100644 index 000000000..9c7b63736 --- /dev/null +++ b/third_party/zhon/LICENSE.txt @@ -0,0 +1,7 @@ +Copyright (c) 2013-2014 Thomas Roten + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/zhon/MANIFEST.in b/third_party/zhon/MANIFEST.in new file mode 100644 index 000000000..b39761fc8 --- /dev/null +++ b/third_party/zhon/MANIFEST.in @@ -0,0 +1,6 @@ +include *.txt *.rst +include Makefile +include tox.ini +recursive-include docs * +recursive-include tests *.py +prune docs/_build diff --git a/third_party/zhon/Makefile b/third_party/zhon/Makefile new file mode 100644 index 000000000..2abcc3733 --- /dev/null +++ b/third_party/zhon/Makefile @@ -0,0 +1,42 @@ +PROJECT = zhon + +.PHONY: docs clean lint test test-all coverage dist release + +help: + @echo "clean - remove all build artifacts" + @echo "lint - check style with flake8" + @echo "test - run tests quickly with the current Python" + @echo "test-all - run tests in all environments" + @echo "coverage - check code coverage" + @echo "docs - generate Sphinx HTML documentation" + @echo "dist - make the source and binary distributions" + @echo "release - package and upload a release" + +clean: + rm -rf build dist egg *.egg-info htmlcov + find . -name '*.py[co]' -exec rm -f {} + + $(MAKE) -C docs clean + +lint: + flake8 $(PROJECT) tests setup.py + +test: + python setup.py test + +test-all: + tox + +coverage: + coverage run --source $(PROJECT) setup.py test + coverage report --fail-under=100 + +docs: + $(MAKE) -C docs clean + $(MAKE) -C docs html + open docs/_build/html/index.html + +dist: clean + python setup.py sdist bdist_wheel + +release: clean dist + twine upload -s dist/* diff --git a/third_party/zhon/README.rst b/third_party/zhon/README.rst new file mode 100644 index 000000000..fad7c313d --- /dev/null +++ b/third_party/zhon/README.rst @@ -0,0 +1,64 @@ +==== +Zhon +==== + +.. image:: https://badge.fury.io/py/zhon.png + :target: http://badge.fury.io/py/zhon + +.. image:: https://travis-ci.org/tsroten/zhon.png?branch=develop + :target: https://travis-ci.org/tsroten/zhon + +Zhon is a Python library that provides constants commonly used in Chinese text +processing. 
+ +* Documentation: http://zhon.rtfd.org +* GitHub: https://github.com/tsroten/zhon +* Support: https://github.com/tsroten/zhon/issues +* Free software: `MIT license <http://opensource.org/licenses/MIT>`_ + +About +----- + +Zhon's constants can be used in Chinese text processing, for example: + +* Find CJK characters in a string: + + .. code:: python + + >>> re.findall('[{}]'.format(zhon.hanzi.characters), 'I broke a plate: 我打破了一个盘子.') + ['我', '打', '破', '了', '一', '个', '盘', '子'] + +* Validate Pinyin syllables, words, or sentences: + + .. code:: python + + >>> re.findall(zhon.pinyin.syllable, 'Yuànzi lǐ tíngzhe yí liàng chē.', re.I) + ['Yuàn', 'zi', 'lǐ', 'tíng', 'zhe', 'yí', 'liàng', 'chē'] + + >>> re.findall(zhon.pinyin.word, 'Yuànzi lǐ tíngzhe yí liàng chē.', re.I) + ['Yuànzi', 'lǐ', 'tíngzhe', 'yí', 'liàng', 'chē'] + + >>> re.findall(zhon.pinyin.sentence, 'Yuànzi lǐ tíngzhe yí liàng chē.', re.I) + ['Yuànzi lǐ tíngzhe yí liàng chē.'] + +Features +-------- + ++ Includes commonly-used constants: + - CJK characters and radicals + - Chinese punctuation marks + - Chinese sentence regular expression pattern + - Pinyin vowels, consonants, lowercase, uppercase, and punctuation + - Pinyin syllable, word, and sentence regular expression patterns + - Zhuyin characters and marks + - Zhuyin syllable regular expression pattern + - CC-CEDICT characters ++ Runs on Python 2.7 and 3 + +Getting Started +--------------- + +* `Install Zhon <http://zhon.readthedocs.org/en/latest/#installation>`_ +* Read `Zhon's introduction <http://zhon.readthedocs.org/en/latest/#using-zhon>`_ +* Learn from the `API documentation <http://zhon.readthedocs.org/en/latest/#zhon-hanzi>`_ +* `Contribute <https://github.com/tsroten/zhon/blob/develop/CONTRIBUTING.rst>`_ documentation, code, or feedback diff --git a/third_party/zhon/docs/Makefile b/third_party/zhon/docs/Makefile new file mode 100644 index 000000000..37d9b79e0 --- /dev/null +++ b/third_party/zhon/docs/Makefile @@ -0,0 +1,177 @@ +# Makefile for 
Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through 
makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Zhon.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Zhon.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/Zhon" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Zhon" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/third_party/zhon/docs/conf.py b/third_party/zhon/docs/conf.py new file mode 100644 index 000000000..4b3915e06 --- /dev/null +++ b/third_party/zhon/docs/conf.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Zhon documentation build configuration file, created by +# sphinx-quickstart on Tue Jan 28 22:18:02 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. 
+#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.viewcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Zhon' +copyright = '2016, Thomas Roten' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.1' +# The full version, including alpha/beta/rc tags. +release = '1.1.5' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. 
+#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Zhondoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + #'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + ('index', 'Zhon.tex', 'Zhon Documentation', + 'Thomas Roten', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'zhon', 'Zhon Documentation', + ['Thomas Roten'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Zhon', 'Zhon Documentation', + 'Thomas Roten', 'Zhon', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. 
+intersphinx_mapping = {'http://docs.python.org/3': None} diff --git a/third_party/zhon/docs/index.rst b/third_party/zhon/docs/index.rst new file mode 100644 index 000000000..347ab3c70 --- /dev/null +++ b/third_party/zhon/docs/index.rst @@ -0,0 +1,413 @@ +.. Zhon documentation master file, created by + sphinx-quickstart on Tue Jan 28 22:18:02 2014. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Zhon +==== + +Introduction +------------ + +Zhon is a Python library that provides constants commonly used in Chinese text +processing: + +* CJK characters and radicals +* Chinese punctuation marks +* Chinese sentence regular expression pattern +* Pinyin vowels, consonants, lowercase, uppercase, and punctuation +* Pinyin syllable, word, and sentence regular expression patterns +* Zhuyin characters and marks +* Zhuyin syllable regular expression pattern +* CC-CEDICT characters + +Installation +------------ + +Zhon supports Python 2.7 and 3. Install using pip: + +.. code:: bash + + $ pip install zhon + +If you want to download the latest source code, check out `Zhon's GitHub +repository <https://github.com/tsroten/zhon>`_. + +Be sure to `report any bugs <https://github.com/tsroten/zhon/issues>`_ you find. +Thanks! + +.. module:: zhon + +Using Zhon +---------- + +Zhon contains four modules that export helpful Chinese constants: + +* :py:mod:`zhon.hanzi` +* :py:mod:`zhon.pinyin` +* :py:mod:`zhon.zhuyin` +* :py:mod:`zhon.cedict` + +Zhon's constants are formatted in one of three ways: + +* Characters listed individually. These can be used with membership tests + or used to build regular expression patterns. For example, ``'aeiou'``. +* Character code ranges. These are used to build regular expression patterns. + For example, ``'\u0041-\u005A\u0061-\u007A'``. +* Regular expression pattern. These are regular expression patterns + that can be used with the regular expression library directly.
For + example, ``'[\u0020-\u007E]+'``. + +Using the constants listed below is simple. For constants that list the +characters individually, you can perform membership tests or use them in +regular expressions: + +.. code:: python + + >>> '车' in zhon.cedict.traditional + False + + >>> # This regular expression finds all characters that aren't considered + ... # traditional according to CC-CEDICT + ... re.findall('[^{}]'.format(zhon.cedict.traditional), '我买了一辆车') + ['买', '辆', '车'] + +For constants that contain character code ranges, you'll want to build a +regular expression: + +.. code:: python + + >>> re.findall('[{}]'.format(zhon.hanzi.punctuation), '我买了一辆车。') + ['。'] + +For constants that are regular expression patterns, you can use them directly +with the regular expression library, without formatting them: + +.. code:: python + + >>> re.findall(zhon.hanzi.sentence, '我买了一辆车。妈妈做的菜,很好吃!') + ['我买了一辆车。', '妈妈做的菜,很好吃!'] + +.. module:: zhon.hanzi + +``zhon.hanzi`` +~~~~~~~~~~~~~~ + +These constants can be used when working directly with Chinese characters. + +These constants can be used in a variety of ways, but they can't directly +distinguish between Chinese, Japanese, and Korean characters/words. +Chapter 12 of The Unicode Standard +(`PDF <http://www.unicode.org/versions/Unicode6.2.0/ch12.pdf>`_) +has some useful information about this: + + There is some concern that unifying the Han characters may lead to confusion because they are sometimes used differently by the various East Asian languages. Computationally, Han character unification presents no more difficulty than employing a single Latin character set that is used to write languages as different as English and French. Programmers do not expect the characters "c", "h", "a", and "t" alone to tell us whether chat is a French word for cat or an English word meaning “informal talk.” Likewise, we depend on context to identify the American hood (of a car) with the British bonnet.
Few computer users are confused by the fact that ASCII can also be used to represent such words as the Welsh word ynghyd, which are strange looking to English eyes. Although it would be convenient to identify words by language for programs such as spell-checkers, it is neither practical nor productive to encode a separate Latin character set for every language that uses it. + +.. py:data:: characters + cjk_ideographs + + Character codes and code ranges for pertinent CJK ideograph Unicode characters. This includes: + + * `CJK Unified Ideographs <http://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)>`_ + * `CJK Unified Ideographs Extension A <http://en.wikipedia.org/wiki/CJK_Unified_Ideographs_Extension_A>`_ + * `CJK Unified Ideographs Extension B <http://en.wikipedia.org/wiki/CJK_Unified_Ideographs_Extension_B>`_ + * `CJK Unified Ideographs Extension C <http://en.wikipedia.org/wiki/CJK_Unified_Ideographs_Extension_C>`_ + * `CJK Unified Ideographs Extension D <http://en.wikipedia.org/wiki/CJK_Unified_Ideographs_Extension_D>`_ + * `CJK Compatibility Ideographs <http://en.wikipedia.org/wiki/CJK_Compatibility_Ideographs>`_ + * `CJK Compatibility Ideographs Supplement <http://en.wikipedia.org/wiki/CJK_Compatibility_Ideographs_Supplement>`_ + * Ideographic number zero + + Some of the characters in this constant will not be Chinese characters, + but this is a convenient way to approach the issue. If you'd rather have + an enormous string of Chinese characters from a Chinese dictionary, check + out :py:data:`zhon.cedict`. + +.. py:data:: radicals + + Character code ranges for the `Kangxi Radicals <http://en.wikipedia.org/wiki/Kangxi_radical#Unicode>`_ + and `CJK Radicals Supplement <http://en.wikipedia.org/wiki/CJK_Radicals_Supplement>`_ + Unicode blocks. + +.. py:data:: punctuation + + This is the concatenation of :py:data:`zhon.hanzi.non_stops` and + :py:data:`zhon.hanzi.stops`. + +..
py:data:: non_stops + + The string ``'"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'``. + This contains Chinese punctuation marks, excluding punctuation marks that + function as stops. + +.. py:data:: stops + + The string ``'!?。。'``. These punctuation marks function as stops. + +.. py:data:: sent + sentence + + A regular expression pattern for a Chinese sentence. A sentence is defined + as a series of CJK characters (as defined by + :py:data:`zhon.hanzi.characters`) and non-stop punctuation marks followed + by a stop and zero or more container-closing punctuation marks (e.g. + apostrophe and brackets). + + .. code:: python + + >>> re.findall(zhon.hanzi.sentence, '我买了一辆车。') + ['我买了一辆车。'] + +.. module:: zhon.pinyin + +``zhon.pinyin`` +~~~~~~~~~~~~~~~ + +These constants can be used when working with Pinyin. + +.. py:data:: vowels + + The string ``'aeiouvüāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜAEIOUVÜĀĒĪŌŪǕÁÉÍÓÚǗǍĚǏǑǓǙÀÈÌÒÙǛ'``. This contains every Pinyin vowel (lowercase and uppercase). + +.. py:data:: consonants + + The string ``'bpmfdtnlgkhjqxzcsrwyBPMFDTNLGKHJQXZCSRWY'``. This + contains every Pinyin consonant (lowercase and uppercase). + +.. py:data:: lowercase + + The string ``'bpmfdtnlgkhjqxzcsrwyaeiouvüāēīōūǖáéíóúǘǎěǐǒǔǚàèìòùǜ'``. This contains every lowercase Pinyin vowel and consonant. + +.. py:data:: uppercase + + The string ``'BPMFDTNLGKHJQXZCSRWYAEIOUVÜĀĒĪŌŪǕÁÉÍÓÚǗǍĚǏǑǓǙÀÈÌÒÙǛ'``. + This contains every uppercase vowel and consonant. + +.. py:data:: marks + + The string ``"·012345:-'"``. This contains all Pinyin marks that have + special meaning: a middle dot and numbers for indicating tone, a colon for + easily writing ü ('u:'), a hyphen for connecting syllables within words, + and an apostrophe for separating a syllable beginning with a vowel from + the previous syllable in its word. All of these marks can be used within a + valid Pinyin word. + +.. 
py:data:: punctuation + + The concatenation of :py:data:`zhon.pinyin.non_stops` and + :py:data:`zhon.pinyin.stops`. + +.. py:data:: non_stops + + The string ``'"#$%&\'()*+,-/:;<=>@[\]^_`{|}~"'``. This contains every + ASCII punctuation mark that doesn't function as a stop. + +.. py:data:: stops + + The string ``'.!?'``. This contains every ASCII punctuation mark that + functions as a stop. + +.. py:data:: printable + + The concatenation of :py:data:`zhon.pinyin.vowels`, + :py:data:`zhon.pinyin.consonants`, :py:data:`zhon.pinyin.marks`, + :py:data:`zhon.pinyin.punctuation`, and :py:data:`string.whitespace`. This + is essentially a Pinyin whitelist for complete Pinyin sentences -- it's + every possible valid character a Pinyin string can use assuming all + non-Chinese words that might be included (like proper nouns) use ASCII. + +Validating and splitting Pinyin isn't as simple as checking that only +valid characters exist or matching maximum-length valid syllables. +The regular expression library's lookahead features are used in this +module's regular expression patterns to ensure that only valid Pinyin +syllables are matched. The approach used to segment a string into valid +Pinyin syllables is roughly: + +1. Match the longest possible valid syllable. +2. If that match is followed directly by a vowel, drop that match and try + again with the next longest possible valid syllable. + +Additionally, lookahead assertions are used to ensure that hyphens and +apostrophes are only accepted when they are used correctly. This helps to +weed out non-Pinyin strings. + +.. py:data:: syl + syllable + + A regular expression pattern for a valid Pinyin syllable (accented or + numbered). Compile with :py:data:`re.IGNORECASE` (:py:data:`re.I`) to + accept uppercase letters as well. + + .. code:: python + + >>> re.findall(zhon.pinyin.syllable, 'Shū zài zhuōzi shàngmian. 
Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shū', 'zài', 'zhuō', 'zi', 'shàng', 'mian', 'Shu1', 'zai4', 'zhuo1', 'zi5', 'shang4', 'mian5'] + +.. py:data:: a_syl + acc_syl + accented_syllable + + A regular expression for a valid accented Pinyin syllable. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + .. code:: python + + >>> re.findall(zhon.pinyin.acc_syl, 'Shū zài zhuōzi shàngmian.', re.IGNORECASE) + ['Shū', 'zài', 'zhuō', 'zi', 'shàng', 'mian'] + + +.. py:data:: n_syl + num_syl + numbered_syllable + + A regular expression for a valid numbered Pinyin syllable. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + .. code:: python + + >>> re.findall(zhon.pinyin.num_syl, 'Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shu1', 'zai4', 'zhuo1', 'zi5', 'shang4', 'mian5'] + +.. py:data:: word + + A regular expression pattern for a valid Pinyin word (accented or + numbered). Compile with :py:data:`re.IGNORECASE` (:py:data:`re.I`) to + accept uppercase letters as well. + + .. code:: python + + >>> re.findall(zhon.pinyin.word, 'Shū zài zhuōzi shàngmian. Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shū', 'zài', 'zhuōzi', 'shàngmian', 'Shu1', 'zai4', 'zhuo1zi5', 'shang4mian5'] + +.. py:data:: a_word + acc_word + accented_word + + A regular expression for a valid accented Pinyin word. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + .. code:: python + + >>> re.findall(zhon.pinyin.acc_word, 'Shū zài zhuōzi shàngmian.', re.IGNORECASE) + ['Shū', 'zài', 'zhuōzi', 'shàngmian'] + + +.. py:data:: n_word + num_word + numbered_word + + A regular expression for a valid numbered Pinyin word. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + .. 
code:: python + + >>> re.findall(zhon.pinyin.num_word, 'Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shu1', 'zai4', 'zhuo1zi5', 'shang4mian5'] + +.. py:data:: sent + sentence + + A regular expression pattern for a valid Pinyin sentence (accented or + numbered). Compile with :py:data:`re.IGNORECASE` (:py:data:`re.I`) to + accept uppercase letters as well. + + .. code:: python + + >>> re.findall(zhon.pinyin.sentence, 'Shū zài zhuōzi shàngmian. Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shū zài zhuōzi shàngmian.', 'Shu1 zai4 zhuo1zi5 shang4mian5.'] + +.. py:data:: a_sent + acc_sent + accented_sentence + + A regular expression for a valid accented Pinyin sentence. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + + .. code:: python + + >>> re.findall(zhon.pinyin.acc_sent, 'Shū zài zhuōzi shàngmian.', re.IGNORECASE) + ['Shū zài zhuōzi shàngmian.'] + + +.. py:data:: n_sent + num_sent + numbered_sentence + + A regular expression for a valid numbered Pinyin sentence. Compile with + :py:data:`re.IGNORECASE` (:py:data:`re.I`) to accept uppercase letters as + well. + + + .. code:: python + + >>> re.findall(zhon.pinyin.num_sent, 'Shu1 zai4 zhuo1zi5 shang4mian5.', re.IGNORECASE) + ['Shu1 zai4 zhuo1zi5 shang4mian5.'] + +.. module:: zhon.zhuyin + +``zhon.zhuyin`` +~~~~~~~~~~~~~~~ + +These constants can be used when working with Zhuyin (Bopomofo). + +.. py:data:: characters + + The string ``'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄝㄜㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧ'``. + This contains all Zhuyin characters as defined by the `Bopomofo Unicode + block <http://en.wikipedia.org/wiki/Bopomofo_(Unicode_block)>`_. It does + not include the + `Bopomofo Extended block <http://en.wikipedia.org/wiki/Bopomofo_Extended_(Unicode_block)>`_ + that defines characters used in non-standard dialects or minority + languages. + +.. py:data:: marks + + The string ``'ˇˊˋ˙'``. This contains the Zhuyin tone marks. + +.. 
py:data:: syl + syllable + + A regular expression pattern for a valid Zhuyin syllable. + + .. code:: python + + >>> re.findall(zhon.zhuyin.syllable, 'ㄓㄨˋ ㄧㄣ ㄈㄨˊ ㄏㄠˋ') + ['ㄓㄨˋ', 'ㄧㄣ', 'ㄈㄨˊ', 'ㄏㄠˋ'] + +.. module:: zhon.cedict + +``zhon.cedict`` +~~~~~~~~~~~~~~~ + +These constants are built from the `CC-CEDICT dictionary +<http://cc-cedict.org/wiki/>`_. +They aren't guaranteed to contain every possible Chinese character. They only +provide characters that exist in the CC-CEDICT dictionary. + +.. py:data:: all + + A string containing all Chinese characters found in `CC-CEDICT + <http://cc-cedict.org/wiki/>`_. + +.. py:data:: trad + traditional + + A string containing characters considered by `CC-CEDICT + <http://cc-cedict.org/wiki/>`_ to be Traditional Chinese characters. + Some of these characters are also present in + :py:data:`zhon.cedict.simplified` because many characters were left + untouched by the simplification process. + +.. py:data:: simp + simplified + + A string containing characters considered by `CC-CEDICT + <http://cc-cedict.org/wiki/>`_ to be Simplified Chinese characters. + Some of these characters are also present in + :py:data:`zhon.cedict.traditional` because many characters were left + untouched by the simplification process. diff --git a/third_party/zhon/docs/make.bat b/third_party/zhon/docs/make.bat new file mode 100644 index 000000000..22e3dbf3d --- /dev/null +++ b/third_party/zhon/docs/make.bat @@ -0,0 +1,242 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^<target^>` where ^<target^> is one of + echo. 
html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
+ goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Zhon.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Zhon.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. 
+ echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. 
+ goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/third_party/zhon/requirements.txt b/third_party/zhon/requirements.txt new file mode 100644 index 000000000..b1a5408f3 --- /dev/null +++ b/third_party/zhon/requirements.txt @@ -0,0 +1,6 @@ +coverage==4.3.4 +flake8==3.3.0 +Sphinx==1.5.3 +tox==2.6.0 +twine==1.8.1 +wheel==0.29.0 diff --git a/third_party/zhon/setup.cfg b/third_party/zhon/setup.cfg new file mode 100644 index 000000000..210e4f254 --- /dev/null +++ b/third_party/zhon/setup.cfg @@ -0,0 +1,6 @@ +[bdist_wheel] +universal = 1 + +[flake8] +ignore = E731, P101 +exclude = zhon/cedict/* diff --git a/third_party/zhon/setup.py b/third_party/zhon/setup.py new file mode 100644 index 000000000..fd495a490 --- /dev/null +++ b/third_party/zhon/setup.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +import os +import sys + + +enc_open = open + +try: + from setuptools import setup +except ImportError: + from distutils.core import setup + + +if sys.argv[-1] == 'publish': + os.system('python setup.py sdist upload') + sys.exit() + +with enc_open('README.rst', 'r', encoding='utf-8') as f: + long_description = f.read() + +setup( + name='zhon', + version='1.1.5', + author='Thomas Roten', + author_email='thomas@roten.us', + url='https://github.com/tsroten/zhon', + description=('Zhon provides constants used in Chinese text processing.'), + long_description=long_description, + packages=['zhon', 'zhon.cedict'], + keywords=('chinese mandarin segmentation tokenization punctuation hanzi ' + 'unicode radicals han cjk cedict cc-cedict traditional ' + 'simplified characters pinyin zhuyin'), + classifiers=[ + 'Operating System :: OS Independent', + 'Intended Audience :: Developers', + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: MIT License', + 'Programming 
class TestSimplified(unittest.TestCase):
    """Simplified sample text must lie entirely inside ``cedict.simplified``."""

    simplified_text = '有人丢失了一把斧子怎么找也没有找到'

    def test_re_complement_search(self):
        # A negated character class finds the first character that is NOT
        # in the constant; no hit means every character is covered.
        outside = re.search('[^{}]'.format(cedict.simplified),
                            self.simplified_text)
        self.assertIsNone(outside)


class TestTraditional(unittest.TestCase):
    """Simplified sample text must NOT be fully covered by ``cedict.traditional``."""

    simplified_text = '有人丢失了一把斧子怎么找也没有找到'

    def test_re_complement_search(self):
        # At least one character of the simplified sample has to fall
        # outside the traditional constant.
        outside = re.search('[^{}]'.format(cedict.traditional),
                            self.simplified_text)
        self.assertIsNotNone(outside)


class TestAll(unittest.TestCase):
    """``cedict.all`` must cover both characters of the sample pair."""

    all_text = '车車'

    def test_re_complement_search(self):
        outside = re.search('[^{}]'.format(cedict.all), self.all_text)
        self.assertIsNone(outside)
class TestRadicals(unittest.TestCase):
    """Checks ``hanzi.radicals`` against radical and non-radical code points."""

    def test_only_radicals(self):
        # Every code point in the sample must be inside the radicals
        # character class.
        sample = '\u2F00\u2F31\u2FBA\u2E98\u2EF3\u2ECF'
        non_radical = re.search('[^{}]'.format(hanzi.radicals), sample)
        self.assertIsNone(non_radical)

    def test_chinese_equivalents(self):
        # These code points must be reported as lying outside the
        # radicals character class.
        sample = '\u4E00\u5E7F\u516B\u5165'
        non_radical = re.search('[^{}]'.format(hanzi.radicals), sample)
        self.assertIsNotNone(non_radical)


class TestPunctuation(unittest.TestCase):
    """Checks the contents of ``hanzi.punctuation``."""

    def test_split_on_punctuation(self):
        splitter = re.compile('[{}]'.format(hanzi.punctuation))
        sample = '你好你好好好哈哈,米饭很好吃;哈哈!'
        pieces = splitter.split(sample)
        self.assertEqual(len(pieces), 4)

    def test_issue_19(self):
        # Each of these marks is required to be part of the constant.
        for mark in ('《', '·', '〈', '〉', '﹑', '﹔'):
            self.assertIn(mark, hanzi.punctuation)
'tai', + 'nai', 'lai', 'gai', 'kai', 'hai', 'zai', 'cai', 'sai', 'zhai', 'chai', + 'shai', 'ai', 'bei', 'pei', 'mei', 'fei', 'dei', 'tei', 'nei', 'lei', + 'gei', 'kei', 'hei', 'zei', 'zhei', 'shei', 'ei', 'bao', 'pao', 'mao', + 'dao', 'tao', 'nao', 'lao', 'gao', 'kao', 'hao', 'zao', 'cao', 'sao', + 'zhao', 'chao', 'shao', 'rao', 'ao', 'pou', 'mou', 'fou', 'dou', 'tou', + 'nou', 'lou', 'gou', 'kou', 'hou', 'zou', 'cou', 'sou', 'zhou', 'chou', + 'shou', 'rou', 'ou', 'ban', 'pan', 'man', 'fan', 'dan', 'tan', 'nan', + 'lan', 'gan', 'kan', 'han', 'zan', 'can', 'san', 'zhan', 'chan', + 'shan', 'ran', 'an', 'bang', 'pang', 'mang', 'fang', 'dang', 'tang', + 'nang', 'lang', 'gang', 'kang', 'hang', 'zang', 'cang', 'sang', + 'zhang', 'chang', 'shang', 'rang', 'ang', 'ben', 'pen', 'men', 'fen', + 'den', 'nen', 'gen', 'ken', 'hen', 'zen', 'cen', 'sen', 'zhen', 'chen', + 'shen', 'ren', 'en', 'beng', 'peng', 'meng', 'feng', 'deng', 'teng', + 'neng', 'leng', 'geng', 'keng', 'heng', 'zeng', 'ceng', 'seng', + 'zheng', 'cheng', 'sheng', 'reng', 'eng', 'dong', 'tong', 'nong', + 'long', 'gong', 'kong', 'hong', 'zong', 'cong', 'song', 'zhong', + 'chong', 'rong', 'bu', 'pu', 'mu', 'fu', 'du', 'tu', 'nu', 'lu', + 'gu', 'ku', 'hu', 'zu', 'cu', 'su', 'zhu', 'chu', 'shu', 'ru', 'wu', + 'gua', 'kua', 'hua', 'zhua', 'chua', 'shua', 'rua', 'wa', 'duo', 'tuo', + 'nuo', 'luo', 'guo', 'kuo', 'huo', 'zuo', 'cuo', 'suo', 'zhuo', 'chuo', + 'shuo', 'ruo', 'wo', 'guai', 'kuai', 'huai', 'zhuai', 'chuai', 'shuai', + 'wai', 'dui', 'tui', 'gui', 'kui', 'hui', 'zui', 'cui', 'sui', 'zhui', + 'chui', 'shui', 'rui', 'wei', 'duan', 'tuan', 'nuan', 'luan', 'guan', + 'kuan', 'huan', 'zuan', 'cuan', 'suan', 'zhuan', 'chuan', 'shuan', + 'ruan', 'wan', 'guang', 'kuang', 'huang', 'zhuang', 'chuang', 'shuang', + 'wang', 'dun', 'tun', 'nun', 'lun', 'gun', 'kun', 'hun', 'zun', 'cun', + 'sun', 'zhun', 'chun', 'shun', 'run', 'wen', 'weng', 'bi', 'pi', 'mi', + 'di', 'ti', 'ni', 'li', 'zi', 'ci', 'si', 'zhi', 'chi', 'shi', 
'ri', + 'ji', 'qi', 'xi', 'yi', 'dia', 'lia', 'jia', 'qia', 'xia', 'ya', 'bie', + 'pie', 'mie', 'die', 'tie', 'nie', 'lie', 'jie', 'qie', 'xie', 'ye', + 'biao', 'piao', 'miao', 'diao', 'tiao', 'niao', 'liao', 'jiao', 'qiao', + 'xiao', 'yao', 'miu', 'diu', 'niu', 'liu', 'jiu', 'qiu', 'xiu', 'you', + 'bian', 'pian', 'mian', 'dian', 'tian', 'nian', 'lian', 'jian', 'qian', + 'xian', 'yan', 'niang', 'liang', 'jiang', 'qiang', 'xiang', 'yang', + 'bin', 'pin', 'min', 'nin', 'lin', 'jin', 'qin', 'xin', 'yin', 'bing', + 'ping', 'ming', 'ding', 'ting', 'ning', 'ling', 'jing', 'qing', 'xing', + 'ying', 'jiong', 'qiong', 'xiong', 'yong', 'nü', 'lü', 'ju', 'qu', + 'xu', 'yu', 'nüe', 'lüe', 'jue', 'que', 'xue', 'yue', 'juan', 'quan', + 'xuan', 'yuan', 'jun', 'qun', 'xun', 'yun', 'er', 'r' +) + +SYL = re.compile(pinyin.syllable) +A_SYL = re.compile(pinyin.a_syl) +N_SYL = re.compile(pinyin.n_syl) +WORD = re.compile(pinyin.word) +N_WORD = re.compile(pinyin.n_word) +A_WORD = re.compile(pinyin.a_word) +SENT = re.compile(pinyin.sentence) +N_SENT = re.compile(pinyin.n_sent) +A_SENT = re.compile(pinyin.a_sent) + + +VOWELS = 'aeiou\u00FC' +VOWEL_MAP = { + 'a1': '\u0101', 'a2': '\xe1', 'a3': '\u01ce', 'a4': '\xe0', 'a5': 'a', + 'e1': '\u0113', 'e2': '\xe9', 'e3': '\u011b', 'e4': '\xe8', 'e5': 'e', + 'i1': '\u012b', 'i2': '\xed', 'i3': '\u01d0', 'i4': '\xec', 'i5': 'i', + 'o1': '\u014d', 'o2': '\xf3', 'o3': '\u01d2', 'o4': '\xf2', 'o5': 'o', + 'u1': '\u016b', 'u2': '\xfa', 'u3': '\u01d4', 'u4': '\xf9', 'u5': 'u', + '\u00fc1': '\u01d6', '\u00fc2': '\u01d8', '\u00fc3': '\u01da', + '\u00fc4': '\u01dc', '\u00fc5': '\u00fc' +} + + +def _num_vowel_to_acc(vowel, tone): + """Convert a numbered vowel to an accented vowel.""" + try: + return VOWEL_MAP[vowel + str(tone)] + except IndexError: + raise ValueError("Vowel must be one of '{}' and tone must be an int" + "1-5.".format(VOWELS)) + + +def num_syl_to_acc(syllable): + """Convert a numbered pinyin syllable to an accented pinyin syllable. 
+ + Implements the following algorithm: + 1. If the syllable has an 'a' or 'e', put the tone over that vowel. + 2. If the syllable has 'ou', place the tone over the 'o'. + 3. Otherwise, put the tone on the last vowel. + + """ + if syllable.startswith('r') and len(syllable) <= 2: + return 'r' # Special case for 'r' syllable. + if re.search('[{}]'.format(VOWELS), syllable) is None: + return syllable + syl, tone = syllable[:-1], syllable[-1] + if tone not in '12345': + # We did not find a tone number. Abort conversion. + return syl + syl = re.sub('u:|v', '\u00fc', syl) + if 'a' in syl: + return syl.replace('a', _num_vowel_to_acc('a', tone)) + elif 'e' in syl: + return syl.replace('e', _num_vowel_to_acc('e', tone)) + elif 'ou' in syl: + return syl.replace('o', _num_vowel_to_acc('o', tone)) + last_vowel = syl[max(map(syl.rfind, VOWELS))] # Find last vowel index. + return syl.replace(last_vowel, _num_vowel_to_acc(last_vowel, tone)) + + +class TestPinyinSyllables(unittest.TestCase): + + maxDiff = None + + def test_number_syllables(self): + vs = list(VALID_SYLS) + _vs = [] + for n in range(0, len(vs)): + vs[n] = vs[n] + str(random.randint(1, 5)) + _vs.append(vs[n]) + if _vs[n][0] in 'aeo': + _vs[n] = "'{}".format(_vs[n]) + s = ''.join(_vs) + self.assertEqual(SYL.findall(s), vs) + self.assertEqual(N_SYL.findall(s), vs) + + def test_accent_syllables(self): + vs = list(VALID_SYLS) + _vs = [] + for n in range(0, len(vs)): + syl = vs[n] + vs[n] = num_syl_to_acc(vs[n] + str(random.randint(1, 5))) + _vs.append(vs[n]) + if syl[0] in 'aeo': + _vs[n] = "'{}".format(_vs[n]) + s = ''.join(_vs) + self.assertEqual(SYL.findall(s), vs) + self.assertEqual(A_SYL.findall(s), vs) + + +def create_word(accented=False): + if accented: + tone = lambda: str(random.randint(1, 5)) + vs = [num_syl_to_acc(s + tone()) for s in VALID_SYLS] + else: + vs = [s + str(random.randint(1, 5)) for s in VALID_SYLS] + word = vs[random.randint(0, len(vs) - 1)] + for n in range(1, WORD_LENGTH): + num = 
class TestPinyinWords(unittest.TestCase):
    """Randomly generated Pinyin words must be matched in full."""

    def test_number_words(self):
        for _ in range(NUM_WORDS):
            candidate = create_word()
            # The generic pattern first, then the numbered-only pattern.
            for pattern in (WORD, N_WORD):
                self.assertEqual(pattern.match(candidate).group(0), candidate)

    def test_accent_words(self):
        for _ in range(NUM_WORDS):
            candidate = create_word(accented=True)
            # The generic pattern first, then the accented-only pattern.
            for pattern in (WORD, A_WORD):
                self.assertEqual(pattern.match(candidate).group(0), candidate)


def create_sentence(accented=False):
    """Build a random sentence of ``SENT_LENGTH`` words ending in '.'.

    All words are generated first; each word after the first is then
    prefixed with a randomly chosen separator (' ', ', ' or '; ').
    """
    words = [create_word(accented=accented) for _ in range(SENT_LENGTH)]
    pieces = [words[0]]
    for word in words[1:]:
        pieces.append(random.choice([' ', ', ', '; ']) + word)
    return ''.join(pieces) + '.'


class TestPinyinSentences(unittest.TestCase):
    """Randomly generated Pinyin sentences must be matched in full."""

    def test_number_sentences(self):
        for _ in range(NUM_SENT):
            candidate = create_sentence()
            for pattern in (SENT, N_SENT):
                self.assertEqual(pattern.match(candidate).group(0), candidate)

    def test_accent_sentences(self):
        for _ in range(NUM_SENT):
            candidate = create_sentence(accented=True)
            for pattern in (SENT, A_SENT):
                self.assertEqual(pattern.match(candidate).group(0), candidate)
'ㄕㄜ', 'ㄖㄜ', 'ㄗㄜ', 'ㄘㄜ', 'ㄙㄜ', + 'ㄝ', 'ㄞ', 'ㄅㄞ', 'ㄆㄞ', 'ㄇㄞ', 'ㄉㄞ', 'ㄊㄞ', 'ㄋㄞ', 'ㄌㄞ', 'ㄍㄞ', + 'ㄎㄞ', 'ㄏㄞ', 'ㄓㄞ', 'ㄔㄞ', 'ㄕㄞ', 'ㄗㄞ', 'ㄘㄞ', 'ㄙㄞ', 'ㄟ', + 'ㄅㄟ', 'ㄆㄟ', 'ㄇㄟ', 'ㄈㄟ', 'ㄉㄟ', 'ㄋㄟ', 'ㄌㄟ', 'ㄍㄟ', 'ㄏㄟ', + 'ㄓㄟ', 'ㄕㄟ', 'ㄗㄟ', 'ㄠ', 'ㄅㄠ', 'ㄆㄠ', 'ㄇㄠ', 'ㄉㄠ', 'ㄊㄠ', + 'ㄋㄠ', 'ㄌㄠ', 'ㄍㄠ', 'ㄎㄠ', 'ㄏㄠ', 'ㄓㄠ', 'ㄔㄠ', 'ㄕㄠ', 'ㄖㄠ', + 'ㄗㄠ', 'ㄘㄠ', 'ㄙㄠ', 'ㄡ', 'ㄆㄡ', 'ㄇㄡ', 'ㄈㄡ', 'ㄉㄡ', 'ㄊㄡ', + 'ㄋㄡ', 'ㄌㄡ', 'ㄍㄡ', 'ㄎㄡ', 'ㄏㄡ', 'ㄓㄡ', 'ㄔㄡ', 'ㄕㄡ', 'ㄖㄡ', + 'ㄗㄡ', 'ㄘㄡ', 'ㄙㄡ', 'ㄢ', 'ㄅㄢ', 'ㄆㄢ', 'ㄇㄢ', 'ㄈㄢ', 'ㄉㄢ', + 'ㄊㄢ', 'ㄋㄢ', 'ㄌㄢ', 'ㄍㄢ', 'ㄎㄢ', 'ㄏㄢ', 'ㄓㄢ', 'ㄔㄢ', 'ㄕㄢ', + 'ㄖㄢ', 'ㄗㄢ', 'ㄘㄢ', 'ㄙㄢ', 'ㄣ', 'ㄅㄣ', 'ㄆㄣ', 'ㄇㄣ', 'ㄈㄣ', + 'ㄋㄣ', 'ㄍㄣ', 'ㄎㄣ', 'ㄏㄣ', 'ㄓㄣ', 'ㄔㄣ', 'ㄕㄣ', 'ㄖㄣ', 'ㄗㄣ', + 'ㄘㄣ', 'ㄙㄣ', 'ㄤ', 'ㄅㄤ', 'ㄆㄤ', 'ㄇㄤ', 'ㄈㄤ', 'ㄉㄤ', 'ㄊㄤ', + 'ㄋㄤ', 'ㄌㄤ', 'ㄍㄤ', 'ㄎㄤ', 'ㄏㄤ', 'ㄓㄤ', 'ㄔㄤ', 'ㄕㄤ', 'ㄖㄤ', + 'ㄗㄤ', 'ㄘㄤ', 'ㄙㄤ', 'ㄥ', 'ㄅㄥ', 'ㄆㄥ', 'ㄇㄥ', 'ㄈㄥ', 'ㄉㄥ', + 'ㄊㄥ', 'ㄋㄥ', 'ㄌㄥ', 'ㄍㄥ', 'ㄎㄥ', 'ㄏㄥ', 'ㄓㄥ', 'ㄔㄥ', 'ㄕㄥ', + 'ㄖㄥ', 'ㄗㄥ', 'ㄘㄥ', 'ㄙㄥ', 'ㄦ', 'ㄧ', 'ㄅㄧ', 'ㄆㄧ', 'ㄇㄧ', 'ㄉㄧ', + 'ㄊㄧ', 'ㄋㄧ', 'ㄌㄧ', 'ㄐㄧ', 'ㄑㄧ', 'ㄒㄧ', 'ㄧㄚ', 'ㄉㄧㄚ', 'ㄌㄧㄚ', + 'ㄐㄧㄚ', 'ㄑㄧㄚ', 'ㄒㄧㄚ', 'ㄧㄛ', 'ㄧㄝ', 'ㄅㄧㄝ', 'ㄆㄧㄝ', 'ㄇㄧㄝ', + 'ㄉㄧㄝ', 'ㄊㄧㄝ', 'ㄋㄧㄝ', 'ㄌㄧㄝ', 'ㄐㄧㄝ', 'ㄑㄧㄝ', 'ㄒㄧㄝ', + 'ㄧㄞ', 'ㄧㄠ', 'ㄅㄧㄠ', 'ㄆㄧㄠ', 'ㄇㄧㄠ', 'ㄉㄧㄠ', 'ㄊㄧㄠ', 'ㄋㄧㄠ', + 'ㄌㄧㄠ', 'ㄐㄧㄠ', 'ㄑㄧㄠ', 'ㄒㄧㄠ', 'ㄧㄡ', 'ㄇㄧㄡ', 'ㄉㄧㄡ', + 'ㄋㄧㄡ', 'ㄌㄧㄡ', 'ㄐㄧㄡ', 'ㄑㄧㄡ', 'ㄒㄧㄡ', 'ㄧㄢ', 'ㄅㄧㄢ', + 'ㄆㄧㄢ', 'ㄇㄧㄢ', 'ㄉㄧㄢ', 'ㄊㄧㄢ', 'ㄋㄧㄢ', 'ㄌㄧㄢ', 'ㄐㄧㄢ', + 'ㄑㄧㄢ', 'ㄒㄧㄢ', 'ㄧㄣ', 'ㄅㄧㄣ', 'ㄆㄧㄣ', 'ㄇㄧㄣ', 'ㄋㄧㄣ', + 'ㄌㄧㄣ', 'ㄐㄧㄣ', 'ㄑㄧㄣ', 'ㄒㄧㄣ', 'ㄧㄤ', 'ㄋㄧㄤ', 'ㄌㄧㄤ', + 'ㄐㄧㄤ', 'ㄑㄧㄤ', 'ㄒㄧㄤ', 'ㄧㄥ', 'ㄅㄧㄥ', 'ㄆㄧㄥ', 'ㄇㄧㄥ', + 'ㄉㄧㄥ', 'ㄊㄧㄥ', 'ㄋㄧㄥ', 'ㄌㄧㄥ', 'ㄐㄧㄥ', 'ㄑㄧㄥ', 'ㄒㄧㄥ', 'ㄨ', + 'ㄅㄨ', 'ㄆㄨ', 'ㄇㄨ', 'ㄈㄨ', 'ㄉㄨ', 'ㄊㄨ', 'ㄋㄨ', 'ㄌㄨ', 'ㄍㄨ', + 'ㄎㄨ', 'ㄏㄨ', 'ㄓㄨ', 'ㄔㄨ', 'ㄕㄨ', 'ㄖㄨ', 'ㄗㄨ', 'ㄘㄨ', 'ㄙㄨ', + 'ㄨㄚ', 'ㄍㄨㄚ', 'ㄎㄨㄚ', 'ㄏㄨㄚ', 'ㄓㄨㄚ', 'ㄔㄨㄚ', 'ㄕㄨㄚ', 'ㄨㄛ', + 'ㄉㄨㄛ', 'ㄊㄨㄛ', 'ㄋㄨㄛ', 'ㄌㄨㄛ', 'ㄍㄨㄛ', 'ㄎㄨㄛ', 'ㄏㄨㄛ', + 'ㄓㄨㄛ', 'ㄔㄨㄛ', 'ㄕㄨㄛ', 'ㄖㄨㄛ', 'ㄗㄨㄛ', 'ㄘㄨㄛ', 'ㄙㄨㄛ', + 'ㄨㄞ', 'ㄍㄨㄞ', 'ㄎㄨㄞ', 'ㄏㄨㄞ', 'ㄓㄨㄞ', 'ㄔㄨㄞ', 'ㄕㄨㄞ', 'ㄨㄟ', + 'ㄉㄨㄟ', 'ㄊㄨㄟ', 'ㄍㄨㄟ', 'ㄎㄨㄟ', 'ㄏㄨㄟ', 'ㄓㄨㄟ', 'ㄔㄨㄟ', + 'ㄕㄨㄟ', 'ㄖㄨㄟ', 'ㄗㄨㄟ', 'ㄘㄨㄟ', 'ㄙㄨㄟ', 'ㄨㄢ', 'ㄉㄨㄢ', + 'ㄊㄨㄢ', 'ㄋㄨㄢ', 'ㄌㄨㄢ', 'ㄍㄨㄢ', 'ㄎㄨㄢ', 'ㄏㄨㄢ', 'ㄓㄨㄢ', + 'ㄔㄨㄢ', 'ㄕㄨㄢ', 
def create_syllable():
    """Return a random valid syllable followed by a tone mark or a space."""
    base = random.choice(VALID_SYLS)
    suffix = random.choice(list(zhuyin.marks) + [' '])
    return base + suffix


class TestZhuyinSyllables(unittest.TestCase):
    """The syllable pattern must re-assemble a run of valid syllables."""

    def test_zhuyin_syllable(self):
        # Each syllable gets either one tone mark or nothing appended.
        endings = list(zhuyin.marks) + ['']
        text = ''.join(syl + random.choice(endings) for syl in VALID_SYLS)
        # Joining every match back together must give the input text.
        self.assertEqual(''.join(SYL.findall(text)), text)
+__version__ = '1.1.5' diff --git a/third_party/zhon/zhon/cedict/__init__.py b/third_party/zhon/zhon/cedict/__init__.py new file mode 100644 index 000000000..df6c2ee1b --- /dev/null +++ b/third_party/zhon/zhon/cedict/__init__.py @@ -0,0 +1,14 @@ +"""Provides CC-CEDICT character constants.""" + +from . import simplified +from . import traditional +from . import all + +#: A string containing all Simplified characters according to CC-CEDICT. +simp = simplified = simplified.CHARACTERS + +#: A string containing all Traditional characters according to CC-CEDICT. +trad = traditional = traditional.CHARACTERS + +#: A string containing all Chinese characters found in CC-CEDICT. +all = all.CHARACTERS diff --git a/third_party/zhon/zhon/cedict/all.py b/third_party/zhon/zhon/cedict/all.py new file mode 100644 index 000000000..daa7015c6 --- /dev/null +++ b/third_party/zhon/zhon/cedict/all.py @@ -0,0 +1,3 @@ +"""Provides a string of characters used by CC-CEDICT.""" + +CHARACTERS = '张頦貫誯鐮瞭颳稲钻瘺簾徽悃輀粋鬈倖鴔膙検僣罠廧鯥蹪韭煬旱聶揵険燹卸翽奼胃扂框啄繊姉髏摎嗑雗灖荘拟襜瀣劢龠縧墦薤咮醬堳枰茵涴吻鄹刿㲾鼽节洁侀袆守蒎服蠓甑猕嶔舟伝遣嵡鹧孥仪诨瓮坭晷跴固鏸牿赁激乇譅屋饉蝍懌諒镑噗緔蛚賞潝錢窥趫漪讯键皭倬馳笲閻眺鎿紾憃澇㐄弊箏而阐肙战躝繠忧棩擱腶靺烹剸鵾绽塼臃捂过敎槓哑煖動旛艘厢缧蠩檨單邬萱舵沴唻逹犸匿鸽碼亀覆樅龊砉褓殒嚐醖岔枚涞丝酣屡齧秦婥俪歯噭蛰杷賴基鋸巾飼豁佇詅铄嵋竈孏虍惌诒鑑盐块鉕糔恙跞湝逢羡崠禥身氪阮疭砲戶隻琺厹邿績嶽抃沇尊餈碏茌锐刖莙勣豪闭獬柱艶懵佴典鹾恂俁資橆籊飏櫓铗牖曛腘惟乞譜傢簧丫榨嚮鎬㢲蜱新澴嘻熸鴽漁誆椅瞄笉妈訓眑銖焕撚鱧竦奥湫桯目啭売摷迴登哺釸灿廾淀幋魉秈塏蕍揌裒靑痐呗酕翔捙鄢纡辫洪覯霮礲薷挶町効缾岽掃谀涇羋須福舌鐐疗化芙貝口籠巧飥晤铭饲曱乴狹偸齾婼慂仁歆嫉李毓囑僕聘懟佞冢鲠紧宦蚤伫梨垮銬撰耵溴圻鈹炸儿鰽膂讆栅的鶊墈損咐鎖瀕应蜛幡篦塥裨楯矮筳蓰敷郸忾䍃蹁衅雄磈奏葍拌角陑啗違绔藚连汝阢禡鰦羥被横躯逮玭唬鲳縲芷搶邻爺喹砾宽撃謀檇鼈纏蔌錐犗吖餔薙朘駡筠淩畬懱葶柵鉺痹顾嵼時談汆俅兄巉黏泓乒凑埕惛蝘赜鮠贩澨傮閬脱掰榴倻锹嘿蚂椁貆熄骊紉很挍谓厐眕抚鑣奡彥桫迨湯米菰觴勺襁毀轅塋幏荍旌軒珐教霢碡娠鴦纥辯鄮听缲获收猺咹社媽斃言殇禋萌鈐玗蒙氜嗣飡穠寧黥恤泩乨鋭齲惱鍺噸竽屼杂浆仅駋筊慎淓僑鏗柟豜産垢骠肤谩伯钬崳蘵梴儻皸粼螂清簉庈戍贓澒瀑瘕脛镣矢塡緦楫軨潯燮絳捷裴煻睿衁郄奋鱉终遑狐南陕磔摙觞管夠鸦株侩貯鈮圬昶銻瀺钿稾褀丆窋堊鴈亓鄐炗嘖螙攘漜団鯡祠鷥濩釭睬鱲遺啸驾秽彼摂卄髋忉鳏扎滓发郗癖嗕招蕘咢砧度輩涨鞬帳茱蔵殴刻霹疸餽羼蒂玄颊缉芎愍沒甕舛悚陣瓢顧绦橫跨狮六繳说闸瑿譁赅齉屏腍賒燐镕篔胚杙諞楝俜锢堠粥讫椪亩趯錮炭嘬紲膷朶閿笾榇箋夊鰈綏逐熗圖搘䖟髡姧鳥扤襪滩郭癬鵲拱蝶酺盹呸磽幼觇潆剄祊廉鷏捎釗睖哕換旟蹜眣颠礧芤踩厮弳舱悰萵檴医阹璸唿樁辆氅犄馊縉莎怍輓涒茛憚蔟殞靣痢饧翦履賨浯珮偭罳已胰慷諴獻出铸畿対黼詁检鹉糈聍曌鉑兗鑕臚诞騢疡逦玥芯鰮羭夬邳爲鲻妹梃蜀暇璋嘊錈褌鼐堖锔愜闡睠囧浤鉲深佰毵鹺學鑾矽兼蛇恆铋凉鋏汎惓硖泛乚識櫟腜砣媢鞠瘧脩推馬贱澰丷尻箸霽誂肆隊焉貎抒紑徐颖笕喔谛訟撞顣啡陧烦扯彭勲述藴类建鯸穿蕁荅鷄呋酉珈轍槌苒鵑幗痔楙杝鬢璡鄦薫未纭猲鶻缺鮿甾榃蘀枇疋圊鈈羗外鐔樘蚝怜姣铡癠埧鋥汤惩硬泱买襶磹慆屄闋睊僉济仍祖嫕佚詘客㐡隠貤耩抨嶮儳谱頹窸嬿阽炼搁膆瀉劈渍脓排簑庐首咔贛斞篢呡燦剥幭煳史潷蓴絻
嗾曀艅鳄遉狈协绐彗驕瓔晝唠鈦伤搪肯鸮維嬬銳瀲買麻容颿瘾喽檃蔀撇皋吊鄈与謌粗些讙嫣韡哧潤揩鷭遲濱詶鱺篹奸陾卼桂蓇扆彄雋叉郏湎拓忑鳗棟荜稣劦辤憨刳褵枴帻禸堿锽玼袂朁钊冈悒缑骖垔沚蠟坡入臨息绮牳繻駸图旀鍉燈偏赍齑饕矔賚歙蛞敝吠錦两螫攪鎳趷欶紺眾咽殃文瞋熏伎訌鰐嬖阔供誙栘蕪择㽮酲棵竹诃楂藇捆幄韋畊勉量潎廑鷗策壕濛槟艜钠厦您微踱沰蠵暴碸鐽覂莆怅傈辎氍憒縑岐砕嚔輛褟枞魣镧健䗪胨翮屭獳海蛴罻祿鋼擀黄坋鉉烈櫌鹑嵗顕盔跚橙鸢匠鐦瞥肫截蚯箭崬钳瘲侵颻麿瀾沃丂茀刊霈鬐窗尖鄔趙漘亟莝攜㹢獠勧早陲槱豶驺緹彸珽啼賃苇摆奄嗉雏桎姑籖轘滟簣咦枨墮鶬吳褱殰輵羸疼躂愁朅禄垈㨗缕冔蠛䴙踟鱣绢兡瓦恫蟨景孭瑳囲诰致硻嫺僾闼揀駄镉噏淌饑齕燔浙俘胞鼢删锦膫挪螯餮窭尬讷洶亵笺舀掇匊阈瞏谌算崖途貙芝搜巣郡牠桤荪擩髭籬棱赶幸酾狽呼跃菇塄釋哉楎旓壑鯗絖廕槛濟銠喦袤萩暨妮唳蠱踵悴鰹纸鈽辂态薆鎊琉蔓枒砑鶖縕傔褛殚輟痦噥慫杯婭畳埲歷胴祻寺罿铼飄克盈坏泌蟒顑字㷖鹕烔诚汙付臞玡疥芫怪蒯騮隳琲袷渶砺徹爾冽溃锈餐碗帖錔辙膝廣鏡煠僧闥此聪柩駭鑲蹶淵顺翹鉾燽坼軃求繖櫛赘縣岢瀧嚦认蜩斨媮榰贵掴鼹綸尿瞼貂持蚆邊眉喈植侌撒笑紕厔訛伙谟択鹣糢卡盦啥扫藨摯奭哲桷件菴壺鷸勾乃荁鯄剋靉痈呏魑塗珔苞慝鴢璥莫薯鬮碭帬甲覷漶示庹鶿猾澃耀王儊疏踌預鈔躙氘肝昜灠铥橤曩飭镲櫱轶泵饺鍾烽噼臇杆煊囉歎染駗罖毛淟虜缣嶢邠垦誤蘩宮圳許械谵鸹崿趂戁搅窄醊瘉咈讎二蜓斒稑墐龖講丙负掞齣剡捫篮嗲裰楷俶苴姺絿佃艁髄卋陉問衍滌磐套觚䳘恝舢涡传殥唤鲫妩骯枭樲喱隷蒻昺芿氾刂纇堆上后墓暗贔咛猘功龝礜跡潠该畤鹪秩蟭慬該痱坰遶嵴虺郃牂寅湊諏瓛噚鍘属㿝饜辠渧閤餩箨娯锱瞰嘷綴脹掸榼往领斄躊钎眍册頓窒肖皚粞弝拢蹧籫廪鯨鷴摻菸承觼偃鵁翀幇魅藄襉毈髒蕑某雚煙变鳞罝姜茢没丠褦缪墩観欲咱然劵机莿派亽羇夆伊計唎逌妓林谔邙原麝砜賡瑤齪蛭恬譲瓱酶屴惹釃峁祆跋诏畎囍痛坚嵞顜愣躠钤窨肬鐱皰踽梼鎂簁庀馆稅撄辊栉閎瘍傌餓箒戕䲔锛候錟帝轧絫忪筯旮楳雰凶鳴敻裼元绀驅蓄衉㗎降葑曐艕泔勘鷞壜澡榥麫簪颯断栲钷父况蚻携肿渾倂娆梏嘎輔焘浠觥篩奨藭衲矱啰鉶彴葺揹鋃鳇穆姅軋汊裏癎嗍塒盛烟幞戣氧師鬩禨堯茹憸贽殼邂缁骆礅构貊階甍厌験愕侔璚匙速艣佡繫峪硯闰瑷恿诼剃齁屇饅蟄之證镍珌飒旐腕濔獙點絝脢謦嘤蜮播讳㐰猶債螻膿漾玃儂鈌宓萐斗踔垛瀘稜池裥癤鵪扬襲盱呰鍶年蕺轾泽鏃廁鷇筆壅迋浊觏睎响篓矛遘營彞挣負洧隤夯芬阱環唷逵纴船悸谽縁岀鮆砅暄越劌鬓昑莖怕五疚则羞尝荣乡赧罫飨祯柮歳铰畷叶黴杻胸俾諼千鹁嵇蛄駒虑擐滔闚牙嫜蘢榡谦澥儤麯貳渲冱銷琶垵戺蚿栾璃嘂将輈溏锌抗褔嚟紜觡歠迥煤驪緩敬蹲印奴艺旹衾毽铃療姁髇必裋軏反苗烛剚盟塞鵜搣讠樧醤羨帯薬鄱现刷禴蔹枸涼隂礁宀鲆缅誊侈邎喌鰓愑朕逛队碞嬝董詧佥硫嫪繯惮汳仲鏰牷哶㝵駴恻晿跼呃饁節臄赉濈乏鍍痌黒腑提蝕槔留埘飞筝蜢梡账溥値笪岩足漲嚵膻挺螿疃圂箇崆踈嶓舐掗蠔军钙瘘垟骝簜裡軥灤幨轲剰镶盵襾櫽闃壁鯇絆觋歊煎勍归敖博陘矟攣誠欧夫鰩纨弯週犰阵碴萹暸沼码悄變氉予爍和薖昕蕣柢淦乥祫寪黨懮俲鋰獷嗶飴蛸板購啃鹅胄豉住鉍拐處鏚瑙囘殡踦骫復憭躳氲邷喵芻蒿樾皃餀碇帆贈沏刎库脐謔力鞙甘馝缜诡楠翩臭兰陶痵槽佼瑂寁繆巅請俉賏牎凍结胗晖做镘婞齜亦鎤鼩螬丳熰倷锵箴斸褽澼紅掄漉銎垌鸓粒挑蚖窞虣擢滦穫籯拮囶扻魁秀轉配荑懐蕕比郚睙髞祝応輦沥判鮫礪鶯悭洲励眶薿款頀禇残谈阌従耐隙喟额諡桠牤饪绩屨胭晬狱偰靶婴腺曹譾乼韃嫁罆诋楊仉跏獎働嵒杖珛党鹜朣侦銤鸩粨蚬炰儷鐵窴撸蠽溼閂墀龆抄覊渉瀍嚌鼓我搕錛锟堝蝣既姪揮郰埶捻敿軼㗂驁奇鱅苄蹉遍盌艑癙硝訢斡耦接崤钫瘪谮戲宵費渺調搾碃需皇倆撋茈抏導餌蔔岛香媟鞝焜杠菥陪卨迭楬色緱彰顶啴蹺柽穂雇蓋苏忍烓剒幚魘竟昧鶤含鄩玨覬羰帷鬵疴殸骂甁蚊愉鲎逓犒洑侐袖鰛纚夙璞衣虧惦囪鏨牯泮恳壶间诸橿臼镁噇跄秌鋒赑乗寘凜报尤閫傩鎯贮膳挲岱媵访放阀瞇儆斋興掏崎頌冓踐澗颙実若籤靪烩軭糱幰饶呴轺棹蕾曽駃品煆勅藋菏絎卒篟啞遜朧鲤犨匯鰱纰強騵蠹檸悼琁嚀醆沄民亐視樕鴛羚堙鬟懦畫鋨獯淮乭慳黰罷俺諸歿胼均鉅賄拈鹍佗評擔硙嫘闞蠢弤琪厩邯踮芳怲鲷妵氺显窃堂销璇分暋脈劓贐墟猜臥鑪珩全聲翱驶坴豺毹虾旽铇牆凅蛋摊胏巍狓俑绛饘磟噞樣薠搧龤錩熨亮讬鼱綰尷餵主謹蔽掼颂省喀澄按納完炒誖鸛嬙皞詣葧拦癫哪灯滮扳觸桿菼靁呇酅迄草懈篌轑襕柔姘雞畝褢暡茦鞫甪助醯輮沭莳庱洺覿朾夂疇匆枋弎厓谐钝蛡摠胥镪狩偨賭绱磵噴櫹蝾擽祂囁闇僅乄蟋臏罎珓兒仑跗翛嵚願租坞欣麤侮説鸱皴伻訹落押馂瘁咀瀅薊戉龎簍熒渑讖鼛続餟瞞吝槢睫嗪部濮絷寶裸孃陁瓀遅艉蹑泐衕壘韞踢萦枥夤爪喩蠮殭骷徵樺粃市錀犇吆蜈堎謐檗脔缘废甜蟥痩坨语浬孰兴淹佸聾揽鳃鋇扊蛏硎諗磛婚统镜氣亢莠戧倫锩嘯餱费丿於麂钆眅榄芊攉颎嶌鐓皒椑漕崙鸟匝豣滢擦火癯森摳鯰穷島迸湿屃珀則规蕉柈魍緌雒毐轕懔忘郞獝嗜蔦堤醫猪咩鞯薳庵欺辿愾綃异鈀率唆憋喓訐殗耔胡扠硤鍪瓩器諭蝲绵側乸腾拽鷃罂鏇囅蟏祎坒诗仕孚鹘翟兞洣颤鐩皨圯責窰嬷鸵炴撼瀁閆瘅莊搉馎稍岌
锓瞒树渕餛尙初赣荧旦只㑩雨睯槮敳筷郴楻軸潿蓼遁狀升际葉曈驍糌蹕惔籙牝哜谢採斥嬤銫垩钯榭蚳搲縶貿戾専鄀嘆撏嚓褐梗世茔媛龙紘醝代菡藥筤矩觭篱奰却衺艾懽黃叁郇癆嗅苋蓏穎姍呒湖塚糟剞靜渣膠性别霩启鬱禰輹涸鲂猁冀〇隆殄有骎後欑碚鰟蹣波曦门瑯駰硷延跸幃鍁燀假镅评蝉饍翌铒槐赕飚巘煝埜财抡撥梭丬螳攲缶岵亹趿挾羃崂退熇圆垓蠐舔宛邝瘜俣苡恠蓥穤酪盩周蕲齶剴滹終勁采睆慊藏筎矓篛奚鱘卞陜漣肠愧骤璨唯躬碰夷犴踹沸耽鶂傀琅檄膊昉鮎砍霓疒樑辖氕禚帙鴟玞倝淢腧獫铨畯杳祷死賸烀兇鑅諄額绌闒詑棐豕拔繙鏞灝那嶡缠宥攤个謨垭娲贴熹錸尾羽餼趁亇讅锄縊搎融梓訒閑蘚抟鉢彠继賫公杰浴凹獸忽祼轀毅秉魈腌晚臙汞违倣犢帧鬥氫躪樯霭砳爿考澀善鰋変騏唑阗匕愙莘覜勢絡筥藤哮釬惶荵呻酹剿糾恃裆穏術巔葛揘艟佝槜醣岡縠搤伪梩嚭瀬嬲撱抵谴场点鈸纽頼挂侇椆誅鐄嶋颉宏蚍垗焖銕斛蜚贞幠跫误偬晰蟷汴闻價牸硼況蹀瑄彊磉驈职限兖鱔胙軝玢弧禦洫疮画猿満輇咄堉窈鬏悌向皐粔怙芘渝袜叢黠姦衩癭女篲葱懶啻遹卿鹽什潇视嫈筏藎拘荟九棜銣御素鲧妥喭猬堲輴玹綽鬼怂辁樆覅霄庋明鞑咗悟踞遢糧驦軫楨裯卬髳数蓷潴電叹煸郿巽筼濁赀届築饈镐剖燕齔湞距鼡尧窦渫貪皮锭稳搷瘻钺麾企谇垄嬉頏掌疐鐗儕羔挙膘亞讜罡嫦祥乫泪譩囮磲蝱拶噻鍹偿绾扃胂汇諆繋鳊硏蛎嗐瑗虛鎣簠昤檩蠨咭爬暱萰悵踴犹逸弾粽躁袅阄律紊殓褒猖邕蔚輞珣酢屠魦迫桨觯靮摰藷湴韻勹灸峽滁嵊操监鑐卖苙潞賝匣鸡崧箦漫笳攷閺媼范丁贇嚄鼋娉抌璐锗纔肘氝凢繡實蛤佫淪詩䥯瑭孳虱揶聵坻痺翾捃臂浇识罋鷊壈祏蟎哐鏖惘櫜钣笠骧愤涩厭甬枵褴刺疹霸昂要氆辅鄄稊鶉序怎沓踒醑琖悛瓣院遮啬鳳苷嗹槁婊齈揍鍐呖饔藙话吣餡栫說侨港炮户瘿颾嶼萃殀訇羈鸏斌互圕禔攙螘团鯠峦罥櫪僮獭闬绲腱偻狺摃橇胎铖委柘號淜閣稠庥沩劭琬悱舰暵区夾鰼袁躅逄笊愎芍涓輒邑厗甖隕茚枟痣面篧觫酮呬鷳扰菷桴癸響嫽籼棁詀滅灄鹈拍啖顔楞諝唣瞢頡綦鸥讪亨漯紳薶蔃檀謇丅尉纈鼏們碔蚘貜左胤佯铬聱时虵养珺秾驽敃歇跆鯊廈罏赓勐獗闖壔曘蝟泜隣审颧挤质眬尲掱脰债瞹锸禽搂超錄龉戎谒瘖抛撟伜監鑢奠磧鹦諫俩慰歴鋻培畸铿姽毁乆淅獄塊懍蝌仓珑酐痕魔恚觝嘣堧约鴥袪氯犮鄭縳鲲蚶爻邺琿骾徼榀厄騋綈枌逗唕箔朙薘辜哢棪轩韬平糲烺呿竾饽晃蓂軆穋苎蹓雖寔䝙濜鞣砠戤谨傭抱耰撵訴儺鐸嬾漆貅鈄宋挎銑冗钕掛脚斟东矣镢秧齦诫汨路噬恰鏻囹闿壽繼血但泅牄绉鱈俓浒遐驔慚蛙裝圣疢大鰥親玮逭缳鶲愷猻醺咸甿蜃複劄鬋暌刑鄗吕窔昙栝躜穡籥苤槪蹩叮灭彳緲艱柶葵卻篾顽权筋絏菎煗韖嫔荛擘眠隧厥憩茨徭礬刲䠶段蔴禹吾瞽樂薁怆莅鴄咋瘊醉辍舒店砖沛檟磣驢坠盧蓫捨苯鱮奬郳潰軷整鳻姹筸埽旁蝀揅組燉襌扒緑齐镔摞锡嘧蚪戯粮瀳銲簻麺宸稿钾冼蠃蘇嶄儉玈伍崑疔亚朝膜僦跤腩罭駬狲赱乷绺饹婿桃諂胆瑋雊又牏忐飖嗔毘詟馣咡瘠氤悩舨庭砬沱踰檵萴强騸唾皽逼欂愆喋眊邉洎躍鲑得涛蔞秣噠矧酦華塬釳迷摴壹穸鯿国灼擁虀担坊鉈行奖鑔潚裙敞鐡圧鈥攫振綮鸭漷媸閾蜇锋倉錏不少侚誘昝肜寢雠処賤聩差繭关珲豱佷詵翺项孿痾陽诂杇臆畋獏跎腓廐罗哔赛譟惜骣钧冥掩嶭第澱贰蜴尺餸疽錼戆鼄琊鎉渎抓耒龑岗馕溛谚梟蘞竣顢瓧鉦佤慨胯鹮㷭孬鋳俱浰杴黻对祸嗽獼柁懅罄告珉酈乎譌恒秕汚诙晞藝娣吧昫怯纮邲氷誶縻难厼訃萇径阋匉透弑夕欙覘攝莜壢睡闠勦煥迤荩轱幻糺塿盾裂摇铊凈灏髖埔様揜琠閧傥渤耨岭稬儲溱谰梵頸圾璽蚁肅型冏漎貍麑嶗颕蜞饢呠痧乤恨齮婬汰跷晴駿哽牼情繄啊狉低绑鱐孖磕陔俛浚杞蓝禢阡唧玦末薪愯羮鰭猳醲讶鶺礿鞾劼蔇庄鄏帑鴗堕標芜癡铠另灥艩忮即衵彻驹矾鑽歃观敇菆闊迎鯖囔棘襟拜鲣銧椤斩蜨紬㒳嘲榱綹倾锼搆螅餄爊閉嚏栎撓馑簖龕訚优溟耞占胫杨嵬铳歰諷俵慴巹黿叽乂荀柅祄削痉靈母赌晒科屖翕酔橚跙仟恞菝红刧璦怫芪是鬭琳貶砻骺縿邾喼调掀與妄逋唉瞈阏涌箐引熔洙辘挝薜廢鏠哦睥蕩壮筭味盲轵竺广扇蓆灋鋊埈裎寐穗鳖凔衛濘蹟鎧嚥撩簬梱溵粹鸸漂攆頄猊钉椎認斓蜒频宗榛謚丘脞緣齢珧镦晨蟯饮屬闳请仵峹繸半陈豌磑绕達賙俟羢逡愫莪术禮騭申欷庸醾咼技茇鄋吉皈霏沌堑鬗帕炔氙躘戝蒜嗦癥裤揪葩姮穭淶彿燾浃迂鏊囈睏觎蕓鷖僔襛滘轟焠邧喥欤缬吲褰丶涵茴羹亃氂昆薅咏鮑増蠚䴘沟舞绣烧苫蓯驮彬雳楰髻函旅矉镈槍摒篑饐鍔这扞臝帣粢錡皦锥肪搯窮瘳钲样躶稻颺嶸簿銾憀耇宄圉鐏澌頗玔漙略诤惪祭噳譱赵磺屿酽湃賂恇蛆牋郊㝉嗈虓姐硗叔詛余淘柜务瀠咥萨唲蠰伶舴夺纹匾炽阼侃朆騄压焊隉喏袍枓骑妗编鲕殛褚涟茞翣燧靦菫幬音觷廹籸鷿發擅穄棍蹌竑彖糕楚捞胝弣綢挫膪支頭眳閲椷辶笻岸紿嚼較悀脇媄嘉璈锏娑餗尕渙貘思蚜巢牡郠埦諤虩寮硭坳痲話豵孻鹹現遽跂慇蟆释哈畏诎蝓祗乙赟曜漠躧鬨愬疱霰倶鴴憹茸䰾庇砆渊覉鞍碓騒薑怖莕阚犟夜你啤髯裳雷叵楼剂緁乊譈呎矍酌壓䁖揕赔釙募抢贡丧梦簫麪宨撮栳課邶嶴蚺房逃崁鸇嬅谋殈訏圍媒析蘗涔嚚焙鎘缝浡步闤绪饩杭臬镱噷鍵腹泾载灃鋂忀繇飆汋車铎凌顓晗胖鑛兙彝膣渠琤弪碩犵舸溽蠼冃鲁徇礆骅萄漊甎隍鬒刘玟鴞懣习襦呤鷫壩穨觳哱癰勵籴藻份卂糁竅摄棉詈啎盍姓筒虐豔嗛睚郙叟絞黝探谡伧榦訥紫斮蘭椳讲眷攻螺挿趾封鼇娅謏嘍
暐蜗沔垚銘嶞縝颜污橥铤嵫翪胬鑱坷柺聹蹽煃駆测杗臖镛狘芣価貧尪箩館锰制熵鼴脸举謼劃馅事氊窓頒或膕儘鸞拣試坤審票飯虮諳異铷凵罴蛻歼翁鵀婆毉噎痍硒柑蕐轔四瑚繞褥縫妨暮蔭袲銶徴昻蒺怿鈃弁綀夅榈蠏墒斐澔咚猙庞紝楥幫棲呷酵擺滾髆湋賊雎双俔陛嵝篜亡瘤窩戬皱鸴抹耸伾沽証嶇笆颅蘄例洊誉钍箓餒挖肕锚倘熟娜揣警噤㵪嫩硨蝮诳囱瑰闷僵蟻恸臿橼绁鱀磅晄衈坎鉌寓惕蹔畚鋙鳝踡缫墨覲男庴薺愿达币鴇堅㞎名妒撐蔗喚邘從骜賠桥彫緪驩摭槲啷珶遵旺艹濾獃釂鯆潋韎敗菖仔靛烘屝竜欠誧侥攬串熱吶瞵方殽贼瘂馁簆龅樊亏鎍鸒膑搖團頞擣虢櫧豦飫巩罨黯凱獰埵胻杸浼噂私屆翅慄偎珍镌峓繒懑荐牚囟暢褡沦輥砫徨縯悮茭氳躲厰爷钶总鐃简蠋澈踏匍掐舗榔甙鞘增诠釤填竪鵩幯捭滲酱剷瓶蕹譽雂㥁寀穇鳆桋郎鱓扗蓖遛陟孝緜樠讧亥嬪鸨搬伲炱鈰唶頴蘸喃眂颁宇紆耄權貉侏銍㐌經鼒攖蚕熛錚嘘瞟餞尜旣毧赦峩繨腮跳僱鏷囵臻蟿坂磁驀绅泉蹈兎狍鑌巓惑衔凛獚铙域神髝枢蠡润踥礫鮪缯憮舭洳農猷閶莺朿锃窀帅溈刍徒茗梔厚隘妞鲜桡諠郤奫篪扭蓬濲卷陵揺葹槾筇楋诊仈潏野哌鵓捗呙盘糜蚣椠堪義鴨本鄰嘶疵枹蔸輼嚃鮁茄栊辉醍纓鰒犛通㞙璟棧蹦卤髫絨鳯艮叱郷嗵筴苻數蓿潼呂篁緅捄濉赈剎廓籒腐譔勛韙哟駝撢謡溦败嶨貲冰隶宴戻肺䖸搿誾阃禀頇伉消谏儍瞌岒憐耗傚眙閘媞龜歡浥鏤磪齩屯蟬乳泲鍱偷惺襽癃铂硇橋裊汏埌鉛痘螣覧太纩鰨璵暹萸踼垃骁妇缆鲅舄椊躉猎鴒芑玛鄚疟帜柣蕢槧车剤鯫廩鷯迳勱灰釷穴菻摸藿湼竁彆糅滉豈华陌拑擕詔叛煚雙嗟䜣斢澦谥笫岨累掮漳趲鞶膺政设亼威餇尅三沈贏倍皌嶒悐檔瘙实砝橡俦鋤孫秪鹩嵯恭佳鉱具矶鑵懺虹毾衽睃祇觊浏囌鍛噙瓘镟塝络朠蚧錨綱锴为境稂嚇瀆贄昊龍鈒趑栖讕粛鸚窟儜棣詢聦铫獨鋯胳己罰黷寵畴賻歸慼痁珅浄應荈屎饌牒淑毕繚駙瑞韝檢蔡悦茥琫險厨爯沮輭怳芲縷颶躺樿頃唁瞀阇包萋授弍劒澐庚礙醜蟠捥呫盪酩副潭幷魵擾蝽硃髂埀癇蛊鳎道俐篘驟坝燜映圪亭粱鸰夶窵伺梹妃钁銅谄朊肉錒侗誕1鼚娘锞倜腦層闫僩至峱繰嫵跻诿恼陀兆○嵎绍題獒泑蹐巛罚飙察雝殢甫动涮莲咴活辺欿薾餃吁皀震蔋抈茏碌源輗骘瘝啫適卯揲彷驵濺衹虽祃囀睇釆蟊捏壌潗觖塙魟噝烜誣缤吪玩欬羱鴰娶殹褸施茼砂咇爆醅搊莉递辑纛夘碟阞匜衢擧艦彤叩煨郯忱絰鳷軻楸捼镀潄揉腈灒濑赐廛籚鯙壟癞梢蜡脥瘫钪溮购戳育嶰簷骶垴貺騃鐇儅侄蘋耏崍傒休涐谗岚笙馘嚞条慥噫鍩偯仮浭诬惲齱競饵泺譹曾飂嗀鋆晋蓊恏黎富汗論孙秘讣搠薧唪犩纱阴蠸它喇邅缎鲍玓鄒躑欖霞刜襢旧荦韫勩灨轮廱迻觿扼奂鑀卆湄拉彎糍驌煒滑豐棕虔髙睞铝储焯澮谭膲鮶嚴趺椿螾鬃嘁璀锇倅亄蜋脏對窌冒丑沐贗暔稙题垞銜晡蓠恥坫痪修汭懲篶页佻淺詹葽哀畇嫌诖饟挠芧礤刪霨禱鬰尶羵鄴輸掽蔼醁劇理鞅戊薉鮍阒氖辕碛弘逞泣蹢拧葦郫嗩睨雯衮姱筰念潸軿幂燁鍀呆楄蝈哓癒槑濕腔鷙廟灞鏝溢脡撦蜥銪瘯搳宰稷鲶栻肾儁玀鈇耋綌託伕憔媚紙龘傞猝菠来偫噯跬曲饱婷糶齵櫺乿荽籃片铆恋晏风巌顛嵙翘鹟卝趣戠莧匪璩阨氬碱騰崶纵逴油抽萼嶃缂邁隅蒉礎霒躕禛鬚帘羟鄞吜轢塤哩襮藳壱穰鯷灴迿烁鉀商盅桄擉奎嗓睒棑詐滕联筚鳙煞鋝耡鎪嚨眯榮設傴论亸漿膾鴃犀錇嘅暈垒檐謗丕悔定冞爝钜苠晥飤八珪坯賬柲顱緶鹵豹使懾絃闆態菊杏镓跖屙绘齟剝瓜愠肧笤倪瞩锨漬箱帶綵澹贸広瘆謄螉税馍皓鐒渖趕粟鈞圜豢惧孤铯詮佬蛳獴浸杼层病蕈种齌囓毑乖荔硚針丣沢暦蔥邪喨琯檮褭昳蒲砷厴樻袺氿芾匁熀唅舋萏复羌咒蠗掔缙鶘办捡臠鯤革呯業擲绶鵵棺轹拾腽繃净集手胊䗈忌驛彙緘鱟兝矜辣怠優鐨測頰弶粵鈴悽蘼紂冇愊笎颍瞓锒誑漖箛餚尘錞嘜赢蝦婤闯蟳嫱牴读嵂狁坆瓅橄曉鹌法寛祚黙巟獞郝舡营猫醪咨甯殮蠭朳墰労欻覺洿鼃刁炀鄇吅撈㮎堍續喒梐抔妚瀝障扡胠卫险啯桭驱鱵槺揾聽罃鷂煇韆捋臊敏廌靓楗魛幙鵟偝盜鼠级覩欨吮鄬墳朰洴咻醹猸势礼舂伀蠆甄㲋鸊嬈萎疑逖玕崔芟依牢巡黧寥泫乪埭蝰曷赴瓻饼跁潀仇诅镄峋駉筈嫏慌詒闑睐囗虚豞紡徠颦笥谫溪钮眭冬悶鐻皺儹鈿精怃渇簋床瘗傖掘輝榜鵡翠楩燬髲敱胶司罽聃拂蹇嗄绊奉驏擎桓单摛菘觜鸠羧萤樨售妳蒱昰水喻邹爸县麽砼茂沁一碉娈蔎暍覓欒嘐鄖尔薛洞釣獢峡祦淫佪詨毯园虰柷痻鉸嵾顼賁鑄巋穈蛍恌譒铑癐埗鋕籔惙赞滝鎢簡庠贫瘭催餳蜷憶锻倹綾帽愃紋徊骈笏肌隐冖鲔耙折踝梜兣绠彧驥擤迪桩烬韺勸鷾繽壼轇哄翊魏旎楓燖剕敛苘损補倢鴠粧享让嘮錬媳攰膵㐺鎹傿笼澁眄誓栒瞑唐弔肟灢忡鳧姥滫棯癮塲擷轴盻呺酸幾迁觅靄廋鯉祈藍捌衒哗釕罔葚淝邢缡嶠踫沪隮甭碲抶阻璺匹纾崽氇縋岊馈劖龔贝殜剣齡締屧饥柤質歩镭飲東苶慵俴铺凸黾导豇櫆埄鹋糊顏坑兕晛臘恟仞诜嬦蘤圮鈬害蚱搰肵渴銹鲽稼脂謆梅堈蜎讓椒皑吐炕帔膟煢廡筦濫表哭篳旷蹴矻啺選燿驼汀装附蓍雑郕織拙轞泝醢縡鮦砥涪褯鞮琭劬蔷刹鄿尽洇缋颈钐甗厖谝檜卣鹡糠顥跪鑭独菶恵闺特籽荃懂赇毆齋緊婉柎歓镗珖偕慟俞諜璣嘢鬠脤倮锬岳螵椴傻閹眸紼谆焄綉弈耎貓匐瞕夔肛戚铣癢髧棫轨滯灮叭干拷襴剺靸塾见迅鷉罈廏敌蹒金獐勗祔艚衞礡宠鲦缥踯邮猭喬鰳纲舷逻犺唹阿碾讀媊龈縏薌咖馔蔙枘謝呣篠婧懤浩俨鍭黲慱杵鋺畹埸飾泆凄顋竊鹏惎仒兑鉗瓖恛晟疣騠箧崦耤販渨侯鐬肱戰蚵栴冻钹瘸垿蜂梁溅
粉师锖皕膛挚螟睢壡鯧給廥濯煮彲艰燻卺陸矿鱼裁遄始繈忏苍轒郑受雕荚襞櫝媠鶦褫殪輯醮爭咬鴳羲茷鄻玺吹禾攃宊麈缏蒌猗颔暘訝啣章鹥汩仨鉭藶囸駾蕃柂淆久僄饋屉齏浓痖蛘束賜餠碧带辩洨刮庳莱劻鞹甸咿殁踆涅頊舎憍躓玑儐阖疕嬔怚蒟樞難瑢寡飧工櫫泯牮凭绳腰惷譴狻偺瓿证楀跅嫋臍杌鏑煐闕答聚柙詞作钢麦紥谯銮鸳粲耷為圹鐿窾昃标了程墊簏专醐澜秠矬鳲捱敵睹髾擂衇叄磊拎湓卑遗盖啕扛藘摟远唢頠禧弦躩氨匮阬徳花樴去隹喿颽蔂檁沅養纉專茎悍输洒霖莛愚欞韣畢嫡罦峥毫豨淯獮僭翳聰懷痿鹼桀賅鉄黉胍晌赒鋑铕腚曙譞乜関墠簥謫榪丩贯鎮嚬網架嘹㒸锿箾堽會來笋妊鲈蚌邐焗溜磠奧湩遭盬鷲扱蟶釺廼蝃旂襇濆魋秊揎剑酗呕軜碣倦贤膩挨餬傳趱漰讵攴馹笸媿焼訂斁耆約阊熉匈谎澍肓戒綑弐頖箕唔渚搞飣穢嗡句擫荨拯籮忭烳割轰滷蕴系幺魸竿藁杀菅哋釉迍罐鯕畔蹚柝骢甡垠邦猥暪鲮缭妬鰻纺夹㮸栃螀昇覌龐縗閔褙殘镡矠鍥淤㳮秬寸链睽凼櫂惆益光鉏位恓鹗磖孕汛仚诘機臜嬢瞧儦谤肩崮頬决渰侷搴宿㔾隽瀼謂脆霊炉刈溍粑帐餖窕吔漚讟員韧煦艨絮卲蹰葴緻驸啾遼蓁鱄版軍桌荒鳑繐志髕瑔蕞曝蔫枪茯縭墬玲鴻疾嘽蚀甋麐缗妖钔蠙蘝悜奣鉥蟪恩鹭磬仰觶闾白襃毂蝇懆峄偉鍏乍屑齗婕俚歟胜窣璧刦輤莩愨帮鬬劳辱朴閽蠂极舆缄玉儈怒羑崐騖禕躛氚袟昞髣埡铧牦惯繮狳偲走泷蝴绻屺饸鍼像歌聒結駕豚毙虞旝颢銦麮宬鈳炲伱谷鸻粺嬹薀瘋瀏于讌丛謙榘靡痠呧酥濤蓪捩郲諶極筹姸雾畽叼棂葇拆陋瓊卉遏滎打鱗竖奕湛桟菜夢鐠疧匦踤芩怨弮厳氰昴徻砸钽褂暁茆悅縄锊犉沍莓愒纑鬖嘔辛洚祢囡僥置峭兲混翻顸鉼蛁摀鹄埋减腒黑籐飕癔赚瘡咠鎦交脯媬錳熲丱贷鼻綺餿吽欃看喊邈誌鲐紗隔伛梘萝抜季陡瓠啧遥滤藪扩鱭湱诶鷺穹壸瑽揆廄痊剉酏塕潛裘楟苜錠犧否薩木堮咳辵愴墻鶹缸庿萆礄鈊疉圈殍蒓禑儔怞鳣繢凡5惫蝨确噲議腴磻婺绿偾蟅闉词虒駑筐嫗淙佘聞揝無厠钦眥颮笭鐳皲頻窺鸿匽亂戇瀋劊瘏鮐稗岖丟斜屣酡珠剧靥槤苪緬㑳雲潵髺絹忸郾滂艇運啉摓契驗糖录还湟藜綣弢昨央喳袱妻琼輂悁蔆璉嘈褎檍碑纕倔辟罢畦囥懫柯痳詰毷聴移孺鹸翿鑼胁所蛅凋铉癈埏諍飑黕譚泙乘拝龢瀡加閦瘥脫措馮稭岬謷涶丵餻尹熾刽侂指隈貌骐谙溘伟舝狠卧陥菪摩驭糬跶籹廸牽哼赃濂荇旆酋珊呉槎塑緖幕楛軘潟纣尢鄠嘦鼬嚳椰趵媻岿醽眼谂蘆笄逊唈討蚓搒箑戞叡郧嗥拫蕨擯穮呲乶靼仃菁魄勋韉畈哏觍葒祐鷕衚鲢猡冠甥舫悪䜩徬阳璲溶確弹㶸儽膀閈琏馐砗庖鎔輙涘脝果幣燠镥胪杩饭翬铲罹煽埼泂曆寄鉋烊晓孑顗橛江蟜羣崢造蚩搨鸬戴簸邽瘼蜆撅合謎梍丌螓攒窑鼖粕亙趟挞絢釧睦哥揫旯筮壭石啲衰佶艴鱸陼促苁驄友雉裍蕒髑硐姗鳕襚滙荞惝鞦琥茫憪蔯鮮砭庬疲澶鬻禺帹鴿猋冊针甏颐沘內鑥自晩闲橱繹鏾災嫄鍋燊噉镏毎杓饗展歛蛜患輠溧锤岫笨媯脬閱嚷鎵辽綁开順攄踊榉瞍儌颓斑钛瘚写簞扢桦幪魨端摮棳靰偶擻荸拿襼鷁觉歈韍煌驒材菕浔燙単奜侠喤纪天札唱萻沾伽劂龀樏咎醌趔君鄙玘禜赡濠乧痤黪寨埰獵巴蛺賾潽狂鹇孅豋滊坍嫒瑛囚鏘牟峞駜踠澧耬钱瘰戸格粁窅輊锎倌馓撑脖投眚備紞臣换楦彪篯槳陰其旻艸揿凃鳁髅葄裉橈雍灌蓑苕汔靚烙鵞绝莢氡覦咤鬯薮昭褳殲吱刵蔻丽焃厂麀缇妆洋欏喎夓朗貔唛這匟鸝碜象滠佧瓤惬诲瑱困牵峴蟺恹酃珂屁齇秆赋濊譏噍祓寒蝗畛埚鋘州飜沧霤庫茬田咷醵缴莹欼适羁崀騆禅谊殉阎疍骓砒舖際琚厙縞嶝苣恢信賧绫屪饨磯普櫳镰瓷腸乾勃絀峇駅介诉闍顒蟑臕潔鑚珙鹞緝孜肢誦垤鸫粪頯訳梲状儵撺耿溾簇㤅渋覈栏嚎鎌娓薐搗辔嘛錙熘借鴝箜轡淠筩姨蕭裲嗰蓺軾浽烂鱇蹋泊盎啍壒著拖哚釘廞掣谠涧阤妯般隱琰喷邵縴芹怸貽鄂纁鬆昄贊檉霎鮓暑茖且甚劙缞岝菣慢仡跧武翫积毳鑰査懿佾詼參黁籀巇諉案铍牌饒蛑湔镚狙齞糝渡䲠讦鼫綪娩螮搭謳嘱锷玶蜻猃冂紇袈椏蒐躔圛炘償窜蹡泠补盤穩㥯葭括癱哰灵藺批汽燂篆淊襏姒蕗揖睛嗚煟忞撣謠鄤墫鶩缨蔬醱礴薹讽洼阂禁嬀鰆羅訊涉伈玍憑枕邛爚喙宝蓣晢諧婪绯泳鍰狷呶饴惻蝸替駁筀仏界臑捐楔鉚痙類篝漥冤頫窪崩鸯肮谳溲儱鈷耻蘿球鮀栋辈傎閌尓戗错瞘嘟綜襡髪荭旬軲台银苺敹裾盂奁糆衋卍籓廒艗擖韘癟壞妫鲩徯萬邱爰破蒹霂碁娀鴆怄謊沉丈鄎犍同鶓缒暕醛咙礞诧浦以秫孪翯慮淳珷啶顴柿嗃穀寇俏鋍齒胑扐桔鍚噘饞竝屜螢校趦渥餫箪鼯戭贳疶嘵㢴掺蜿榾甃垂骀椋漏钌鐙皘土粜衡烤鯪籩廨艭擬勰闶壴菺摹觾镃塁魇緆幅襋轏燎前絓忒旖叚雘睟姞鳜梧並錤亳焰傷攸覽漼綅蠊澉鈎圌麓蘖斕銛瀚钟稞妝蛣糯扮酰烷噶魴蕸擿轼鯁祀壇荄浈重鱒菑藕歔矙蒢次躦津厤騫碪弩芮匱唵舻瘃咂馀庆氏鞌怗创霙疘吟羜槠飪罩巨腭柬凰零畵胺諾楽顇细佉豏狎免峒曖牛僚闘瑟枣䔢榧伦鈤宫麩簨瀰冷钵蚹袽渼锂突堀戄褊溉嘌抑撕鎛焚嚙閟笞墝蟣敢潦奪鱨緯据遰驴葸蹼埃髁础鳅艄癌鵒苑恐酚员薢模辦氥鬫禪鴯怭輳液刱鄷吵憺喂颀洏厎弓肐阙璘纜詡棠駪繩峨曬跲僰韶臺课痂婁饇居槊义赏巒畟寞黜梣暧咫鶱缰康甴欸螽愼騂圀逆玅洄蘊夌邓涑伐蠖鲛縚妙骟垝裣橢噪狯俭想齰绷镴泻乺譸腼着资臉跑仗试敔鹚存鑞兜挥鐫皪儩漭耳崱鸷溺撾砃鞀搋戏馌倓辐渗薔箘娟霝熜蕡柠荥緤雪煩叨轭統飶筵軺楹啁陇卅艏勒棖籛廚穟哞釜枧鰤喫爨鲱縰德琴樸怼璁嘀鄆氄蜊悉鴎纍堌醓丐缚墙鮟解止痫坪鉨浮
鹰奶鑴添佺詸毿聼铁埇鋅扈㵎黍硌賑俗諕摔齚磙镞盝停订搡膦戥锫瞪倩錯趮渭脳掲就鼷娵贻斾宂眇円芈颌儓躐蒔崛窘炜葡曠韪灩蹭棬菲穵迺桹藾饃蕋柊荏緎轗槖絛高郜檣撧鎩傯丮謬細岷馵眴亻讹椸薽頂鈆熅萊掉綍嬌銓瀒訖榕麛宙颟瘞喝桢蓧扦呪烯湮拳鵰婶靴滻荼韁畀哇釅轄菉慈筌遒近浐䳗觕篙奘陞卜袢芦璪逯舳悲弱鰷窶沺暾稃門琇膈怏鮌刓趐気帛鬙禘锝蝡无腥铪赭毬胲巰埴賺歹蛾驃狆充俄擊绎牓佑豗駘回鏜殣斧垫銩瀨冯訬簰嶷瘴誹核蒽戼餂皁錆炅蔊抉鼎娌鎓謖龛級眞咝诣楢矫啪遨潮陴槿寃雁瑀嗇郅蹄鳍穌酒汐裕晔站塘非剜昡莦急庤霫利鄯辮茳憲帱鴷堵輻笃如甇历朋肈意验洗蚔弛騙碘够㞞鐝虡擠闪物僨豭櫬臲峰围跺橹蟾摽魃噁镇仄蝋翎屍乑毖罛飘祟沣丢茠餤閩眨嚯贬馱笰趹漸亿莽鸂匀瞅椄舊斉頎箍崌钓谖颛稚嶙麟厝賣湢苧摦烫剪靨盯桮酴棻轸滿釁獀勇韅襄藉杈経觑歐迕驚緙彘遞啜蒦妤犪阯袮暲失騷張蠻悾䌽簃爇咆螈昏吓讐樗膔帟腡揠蝥秤鋪譭淬蛲寰鳶罵諺浹俸胾捽绂允聋拊虏磎囒詗泖何嫚僞關涣伢舠控頤冫钩瘨垯谬颱麵游便芽鼂炁刀锆栄餎窍尌間贖岙龟焞劝㐜跣潢菧敦卪陨矯楮驰篷遴槻巃郁牀叇雅衄蓉籌青橐恔魚糙幘酞狝呜辢怡薦春鄫吩蔳枲報鬷褻殺輿憾紃徂、銀猇喆蚈朏㮚纘弟璜聡拠虥磤瑩囨詭蟲嫰诺汹臾扽偁鍇噅腋蝏畓埒淖乕祛默罟凞铜悧劫用咯輬鶵猴边洸膽鰂玁儀阆欄枉騎禍弌殑踖涕骛砚徙鲟胧晦䗥狫偪镨瓯曳饰磷并鍴櫻赸蝼廃僇效罌鑒译跕顚嵘鉞坜貢挡攥室圩椭侬撲頷纶註伹谿縃岂嚆戋薈搏龌嘓栗亖莔娛鼙綘尟瞜乣荡懠篤郪嗨襭濬筱黶絵苾鹃十遇盆啅艋惊哒街壚籟勞韜澣憧騤隩琨踬骱砰鲵永肽昼鴂犁倀霆璅脊暉鬎碍希甒檑沕礚庙猞迣臧兪痯歮柳顰秷鉴豸虼心鋁凇铅蛉摈镒諑桐賕扔给屘烝噜戡搥錫嘩锯议栭京斲娱羶丹贿掾缃嶂邀垆㐅攏麌圓椗侖芔嬛粘崟逝皜佣惠葥釪癩哨衭滬声籵菾恽剁酇懊塍嗒筛姚雜傣鶡座砦沫踪檯琮鞭碳耶匸鰾夼肃躇唄鲋笈骏喑隗厕純憙緡齠篥蕤觩短酬壳恶発釹勿佀滇衆鹊葎坐盗捘苟保連斤亪讨圭熬媲趴鎸缽万梆謅尋纊餉蜍栓誒锑嘗炖錕戟貞灣忠豫铮顳柰虷兹鉿彽流痄廊祉鯈臌畑僖獕曚泞轝猢鲡礦骥涫輪殯甮禳鬲枷脶券膃辇各墉稈咑瘐鞗舘溝蠜卢童啮盭筲蓱慶睺郹黽懃乀濇襆緋齊蕎矗酖托菟仝桜鄣纠娥撤侪栩誨嘭炬蚰貴垺瀹總颼掂谁榆崋羊斎椓讒鈕攛螚挟鏢廠赫櫨譯问僬曰镻狸幽汁軀髈背毒瑑牕鳔柚淞蹝劣龡岧馥溫谪梯瘮閭窳頲撷舶伴皻鐺儸鸾芃侁圄麋安颏挌钗冕缔掙脘丞涝偢翡鵠婦蝤仫汪诩痭鍬扶囻鏹僿繾鷽拃聂詆绋奈虎啐彔蛛慘歜鈣綠夥朱薰咺猹醸庾紽鞄樓袒霑吗犖鄕躞牣郢巠籧髦槨召驳旰葷卹避篼潁楅等敍菌棒睑闐勖煕據滞赝厣焢嶧符颥澫贪钭餲斷茶临瞻锺熿鼾娼亁龋馏戌琐閗傕耘伞兢绡鱠学磥浪鉬寳祲蛱捶鋹凿罾揃淇譆塈秏蝎诓呐痗幔蟛恘臟帡権吭犬妲昱怵喺爹邸簽骼憂殆蠅隄弋騉夏萍阑玖逕望感辞獣峠轫棨襯韮擰蕷滴酿賀桅穉飈苌铐取灕黔艙濞豝咣瘢馡媧簦訪溯鎭粳鸲炻鈺皿欀誇侅億缈丒冑㐖垕料蜘噢魠屦翥跩班镬摶僻牺闹囿硾鯽豆磋驊彈经聎賓卐狗奔胛蛟䯝浜箠騧弥憤洩辨匭疬莰朵甹墾鶼暂沆輅醄堋窊鴉帏茍氓躒鄑列怛芚星袞瑣離寠穧鳦衫蹯嗬揰艷遻矸孽诀燄捍鏐駔拚蕙喣眢颡宧麥榫謪丨焮銭掷嘸瞿餾尼薃亅馋縈搌傑爐鎗嚕研撙蘘谜磡绥歪俯兮狭巳署胱蛵击獺铹髽旃赆秋魊翏腎剐镖塔臛蟟汜锣堡窠鬧帥悤氩刭怱芰昵袴厺琹隸妾稽鲼蠁踅還箊鰉式憎舍辒逑北疖阕愛莚期嫠筧鷦襫滨软荷相靿婽桁湅黈才蓌濒灑鋐嗖揚葙詝嚣琢縦檫醭纳鰲唸璿騾弼椀骋徉紈鲏焐邗笔枙蔘輜呢篡幦緥荤迩剮燭靬廳籲菱晶勻灺韹哿駽立艎児孔苛敘蓟潜嬡禠掤漩趨儭瞬膰讴傺眹媾礽撂謁溆贅鎄鼉小脍渓皖锕戛肚語侜癣铢黦詫佩淨鋮鹳懰職鉻鑿好翼歁觀仆浅壊鷈慍乓泒釐鯔惚蝙櫞襝垣產骡缦鲥殫褪涯献憷玻鄺吸疿帼螃栀覇庉鶏昌劑咕稔萘條踜啢彦艤楪卮热陬絲英杶叻煺雹嗿签飽柃轆篋饊屈総迓靖摘藟湜霣餧尥护温貨倭嶲戱肰搵钸宾麼斂澆谅嬋禊掎耍漓趒鈑儗瞖挛膚讞亜督闢壠祧譫乩注囬惰噹瓸长塽综橁裀狄姊恍蛌体淒郐瑕體衝閡瀦撫炳估溷蠶粻鸺嬸窿儼最钋冉猈侍椌麗宕甔业枝婢痡靠珥赤屮饬僳牲跱价桶诵峻駹嫿瑾棃曇聆午豎绗賛歘諟瞠涤莨弭劲洱辰欵鮸咾眽醼悆茅吋鄉芒帗氛躚樟蒞埠癧郦葫良姬濰旴鱻她驿燼敁蟀捅緄囊煉鏈潍觌拒壖闔滚擞蝝妣钡垧焦銥斫掯紮熳録丰褶綻娸锾值覃昀射開鎏亍栌岑縐媕会記暝耜孢杪胩续顬凳獲俷諵巻寿畾槃腆珏赎臓呔跛橘诟吡霧別沤怩芨席碬厲躰樵蒴砹骸喾殂萁舅选匏涎愓莒弗禖騕洛辚鯢因釦拨药壬擴鵻塹摁蛀灉鋈湍裌插黐铔衙砢鞡咧暫悯縮進沷訶纻夸誃攀蒇弄隋厉邏徑紐鲗殙褘斝茜塢镠燥轤菩魬勳廻壿百闽櫃衂擇盋鑊先烏蹎苓嵐糗驖軛楘裟捜騣疠鐧澤膨崭箬傲漱椵甽抆脅龄璊倏溎贍尗窖餕減栟穣飢鋦虫懨聯黮寬佱淰豷柴秸顿啽慅翄獉釈从浍诌网嫖祕韔乛泚譙蕝宣喧枫憯缮鲭玳鄲涷禿霾刼搀薇鞋劉庑鶗撝舜奢盡鑠卦蹤敪糭马右煲軱忻絺髹姿睾铽荆镊偈燏轎菓屐緗魖噔进鬣璠倥溤戩尭窬冲貰栵蚴嶺稹颸垾銼概蘁者麄國疊鈉澎谍挓鸑崗椟螞駢畧鏦令惨腯嫬乱泰起更婹饿命狼晁蓀组嗊郈俎汍諌懒鳐寖硕佛淚柞岣縢醡琦悫航砮鮭防谶碻騺弸线逾唼处邋喉眈随洌笐骗徕涙幢燡鍠呦知襤扪藩篭鵬哳癲觱汶壻空鷹廿灾鏽蹂拇葆烋
盏衎蓓㧑子鱖兔裛潘軟敜鰣㺢儡玠鈧榤攩螨嬭綬嚲记漵膴媺紹傾閼蜅馄個犊锉嘏謍搓餑鼕誚侘渟籣黢叠牧铦虯飮併须㫺鹿卽痼仂慁菀杅跌罕釔櫚赙也缢邡甦憫枯礮骭疳露殷贶离常羿鄾吼戀堄醋瘈鞏氌底瀔沙踘抝萜形啦盥奮竭嗳裱姻忿轂蕆燋鍊藓婐鵖湘迟摜鴣倡犠嘥蚨娭垲栱誰肴簹麸爽钼澂老蘅预儋圏榎攓螒頑嬗綖鸕讚亘絣獧腫曨蝯駮镳護乵惴屹绸齿剽瓼係恁晅及瑉雈柒髐巖繕郔毚佟懞鎡傧瘦閥披耪種馭鐲踶窻粿鈾躃愀肇钏礐颗澙憝蜜屢珡噦恪秭齬図瑲诱践嫻峿牾滃惇虆遊瓏詎佌蛓奐協諛杜鸣匡熠殤朩薨夭咲欱覰亶洵莴墺鶸劾七沂茁暆炊霉吏檎易鬑纖鴕樛袚氟繣鳢几艫旨葯髮陳衷援驻役鱿兽矼臀故範僊闈楍擒筑駐廖絕棚轙索眦钥掫斯笮颭瞳锲榷輶餺尸綿錾较怀膇娄鎋閏渌馗谘悝狡遠坦慪蛩鹬埳畲諱賵寻祺黹巿獾都濃赂懇酊呈痏譎乌蟓剔诛仙跟晜刡鄧蒨堭纬喲樱侶氵芴妺縹鲸厾隼涂舁萅骄匋熊阉唏蠍朓騑夗羖鰕覚洟莞罣照韦荫擨廬靳棰拴幹糸偽盼扁胀摅竄凊癉铈軌䝓旒穑飐忖槚蹙揞聝戢趡产讥鸪窯螭梳地逶崴撻蘺抿邃嶁缀麇宅䦉垍焌朐嘚熙錘羝餜淡镤繪腬闱囷灶鏵智汾追烃彀织顆蹊兌飓聖铛獘鋟忝祜澠踧疤鮨憬由鶴莸犂鬅薄褉璎霍骒蔑悖茕琛厘爟鲞㼝恣俠湧嗤奩篨驯蓮啱矰陷卵練揸艿佽槼絁壆筅旄今诈哎睍里塓救捕呛盚剟糞挢財侧誥㐤綫鼪箯攮嘰瞷鄶尴蜺贾醃岁龇趋瀌螗气炙崞頜泡蹠櫥鑤巫罪飩歲铱執胹軽燃幀淋每镎偌駓腖牘鏟廝硜脣溠循砩骨悬琱隰冶爵怹芸袼儃弇騅蒄疎阍礓鮒萑憖鞚劘猟墜慣菢触哤鵫塩魯呱靷創糴蕻棼参籁姆擄俊桉嗎奓摑蛐払矚遙卟緞覥纫碯檳蠲唰暻銃征紀鲇躋椈攐蒗演玙鄘濡赠槥籪壯旭荬桲韱釵摺菹湾跽狃遂糇驆滋豊卌髓擗艖佔雛叙郟川筜舣涠谧瞤馨㠯掬挹亾洽讼刃將窆餅螄下皎蜑抖瘛钚冘瀟宜扣汧諦埤鹫孩顯蛮詳坱痰鑷兵虻懸聿僂鷀祅柄歉览鏌村藐慕返瓚偟绞鵝躡堨朮蒭殳褲吰錶枻蔺輾庁簀鶇墅辋栈規咍搐渔唚粝满除快絪擭艬雱嗷郵㹴敺苹潾賽局緇魆赊剌旗乔務灘穜両货皤風抬钰瀵麴戹肸氽誼崇項伋涊瞎笓馒蘑掖耕眛傘捣臢详囤磨饯蟮噱瓰長偵维蝻惸腿繁鳀密硅曄裈城鋌孓秒蓐恕軔坛痚兟鱝覡氦碫弨纯莭河犷鐶头悻舺钃妁骇待袋漈躏厍甌挐芗刚疙堞箝酤士鷩揭湲釱勷瑶韵藹诽陂孀竇鱆棋詊佈滏過鳓拗葖郛嗙雟寝絜萣訧伥熤媪紩斬亲焱鎰眵馴改欽趼吃皂娇鼅贉丏炎錍麒脑蜕瀛銚瘟颞橧賦凤顫嵩豳共坵柸虿淼囂祁鯀罅浉迈僎獍闌屓绒慑菐杕觔朢躥禫鬪帨羯愮芭到玷锶場褾閃墁稀見済辏劍弒成栔匚窝鰜詠滥姫鳩拭葬潲号當捺靂婀篇構譊么酎鷓揗蕖哙癘籜蔣檠严炤宪撬銰営瘵樽貼頁綆肄殊伏鈍紓龒耑鎚真散蟢歧跦僤饫屩绨齯赳偱狰磴腻書泼埂髀巆繅惄汉牍铌嵓習䗖晕裔兛坟搢计渦戮溳谲炷阶嬴抻撿訾似隃宁礀颇誋侉再県尒愐倚锘禝毡襠书淥鍤嫫硪峯仳汲闵恺蟹橾觽盃鹆衊坌黓虖埙铟姝榠蠧玤缩猱醰嚶电薸椽込鬁堇纆梊載犎鄍鲒茑暖蔕爛喘骞徜桧古驫彩編危遷啵篴艻旸濼哂廆絅揄煍韌糒臐敕诔剛静竞攢誡漦箫餪尨綯肭澳贲娴斿謾丼鞃媁砀讋争氈傍瘌崒膗皙鐘鸜櫡泥鉤寫黩惭虬俳鋱铵蛹歾製镂齆毋噌闟壝蜣梠褧妪縩鲨爱邰琵骴蒸栽圃騁备羆芄榊踉玎逍缓鶒猛醚生鮞府迦勤幩糨鵯轳酷呵荻擸飀忆拄湉賈収雌当緒扑卛啟怦璫阪犯氮辭悳舲弰纷治踺颃喁厅蒋挈芏礌划䶑躗攔帚鬘吞鄜旡蝠揥哫癪釩勯襬廷驂址擋拏鱎奌郓佐滗衖姙筘髟煜蠣蘧紱笵人椹许朽堃窂锁炆趄脉鼍丗麚官稟钞农档諢胦巤鑫兩賮聳翰孵痴豻詿懼嫂畁獅淄慉罍偓狒浑运歕屛绚瓞欢蒡愦芥疫霪刨环洮躭茲㲱帰吴殿蔾馃咁瘀戈砌渐辗搔唞皝擡拥嗫睪可衬捲潺裹秃魂噀矇酆旋描塌釓乐襖壙囝暠粤銨中麰钴根昽肼箂圇綎焓鎒谑众榖訕紛龚傜楣询峤镫偩狨跮腳山绰齷婵瓴櫸寂瑁雀准泄恉苈飌橕嵛翚顙孟痞陝栢戦皫炯渮騶圴溻梿骃甀钇内伄蚋愈肏倒侑朔尚箙餘嘞疝蕠懥囫瑪僯譬究任決诹晾竃啀鉆惏鹎孌泗詖黛寙祘食嗝獜訣斠萧群咪猩醨缱礵欹攽莼娃霁犆辄撊茉爓檖褕縛鲚砟隞厜裢摧苦忤燨遯艳影緰鱷蹻衿睁㑇煅捉臈廎絍潑跐幛盞镝蚡挦肥瞫倨熯掳脲尰綷嘴贺榿蜾鮃琀傅丄螋膏岍稌儒云氐昔崚領圞璝曡葠惥鹤番鋩凯泭詬黱巷俻松蓽呀症鍆蕊齎譖駟哝纤喪邨檬縱鲰家砵隴樹搽芼嬃禂阁唇玆逅躄舉美鰍猓醒殖蠕缛鶚鞞正敧剩烨酯迮荳糰鵷塵直轻棸西拼铀叆滄扉籍髌卓燒湑桕蛔線驙矞渢搦螥炫在训撳嬰鰶診伸鲃冁㐆垅肋月宍嘒欐愔継锜乢荠查闩囯赬駱跹仿菽鹂區惋曏顎嵌铓豖飛罘黟畜谣掠禤努甩鞨涬礱缵醴洹挽尃纂切璆霅把蔉碎鬍隒沖輕骚徘縟喜転执蓦姤遫葳篰驷燴䵹蹿旼廂煁哆睅敉蟈壎筍呓盒诐潕鵙幟漢股椮読㺵榻丸脾鎇嚅膋簌園井怔儞佢惡凫獪铩埯豬杲飱絶歺慾剀珇镆懋荊柏饎屌闓毗赖駛峙繘勝瑜责抠茧厪隨沬骰縵邴氹戽蒼崃羂送疆阅袄掊禎騍甓蠑涖鮚缟醞浣迢捧藦壤蕳塱魷幵烴襻滸轿擼嗆棄積鳌啓桑胔篚彟氢昦璯鸶檻沿麃邇芋妍缌吒椐袗霜握腠日灪韩槭转摲迹找臽练兀陆拋聊驎彌棗忙絘鳟凝睜踣耧眩閨澬笱庶紵鎴趸布錁皆锅悊蜉餍瘓钒謑準贕稛颚簟湣賢恧寤坩痨鑯諮孱称顷詻佹淸獁釀畅杉祍噓瓒觐他菔磚齙狞酝侣洢芡朦玫吨疯袭枳堰刴殻涿龃劁醇薏㬎覗戔多纙炝阜拡驤叫雩嗯蹬敲罶軹能翃鵂靆腊族魎幌韓轖鯛鷟僝癜悠脧冪瘩钨稱貸弃鈁儇誄憊箎頍閒訑谕笛岘紟鎞嚜慧鍫噩镯诮们饷狴乹泸曼巂牁埆晉寎硍
黌坓痒汕苔孛鹙珞遝' diff --git a/third_party/zhon/zhon/cedict/simplified.py b/third_party/zhon/zhon/cedict/simplified.py new file mode 100644 index 000000000..38b0464c5 --- /dev/null +++ b/third_party/zhon/zhon/cedict/simplified.py @@ -0,0 +1,5 @@ + +"""Provides a string of characters considered to be simplified by CC-CEDICT.""" + + +CHARACTERS = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲
邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇幺丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽
吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀㝉冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊
揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦
盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿讠赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳
邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' diff --git a/third_party/zhon/zhon/cedict/traditional.py b/third_party/zhon/zhon/cedict/traditional.py new file mode 100644 index 000000000..c71317993 --- /dev/null +++ b/third_party/zhon/zhon/cedict/traditional.py @@ -0,0 +1,4 @@ + +"""Provides a string of characters considered traditional by CC-CEDICT.""" + +CHARACTERS = 
'制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛
斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛
嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟
晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤
籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓
鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' diff --git a/third_party/zhon/zhon/hanzi.py b/third_party/zhon/zhon/hanzi.py new file mode 100644 index 000000000..334bc752b --- /dev/null +++ b/third_party/zhon/zhon/hanzi.py @@ -0,0 +1,91 @@ + +"""Constants for working with Chinese characters.""" +import sys + +#: Character code ranges for pertinent CJK ideograph Unicode blocks. +characters = cjk_ideographs = ( + '\u3007' # Ideographic number zero, see issue #17 + '\u4E00-\u9FFF' # CJK Unified Ideographs + '\u3400-\u4DBF' # CJK Unified Ideographs Extension A + '\uF900-\uFAFF' # CJK Compatibility Ideographs +) +if sys.maxunicode > 0xFFFF: + characters += ( + '\U00020000-\U0002A6DF' # CJK Unified Ideographs Extension B + '\U0002A700-\U0002B73F' # CJK Unified Ideographs Extension C + '\U0002B740-\U0002B81F' # CJK Unified Ideographs Extension D + '\U0002F800-\U0002FA1F' # CJK Compatibility Ideographs Supplement + ) + +#: Character code ranges for the Kangxi radicals and CJK Radicals Supplement. +radicals = ( + '\u2F00-\u2FD5' # Kangxi Radicals + '\u2E80-\u2EF3' # CJK Radicals Supplement +) + +#: A string containing Chinese punctuation marks (non-stops). 
+non_stops = ( + # Fullwidth ASCII variants + '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D' + '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F' + '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60' + + # Halfwidth CJK punctuation + '\uFF62\uFF63\uFF64' + + # CJK symbols and punctuation + '\u3000\u3001\u3003' + + # CJK angle and corner brackets + '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011' + + # CJK brackets and symbols/punctuation + '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F' + + # Other CJK symbols + '\u3030' + + # Special CJK indicators + '\u303E\u303F' + + # Dashes + '\u2013\u2014' + + # Quotation marks and apostrophe + '\u2018\u2019\u201B\u201C\u201D\u201E\u201F' + + # General punctuation + '\u2026\u2027' + + # Overscores and underscores + '\uFE4F' + + # Small form variants + '\uFE51\uFE54' + + # Latin punctuation + '\u00B7' +) + +#: A string of Chinese stops. +stops = ( + '\uFF01' # Fullwidth exclamation mark + '\uFF1F' # Fullwidth question mark + '\uFF61' # Halfwidth ideographic full stop + '\u3002' # Ideographic full stop +) + +#: A string containing all Chinese punctuation. +punctuation = non_stops + stops + +# A sentence end is defined by a stop followed by zero or more +# container-closing marks (e.g. quotation or brackets). +_sentence_end = '[{stops}]*'.format(stops=stops) + '[」﹂”』’》)]}〕〗〙〛〉】]*' + +#: A regular expression pattern for a Chinese sentence. A sentence is defined +#: as a series of characters and non-stop punctuation marks followed by a stop +#: and zero or more container-closing punctuation marks (e.g. apostrophe or +# brackets). 
+sent = sentence = '[{characters}{radicals}{non_stops}]*{sentence_end}'.format( + characters=characters, radicals=radicals, non_stops=non_stops, + sentence_end=_sentence_end) \ No newline at end of file diff --git a/third_party/zhon/zhon/pinyin.py b/third_party/zhon/zhon/pinyin.py new file mode 100644 index 000000000..a81dc2845 --- /dev/null +++ b/third_party/zhon/zhon/pinyin.py @@ -0,0 +1,181 @@ + +"""Constants for processing Pinyin strings.""" +from string import whitespace + +_a = 'a\u0101\u00E0\u00E1\u01CE' +_e = 'e\u0113\u00E9\u011B\u00E8' +_i = 'i\u012B\u00ED\u01D0\u00EC' +_o = 'o\u014D\u00F3\u01D2\u00F2' +_u = 'u\u016B\u00FA\u01D4\u00F9' +_v = 'v\u00FC\u01D6\u01D8\u01DA\u01DC' + +_lowercase_vowels = _a + _e + _i + _o + _u + _v +_uppercase_vowels = _lowercase_vowels.upper() +_lowercase_consonants = 'bpmfdtnlgkhjqxzcsrwy' +_uppercase_consonants = _lowercase_consonants.upper() + +#: A string containing every Pinyin vowel (lowercase and uppercase). +vowels = _lowercase_vowels + _uppercase_vowels + +#: A string containing every Pinyin consonant (lowercase and uppercase). +consonants = _lowercase_consonants + _uppercase_consonants + +#: A string containing every lowercase Pinyin character. +lowercase = _lowercase_consonants + _lowercase_vowels + +#: A string containing every uppercase Pinyin character. +uppercase = _uppercase_consonants + _uppercase_vowels + +#: A string containing all Pinyin marks that have special meaning: +#: middle dot and numbers for tones, colon for easily writing \u00FC ('u:'), +#: hyphen for connecting syllables within words, and apostrophe for +#: separating a syllable beginning with a vowel from the previous syllable +#: in its word. All of these marks can be used within a valid Pinyin word. +marks = "·012345:-'" + +#: A string containing valid punctuation marks that are not stops. +non_stops = """"#$%&'()*+,-/:;<=>@[\]^_`{|}~""" + +#: A string containing valid stop punctuation marks. +stops = '.!?' 
+ +#: A string containing all punctuation marks. +punctuation = non_stops + stops + +#: A string containing all printable Pinyin characters, marks, punctuation, +#: and whitespace. +printable = vowels + consonants + marks[:-3] + whitespace + punctuation + +_a_vowels = {'a': _a, 'e': _e, 'i': _i, 'o': _o, 'u': _u, 'v': _v} +_n_vowels = {'a': 'a', 'e': 'e', 'i': 'i', 'o': 'o', 'u': 'u', 'v': 'v\u00FC'} + + +def _build_syl(vowels, tone_numbers=False): + """Builds a Pinyin syllable re pattern. + + Syllables can be preceded by a middle dot (tone mark). Syllables that end + in a consonant are only valid if they aren't followed directly by a vowel + with no apostrophe in between. + + The rough approach used to validate a Pinyin syllable is: + 1. Get the longest valid syllable. + 2. If it ends in a consonant make sure it's not followed directly by a + vowel (hyphens and apostrophes don't count). + 3. If the above didn't match, repeat for the next longest valid match. + + Lookahead assertions are used to ensure that hyphens and apostrophes are + only considered valid if used correctly. This helps to weed out non-Pinyin + strings. + + """ + # This is the end-of-syllable-consonant lookahead assertion. + consonant_end = '(?![{a}{e}{i}{o}{u}{v}]|u:)'.format( + a=_a, e=_e, i=_i, o=_o, u=_u, v=_v + ) + _vowels = vowels.copy() + for v, s in _vowels.items(): + if len(s) > 1: + _vowels[v] = '[{}]'.format(s) + return ( + '(?:\u00B7|\u2027)?' 
+ '(?:' + '(?:(?:[zcs]h|[gkh])u%(a)sng%(consonant_end)s)|' + '(?:[jqx]i%(o)sng%(consonant_end)s)|' + '(?:[nljqx]i%(a)sng%(consonant_end)s)|' + '(?:(?:[zcs]h?|[dtnlgkhrjqxy])u%(a)sn%(consonant_end)s)|' + '(?:(?:[zcs]h|[gkh])u%(a)si)|' + '(?:(?:[zc]h?|[rdtnlgkhsy])%(o)sng%(consonant_end)s)|' + '(?:(?:[zcs]h?|[rbpmfdtnlgkhw])?%(e)sng%(consonant_end)s)|' + '(?:(?:[zcs]h?|[rbpmfdtnlgkhwy])?%(a)sng%(consonant_end)s)|' + '(?:[bpmdtnljqxy]%(i)sng%(consonant_end)s)|' + '(?:[bpmdtnljqx]i%(a)sn%(consonant_end)s)|' + '(?:[bpmdtnljqx]i%(a)so)|' + '(?:[nl](?:v|u:|\u00FC)%(e)s)|' + '(?:[nl](?:%(v)s|u:))|' + '(?:[jqxy]u%(e)s)|' + '(?:[bpmnljqxy]%(i)sn%(consonant_end)s)|' + '(?:[mdnljqx]i%(u)s)|' + '(?:[bpmdtnljqx]i%(e)s)|' + '(?:[dljqx]i%(a)s)|' + '(?:(?:[zcs]h?|[rdtnlgkhxqjy])%(u)sn%(consonant_end)s)|' + '(?:(?:[zcs]h?|[rdtgkh])u%(i)s)|' + '(?:(?:[zcs]h?|[rdtnlgkh])u%(o)s)|' + '(?:(?:[zcs]h|[rgkh])u%(a)s)|' + '(?:(?:[zcs]h?|[rbpmfdngkhw])?%(e)sn%(consonant_end)s)|' + '(?:(?:[zcs]h?|[rbpmfdtnlgkhwy])?%(a)sn%(consonant_end)s)|' + '(?:(?:[zcs]h?|[rpmfdtnlgkhy])?%(o)su)|' + '(?:(?:[zcs]h?|[rbpmdtnlgkhy])?%(a)so)|' + '(?:(?:[zs]h|[bpmfdtnlgkhwz])?%(e)si)|' + '(?:(?:[zcs]h?|[bpmdtnlgkhw])?%(a)si)|' + '(?:(?:[zcs]h?|[rjqxybpmdtnl])%(i)s)|' + '(?:(?:[zcs]h?|[rwbpmfdtnlgkhjqxwy])%(u)s)|' + '(?:%(e)s(?:r%(consonant_end)s)?)|' + '(?:(?:[zcs]h?|[rmdtnlgkhy])%(e)s)|' + '(?:[bpmfwyl]?%(o)s)|' + '(?:(?:[zcs]h|[bpmfdtnlgkhzcswy])?%(a)s)|' + '(?:r%(consonant_end)s)' + ')' + ('[0-5]?' if tone_numbers else '') + ) % { + 'consonant_end': consonant_end, 'a': _vowels['a'], 'e': _vowels['e'], + 'i': _vowels['i'], 'o': _vowels['o'], 'u': _vowels['u'], + 'v': _vowels['v'] + } + + +def _build_word(syl, vowels): + """Builds a Pinyin word re pattern from a Pinyin syllable re pattern. + + A word is defined as a series of consecutive valid Pinyin syllables + with optional hyphens and apostrophes interspersed. Hyphens must be + followed immediately by another valid Pinyin syllable. 
Apostrophes must be + followed by another valid Pinyin syllable that starts with an 'a', 'e', or + 'o'. + + """ + return "(?:{syl}(?:-(?={syl})|'(?=[{a}{e}{o}])(?={syl}))?)+".format( + syl=syl, a=vowels['a'], e=vowels['e'], o=vowels['o']) + + +def _build_sentence(word): + """Builds a Pinyin sentence re pattern from a Pinyin word re pattern. + + A sentence is defined as a series of valid Pinyin words, punctuation + (non-stops), and spaces followed by a single stop and zero or more + container-closing punctuation marks (e.g. apostrophe and brackets). + + """ + return ( + "(?:{word}|[{non_stops}]|(?<![{stops} ]) )+" + "[{stops}]['\"\]\}}\)]*" + ).format(word=word, non_stops=non_stops.replace('-', '\-'), + stops=stops) + + +#: A regular expression pattern for a valid accented Pinyin syllable. +a_syl = acc_syl = accented_syllable = _build_syl(_a_vowels, tone_numbers=False) + +#: A regular expression pattern for a valid numbered Pinyin syllable. +n_syl = num_syl = numbered_syllable = _build_syl(_n_vowels, tone_numbers=True) + +#: A regular expression pattern for a valid Pinyin syllable. +syl = syllable = _build_syl(_a_vowels, tone_numbers=True) + + +#: A regular expression pattern for a valid accented Pinyin word. +a_word = acc_word = accented_word = _build_word(a_syl, _a_vowels) + +#: A regular expression pattern for a valid numbered Pinyin word. +n_word = num_word = numbered_word = _build_word(n_syl, _n_vowels) + +#: A regular expression pattern for a valid Pinyin word. +word = _build_word(syl, _a_vowels) + + +#: A regular expression pattern for a valid accented Pinyin sentence. +a_sent = acc_sent = accented_sentence = _build_sentence(a_word) + +#: A regular expression pattern for a valid numbered Pinyin sentence. +n_sent = num_sent = numbered_sentence = _build_sentence(n_word) + +#: A regular expression pattern for a valid Pinyin sentence. 
+sent = sentence = _build_sentence(word) diff --git a/third_party/zhon/zhon/zhuyin.py b/third_party/zhon/zhon/zhuyin.py new file mode 100644 index 000000000..705895b7b --- /dev/null +++ b/third_party/zhon/zhon/zhuyin.py @@ -0,0 +1,47 @@ + +"""Constants for working with Zhuyin (Bopomofo).""" + +#: A string containing all Zhuyin characters. +characters = ( + 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙ' + 'ㄚㄛㄝㄜㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩㄭ' +) + +#: A string containing all Zhuyin tone marks. +marks = ( + '\u02C7' # Caron + '\u02CA' # Modifier letter accute accent + '\u02CB' # Modifier letter grave accent + '\u02D9' # Dot above +) + +#: A regular expression pattern for a Zhuyin syllable. +syl = syllable = ( + '(?:' + '[ㄇㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄜ|' + '[ㄅㄆㄇㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄗㄘㄙㄧ]?ㄞ|' + '[ㄅㄆㄇㄈㄉㄋㄌㄍㄏㄓㄕㄗ]?ㄟ|' + '[ㄅㄆㄇㄈㄋㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄣ|' + '[ㄉㄌㄐㄑㄒ]?ㄧㄚ|' + '[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄗㄘㄙ]?ㄚ|' + '[ㄅㄆㄇㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄠ|' + '[ㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄡ|' + '[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄢ|' + '[ㄇㄉㄋㄌㄐㄑㄒ]?ㄧㄡ|' + '[ㄅㄆㄇㄋㄌㄐㄑㄒ]?ㄧㄣ|' + '[ㄐㄑㄒ]?ㄩ[ㄢㄥ]|' + '[ㄌㄐㄑㄒ]?ㄩㄣ|' + '[ㄋㄌㄐㄑㄒ]?(?:ㄩㄝ?|ㄧㄤ)|' + '[ㄅㄆㄇㄈㄌㄧ]?ㄛ|' + '[ㄅㄆㄇㄉㄊㄋㄌㄐㄑㄒ]?ㄧ[ㄝㄠㄢㄥ]?|' + '[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?[ㄤㄥ]|' + '[ㄍㄎㄏㄓㄔㄕ]?ㄨ[ㄚㄞㄤ]|' + '[ㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄨㄛ|' + '[ㄉㄊㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄨㄟ|' + '[ㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄨㄢ|' + '[ㄉㄊㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄨㄣ|' + '[ㄉㄊㄋㄌㄍㄎㄏㄓㄔㄖㄗㄘㄙ]?ㄨㄥ|' + '[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄓㄔㄕㄖㄗㄘㄙ]?ㄨ|' + '[ㄓㄔㄕㄖㄗㄘㄙㄝㄦㄧ]' + ')[{marks}]?' +).format(marks=marks) diff --git a/tools/Makefile b/tools/Makefile index ea57cd2c0..012282711 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -1,12 +1,21 @@ PYTHON:= python3.8 -.PHONY: all clean +.PHONY: all clean kenlm -all: virtualenv +all: virtualenv kenlm virtualenv: test -d venv || virtualenv -p $(PYTHON) venv touch venv/bin/activate +kenlm: + # Ubuntu 16.04 透過 apt 會安裝 boost 1.58.0 + # it seems that boost (1.54.0) requires higher version. After I switched to g++-5 it compiles normally. 
+ apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev + apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50 + test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz + mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install + cd kenlm && python setup.py install + clean: rm -fr venv find -iname "*.pyc" -delete diff --git a/utils/ngram_train.sh b/utils/ngram_train.sh new file mode 100644 index 000000000..cba74880f --- /dev/null +++ b/utils/ngram_train.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -e + +order=5 +mem=80% +prune=0 +a=22 +q=8 +b=8 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "$0 exp/text exp/text.arpa" + exit 1 +fi + +text=${1} +arpa=${2} +lmbin=${2}.klm.bin + +# https://kheafield.com/code/kenlm/estimation/ +echo "build arpa lm." +lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; } + +# https://kheafield.com/code/kenlm/ +echo "build binary lm." 
+build_binary -a ${a} -q ${q} -b ${b} trie ${arpa} ${lmbin} || { echo "build kenlm binary error!"; exit -1; } \ No newline at end of file diff --git a/utils/zh_tn.py b/utils/zh_tn.py new file mode 100644 index 000000000..4dcf27431 --- /dev/null +++ b/utils/zh_tn.py @@ -0,0 +1,927 @@ +#!/usr/bin/env python3 +# https://github.com/speechio/chinese_text_normalization/blob/master/python/cn_tn.py +import argparse +import re +import string +import sys +from typing import List +from typing import Text + +import jieba +from zhon import hanzi + +# ================================================================================ # +# basic constant +# ================================================================================ # +CHINESE_DIGIS = u'零一二三四五六七八九' +BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖' +BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖' +SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬' +LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载' +LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載' +SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万' +SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬' + +ZERO_ALT = u'〇' +ONE_ALT = u'幺' +TWO_ALTS = [u'两', u'兩'] + +POSITIVE = [u'正', u'正'] +NEGATIVE = [u'负', u'負'] +POINT = [u'点', u'點'] +# PLUS = [u'加', u'加'] +# SIL = [u'杠', u'槓'] + +# 中文数字系统类型 +NUMBERING_TYPES = ['low', 'mid', 'high'] + +CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \ + '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)' +CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \ + '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \ + '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \ + '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \ + '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \ + 
'纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)' + +# punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) +CHINESE_PUNC_STOP = '!?。。' +CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' +CHINESE_PUNC_OTHER = '·〈〉-' +CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER + + +# ================================================================================ # +# basic class +# ================================================================================ # +class ChineseChar(): + """ + 中文字符 + 每个字符对应简体和繁体, + e.g. 简体 = '负', 繁体 = '負' + 转换时可转换为简体或繁体 + """ + + def __init__(self, simplified, traditional): + self.simplified = simplified + self.traditional = traditional + + def __str__(self): + return self.simplified or self.traditional or None + + def __repr__(self): + return self.__str__() + + +class ChineseNumberUnit(ChineseChar): + """ + 中文数字/数位字符 + 每个字符除繁简体外还有一个额外的大写字符 + e.g. 
'陆' 和 '陸' + """ + + def __init__(self, power, simplified, traditional, big_s, big_t): + super().__init__(simplified, traditional) + self.power = power + self.big_s = big_s + self.big_t = big_t + + def __str__(self): + return '10^{}'.format(self.power) + + @classmethod + def create(cls, + index, + value, + numbering_type=NUMBERING_TYPES[1], + small_unit=False): + + if small_unit: + return ChineseNumberUnit( + power=index + 1, + simplified=value[0], + traditional=value[1], + big_s=value[1], + big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[0]: + return ChineseNumberUnit( + power=index + 8, + simplified=value[0], + traditional=value[1], + big_s=value[0], + big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[1]: + return ChineseNumberUnit( + power=(index + 2) * 4, + simplified=value[0], + traditional=value[1], + big_s=value[0], + big_t=value[1]) + elif numbering_type == NUMBERING_TYPES[2]: + return ChineseNumberUnit( + power=pow(2, index + 3), + simplified=value[0], + traditional=value[1], + big_s=value[0], + big_t=value[1]) + else: + raise ValueError('Counting type should be in {0} ({1} provided).'. 
+ format(NUMBERING_TYPES, numbering_type)) + + +class ChineseNumberDigit(ChineseChar): + """ + 中文数字字符 + """ + + def __init__(self, + value, + simplified, + traditional, + big_s, + big_t, + alt_s=None, + alt_t=None): + super().__init__(simplified, traditional) + self.value = value + self.big_s = big_s + self.big_t = big_t + self.alt_s = alt_s + self.alt_t = alt_t + + def __str__(self): + return str(self.value) + + @classmethod + def create(cls, i, v): + return ChineseNumberDigit(i, v[0], v[1], v[2], v[3]) + + +class ChineseMath(ChineseChar): + """ + 中文数位字符 + """ + + def __init__(self, simplified, traditional, symbol, expression=None): + super().__init__(simplified, traditional) + self.symbol = symbol + self.expression = expression + self.big_s = simplified + self.big_t = traditional + + +CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath + + +class NumberSystem(): + """ + 中文数字系统 + """ + pass + + +class MathSymbol(): + """ + 用于中文数字系统的数学符号 (繁/简体), e.g. + positive = ['正', '正'] + negative = ['负', '負'] + point = ['点', '點'] + """ + + def __init__(self, positive, negative, point): + self.positive = positive + self.negative = negative + self.point = point + + def __iter__(self): + for v in self.__dict__.values(): + yield v + + +# class OtherSymbol(): +# """ +# 其他符号 +# """ +# +# def __init__(self, sil): +# self.sil = sil +# +# def __iter__(self): +# for v in self.__dict__.values(): +# yield v + + +# ================================================================================ # +# basic utils +# ================================================================================ # +def create_system(numbering_type=NUMBERING_TYPES[1]): + """ + 根据数字系统类型返回创建相应的数字系统,默认为 mid + NUMBERING_TYPES = ['low', 'mid', 'high']: 中文数字系统类型 + low: '兆' = '亿' * '十' = $10^{9}$, '京' = '兆' * '十', etc. + mid: '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc. + high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc. 
+ 返回对应的数字系统 + """ + + # chinese number units of '亿' and larger + all_larger_units = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, + LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) + larger_units = [ + CNU.create(i, v, numbering_type, False) + for i, v in enumerate(all_larger_units) + ] + # chinese number units of '十, 百, 千, 万' + all_smaller_units = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) + smaller_units = [ + CNU.create(i, v, small_unit=True) + for i, v in enumerate(all_smaller_units) + ] + # digis + chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, + BIG_CHINESE_DIGIS_SIMPLIFIED, + BIG_CHINESE_DIGIS_TRADITIONAL) + digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)] + digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT + digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT + digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1] + + # symbols + positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x) + negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x) + point_cn = CM(POINT[0], POINT[1], '.', + lambda x, y: float(str(x) + '.' 
+ str(y))) + # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y))) + system = NumberSystem() + system.units = smaller_units + larger_units + system.digits = digits + system.math = MathSymbol(positive_cn, negative_cn, point_cn) + # system.symbols = OtherSymbol(sil_cn) + return system + + +def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]): + def get_symbol(char, system): + for u in system.units: + if char in [u.traditional, u.simplified, u.big_s, u.big_t]: + return u + for d in system.digits: + if char in [ + d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, + d.alt_t + ]: + return d + for m in system.math: + if char in [m.traditional, m.simplified]: + return m + + def string2symbols(chinese_string, system): + int_string, dec_string = chinese_string, '' + for p in [system.math.point.simplified, system.math.point.traditional]: + if p in chinese_string: + int_string, dec_string = chinese_string.split(p) + break + return [get_symbol(c, system) for c in int_string], \ + [get_symbol(c, system) for c in dec_string] + + def correct_symbols(integer_symbols, system): + """ + 一百八 to 一百八十 + 一亿一千三百万 to 一亿 一千万 三百万 + """ + + if integer_symbols and isinstance(integer_symbols[0], CNU): + if integer_symbols[0].power == 1: + integer_symbols = [system.digits[1]] + integer_symbols + + if len(integer_symbols) > 1: + if isinstance(integer_symbols[-1], CND) and isinstance( + integer_symbols[-2], CNU): + integer_symbols.append( + CNU(integer_symbols[-2].power - 1, None, None, None, None)) + + result = [] + unit_count = 0 + for s in integer_symbols: + if isinstance(s, CND): + result.append(s) + unit_count = 0 + elif isinstance(s, CNU): + current_unit = CNU(s.power, None, None, None, None) + unit_count += 1 + + if unit_count == 1: + result.append(current_unit) + elif unit_count > 1: + for i in range(len(result)): + if isinstance( + result[-i - 1], + CNU) and result[-i - 1].power < current_unit.power: + result[-i - 1] = CNU( + result[-i - 1].power + 
current_unit.power, None, + None, None, None) + return result + + def compute_value(integer_symbols): + """ + Compute the value. + When current unit is larger than previous unit, current unit * all previous units will be used as all previous units. + e.g. '两千万' = 2000 * 10000 not 2000 + 10000 + """ + value = [0] + last_power = 0 + for s in integer_symbols: + if isinstance(s, CND): + value[-1] = s.value + elif isinstance(s, CNU): + value[-1] *= pow(10, s.power) + if s.power > last_power: + value[:-1] = list( + map(lambda v: v * pow(10, s.power), value[:-1])) + last_power = s.power + value.append(0) + return sum(value) + + system = create_system(numbering_type) + int_part, dec_part = string2symbols(chinese_string, system) + int_part = correct_symbols(int_part, system) + int_str = str(compute_value(int_part)) + dec_str = ''.join([str(d.value) for d in dec_part]) + if dec_part: + return '{0}.{1}'.format(int_str, dec_str) + else: + return int_str + + +def num2chn(number_string, + numbering_type=NUMBERING_TYPES[1], + big=False, + traditional=False, + alt_zero=False, + alt_one=False, + alt_two=True, + use_zeros=True, + use_units=True): + def get_value(value_string, use_zeros=True): + striped_string = value_string.lstrip('0') + + # record nothing if all zeros + if not striped_string: + return [] + # record one digits + elif len(striped_string) == 1: + if use_zeros and len(value_string) != len(striped_string): + return [system.digits[0], system.digits[int(striped_string)]] + else: + return [system.digits[int(striped_string)]] + # recursively record multiple digits + else: + result_unit = next( + u for u in reversed(system.units) + if u.power < len(striped_string)) + result_string = value_string[:-result_unit.power] + return get_value(result_string) + [result_unit] + get_value( + striped_string[-result_unit.power:]) + + system = create_system(numbering_type) + + int_dec = number_string.split('.') + if len(int_dec) == 1: + int_string = int_dec[0] + dec_string = "" + elif 
len(int_dec) == 2: + int_string = int_dec[0] + dec_string = int_dec[1] + else: + raise ValueError("invalid input num string with more than one dot: {}". + format(number_string)) + + if use_units and len(int_string) > 1: + result_symbols = get_value(int_string) + else: + result_symbols = [system.digits[int(c)] for c in int_string] + + dec_symbols = [system.digits[int(c)] for c in dec_string] + if dec_string: + result_symbols += [system.math.point] + dec_symbols + + if alt_two: + liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, + system.digits[2].big_s, system.digits[2].big_t) + for i, v in enumerate(result_symbols): + if isinstance(v, CND) and v.value == 2: + next_symbol = result_symbols[i + 1] if i < len( + result_symbols) - 1 else None + previous_symbol = result_symbols[i - 1] if i > 0 else None + if isinstance(next_symbol, CNU) and isinstance( + previous_symbol, (CNU, type(None))): + # yapf: disable + if next_symbol.power != 1 and ((previous_symbol is None) or + (previous_symbol.power != 1)): + result_symbols[i] = liang + # yapf: enable + + # if big is True, '两' will not be used and `alt_two` has no impact on output + if big: + attr_name = 'big_' + if traditional: + attr_name += 't' + else: + attr_name += 's' + else: + if traditional: + attr_name = 'traditional' + else: + attr_name = 'simplified' + + result = ''.join([getattr(s, attr_name) for s in result_symbols]) + + # if not use_zeros: + # result = result.strip(getattr(system.digits[0], attr_name)) + + if alt_zero: + result = result.replace( + getattr(system.digits[0], attr_name), system.digits[0].alt_s) + + if alt_one: + result = result.replace( + getattr(system.digits[1], attr_name), system.digits[1].alt_s) + + for i, p in enumerate(POINT): + if result.startswith(p): + return CHINESE_DIGIS[0] + result + + # ^10, 11, .., 19 + if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], + SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \ + result[0] in [CHINESE_DIGIS[1], 
BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]: + result = result[1:] + + return result + + +# ================================================================================ # +# different types of rewriters +# ================================================================================ # +class Cardinal: + """ + CARDINAL类 + """ + + def __init__(self, cardinal=None, chntext=None): + self.cardinal = cardinal + self.chntext = chntext + + def chntext2cardinal(self): + return chn2num(self.chntext) + + def cardinal2chntext(self): + return num2chn(self.cardinal) + + +class Digit: + """ + DIGIT类 + """ + + def __init__(self, digit=None, chntext=None): + self.digit = digit + self.chntext = chntext + + # def chntext2digit(self): + # return chn2num(self.chntext) + + def digit2chntext(self): + return num2chn(self.digit, alt_two=False, use_units=False) + + +class TelePhone: + """ + TELEPHONE类 + """ + + def __init__(self, telephone=None, raw_chntext=None, chntext=None): + self.telephone = telephone + self.raw_chntext = raw_chntext + self.chntext = chntext + + # def chntext2telephone(self): + # sil_parts = self.raw_chntext.split('<SIL>') + # self.telephone = '-'.join([ + # str(chn2num(p)) for p in sil_parts + # ]) + # return self.telephone + + def telephone2chntext(self, fixed=False): + if fixed: + sil_parts = self.telephone.split('-') + self.raw_chntext = '<SIL>'.join([ + num2chn(part, alt_two=False, use_units=False) + for part in sil_parts + ]) + self.chntext = self.raw_chntext.replace('<SIL>', '') + else: + sp_parts = self.telephone.strip('+').split() + self.raw_chntext = '<SP>'.join([ + num2chn(part, alt_two=False, use_units=False) + for part in sp_parts + ]) + self.chntext = self.raw_chntext.replace('<SP>', '') + return self.chntext + + +class Fraction: + """ + FRACTION类 + """ + + def __init__(self, fraction=None, chntext=None): + self.fraction = fraction + self.chntext = chntext + + def chntext2fraction(self): + denominator, numerator = 
class Date:
    """A date expression (e.g. ``1999年2月20日``) and its spoken Chinese form."""

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # NOTE: the reverse direction (chntext -> date) is not implemented.

    def date2chntext(self):
        """Convert ``self.date`` to its reading and cache it in ``self.chntext``.

        The year part (before ``年``) is read through ``Digit``; the month
        (before ``月``) and the day are read through ``Cardinal``.  The last
        character of the day part (``日``/``号``) is kept verbatim.

        Returns:
            The converted text; an empty string when ``self.date`` is empty.
        """
        date = self.date
        try:
            year, other = date.strip().split('年', 1)
            year = Digit(digit=year).digit2chntext() + '年'
        except ValueError:
            # No year marker: the whole string is month/day material.
            other = date
            year = ''

        if other:
            try:
                month, day = other.strip().split('月', 1)
                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
            except ValueError:
                # No month marker: only the part after the year can be the
                # day (the original used the whole date, year included).
                day = other
                month = ''
            if day:
                # day[-1] is the unit character, day[:-1] the number.
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ''
            day = ''

        self.chntext = year + month + day
        return self.chntext


class Money:
    """An amount of money (number(s) plus currency unit) and its reading."""

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    # NOTE: the reverse direction (chntext -> money) is not implemented.

    def money2chntext(self):
        """Replace every number in ``self.money`` by its ``Cardinal`` reading.

        All numbers are rewritten in one left-to-right pass, so a short number
        can never clobber part of a longer one that appears later (as the
        chained ``str.replace`` calls did for inputs like ``5块15``).

        Returns:
            The converted text, also stored in ``self.chntext``.
        """
        self.chntext = re.sub(
            r'(\d+(\.\d+)?)',
            lambda m: Cardinal(cardinal=m.group(1)).cardinal2chntext(),
            self.money)
        return self.chntext


class Percentage:
    """A percentage (e.g. ``80.03%``) and its Chinese reading (``百分之…``)."""

    _PREFIX = '百分之'

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Convert ``百分之<number>`` text to ``<number>%``.

        Only the literal ``百分之`` prefix is removed.  ``str.strip('百分之')``
        would treat the argument as a character set and also eat those
        characters from the number itself (``百分之一百`` -> ``一``).
        """
        text = self.chntext.strip()
        if text.startswith(self._PREFIX):
            text = text[len(self._PREFIX):]
        return chn2num(text) + '%'

    def percentage2chntext(self):
        """Convert ``<number>%`` to ``百分之<number read in Chinese>``."""
        return self._PREFIX + num2chn(self.percentage.strip().strip('%'))


# ================================================================================ #
#                            NSW Normalizer
# ================================================================================ #
def _replace_captured(pattern, text, convert):
    """Rewrite capture group 1 of every match of *pattern* in *text*.

    Only the span of group 1 is replaced by ``convert(group1)``; the context
    the pattern consumed around it is kept.  The substitution happens at the
    matched position, not at the first textual occurrence of the captured
    string.  Matches whose group 1 is empty are left untouched.
    """

    def _rewrite(match):
        captured = match.group(1)
        if not captured:
            return match.group(0)
        whole = match.group(0)
        begin = match.start(1) - match.start()
        end = match.end(1) - match.start()
        return whole[:begin] + convert(captured) + whole[end:]

    return re.sub(pattern, _rewrite, text)


class NSWNormalizer:
    """Rewrite non-standard words (dates, money, numbers, ...) as Chinese text."""

    def __init__(self, raw_text):
        # '^' and '$' are sentinels: several patterns need a non-digit
        # character on each side of a number, even at the text boundaries.
        self.raw_text = '^' + raw_text + '$'
        self.norm_text = ''

    def _particular(self):
        """Restore the digit in letter-2-letter terms such as ``O2O``/``B2C``.

        The plain-number stage turns the ``2`` into ``二``; between two runs of
        ASCII letters it is turned back into ``2``.
        """
        self.norm_text = re.sub(
            r"(([a-zA-Z]+)二([a-zA-Z]+))",
            lambda m: m.group(2) + '2' + m.group(3),
            self.norm_text)
        return self.norm_text

    def normalize(self):
        """Run all normalization stages on the raw text and return the result.

        Stage order matters: specific formats (dates, money, phone numbers,
        fractions, percentages, number + quantifier, long digit strings) are
        handled before the generic plain-number stage.
        """
        text = self.raw_text

        # Dates.
        text = _replace_captured(
            r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)",
            text, lambda s: Date(date=s).date2chntext())

        # Money.
        text = _replace_captured(
            r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" +
            CURRENCY_UNITS + r"?)?)",
            text, lambda s: Money(money=s).money2chntext())

        # Mobile phone numbers.
        # Prefix list: http://www.jihaoba.com/news/show/13680
        #   China Mobile:  139 138 137 136 135 134 159 158 157 150 151 152
        #                  188 187 182 183 184 178 198
        #   China Unicom:  130 131 132 156 155 186 185 176
        #   China Telecom: 133 153 189 180 181 177
        text = _replace_captured(
            r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D",
            text, lambda s: TelePhone(telephone=s).telephone2chntext())

        # Fixed-line phone numbers.
        text = _replace_captured(
            r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D",
            text,
            lambda s: TelePhone(telephone=s).telephone2chntext(fixed=True))

        # Fractions.
        text = _replace_captured(
            r"(\d+/\d+)",
            text, lambda s: Fraction(fraction=s).fraction2chntext())

        # Percentages; full-width percent signs are folded to ASCII first.
        text = text.replace('\uff05', '%')
        text = _replace_captured(
            r"(\d+(\.\d+)?%)",
            text, lambda s: Percentage(percentage=s).percentage2chntext())

        # Number followed by a quantifier: only the number is rewritten.
        text = _replace_captured(
            r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS,
            text, lambda s: Cardinal(cardinal=s).cardinal2chntext())

        # Long digit strings (IDs, serial numbers) are read digit by digit.
        text = _replace_captured(
            r"(\d{4,32})",
            text, lambda s: Digit(digit=s).digit2chntext())

        # Remaining plain numbers.
        text = _replace_captured(
            r"(\d+(\.\d+)?)",
            text, lambda s: Cardinal(cardinal=s).cardinal2chntext())

        self.norm_text = text
        self._particular()

        return self.norm_text.lstrip('^').rstrip('$')


def nsw_test_case(raw_text):
    """Print *raw_text* and its normalized form (manual smoke test)."""
    print('I:' + raw_text)
    print('O:' + NSWNormalizer(raw_text).normalize())
    print('')


def nsw_test():
    """Run the normalizer on a fixed set of sample sentences and print them."""
    samples = (
        '固话:0595-23865596或23880880。',
        '固话:0595-23865596或23880880。',
        '手机:+86 19859213959或15659451527。',
        '分数:32477/76391。',
        '百分数:80.03%。',
        '编号:31520181154418。',
        '纯数:2983.07克或12345.60米。',
        '日期:1999年2月20日或09年3月15号。',
        '金钱:12块5,34.5元,20.1万',
        '特殊:O2O或B2C。',
        '3456万吨',
        '2938个',
        '938',
        '今天吃了115个小笼包231个馒头',
        '有62%的概率',
    )
    for sample in samples:
        nsw_test_case(sample)


def char_token(s: Text) -> List[Text]:
    """Split text into single characters.

    Args:
        s (Text): text to tokenize.

    Returns:
        List[Text]: one list item per character of ``s``.
    """
    return list(s)


def word_token(s: Text) -> List[Text]:
    """Split text into words with ``jieba.lcut``.

    Args:
        s (Text): text to tokenize.

    Returns:
        List[Text]: the segments returned by ``jieba.lcut``.
    """
    return jieba.lcut(s)


def text_process(s: Text) -> Text:
    """Normalize one piece of Chinese text.

    Steps: drop ``*``, rewrite non-standard words with ``NSWNormalizer``,
    remove Chinese and ASCII punctuation, then keep only what ``hanzi.sent``
    matches (which drops the remaining non-Chinese material).

    Args:
        s (Text): raw text.

    Returns:
        Text: normalized text.
    """
    s = s.replace('*', '')
    # NSW (Non-Standard-Word) normalization.
    s = NSWNormalizer(s).normalize()
    # Punctuation removal.  The characters are escaped because ']', '\\', '^'
    # and '-' are special inside a character class (unescaped, the backslash
    # of string.punctuation was silently consumed as an escape for ']').
    punctuation = re.escape(hanzi.punctuation + string.punctuation)
    s = re.sub('[' + punctuation + ']', "", s)
    # Remove non-Chinese leftovers (e.g. English).
    s = ''.join(re.findall(hanzi.sent, s))
    return s


def _normalize_line(line, args, tokenizer):
    """Normalize one input line.

    Returns:
        The record to write (newline-terminated), or ``None`` when the line
        produces no output.
    """
    if args.has_key:
        cols = line.split(maxsplit=1)
        if not cols:
            # Blank line in a keyed file: there is no key to emit
            # (the original crashed with IndexError on cols[0]).
            return None
        key = cols[0]
        text = cols[1] if len(cols) == 2 else ''
    else:
        key = ''
        text = line

    text = text.strip()
    if args.to_upper:
        text = text.upper()
    if args.to_lower:
        text = text.lower()

    text = text_process(text)
    if tokenizer:
        text = ' '.join(tokenizer(text))

    if args.has_key:
        return key + '\t' + text + '\n'
    # Pure text format (without Kaldi's utt key): skip empty lines.
    if text.strip() != '':
        return text + '\n'
    return None


def main(infile, outfile, args):
    """Normalize every line of *infile* and write the result to *outfile*.

    Args:
        infile: path of the UTF-8 input text file.
        outfile: path of the output file (written as UTF-8).
        args: options object providing ``token_type`` (``'char'``, ``'word'``
            or anything else for no tokenization), ``to_upper``, ``to_lower``,
            ``has_key`` and ``log_interval``.

    Raises:
        SystemExit: with code 1 when both ``to_upper`` and ``to_lower`` are
            set.  This is checked before any file is opened so the output
            file is not truncated on a usage error.
    """
    if args.to_upper and args.to_lower:
        sys.stderr.write('to_upper OR to_lower?')
        sys.exit(1)

    tokenizer = {'char': char_token, 'word': word_token}.get(args.token_type)

    # The encoding is explicit so behaviour does not depend on the locale.
    with open(infile, 'rt', encoding='utf-8') as fin, \
            open(outfile, 'wt', encoding='utf-8') as fout:
        for n, line in enumerate(fin, 1):
            record = _normalize_line(line, args, tokenizer)
            if record is not None:
                fout.write(record)
            if n % args.log_interval == 0:
                print(f"process {n} lines.", file=sys.stderr)


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    # Required positional; any value other than 'char'/'word' disables
    # tokenization.
    p.add_argument('token_type', help='token type. [char|word]')
    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
    p.add_argument('ofile', help='output filename')
    p.add_argument(
        '--to_upper', action='store_true', help='convert to upper case')
    p.add_argument(
        '--to_lower', action='store_true', help='convert to lower case')
    p.add_argument(
        '--has_key',
        action='store_true',
        help="input text has Kaldi's key as first field.")
    p.add_argument(
        '--log_interval',
        type=int,
        default=100000,
        help='log interval in number of processed lines')
    args = p.parse_args()

    main(args.ifile, args.ofile, args)