download and parse cedict

4 years ago · fad7349e2d
parent f22f681992
commit fad7349e2d
5 changed files with 136 additions and 0 deletions
--- a/examples/cc-cedict/.gitignore
+++ b/examples/cc-cedict/.gitignore
@ -0,0 +1,2 @@
+data
+exp
--- a/examples/cc-cedict/README.md
+++ b/examples/cc-cedict/README.md
@ -0,0 +1,7 @@
+# Ngram LM
+
+Train a Chinese character n-gram LM with [kenlm](https://github.com/kpu/kenlm).
+
+```
+bash run.sh
+```
--- a/examples/cc-cedict/local/parser.py
+++ b/examples/cc-cedict/local/parser.py
@ -0,0 +1,78 @@
+# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
+
+#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
+
+#Pass the path of the CC-CEDICT file (for example cedict_ts.u8) as the first command-line argument and the path of the output file as the second.
+
+#Lines that start with '#' or '%' are skipped by parse_line, so comment lines of that form at the top of the CEDICT text file do not need to be deleted by hand.
+
+#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply remove the call to remove_surnames() in main().
+
+#This code was written by Franki Allegra in February 2020.
+
+
+import sys
+import json
+
+# usage: bin ccedict dump.json
+
+with open(sys.argv[1], 'rt') as file:
+    text = file.read()
+    lines = text.split('\n')
+    dict_lines = list(lines)
+
+    def parse_line(line):
+        parsed = {}
+        if line == '':
+            dict_lines.remove(line)
+            return 0
+        if line.startswith('#'):
+            return 0
+        if line.startswith('%'):
+            return 0
+        line = line.rstrip('/')
+        line = line.split('/')
+        if len(line) <= 1:
+            return 0
+        english = line[1]
+        char_and_pinyin = line[0].split('[')
+        characters = char_and_pinyin[0]
+        characters = characters.split()
+        traditional = characters[0]
+        simplified = characters[1]
+        pinyin = char_and_pinyin[1]
+        pinyin = pinyin.rstrip()
+        pinyin = pinyin.rstrip("]")
+        parsed['traditional'] = traditional
+        parsed['simplified'] = simplified
+        parsed['pinyin'] = pinyin
+        parsed['english'] = english
+        list_of_dicts.append(parsed)
+
+    def remove_surnames():
+        for x in range(len(list_of_dicts)-1, -1, -1):
+            if "surname " in list_of_dicts[x]['english']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                    list_of_dicts.pop(x)
+
+    def main():
+
+        #make each line into a dictionary
+        print("Parsing dictionary . . .")
+        for line in dict_lines:
+                parse_line(line)
+
+        #remove entries for surnames from the data (optional):
+        print("Removing Surnames . . .")
+        remove_surnames()
+
+
+        print("Saving to database (this may take a few minutes) . . .")
+        with open(sys.argv[2], 'wt') as fout:
+            for one_dict in list_of_dicts:
+                json_str = json.dumps(one_dict)
+                fout.write(json_str + "\n")
+        print('Done!')
+
+# Accumulator for the parsed entries: parse_line() appends to it and
+# remove_surnames() pops from it.
+list_of_dicts = []
+# Run the script. main() has no return statement, so parsed_dict is always
+# None; the entries are written to the file named by sys.argv[2] instead.
+parsed_dict = main()
--- a/examples/cc-cedict/path.sh
+++ b/examples/cc-cedict/path.sh
@ -0,0 +1,10 @@
+# Repository root, resolved as two levels above the current working directory.
+export MAIN_ROOT=${PWD}/../../
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Append the previous value only when it is set and non-empty: a trailing ':'
+# would create an empty search-path entry, which stands for the current
+# working directory.
+export PYTHONPATH=${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}
+
+export LD_LIBRARY_PATH=/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
--- a/examples/cc-cedict/run.sh
+++ b/examples/cc-cedict/run.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+
+# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
+# The word dictionary of this website is based on CC-CEDICT.
+# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
+# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
+# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
+# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.
+
+set -e
+source path.sh
+
+stage=-1
+stop_stage=100
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
+
+
+cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
+cedict=cedict_1_0_ts_utf-8_mdbg.zip
+
+mkdir -p data
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ];then
+    test -f data/${cedict} || wget -O data/${cedict} ${cedict_url}
+    pushd data
+    unzip ${cedict}
+    popd
+
+fi
+
+# Output directory for the parsed dictionary.
+mkdir -p exp
+
+# Stage 0: copy the unpacked dictionary to exp/cedict and convert it with
+# local/parser.py into exp/cedict.json.
+if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
+    cp data/cedict_ts.u8 exp/cedict
+    python3 local/parser.py exp/cedict exp/cedict.json
+fi
+