From fad7349e2d91cc0673cc17dd2541a4a8f86a87e2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 21 May 2021 06:56:07 +0000 Subject: [PATCH] down and parse cedict --- examples/cc-cedict/.gitignore | 2 + examples/cc-cedict/README.md | 7 +++ examples/cc-cedict/local/parser.py | 78 ++++++++++++++++++++++++++++++ examples/cc-cedict/path.sh | 10 ++++ examples/cc-cedict/run.sh | 39 +++++++++++++++ 5 files changed, 136 insertions(+) create mode 100644 examples/cc-cedict/.gitignore create mode 100644 examples/cc-cedict/README.md create mode 100644 examples/cc-cedict/local/parser.py create mode 100644 examples/cc-cedict/path.sh create mode 100755 examples/cc-cedict/run.sh diff --git a/examples/cc-cedict/.gitignore b/examples/cc-cedict/.gitignore new file mode 100644 index 000000000..bbd86a25b --- /dev/null +++ b/examples/cc-cedict/.gitignore @@ -0,0 +1,2 @@ +data +exp diff --git a/examples/cc-cedict/README.md b/examples/cc-cedict/README.md new file mode 100644 index 000000000..698d7c290 --- /dev/null +++ b/examples/cc-cedict/README.md @@ -0,0 +1,7 @@ +# Ngram LM + +Train Chinese character ngram LM by [kenlm](https://github.com/kpu/kenlm). + +``` +bash run.sh +``` diff --git a/examples/cc-cedict/local/parser.py b/examples/cc-cedict/local/parser.py new file mode 100644 index 000000000..d6acb834f --- /dev/null +++ b/examples/cc-cedict/local/parser.py @@ -0,0 +1,78 @@ +# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py + +#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys. + +#Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13. + +#Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message. 
# Characters that are commonly used as surnames have two entries in CC-CEDICT.
# This program removes the surname entry when the entry that follows it is for
# the same (traditional) headword. To keep the surnames, skip the
# remove_surnames() call in main().
#
# This code was written by Franki Allegra in February 2020.

import sys
import json

# usage: parser.py <cedict_ts.u8> <dump.json>


def parse_line(line):
    """Parse one CC-CEDICT line into a dictionary.

    The expected layout is ``TRAD SIMP [pin1 yin1] /gloss 1/gloss 2/``.

    Args:
        line: One line of the dictionary file, without its trailing newline.

    Returns:
        A dict with ``traditional``, ``simplified``, ``pinyin`` and
        ``english`` keys (``english`` holds only the first gloss), or
        ``None`` when the line is blank, starts with ``#`` or ``%``, or
        contains no ``/``-delimited gloss.

    Raises:
        ValueError: If the line has a gloss but its headword part lacks the
            two character forms or the ``[pinyin]`` field.
    """
    if not line or line.startswith(('#', '%')):
        return None
    fields = line.rstrip('/').split('/')
    if len(fields) <= 1:
        return None
    head = fields[0].split('[')
    characters = head[0].split()
    if len(head) < 2 or len(characters) < 2:
        # Report the offending text instead of failing with a bare IndexError.
        raise ValueError("malformed CC-CEDICT entry: %r" % line)
    return {
        'traditional': characters[0],
        'simplified': characters[1],
        'pinyin': head[1].rstrip().rstrip(']'),
        'english': fields[1],
    }


def parse_lines(lines):
    """Parse an iterable of CC-CEDICT lines into a list of entry dicts.

    Blank lines and comments are skipped. Malformed entries are reported on
    stderr and skipped so a single bad line does not abort the whole run.
    The input is never modified while it is being iterated, so no line is
    skipped by accident.

    Args:
        lines: Iterable of strings; trailing line terminators are ignored.

    Returns:
        The parsed entries, in input order.
    """
    entries = []
    for number, raw in enumerate(lines, 1):
        try:
            entry = parse_line(raw.rstrip('\r\n'))
        except ValueError as err:
            print("line %d skipped: %s" % (number, err), file=sys.stderr)
            continue
        if entry is not None:
            entries.append(entry)
    return entries


def remove_surnames(entries):
    """Return *entries* without surname entries that duplicate the next one.

    An entry is dropped when its ``english`` text contains ``"surname "`` and
    the entry directly after it has the same ``traditional`` headword. The
    last entry has no successor and is therefore always kept.

    Args:
        entries: List of dicts as produced by ``parse_line``.

    Returns:
        A new list; the input list is left untouched.
    """
    kept = []
    last = len(entries) - 1
    for index, entry in enumerate(entries):
        if (index < last
                and "surname " in entry['english']
                and entry['traditional'] == entries[index + 1]['traditional']):
            continue
        kept.append(entry)
    return kept


def main(argv=None):
    """Convert a CC-CEDICT file into a JSON-lines file.

    Args:
        argv: ``[input_path, output_path]``; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 on wrong usage.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print("usage: parser.py <cedict_ts.u8> <dump.json>", file=sys.stderr)
        return 1
    source_path, output_path = args

    # make each line into a dictionary
    print("Parsing dictionary . . .")
    # The encoding is explicit because open() follows the locale, not
    # PYTHONIOENCODING, and the dictionary is distributed as UTF-8.
    with open(source_path, 'rt', encoding='utf-8') as fin:
        entries = parse_lines(fin)

    # remove entries for surnames from the data (optional):
    print("Removing Surnames . . .")
    entries = remove_surnames(entries)

    print("Saving to database (this may take a few minutes) . . .")
    with open(output_path, 'wt', encoding='utf-8') as fout:
        for entry in entries:
            fout.write(json.dumps(entry) + "\n")
    print('Done!')
    return 0


if __name__ == "__main__":
    sys.exit(main())
set -e
source path.sh

# First and last stage to run (inclusive). These are set before sourcing
# parse_options.sh, which presumably lets them be overridden from the
# command line -- confirm against utils/parse_options.sh.
stage=-1
stop_stage=100

# Exit statuses are limited to 0-255, so fail with 1 rather than -1.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
cedict=cedict_1_0_ts_utf-8_mdbg.zip

mkdir -p data

# Stage -1: download and unpack the CC-CEDICT archive.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    if [ ! -f "data/${cedict}" ]; then
        # Download under a temporary name so that an interrupted transfer is
        # not mistaken for a complete archive on the next run.
        wget -O "data/${cedict}.part" "${cedict_url}"
        mv "data/${cedict}.part" "data/${cedict}"
    fi
    pushd data
    # -o overwrites existing files without prompting, so re-running this
    # stage does not block waiting for input.
    unzip -o "${cedict}"
    popd
fi

mkdir -p exp

# Stage 0: run the parser on a copy of the unpacked dictionary.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    cp data/cedict_ts.u8 exp/cedict
    python3 local/parser.py exp/cedict exp/cedict.json
fi