parent
f22f681992
commit
fad7349e2d
@ -0,0 +1,2 @@
|
||||
data
|
||||
exp
|
@ -0,0 +1,7 @@
|
||||
# Ngram LM
|
||||
|
||||
Train a Chinese character n-gram LM with [kenlm](https://github.com/kpu/kenlm).
|
||||
|
||||
```
|
||||
bash run.sh
|
||||
```
|
@ -0,0 +1,78 @@
|
||||
# https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
|
||||
|
||||
#A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
|
||||
|
||||
#Pass the path to the CC-CEDICT file (e.g. cedict_ts.u8) as the first command-line argument and the path of the JSON file to write as the second.
|
||||
|
||||
#Lines starting with '#' (such as the copyright information at the top of the CEDICT text file) are skipped by the parser, so the file can be used as downloaded.
|
||||
|
||||
#Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply remove the call to remove_surnames() in main().
|
||||
|
||||
#This code was written by Franki Allegra in February 2020.
|
||||
|
||||
|
||||
import sys
|
||||
import json
|
||||
|
||||
# usage: python3 parser.py <cedict_ts.u8> <dump.json>
|
||||
|
||||
# Read the whole dictionary file named by the first command-line argument.
# The encoding is given explicitly: without it open() decodes with the
# locale's preferred encoding, which is not reliably UTF-8 (for example
# under LC_ALL=C) and then fails on the Chinese text.
with open(sys.argv[1], 'rt', encoding='utf-8') as file:
    text = file.read()

# One list element per input line; the copy is what main() iterates over.
lines = text.split('\n')
dict_lines = list(lines)
|
||||
|
||||
def parse_line(line):
    """Parse one CC-CEDICT line and append the entry to ``list_of_dicts``.

    The line is split on ``/``: the part before the first slash holds the
    traditional form, the simplified form and the bracketed pinyin; the
    part after it is the first English gloss. Later glosses are ignored.

    Args:
        line: One line of the dictionary file, without its newline.

    Returns:
        0 when the line is skipped (blank, starting with ``#`` or ``%``,
        or not a well-formed entry); ``None`` after an entry was appended.
    """
    # Blank lines and '#' / '%' prefixed lines carry no entry. The blank
    # line is deliberately NOT removed from the shared ``dict_lines`` list:
    # the caller is iterating that list, and removing an element during
    # iteration makes the loop skip the element that follows it.
    if not line or line.startswith(('#', '%')):
        return 0

    fields = line.rstrip('/').split('/')
    if len(fields) <= 1:
        return 0

    # fields[0] is "<traditional> <simplified> [<pinyin>] ".
    char_and_pinyin = fields[0].split('[')
    if len(char_and_pinyin) < 2:
        # No pinyin bracket: not a dictionary entry, skip instead of crashing.
        return 0

    characters = char_and_pinyin[0].split()
    if len(characters) < 2:
        # Both the traditional and the simplified form are required.
        return 0

    # Drop the trailing whitespace first, then the closing bracket.
    pinyin = char_and_pinyin[1].rstrip().rstrip("]")

    list_of_dicts.append({
        'traditional': characters[0],
        'simplified': characters[1],
        'pinyin': pinyin,
        'english': fields[1],
    })
|
||||
|
||||
def remove_surnames():
    """Remove surname entries that precede another entry for the same word.

    An entry is removed from the module-level ``list_of_dicts`` (in place)
    when its ``english`` value contains ``"surname "`` and the entry that
    follows it has the same ``traditional`` value.
    """
    # Walk backwards so that popping an entry never shifts the indices that
    # are still to be visited. Start at the second-to-last entry: the last
    # one has no successor to compare with, and indexing past the end would
    # raise IndexError.
    for x in range(len(list_of_dicts) - 2, -1, -1):
        entry = list_of_dicts[x]
        if "surname " not in entry['english']:
            continue
        if entry['traditional'] == list_of_dicts[x + 1]['traditional']:
            list_of_dicts.pop(x)
|
||||
|
||||
def main():
    """Convert the loaded dictionary lines to JSON lines.

    Parses every element of ``dict_lines`` into ``list_of_dicts``, removes
    redundant surname entries, and writes one JSON object per line to the
    file named by the second command-line argument.
    """
    # make each line into a dictionary
    print("Parsing dictionary . . .")
    # Iterate over a snapshot: parse_line() may mutate dict_lines, and
    # changing a list while looping over it makes the loop skip elements.
    for line in tuple(dict_lines):
        parse_line(line)

    # remove entries for surnames from the data (optional):
    print("Removing Surnames . . .")
    remove_surnames()

    print("Saving to database (this may take a few minutes) . . .")
    # Explicit encoding so the output does not depend on the locale.
    with open(sys.argv[2], 'wt', encoding='utf-8') as fout:
        fout.writelines(
            json.dumps(one_dict) + "\n" for one_dict in list_of_dicts)
    print('Done!')
|
||||
|
||||
# Module-level accumulator: parse_line() appends to it, remove_surnames()
# prunes it, and main() writes it out.
list_of_dicts = []

# Run the conversion only when executed as a script. main() has no return
# statement, so parsed_dict is always None; the name is kept for
# compatibility.
if __name__ == "__main__":
    parsed_dict = main()
|
@ -0,0 +1,10 @@
|
||||
# Environment setup sourced by run.sh. MAIN_ROOT is resolved relative to the
# directory the script is started from.
export MAIN_ROOT="${PWD}/../../"

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous value only when it is non-empty, so that an unset
# variable does not leave a trailing ':' (an empty path element).
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

export LD_LIBRARY_PATH="/usr/local/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
@ -0,0 +1,39 @@
|
||||
#!/bin/bash

# CC-CEDICT download: https://www.mdbg.net/chinese/dictionary?page=cc-cedict
# The word dictionary of this website is based on CC-CEDICT.
# CC-CEDICT is a continuation of the CEDICT project started by Paul Denisowski in 1997 with the
# aim to provide a complete downloadable Chinese to English dictionary with pronunciation in pinyin for the Chinese characters.
# This website allows you to easily add new entries or correct existing entries in CC-CEDICT.
# Submitted entries will be checked and processed frequently and released for download in CEDICT format on this page.

set -e
source path.sh

# Stages: -1 downloads and unpacks the dictionary, 0 converts it to JSON lines.
stage=-1
stop_stage=100

# Exit statuses must be in 0-255, so use 1 rather than -1.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

cedict_url=https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.zip
cedict=cedict_1_0_ts_utf-8_mdbg.zip

mkdir -p data

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    # Download under a temporary name and rename on success: wget -O creates
    # its target even when the transfer fails, and a leftover partial file
    # would otherwise be accepted as "already downloaded" on the next run.
    if [ ! -f "data/${cedict}" ]; then
        wget -O "data/${cedict}.tmp" "${cedict_url}"
        mv "data/${cedict}.tmp" "data/${cedict}"
    fi
    pushd data
    # -o overwrites without prompting, so the stage can be re-run unattended.
    unzip -o "${cedict}"
    popd
fi

mkdir -p exp

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    cp data/cedict_ts.u8 exp/cedict
    python3 local/parser.py exp/cedict exp/cedict.json
fi
|
||||
|
Loading…
Reference in new issue