You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
4.6 KiB
199 lines
4.6 KiB
#include <cstdio>
|
|
|
|
#include "dict.h"
|
|
|
|
using namespace std;
|
|
|
|
namespace rmmseg {
|
|
struct Entry {
|
|
Word *word;
|
|
Entry *next;
|
|
};
|
|
|
|
/* Number of bins the table is created with (2^18 + 3, i.e. 262147). */
const size_t init_size = 262144 + 3;

/*
 * Threshold for growing the table: add() asks for a rehash once
 * n_entries / n_bins exceeds this value.
 */
const size_t max_density = 5;
|
|
/*
 * Table of prime numbers 2^n + a, 19 <= n <= 30, in ascending order.
 *
 * These are the bin counts the table can grow to: new_size() returns the
 * first entry that is larger than the current number of bins.  The table is
 * read-only, hence const.
 */
static const size_t primes[] = {
    524288 + 21,      /* 2^19 */
    1048576 + 7,      /* 2^20 */
    2097152 + 17,     /* 2^21 */
    4194304 + 15,     /* 2^22 */
    8388608 + 9,      /* 2^23 */
    16777216 + 43,    /* 2^24 */
    33554432 + 35,    /* 2^25 */
    67108864 + 15,    /* 2^26 */
    134217728 + 29,   /* 2^27 */
    268435456 + 3,    /* 2^28 */
    536870912 + 11,   /* 2^29 */
    1073741824 + 85,  /* 2^30 */
};
|
|
|
|
|
|
static size_t n_bins = init_size;
|
|
static size_t n_entries = 0;
|
|
static Entry **bins =
|
|
static_cast<Entry **>(std::calloc(init_size, sizeof(Entry *)));
|
|
|
|
static size_t new_size() {
|
|
for (size_t i = 0; i < sizeof(primes) / sizeof(primes[0]); ++i) {
|
|
if (primes[i] > n_bins) {
|
|
return primes[i];
|
|
}
|
|
}
|
|
// TODO: raise exception here
|
|
return n_bins;
|
|
}
|
|
|
|
/*
 * Hash the first len bytes of str (add / shift / xor mixing per byte,
 * followed by a final avalanche step).
 *
 * str: base of the byte sequence; it does not need to be nul-terminated
 * len: number of bytes to hash; a value <= 0 hashes nothing and yields 0
 *
 * Each byte is added as a plain char, so bytes >= 0x80 contribute according
 * to the platform's char signedness.  That is consistent within one process,
 * which is all the in-memory table needs.
 */
static unsigned int hash(const char *str, int len) {
    unsigned int key = 0;
    /* "len > 0" rather than "len--": a negative length must not turn into a
     * huge out-of-bounds read. */
    for (; len > 0; --len) {
        key += *str++;
        key += (key << 10);
        key ^= (key >> 6);
    }
    key += (key << 3);
    key ^= (key >> 11);
    key += (key << 15);
    return key;
}
|
|
|
|
static void rehash() {
|
|
size_t new_n_bins = new_size();
|
|
Entry **new_bins =
|
|
static_cast<Entry **>(calloc(new_n_bins, sizeof(Entry *)));
|
|
Entry *entry, *next;
|
|
unsigned int hash_val;
|
|
|
|
for (size_t i = 0; i < n_bins; ++i) {
|
|
entry = bins[i];
|
|
while (entry) {
|
|
next = entry->next;
|
|
hash_val =
|
|
hash(entry->word->text, entry->word->nbytes) % new_n_bins;
|
|
entry->next = new_bins[hash_val];
|
|
new_bins[hash_val] = entry;
|
|
entry = next;
|
|
}
|
|
}
|
|
free(bins);
|
|
n_bins = new_n_bins;
|
|
bins = new_bins;
|
|
}
|
|
|
|
namespace dict {
|
|
|
|
/**
|
|
* str: the base of the string
|
|
* len: length of the string (in bytes)
|
|
*
|
|
* str may be a substring of a big chunk of text thus not nul-terminated,
|
|
* so len is necessary here.
|
|
*/
|
|
Word *get(const char *str, int len) {
|
|
unsigned int h = hash(str, len) % n_bins;
|
|
Entry *entry = bins[h];
|
|
if (!entry) return NULL;
|
|
do {
|
|
if (len == entry->word->nbytes &&
|
|
strncmp(str, entry->word->text, len) == 0)
|
|
return entry->word;
|
|
entry = entry->next;
|
|
} while (entry);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/**
 * Insert word into the table, or replace the stored Word that has the same
 * length and bytes.
 *
 * word: the word to store; its text and nbytes fields form the key.
 *
 * The new Entry node is obtained from pool_alloc (declared elsewhere).
 */
void add(Word *word) {
    const unsigned int hash_val = hash(word->text, word->nbytes);
    size_t h = hash_val % n_bins;

    for (Entry *entry = bins[h]; entry; entry = entry->next) {
        if (word->nbytes == entry->word->nbytes &&
            strncmp(word->text, entry->word->text, word->nbytes) == 0) {
            /* Overwriting. WARNING: the original Word object is
             * permanently lost. This IS a memory leak, because
             * the memory is allocated by pool_alloc. Instead of
             * fixing this, tuning the dictionary file is a better
             * idea
             */
            entry->word = word;
            return;
        }
    }

    /* As before, growth is only considered when the target bin is empty. */
    if (!bins[h] && n_entries / n_bins > max_density) {
        rehash();
        h = hash_val % n_bins;
    }

    Entry *entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
    entry->word = word;
    /* Always link in front of the current chain.  After a rehash the bin
     * for this word may already hold entries moved from other bins, so
     * storing the node with next = NULL would drop them. */
    entry->next = bins[h];
    bins[h] = entry;
    n_entries++;
}
|
|
|
|
bool load_chars(const char *filename) {
|
|
FILE *fp = fopen(filename, "r");
|
|
if (!fp) {
|
|
return false;
|
|
}
|
|
|
|
const size_t buf_len = 24;
|
|
char buf[buf_len];
|
|
char *ptr;
|
|
|
|
while (fgets(buf, buf_len, fp)) {
|
|
// NOTE: there SHOULD be a newline at the end of the file
|
|
buf[strlen(buf) - 1] = '\0'; // truncate the newline
|
|
ptr = strchr(buf, ' ');
|
|
if (!ptr) continue; // illegal input
|
|
*ptr = '\0';
|
|
add(make_word(ptr + 1, 1, atoi(buf)));
|
|
}
|
|
|
|
fclose(fp);
|
|
return true;
|
|
}
|
|
|
|
bool load_words(const char *filename) {
|
|
FILE *fp = fopen(filename, "r");
|
|
if (!fp) {
|
|
return false;
|
|
}
|
|
|
|
const int buf_len = 48;
|
|
char buf[buf_len];
|
|
char *ptr;
|
|
|
|
while (fgets(buf, buf_len, fp)) {
|
|
// NOTE: there SHOULD be a newline at the end of the file
|
|
buf[strlen(buf) - 1] = '\0'; // truncate the newline
|
|
ptr = strchr(buf, ' ');
|
|
if (!ptr) continue; // illegal input
|
|
*ptr = '\0';
|
|
add(make_word(ptr + 1, atoi(buf), 0));
|
|
}
|
|
|
|
fclose(fp);
|
|
return true;
|
|
}
|
|
}
|
|
}
|