#include #include "dict.h" using namespace std; namespace rmmseg { struct Entry { Word *word; Entry *next; }; const size_t init_size = 262147; const size_t max_density = 5; /* Table of prime numbers 2^n+a, 2<=n<=30. */ static size_t primes[] = { 524288 + 21, 1048576 + 7, 2097152 + 17, 4194304 + 15, 8388608 + 9, 16777216 + 43, 33554432 + 35, 67108864 + 15, 134217728 + 29, 268435456 + 3, 536870912 + 11, 1073741824 + 85, }; static size_t n_bins = init_size; static size_t n_entries = 0; static Entry **bins = static_cast(std::calloc(init_size, sizeof(Entry *))); static size_t new_size() { for (size_t i = 0; i < sizeof(primes) / sizeof(primes[0]); ++i) { if (primes[i] > n_bins) { return primes[i]; } } // TODO: raise exception here return n_bins; } static unsigned int hash(const char *str, int len) { unsigned int key = 0; while (len--) { key += *str++; key += (key << 10); key ^= (key >> 6); } key += (key << 3); key ^= (key >> 11); key += (key << 15); return key; } static void rehash() { size_t new_n_bins = new_size(); Entry **new_bins = static_cast(calloc(new_n_bins, sizeof(Entry *))); Entry *entry, *next; unsigned int hash_val; for (size_t i = 0; i < n_bins; ++i) { entry = bins[i]; while (entry) { next = entry->next; hash_val = hash(entry->word->text, entry->word->nbytes) % new_n_bins; entry->next = new_bins[hash_val]; new_bins[hash_val] = entry; entry = next; } } free(bins); n_bins = new_n_bins; bins = new_bins; } namespace dict { /** * str: the base of the string * len: length of the string (in bytes) * * str may be a substring of a big chunk of text thus not nul-terminated, * so len is necessary here. */ Word *get(const char *str, int len) { unsigned int h = hash(str, len) % n_bins; Entry *entry = bins[h]; if (!entry) return NULL; do { if (len == entry->word->nbytes && strncmp(str, entry->word->text, len) == 0) return entry->word; entry = entry->next; } while (entry); return NULL; } void add(Word *word) { unsigned int hash_val = hash(word->text, word->nbytes); unsigned int h = hash_val % n_bins; Entry *entry = bins[h]; if (!entry) { if (n_entries / n_bins > max_density) { rehash(); h = hash_val % n_bins; } entry = static_cast(pool_alloc(sizeof(Entry))); entry->word = word; entry->next = NULL; bins[h] = entry; n_entries++; return; } bool done = false; do { if (word->nbytes == entry->word->nbytes && strncmp(word->text, entry->word->text, word->nbytes) == 0) { /* Overwriting. WARNING: the original Word object is * permanently lost. This IS a memory leak, because * the memory is allocated by pool_alloc. Instead of * fixing this, tuning the dictionary file is a better * idea */ entry->word = word; done = true; break; } entry = entry->next; } while (entry); if (!done) { entry = static_cast(pool_alloc(sizeof(Entry))); entry->word = word; entry->next = bins[h]; bins[h] = entry; n_entries++; } } bool load_chars(const char *filename) { FILE *fp = fopen(filename, "r"); if (!fp) { return false; } const size_t buf_len = 24; char buf[buf_len]; char *ptr; while (fgets(buf, buf_len, fp)) { // NOTE: there SHOULD be a newline at the end of the file buf[strlen(buf) - 1] = '\0'; // truncate the newline ptr = strchr(buf, ' '); if (!ptr) continue; // illegal input *ptr = '\0'; add(make_word(ptr + 1, 1, atoi(buf))); } fclose(fp); return true; } bool load_words(const char *filename) { FILE *fp = fopen(filename, "r"); if (!fp) { return false; } const int buf_len = 48; char buf[buf_len]; char *ptr; while (fgets(buf, buf_len, fp)) { // NOTE: there SHOULD be a newline at the end of the file buf[strlen(buf) - 1] = '\0'; // truncate the newline ptr = strchr(buf, ' '); if (!ptr) continue; // illegal input *ptr = '\0'; add(make_word(ptr + 1, atoi(buf), 0)); } fclose(fp); return true; } } }