You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
svelte/src/validate/utils/FuzzySet.ts

235 lines
5.4 KiB

// adapted from https://github.com/Glench/fuzzyset.js/blob/master/lib/fuzzyset.js
// BSD Licensed
const GRAM_SIZE_LOWER = 2;
const GRAM_SIZE_UPPER = 3;
// return an edit distance from 0 to 1
function _distance(str1: string, str2: string) {
if (str1 === null && str2 === null)
throw 'Trying to compare two null values';
if (str1 === null || str2 === null) return 0;
str1 = String(str1);
str2 = String(str2);
const distance = levenshtein(str1, str2);
if (str1.length > str2.length) {
return 1 - distance / str1.length;
} else {
return 1 - distance / str2.length;
}
}
// helper functions
function levenshtein(str1: string, str2: string) {
const current: number[] = [];
let prev;
let value;
for (let i = 0; i <= str2.length; i++) {
for (let j = 0; j <= str1.length; j++) {
if (i && j) {
if (str1.charAt(j - 1) === str2.charAt(i - 1)) {
value = prev;
} else {
value = Math.min(current[j], current[j - 1], prev) + 1;
}
} else {
value = i + j;
}
prev = current[j];
current[j] = value;
}
}
return current.pop();
}
const _nonWordRe = /[^\w, ]+/;
function _iterateGrams(value: string, gramSize: number) {
gramSize = gramSize || 2;
const simplified = '-' + value.toLowerCase().replace(_nonWordRe, '') + '-';
const lenDiff = gramSize - simplified.length;
const results = [];
if (lenDiff > 0) {
for (let i = 0; i < lenDiff; ++i) {
value += '-';
}
}
for (let i = 0; i < simplified.length - gramSize + 1; ++i) {
results.push(simplified.slice(i, i + gramSize));
}
return results;
}
function _gramCounter(value: string, gramSize: number) {
// return an object where key=gram, value=number of occurrences
gramSize = gramSize || 2;
const result = {};
const grams = _iterateGrams(value, gramSize);
let i = 0;
for (i; i < grams.length; ++i) {
if (grams[i] in result) {
result[grams[i]] += 1;
} else {
result[grams[i]] = 1;
}
}
return result;
}
function sortDescending(a, b) {
return b[0] - a[0];
}
export default class FuzzySet {
exactSet: object;
matchDict: object;
items: object;
constructor(arr: string[]) {
// define all the object functions and attributes
this.exactSet = {};
this.matchDict = {};
this.items = {};
// initialization
for (let i = GRAM_SIZE_LOWER; i < GRAM_SIZE_UPPER + 1; ++i) {
this.items[i] = [];
}
// add all the items to the set
for (let i = 0; i < arr.length; ++i) {
this.add(arr[i]);
}
}
add(value: string) {
const normalizedValue = value.toLowerCase();
if (normalizedValue in this.exactSet) {
return false;
}
let i = GRAM_SIZE_LOWER;
for (i; i < GRAM_SIZE_UPPER + 1; ++i) {
this._add(value, i);
}
}
_add(value: string, gramSize: number) {
const normalizedValue = value.toLowerCase();
const items = this.items[gramSize] || [];
const index = items.length;
items.push(0);
const gramCounts = _gramCounter(normalizedValue, gramSize);
let sumOfSquareGramCounts = 0;
let gram;
let gramCount;
for (gram in gramCounts) {
gramCount = gramCounts[gram];
sumOfSquareGramCounts += Math.pow(gramCount, 2);
if (gram in this.matchDict) {
this.matchDict[gram].push([index, gramCount]);
} else {
this.matchDict[gram] = [[index, gramCount]];
}
}
const vectorNormal = Math.sqrt(sumOfSquareGramCounts);
items[index] = [vectorNormal, normalizedValue];
this.items[gramSize] = items;
this.exactSet[normalizedValue] = value;
};
get(value: string) {
const normalizedValue = value.toLowerCase();
const result = this.exactSet[normalizedValue];
if (result) {
return [[1, result]];
}
let results = [];
// start with high gram size and if there are no results, go to lower gram sizes
for (
let gramSize = GRAM_SIZE_UPPER;
gramSize >= GRAM_SIZE_LOWER;
--gramSize
) {
results = this.__get(value, gramSize);
if (results) {
return results;
}
}
return null;
}
__get(value: string, gramSize: number) {
const normalizedValue = value.toLowerCase();
const matches = {};
const gramCounts = _gramCounter(normalizedValue, gramSize);
const items = this.items[gramSize];
let sumOfSquareGramCounts = 0;
let gram;
let gramCount;
let i;
let index;
let otherGramCount;
for (gram in gramCounts) {
gramCount = gramCounts[gram];
sumOfSquareGramCounts += Math.pow(gramCount, 2);
if (gram in this.matchDict) {
for (i = 0; i < this.matchDict[gram].length; ++i) {
index = this.matchDict[gram][i][0];
otherGramCount = this.matchDict[gram][i][1];
if (index in matches) {
matches[index] += gramCount * otherGramCount;
} else {
matches[index] = gramCount * otherGramCount;
}
}
}
}
const vectorNormal = Math.sqrt(sumOfSquareGramCounts);
let results = [];
let matchScore;
// build a results list of [score, str]
for (const matchIndex in matches) {
matchScore = matches[matchIndex];
results.push([
matchScore / (vectorNormal * items[matchIndex][0]),
items[matchIndex][1],
]);
}
results.sort(sortDescending);
let newResults = [];
const endIndex = Math.min(50, results.length);
// truncate somewhat arbitrarily to 50
for (let i = 0; i < endIndex; ++i) {
newResults.push([
_distance(results[i][1], normalizedValue),
results[i][1],
]);
}
results = newResults;
results.sort(sortDescending);
newResults = [];
for (let i = 0; i < results.length; ++i) {
if (results[i][0] == results[0][0]) {
newResults.push([results[i][0], this.exactSet[results[i][1]]]);
}
}
return newResults;
};
}