/*
 * Full-text query preprocessor for the SQLite FTS5 search engine, similar in
 * spirit to pg-tsquery. It converts a user-typed search string into an FTS5
 * MATCH expression.
 * FTS info: https://www.sqlite.org/fts5.html#full_text_query_syntax
 *
 * Accepted operators (keywords are case-insensitive):
 *   NOT : `-`  `!`  `not`   (unary prefix)
 *   AND : `&`  `+`  `and`   (optional: adjacent terms are AND-ed implicitly)
 *   OR  : `,`  `|`  `or`
 *   `( ... )` groups, `"..."` is a phrase, a trailing `*` marks a prefix term.
 *
 * | input                                | str                              | negated |
 * | ------------------------------------ | -------------------------------- | ------- |
 * | `foo bar`                            | `"foo" AND "bar"`                | false   |
 * | `foo -bar`, `foo !bar`, `foo + !bar` | `"foo" NOT "bar"`                | false   |
 * | `foo bar,bip`, `foo+bar | bip`       | `("foo" AND "bar") OR "bip"`     | false   |
 * | `foo (bar,bip)`, `foo+(bar|bip)`     | `"foo" AND ("bar" OR "bip")`     | false   |
 * | `foo*,bar* bana*`                    | `"foo"* OR ("bar"* AND "bana"*)` | false   |
 * | `-foo`                               | `"foo"`                          | true    |
 *
 * Usage in ./engine.js (same patch): the engine calls `parse(q)`, binds
 * `str` as the single MATCH parameter and, when `negated` is true, swaps its
 * WHERE clause for
 *   WHERE rowid not in (select rowid from fts5_pages_vector where fts5_pages_vector MATCH ?)
 */

/**
 * Convert a user search string into an FTS5 MATCH expression.
 *
 * SQLite does not support a top-level negated MATCH query, so a negation that
 * applies to the whole expression is stripped from the string and reported
 * through `negated`; the caller has to build the negated SQL itself
 * (e.g. `rowid not in (select ... match ?)`).
 *
 * @param {string} input - Raw search string. `null`/`undefined` are treated as ''.
 * @returns {{negated: boolean, str: string}} `str` is the MATCH expression;
 *   it is '' when the input contains nothing that can be parsed.
 */
function parse(input) {
  const text = input == null ? '' : String(input)
  const root = new MatchQueryParser().parse(text)

  // Empty or unparseable input yields no tree at all; report an empty query
  // instead of failing on `undefined.negated`.
  if (root === undefined) {
    return { negated: false, str: '' }
  }

  const negated = root.negated
  root.negated = false
  return {
    negated,
    str: root.toString()
  }
}

/** Lexical token: `type` is 'id', 'not', 'and', 'or' or a punctuation char. */
class Token {
  constructor(type, value) {
    this.type = type
    this.value = value
  }
}

/**
 * Expression tree node.
 *
 * `type` is 'id' (a term or phrase held in `value`, optionally a prefix term
 * when `star` is set) or 'and' / 'or' (operands held in `args`).
 */
class Node {
  constructor({ type, value, negated = false, args, parNode = undefined, star = false }) {
    this.type = type
    this.value = value
    this.negated = negated
    this.star = star
    this.args = args
    if (this.args) {
      this.args.forEach(item => { item.parNode = this })
    }
    this.parNode = parNode
  }

  /**
   * Serialize the subtree as an FTS5 expression.
   * @returns {string}
   * @throws {Error} if the node type is not 'id', 'and' or 'or'.
   */
  toString() {
    let s = ''
    if (this.type === 'id') {
      // FTS5 strings escape an embedded double quote by doubling it.
      s = `"${String(this.value).replace(/"/g, '""')}"`
      if (this.star) {
        s += '*'
      }
    } else {
      let separator
      if (this.type === 'and') {
        separator = ' AND '
      } else if (this.type === 'or') {
        separator = ' OR '
      } else {
        throw new Error('should not reach')
      }

      for (const item of this.args || []) {
        if (s !== '') {
          // `A NOT B` already is the binary form of "A and not B", so no
          // explicit AND is emitted in front of a negated operand.
          s += (item.negated && this.type === 'and') ? ' ' : separator
        }
        s += item.toString()
      }
      if (this.parNode !== undefined || this.negated) {
        s = `(${s})`
      }
    }
    return this.negated ? `NOT ${s}` : s
  }
}

/** Return the De Morgan dual ('and' <-> 'or') of a boolean node's type. */
function negateNodeType(node) {
  if (node.type === 'or') {
    return 'and'
  }
  if (node.type === 'and') {
    return 'or'
  }
  throw new Error('should not reach')
}

/** Flip the `negated` flag of every node in `lst` in place. */
function negateNodes(lst) {
  lst.forEach(item => {
    if (!(item instanceof Node)) {
      throw new Error('should not reach')
    }
    item.negated = !item.negated
  })
}

/**
 * Recursive-descent parser for the search syntax described at the top of
 * this file. Grammar (lowest to highest precedence):
 *
 *   or  := and ( OR and )*
 *   and := lit ( [AND] lit )*
 *   lit := [NOT] ( '(' or ')' | term ['*'] )
 */
class MatchQueryParser {
  constructor() {
    // Single-character tokens: quote, operators, star and parentheses.
    this.tokenRegex = /^[",!*()&|+-]/
    // Characters that terminate a bare (unquoted) word.
    this.terms = /[\s,!*()&|+-]/
    this.knownLexemes = new Map([
      ['-', 'not'], ['!', 'not'], ['not', 'not'],
      ['&', 'and'], ['+', 'and'], ['and', 'and'],
      [',', 'or'], ['|', 'or'], ['or', 'or']
    ])
  }

  /** Map an operator spelling to its keyword token, or undefined if `s` is none. */
  asKeywordToken(s) {
    const type = this.knownLexemes.get(s.toLowerCase())
    return type === undefined ? undefined : new Token(type, s)
  }

  /** Scan the next token from `this.input` at `this.idx`; undefined at end of input. */
  intNextToken() {
    const tail = this.input.substring(this.idx).trimStart()
    if (!tail) {
      return undefined
    }
    this.idx = this.input.length - tail.length

    const first = tail[0]
    if (first === '"') {
      const close = tail.indexOf('"', 1)
      if (close === -1) {
        // Unterminated quote: the rest of the input is the phrase.
        this.idx = this.input.length
        return new Token('id', tail.substring(1))
      }
      this.idx += close + 1
      return new Token('id', tail.substring(1, close))
    }

    if (this.tokenRegex.test(tail)) {
      this.idx += 1
      return this.asKeywordToken(first) || new Token(first, first)
    }

    // Bare word: runs up to the next terminator character.
    const end = tail.search(this.terms)
    const word = end > 0 ? tail.substring(0, end) : tail
    this.idx += word.length
    return this.asKeywordToken(word) || new Token('id', word)
  }

  /** Advance to the next token and return it. */
  nextToken() {
    this.tok = this.intNextToken()
    return this.tok
  }

  /** True when the current token has type `v`. */
  match(v) {
    return this.tok !== undefined && this.tok.type === v
  }

  /** Consume the current token if it has type `v`; report whether it did. */
  eat(v) {
    if (!this.match(v)) {
      return false
    }
    this.nextToken()
    return true
  }

  /** Recursively set `parNode` links so that `node` hangs below `par`. */
  setParent(node, par) {
    if (node === undefined) {
      return undefined
    }

    node.parNode = par
    if (!node.args) {
      return
    }

    node.args.forEach(item => {
      if (item instanceof Node) {
        this.setParent(item, node)
      }
    })
  }

  /*
   * The SQLite `NOT` operator is binary, but the input syntax has unary
   * negation ('!', '-', 'not'), so the tree is rearranged to produce valid
   * queries:
   *   - if every operand is negated, the node itself is negated and its type
   *     flipped (De Morgan), so the negation bubbles up towards the root;
   *   - otherwise negated operands are moved behind the positive ones, and
   *     several of them are merged into a single negated group.
   *
   * NOTE(review): a negated operand inside an OR is serialized as
   * `X OR NOT Y`, which FTS5 presumably rejects since its NOT is binary -
   * confirm and decide how such queries should be handled.
   */
  preprocess(node) {
    if (node === undefined || node.args === undefined) {
      return node
    }

    // Children first, so a child's negation is final when it is inspected here.
    node.args.forEach(item => {
      if (item instanceof Node) {
        this.preprocess(item)
      }
    })

    const positive = node.args.filter(item => !item.negated)
    let negative = node.args.filter(item => item.negated)

    if (positive.length === 0 && negative.length > 1) {
      // All operands negated: NOT a AND NOT b == NOT (a OR b), and vice versa.
      node.negated = !node.negated
      node.type = negateNodeType(node)
      negateNodes(node.args)
      return node
    }

    if (negative.length > 1) {
      // Merge several negated operands into one negated group of dual type.
      negateNodes(negative)
      negative = [
        new Node({
          type: negateNodeType(node),
          parNode: node,
          negated: true,
          args: negative
        })
      ]
    }
    node.args = positive.concat(negative)

    return node
  }

  /**
   * Parse `str` into an expression tree.
   * @param {string} str
   * @returns {Node|undefined} the root node, or undefined if nothing parsed.
   */
  parse(str) {
    this.input = str
    this.tok = undefined
    this.idx = 0

    this.nextToken()
    const root = this.parseOr()
    this.setParent(root, undefined)
    return this.preprocess(root)
  }

  parseOr() {
    const first = this.parseAnd()
    if (first === undefined) {
      return undefined
    }

    const alternatives = [first]
    while (this.eat('or')) {
      const next = this.parseAnd()
      if (next === undefined) {
        break
      }
      alternatives.push(next)
    }

    // A dangling separator ("foo,") leaves a single alternative; return it
    // directly so its negation is still visible to the caller.
    if (alternatives.length === 1) {
      return first
    }
    return new Node({
      type: 'or',
      args: alternatives
    })
  }

  parseAnd() {
    const first = this.parseLit()
    if (first === undefined) {
      return undefined
    }

    const operands = [first]
    for (;;) {
      this.eat('and') // the operator is optional between two terms
      const next = this.parseLit()
      if (next === undefined) {
        break
      }
      operands.push(next)
    }
    if (operands.length === 1) {
      return first
    }

    return new Node({
      type: 'and',
      args: operands
    })
  }

  parseLit() {
    let tok = this.tok
    if (tok === undefined) {
      return undefined
    }

    let negated = false
    if (this.eat('not')) {
      if (this.tok === undefined) {
        // Operator at the very end of the input: keep it as a literal term.
        return new Node({
          type: 'id',
          negated: false,
          value: tok.value
        })
      }
      negated = true
      tok = this.tok
    }

    if (this.eat('(')) {
      const group = this.parseOr()
      const closed = this.eat(')')
      if (!closed || group === undefined) {
        return undefined
      }
      // Toggle instead of assigning: the group may already carry a negation
      // of its own, e.g. "(-foo)".
      if (negated) {
        group.negated = !group.negated
      }
      return group
    }
    if (['and', 'or', '(', ')'].includes(tok.type)) {
      return undefined
    }

    this.nextToken()
    const star = this.eat('*')

    return new Node({
      type: 'id',
      negated,
      star,
      value: tok.value
    })
  }
}

// Guarded so that the file can also be evaluated where the CommonJS `module`
// object does not exist; under CommonJS the export is unchanged.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = { parse }
}