feat: add azure search with vector embeddings

pull/7366/head
Gabor Kajtar 1 year ago
parent 55a63a3c5a
commit 6c5599b29c

@ -0,0 +1,41 @@
#!/bin/bash
# Builds the wiki Docker image for the given platform and pushes it to the
# Azure Container Registry.
#
# Usage:   <script> <platform>
# Example: <script> linux/amd64

# Stop at the first failing command, on unset variables and on failures inside
# pipelines, so that a failed login or build can never be followed by a push.
set -euo pipefail

# Check if platform argument is provided.
# ${1:-} expands to an empty string when no argument was given, which keeps
# `set -u` from aborting before the usage message is printed.
if [ -z "${1:-}" ]; then
  echo "Error: Platform argument is required." >&2
  echo "Usage: $0 <platform>" >&2
  echo "Example: $0 linux/amd64" >&2
  exit 1
fi

# Assign the first argument to PLATFORM
PLATFORM=$1

# Function to check if Docker is running
check_docker_running() {
  if ! docker info >/dev/null 2>&1; then
    echo "Error: Docker is not running. Please make sure Docker is installed and running before running this script." >&2
    exit 1
  fi
}

# Call the function to check Docker status
check_docker_running

# Generate a unique tag from the current date and time (e.g., 20230901_142530)
TAG=$(date +"%Y%m%d_%H%M%S")
IMAGE="acruicwiki.azurecr.io/uic-wiki:${TAG}"

# Login to the Azure Container registry
az acr login --name acruicwiki

# Build the Docker image with the specified platform
docker build --platform "$PLATFORM" -t "$IMAGE" -f dev/build/Dockerfile .

# Push the Docker image to the registry
docker push "$IMAGE"

# Update the container app with the new image
# az containerapp update \
# --name ca-uic-wiki \
# --resource-group rg-uic-wiki \
# --image acruicwiki.azurecr.io/uic-wiki:$TAG

@ -47,7 +47,7 @@ services:
depends_on:
- db
ports:
- "3000:3000"
- "3500:3000"
volumes:
- ../..:/wiki
- /wiki/node_modules

@ -4,7 +4,7 @@
"releaseDate": "2019-01-01T01:01:01.000Z",
"description": "A modern, lightweight and powerful wiki app built on NodeJS, Git and Markdown",
"main": "wiki.js",
"dev": true,
"dev": false,
"scripts": {
"start": "node server",
"dev": "NODE_OPTIONS=--openssl-legacy-provider node dev",
@ -37,6 +37,7 @@
"node": ">=10.12"
},
"dependencies": {
"@azure/search-documents": "^12.0.0",
"@azure/storage-blob": "12.12.0",
"@exlinc/keycloak-passport": "1.0.2",
"@joplin/turndown-plugin-gfm": "1.0.45",
@ -335,7 +336,6 @@
"zxcvbn": "4.4.2"
},
"resolutions": {
"apollo-server-express/**/graphql-tools": "4.0.8",
"graphql": "15.3.0"
},
"browserslist": [

@ -0,0 +1,45 @@
key: azure-search-similarity
title: Azure Search with Similarity Search
description: AI-Powered cloud search service with vectorisation for embedding.
author: UIC Digital
logo: https://static.requarks.io/logo/azure.svg
website: https://azure.microsoft.com/services/search/
isAvailable: true
props:
endpoint:
type: String
title: Azure Search Endpoint
hint: The endpoint of the Azure Search Service. Found under Properties.
order: 1
adminKey:
type: String
title: Admin API Key
hint: Either the primary or secondary admin key. Found under Keys.
order: 2
indexName:
type: String
title: Index Name
hint: 'Name to use when creating the index. (default: wiki)'
default: wiki
order: 3
embeddingModelAPIVersion:
type: String
title: Embedding Model API Version
hint: 'API version of the embedding model to use for vectorisation'
# Quoted on purpose: a bare 2024-02-01 is a YAML timestamp, so a YAML 1.1
# loader would hand back a date instead of the string the API expects.
default: '2024-02-01'
order: 4
embeddingModelKey:
type: String
title: Embedding Model Key
hint: 'Key of the embedding model to use for vectorisation'
order: 5
embeddingModelEndpoint:
type: String
title: Embedding Model Endpoint
hint: 'Endpoint of the embedding model to use for vectorisation'
order: 6
embeddingModelDeploymentName:
type: String
title: Embedding Model Deployment Name
hint: 'Deployment name of the embedding model to use for vectorisation'
order: 7

@ -0,0 +1,300 @@
const _ = require('lodash')
const { SearchService, QueryType } = require('azure-search-client')
const {
AzureKeyCredential,
SearchIndexClient,
IndexDocumentsResult,
SearchClient,
SearchDocumentsResult,
SearchFieldArray,
SelectArray,
SelectFields,
} = require('@azure/search-documents');
const request = require('request-promise')
const stream = require('stream')
const Promise = require('bluebird')
const pipeline = Promise.promisify(stream.pipeline)
/* global WIKI */
module.exports = {
/**
 * ACTIVATE
 *
 * Empty hook: no work is performed here.
 */
async activate() {
// not used
},
/**
 * DEACTIVATE
 *
 * Empty hook: no work is performed here.
 */
async deactivate() {
// not used
},
/**
* INIT
*/
async init() {
WIKI.logger.info(`(SEARCH/AZURE) Initializing...`)
this.client = new SearchIndexClient(
this.config.endpoint,
new AzureKeyCredential(this.config.adminKey)
);
// -> Create Search Index
WIKI.logger.info(`(SEARCH/AZURE) Creating index...`)
await this.client.createOrUpdateIndex({
name: this.config.indexName,
fields: [
{
name: 'id',
type: 'Edm.String',
key: true,
searchable: false
},
{
name: 'locale',
type: 'Edm.String',
searchable: false
},
{
name: 'path',
type: 'Edm.String',
searchable: false
},
{
name: 'title',
type: 'Edm.String',
searchable: true
},
{
name: 'titleVector',
type: 'Edm.Collection(Edm.Single)',
searchable: true,
vectorSearchDimensions: 1536,
vectorSearchProfileName: 'vector-profile',
},
{
name: 'description',
type: 'Edm.String',
searchable: true
},
{
name: 'descriptionVector',
type: 'Edm.Collection(Edm.Single)',
searchable: true,
vectorSearchDimensions: 1536,
vectorSearchProfileName: 'vector-profile',
},
{
name: 'content',
type: 'Edm.String',
searchable: true
},
{
name: 'contentVector',
type: 'Edm.Collection(Edm.Single)',
searchable: true,
vectorSearchDimensions: 1536,
vectorSearchProfileName: 'vector-profile',
},
],
scoringProfiles: [
{
name: 'fieldWeights',
text: {
weights: {
title: 4,
description: 3,
content: 1
}
}
}
],
corsOptions: {
allowedOrigins: ['*'],
},
vectorSearch: {
algorithms: [{ name: 'vector-search-algorithm', kind: 'hnsw' }],
profiles: [
{
name: 'vector-profile',
algorithmConfigurationName: 'vector-search-algorithm',
},
],
},
suggesters: [
{
name: 'suggestions',
searchMode: 'analyzingInfixMatching',
sourceFields: ['title', 'description', 'content']
}
]
})
this.searchClient = new SearchClient(
this.config.endpoint,
this.config.indexName,
new AzureKeyCredential(this.config.adminKey)
);
WIKI.logger.info(`(SEARCH/AZURE) Initialization completed.`)
},
/**
* QUERY
*
* @param {String} q Query
* @param {Object} opts Additional options
*/
async query(q, opts) {
try {
const results = await this.searchClient.search(q, {
select: ['id', 'locale', 'path', 'title', 'description'],
searchFields: ['title', 'description', 'content'],
queryType: 'full',
top: 50,
includeTotalCount: true,
});
const searchResults = [];
for await (const result of results.results) {
searchResults.push(result.document);
}
WIKI.logger.info(`(SEARCH/AZURE) Search: ${JSON.stringify(searchResults)}.`)
return {
results: searchResults,
suggestions: [],
totalHits: results.count
}
} catch (err) {
WIKI.logger.warn('Search Engine Error:')
WIKI.logger.warn(err)
}
},
/**
* CREATE
*
* @param {Object} page Page to create
*/
async created(page) {
const doc = {
id: page.hash,
locale: page.localeCode,
path: page.path,
title: page.title,
description: page.description,
content: page.safeContent
}
await this.updateDocument(doc)
},
/**
* UPDATE
*
* @param {Object} page Page to update
*/
async updated(page) {
const doc = {
id: page.hash,
locale: page.localeCode,
path: page.path,
title: page.title,
description: page.description,
content: page.safeContent
}
await this.updateDocument(doc)
},
/**
* DELETE
*
* @param {Object} page Page to delete
*/
async deleted(page) {
await this.searchClient.deleteDocuments([page.hash])
},
/**
* RENAME
*
* @param {Object} page Page to rename
*/
async renamed(page) {
const doc = {
id: page.destinationHash,
locale: page.destinationLocaleCode,
path: page.destinationPath,
title: page.title,
description: page.description,
content: page.safeContent
}
await this.updateDocument(doc)
},
/**
* REBUILD INDEX
*/
async rebuild() {
WIKI.logger.info(`(SEARCH/AZURE) Rebuilding Index...`)
await pipeline(
WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
isPublished: true,
isPrivate: false
}).stream(),
new stream.Transform({
objectMode: true,
transform: async (page, enc, cb) => {
await this.rebuildPage(page)
cb()
}
}),
)
WIKI.logger.info(`(SEARCH/AZURE) Index rebuilt successfully.`)
},
async updateDocument(doc) {
const [titleVector, descriptionVector, contentVector] = await Promise.all([
this.generateEmbedding(doc.title),
this.generateEmbedding(doc.description),
this.generateEmbedding(doc.content)
])
WIKI.logger.info(`(SEARCH/AZURE) Generated embeddings for ${doc.id}.`)
doc.titleVector = titleVector;
doc.descriptionVector = descriptionVector;
doc.contentVector = contentVector;
await this.searchClient.mergeOrUploadDocuments([doc])
},
async rebuildPage(page) {
const doc = {
id: page.id,
locale: page.locale,
path: page.path,
title: page.title,
description: page.description,
content: WIKI.models.pages.cleanHTML(page.render)
}
await this.updateDocument(doc)
// sleep for 1 second to avoid rate limiting
await new Promise(resolve => setTimeout(resolve, 1000))
},
async generateEmbedding(str) {
const apiKey = this.config.embeddingModelKey;
const apiBase = this.config.embeddingModelEndpoint;
const deploymentName = this.config.embeddingModelDeploymentName;
const apiVersion = this.config.embeddingModelAPIVersion;
const url = `${apiBase}/openai/deployments/${deploymentName}/embeddings?api-version=${apiVersion}`;
const body = {
input: str,
};
try {
const response = await request({
uri: url,
method: 'post',
headers: {
'api-key': apiKey,
'Content-Type': 'application/json'
},
json: true,
body,
})
return response.data[0].embedding;
} catch (error) {
WIKI.logger.info(`(SEARCH/AZURE) Error generating embedding. ${error}`)
}
}
}

49432
yarn.lock

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save