perf(localSearch): add concurrency pooling, cleanup logic, improve performance (#3374)

pull/3398/head
Yuxuan Zhang 6 months ago committed by GitHub
parent d1ff29431f
commit ac5881eeac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,6 +1,7 @@
import _debug from 'debug' import _debug from 'debug'
import fs from 'fs-extra' import fs from 'fs-extra'
import MiniSearch from 'minisearch' import MiniSearch from 'minisearch'
import pMap from 'p-map'
import path from 'path' import path from 'path'
import type { Plugin, ViteDevServer } from 'vite' import type { Plugin, ViteDevServer } from 'vite'
import type { SiteConfig } from '../config' import type { SiteConfig } from '../config'
@ -53,15 +54,18 @@ export async function localSearchPlugin(
const options = siteConfig.site.themeConfig.search.options || {} const options = siteConfig.site.themeConfig.search.options || {}
function render(file: string) { async function render(file: string) {
if (!fs.existsSync(file)) return ''
const { srcDir, cleanUrls = false } = siteConfig const { srcDir, cleanUrls = false } = siteConfig
const relativePath = slash(path.relative(srcDir, file)) const relativePath = slash(path.relative(srcDir, file))
const env: MarkdownEnv = { path: file, relativePath, cleanUrls } const env: MarkdownEnv = { path: file, relativePath, cleanUrls }
let src = fs.readFileSync(file, 'utf-8') const md_raw = await fs.promises.readFile(file, 'utf-8')
src = processIncludes(srcDir, src, file, []) const md_src = processIncludes(srcDir, md_raw, file, [])
if (options._render) return options._render(src, env, md) if (options._render) return await options._render(md_src, env, md)
const html = md.render(src, env) else {
return env.frontmatter?.search === false ? '' : html const html = md.render(md_src, env)
return env.frontmatter?.search === false ? '' : html
}
} }
const indexByLocales = new Map<string, MiniSearch<IndexObject>>() const indexByLocales = new Map<string, MiniSearch<IndexObject>>()
@ -85,11 +89,6 @@ export async function localSearchPlugin(
return siteData?.localeIndex ?? 'root' return siteData?.localeIndex ?? 'root'
} }
function getIndexForPath(file: string) {
const locale = getLocaleForPath(file)
return getIndexByLocale(locale)
}
let server: ViteDevServer | undefined let server: ViteDevServer | undefined
function onIndexUpdated() { function onIndexUpdated() {
@ -123,43 +122,39 @@ export async function localSearchPlugin(
return id return id
} }
async function indexAllFiles(files: string[]) { async function indexFile(page: string) {
const documentsByLocale = new Map<string, IndexObject[]>() const file = path.join(siteConfig.srcDir, page)
await Promise.all( // get file metadata
files const fileId = getDocId(file)
.filter((file) => fs.existsSync(file)) const locale = getLocaleForPath(file)
.map(async (file) => { const index = getIndexByLocale(locale)
const fileId = getDocId(file) // retrieve file and split into "sections"
const sections = splitPageIntoSections(render(file)) const html = await render(file)
if (sections.length === 0) return const sections =
const locale = getLocaleForPath(file) // user provided generator
let documents = documentsByLocale.get(locale) (await options.miniSearch?._splitIntoSections?.(file, html)) ??
if (!documents) { // default implementation
documents = [] splitPageIntoSections(html)
documentsByLocale.set(locale, documents) // add sections to the locale index
} for await (const section of sections) {
documents.push( if (!section || !(section.text || section.titles)) break
...sections.map((section) => ({ const { anchor, text, titles } = section
id: `${fileId}#${section.anchor}`, const id = anchor ? [fileId, anchor].join('#') : fileId
text: section.text, index.add({
title: section.titles.at(-1)!, id,
titles: section.titles.slice(0, -1) text,
})) title: titles.at(-1)!,
) titles: titles.slice(0, -1)
}) })
)
for (const [locale, documents] of documentsByLocale) {
const index = getIndexByLocale(locale)
index.removeAll()
await index.addAllAsync(documents)
} }
debug(`🔍️ Indexed ${files.length} files`)
} }
async function scanForBuild() { async function scanForBuild() {
await indexAllFiles( debug('🔍️ Indexing files for search...')
siteConfig.pages.map((f) => path.join(siteConfig.srcDir, f)) await pMap(siteConfig.pages, indexFile, {
) concurrency: siteConfig.buildConcurrency
})
debug('✅ Indexing finished...')
} }
return { return {
@ -214,25 +209,8 @@ export async function localSearchPlugin(
async handleHotUpdate({ file }) { async handleHotUpdate({ file }) {
if (file.endsWith('.md')) { if (file.endsWith('.md')) {
const fileId = getDocId(file) await indexFile(file)
if (!fs.existsSync(file)) return
const index = getIndexForPath(file)
const sections = splitPageIntoSections(render(file))
if (sections.length === 0) return
for (const section of sections) {
const id = `${fileId}#${section.anchor}`
if (index.has(id)) {
index.discard(id)
}
index.add({
id,
text: section.text,
title: section.titles.at(-1)!,
titles: section.titles.slice(0, -1)
})
}
debug('🔍️ Updated', file) debug('🔍️ Updated', file)
onIndexUpdated() onIndexUpdated()
} }
} }
@ -242,20 +220,13 @@ export async function localSearchPlugin(
const headingRegex = /<h(\d*).*?>(.*?<a.*? href="#.*?".*?>.*?<\/a>)<\/h\1>/gi const headingRegex = /<h(\d*).*?>(.*?<a.*? href="#.*?".*?>.*?<\/a>)<\/h\1>/gi
const headingContentRegex = /(.*?)<a.*? href="#(.*?)".*?>.*?<\/a>/i const headingContentRegex = /(.*?)<a.*? href="#(.*?)".*?>.*?<\/a>/i
interface PageSection {
anchor: string
titles: string[]
text: string
}
/** /**
* Splits HTML into sections based on headings * Splits HTML into sections based on headings
*/ */
function splitPageIntoSections(html: string) { function* splitPageIntoSections(html: string) {
const result = html.split(headingRegex) const result = html.split(headingRegex)
result.shift() result.shift()
let parentTitles: string[] = [] let parentTitles: string[] = []
const sections: PageSection[] = []
for (let i = 0; i < result.length; i += 3) { for (let i = 0; i < result.length; i += 3) {
const level = parseInt(result[i]) - 1 const level = parseInt(result[i]) - 1
const heading = result[i + 1] const heading = result[i + 1]
@ -266,14 +237,13 @@ function splitPageIntoSections(html: string) {
if (!title || !content) continue if (!title || !content) continue
const titles = parentTitles.slice(0, level) const titles = parentTitles.slice(0, level)
titles[level] = title titles[level] = title
sections.push({ anchor, titles, text: getSearchableText(content) }) yield { anchor, titles, text: getSearchableText(content) }
if (level === 0) { if (level === 0) {
parentTitles = [title] parentTitles = [title]
} else { } else {
parentTitles[level] = title parentTitles[level] = title
} }
} }
return sections
} }
function getSearchableText(content: string) { function getSearchableText(content: string) {

@ -2,8 +2,11 @@ import type MarkdownIt from 'markdown-it'
import type { Options as MiniSearchOptions } from 'minisearch' import type { Options as MiniSearchOptions } from 'minisearch'
import type { ComputedRef, Ref } from 'vue' import type { ComputedRef, Ref } from 'vue'
import type { DocSearchProps } from './docsearch.js' import type { DocSearchProps } from './docsearch.js'
import type { LocalSearchTranslations } from './local-search.js' import type {
import type { MarkdownEnv, PageData } from './shared.js' LocalSearchTranslations,
PageSplitSection
} from './local-search.js'
import type { Awaitable, MarkdownEnv, PageData } from './shared.js'
export namespace DefaultTheme { export namespace DefaultTheme {
export interface Config { export interface Config {
@ -422,13 +425,34 @@ export namespace DefaultTheme {
* @see https://lucaong.github.io/minisearch/modules/_minisearch_.html#searchoptions-1 * @see https://lucaong.github.io/minisearch/modules/_minisearch_.html#searchoptions-1
*/ */
searchOptions?: MiniSearchOptions['searchOptions'] searchOptions?: MiniSearchOptions['searchOptions']
}
/**
* Replaces the built-in, regex-based page splitter used to cut a rendered
* page into searchable sections. When this option is omitted, or when it
* resolves to `null` or `undefined`, the built-in splitter is used instead.
*
* The result may be a plain array (or a promise of one), a generator, or an
* async generator; the indexer consumes it with `for await`. An async
* generator makes it possible to move the splitting work off the main
* thread (for example with `node:child_process` or `worker_threads`),
* which can help large sites scale.
*
* Note: the indexer stops reading sections for the page at the first entry
* that is falsy or that has neither `text` nor `titles`.
*
* @param path - absolute path to the markdown source file
* @param html - the page rendered to HTML
* @returns the sections to add to the search index for this page
*/
_splitIntoSections?: (
path: string,
html: string
) =>
| AsyncGenerator<PageSplitSection>
| Generator<PageSplitSection>
| Awaitable<PageSplitSection[]>
}
/** /**
* Allows transformation of content before indexing (node only) * Allows transformation of content before indexing (node only)
* Return empty string to skip indexing * Return empty string to skip indexing
*/ */
_render?: (src: string, env: MarkdownEnv, md: MarkdownIt) => string _render?: (
src: string,
env: MarkdownEnv,
md: MarkdownIt
) => Awaitable<string>
} }
// algolia ------------------------------------------------------------------- // algolia -------------------------------------------------------------------

@ -25,3 +25,9 @@ export interface FooterTranslations {
closeText?: string closeText?: string
closeKeyAriaLabel?: string closeKeyAriaLabel?: string
} }
/**
* One searchable section of a page, as produced by the page splitter
* (the built-in one or a custom `_splitIntoSections` implementation).
*/
export interface PageSplitSection {
/**
* Anchor of the section. When present and non-empty, the indexed document
* id is the page id followed by `#` and this anchor; otherwise the page id
* is used on its own.
*/
anchor?: string
/**
* Heading trail for the section. The last entry is indexed as the section
* `title`; the entries before it are indexed as `titles`.
*/
titles: string[]
/** Text of the section, added to the search index as `text`. */
text: string
}

Loading…
Cancel
Save