agent-ecosystem/src/shared/utils/markdownTextSearch.ts
iliya c21350713c perf: replace remark-based search with plain text indexOf
Manually ported from upstream 5c7f921e. Key changes:
- SessionSearcher: indexOf instead of remark AST, batch size 8→16
- conversationSlice: indexOf with MAX_SEARCH_MATCHES=500 cap
- Item-scoped store selectors (searchMatchItemIds Set) to skip re-renders
- Pre-filter in markdownTextSearch (skip parse if no raw match)
- SearchTextCache: 200→1000 entries
- ProjectScanner: 30s search project cache, batch 4→8
2026-03-25 14:32:37 +02:00

210 lines
6.6 KiB
TypeScript

/**
* Markdown-aware text search utility.
*
* Converts markdown through the **same pipeline** as react-markdown:
* remark-parse → remarkGfm → mdast-util-to-hast → HAST tree
*
* Then collects text nodes only from HAST elements whose corresponding
* React components call `hl(children)` (highlightSearchInChildren).
* This ensures match counts align exactly with what the renderer produces.
*
* Key design: segments are collected per-text-node, NOT concatenated.
* `highlightSearchText` operates per-React-string-child, so a match
* spanning two elements is not valid in either layer.
*/
import { toHast } from 'mdast-util-to-hast';
import remarkGfm from 'remark-gfm';
import remarkParse from 'remark-parse';
import { unified } from 'unified';
import type { Nodes as HastNodes } from 'hast';
import type { Root as MdastRoot } from 'mdast';
// ---------------------------------------------------------------------------
// Parser singleton
// ---------------------------------------------------------------------------
/**
 * Builds a new unified processor with the markdown parser and the GFM
 * plugin attached, in that order.
 */
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type -- inferred type used by MarkdownParser alias
function createParser() {
  const markdownProcessor = unified().use(remarkParse);
  return markdownProcessor.use(remarkGfm);
}
/** Processor type produced by createParser(), inferred rather than spelled out. */
type MarkdownParser = ReturnType<typeof createParser>;

/** Shared parser instance; stays null until getParser() is first called. */
let _parser: MarkdownParser | null = null;

/**
 * Returns the shared parser, creating and storing it on the first call.
 */
function getParser(): MarkdownParser {
  if (_parser !== null) return _parser;
  const created = createParser();
  _parser = created;
  return created;
}
/**
 * Parses raw markdown into an mdast tree using the shared parser.
 */
function parseMarkdown(text: string): MdastRoot {
  const parser = getParser();
  return parser.parse(text);
}
// ---------------------------------------------------------------------------
// Segment cache (parse once, search many times per query keystroke)
// ---------------------------------------------------------------------------
/** Upper bound on the number of markdown strings whose segments are retained. */
const MAX_CACHE_SIZE = 1000;

/** Maps a raw markdown string to the text segments extracted from it. */
const segmentCache = new Map<string, string[]>();

/**
 * Returns the text segments for `markdown`, reusing a stored result when the
 * exact same string has been seen before.
 *
 * When the cache already holds MAX_CACHE_SIZE entries, the entry that was
 * inserted first is removed before the new one is stored. A lookup does not
 * move an entry, so eviction follows insertion order.
 */
function getCachedSegments(markdown: string): string[] {
  const hit = segmentCache.get(markdown);
  if (hit !== undefined) return hit;

  const fresh = collectTextSegments(markdown);

  if (segmentCache.size >= MAX_CACHE_SIZE) {
    // A Map iterates keys in insertion order, so the first key is the oldest insert.
    const oldest = segmentCache.keys().next();
    if (!oldest.done) segmentCache.delete(oldest.value);
  }

  segmentCache.set(markdown, fresh);
  return fresh;
}
// ---------------------------------------------------------------------------
// HAST → text segments
// ---------------------------------------------------------------------------
/**
* HTML element tag names whose React component counterparts call
* `hl(children)` (highlightSearchInChildren).
*
* Block-level elements call hl(): p, h1-h6, blockquote, li, th, td, code (block only)
* Inline elements do NOT call hl(): strong, em, a, del, code (inline)
* The block element's hl() recursively descends into inline children,
* processing text in document order — matching this walker's traversal.
*
* Inline tags are omitted from this set because they are always nested
* inside a block-level HL element in standard markdown, so their text
* is collected via the inherited `inHlElement` flag.
*
* Must stay in sync with createMarkdownComponents() in markdownComponents.tsx,
* createUserMarkdownComponents() in UserChatGroup.tsx, and
* createViewerMarkdownComponents() in MarkdownViewer.tsx.
*/
const HL_TAGS = new Set<string>([
  // Headings
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  // Paragraphs and code
  'p', 'code',
  // Quotes and list items
  'blockquote', 'li',
  // Table cells
  'th', 'td',
]);
/**
 * Extracts the searchable text segments of a markdown string.
 *
 * The markdown is parsed to mdast, converted to HAST, and walked. Every
 * non-empty text node found beneath an HL_TAGS element yields one segment,
 * in document order. This is intended to mirror the strings that
 * `highlightSearchInChildren` processes at render time.
 *
 * @returns The collected segments, or an empty array when the conversion
 *   yields no tree.
 */
export function collectTextSegments(markdown: string): string[] {
  const tree = toHast(parseMarkdown(markdown));
  const collected: string[] = [];
  if (tree) {
    walkHast(tree, collected, false);
  }
  return collected;
}
/**
 * Depth-first walk of a HAST subtree that appends text to `segments`.
 *
 * @param node Current node.
 * @param segments Output array; each qualifying text node appends one entry.
 * @param inHlElement True once any ancestor element is listed in HL_TAGS.
 */
function walkHast(node: HastNodes, segments: string[], inHlElement: boolean): void {
  switch (node.type) {
    case 'raw':
      // Raw HTML nodes (e.g. <context>...</context>) are dropped by ReactMarkdown
      // without rehype-raw, so skipping them keeps match counts aligned.
      return;
    case 'text':
      // Only non-empty text below a highlighting element becomes a segment.
      if (inHlElement && node.value) segments.push(node.value);
      return;
    case 'element':
    case 'root': {
      // The flag is sticky: once set by an ancestor it applies to all descendants.
      const childrenInHl =
        inHlElement || (node.type === 'element' && HL_TAGS.has(node.tagName));
      for (const child of node.children) {
        walkHast(child as HastNodes, segments, childrenInHl);
      }
      return;
    }
    default:
      // Comments, doctypes and any other node kinds contribute no text.
      return;
  }
}
// ---------------------------------------------------------------------------
// Search functions
// ---------------------------------------------------------------------------
/** One search hit inside a single markdown item. */
export interface MarkdownSearchMatch {
/** Zero-based position of this hit among all hits in the item, counted across segments in order. */
matchIndexInItem: number;
}
/**
 * Finds every case-insensitive occurrence of `query` in the rendered text of
 * `markdown`.
 *
 * Segments are searched one at a time, so a hit never spans two text nodes.
 * Within a segment the scan resumes after the end of each hit, so overlapping
 * occurrences are not counted twice.
 *
 * @param markdown Raw markdown source of one item.
 * @param query Text to look for; both sides are lower-cased before comparing.
 * @returns One entry per hit, numbered 0..n-1 in segment order. Empty when
 *   either argument is empty or nothing matches.
 */
export function findMarkdownSearchMatches(markdown: string, query: string): MarkdownSearchMatch[] {
  if (!query || !markdown) return [];
  const lowerQuery = query.toLowerCase();

  // Fast pre-filter: skip the segment lookup when the query cannot appear.
  // Comparing against the raw source is only sound when rendered text is a
  // verbatim slice of it. Backslash escapes (`1\*2`) and character references
  // (`a &amp; b`) are decoded by the parser, so a visible match may be absent
  // from the raw string; in that case the shortcut is skipped.
  const mayDecodeDifferently = markdown.includes('&') || markdown.includes('\\');
  if (!mayDecodeDifferently && !markdown.toLowerCase().includes(lowerQuery)) return [];

  const matches: MarkdownSearchMatch[] = [];
  for (const segment of getCachedSegments(markdown)) {
    const lowerSegment = segment.toLowerCase();
    let pos = lowerSegment.indexOf(lowerQuery);
    while (pos !== -1) {
      // The array length before the push is this hit's zero-based index.
      matches.push({ matchIndexInItem: matches.length });
      pos = lowerSegment.indexOf(lowerQuery, pos + lowerQuery.length);
    }
  }
  return matches;
}
/**
 * Counts case-insensitive occurrences of `query` in the rendered text of
 * `markdown` without allocating a match object per hit.
 *
 * Segments are searched one at a time, so a hit never spans two text nodes.
 * Within a segment the scan resumes after the end of each hit, so overlapping
 * occurrences are not counted twice.
 *
 * @param markdown Raw markdown source of one item.
 * @param query Text to look for; both sides are lower-cased before comparing.
 * @returns Number of hits; 0 when either argument is empty or nothing matches.
 */
export function countMarkdownSearchMatches(markdown: string, query: string): number {
  if (!query || !markdown) return 0;
  const lowerQuery = query.toLowerCase();

  // Fast pre-filter: skip the segment lookup when the query cannot appear.
  // Comparing against the raw source is only sound when rendered text is a
  // verbatim slice of it. Backslash escapes (`1\*2`) and character references
  // (`a &amp; b`) are decoded by the parser, so a visible match may be absent
  // from the raw string; in that case the shortcut is skipped.
  const mayDecodeDifferently = markdown.includes('&') || markdown.includes('\\');
  if (!mayDecodeDifferently && !markdown.toLowerCase().includes(lowerQuery)) return 0;

  let count = 0;
  for (const segment of getCachedSegments(markdown)) {
    const lowerSegment = segment.toLowerCase();
    let pos = lowerSegment.indexOf(lowerQuery);
    while (pos !== -1) {
      count++;
      pos = lowerSegment.indexOf(lowerQuery, pos + lowerQuery.length);
    }
  }
  return count;
}
/**
 * Produces a plain-text rendering of `markdown` for context snippets by
 * joining its visible text segments with single spaces.
 *
 * @returns The joined text, or an empty string when `markdown` is empty.
 */
export function extractMarkdownPlainText(markdown: string): string {
  return markdown ? getCachedSegments(markdown).join(' ') : '';
}