// agent-ecosystem/test/shared/utils/markdownTextSearch.test.ts

import { describe, expect, it } from 'vitest';

import {
  collectTextSegments,
  countMarkdownSearchMatches,
  extractMarkdownPlainText,
  findMarkdownSearchMatches,
} from '../../../src/shared/utils/markdownTextSearch';

describe('markdownTextSearch', () => {
  // ---------------------------------------------------------------------------
  // collectTextSegments (now takes markdown string, uses HAST internally)
  // ---------------------------------------------------------------------------

  describe('collectTextSegments', () => {
    // Each case passes a markdown string to collectTextSegments and inspects the
    // returned list of text segments. `toEqual` pins the exact segments and their
    // order; `toContain` is used where only the presence of a segment matters.

    it('extracts plain text from a paragraph', () => {
      const segments = collectTextSegments('Hello world');
      expect(segments).toEqual(['Hello world']);
    });

    it('extracts text from bold/italic nodes', () => {
      // Emphasis markers split the paragraph into separate segments.
      const segments = collectTextSegments('Hello **bold** and *italic*');
      expect(segments).toEqual(['Hello ', 'bold', ' and ', 'italic']);
    });

    it('keeps code block content as a single segment with trailing newline', () => {
      // HAST adds trailing \n to code block text — matches what ReactMarkdown
      // passes to its <code> component as children
      const segments = collectTextSegments('```js\nconst x = 1;\nconst y = 2;\n```');
      expect(segments).toEqual(['const x = 1;\nconst y = 2;\n']);
    });

    it('extracts inline code text', () => {
      const segments = collectTextSegments('Use `findMatches` here');
      expect(segments).toEqual(['Use ', 'findMatches', ' here']);
    });

    it('extracts link text but not URL', () => {
      const segments = collectTextSegments('[docs](https://example.com)');
      expect(segments).toEqual(['docs']);
    });

    it('does NOT include image alt text', () => {
      const segments = collectTextSegments('![screenshot](./img.png)');
      expect(segments).toEqual([]);
    });

    it('extracts list item text', () => {
      const segments = collectTextSegments('- item one\n- item two');
      expect(segments).toContain('item one');
      expect(segments).toContain('item two');
    });

    it('extracts heading text', () => {
      const segments = collectTextSegments('## Important Section');
      expect(segments).toContain('Important Section');
    });

    it('extracts table cell text', () => {
      const segments = collectTextSegments(
        '| Header | Value |\n|--------|-------|\n| Cell   | Data  |'
      );
      expect(segments).toContain('Header');
      expect(segments).toContain('Cell');
      expect(segments).toContain('Data');
    });

    it('extracts blockquote text', () => {
      const segments = collectTextSegments('> quoted text');
      expect(segments).toContain('quoted text');
    });

    it('extracts h5 heading text', () => {
      const segments = collectTextSegments('##### Sub-heading');
      expect(segments).toContain('Sub-heading');
    });

    it('extracts h6 heading text', () => {
      const segments = collectTextSegments('###### Tiny heading');
      expect(segments).toContain('Tiny heading');
    });

    it('extracts strikethrough (del) text', () => {
      const segments = collectTextSegments('This is ~~removed~~ text');
      expect(segments).toContain('removed');
    });

    it('collects nested inline text in document order', () => {
      const segments = collectTextSegments('first **bold** last');
      // Segments must be in document order: "first " before "bold" before " last"
      expect(segments).toEqual(['first ', 'bold', ' last']);
    });

    it('does NOT include inter-block whitespace', () => {
      // The blank line separating the two paragraphs must not surface as a
      // whitespace-only segment.
      const segments = collectTextSegments('Paragraph one\n\nParagraph two');
      const whitespaceOnlySegments = segments.filter((s) => s.trim() === '');
      expect(segments).toContain('Paragraph one');
      expect(segments).toContain('Paragraph two');
      // Asserting on the filtered array itself (rather than its length) makes a
      // failure print the offending whitespace segments.
      expect(whitespaceOnlySegments).toEqual([]);
    });
  });

  // ---------------------------------------------------------------------------
  // findMarkdownSearchMatches
  // ---------------------------------------------------------------------------

  describe('findMarkdownSearchMatches', () => {
    // Cases call findMarkdownSearchMatches(markdown, query) and check how many
    // matches come back; markdown syntax, URLs and alt text are expected to be
    // invisible to the search.

    it('finds matches in plain text', () => {
      const hits = findMarkdownSearchMatches('hello world hello', 'hello');
      expect(hits).toHaveLength(2);
      // The two hits are expected to carry indices 0 and 1, in that order.
      const [first, second] = hits;
      expect(first.matchIndexInItem).toBe(0);
      expect(second.matchIndexInItem).toBe(1);
    });

    it('is case-insensitive', () => {
      expect(findMarkdownSearchMatches('Hello HELLO', 'hello')).toHaveLength(2);
    });

    it('finds matches in bold text (strips ** markers)', () => {
      expect(
        findMarkdownSearchMatches('This is **important** text', 'important')
      ).toHaveLength(1);
    });

    it('does NOT match markdown syntax characters like **', () => {
      expect(findMarkdownSearchMatches('This is **bold** text', '**')).toHaveLength(0);
    });

    it('does NOT match code fence language identifiers', () => {
      // "tsx" appears only as the fence's language tag, never in the code body.
      const fenced = '```tsx\nconst x = 1;\n```';
      expect(findMarkdownSearchMatches(fenced, 'tsx')).toHaveLength(0);
    });

    it('finds matches inside fenced code block content', () => {
      // Here "tsx" is an identifier inside the code body; the fence tag is "ts".
      const fenced = '```ts\nconst tsx = "value";\n```';
      expect(findMarkdownSearchMatches(fenced, 'tsx')).toHaveLength(1);
    });

    it('finds matches in inline code', () => {
      expect(
        findMarkdownSearchMatches('Use `findMatches` here', 'findmatches')
      ).toHaveLength(1);
    });

    it('does NOT match link URLs', () => {
      const withLink = 'Check [docs](https://example.com/docs) here';
      expect(findMarkdownSearchMatches(withLink, 'example.com')).toHaveLength(0);
    });

    it('matches link text but not URL', () => {
      const withLink = 'Check [the docs](https://example.com) here';
      expect(findMarkdownSearchMatches(withLink, 'the docs')).toHaveLength(1);
    });

    it('does NOT match image alt text', () => {
      const withImage = 'An image: ![screenshot](./img.png)';
      expect(findMarkdownSearchMatches(withImage, 'screenshot')).toHaveLength(0);
    });

    it('does NOT match heading markers (#)', () => {
      const doc = '# Title\n\nSome text';
      expect(findMarkdownSearchMatches(doc, '#')).toHaveLength(0);
    });

    it('finds matches in heading text', () => {
      const doc = '## Important Section\n\nBody text';
      expect(findMarkdownSearchMatches(doc, 'important')).toHaveLength(1);
    });

    it('does NOT match list markers', () => {
      const list = '- item one\n- item two';
      expect(findMarkdownSearchMatches(list, '-')).toHaveLength(0);
    });

    it('does NOT match across text segments (no cross-node matches)', () => {
      // The bold markers split the word into two text nodes ("th" + "eory"),
      // so a query spanning both nodes is expected to find nothing.
      const splitWord = '**th**eory';
      expect(findMarkdownSearchMatches(splitWord, 'theory')).toHaveLength(0);
    });

    it('handles strikethrough text', () => {
      const source = 'This is ~~deleted~~ text';
      // The struck-through word is searchable...
      expect(findMarkdownSearchMatches(source, 'deleted')).toHaveLength(1);
      // ...but the ~~ delimiters themselves are not.
      expect(findMarkdownSearchMatches(source, '~~')).toHaveLength(0);
    });

    it('handles tables', () => {
      const table = '| Header | Value |\n|--------|-------|\n| Cell   | Data  |';
      expect(findMarkdownSearchMatches(table, 'cell')).toHaveLength(1);
    });

    it('returns empty for empty input', () => {
      const forEmptyMarkdown = findMarkdownSearchMatches('', 'test');
      const forEmptyQuery = findMarkdownSearchMatches('test', '');
      expect(forEmptyMarkdown).toEqual([]);
      expect(forEmptyQuery).toEqual([]);
    });

    it('handles blockquotes', () => {
      const quote = '> quoted text here';
      expect(findMarkdownSearchMatches(quote, 'quoted')).toHaveLength(1);
    });

    it('finds matches in h5 headings', () => {
      const doc = '##### Sub-heading\n\nBody text';
      expect(findMarkdownSearchMatches(doc, 'sub-heading')).toHaveLength(1);
    });

    it('finds matches in h6 headings', () => {
      const doc = '###### Tiny heading\n\nBody text';
      expect(findMarkdownSearchMatches(doc, 'tiny')).toHaveLength(1);
    });

    it('finds matches in strikethrough (del) text', () => {
      const source = 'This is ~~deleted content~~ here';
      expect(findMarkdownSearchMatches(source, 'deleted')).toHaveLength(1);
    });

    it('does not match reference-style link definitions', () => {
      // The URL lives only in the "[ref]: ..." definition line.
      const doc = '[link text][ref]\n\n[ref]: https://example.com';
      expect(findMarkdownSearchMatches(doc, 'example.com')).toHaveLength(0);
    });

    it('treats code block content as single segment (allows cross-line match)', () => {
      // A fenced block is one text node in HAST — the same string ReactMarkdown
      // hands to its <code> component — so a query containing a newline can
      // match across two lines of code.
      const fenced = '```js\nconst x = 1;\nconst y = 2;\n```';
      expect(findMarkdownSearchMatches(fenced, '1;\nconst')).toHaveLength(1);
    });

    it('finds per-line matches inside code blocks', () => {
      const fenced = '```js\nconst x = 1;\nconst y = 2;\n```';
      const hits = findMarkdownSearchMatches(fenced, 'const');
      expect(hits).toHaveLength(2);
      // One hit per line of code, indexed 0 then 1.
      const [first, second] = hits;
      expect(first.matchIndexInItem).toBe(0);
      expect(second.matchIndexInItem).toBe(1);
    });
  });

  // ---------------------------------------------------------------------------
  // countMarkdownSearchMatches
  // ---------------------------------------------------------------------------

  describe('countMarkdownSearchMatches', () => {
    // countMarkdownSearchMatches(markdown, query) is checked for the numeric
    // count it returns.

    it('returns correct count', () => {
      // "hello" occurs twice, on either side of the bold word.
      expect(countMarkdownSearchMatches('hello **world** hello', 'hello')).toBe(2);
    });

    it('returns 0 for no matches', () => {
      const total = countMarkdownSearchMatches('hello world', 'xyz');
      expect(total).toBe(0);
    });

    it('returns 0 for empty inputs', () => {
      // Either an empty document or an empty query yields zero.
      const forEmptyMarkdown = countMarkdownSearchMatches('', 'test');
      expect(forEmptyMarkdown).toBe(0);
      const forEmptyQuery = countMarkdownSearchMatches('test', '');
      expect(forEmptyQuery).toBe(0);
    });
  });

  // ---------------------------------------------------------------------------
  // extractMarkdownPlainText
  // ---------------------------------------------------------------------------

  describe('extractMarkdownPlainText', () => {
    // extractMarkdownPlainText(markdown) is checked for which substrings do and
    // do not appear in the string it returns.

    it('extracts plain text from markdown', () => {
      const plain = extractMarkdownPlainText('**bold** and `code`');
      // The words survive...
      for (const word of ['bold', 'code']) {
        expect(plain).toContain(word);
      }
      // ...while the emphasis and inline-code delimiters do not.
      for (const marker of ['**', '`']) {
        expect(plain).not.toContain(marker);
      }
    });

    it('strips code fence language', () => {
      const plain = extractMarkdownPlainText('```tsx\nconst x = 1;\n```');
      expect(plain).toContain('const x = 1;');
      // "tsx" must not appear as a standalone whitespace-delimited token.
      const standaloneLanguageTag = /(?:^|\s)tsx(?:\s|$)/;
      expect(plain).not.toMatch(standaloneLanguageTag);
    });

    it('returns empty string for empty input', () => {
      const plain = extractMarkdownPlainText('');
      expect(plain).toBe('');
    });
  });
});