readline.createInterface runs an expensive Unicode line-break regex plus extra stream/string-decoder machinery per chunk. The main transcript parser (parseJsonlStream) already uses a buffer + manual newline split, but these per-team readers still used readline. Add readJsonlLines(): an async generator that yields a JSONL file's lines via a chunked UTF-8 stream read + a plain '\n' split. It is a drop-in replacement for 'for await (const line of rl)', so the consumers' loop bodies are unchanged. The stream is UTF-8-decoded before splitting, so multi-byte characters that span chunk boundaries are safe; a trailing CR (CRLF) is stripped; empty lines and a final newline-less line are yielded, matching readline; breaking out of the loop destroys the stream via the generator's finally block. Adopt it in MemberStatsComputer, TaskBoundaryParser, and FileContentResolver (file-history scan). Behavior is identical (their existing tests pass: 18 + 6 + 12), plus 6 new tests for the generator (line order with empty lines, CRLF, no trailing newline, empty file, early break, multi-byte chunk boundary). Note: the session-browser readline paths (jsonl metadata extractor, metadataExtraction, SessionContentFilter) are off the launch path and left as-is for now.
73 lines
2.5 KiB
TypeScript
73 lines
2.5 KiB
TypeScript
import * as fs from 'fs/promises';
|
||
import * as os from 'os';
|
||
import * as path from 'path';
|
||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||
|
||
import { readJsonlLines } from '../../../src/main/utils/jsonlLineReader';
|
||
|
||
/**
 * Behavioural tests for `readJsonlLines`, consumed here with `for await` as an
 * async iterable of text lines.
 *
 * Every case writes its fixture into a fresh temporary directory that is
 * removed again once the test has finished.
 */
describe('readJsonlLines', () => {
  /** Per-test scratch directory: created in `beforeEach`, removed in `afterEach`. */
  let dir: string;

  beforeEach(async () => {
    dir = await fs.mkdtemp(path.join(os.tmpdir(), 'jsonl-line-reader-'));
  });

  afterEach(async () => {
    // `force: true` keeps teardown from failing if the directory is already gone.
    await fs.rm(dir, { recursive: true, force: true });
  });

  /**
   * Writes `content` as UTF-8 to a file called `name` inside the scratch directory.
   *
   * @param name - File name relative to the scratch directory.
   * @param content - Exact text to write.
   * @returns The full path of the written file.
   */
  async function write(name: string, content: string): Promise<string> {
    const p = path.join(dir, name);
    await fs.writeFile(p, content, 'utf8');
    return p;
  }

  /**
   * Drains `readJsonlLines(filePath)` to completion.
   *
   * @param filePath - Path of the file to read.
   * @returns Every yielded line, in the order it was yielded.
   */
  async function collect(filePath: string): Promise<string[]> {
    const out: string[] = [];
    for await (const line of readJsonlLines(filePath)) {
      out.push(line);
    }
    return out;
  }

  it('yields every line in order, including empty lines', async () => {
    // Empty lines must still be yielded so callers tracking line numbers match readline.
    const p = await write('a.jsonl', 'a\n\nb\nc\n');
    expect(await collect(p)).toEqual(['a', '', 'b', 'c']);
  });

  it('strips a trailing CR from CRLF endings', async () => {
    const p = await write('crlf.jsonl', 'one\r\ntwo\r\nthree\r\n');
    expect(await collect(p)).toEqual(['one', 'two', 'three']);
  });

  it('yields a final line that has no trailing newline', async () => {
    const p = await write('tail.jsonl', 'first\nlast-no-newline');
    expect(await collect(p)).toEqual(['first', 'last-no-newline']);
  });

  it('returns nothing for an empty file', async () => {
    const p = await write('empty.jsonl', '');
    expect(await collect(p)).toEqual([]);
  });

  it('stops and cleans up when the consumer breaks out of the loop', async () => {
    const p = await write('stop.jsonl', 'l1\nl2\nl3\nl4\n');
    const seen: string[] = [];
    for await (const line of readJsonlLines(p)) {
      seen.push(line);
      if (line === 'l2') break;
    }
    // NOTE(review): only the early stop is asserted here; the stream cleanup itself is
    // not observed directly by this test.
    expect(seen).toEqual(['l1', 'l2']);
  });

  it('decodes multi-byte UTF-8 that straddles a read-chunk boundary', async () => {
    // Assumes the reader uses the stream's default 64 KiB read chunk.
    // Each 'я' is 2 bytes in UTF-8. The first line occupies bytes 0..79_999 on even
    // offsets, so the 65_536 boundary falls *between* two characters there. The '\n'
    // at byte 80_000 shifts the second line to an odd offset, so the 131_072 boundary
    // lands in the middle of a character on the SECOND line. That line is therefore the
    // one that proves the decoding is boundary-safe and must be asserted explicitly.
    const big = 'я'.repeat(40_000); // 80_000 bytes
    const p = await write('mb.jsonl', `${big}\n${big}\nМАРКЕР-Ω\n`);
    const lines = await collect(p);
    expect(lines).toHaveLength(3);
    expect(lines[0]).toBe(big);
    // Contains the character split across the chunk boundary; would hold U+FFFD
    // replacement characters if chunks were decoded independently.
    expect(lines[1]).toBe(big);
    // The marker line after the split must still arrive intact.
    expect(lines[2]).toBe('МАРКЕР-Ω');
  });
});
|