perf: replace readline with a bounded chunked read in the affinity head scan

fileBelongsToTeam streamed the head window via createReadStream + readline. readline's
line iterator runs an expensive Unicode line-break regex and stream/string-decoder
machinery per chunk, which showed up as a top main-thread cost during launch (the line-
split regex alone was ~5.7% in the warm launch profile).

Replace it with a bounded chunked fs.read + a plain '\n' split. JSONL is strictly
newline-delimited and each line is trim()'d (so a trailing CR from CRLF is dropped),
so a '\n' split is cheaper and more correct (it will not split on a bare CR or a
Unicode line/paragraph separator inside a JSON string value, which readline would). A
StringDecoder preserves multi-byte UTF-8 sequences that straddle a chunk boundary.

Same scan semantics as the old loop for newline-delimited input: inspect up to
TEAM_AFFINITY_SCAN_LINES non-empty lines, first match wins via early break, and a final
line is honored even without a trailing newline. The only intended behavioral difference
is the line-splitting rule described above. Reads in 64KB chunks so a file whose affinity
is decided in its first lines does not pay for its full size. Adds tests for CRLF endings +
no-trailing-newline, multi-byte content crossing the 64KB boundary, and the 40-line window
bound (21 pass).
This commit is contained in:
777genius 2026-05-30 14:54:58 +03:00
parent 5c1d2e8d92
commit f0797e2c12
2 changed files with 130 additions and 34 deletions

View file

@ -8,10 +8,10 @@ import {
} from '@main/utils/pathDecoder';
import { isLeadMember } from '@shared/utils/leadDetection';
import { createLogger } from '@shared/utils/logger';
import { createReadStream, type Dirent } from 'fs';
import { type Dirent } from 'fs';
import * as fs from 'fs/promises';
import * as path from 'path';
import * as readline from 'readline';
import { StringDecoder } from 'string_decoder';
import { TeamConfigReader } from './TeamConfigReader';
@ -21,6 +21,10 @@ const logger = createLogger('Service:TeamTranscriptProjectResolver');
const SESSION_DISCOVERY_CACHE_TTL = 30_000;
const TEAM_AFFINITY_SCAN_LINES = 40;
// Chunk size for the head-window affinity scan. The file is read chunk by chunk (not
// as a whole) so a transcript that is decided within its first TEAM_AFFINITY_SCAN_LINES
// lines only costs the chunks holding those lines, not the entire (possibly huge) file.
const TEAM_AFFINITY_READ_CHUNK_BYTES = 64 * 1024;
const TEAM_AFFINITY_FILE_CACHE_MAX_ENTRIES = 4_096;
const ROOT_DISCOVERY_CONCURRENCY = 12;
const FAST_CONTEXT_ROOT_DISCOVERY_MTIME_GRACE_MS = 24 * 60 * 60_000;
@ -1066,49 +1070,83 @@ export class TeamTranscriptProjectResolver {
}
}
const stream = createReadStream(filePath, { encoding: 'utf8' });
const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
// Read the head window with a bounded chunked read plus a plain newline split
// instead of readline. readline's async line iterator runs an expensive Unicode
// line-break regex and stream/string-decoder machinery per chunk, which showed up
// as a top main-thread cost during launch. JSONL is strictly newline-delimited and
// each line is trim()'d (so a trailing CR from a CRLF ending is dropped), so a plain
// newline split is both cheaper and more correct here: it will not split on a bare
// CR or a Unicode line/paragraph separator that appears inside a JSON string value.
// A StringDecoder preserves multi-byte UTF-8 sequences that straddle a chunk
// boundary. Apart from that line-splitting rule, the scan keeps the semantics of the
// old readline loop: inspect up to TEAM_AFFINITY_SCAN_LINES non-empty lines, first
// match wins via early break, and a final line is honored even without a trailing newline.
let belongsToTeam = false;
let inspected = 0;
/**
 * Examines one raw head line and reports whether the scan should stop.
 *
 * Blank lines (after trimming) are skipped without consuming the scan budget.
 * Every other line counts toward TEAM_AFFINITY_SCAN_LINES, including lines that
 * fail to parse. On a match `belongsToTeam` is set and `true` is returned.
 *
 * @param rawLine One line of the file, without its '\n' terminator.
 * @returns `true` when a match was found or the line budget is used up.
 */
const inspectHeadLine = (rawLine: string): boolean => {
  const candidate = rawLine.trim();
  if (candidate.length === 0) {
    return false;
  }
  inspected += 1;
  try {
    const entry = JSON.parse(candidate) as Record<string, unknown>;
    // Checks run in order from the direct field to nested fields; the text
    // content is only extracted when neither structural check matched.
    const namedDirectly = extractDirectTeamName(entry) === normalizedTeam;
    if (namedDirectly || entryContainsNestedTeamName(entry, normalizedTeam)) {
      belongsToTeam = true;
      return true;
    }
    const text = extractTextContent(entry);
    if (text && lineMentionsTeam(text, normalizedTeam)) {
      belongsToTeam = true;
      return true;
    }
  } catch {
    // ignore malformed head lines
  }
  // No match on this line: stop only once the head window is exhausted.
  return inspected >= TEAM_AFFINITY_SCAN_LINES;
};
let handle: fs.FileHandle | null = null;
try {
for await (const line of rl) {
const trimmed = line.trim();
if (!trimmed) {
continue;
}
inspected += 1;
try {
const entry = JSON.parse(trimmed) as Record<string, unknown>;
const directTeamName = extractDirectTeamName(entry);
if (directTeamName === normalizedTeam) {
belongsToTeam = true;
break;
handle = await fs.open(filePath, 'r');
const decoder = new StringDecoder('utf8');
const chunk = Buffer.allocUnsafe(TEAM_AFFINITY_READ_CHUNK_BYTES);
let pending = '';
let position = 0;
let stop = false;
while (!stop) {
const { bytesRead } = await handle.read(chunk, 0, chunk.length, position);
if (bytesRead <= 0) {
// EOF: flush the decoder and honor a final line with no trailing newline.
pending += decoder.end();
if (pending.length > 0) {
inspectHeadLine(pending);
}
if (entryContainsNestedTeamName(entry, normalizedTeam)) {
belongsToTeam = true;
break;
}
const textContent = extractTextContent(entry);
if (textContent && lineMentionsTeam(textContent, normalizedTeam)) {
belongsToTeam = true;
break;
}
} catch {
// ignore malformed head lines
}
if (inspected >= TEAM_AFFINITY_SCAN_LINES) {
break;
}
position += bytesRead;
pending += decoder.write(chunk.subarray(0, bytesRead));
let newlineIndex = pending.indexOf('\n');
while (newlineIndex !== -1) {
const line = pending.slice(0, newlineIndex);
pending = pending.slice(newlineIndex + 1);
if (inspectHeadLine(line)) {
stop = true;
break;
}
newlineIndex = pending.indexOf('\n');
}
}
} catch {
return false;
} finally {
rl.close();
stream.destroy();
await handle?.close().catch(() => undefined);
}
this.setTeamAffinityFileCacheEntry(cacheKey, {

View file

@ -749,4 +749,62 @@ describe('TeamTranscriptProjectResolver', () => {
expect(entry?.size).toBe(999_999); // cache recorded the precomputed stat -> no re-stat
expect(entry?.mtimeMs).toBe(123_456);
});
// The head-window scan reads chunks + splits on '\n' (not readline). These pin the
// behavior the chunked reader must keep: CRLF endings, a final line with no trailing
// newline, multi-byte content crossing the 64KB read boundary, and the 40-line window bound.
// Serializes one JSONL user entry whose single text part reads "Team name: <team>".
const teamTextLine = (team: string) => {
  const textPart = { type: 'text', text: 'Team name: ' + team };
  const entry = { type: 'user', message: { role: 'user', content: [textPart] } };
  return JSON.stringify(entry);
};
// Serializes one JSONL user entry with plain string content that names no team.
const noiseLine = (i: number) => {
  const message = { role: 'user', content: `noise ${i}` };
  return JSON.stringify({ type: 'user', message });
};
it('matches with CRLF line endings and a final line that has no trailing newline', async () => {
  await setupClaudeRoot();
  const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
  const team = 'crlf-team';

  const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/crlf'));
  const jsonlPath = path.join(projectDir, 'c.jsonl');
  await fs.mkdir(projectDir, { recursive: true });

  // Lines are joined with CRLF; the matching entry comes last and is left
  // unterminated, so it is only seen if the reader flushes its tail at EOF.
  const body = [noiseLine(0), noiseLine(1), teamTextLine(team)].join('\r\n');
  await fs.writeFile(jsonlPath, body, 'utf8');

  const belongs = await resolver.fileBelongsToTeam(jsonlPath, team);
  expect(belongs).toBe(true);
});
it('matches a team mention located past the 64KB read boundary with multi-byte content', async () => {
  await setupClaudeRoot();
  const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
  const team = 'boundary-team';
  const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/mb'));
  await fs.mkdir(projectDir, { recursive: true });
  const jsonlPath = path.join(projectDir, 'mb.jsonl');
  // ~40KB of 2-byte Cyrillic per line: the first two lines (~80KB) push the matching
  // third line past the 64KB read chunk. Every byte before the Cyrillic run is ASCII,
  // so whether a character is split by the chunk boundary depends only on the parity
  // of the run's start offset. The second line therefore carries one extra ASCII byte
  // ('x') ahead of the run, which places a 2-byte character across bytes 65535/65536.
  const readChunkBytes = 64 * 1024;
  const big = 'я'.repeat(20_000);
  const heavy = (asciiLead: string) =>
    JSON.stringify({ type: 'user', message: { role: 'user', content: `${asciiLead}${big}` } });
  const payload = Buffer.from(`${heavy('')}\n${heavy('x')}\n${teamTextLine(team)}\n`, 'utf8');
  // Guard the fixture itself: the first byte of the second chunk must be a UTF-8
  // continuation byte (0b10xxxxxx), i.e. a character really is split at the boundary.
  // Without this, an edit to the fixture could silently realign it and stop
  // exercising the stitched-decode path.
  expect(payload.length).toBeGreaterThan(readChunkBytes);
  expect(payload.readUInt8(readChunkBytes) & 0xc0).toBe(0x80);
  await fs.writeFile(jsonlPath, payload);
  expect(await resolver.fileBelongsToTeam(jsonlPath, team)).toBe(true);
});
it('ignores a team mention that appears only after the 40-line head window', async () => {
  await setupClaudeRoot();
  const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
  const team = 'late-team';

  const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/late'));
  const jsonlPath = path.join(projectDir, 'late.jsonl');
  await fs.mkdir(projectDir, { recursive: true });

  // Forty non-matching entries use up the whole head window, so the team
  // mention on line 41 must never be inspected.
  const headNoise: string[] = [];
  for (let i = 0; i < 40; i += 1) {
    headNoise.push(noiseLine(i));
  }
  const fileBody = [...headNoise, teamTextLine(team)].map((line) => `${line}\n`).join('');
  await fs.writeFile(jsonlPath, fileBody, 'utf8');

  const belongs = await resolver.fileBelongsToTeam(jsonlPath, team);
  expect(belongs).toBe(false);
});
});