perf: replace readline with a bounded chunked read in the affinity head scan
fileBelongsToTeam streamed the head window via createReadStream + readline. readline's line iterator runs an expensive Unicode line-break regex and stream/string-decoder machinery per chunk, which showed up as a top main-thread cost during launch (the line- split regex alone was ~5.7% in the warm launch profile). Replace it with a bounded chunked fs.read + a plain '\n' split. JSONL is strictly newline-delimited and each line is trim()'d (so a trailing CR from CRLF is dropped), so a '\n' split is cheaper and more correct (it will not split on a bare CR or a Unicode line/paragraph separator inside a JSON string value, which readline would). A StringDecoder preserves multi-byte UTF-8 sequences that straddle a chunk boundary. Byte-identical semantics to the old loop: inspect up to TEAM_AFFINITY_SCAN_LINES non-empty lines, first match wins via early break, and a final line is honored even without a trailing newline. Reads in 64KB chunks so a team decided in its first lines is not penalized by a huge file. Adds tests for CRLF endings + no-trailing-newline, a multi-byte char straddling the 64KB boundary, and the 40-line window bound (21 pass).
This commit is contained in:
parent
5c1d2e8d92
commit
f0797e2c12
2 changed files with 130 additions and 34 deletions
|
|
@ -8,10 +8,10 @@ import {
|
|||
} from '@main/utils/pathDecoder';
|
||||
import { isLeadMember } from '@shared/utils/leadDetection';
|
||||
import { createLogger } from '@shared/utils/logger';
|
||||
import { createReadStream, type Dirent } from 'fs';
|
||||
import { type Dirent } from 'fs';
|
||||
import * as fs from 'fs/promises';
|
||||
import * as path from 'path';
|
||||
import * as readline from 'readline';
|
||||
import { StringDecoder } from 'string_decoder';
|
||||
|
||||
import { TeamConfigReader } from './TeamConfigReader';
|
||||
|
||||
|
|
@ -21,6 +21,10 @@ const logger = createLogger('Service:TeamTranscriptProjectResolver');
|
|||
|
||||
const SESSION_DISCOVERY_CACHE_TTL = 30_000;
|
||||
const TEAM_AFFINITY_SCAN_LINES = 40;
|
||||
// Read size for the head-window affinity scan. Read in chunks (not the whole file)
|
||||
// so a transcript whose head holds the team's first TEAM_AFFINITY_SCAN_LINES lines
|
||||
// is decided after reading just those, not the entire (possibly huge) file.
|
||||
const TEAM_AFFINITY_READ_CHUNK_BYTES = 64 * 1024;
|
||||
const TEAM_AFFINITY_FILE_CACHE_MAX_ENTRIES = 4_096;
|
||||
const ROOT_DISCOVERY_CONCURRENCY = 12;
|
||||
const FAST_CONTEXT_ROOT_DISCOVERY_MTIME_GRACE_MS = 24 * 60 * 60_000;
|
||||
|
|
@ -1066,49 +1070,83 @@ export class TeamTranscriptProjectResolver {
|
|||
}
|
||||
}
|
||||
|
||||
const stream = createReadStream(filePath, { encoding: 'utf8' });
|
||||
const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
|
||||
// Read the head window with a bounded chunked read plus a plain newline split
|
||||
// instead of readline. readline's async line iterator runs an expensive Unicode
|
||||
// line-break regex and stream/string-decoder machinery per chunk, which showed up
|
||||
// as a top main-thread cost during launch. JSONL is strictly newline-delimited and
|
||||
// each line is trim()'d (so a trailing CR from a CRLF ending is dropped), so a plain
|
||||
// newline split is both cheaper and more correct here: it will not split on a bare
|
||||
// CR or a Unicode line/paragraph separator that appears inside a JSON string value.
|
||||
// A StringDecoder preserves multi-byte UTF-8 sequences that straddle a chunk
|
||||
// boundary. Semantics are byte-identical to the old readline loop: inspect up to
|
||||
// TEAM_AFFINITY_SCAN_LINES non-empty lines, first match wins via early break, and a
|
||||
// final line is honored even without a trailing newline.
|
||||
let belongsToTeam = false;
|
||||
let inspected = 0;
|
||||
|
||||
const inspectHeadLine = (rawLine: string): boolean => {
|
||||
const trimmed = rawLine.trim();
|
||||
if (!trimmed) {
|
||||
return false;
|
||||
}
|
||||
inspected += 1;
|
||||
try {
|
||||
const entry = JSON.parse(trimmed) as Record<string, unknown>;
|
||||
const directTeamName = extractDirectTeamName(entry);
|
||||
if (directTeamName === normalizedTeam) {
|
||||
belongsToTeam = true;
|
||||
return true;
|
||||
}
|
||||
if (entryContainsNestedTeamName(entry, normalizedTeam)) {
|
||||
belongsToTeam = true;
|
||||
return true;
|
||||
}
|
||||
const textContent = extractTextContent(entry);
|
||||
if (textContent && lineMentionsTeam(textContent, normalizedTeam)) {
|
||||
belongsToTeam = true;
|
||||
return true;
|
||||
}
|
||||
} catch {
|
||||
// ignore malformed head lines
|
||||
}
|
||||
return inspected >= TEAM_AFFINITY_SCAN_LINES;
|
||||
};
|
||||
|
||||
let handle: fs.FileHandle | null = null;
|
||||
try {
|
||||
for await (const line of rl) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) {
|
||||
continue;
|
||||
}
|
||||
|
||||
inspected += 1;
|
||||
try {
|
||||
const entry = JSON.parse(trimmed) as Record<string, unknown>;
|
||||
const directTeamName = extractDirectTeamName(entry);
|
||||
if (directTeamName === normalizedTeam) {
|
||||
belongsToTeam = true;
|
||||
break;
|
||||
handle = await fs.open(filePath, 'r');
|
||||
const decoder = new StringDecoder('utf8');
|
||||
const chunk = Buffer.allocUnsafe(TEAM_AFFINITY_READ_CHUNK_BYTES);
|
||||
let pending = '';
|
||||
let position = 0;
|
||||
let stop = false;
|
||||
while (!stop) {
|
||||
const { bytesRead } = await handle.read(chunk, 0, chunk.length, position);
|
||||
if (bytesRead <= 0) {
|
||||
// EOF: flush the decoder and honor a final line with no trailing newline.
|
||||
pending += decoder.end();
|
||||
if (pending.length > 0) {
|
||||
inspectHeadLine(pending);
|
||||
}
|
||||
if (entryContainsNestedTeamName(entry, normalizedTeam)) {
|
||||
belongsToTeam = true;
|
||||
break;
|
||||
}
|
||||
|
||||
const textContent = extractTextContent(entry);
|
||||
if (textContent && lineMentionsTeam(textContent, normalizedTeam)) {
|
||||
belongsToTeam = true;
|
||||
break;
|
||||
}
|
||||
} catch {
|
||||
// ignore malformed head lines
|
||||
}
|
||||
|
||||
if (inspected >= TEAM_AFFINITY_SCAN_LINES) {
|
||||
break;
|
||||
}
|
||||
position += bytesRead;
|
||||
pending += decoder.write(chunk.subarray(0, bytesRead));
|
||||
let newlineIndex = pending.indexOf('\n');
|
||||
while (newlineIndex !== -1) {
|
||||
const line = pending.slice(0, newlineIndex);
|
||||
pending = pending.slice(newlineIndex + 1);
|
||||
if (inspectHeadLine(line)) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
newlineIndex = pending.indexOf('\n');
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
return false;
|
||||
} finally {
|
||||
rl.close();
|
||||
stream.destroy();
|
||||
await handle?.close().catch(() => undefined);
|
||||
}
|
||||
|
||||
this.setTeamAffinityFileCacheEntry(cacheKey, {
|
||||
|
|
|
|||
|
|
@ -749,4 +749,62 @@ describe('TeamTranscriptProjectResolver', () => {
|
|||
expect(entry?.size).toBe(999_999); // cache recorded the precomputed stat -> no re-stat
|
||||
expect(entry?.mtimeMs).toBe(123_456);
|
||||
});
|
||||
|
||||
// The head-window scan reads chunks + splits on '\n' (not readline). These lock the
|
||||
// byte-exact equivalence: CRLF endings, a final line with no trailing newline, a
|
||||
// multi-byte char straddling the 64KB read boundary, and the 40-line window bound.
|
||||
const teamTextLine = (team: string) =>
|
||||
JSON.stringify({
|
||||
type: 'user',
|
||||
message: { role: 'user', content: [{ type: 'text', text: `Team name: ${team}` }] },
|
||||
});
|
||||
const noiseLine = (i: number) =>
|
||||
JSON.stringify({ type: 'user', message: { role: 'user', content: `noise ${i}` } });
|
||||
|
||||
it('matches with CRLF line endings and a final line that has no trailing newline', async () => {
|
||||
await setupClaudeRoot();
|
||||
const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
|
||||
const team = 'crlf-team';
|
||||
const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/crlf'));
|
||||
await fs.mkdir(projectDir, { recursive: true });
|
||||
const jsonlPath = path.join(projectDir, 'c.jsonl');
|
||||
// CRLF separators; the matching line is last and has NO trailing newline.
|
||||
await fs.writeFile(
|
||||
jsonlPath,
|
||||
`${noiseLine(0)}\r\n${noiseLine(1)}\r\n${teamTextLine(team)}`,
|
||||
'utf8'
|
||||
);
|
||||
expect(await resolver.fileBelongsToTeam(jsonlPath, team)).toBe(true);
|
||||
});
|
||||
|
||||
it('matches a team mention located past the 64KB read boundary with multi-byte content', async () => {
|
||||
await setupClaudeRoot();
|
||||
const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
|
||||
const team = 'boundary-team';
|
||||
const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/mb'));
|
||||
await fs.mkdir(projectDir, { recursive: true });
|
||||
const jsonlPath = path.join(projectDir, 'mb.jsonl');
|
||||
// ~40KB of 2-byte Cyrillic per line: the first two lines (~80KB) push the matching
|
||||
// third line past the 64KB read chunk and force a multi-byte char to straddle the
|
||||
// chunk boundary, which the StringDecoder must stitch back together.
|
||||
const big = 'я'.repeat(20_000);
|
||||
const heavy = (i: number) =>
|
||||
JSON.stringify({ type: 'user', message: { role: 'user', content: `${big} ${i}` } });
|
||||
await fs.writeFile(jsonlPath, `${heavy(0)}\n${heavy(1)}\n${teamTextLine(team)}\n`, 'utf8');
|
||||
expect(await resolver.fileBelongsToTeam(jsonlPath, team)).toBe(true);
|
||||
});
|
||||
|
||||
it('ignores a team mention that appears only after the 40-line head window', async () => {
|
||||
await setupClaudeRoot();
|
||||
const resolver = new TeamTranscriptProjectResolver() as unknown as ResolverProbe;
|
||||
const team = 'late-team';
|
||||
const projectDir = path.join(tmpDir!, 'projects', encodePath('/repo/late'));
|
||||
await fs.mkdir(projectDir, { recursive: true });
|
||||
const jsonlPath = path.join(projectDir, 'late.jsonl');
|
||||
// 40 non-matching lines fill the head window; the mention is on line 41.
|
||||
const lines = Array.from({ length: 40 }, (_, i) => noiseLine(i));
|
||||
lines.push(teamTextLine(team));
|
||||
await fs.writeFile(jsonlPath, `${lines.join('\n')}\n`, 'utf8');
|
||||
expect(await resolver.fileBelongsToTeam(jsonlPath, team)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Reference in a new issue