perf(main): cache runtime resource telemetry

This commit is contained in:
777genius 2026-05-30 23:59:56 +03:00
parent 172ddad18a
commit e90886316f
3 changed files with 61 additions and 18 deletions

View file

@ -39,16 +39,13 @@ export interface RuntimeProcessTableRow {
* from the large Electron main process. Runtime liveness/telemetry callers fire
* very frequently (every team file change invalidates their per-team snapshot
* caches), so without throttling here the main process spawns `ps` dozens of
* times per second while a team runs. Those callers already tolerate ~2s
* staleness via their own snapshot caches (AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS),
* so caching the table for a SHORTER window than the consumers read it just
* re-spawns `ps` on every consumer rebuild for no freshness benefit. Match the
* 2s consumer window to collapse those redundant spawns: liveness verdicts are
* identity- (team+agent+command) not bare-PID matched, and OpenCode host cleanup
* re-validates each PID against live state before acting, so a ~2s-stale table
* cannot cause a wrong liveness call or an unsafe kill.
* times per second while a team runs. Runtime liveness can tolerate a small
* delay because verdicts are identity- (team+agent+command) not bare-PID
* matched, and OpenCode host cleanup re-validates each PID against live state
* before acting. Keep this cache long enough to collapse bursts from concurrent
* team refreshes, but short enough that stale "alive" UI is brief.
*/
const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 2_000;
const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 10_000;
interface RuntimeProcessTableCacheEntry {
rows: RuntimeProcessTableRow[];

View file

@ -3347,6 +3347,9 @@ export class TeamProvisioningService {
private static readonly SAME_TEAM_PERSIST_RETRY_MS = 2_000;
private static readonly AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 2_000;
private static readonly PERSISTED_AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 10_000;
private static readonly RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS = 60_000;
private static readonly RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS = 10_000;
private static readonly RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS = 30_000;
private static readonly AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT = 60;
private static readonly BOOTSTRAP_TRANSCRIPT_OUTCOME_CACHE_MAX_ENTRIES = 2_048;
private static readonly PERSISTED_BOOTSTRAP_TRANSCRIPT_OUTCOME_LOOKUP_CACHE_TTL_MS = 10_000;
@ -3773,10 +3776,10 @@ export class TeamProvisioningService {
this.agentRuntimeSnapshotInFlightByTeam.delete(teamName);
this.liveTeamAgentRuntimeMetadataCache.delete(teamName);
this.liveTeamAgentRuntimeMetadataInFlightByTeam.delete(teamName);
this.runtimeProcessRowsForUsageSnapshotByTeam.delete(teamName);
this.persistedTeamConfigCache.delete(teamName);
// CPU/RSS samples are TTL-bound and do not decide liveness; keeping them
// avoids repeated pidusage forks when launch-state churn invalidates snapshots.
// CPU/RSS telemetry is TTL-bound and does not decide liveness. Keep the
// process table cache across noisy runtime snapshot invalidations so UI
// refreshes do not respawn `ps` just to repaint resource badges.
}
private cloneMemberSpawnStatusesSnapshot(
@ -25448,7 +25451,11 @@ export class TeamProvisioningService {
}
if (processRowsReadForMetadata) {
this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, {
expiresAtMs: Date.now() + this.getAgentRuntimeSnapshotCacheTtlMs(teamName, runId),
expiresAtMs:
Date.now() +
(processTableAvailable
? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS
: TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS),
generation: generationAtStart,
runId,
rows: processTableAvailable ? processRows : null,
@ -25732,6 +25739,17 @@ export class TeamProvisioningService {
...(params.pid ? { pid: params.pid } : {}),
...(params.runtimePid ? { runtimePid: params.runtimePid } : {}),
};
const lastSample = existingHistory.at(-1);
const lastSampleMs = lastSample ? Date.parse(lastSample.timestamp) : Number.NaN;
const sampleMs = Date.parse(sample.timestamp);
const sampledRecently =
Number.isFinite(lastSampleMs) &&
Number.isFinite(sampleMs) &&
sampleMs - lastSampleMs >= 0 &&
sampleMs - lastSampleMs < TeamProvisioningService.RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS;
if (sampledRecently) {
return existingHistory.map((entry) => ({ ...entry }));
}
const nextHistory = [...existingHistory, sample].slice(
-TeamProvisioningService.AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT
);
@ -25883,10 +25901,7 @@ export class TeamProvisioningService {
process.platform === 'win32' && options.includeWindowsHostRows === true;
const cached = this.runtimeProcessRowsForUsageSnapshotByTeam.get(teamName);
const canUseCached =
cached &&
cached.expiresAtMs > Date.now() &&
cached.generation === this.getRuntimeSnapshotCacheGeneration(teamName) &&
cached.runId === this.getTrackedRunId(teamName);
cached && cached.expiresAtMs > Date.now() && cached.runId === this.getTrackedRunId(teamName);
if (canUseCached && (!includeWindowsHostRows || cached.includesWindowsHostRows)) {
return cached.rows;
}
@ -25945,7 +25960,9 @@ export class TeamProvisioningService {
this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, {
expiresAtMs:
Date.now() +
this.getAgentRuntimeSnapshotCacheTtlMs(teamName, this.getTrackedRunId(teamName)),
(resultRows === null
? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS
: TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS),
generation: this.getRuntimeSnapshotCacheGeneration(teamName),
runId: this.getTrackedRunId(teamName),
rows: resultRows,

View file

@ -657,6 +657,9 @@ type TeamProvisioningServicePrivateHarness = {
readProcessUsageStatsByPid: (
pids: readonly number[]
) => Promise<Map<number, { rssBytes?: number; cpuPercent?: number }>>;
readRuntimeProcessRowsForUsageSnapshot: (teamName: string) => Promise<unknown[] | null>;
invalidateRuntimeSnapshotCaches: (teamName: string) => void;
aliveRunByTeam: Map<string, string>;
readRecentBootstrapTranscriptOutcome: (
filePath: string,
sinceMs: number | null,
@ -3735,6 +3738,32 @@ describe('TeamProvisioningService', () => {
});
});
// Verifies that the per-team runtime process rows used for CPU/RSS telemetry
// survive `invalidateRuntimeSnapshotCaches`: a second read 5s after the first
// must be served from cache instead of listing the process table again.
it('keeps cached runtime resource process rows across snapshot invalidations', async () => {
  vi.useFakeTimers();
  // Restore real timers even when an await rejects or an expectation fails;
  // otherwise fake timers and the pinned clock leak into later tests.
  try {
    vi.setSystemTime(new Date('2026-05-03T12:00:00.000Z'));
    const svc = new TeamProvisioningService();
    const harness = privateHarness(svc);
    harness.aliveRunByTeam.set('runtime-team', 'run-1');
    // Only one process-table result is queued, so a second listing would be
    // visible both in the call count and in the returned rows.
    vi.mocked(listRuntimeProcessTableForCurrentPlatform).mockResolvedValueOnce([
      {
        pid: 111,
        ppid: 1,
        command: '/usr/bin/node lead.js',
        cpuPercent: 3.5,
        rssBytes: 123_000_000,
      },
    ]);

    const firstRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team');
    harness.invalidateRuntimeSnapshotCaches('runtime-team');
    // Advance the clock by 5s before re-reading, while keeping the same run id.
    vi.setSystemTime(new Date('2026-05-03T12:00:05.000Z'));
    const secondRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team');

    expect(listRuntimeProcessTableForCurrentPlatform).toHaveBeenCalledTimes(1);
    expect(secondRows).toEqual(firstRows);
  } finally {
    vi.useRealTimers();
  }
});
it('skips pidusage by default when process table metrics are missing', async () => {
await withRuntimePidusageTelemetryEnv(undefined, async () => {
const svc = new TeamProvisioningService();