perf(main): cache runtime resource telemetry
This commit is contained in:
parent
172ddad18a
commit
e90886316f
3 changed files with 61 additions and 18 deletions
|
|
@ -39,16 +39,13 @@ export interface RuntimeProcessTableRow {
|
|||
* from the large Electron main process. Runtime liveness/telemetry callers fire
|
||||
* very frequently (every team file change invalidates their per-team snapshot
|
||||
* caches), so without throttling here the main process spawns `ps` dozens of
|
||||
* times per second while a team runs. Those callers already tolerate ~2s
|
||||
* staleness via their own snapshot caches (AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS),
|
||||
* so caching the table for a SHORTER window than the consumers read it just
|
||||
* re-spawns `ps` on every consumer rebuild for no freshness benefit. Match the
|
||||
* 2s consumer window to collapse those redundant spawns: liveness verdicts are
|
||||
* identity- (team+agent+command) not bare-PID matched, and OpenCode host cleanup
|
||||
* re-validates each PID against live state before acting, so a ~2s-stale table
|
||||
* cannot cause a wrong liveness call or an unsafe kill.
|
||||
* times per second while a team runs. Runtime liveness can tolerate a small
|
||||
* delay because verdicts match on identity (team+agent+command), not on a bare
|
||||
* PID, and OpenCode host cleanup re-validates each PID against live state
|
||||
* before acting. Keep this cache long enough to collapse bursts from concurrent
|
||||
* team refreshes, but short enough that stale "alive" UI is brief.
|
||||
*/
|
||||
const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 2_000;
|
||||
const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 10_000;
|
||||
|
||||
interface RuntimeProcessTableCacheEntry {
|
||||
rows: RuntimeProcessTableRow[];
|
||||
|
|
|
|||
|
|
@ -3347,6 +3347,9 @@ export class TeamProvisioningService {
|
|||
private static readonly SAME_TEAM_PERSIST_RETRY_MS = 2_000;
|
||||
private static readonly AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 2_000;
|
||||
private static readonly PERSISTED_AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 10_000;
|
||||
private static readonly RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS = 60_000;
|
||||
private static readonly RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS = 10_000;
|
||||
private static readonly RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS = 30_000;
|
||||
private static readonly AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT = 60;
|
||||
private static readonly BOOTSTRAP_TRANSCRIPT_OUTCOME_CACHE_MAX_ENTRIES = 2_048;
|
||||
private static readonly PERSISTED_BOOTSTRAP_TRANSCRIPT_OUTCOME_LOOKUP_CACHE_TTL_MS = 10_000;
|
||||
|
|
@ -3773,10 +3776,10 @@ export class TeamProvisioningService {
|
|||
this.agentRuntimeSnapshotInFlightByTeam.delete(teamName);
|
||||
this.liveTeamAgentRuntimeMetadataCache.delete(teamName);
|
||||
this.liveTeamAgentRuntimeMetadataInFlightByTeam.delete(teamName);
|
||||
this.runtimeProcessRowsForUsageSnapshotByTeam.delete(teamName);
|
||||
this.persistedTeamConfigCache.delete(teamName);
|
||||
// CPU/RSS samples are TTL-bound and do not decide liveness; keeping them
|
||||
// avoids repeated pidusage forks when launch-state churn invalidates snapshots.
|
||||
// CPU/RSS telemetry is TTL-bound and does not decide liveness. Keep the
|
||||
// process table cache across noisy runtime snapshot invalidations so UI
|
||||
// refreshes do not respawn `ps` just to repaint resource badges.
|
||||
}
|
||||
|
||||
private cloneMemberSpawnStatusesSnapshot(
|
||||
|
|
@ -25448,7 +25451,11 @@ export class TeamProvisioningService {
|
|||
}
|
||||
if (processRowsReadForMetadata) {
|
||||
this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, {
|
||||
expiresAtMs: Date.now() + this.getAgentRuntimeSnapshotCacheTtlMs(teamName, runId),
|
||||
expiresAtMs:
|
||||
Date.now() +
|
||||
(processTableAvailable
|
||||
? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS
|
||||
: TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS),
|
||||
generation: generationAtStart,
|
||||
runId,
|
||||
rows: processTableAvailable ? processRows : null,
|
||||
|
|
@ -25732,6 +25739,17 @@ export class TeamProvisioningService {
|
|||
...(params.pid ? { pid: params.pid } : {}),
|
||||
...(params.runtimePid ? { runtimePid: params.runtimePid } : {}),
|
||||
};
|
||||
const lastSample = existingHistory.at(-1);
|
||||
const lastSampleMs = lastSample ? Date.parse(lastSample.timestamp) : Number.NaN;
|
||||
const sampleMs = Date.parse(sample.timestamp);
|
||||
const sampledRecently =
|
||||
Number.isFinite(lastSampleMs) &&
|
||||
Number.isFinite(sampleMs) &&
|
||||
sampleMs - lastSampleMs >= 0 &&
|
||||
sampleMs - lastSampleMs < TeamProvisioningService.RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS;
|
||||
if (sampledRecently) {
|
||||
return existingHistory.map((entry) => ({ ...entry }));
|
||||
}
|
||||
const nextHistory = [...existingHistory, sample].slice(
|
||||
-TeamProvisioningService.AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT
|
||||
);
|
||||
|
|
@ -25883,10 +25901,7 @@ export class TeamProvisioningService {
|
|||
process.platform === 'win32' && options.includeWindowsHostRows === true;
|
||||
const cached = this.runtimeProcessRowsForUsageSnapshotByTeam.get(teamName);
|
||||
const canUseCached =
|
||||
cached &&
|
||||
cached.expiresAtMs > Date.now() &&
|
||||
cached.generation === this.getRuntimeSnapshotCacheGeneration(teamName) &&
|
||||
cached.runId === this.getTrackedRunId(teamName);
|
||||
cached && cached.expiresAtMs > Date.now() && cached.runId === this.getTrackedRunId(teamName);
|
||||
if (canUseCached && (!includeWindowsHostRows || cached.includesWindowsHostRows)) {
|
||||
return cached.rows;
|
||||
}
|
||||
|
|
@ -25945,7 +25960,9 @@ export class TeamProvisioningService {
|
|||
this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, {
|
||||
expiresAtMs:
|
||||
Date.now() +
|
||||
this.getAgentRuntimeSnapshotCacheTtlMs(teamName, this.getTrackedRunId(teamName)),
|
||||
(resultRows === null
|
||||
? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS
|
||||
: TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS),
|
||||
generation: this.getRuntimeSnapshotCacheGeneration(teamName),
|
||||
runId: this.getTrackedRunId(teamName),
|
||||
rows: resultRows,
|
||||
|
|
|
|||
|
|
@ -657,6 +657,9 @@ type TeamProvisioningServicePrivateHarness = {
|
|||
readProcessUsageStatsByPid: (
|
||||
pids: readonly number[]
|
||||
) => Promise<Map<number, { rssBytes?: number; cpuPercent?: number }>>;
|
||||
readRuntimeProcessRowsForUsageSnapshot: (teamName: string) => Promise<unknown[] | null>;
|
||||
invalidateRuntimeSnapshotCaches: (teamName: string) => void;
|
||||
aliveRunByTeam: Map<string, string>;
|
||||
readRecentBootstrapTranscriptOutcome: (
|
||||
filePath: string,
|
||||
sinceMs: number | null,
|
||||
|
|
@ -3735,6 +3738,32 @@ describe('TeamProvisioningService', () => {
|
|||
});
|
||||
});
|
||||
|
||||
// Regression test: invalidating a team's runtime snapshot caches must not
// discard the cached process rows used for resource telemetry, so a second
// read shortly afterwards does not list the process table again.
it('keeps cached runtime resource process rows across snapshot invalidations', async () => {
  vi.useFakeTimers();
  // Restore real timers in `finally` so a failed assertion or rejected await
  // cannot leave fake timers installed for the tests that run after this one.
  try {
    vi.setSystemTime(new Date('2026-05-03T12:00:00.000Z'));
    const svc = new TeamProvisioningService();
    const harness = privateHarness(svc);
    harness.aliveRunByTeam.set('runtime-team', 'run-1');
    // Only one process-table result is queued; a second listing would be
    // caught by the call-count assertion below.
    vi.mocked(listRuntimeProcessTableForCurrentPlatform).mockResolvedValueOnce([
      {
        pid: 111,
        ppid: 1,
        command: '/usr/bin/node lead.js',
        cpuPercent: 3.5,
        rssBytes: 123_000_000,
      },
    ]);

    const firstRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team');
    harness.invalidateRuntimeSnapshotCaches('runtime-team');
    // Advance the clock by 5 seconds before reading again.
    vi.setSystemTime(new Date('2026-05-03T12:00:05.000Z'));
    const secondRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team');

    expect(listRuntimeProcessTableForCurrentPlatform).toHaveBeenCalledTimes(1);
    expect(secondRows).toEqual(firstRows);
  } finally {
    vi.useRealTimers();
  }
});
|
||||
|
||||
it('skips pidusage by default when process table metrics are missing', async () => {
|
||||
await withRuntimePidusageTelemetryEnv(undefined, async () => {
|
||||
const svc = new TeamProvisioningService();
|
||||
|
|
|
|||
Loading…
Reference in a new issue