From e90886316f41913a1b62deceb2b845f12be5e15e Mon Sep 17 00:00:00 2001 From: 777genius Date: Sat, 30 May 2026 23:59:56 +0300 Subject: [PATCH] perf(main): cache runtime resource telemetry --- .../runtime/TmuxPlatformCommandExecutor.ts | 15 ++++---- .../services/team/TeamProvisioningService.ts | 35 ++++++++++++++----- .../team/TeamProvisioningService.test.ts | 29 +++++++++++++++ 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts b/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts index c0eae9c5..c7ef6b81 100644 --- a/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts +++ b/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts @@ -39,16 +39,13 @@ export interface RuntimeProcessTableRow { * from the large Electron main process. Runtime liveness/telemetry callers fire * very frequently (every team file change invalidates their per-team snapshot * caches), so without throttling here the main process spawns `ps` dozens of - * times per second while a team runs. Those callers already tolerate ~2s - * staleness via their own snapshot caches (AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS), - * so caching the table for a SHORTER window than the consumers read it just - * re-spawns `ps` on every consumer rebuild for no freshness benefit. Match the - * 2s consumer window to collapse those redundant spawns: liveness verdicts are - * identity- (team+agent+command) not bare-PID matched, and OpenCode host cleanup - * re-validates each PID against live state before acting, so a ~2s-stale table - * cannot cause a wrong liveness call or an unsafe kill. + * times per second while a team runs. Runtime liveness can tolerate a small + * delay because verdicts are identity- (team+agent+command) not bare-PID + * matched, and OpenCode host cleanup re-validates each PID against live state + * before acting. Keep this cache long enough to collapse bursts from concurrent + * team refreshes, but short enough that stale "alive" UI is brief. */ -const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 2_000; +const RUNTIME_PROCESS_TABLE_CACHE_TTL_MS = 10_000; interface RuntimeProcessTableCacheEntry { rows: RuntimeProcessTableRow[]; diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index 96c019a1..9e8f1ce2 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -3347,6 +3347,9 @@ export class TeamProvisioningService { private static readonly SAME_TEAM_PERSIST_RETRY_MS = 2_000; private static readonly AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 2_000; private static readonly PERSISTED_AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS = 10_000; + private static readonly RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS = 60_000; + private static readonly RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS = 10_000; + private static readonly RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS = 30_000; private static readonly AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT = 60; private static readonly BOOTSTRAP_TRANSCRIPT_OUTCOME_CACHE_MAX_ENTRIES = 2_048; private static readonly PERSISTED_BOOTSTRAP_TRANSCRIPT_OUTCOME_LOOKUP_CACHE_TTL_MS = 10_000; @@ -3773,10 +3776,10 @@ export class TeamProvisioningService { this.agentRuntimeSnapshotInFlightByTeam.delete(teamName); this.liveTeamAgentRuntimeMetadataCache.delete(teamName); this.liveTeamAgentRuntimeMetadataInFlightByTeam.delete(teamName); - this.runtimeProcessRowsForUsageSnapshotByTeam.delete(teamName); this.persistedTeamConfigCache.delete(teamName); - // CPU/RSS samples are TTL-bound and do not decide liveness; keeping them - // avoids repeated pidusage forks when launch-state churn invalidates snapshots. + // CPU/RSS telemetry is TTL-bound and does not decide liveness. Keep the + // process table cache across noisy runtime snapshot invalidations so UI + // refreshes do not respawn `ps` just to repaint resource badges. } private cloneMemberSpawnStatusesSnapshot( @@ -25448,7 +25451,11 @@ export class TeamProvisioningService { } if (processRowsReadForMetadata) { this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, { - expiresAtMs: Date.now() + this.getAgentRuntimeSnapshotCacheTtlMs(teamName, runId), + expiresAtMs: + Date.now() + + (processTableAvailable + ? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS + : TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS), generation: generationAtStart, runId, rows: processTableAvailable ? processRows : null, @@ -25732,6 +25739,17 @@ export class TeamProvisioningService { ...(params.pid ? { pid: params.pid } : {}), ...(params.runtimePid ? { runtimePid: params.runtimePid } : {}), }; + const lastSample = existingHistory.at(-1); + const lastSampleMs = lastSample ? Date.parse(lastSample.timestamp) : Number.NaN; + const sampleMs = Date.parse(sample.timestamp); + const sampledRecently = + Number.isFinite(lastSampleMs) && + Number.isFinite(sampleMs) && + sampleMs - lastSampleMs >= 0 && + sampleMs - lastSampleMs < TeamProvisioningService.RUNTIME_RESOURCE_SAMPLE_MIN_INTERVAL_MS; + if (sampledRecently) { + return existingHistory.map((entry) => ({ ...entry })); + } const nextHistory = [...existingHistory, sample].slice( -TeamProvisioningService.AGENT_RUNTIME_RESOURCE_HISTORY_LIMIT ); @@ -25883,10 +25901,7 @@ export class TeamProvisioningService { process.platform === 'win32' && options.includeWindowsHostRows === true; const cached = this.runtimeProcessRowsForUsageSnapshotByTeam.get(teamName); const canUseCached = - cached && - cached.expiresAtMs > Date.now() && - cached.generation === this.getRuntimeSnapshotCacheGeneration(teamName) && - cached.runId === this.getTrackedRunId(teamName); + cached && cached.expiresAtMs > Date.now() && cached.runId === this.getTrackedRunId(teamName); if (canUseCached && (!includeWindowsHostRows || cached.includesWindowsHostRows)) { return cached.rows; } @@ -25945,7 +25960,9 @@ export class TeamProvisioningService { this.runtimeProcessRowsForUsageSnapshotByTeam.set(teamName, { expiresAtMs: Date.now() + - this.getAgentRuntimeSnapshotCacheTtlMs(teamName, this.getTrackedRunId(teamName)), + (resultRows === null + ? TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_FAILURE_CACHE_TTL_MS + : TeamProvisioningService.RUNTIME_RESOURCE_TELEMETRY_CACHE_TTL_MS), generation: this.getRuntimeSnapshotCacheGeneration(teamName), runId: this.getTrackedRunId(teamName), rows: resultRows, diff --git a/test/main/services/team/TeamProvisioningService.test.ts b/test/main/services/team/TeamProvisioningService.test.ts index 927952a2..610e8d67 100644 --- a/test/main/services/team/TeamProvisioningService.test.ts +++ b/test/main/services/team/TeamProvisioningService.test.ts @@ -657,6 +657,9 @@ type TeamProvisioningServicePrivateHarness = { readProcessUsageStatsByPid: ( pids: readonly number[] ) => Promise>; + readRuntimeProcessRowsForUsageSnapshot: (teamName: string) => Promise; + invalidateRuntimeSnapshotCaches: (teamName: string) => void; + aliveRunByTeam: Map; readRecentBootstrapTranscriptOutcome: ( filePath: string, sinceMs: number | null, @@ -3735,6 +3738,32 @@ describe('TeamProvisioningService', () => { }); }); + it('keeps cached runtime resource process rows across snapshot invalidations', async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date('2026-05-03T12:00:00.000Z')); + const svc = new TeamProvisioningService(); + const harness = privateHarness(svc); + harness.aliveRunByTeam.set('runtime-team', 'run-1'); + vi.mocked(listRuntimeProcessTableForCurrentPlatform).mockResolvedValueOnce([ + { + pid: 111, + ppid: 1, + command: '/usr/bin/node lead.js', + cpuPercent: 3.5, + rssBytes: 123_000_000, + }, + ]); + + const firstRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team'); + harness.invalidateRuntimeSnapshotCaches('runtime-team'); + vi.setSystemTime(new Date('2026-05-03T12:00:05.000Z')); + const secondRows = await harness.readRuntimeProcessRowsForUsageSnapshot('runtime-team'); + + expect(listRuntimeProcessTableForCurrentPlatform).toHaveBeenCalledTimes(1); + expect(secondRows).toEqual(firstRows); + vi.useRealTimers(); + }); + it('skips pidusage by default when process table metrics are missing', async () => { await withRuntimePidusageTelemetryEnv(undefined, async () => { const svc = new TeamProvisioningService();