From 068399543ef4be4e93593a2146cad9126dc7dcd8 Mon Sep 17 00:00:00 2001 From: 777genius Date: Fri, 29 May 2026 00:06:48 +0300 Subject: [PATCH] feat(team): clean stale persisted runtime metadata for dead direct processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Когда снимок liveness возвращает stale_metadata для direct-process teammate с persisted runtimePid, процесс которого реально мёртв, собираем кандидатов на очистку и сбрасываем их runtimePid/bootstrap-поля в config.json. Перед записью выполняется двойная проверка: config.json и таблица процессов читаются дважды, а при отслеживаемом run или активном launch state очистка пропускается. Это убирает мёртвые pid из последующих snapshot'ов и не трогает записи OpenCode, lane-aware записи и записи с runtimeSessionId. Дополнительно добавлены targeted-pid liveness check (через поле targetedProcess во входных данных resolveTeamMemberRuntimeLiveness) и shouldUsePersistedLaunchRuntimePidForMetadata, чтобы не подставлять устаревший persisted pid в metricsPid direct-process участников, у которых запись в config.json не содержит совпадающих direct-process метаданных (в том числе если она уже OpenCode/lane-aware). --- .../services/team/TeamProvisioningService.ts | 785 ++++++++++++++---- .../StaleProcessRuntimeMetadataCleanup.ts | 210 +++++ ...StaleProcessRuntimeMetadataCleanup.test.ts | 341 ++++++++ .../team/TeamProvisioningService.test.ts | 57 +- 4 files changed, 1252 insertions(+), 141 deletions(-) create mode 100644 src/main/services/team/provisioning/StaleProcessRuntimeMetadataCleanup.ts create mode 100644 src/main/services/team/provisioning/__tests__/StaleProcessRuntimeMetadataCleanup.test.ts diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index 526346ba..939de21c 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -184,6 +184,11 @@ import { type TeamRuntimeSettingsJson, } from '../runtime/teamRuntimeSettingsBundle'; +import { openCodeRuntimeApprovalProvider } from './approvals/OpenCodeRuntimeApprovalProvider'; +import { + RuntimeToolApprovalCoordinator, + type RuntimeToolApprovalEntry, +} from 
'./approvals/RuntimeToolApprovalCoordinator'; import { parseBootstrapRuntimeProofDetail, validateBootstrapRuntimeProofEnvelope, @@ -193,6 +198,97 @@ import { buildNativeAppManagedBootstrapSpecsWithDiagnostics, type NativeAppManagedBootstrapSpec, } from './bootstrap/NativeAppManagedBootstrapContextBuilder'; +import { isOpenCodeBridgeNoOutputDiagnostic } from './opencode/bridge/OpenCodeBridgeSupportDiagnostics'; +import { + buildOpenCodePromptDeliveryAttemptId, + createOpenCodePromptDeliveryLedgerStore, + hashOpenCodePromptDeliveryPayload, + isOpenCodePromptDeliveryAttemptDue, + isOpenCodePromptResponseStateResponded, + isOpenCodeResolvedBehaviorChangedReason, + isOpenCodeSessionRefreshResponseState, + isOpenCodeSessionTransportChangedReason, + OPENCODE_PROMPT_DELIVERY_SESSION_REFRESH_MAX_ATTEMPTS, + type OpenCodePromptDeliveryLedgerRecord, + type OpenCodePromptDeliveryLedgerStore, + type OpenCodePromptDeliveryStatus, +} from './opencode/delivery/OpenCodePromptDeliveryLedger'; +import { + decideOpenCodePromptDeliveryRepair, + type OpenCodePromptDeliveryHardFailureKind, +} from './opencode/delivery/OpenCodePromptDeliveryRepairPolicy'; +import { + isOpenCodePromptDeliveryObserveLaterResponseState, + isOpenCodePromptDeliveryRetryableResponseState, + isOpenCodePromptDeliveryRetryAttemptDue, + isOpenCodeVisibleReplyReadCommitAllowed, + isOpenCodeVisibleReplySemanticallySufficient, + OPENCODE_PROMPT_DELIVERY_OBSERVE_DELAY_MS, + OPENCODE_PROMPT_DELIVERY_RETRY_DELAY_MS, + OPENCODE_PROMPT_WATCHDOG_GLOBAL_CONCURRENCY, + OPENCODE_PROMPT_WATCHDOG_PER_TEAM_CONCURRENCY, + type OpenCodeVisibleReplyProof, +} from './opencode/delivery/OpenCodePromptDeliveryWatchdog'; +import { + classifyOpenCodeRuntimeDeliveryReasonCode, + decideOpenCodeRuntimeDeliveryAdvisory, + isDeferredGenericOpenCodeRuntimeDeliveryReason, + isPotentialOpenCodeRuntimeDeliveryError, + type OpenCodeRuntimeDeliveryAdvisoryDecision, + toOpenCodeRuntimeDeliveryUserVisibleImpact, +} from 
'./opencode/delivery/OpenCodeRuntimeDeliveryAdvisoryPolicy'; +import { selectOpenCodeRuntimeDeliveryReason } from './opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics'; +import { + getOpenCodeVisibleReplyInboxCandidates as resolveOpenCodeVisibleReplyInboxCandidates, + isOpenCodeLeadReplyRecipientAlias as isOpenCodeLeadReplyRecipientAliasValue, + isOpenCodeRecoveredVisibleReplyCandidate as isOpenCodeRecoveredVisibleReplyCandidateValue, + isOpenCodeVisibleReplyTimestampEligible as isOpenCodeVisibleReplyTimestampEligibleValue, + normalizeOpenCodeTaskRefsForComparison as normalizeOpenCodeTaskRefsForComparisonValue, + openCodeTaskRefKey as openCodeTaskRefKeyValue, + openCodeTaskRefsIncludeAll as openCodeTaskRefsIncludeAllValue, +} from './opencode/delivery/OpenCodeRuntimeDeliveryProofMatching'; +import { OpenCodeRuntimeDeliveryProofReader } from './opencode/delivery/OpenCodeRuntimeDeliveryProofReader'; +import { inferOpenCodeTaskRefsFromInboxMessage } from './opencode/delivery/OpenCodeRuntimeDeliveryTaskRefInference'; +import { createRuntimeDeliveryJournalStore } from './opencode/delivery/RuntimeDeliveryJournal'; +import { + type RuntimeDeliveryDestinationPort, + RuntimeDeliveryDestinationRegistry, + RuntimeDeliveryReconciler, + RuntimeDeliveryService, +} from './opencode/delivery/RuntimeDeliveryService'; +import { + clearOpenCodeRuntimeLaneStorage, + getOpenCodeLaneScopedRuntimeFilePath, + getOpenCodeRuntimeManifestPath, + getOpenCodeRuntimeRunTombstonesPath, + inspectOpenCodeRuntimeLaneStorage, + migrateLegacyOpenCodeRuntimeState, + OpenCodeRuntimeManifestEvidenceReader, + prepareOpenCodeRuntimeLaneForLaunchGeneration, + readCommittedOpenCodeBootstrapSessionEvidence, + readOpenCodeRuntimeLaneIndex, + recoverStaleOpenCodeRuntimeLaneIndexEntry, + setOpenCodeRuntimeActiveRunManifest, + upsertOpenCodeRuntimeLaneIndexEntry, +} from './opencode/store/OpenCodeRuntimeManifestEvidenceReader'; +import { + createRuntimeRunTombstoneStore, + type RuntimeEvidenceKind, + 
RuntimeStaleEvidenceError, +} from './opencode/store/RuntimeRunTombstoneStore'; +import { + createRuntimeStoreManifestStore, + createRuntimeStoreReceiptStore, + OPENCODE_RUNTIME_STORE_DESCRIPTORS, + RuntimeStoreBatchWriter, +} from './opencode/store/RuntimeStoreManifest'; +import { + clearStaleProcessRuntimeMetadataFromMember, + collectStaleProcessRuntimeMetadataCleanupCandidate, + hasDirectProcessRuntimeMetadataForStaleCleanup, + shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard, + type StaleProcessRuntimeMetadataCleanupCandidate, +} from './provisioning/StaleProcessRuntimeMetadataCleanup'; import { getSystemLocale } from './provisioning/TeamProvisioningAgentLanguage'; import { buildDeterministicCreateBootstrapSpec, @@ -331,111 +427,6 @@ import { getTeamProviderLabel, logRuntimeLaunchSnapshot, } from './provisioning/TeamProvisioningRuntimeDiagnostics'; - -import type { RuntimeTurnSettledProvider } from '@features/member-work-sync/main'; -export type { RuntimeBootstrapMemberMcpLaunchConfig } from './provisioning/TeamProvisioningBootstrapSpec'; -export { buildDirectTmuxRestartEnvAssignments } from './provisioning/TeamProvisioningDirectRestart'; -export { - getMixedLaunchFallbackRecoveryError, - getOpenCodeMixedProviderProvisioningError, -} from './provisioning/TeamProvisioningLaunchCompatibility'; -export { - shouldWarnOnMissingRegisteredMember, - shouldWarnOnUnreadableMemberAuditConfig, -} from './provisioning/TeamProvisioningMemberSpawnStatusPolicy'; -export { - buildAddMemberSpawnMessage, - buildRestartMemberSpawnMessage, -} from './provisioning/TeamProvisioningPromptBuilders'; -import { openCodeRuntimeApprovalProvider } from './approvals/OpenCodeRuntimeApprovalProvider'; -import { - RuntimeToolApprovalCoordinator, - type RuntimeToolApprovalEntry, -} from './approvals/RuntimeToolApprovalCoordinator'; -import { isOpenCodeBridgeNoOutputDiagnostic } from './opencode/bridge/OpenCodeBridgeSupportDiagnostics'; -import { - buildOpenCodePromptDeliveryAttemptId, 
- createOpenCodePromptDeliveryLedgerStore, - hashOpenCodePromptDeliveryPayload, - isOpenCodePromptDeliveryAttemptDue, - isOpenCodePromptResponseStateResponded, - isOpenCodeResolvedBehaviorChangedReason, - isOpenCodeSessionRefreshResponseState, - isOpenCodeSessionTransportChangedReason, - OPENCODE_PROMPT_DELIVERY_SESSION_REFRESH_MAX_ATTEMPTS, - type OpenCodePromptDeliveryLedgerRecord, - type OpenCodePromptDeliveryLedgerStore, - type OpenCodePromptDeliveryStatus, -} from './opencode/delivery/OpenCodePromptDeliveryLedger'; -import { - decideOpenCodePromptDeliveryRepair, - type OpenCodePromptDeliveryHardFailureKind, -} from './opencode/delivery/OpenCodePromptDeliveryRepairPolicy'; -import { - isOpenCodePromptDeliveryObserveLaterResponseState, - isOpenCodePromptDeliveryRetryableResponseState, - isOpenCodePromptDeliveryRetryAttemptDue, - isOpenCodeVisibleReplyReadCommitAllowed, - isOpenCodeVisibleReplySemanticallySufficient, - OPENCODE_PROMPT_DELIVERY_OBSERVE_DELAY_MS, - OPENCODE_PROMPT_DELIVERY_RETRY_DELAY_MS, - OPENCODE_PROMPT_WATCHDOG_GLOBAL_CONCURRENCY, - OPENCODE_PROMPT_WATCHDOG_PER_TEAM_CONCURRENCY, - type OpenCodeVisibleReplyProof, -} from './opencode/delivery/OpenCodePromptDeliveryWatchdog'; -import { - classifyOpenCodeRuntimeDeliveryReasonCode, - decideOpenCodeRuntimeDeliveryAdvisory, - isDeferredGenericOpenCodeRuntimeDeliveryReason, - isPotentialOpenCodeRuntimeDeliveryError, - type OpenCodeRuntimeDeliveryAdvisoryDecision, - toOpenCodeRuntimeDeliveryUserVisibleImpact, -} from './opencode/delivery/OpenCodeRuntimeDeliveryAdvisoryPolicy'; -import { selectOpenCodeRuntimeDeliveryReason } from './opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics'; -import { - getOpenCodeVisibleReplyInboxCandidates as resolveOpenCodeVisibleReplyInboxCandidates, - isOpenCodeLeadReplyRecipientAlias as isOpenCodeLeadReplyRecipientAliasValue, - isOpenCodeRecoveredVisibleReplyCandidate as isOpenCodeRecoveredVisibleReplyCandidateValue, - isOpenCodeVisibleReplyTimestampEligible as 
isOpenCodeVisibleReplyTimestampEligibleValue, - normalizeOpenCodeTaskRefsForComparison as normalizeOpenCodeTaskRefsForComparisonValue, - openCodeTaskRefKey as openCodeTaskRefKeyValue, - openCodeTaskRefsIncludeAll as openCodeTaskRefsIncludeAllValue, -} from './opencode/delivery/OpenCodeRuntimeDeliveryProofMatching'; -import { OpenCodeRuntimeDeliveryProofReader } from './opencode/delivery/OpenCodeRuntimeDeliveryProofReader'; -import { inferOpenCodeTaskRefsFromInboxMessage } from './opencode/delivery/OpenCodeRuntimeDeliveryTaskRefInference'; -import { createRuntimeDeliveryJournalStore } from './opencode/delivery/RuntimeDeliveryJournal'; -import { - type RuntimeDeliveryDestinationPort, - RuntimeDeliveryDestinationRegistry, - RuntimeDeliveryReconciler, - RuntimeDeliveryService, -} from './opencode/delivery/RuntimeDeliveryService'; -import { - clearOpenCodeRuntimeLaneStorage, - getOpenCodeLaneScopedRuntimeFilePath, - getOpenCodeRuntimeManifestPath, - getOpenCodeRuntimeRunTombstonesPath, - inspectOpenCodeRuntimeLaneStorage, - migrateLegacyOpenCodeRuntimeState, - OpenCodeRuntimeManifestEvidenceReader, - prepareOpenCodeRuntimeLaneForLaunchGeneration, - readCommittedOpenCodeBootstrapSessionEvidence, - readOpenCodeRuntimeLaneIndex, - recoverStaleOpenCodeRuntimeLaneIndexEntry, - setOpenCodeRuntimeActiveRunManifest, - upsertOpenCodeRuntimeLaneIndexEntry, -} from './opencode/store/OpenCodeRuntimeManifestEvidenceReader'; -import { - createRuntimeRunTombstoneStore, - type RuntimeEvidenceKind, - RuntimeStaleEvidenceError, -} from './opencode/store/RuntimeRunTombstoneStore'; -import { - createRuntimeStoreManifestStore, - createRuntimeStoreReceiptStore, - OPENCODE_RUNTIME_STORE_DESCRIPTORS, - RuntimeStoreBatchWriter, -} from './opencode/store/RuntimeStoreManifest'; import { OpenCodeTaskLogAttributionStore } from './taskLogs/stream/OpenCodeTaskLogAttributionStore'; import { getCurrentAgentTeamsMcpHttpTransportEvidence } from './AgentTeamsMcpHttpServer'; import { isAgentTeamsToolUse } 
from './agentTeamsToolNames'; @@ -506,7 +497,9 @@ import { commandArgEquals, extractCliArgValues, isStrongRuntimeEvidence, + type ResolvedTeamMemberRuntimeLiveness, resolveTeamMemberRuntimeLiveness, + type ResolveTeamMemberRuntimeLivenessInput, sanitizeProcessCommandForDiagnostics, } from './TeamRuntimeLivenessResolver'; import { TeamSentMessagesStore } from './TeamSentMessagesStore'; @@ -532,6 +525,22 @@ import type { TeamRuntimePrepareResult, TeamRuntimeStopInput, } from './runtime'; +import type { RuntimeTurnSettledProvider } from '@features/member-work-sync/main'; + +export type { RuntimeBootstrapMemberMcpLaunchConfig } from './provisioning/TeamProvisioningBootstrapSpec'; +export { buildDirectTmuxRestartEnvAssignments } from './provisioning/TeamProvisioningDirectRestart'; +export { + getMixedLaunchFallbackRecoveryError, + getOpenCodeMixedProviderProvisioningError, +} from './provisioning/TeamProvisioningLaunchCompatibility'; +export { + shouldWarnOnMissingRegisteredMember, + shouldWarnOnUnreadableMemberAuditConfig, +} from './provisioning/TeamProvisioningMemberSpawnStatusPolicy'; +export { + buildAddMemberSpawnMessage, + buildRestartMemberSpawnMessage, +} from './provisioning/TeamProvisioningPromptBuilders'; type OpenCodeRuntimeMessageAdapter = TeamLaunchRuntimeAdapter & { sendMessageToMember( @@ -3501,6 +3510,8 @@ export class TeamProvisioningService { promise: Promise>; } >(); + private readonly staleProcessRuntimeMetadataCleanupInFlight = new Set(); + private readonly staleProcessRuntimeMetadataCleanupQueueByTeam = new Map>(); private readonly runtimeSnapshotCacheGenerationByTeam = new Map(); private readonly memberSpawnStatusesSnapshotCache = new Map< string, @@ -24912,6 +24923,70 @@ export class TeamProvisioningService { return undefined; } + private hasMatchingConfiguredDirectProcessRuntimeMetadata( + configuredMembers: TeamConfig['members'] | undefined, + memberName: string, + runtimePid: number + ): boolean { + return (configuredMembers ?? 
[]).some((member) => { + const candidateName = typeof member?.name === 'string' ? member.name.trim() : ''; + if (!candidateName || !matchesExactTeamMemberName(candidateName, memberName)) { + return false; + } + const record = member as unknown as Record; + const providerId = + typeof record.providerId === 'string' + ? record.providerId.trim().toLowerCase() + : typeof record.provider === 'string' + ? record.provider.trim().toLowerCase() + : ''; + if (providerId === 'opencode') { + return false; + } + if ( + typeof record.runtimeSessionId === 'string' || + typeof record.laneId === 'string' || + typeof record.laneKind === 'string' || + typeof record.laneOwnerProviderId === 'string' + ) { + return false; + } + return ( + record.runtimePid === runtimePid && + hasDirectProcessRuntimeMetadataForStaleCleanup({ + backendType: record.backendType, + tmuxPaneId: record.tmuxPaneId, + runtimePid, + }) + ); + }); + } + + private shouldUsePersistedLaunchRuntimePidForMetadata(params: { + configuredMembers: TeamConfig['members'] | undefined; + memberName: string; + persistedMember: PersistedTeamLaunchMemberState; + }): boolean { + const runtimePid = normalizeRuntimePositiveInteger(params.persistedMember.runtimePid); + if (!runtimePid) { + return false; + } + if ( + params.persistedMember.providerId === 'opencode' || + params.persistedMember.runtimeSessionId || + params.persistedMember.laneId || + params.persistedMember.laneKind || + params.persistedMember.laneOwnerProviderId + ) { + return true; + } + return this.hasMatchingConfiguredDirectProcessRuntimeMetadata( + params.configuredMembers, + params.memberName, + runtimePid + ); + } + private findMetaMemberModel( metaMembers: Awaited>, memberName: string @@ -25269,6 +25344,8 @@ export class TeamProvisioningService { metaMembers = []; } + const staleProcessRuntimeMetadataCleanupCandidates: StaleProcessRuntimeMetadataCleanupCandidate[] = + []; const persistedRuntimeMembers = this.readPersistedRuntimeMembers(teamName); const 
metadataByMember = new Map(); const upsertMetadata = ( @@ -25454,6 +25531,13 @@ export class TeamProvisioningService { const activeRunProviderId = normalizeOptionalTeamProviderId(activeRunMember?.providerId) ?? inferTeamProviderIdFromModel(activeRunModel ?? evidenceModel); + const persistedLaunchRuntimePid = this.shouldUsePersistedLaunchRuntimePidForMetadata({ + configuredMembers, + memberName, + persistedMember, + }) + ? normalizeRuntimePositiveInteger(persistedMember.runtimePid) + : undefined; const effectiveProviderId = activeRunProviderId ?? persistedMember.providerId; upsertMetadata(memberName, { backendType: @@ -25481,8 +25565,8 @@ export class TeamProvisioningService { ...(typeof currentRuntimeAdapterEvidence?.runtimePid === 'number' && currentRuntimeAdapterEvidence.runtimePid > 0 ? { metricsPid: currentRuntimeAdapterEvidence.runtimePid } - : typeof persistedMember.runtimePid === 'number' && persistedMember.runtimePid > 0 - ? { metricsPid: persistedMember.runtimePid } + : persistedLaunchRuntimePid + ? { metricsPid: persistedLaunchRuntimePid } : {}), ...(currentRuntimeAdapterEvidence?.sessionId ? { runtimeSessionId: currentRuntimeAdapterEvidence.sessionId } @@ -25674,8 +25758,17 @@ export class TeamProvisioningService { }; for (const [memberName, metadata] of metadataByMember.entries()) { - const paneId = metadata.tmuxPaneId?.trim() ?? ''; const launchMember = persistedLaunchSnapshot?.members[memberName]; + const persistedLaunchRuntimePid = launchMember + ? this.shouldUsePersistedLaunchRuntimePidForMetadata({ + configuredMembers, + memberName, + persistedMember: launchMember, + }) + ? normalizeRuntimePositiveInteger(launchMember.runtimePid) + : undefined + : undefined; + const paneId = metadata.tmuxPaneId?.trim() ?? ''; const adapterEvidence = getCurrentRuntimeAdapterEvidence(memberName); const adapterStatus: MemberSpawnStatusEntry | undefined = adapterEvidence ? 
{ @@ -25719,6 +25812,9 @@ export class TeamProvisioningService { const memberProcessTableAvailable = shouldUseWindowsHostRows ? windowsHostProcessTableAvailable || processTableAvailable : processTableAvailable; + const staleCleanupProcessTableAvailable = shouldUseWindowsHostRows + ? windowsHostProcessTableAvailable && processTableAvailable + : processTableAvailable; const trackedStatus = this.findTrackedMemberSpawnStatus(run, memberName); const launchStatus = this.isLaunchMemberStatusRelevantToRuntimeRun(launchMember, activeRuntimeRunId) && @@ -25731,14 +25827,14 @@ export class TeamProvisioningService { : this.shouldPreferCurrentLaunchMemberStatus(trackedStatus, adapterStatus) ? adapterStatus : (trackedStatus ?? adapterStatus ?? launchStatus ?? committedPrimarySessionStatus); - const resolved = resolveTeamMemberRuntimeLiveness({ + const livenessInput: ResolveTeamMemberRuntimeLivenessInput = { teamName, memberName, agentId: metadata.agentId, backendType: metadata.backendType, providerId: metadata.providerId ?? launchMember?.providerId, tmuxPaneId: metadata.tmuxPaneId, - persistedRuntimePid: launchMember?.runtimePid ?? metadata.metricsPid, + persistedRuntimePid: persistedLaunchRuntimePid ?? metadata.metricsPid, persistedRuntimeSessionId: launchMember?.runtimeSessionId ?? metadata.runtimeSessionId, trackedSpawnStatus: status, runtimePid: metadata.metricsPid, @@ -25747,7 +25843,33 @@ export class TeamProvisioningService { processRows: memberProcessRows, processTableAvailable: memberProcessTableAvailable, nowIso: nowIso(), - }); + }; + let resolved = resolveTeamMemberRuntimeLiveness(livenessInput); + const targetedRuntimePid = normalizeRuntimePositiveInteger( + persistedLaunchRuntimePid ?? metadata.metricsPid + ); + if ( + targetedRuntimePid !== undefined && + this.shouldTryTargetedDirectProcessRuntimeLivenessCheck({ + metadata, + providerId: metadata.providerId ?? 
launchMember?.providerId, + resolved, + runtimePid: targetedRuntimePid, + }) + ) { + const targetedCommand = + this.findRuntimeProcessCommandByPid(memberProcessRows, targetedRuntimePid) ?? + this.readProcessCommandByPid(targetedRuntimePid); + if (targetedCommand) { + resolved = resolveTeamMemberRuntimeLiveness({ + ...livenessInput, + targetedProcess: { + pid: targetedRuntimePid, + command: targetedCommand, + }, + }); + } + } const bootstrapTransportDiagnostic = status?.runtimeDiagnostic ?? launchMember?.runtimeDiagnostic; const bootstrapTransportDiagnosticSeverity = @@ -25770,6 +25892,33 @@ export class TeamProvisioningService { const runtimeDiagnosticSeverity = hasProcessBootstrapTransportDiagnostic ? (bootstrapTransportDiagnosticSeverity ?? resolved.runtimeDiagnosticSeverity) : resolved.runtimeDiagnosticSeverity; + const staleProcessRuntimeMetadataCleanupCandidate = + collectStaleProcessRuntimeMetadataCleanupCandidate({ + memberName, + providerId: metadata.providerId ?? launchMember?.providerId, + backendType: metadata.backendType, + agentId: metadata.agentId, + tmuxPaneId: metadata.tmuxPaneId, + runtimePid: resolved.pid ?? metadata.metricsPid ?? persistedLaunchRuntimePid, + runtimeSessionId: + resolved.runtimeSessionId ?? + metadata.runtimeSessionId ?? 
+ launchMember?.runtimeSessionId, + runtimeRunId: launchMember?.runtimeRunId, + laneId: launchMember?.laneId, + laneKind: launchMember?.laneKind, + laneOwnerProviderId: launchMember?.laneOwnerProviderId, + livenessKind: resolved.livenessKind, + runtimeDiagnostic: resolved.runtimeDiagnostic, + processTableAvailable: staleCleanupProcessTableAvailable, + isLead: isLeadMember({ name: memberName }), + isRemoved: this.isMemberRemovedInMeta(metaMembers, memberName), + }); + if (staleProcessRuntimeMetadataCleanupCandidate) { + staleProcessRuntimeMetadataCleanupCandidates.push( + staleProcessRuntimeMetadataCleanupCandidate + ); + } metadataByMember.set(memberName, { ...metadata, alive: resolved.alive, @@ -25794,19 +25943,371 @@ export class TeamProvisioningService { }); } - if ( + const canFinalizeRuntimeMetadataSnapshot = this.getRuntimeSnapshotCacheGeneration(teamName) === generationAtStart && - this.getTrackedRunId(teamName) === runId - ) { + this.getTrackedRunId(teamName) === runId; + if (canFinalizeRuntimeMetadataSnapshot) { this.liveTeamAgentRuntimeMetadataCache.set(teamName, { expiresAtMs: Date.now() + TeamProvisioningService.AGENT_RUNTIME_SNAPSHOT_CACHE_TTL_MS, metadata: this.cloneLiveTeamAgentRuntimeMetadata(metadataByMember), runId, }); + this.scheduleStaleProcessRuntimeMetadataCleanup( + teamName, + staleProcessRuntimeMetadataCleanupCandidates + ); } return metadataByMember; } + private scheduleStaleProcessRuntimeMetadataCleanup( + teamName: string, + candidates: readonly StaleProcessRuntimeMetadataCleanupCandidate[] + ): void { + if (candidates.length === 0) { + return; + } + + const runnableCandidates: StaleProcessRuntimeMetadataCleanupCandidate[] = []; + for (const candidate of candidates) { + const key = `${teamName}\0${candidate.memberName}\0${candidate.runtimePid}`; + if (this.staleProcessRuntimeMetadataCleanupInFlight.has(key)) { + continue; + } + this.staleProcessRuntimeMetadataCleanupInFlight.add(key); + runnableCandidates.push(candidate); + } + + if 
(runnableCandidates.length === 0) { + return; + } + + const previousCleanup = + this.staleProcessRuntimeMetadataCleanupQueueByTeam.get(teamName) ?? Promise.resolve(); + const cleanup = previousCleanup + .catch(() => undefined) + .then(() => this.cleanupStaleProcessRuntimeMetadataCandidates(teamName, runnableCandidates)) + .catch((error) => { + logger.debug( + `[${teamName}] Failed to clean stale process runtime metadata: ${ + error instanceof Error ? error.message : String(error) + }` + ); + }) + .finally(() => { + for (const candidate of runnableCandidates) { + this.staleProcessRuntimeMetadataCleanupInFlight.delete( + `${teamName}\0${candidate.memberName}\0${candidate.runtimePid}` + ); + } + if (this.staleProcessRuntimeMetadataCleanupQueueByTeam.get(teamName) === cleanup) { + this.staleProcessRuntimeMetadataCleanupQueueByTeam.delete(teamName); + } + }); + this.staleProcessRuntimeMetadataCleanupQueueByTeam.set(teamName, cleanup); + } + + private dedupeStaleProcessRuntimeMetadataCleanupCandidates( + candidates: readonly StaleProcessRuntimeMetadataCleanupCandidate[] + ): StaleProcessRuntimeMetadataCleanupCandidate[] { + const byKey = new Map(); + for (const candidate of candidates) { + const key = `${candidate.memberName}\0${candidate.runtimePid}`; + if (!byKey.has(key)) { + byKey.set(key, candidate); + } + } + return [...byKey.values()]; + } + + private logStaleProcessRuntimeMetadataCleanupDebug( + teamName: string, + candidates: readonly StaleProcessRuntimeMetadataCleanupCandidate[] + ): void { + for (const candidate of candidates) { + logger.debug( + `[${teamName}] Cleaned stale process runtime metadata for ${candidate.memberName} pid=${candidate.runtimePid}` + ); + } + } + + private findStaleProcessRuntimeMetadataConfigMemberIndex( + members: readonly unknown[], + candidate: StaleProcessRuntimeMetadataCleanupCandidate + ): number { + return members.findIndex((member) => { + if (!member || typeof member !== 'object' || Array.isArray(member)) { + return false; + } 
+ const memberName = (member as { name?: unknown }).name; + return ( + typeof memberName === 'string' && + matchesExactTeamMemberName(memberName, candidate.memberName) + ); + }); + } + + private applyStaleProcessRuntimeMetadataCleanupCandidates(params: { + teamName: string; + members: readonly unknown[]; + candidates: readonly StaleProcessRuntimeMetadataCleanupCandidate[]; + processRows: readonly RuntimeTelemetryProcessTableRow[]; + }): { + members: unknown[]; + cleanedCandidates: StaleProcessRuntimeMetadataCleanupCandidate[]; + } { + const nextMembers = [...params.members]; + const cleanedCandidates: StaleProcessRuntimeMetadataCleanupCandidate[] = []; + for (const candidate of params.candidates) { + if ( + params.processRows.some((row) => row.pid === candidate.runtimePid) || + this.processRowsContainVerifiedRuntimeProcess( + params.teamName, + candidate, + params.processRows + ) + ) { + continue; + } + + const memberIndex = this.findStaleProcessRuntimeMetadataConfigMemberIndex( + nextMembers, + candidate + ); + if (memberIndex < 0) { + continue; + } + const currentMember = nextMembers[memberIndex]; + if (!currentMember || typeof currentMember !== 'object' || Array.isArray(currentMember)) { + continue; + } + const cleanupResult = clearStaleProcessRuntimeMetadataFromMember( + currentMember as Record, + candidate + ); + if (!cleanupResult.changed) { + continue; + } + nextMembers[memberIndex] = cleanupResult.member; + cleanedCandidates.push(candidate); + } + return { members: nextMembers, cleanedCandidates }; + } + + private async readConfigForStaleProcessRuntimeMetadataCleanup( + configPath: string + ): Promise<(Record & { members: unknown[] }) | null> { + const raw = await tryReadRegularFileUtf8(configPath, { + timeoutMs: TEAM_JSON_READ_TIMEOUT_MS, + maxBytes: TEAM_CONFIG_MAX_BYTES, + }); + if (!raw) { + return null; + } + + const parsed = JSON.parse(raw) as unknown; + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + return null; + } + const 
config = parsed as Record & { members?: unknown }; + return Array.isArray(config.members) + ? ({ ...config, members: config.members } as Record & { + members: unknown[]; + }) + : null; + } + + private async cleanupStaleProcessRuntimeMetadataCandidates( + teamName: string, + candidates: readonly StaleProcessRuntimeMetadataCleanupCandidate[] + ): Promise { + const dedupedCandidates = this.dedupeStaleProcessRuntimeMetadataCleanupCandidates(candidates); + if (dedupedCandidates.length === 0) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + if (await this.hasActiveLaunchStateForStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + + const configPath = this.resolveSafeTeamStoragePath(getTeamsBasePath(), teamName, 'config.json'); + const processRows = await this.readRuntimeProcessRowsForStaleProcessMetadataCleanup(teamName); + if (!processRows) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + const firstPassConfig = await this.readConfigForStaleProcessRuntimeMetadataCleanup(configPath); + if (!firstPassConfig?.members) { + return; + } + const firstPassCleanupResult = this.applyStaleProcessRuntimeMetadataCleanupCandidates({ + teamName, + members: firstPassConfig.members, + candidates: dedupedCandidates, + processRows, + }); + if (firstPassCleanupResult.cleanedCandidates.length === 0) { + return; + } + + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + if (await this.hasActiveLaunchStateForStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + const latestConfig = await this.readConfigForStaleProcessRuntimeMetadataCleanup(configPath); + if (!latestConfig?.members) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; 
+ } + const latestProcessRows = + await this.readRuntimeProcessRowsForStaleProcessMetadataCleanup(teamName); + if (!latestProcessRows) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + const latestCleanupResult = this.applyStaleProcessRuntimeMetadataCleanupCandidates({ + teamName, + members: latestConfig.members, + candidates: firstPassCleanupResult.cleanedCandidates, + processRows: latestProcessRows, + }); + if (latestCleanupResult.cleanedCandidates.length === 0) { + return; + } + if (this.shouldSkipStaleProcessRuntimeMetadataCleanup(teamName)) { + return; + } + await atomicWriteAsync( + configPath, + `${JSON.stringify({ ...latestConfig, members: latestCleanupResult.members }, null, 2)}\n` + ); + TeamConfigReader.invalidateTeam(teamName); + this.invalidateRuntimeSnapshotCaches(teamName); + this.logStaleProcessRuntimeMetadataCleanupDebug( + teamName, + latestCleanupResult.cleanedCandidates + ); + } + + private shouldSkipStaleProcessRuntimeMetadataCleanup(teamName: string): boolean { + return shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard({ + hasTrackedRun: this.getTrackedRunId(teamName) != null, + hasRuntimeAdapterRun: this.runtimeAdapterRunByTeam.has(teamName), + hasSecondaryRuntimeRun: this.secondaryRuntimeRunByTeam.has(teamName), + isStoppingSecondaryRuntimeTeam: this.stoppingSecondaryRuntimeTeams.has(teamName), + hasLaunchStateStoreOperation: this.launchStateStoreQueue.has(teamName), + hasTeamOperationLock: this.teamOpLocks.has(teamName), + }); + } + + private async hasActiveLaunchStateForStaleProcessRuntimeMetadataCleanup( + teamName: string + ): Promise { + try { + const [bootstrapSnapshot, launchSnapshot] = await Promise.all([ + readBootstrapLaunchSnapshot(teamName), + this.launchStateStore.read(teamName), + ]); + const preferredSnapshot = choosePreferredLaunchSnapshot(bootstrapSnapshot, launchSnapshot); + return preferredSnapshot?.launchPhase === 'active'; + } catch (error) { + logger.debug( + 
`[${teamName}] Failed to read launch state for stale runtime cleanup: ${ + error instanceof Error ? error.message : String(error) + }` + ); + return true; + } + } + + private async readRuntimeProcessRowsForStaleProcessMetadataCleanup( + teamName: string + ): Promise { + try { + const processRows = await this.readCurrentRuntimeProcessTableRows( + 'process table stale runtime cleanup' + ); + if (!processRows) { + return null; + } + if (process.platform !== 'win32') { + return processRows; + } + + const windowsHostRows = this.normalizeRuntimeProcessRowsForTelemetry( + await this.withRuntimeTelemetryTimeout( + listWindowsProcessTable(TeamProvisioningService.RUNTIME_WINDOWS_PROCESS_TABLE_TIMEOUT_MS), + TeamProvisioningService.RUNTIME_WINDOWS_PROCESS_TABLE_TIMEOUT_MS, + 'Windows process table stale runtime cleanup' + ), + 'windows-host' + ); + return windowsHostRows ? [...processRows, ...windowsHostRows] : null; + } catch (error) { + logger.debug( + `[${teamName}] Failed to read process table for stale runtime cleanup: ${ + error instanceof Error ? 
error.message : String(error) + }` + ); + return null; + } + } + + private processRowsContainVerifiedRuntimeProcess( + teamName: string, + candidate: StaleProcessRuntimeMetadataCleanupCandidate, + processRows: readonly RuntimeTelemetryProcessTableRow[] + ): boolean { + const agentId = candidate.agentId?.trim(); + if (!agentId) { + return false; + } + return processRows.some( + (row) => + commandArgEquals(row.command, '--team-name', teamName) && + commandArgEquals(row.command, '--agent-id', agentId) + ); + } + + private shouldTryTargetedDirectProcessRuntimeLivenessCheck(input: { + metadata: LiveTeamAgentRuntimeMetadata; + providerId?: TeamProviderId; + resolved: ResolvedTeamMemberRuntimeLiveness; + runtimePid?: number; + }): boolean { + const runtimePid = normalizeRuntimePositiveInteger(input.runtimePid); + if (!runtimePid) { + return false; + } + if (input.resolved.livenessKind === 'runtime_process') { + return false; + } + if (input.resolved.livenessKind === 'permission_blocked') { + return false; + } + if (input.providerId === 'opencode') { + return false; + } + const paneId = input.metadata.tmuxPaneId?.trim() ?? ''; + return input.metadata.backendType === 'process' || paneId === `process:${runtimePid}`; + } + private buildAgentRuntimeResourceHistoryKey(params: { memberName: string; pid?: number; @@ -29332,6 +29833,31 @@ export class TeamProvisioningService { return path.join(getTeamRuntimeEventsDir(teamName), `${filePrefix}.runtime.jsonl`); } + private resolveBootstrapRuntimeEvidenceBoundaryMs( + member: Pick, + runtimeMember: PersistedRuntimeMemberLike | undefined + ): number { + const firstSpawnAcceptedMs = Date.parse(member.firstSpawnAcceptedAt ?? ''); + const bootstrapExpectedAfterMs = Date.parse(runtimeMember?.bootstrapExpectedAfter ?? ''); + if (!Number.isFinite(firstSpawnAcceptedMs)) { + return Number.isFinite(bootstrapExpectedAfterMs) ? 
bootstrapExpectedAfterMs : Number.NaN; + } + if (!Number.isFinite(bootstrapExpectedAfterMs)) { + return firstSpawnAcceptedMs; + } + + const proofToken = runtimeMember?.bootstrapProofToken?.trim() ?? ''; + const memberRunId = typeof member.runtimeRunId === 'string' ? member.runtimeRunId.trim() : ''; + const runtimeRunId = runtimeMember?.bootstrapRunId?.trim() ?? ''; + const runIdsCompatible = + memberRunId.length === 0 || runtimeRunId.length === 0 || memberRunId === runtimeRunId; + if (proofToken.length === 0 || !runIdsCompatible) { + return firstSpawnAcceptedMs; + } + + return Math.min(firstSpawnAcceptedMs, bootstrapExpectedAfterMs); + } + private async readRuntimeBootstrapProofEvents( eventsPath: string ): Promise[]> { @@ -29427,31 +29953,6 @@ export class TeamProvisioningService { ); } - private resolveBootstrapRuntimeEvidenceBoundaryMs( - member: Pick, - runtimeMember: PersistedRuntimeMemberLike | undefined - ): number { - const firstSpawnAcceptedMs = Date.parse(member.firstSpawnAcceptedAt ?? ''); - const bootstrapExpectedAfterMs = Date.parse(runtimeMember?.bootstrapExpectedAfter ?? ''); - if (!Number.isFinite(firstSpawnAcceptedMs)) { - return Number.isFinite(bootstrapExpectedAfterMs) ? bootstrapExpectedAfterMs : Number.NaN; - } - if (!Number.isFinite(bootstrapExpectedAfterMs)) { - return firstSpawnAcceptedMs; - } - - const proofToken = runtimeMember?.bootstrapProofToken?.trim() ?? ''; - const memberRunId = typeof member.runtimeRunId === 'string' ? member.runtimeRunId.trim() : ''; - const runtimeRunId = runtimeMember?.bootstrapRunId?.trim() ?? 
''; - const runIdsCompatible = - memberRunId.length === 0 || runtimeRunId.length === 0 || memberRunId === runtimeRunId; - if (proofToken.length === 0 || !runIdsCompatible) { - return firstSpawnAcceptedMs; - } - - return Math.min(firstSpawnAcceptedMs, bootstrapExpectedAfterMs); - } - private async findBootstrapRuntimeProofObservedAt( teamName: string, memberName: string, @@ -31788,7 +32289,10 @@ export class TeamProvisioningService { } private async waitForInFlightTeamOperationsForShutdown(timeoutMs = 2_000): Promise { - const locks = Array.from(this.teamOpLocks.values()); + const locks = [ + ...this.teamOpLocks.values(), + ...this.staleProcessRuntimeMetadataCleanupQueueByTeam.values(), + ]; if (locks.length === 0) { return; } @@ -34350,6 +34854,7 @@ export class TeamProvisioningService { const suffix = fileName.slice(prefix.length, -'.json'.length); return /^\d+$/.test(suffix); } + /** * Safely add tool names to the permissions.allow (or deny) array in a Claude settings file. * Creates the file and parent directories if they don't exist. 
diff --git a/src/main/services/team/provisioning/StaleProcessRuntimeMetadataCleanup.ts b/src/main/services/team/provisioning/StaleProcessRuntimeMetadataCleanup.ts new file mode 100644 index 00000000..025ce326 --- /dev/null +++ b/src/main/services/team/provisioning/StaleProcessRuntimeMetadataCleanup.ts @@ -0,0 +1,210 @@ +import type { + TeamAgentRuntimeBackendType, + TeamAgentRuntimeLivenessKind, + TeamProviderId, +} from '@shared/types'; + +export const STALE_PROCESS_RUNTIME_METADATA_DIAGNOSTIC = 'persisted runtime pid is not alive'; + +const STALE_PROCESS_RUNTIME_METADATA_FIELDS = [ + 'runtimePid', + 'bootstrapExpectedAfter', + 'bootstrapProofToken', + 'bootstrapRunId', + 'bootstrapProofMode', + 'bootstrapContextHash', + 'bootstrapBriefingHash', + 'bootstrapRuntimeEventsPath', +] as const; + +export interface StaleProcessRuntimeMetadataCleanupCandidate { + memberName: string; + runtimePid: number; + processPaneId: string; + agentId?: string; +} + +export interface StaleProcessRuntimeMetadataCleanupInput { + memberName: string; + providerId?: TeamProviderId | string; + backendType?: TeamAgentRuntimeBackendType | string; + agentId?: string; + tmuxPaneId?: string; + runtimePid?: number; + runtimeSessionId?: string; + runtimeRunId?: string; + laneId?: string; + laneKind?: string; + laneOwnerProviderId?: string; + livenessKind?: TeamAgentRuntimeLivenessKind | string; + runtimeDiagnostic?: string; + processTableAvailable: boolean; + isLead: boolean; + isRemoved: boolean; +} + +export interface StaleProcessRuntimeMetadataCleanupResult { + member: Record; + changed: boolean; +} + +export interface StaleProcessRuntimeMetadataRuntimeGuard { + hasTrackedRun?: boolean; + hasRuntimeAdapterRun?: boolean; + hasSecondaryRuntimeRun?: boolean; + isStoppingSecondaryRuntimeTeam?: boolean; + hasLaunchStateStoreOperation?: boolean; + hasTeamOperationLock?: boolean; + hasActiveLaunchState?: boolean; +} + +function normalizePositiveInteger(value: unknown): number | undefined { + return 
typeof value === 'number' && Number.isInteger(value) && value > 0 ? value : undefined; +} + +function normalizeString(value: unknown): string { + return typeof value === 'string' ? value.trim() : ''; +} + +function isOpenCodeProvider(providerId: unknown): boolean { + return normalizeString(providerId).toLowerCase() === 'opencode'; +} + +function hasRuntimeSessionId(value: unknown): boolean { + return normalizeString(value).length > 0; +} + +function hasLaneRuntimeMetadata(value: { + laneId?: unknown; + laneKind?: unknown; + laneOwnerProviderId?: unknown; +}): boolean { + return ( + normalizeString(value.laneId).length > 0 || + normalizeString(value.laneKind).length > 0 || + normalizeString(value.laneOwnerProviderId).length > 0 + ); +} + +function isDirectProcessRuntimeMetadata(params: { + backendType?: unknown; + tmuxPaneId?: unknown; + runtimePid: number; +}): boolean { + const backendType = normalizeString(params.backendType).toLowerCase(); + const tmuxPaneId = normalizeString(params.tmuxPaneId); + const processPaneId = `process:${params.runtimePid}`; + if (tmuxPaneId && tmuxPaneId !== processPaneId) { + return false; + } + return backendType === 'process' || tmuxPaneId === processPaneId; +} + +export function hasDirectProcessRuntimeMetadataForStaleCleanup(params: { + backendType?: unknown; + tmuxPaneId?: unknown; + runtimePid?: unknown; +}): boolean { + const runtimePid = normalizePositiveInteger(params.runtimePid); + return runtimePid != null + ? 
isDirectProcessRuntimeMetadata({ + backendType: params.backendType, + tmuxPaneId: params.tmuxPaneId, + runtimePid, + }) + : false; +} + +export function shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard( + input: StaleProcessRuntimeMetadataRuntimeGuard +): boolean { + return Boolean( + input.hasTrackedRun || + input.hasRuntimeAdapterRun || + input.hasSecondaryRuntimeRun || + input.isStoppingSecondaryRuntimeTeam || + input.hasLaunchStateStoreOperation || + input.hasTeamOperationLock || + input.hasActiveLaunchState + ); +} + +export function collectStaleProcessRuntimeMetadataCleanupCandidate( + input: StaleProcessRuntimeMetadataCleanupInput +): StaleProcessRuntimeMetadataCleanupCandidate | null { + const memberName = input.memberName.trim(); + const runtimePid = normalizePositiveInteger(input.runtimePid); + if (!memberName || runtimePid == null) { + return null; + } + if (input.isLead || input.isRemoved) { + return null; + } + if (input.livenessKind !== 'stale_metadata') { + return null; + } + if (input.runtimeDiagnostic !== STALE_PROCESS_RUNTIME_METADATA_DIAGNOSTIC) { + return null; + } + if (!input.processTableAvailable) { + return null; + } + if ( + isOpenCodeProvider(input.providerId) || + hasRuntimeSessionId(input.runtimeSessionId) || + hasLaneRuntimeMetadata(input) + ) { + return null; + } + if ( + !isDirectProcessRuntimeMetadata({ + backendType: input.backendType, + tmuxPaneId: input.tmuxPaneId, + runtimePid, + }) + ) { + return null; + } + + return { + memberName, + runtimePid, + processPaneId: `process:${runtimePid}`, + ...(input.agentId?.trim() ? 
{ agentId: input.agentId.trim() } : {}), + }; +} + +export function clearStaleProcessRuntimeMetadataFromMember( + member: Record, + candidate: StaleProcessRuntimeMetadataCleanupCandidate +): StaleProcessRuntimeMetadataCleanupResult { + const runtimePid = normalizePositiveInteger(member.runtimePid); + if (runtimePid == null || runtimePid !== candidate.runtimePid) { + return { member: { ...member }, changed: false }; + } + if (isOpenCodeProvider(member.providerId ?? member.provider)) { + return { member: { ...member }, changed: false }; + } + if (hasRuntimeSessionId(member.runtimeSessionId) || hasLaneRuntimeMetadata(member)) { + return { member: { ...member }, changed: false }; + } + if ( + !isDirectProcessRuntimeMetadata({ + backendType: member.backendType, + tmuxPaneId: member.tmuxPaneId, + runtimePid, + }) + ) { + return { member: { ...member }, changed: false }; + } + + const next = { ...member }; + for (const field of STALE_PROCESS_RUNTIME_METADATA_FIELDS) { + delete next[field]; + } + if (normalizeString(member.tmuxPaneId) === candidate.processPaneId) { + delete next.tmuxPaneId; + } + + return { member: next, changed: true }; +} diff --git a/src/main/services/team/provisioning/__tests__/StaleProcessRuntimeMetadataCleanup.test.ts b/src/main/services/team/provisioning/__tests__/StaleProcessRuntimeMetadataCleanup.test.ts new file mode 100644 index 00000000..19eb8b44 --- /dev/null +++ b/src/main/services/team/provisioning/__tests__/StaleProcessRuntimeMetadataCleanup.test.ts @@ -0,0 +1,341 @@ +import { describe, expect, it } from 'vitest'; + +import { resolveTeamMemberRuntimeLiveness } from '../../TeamRuntimeLivenessResolver'; +import { + clearStaleProcessRuntimeMetadataFromMember, + collectStaleProcessRuntimeMetadataCleanupCandidate, + hasDirectProcessRuntimeMetadataForStaleCleanup, + shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard, + STALE_PROCESS_RUNTIME_METADATA_DIAGNOSTIC, +} from '../StaleProcessRuntimeMetadataCleanup'; + +import type { 
RuntimeProcessTableRow } from '@features/tmux-installer/main'; + +const baseCandidateInput = { + memberName: 'tom', + providerId: 'codex', + backendType: 'process', + agentId: 'tom@signal-ops-2', + tmuxPaneId: 'process:37749', + runtimePid: 37749, + runtimeSessionId: undefined, + livenessKind: 'stale_metadata', + runtimeDiagnostic: STALE_PROCESS_RUNTIME_METADATA_DIAGNOSTIC, + processTableAvailable: true, + isLead: false, + isRemoved: false, +} as const; + +function createRuntimeMember(overrides: Record = {}): Record { + return { + name: 'tom', + agentId: 'tom@signal-ops-2', + provider: 'codex', + providerId: 'codex', + model: 'gpt-5.5', + role: 'developer', + prompt: 'Build things', + color: 'yellow', + cwd: '/repo', + subscriptions: ['team-lead'], + backendType: 'process', + tmuxPaneId: 'process:37749', + runtimePid: 37749, + bootstrapExpectedAfter: '2026-05-16T18:35:52.562Z', + bootstrapProofToken: 'token', + bootstrapRunId: 'run-1', + bootstrapProofMode: 'native_app_managed_context', + bootstrapContextHash: 'context-hash', + bootstrapBriefingHash: 'briefing-hash', + bootstrapRuntimeEventsPath: '/tmp/tom.runtime.jsonl', + ...overrides, + }; +} + +describe('stale process runtime metadata cleanup planner', () => { + it('clears only stale direct-process runtime fields and preserves member identity', () => { + const candidate = collectStaleProcessRuntimeMetadataCleanupCandidate(baseCandidateInput); + + expect(candidate).toEqual({ + memberName: 'tom', + runtimePid: 37749, + processPaneId: 'process:37749', + agentId: 'tom@signal-ops-2', + }); + + const result = clearStaleProcessRuntimeMetadataFromMember(createRuntimeMember(), candidate!); + + expect(result.changed).toBe(true); + expect(result.member).toMatchObject({ + name: 'tom', + agentId: 'tom@signal-ops-2', + provider: 'codex', + providerId: 'codex', + model: 'gpt-5.5', + role: 'developer', + prompt: 'Build things', + color: 'yellow', + cwd: '/repo', + subscriptions: ['team-lead'], + backendType: 'process', + }); + 
expect(result.member.runtimePid).toBeUndefined(); + expect(result.member.tmuxPaneId).toBeUndefined(); + expect(result.member.bootstrapRunId).toBeUndefined(); + expect(result.member.bootstrapRuntimeEventsPath).toBeUndefined(); + }); + + it('skips OpenCode members', () => { + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + providerId: 'opencode', + }) + ).toBeNull(); + + const candidate = collectStaleProcessRuntimeMetadataCleanupCandidate(baseCandidateInput)!; + const result = clearStaleProcessRuntimeMetadataFromMember( + createRuntimeMember({ providerId: 'opencode', provider: 'opencode' }), + candidate + ); + + expect(result.changed).toBe(false); + expect(result.member.runtimePid).toBe(37749); + }); + + it('skips normal tmux pane metadata', () => { + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + backendType: 'tmux', + tmuxPaneId: '%12', + }) + ).toBeNull(); + }); + + it('skips mismatched process pane metadata', () => { + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + tmuxPaneId: 'process:99999', + }) + ).toBeNull(); + }); + + it('matches only direct-process runtime metadata shapes', () => { + expect( + hasDirectProcessRuntimeMetadataForStaleCleanup({ + backendType: 'process', + tmuxPaneId: 'process:37749', + runtimePid: 37749, + }) + ).toBe(true); + expect( + hasDirectProcessRuntimeMetadataForStaleCleanup({ + backendType: 'process', + tmuxPaneId: 'process:99999', + runtimePid: 37749, + }) + ).toBe(false); + expect( + hasDirectProcessRuntimeMetadataForStaleCleanup({ + backendType: 'tmux', + tmuxPaneId: '%12', + runtimePid: 37749, + }) + ).toBe(false); + }); + + it('skips active or uncertain cleanup guards', () => { + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + processTableAvailable: false, + }) + ).toBeNull(); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + 
...baseCandidateInput, + isLead: true, + }) + ).toBeNull(); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + isRemoved: true, + }) + ).toBeNull(); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + runtimeSessionId: 'session-1', + }) + ).toBeNull(); + + expect( + shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard({ hasTrackedRun: true }) + ).toBe(true); + expect( + shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard({ + hasRuntimeAdapterRun: true, + }) + ).toBe(true); + expect( + shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard({ + hasActiveLaunchState: true, + }) + ).toBe(true); + expect(shouldSkipStaleProcessRuntimeMetadataCleanupForRuntimeGuard({})).toBe(false); + }); + + it('skips lane metadata but allows direct bootstrap run ids', () => { + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + laneId: 'secondary:bob', + }) + ).toBeNull(); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + runtimeRunId: 'bootstrap-run-1', + })?.runtimePid + ).toBe(37749); + + const candidate = collectStaleProcessRuntimeMetadataCleanupCandidate(baseCandidateInput)!; + const result = clearStaleProcessRuntimeMetadataFromMember( + createRuntimeMember({ laneId: 'secondary:bob' }), + candidate + ); + + expect(result.changed).toBe(false); + expect(result.member.runtimePid).toBe(37749); + }); + + it('does not clear if the process table shows the same pid is alive', () => { + const candidate = collectStaleProcessRuntimeMetadataCleanupCandidate(baseCandidateInput)!; + const processRows: RuntimeProcessTableRow[] = [ + { pid: 37749, ppid: 1, command: 'node some-other-process.js' }, + ]; + + const processStillExists = processRows.some((row) => row.pid === candidate.runtimePid); + + expect(processStillExists).toBe(true); + }); +}); + +describe('stale process runtime metadata cleanup runtime flow', () => { 
+ it('plans cleanup for stale metadata when process table is available and no pid exists', () => { + const resolved = resolveTeamMemberRuntimeLiveness({ + teamName: 'signal-ops-2', + memberName: 'tom', + agentId: 'tom@signal-ops-2', + providerId: 'codex', + backendType: 'process', + tmuxPaneId: 'process:37749', + persistedRuntimePid: 37749, + processRows: [], + processTableAvailable: true, + nowIso: '2026-05-28T00:00:00.000Z', + }); + + const candidate = collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + livenessKind: resolved.livenessKind, + runtimeDiagnostic: resolved.runtimeDiagnostic, + runtimePid: resolved.pid, + }); + + expect(resolved.livenessKind).toBe('stale_metadata'); + expect(candidate?.runtimePid).toBe(37749); + }); + + it('does not plan cleanup for registered-only metadata', () => { + const resolved = resolveTeamMemberRuntimeLiveness({ + teamName: 'signal-ops-2', + memberName: 'bob', + agentId: 'bob@signal-ops-2', + providerId: 'opencode', + backendType: 'process', + processRows: [], + processTableAvailable: true, + nowIso: '2026-05-28T00:00:00.000Z', + }); + + expect(resolved.livenessKind).toBe('registered_only'); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + memberName: 'bob', + providerId: 'opencode', + runtimePid: undefined, + livenessKind: resolved.livenessKind, + runtimeDiagnostic: resolved.runtimeDiagnostic, + }) + ).toBeNull(); + }); + + it('does not plan cleanup for verified runtime process evidence', () => { + const resolved = resolveTeamMemberRuntimeLiveness({ + teamName: 'signal-ops-2', + memberName: 'tom', + agentId: 'tom@signal-ops-2', + providerId: 'codex', + backendType: 'process', + persistedRuntimePid: 37749, + processRows: [ + { + pid: 55555, + ppid: 1, + command: 'node runtime.js --team-name signal-ops-2 --agent-id tom@signal-ops-2', + }, + ], + processTableAvailable: true, + nowIso: '2026-05-28T00:00:00.000Z', + }); + + 
expect(resolved.livenessKind).toBe('runtime_process'); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + runtimePid: resolved.pid, + livenessKind: resolved.livenessKind, + runtimeDiagnostic: resolved.runtimeDiagnostic, + }) + ).toBeNull(); + }); + + it('does not plan cleanup for confirmed bootstrap evidence', () => { + const resolved = resolveTeamMemberRuntimeLiveness({ + teamName: 'signal-ops-2', + memberName: 'alice', + agentId: 'alice@signal-ops-2', + providerId: 'anthropic', + backendType: 'process', + trackedSpawnStatus: { + status: 'online', + launchState: 'confirmed_alive', + agentToolAccepted: true, + runtimeAlive: true, + bootstrapConfirmed: true, + hardFailure: false, + updatedAt: '2026-05-28T00:00:00.000Z', + }, + processRows: [], + processTableAvailable: true, + nowIso: '2026-05-28T00:00:00.000Z', + }); + + expect(resolved.livenessKind).toBe('confirmed_bootstrap'); + expect( + collectStaleProcessRuntimeMetadataCleanupCandidate({ + ...baseCandidateInput, + memberName: 'alice', + providerId: 'anthropic', + runtimePid: undefined, + livenessKind: resolved.livenessKind, + runtimeDiagnostic: resolved.runtimeDiagnostic, + }) + ).toBeNull(); + }); +}); diff --git a/test/main/services/team/TeamProvisioningService.test.ts b/test/main/services/team/TeamProvisioningService.test.ts index 7cbea2ac..54baf1b6 100644 --- a/test/main/services/team/TeamProvisioningService.test.ts +++ b/test/main/services/team/TeamProvisioningService.test.ts @@ -5196,7 +5196,7 @@ describe('TeamProvisioningService', () => { launchState: 'failed_to_start', runtimeAlive: false, livenessSource: undefined, - livenessKind: 'stale_metadata', + livenessKind: 'not_found', hardFailure: true, hardFailureReason: 'Teammate did not join within the launch grace window.', }); @@ -5318,6 +5318,61 @@ describe('TeamProvisioningService', () => { }); }); + it('uses targeted pid verification when the full snapshot misses a live direct process teammate', async () => { + 
const svc = new TeamProvisioningService(); + (svc as any).configReader = { + getConfig: vi.fn(async () => ({ + members: [ + { name: 'team-lead', agentType: 'team-lead' }, + { + name: 'alice', + providerId: 'codex', + model: 'gpt-5.4-mini', + agentId: 'alice@vector-room-13', + backendType: 'process', + runtimePid: 74735, + tmuxPaneId: 'process:74735', + }, + ], + })), + }; + (svc as any).membersMetaStore = { + getMembers: vi.fn(async () => [ + { + name: 'alice', + providerId: 'codex', + model: 'gpt-5.4-mini', + }, + ]), + }; + (svc as any).readPersistedRuntimeMembers = vi.fn(() => [ + { + name: 'alice', + providerId: 'codex', + model: 'gpt-5.4-mini', + agentId: 'alice@vector-room-13', + backendType: 'process', + runtimePid: 74735, + tmuxPaneId: 'process:74735', + }, + ]); + vi.mocked(listRuntimeProcessTableForCurrentPlatform).mockResolvedValueOnce([]); + const targetedRead = vi.spyOn(svc as any, 'readProcessCommandByPid').mockReturnValue( + '/Users/belief/.bun/bin/bun cli.js --agent-id alice@vector-room-13 --agent-name alice --team-name vector-room-13 --model gpt-5.4-mini' + ); + + const metadata = await (svc as any).getLiveTeamAgentRuntimeMetadata('vector-room-13'); + + expect(targetedRead).toHaveBeenCalledWith(74735); + expect(metadata.get('alice')).toMatchObject({ + alive: true, + livenessKind: 'runtime_process', + pidSource: 'agent_process_table', + pid: 74735, + runtimeDiagnostic: 'verified runtime process detected by targeted pid check', + }); + }); + it('does not let removed base member metadata hide an active suffixed member', async () => { const svc = new TeamProvisioningService(); (svc as any).configReader = {