fix(team): heal stale confirmed bootstrap diagnostics
- Carry bootstrap run ids from bootstrap-state into member evidence and compare them with the current run identity.
- Allow a small confirmation clock skew for delayed Anthropic app acceptance without accepting stale rapid-relaunch evidence.
- Clean confirmed bootstrap members that only have stale persisted runtime pid diagnostics.
- Cover process-table-unavailable, post-stop stale pid, and mixed launch reconcile cases.
This commit is contained in:
parent
9d34a534a2
commit
7aa87f2278
7 changed files with 977 additions and 45 deletions
|
|
@ -332,7 +332,8 @@ function toIso(value: unknown, fallback: string): string {
|
|||
function normalizeBootstrapMemberState(
|
||||
memberName: string,
|
||||
raw: RawBootstrapMemberState,
|
||||
updatedAt: string
|
||||
updatedAt: string,
|
||||
runtimeRunId?: string
|
||||
): PersistedTeamLaunchMemberState {
|
||||
const status = typeof raw.status === 'string' ? raw.status : 'pending';
|
||||
const hardFailure = status === 'failed';
|
||||
|
|
@ -363,6 +364,7 @@ function normalizeBootstrapMemberState(
|
|||
typeof raw.failureReason === 'string' && raw.failureReason.trim().length > 0
|
||||
? raw.failureReason.trim()
|
||||
: undefined,
|
||||
...(runtimeRunId ? { runtimeRunId } : {}),
|
||||
firstSpawnAcceptedAt: agentToolAccepted ? toIso(raw.lastAttemptAt, updatedAt) : undefined,
|
||||
lastHeartbeatAt: bootstrapConfirmed ? toIso(raw.lastObservedAt, updatedAt) : undefined,
|
||||
lastRuntimeAliveAt: runtimeAlive ? toIso(raw.lastObservedAt, updatedAt) : undefined,
|
||||
|
|
@ -622,6 +624,7 @@ export async function readBootstrapLaunchSnapshot(
|
|||
}
|
||||
try {
|
||||
const updatedAt = toIso(raw.updatedAt, new Date().toISOString());
|
||||
const runtimeRunId = typeof raw.runId === 'string' ? raw.runId.trim() : '';
|
||||
const rawMembers = Array.isArray(raw.members) ? raw.members : [];
|
||||
const members: Record<string, PersistedTeamLaunchMemberState> = {};
|
||||
const expectedMembers: string[] = [];
|
||||
|
|
@ -632,7 +635,12 @@ export async function readBootstrapLaunchSnapshot(
|
|||
const memberName = typeof rawMember.name === 'string' ? rawMember.name.trim() : '';
|
||||
if (!memberName || memberName === 'team-lead' || memberName === 'user') continue;
|
||||
expectedMembers.push(memberName);
|
||||
members[memberName] = normalizeBootstrapMemberState(memberName, rawMember, updatedAt);
|
||||
members[memberName] = normalizeBootstrapMemberState(
|
||||
memberName,
|
||||
rawMember,
|
||||
updatedAt,
|
||||
runtimeRunId || undefined
|
||||
);
|
||||
}
|
||||
|
||||
const terminal =
|
||||
|
|
|
|||
|
|
@ -939,6 +939,17 @@ function buildRuntimeDiagnosticForSpawn(
|
|||
: 'process table unavailable';
|
||||
}
|
||||
|
||||
/**
 * Tells whether a runtime diagnostic is the stale-persisted-pid message.
 *
 * @param reason - Diagnostic text to inspect; may be undefined.
 * @returns true only when `reason`, with surrounding whitespace trimmed, is
 *   exactly 'persisted runtime pid is not alive'. An undefined reason yields false.
 */
function isConfirmedBootstrapStaleRuntimeDiagnostic(reason?: string): boolean {
|
||||
const text = reason?.trim();
|
||||
return text === 'persisted runtime pid is not alive';
|
||||
}
|
||||
|
||||
/**
 * Decides whether a runtime diagnostic may be dropped for a member whose
 * bootstrap has been confirmed.
 *
 * @param reason - Diagnostic text to inspect; may be undefined.
 * @returns true when `isAutoClearableLaunchFailureReason` (defined elsewhere in
 *   this file) accepts the reason, or when the reason is the stale
 *   'persisted runtime pid is not alive' diagnostic.
 */
function shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(reason?: string): boolean {
|
||||
return (
|
||||
isAutoClearableLaunchFailureReason(reason) || isConfirmedBootstrapStaleRuntimeDiagnostic(reason)
|
||||
);
|
||||
}
|
||||
|
||||
function runtimeTaskRefs(teamName: string, value: unknown): InboxMessage['taskRefs'] | undefined {
|
||||
const refs = normalizeRuntimeStringArray(value);
|
||||
return refs.length > 0
|
||||
|
|
@ -9676,9 +9687,8 @@ export class TeamProvisioningService {
|
|||
}
|
||||
|
||||
private getRunLeadName(run: ProvisioningRun): string {
|
||||
return (
|
||||
run.request.members.find((m) => m.role?.toLowerCase().includes('lead'))?.name || 'team-lead'
|
||||
);
|
||||
const members = Array.isArray(run.request?.members) ? run.request.members : [];
|
||||
return members.find((m) => m.role?.toLowerCase().includes('lead'))?.name || 'team-lead';
|
||||
}
|
||||
|
||||
private rememberRecentCrossTeamLeadDeliveryMessageIds(
|
||||
|
|
@ -23396,34 +23406,62 @@ export class TeamProvisioningService {
|
|||
(current.launchState === 'runtime_pending_bootstrap' ||
|
||||
current.launchState === 'failed_to_start') &&
|
||||
isProcessBootstrapTransportDiagnostic(current.runtimeDiagnostic);
|
||||
const runtimeDiagnostic = shouldPreserveProcessBootstrapTransportDiagnostic
|
||||
? current.runtimeDiagnostic
|
||||
: buildRuntimeDiagnosticForSpawn(metadata);
|
||||
const metadataLivenessKind =
|
||||
current.bootstrapConfirmed === true || current.launchState === 'confirmed_alive'
|
||||
? metadata.livenessKind === 'runtime_process' ||
|
||||
metadata.livenessKind === 'confirmed_bootstrap'
|
||||
? metadata.livenessKind
|
||||
: current.livenessKind
|
||||
: metadata.livenessKind;
|
||||
const hasStrongEvidence = isStrongRuntimeEvidence(metadata);
|
||||
const hasConfirmedBootstrap =
|
||||
current.bootstrapConfirmed === true || current.launchState === 'confirmed_alive';
|
||||
const shouldSuppressWeakRuntimeMetadataForConfirmedBootstrap =
|
||||
hasConfirmedBootstrap && !hasStrongEvidence;
|
||||
let runtimeDiagnostic: string | undefined;
|
||||
let runtimeDiagnosticSeverity: TeamAgentRuntimeDiagnosticSeverity | undefined;
|
||||
if (shouldPreserveProcessBootstrapTransportDiagnostic) {
|
||||
runtimeDiagnostic = current.runtimeDiagnostic;
|
||||
runtimeDiagnosticSeverity = current.runtimeDiagnosticSeverity;
|
||||
} else if (shouldSuppressWeakRuntimeMetadataForConfirmedBootstrap) {
|
||||
if (
|
||||
current.runtimeDiagnostic &&
|
||||
!shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(current.runtimeDiagnostic)
|
||||
) {
|
||||
runtimeDiagnostic = current.runtimeDiagnostic;
|
||||
runtimeDiagnosticSeverity = current.runtimeDiagnosticSeverity;
|
||||
} else {
|
||||
const metadataRuntimeDiagnostic = buildRuntimeDiagnosticForSpawn(metadata);
|
||||
if (
|
||||
metadataRuntimeDiagnostic &&
|
||||
!shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(metadataRuntimeDiagnostic)
|
||||
) {
|
||||
runtimeDiagnostic = metadataRuntimeDiagnostic;
|
||||
runtimeDiagnosticSeverity = metadata.runtimeDiagnosticSeverity;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
runtimeDiagnostic = buildRuntimeDiagnosticForSpawn(metadata);
|
||||
runtimeDiagnosticSeverity = metadata.runtimeDiagnosticSeverity;
|
||||
}
|
||||
const metadataLivenessKind = hasConfirmedBootstrap
|
||||
? metadata.livenessKind === 'runtime_process' ||
|
||||
metadata.livenessKind === 'confirmed_bootstrap'
|
||||
? metadata.livenessKind
|
||||
: current.livenessKind === 'stale_metadata' || current.livenessKind === 'registered_only'
|
||||
? 'confirmed_bootstrap'
|
||||
: (current.livenessKind ?? 'confirmed_bootstrap')
|
||||
: metadata.livenessKind;
|
||||
const nextEntry: MemberSpawnStatusEntry = {
|
||||
...current,
|
||||
...(metadata.model ? { runtimeModel: metadata.model } : {}),
|
||||
...(metadataLivenessKind ? { livenessKind: metadataLivenessKind } : {}),
|
||||
...(runtimeDiagnostic ? { runtimeDiagnostic } : {}),
|
||||
...(runtimeDiagnostic || shouldSuppressWeakRuntimeMetadataForConfirmedBootstrap
|
||||
? { runtimeDiagnostic }
|
||||
: {}),
|
||||
...(shouldPreserveProcessBootstrapTransportDiagnostic
|
||||
? { runtimeDiagnosticSeverity: current.runtimeDiagnosticSeverity }
|
||||
: metadata.runtimeDiagnosticSeverity
|
||||
? { runtimeDiagnosticSeverity: metadata.runtimeDiagnosticSeverity }
|
||||
? { runtimeDiagnosticSeverity }
|
||||
: runtimeDiagnosticSeverity || shouldSuppressWeakRuntimeMetadataForConfirmedBootstrap
|
||||
? { runtimeDiagnosticSeverity }
|
||||
: {}),
|
||||
livenessLastCheckedAt: nowIso(),
|
||||
};
|
||||
const failureReason = current.hardFailureReason ?? current.error;
|
||||
const hasStrongEvidence = isStrongRuntimeEvidence(metadata);
|
||||
const hasWeakEvidence =
|
||||
metadata.livenessKind != null &&
|
||||
!isStrongRuntimeEvidence(metadata) &&
|
||||
current.bootstrapConfirmed !== true;
|
||||
metadata.livenessKind != null && !hasStrongEvidence && current.bootstrapConfirmed !== true;
|
||||
if (
|
||||
hasStrongEvidence &&
|
||||
!openCodeSecondaryBootstrapPending &&
|
||||
|
|
@ -25827,7 +25865,13 @@ export class TeamProvisioningService {
|
|||
}
|
||||
const current =
|
||||
run.memberSpawnStatuses.get(memberName) ?? createInitialMemberSpawnStatusEntry();
|
||||
if (!isBootstrapMemberEvidenceCurrentForMember(current, bootstrapMember, 'confirmation')) {
|
||||
if (
|
||||
!isBootstrapMemberEvidenceCurrentForMember(
|
||||
{ ...current, runtimeRunId: run.runId },
|
||||
bootstrapMember,
|
||||
'confirmation'
|
||||
)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
if (current.launchState === 'skipped_for_launch' || current.skippedForLaunch === true) {
|
||||
|
|
@ -25920,7 +25964,13 @@ export class TeamProvisioningService {
|
|||
if (!current || bootstrapMember?.bootstrapConfirmed !== true) {
|
||||
continue;
|
||||
}
|
||||
if (!isBootstrapMemberEvidenceCurrentForMember(current, bootstrapMember, 'confirmation')) {
|
||||
if (
|
||||
!isBootstrapMemberEvidenceCurrentForMember(
|
||||
{ ...current, runtimeRunId: run.runId },
|
||||
bootstrapMember,
|
||||
'confirmation'
|
||||
)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
if (
|
||||
|
|
@ -28278,6 +28328,105 @@ export class TeamProvisioningService {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Reports whether any bootstrap-confirmed member in the snapshot still carries
 * stale runtime state that a reconcile pass should clean up.
 *
 * Members are ignored when they are not bootstrap-confirmed, are hard failures,
 * or are OpenCode secondary-lane members (per
 * `isPersistedOpenCodeSecondaryLaneMember`).
 *
 * @param snapshot - Persisted launch snapshot, or null.
 * @returns true as soon as one eligible member has livenessKind
 *   'stale_metadata' or 'registered_only', pidSource 'persisted_metadata', or a
 *   runtime diagnostic that is clearable after bootstrap confirmation; false
 *   for a null snapshot or when no member matches.
 */
private needsConfirmedBootstrapDiagnosticReconcile(
|
||||
snapshot: PersistedTeamLaunchSnapshot | null
|
||||
): boolean {
|
||||
if (!snapshot) {
|
||||
return false;
|
||||
}
|
||||
for (const member of Object.values(snapshot.members)) {
|
||||
// Only healthy, bootstrap-confirmed, non-secondary-lane members are candidates.
if (
|
||||
member?.bootstrapConfirmed !== true ||
|
||||
member.hardFailure === true ||
|
||||
isPersistedOpenCodeSecondaryLaneMember(member)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
// Any one of these stale markers is enough to require a reconcile.
if (
|
||||
member.livenessKind === 'stale_metadata' ||
|
||||
member.livenessKind === 'registered_only' ||
|
||||
member.pidSource === 'persisted_metadata' ||
|
||||
shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(member.runtimeDiagnostic)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Produces a snapshot in which bootstrap-confirmed members no longer carry
 * stale runtime liveness state or clearable runtime diagnostics.
 *
 * A member is rewritten only when it is bootstrap-confirmed, not a hard
 * failure, not an OpenCode secondary-lane member, and shows at least one stale
 * marker: livenessKind 'stale_metadata' / 'registered_only', pidSource
 * 'persisted_metadata', a clearable runtime diagnostic, or
 * `bootstrapStalled === true`.
 *
 * The input snapshot is not modified: the member map is shallow-copied and each
 * rewritten member is a new object.
 *
 * NOTE(review): the rebuilt snapshot forwards only teamName, expectedMembers,
 * bootstrapExpectedMembers, leadSessionId, launchPhase, members and updatedAt
 * to `createPersistedLaunchSnapshot`; confirm no other snapshot fields need to
 * be carried over.
 *
 * @param snapshot - Persisted launch snapshot, or null.
 * @returns null for a null input; the very same `snapshot` reference when no
 *   member needed cleaning (callers compare by identity to detect changes);
 *   otherwise a newly created snapshot containing the cleaned members.
 */
private cleanConfirmedBootstrapRuntimeDiagnostics(
|
||||
snapshot: PersistedTeamLaunchSnapshot | null
|
||||
): PersistedTeamLaunchSnapshot | null {
|
||||
if (!snapshot) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let changed = false;
|
||||
const updatedAt = nowIso();
|
||||
// Shallow copy so entries can be replaced without touching snapshot.members.
const members: Record<string, PersistedTeamLaunchMemberState> = { ...snapshot.members };
|
||||
for (const memberName of this.getPersistedLaunchMemberNames(snapshot)) {
|
||||
const current = members[memberName];
|
||||
if (
|
||||
!current ||
|
||||
current.bootstrapConfirmed !== true ||
|
||||
current.hardFailure === true ||
|
||||
isPersistedOpenCodeSecondaryLaneMember(current)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const hasConfirmedBootstrapStaleRuntimeState =
|
||||
current.livenessKind === 'stale_metadata' ||
|
||||
current.livenessKind === 'registered_only' ||
|
||||
current.pidSource === 'persisted_metadata' ||
|
||||
shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(current.runtimeDiagnostic) ||
|
||||
current.bootstrapStalled === true;
|
||||
if (!hasConfirmedBootstrapStaleRuntimeState) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Replace stale/absent liveness and pid-source values with the
// bootstrap-confirmed ones; any other existing value is kept as-is.
const next: PersistedTeamLaunchMemberState = {
|
||||
...current,
|
||||
livenessKind:
|
||||
current.livenessKind === 'stale_metadata' ||
|
||||
current.livenessKind === 'registered_only' ||
|
||||
current.livenessKind == null
|
||||
? 'confirmed_bootstrap'
|
||||
: current.livenessKind,
|
||||
pidSource:
|
||||
current.pidSource === 'persisted_metadata' || current.pidSource == null
|
||||
? 'runtime_bootstrap'
|
||||
: current.pidSource,
|
||||
bootstrapStalled: undefined,
|
||||
diagnostics: undefined,
|
||||
lastEvaluatedAt: updatedAt,
|
||||
};
|
||||
// Drop a clearable diagnostic together with its severity; a severity with
// no diagnostic text is also dropped. Non-clearable diagnostics survive.
if (shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(next.runtimeDiagnostic)) {
|
||||
next.runtimeDiagnostic = undefined;
|
||||
next.runtimeDiagnosticSeverity = undefined;
|
||||
} else if (!next.runtimeDiagnostic) {
|
||||
next.runtimeDiagnosticSeverity = undefined;
|
||||
}
|
||||
// Recompute the launch state from the cleaned fields.
next.launchState = deriveMemberLaunchState(next);
|
||||
members[memberName] = next;
|
||||
changed = true;
|
||||
}
|
||||
|
||||
if (!changed) {
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
return createPersistedLaunchSnapshot({
|
||||
teamName: snapshot.teamName,
|
||||
expectedMembers: snapshot.expectedMembers,
|
||||
bootstrapExpectedMembers: snapshot.bootstrapExpectedMembers,
|
||||
leadSessionId: snapshot.leadSessionId,
|
||||
launchPhase: snapshot.launchPhase,
|
||||
members,
|
||||
updatedAt,
|
||||
});
|
||||
}
|
||||
|
||||
private async reconcilePersistedLaunchState(teamName: string): Promise<{
|
||||
snapshot: ReturnType<typeof createPersistedLaunchSnapshot> | null;
|
||||
statuses: Record<string, MemberSpawnStatusEntry>;
|
||||
|
|
@ -28315,11 +28464,15 @@ export class TeamProvisioningService {
|
|||
const promotedRecoveredMixedSnapshot = promoteOpenCodePersistedFailureReasonsFromDiagnostics(
|
||||
stableRecoveredMixedSnapshotWithCommittedEvidence
|
||||
);
|
||||
const cleanedRecoveredMixedSnapshot = this.cleanConfirmedBootstrapRuntimeDiagnostics(
|
||||
promotedRecoveredMixedSnapshot
|
||||
);
|
||||
const stableRecoveredMixedSnapshot =
|
||||
promotedRecoveredMixedSnapshot &&
|
||||
promotedRecoveredMixedSnapshot !== stableRecoveredMixedSnapshotWithCommittedEvidence
|
||||
? await this.writeLaunchStateSnapshot(teamName, promotedRecoveredMixedSnapshot)
|
||||
: promotedRecoveredMixedSnapshot;
|
||||
cleanedRecoveredMixedSnapshot &&
|
||||
(promotedRecoveredMixedSnapshot !== stableRecoveredMixedSnapshotWithCommittedEvidence ||
|
||||
cleanedRecoveredMixedSnapshot !== promotedRecoveredMixedSnapshot)
|
||||
? await this.writeLaunchStateSnapshot(teamName, cleanedRecoveredMixedSnapshot)
|
||||
: cleanedRecoveredMixedSnapshot;
|
||||
const filteredBootstrapSnapshot = bootstrapSnapshot
|
||||
? this.filterRemovedMembersFromLaunchSnapshot(bootstrapSnapshot, metaMembers)
|
||||
: null;
|
||||
|
|
@ -28331,6 +28484,7 @@ export class TeamProvisioningService {
|
|||
stableRecoveredMixedSnapshot,
|
||||
overlaidBootstrapSnapshot
|
||||
) &&
|
||||
!this.needsConfirmedBootstrapDiagnosticReconcile(stableRecoveredMixedSnapshot) &&
|
||||
!(await this.hasBootstrapTranscriptLaunchReconcileOutcome(stableRecoveredMixedSnapshot))
|
||||
) {
|
||||
return {
|
||||
|
|
@ -28361,15 +28515,18 @@ export class TeamProvisioningService {
|
|||
);
|
||||
const shouldPersistFailureReasonPromotion =
|
||||
promotedPersisted !== filteredPersistedWithBootstrapStall;
|
||||
const cleanedPersisted = this.cleanConfirmedBootstrapRuntimeDiagnostics(promotedPersisted);
|
||||
const shouldPersistConfirmedBootstrapDiagnosticCleanup = cleanedPersisted !== promotedPersisted;
|
||||
const shouldPersistBootstrapStallOverlay =
|
||||
filteredPersistedWithBootstrapStall !== filteredPersisted;
|
||||
const persistedWithCommittedEvidence =
|
||||
promotedPersisted &&
|
||||
cleanedPersisted &&
|
||||
(shouldPersistCommittedEvidenceOverlay ||
|
||||
shouldPersistFailureReasonPromotion ||
|
||||
shouldPersistConfirmedBootstrapDiagnosticCleanup ||
|
||||
shouldPersistBootstrapStallOverlay)
|
||||
? await this.writeLaunchStateSnapshot(teamName, promotedPersisted)
|
||||
: promotedPersisted;
|
||||
? await this.writeLaunchStateSnapshot(teamName, cleanedPersisted)
|
||||
: cleanedPersisted;
|
||||
const preferredSnapshot = choosePreferredLaunchSnapshot(
|
||||
overlaidBootstrapSnapshot,
|
||||
persistedWithCommittedEvidence
|
||||
|
|
@ -28413,6 +28570,7 @@ export class TeamProvisioningService {
|
|||
|
||||
const configPath = path.join(getTeamsBasePath(), teamName, 'config.json');
|
||||
let configMembers = new Set<string>();
|
||||
let configBootstrapRunIds = new Map<string, string>();
|
||||
let leadName = 'team-lead';
|
||||
try {
|
||||
const raw = await tryReadRegularFileUtf8(configPath, {
|
||||
|
|
@ -28421,14 +28579,26 @@ export class TeamProvisioningService {
|
|||
});
|
||||
if (raw) {
|
||||
const config = JSON.parse(raw) as {
|
||||
members?: { name?: string; agentType?: string }[];
|
||||
members?: { name?: string; agentType?: string; bootstrapRunId?: string }[];
|
||||
};
|
||||
leadName = config.members?.find((member) => isLeadMember(member))?.name?.trim() || leadName;
|
||||
const configuredMembers = config.members ?? [];
|
||||
leadName =
|
||||
configuredMembers.find((member) => isLeadMember(member))?.name?.trim() || leadName;
|
||||
configMembers = new Set(
|
||||
(config.members ?? [])
|
||||
configuredMembers
|
||||
.map((member) => (typeof member?.name === 'string' ? member.name.trim() : ''))
|
||||
.filter((name) => name.length > 0 && !isLeadMember({ name }))
|
||||
);
|
||||
configBootstrapRunIds = new Map(
|
||||
configuredMembers.flatMap((member) => {
|
||||
const name = typeof member?.name === 'string' ? member.name.trim() : '';
|
||||
const runId =
|
||||
typeof member?.bootstrapRunId === 'string' ? member.bootstrapRunId.trim() : '';
|
||||
return name.length > 0 && runId.length > 0 && !isLeadMember({ name })
|
||||
? [[name, runId] as const]
|
||||
: [];
|
||||
})
|
||||
);
|
||||
}
|
||||
} catch {
|
||||
// best-effort
|
||||
|
|
@ -28449,6 +28619,7 @@ export class TeamProvisioningService {
|
|||
persistedWithCommittedEvidence,
|
||||
overlaidBootstrapSnapshot
|
||||
) &&
|
||||
!this.needsConfirmedBootstrapDiagnosticReconcile(persistedWithCommittedEvidence) &&
|
||||
!(await this.hasBootstrapTranscriptLaunchReconcileOutcome(persistedWithCommittedEvidence))
|
||||
) {
|
||||
return {
|
||||
|
|
@ -28473,10 +28644,23 @@ export class TeamProvisioningService {
|
|||
lastEvaluatedAt: now,
|
||||
};
|
||||
const isOpenCodeSecondaryLaneMember = isPersistedOpenCodeSecondaryLaneMember(current);
|
||||
const matchedConfigNames = [...configMembers].filter((name) =>
|
||||
matchesObservedMemberNameForExpected(name, expected)
|
||||
);
|
||||
const configBootstrapRunId = matchedConfigNames
|
||||
.map((name) => configBootstrapRunIds.get(name))
|
||||
.find((runId): runId is string => typeof runId === 'string' && runId.length > 0);
|
||||
const currentBootstrapEvidenceBoundary = configBootstrapRunId
|
||||
? { ...current, runtimeRunId: configBootstrapRunId }
|
||||
: current;
|
||||
if (
|
||||
bootstrapMember?.agentToolAccepted &&
|
||||
!current.agentToolAccepted &&
|
||||
isBootstrapMemberEvidenceCurrentForMember(current, bootstrapMember, 'acceptance')
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
currentBootstrapEvidenceBoundary,
|
||||
bootstrapMember,
|
||||
'acceptance'
|
||||
)
|
||||
) {
|
||||
current.agentToolAccepted = true;
|
||||
current.firstSpawnAcceptedAt =
|
||||
|
|
@ -28486,14 +28670,15 @@ export class TeamProvisioningService {
|
|||
bootstrapMember?.bootstrapConfirmed &&
|
||||
!current.bootstrapConfirmed &&
|
||||
!isOpenCodeSecondaryLaneMember &&
|
||||
isBootstrapMemberEvidenceCurrentForMember(current, bootstrapMember, 'confirmation')
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
currentBootstrapEvidenceBoundary,
|
||||
bootstrapMember,
|
||||
'confirmation'
|
||||
)
|
||||
) {
|
||||
current.bootstrapConfirmed = true;
|
||||
current.lastHeartbeatAt = current.lastHeartbeatAt ?? bootstrapMember.lastHeartbeatAt;
|
||||
}
|
||||
const matchedConfigNames = [...configMembers].filter((name) =>
|
||||
matchesObservedMemberNameForExpected(name, expected)
|
||||
);
|
||||
const runtimeMetadataCandidates = [...liveRuntimeByMember.entries()].filter(([name]) =>
|
||||
matchesObservedMemberNameForExpected(name, expected)
|
||||
);
|
||||
|
|
@ -28619,6 +28804,25 @@ export class TeamProvisioningService {
|
|||
finalTimeoutReached: graceExpired,
|
||||
});
|
||||
}
|
||||
if (current.bootstrapConfirmed && !current.hardFailure && !isOpenCodeSecondaryLaneMember) {
|
||||
current.livenessKind =
|
||||
current.livenessKind === 'stale_metadata' ||
|
||||
current.livenessKind === 'registered_only' ||
|
||||
current.livenessKind == null
|
||||
? 'confirmed_bootstrap'
|
||||
: current.livenessKind;
|
||||
current.pidSource =
|
||||
current.pidSource === 'persisted_metadata' || current.pidSource == null
|
||||
? 'runtime_bootstrap'
|
||||
: current.pidSource;
|
||||
if (shouldClearRuntimeDiagnosticAfterBootstrapConfirmation(current.runtimeDiagnostic)) {
|
||||
current.runtimeDiagnostic = undefined;
|
||||
current.runtimeDiagnosticSeverity = undefined;
|
||||
} else if (!current.runtimeDiagnostic) {
|
||||
current.runtimeDiagnosticSeverity = undefined;
|
||||
}
|
||||
current.bootstrapStalled = undefined;
|
||||
}
|
||||
if (
|
||||
isOpenCodeSecondaryLaneMember &&
|
||||
shouldMarkPersistedOpenCodeBootstrapStalled(current, Date.now())
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ export const OPENCODE_BOOTSTRAP_PENDING_DIAGNOSTIC =
|
|||
export const OPENCODE_APP_MANAGED_BOOTSTRAP_PENDING_DIAGNOSTIC =
|
||||
'OpenCode app-managed bootstrap evidence is pending after materialized session.';
|
||||
|
||||
/**
 * Largest gap, in milliseconds (10 seconds), by which 'confirmation' evidence
 * may predate the spawn-acceptance boundary and still be treated as current by
 * `isBootstrapMemberEvidenceCurrentForMember`.
 */
const BOOTSTRAP_EVIDENCE_BOUNDARY_SKEW_MS = 10_000;
|
||||
const OPENCODE_MEMBER_SESSION_RECORDED_AT_PATTERN =
|
||||
/\bmember_session_recorded\s+at\s+([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:.+-]+Z?)\b/i;
|
||||
|
||||
|
|
@ -584,13 +585,29 @@ export function isRecoverableOpenCodeRuntimeEvidence(
|
|||
}
|
||||
|
||||
export function isBootstrapMemberEvidenceCurrentForMember(
|
||||
current: { firstSpawnAcceptedAt?: string; lastEvaluatedAt?: string },
|
||||
current: { firstSpawnAcceptedAt?: string; lastEvaluatedAt?: string; runtimeRunId?: string },
|
||||
bootstrapMember: Pick<
|
||||
PersistedTeamLaunchMemberState,
|
||||
'firstSpawnAcceptedAt' | 'lastHeartbeatAt' | 'lastRuntimeAliveAt' | 'lastEvaluatedAt'
|
||||
| 'firstSpawnAcceptedAt'
|
||||
| 'lastHeartbeatAt'
|
||||
| 'lastRuntimeAliveAt'
|
||||
| 'lastEvaluatedAt'
|
||||
| 'runtimeRunId'
|
||||
>,
|
||||
evidenceKind: 'acceptance' | 'confirmation'
|
||||
): boolean {
|
||||
const currentRuntimeRunId =
|
||||
typeof current.runtimeRunId === 'string' ? current.runtimeRunId.trim() : '';
|
||||
const bootstrapRuntimeRunId =
|
||||
typeof bootstrapMember.runtimeRunId === 'string' ? bootstrapMember.runtimeRunId.trim() : '';
|
||||
if (
|
||||
currentRuntimeRunId.length > 0 &&
|
||||
bootstrapRuntimeRunId.length > 0 &&
|
||||
currentRuntimeRunId !== bootstrapRuntimeRunId
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const bootstrapFirstSpawnAcceptedMs = Date.parse(bootstrapMember.firstSpawnAcceptedAt ?? '');
|
||||
const bootstrapLastEvaluatedMs = Date.parse(bootstrapMember.lastEvaluatedAt ?? '');
|
||||
const hasDurableBootstrapSpawnAcceptedAt =
|
||||
|
|
@ -615,5 +632,15 @@ export function isBootstrapMemberEvidenceCurrentForMember(
|
|||
Number.isFinite(firstSpawnAcceptedMs) &&
|
||||
(!Number.isFinite(lastEvaluatedMs) || firstSpawnAcceptedMs <= lastEvaluatedMs);
|
||||
const boundaryMs = hasDurableSpawnBoundary ? firstSpawnAcceptedMs : NaN;
|
||||
return !Number.isFinite(boundaryMs) || evidenceMs >= boundaryMs;
|
||||
const hasCompatibleRuntimeRunIdForSkew =
|
||||
currentRuntimeRunId.length === 0 ||
|
||||
(bootstrapRuntimeRunId.length > 0 && currentRuntimeRunId === bootstrapRuntimeRunId);
|
||||
const withinBootstrapConfirmationClockSkew =
|
||||
evidenceKind === 'confirmation' &&
|
||||
Number.isFinite(boundaryMs) &&
|
||||
boundaryMs - evidenceMs <= BOOTSTRAP_EVIDENCE_BOUNDARY_SKEW_MS &&
|
||||
hasCompatibleRuntimeRunIdForSkew;
|
||||
return (
|
||||
!Number.isFinite(boundaryMs) || evidenceMs >= boundaryMs || withinBootstrapConfirmationClockSkew
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1123,7 +1123,7 @@ export interface PersistedTeamLaunchMemberState {
|
|||
hardFailureReason?: string;
|
||||
pendingPermissionRequestIds?: string[];
|
||||
runtimePid?: number;
|
||||
/** OpenCode runtime run id that produced the current runtimeSessionId/liveness evidence. */
|
||||
/** Runtime/bootstrap run id that produced current liveness or bootstrap evidence. */
|
||||
runtimeRunId?: string;
|
||||
runtimeSessionId?: string;
|
||||
bootstrapEvidenceSource?: OpenCodeBootstrapEvidenceSource;
|
||||
|
|
|
|||
|
|
@ -316,6 +316,85 @@ describe('TeamProvisioningOpenCodeRuntimeEvidencePolicy', () => {
|
|||
expect(hasRecoverableOpenCodeBootstrapDiagnostic([])).toBe(false);
|
||||
});
|
||||
|
||||
it('accepts bootstrap evidence that slightly predates delayed spawn acceptance', () => {
|
||||
expect(
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:45.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:01:00.000Z',
|
||||
runtimeRunId: 'run-new',
|
||||
},
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:33.000Z',
|
||||
lastHeartbeatAt: '2026-01-01T00:00:42.500Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:00:42.500Z',
|
||||
},
|
||||
'confirmation'
|
||||
)
|
||||
).toBe(false);
|
||||
|
||||
expect(
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:45.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:01:00.000Z',
|
||||
runtimeRunId: 'run-new',
|
||||
},
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:33.000Z',
|
||||
lastHeartbeatAt: '2026-01-01T00:00:42.500Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:00:42.500Z',
|
||||
runtimeRunId: 'run-old',
|
||||
},
|
||||
'confirmation'
|
||||
)
|
||||
).toBe(false);
|
||||
|
||||
expect(
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:45.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:01:00.000Z',
|
||||
},
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:33.000Z',
|
||||
lastHeartbeatAt: '2026-01-01T00:00:42.500Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:00:42.500Z',
|
||||
},
|
||||
'confirmation'
|
||||
)
|
||||
).toBe(true);
|
||||
|
||||
expect(
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:45.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:01:00.000Z',
|
||||
},
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:20.000Z',
|
||||
lastHeartbeatAt: '2026-01-01T00:00:20.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:00:20.000Z',
|
||||
},
|
||||
'confirmation'
|
||||
)
|
||||
).toBe(false);
|
||||
|
||||
expect(
|
||||
isBootstrapMemberEvidenceCurrentForMember(
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:45.000Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:01:00.000Z',
|
||||
},
|
||||
{
|
||||
firstSpawnAcceptedAt: '2026-01-01T00:00:42.500Z',
|
||||
lastEvaluatedAt: '2026-01-01T00:00:42.500Z',
|
||||
},
|
||||
'acceptance'
|
||||
)
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
it('classifies recoverable persisted OpenCode runtime candidates', () => {
|
||||
expect(
|
||||
isRecoverablePersistedOpenCodeRuntimeCandidate(makePersisted({ runtimeSessionId: 'rt-1' }))
|
||||
|
|
|
|||
|
|
@ -371,13 +371,15 @@ function writeBootstrapState(
|
|||
lastObservedAt?: number;
|
||||
failureReason?: string;
|
||||
}[],
|
||||
updatedAt = new Date().toISOString()
|
||||
updatedAt = new Date().toISOString(),
|
||||
options?: { runId?: string }
|
||||
): void {
|
||||
fs.writeFileSync(
|
||||
getTeamBootstrapStatePath(teamName),
|
||||
`${JSON.stringify(
|
||||
{
|
||||
version: 1,
|
||||
...(options?.runId ? { runId: options.runId } : {}),
|
||||
teamName,
|
||||
updatedAt,
|
||||
phase: 'completed',
|
||||
|
|
@ -390,6 +392,17 @@ function writeBootstrapState(
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Test helper: rewrites the team's `config.json` under `tempTeamsBase` so that
 * the member whose `name` equals `memberName` gains `bootstrapRunId: runId`.
 * Other members are left untouched. The file must already exist and contain
 * valid JSON; a missing `members` array is written back as an empty array.
 *
 * @param teamName - Team directory name under `tempTeamsBase`.
 * @param memberName - Exact member name to update.
 * @param runId - Value stored as the member's `bootstrapRunId`.
 */
function writeMemberBootstrapRunId(teamName: string, memberName: string, runId: string): void {
|
||||
const configPath = path.join(tempTeamsBase, teamName, 'config.json');
|
||||
const config = JSON.parse(fs.readFileSync(configPath, 'utf8')) as {
|
||||
members?: Array<Record<string, unknown>>;
|
||||
};
|
||||
config.members = (config.members ?? []).map((member) =>
|
||||
member.name === memberName ? { ...member, bootstrapRunId: runId } : member
|
||||
);
|
||||
fs.writeFileSync(configPath, JSON.stringify(config), 'utf8');
|
||||
}
|
||||
|
||||
function writeAliveProcessRegistry(teamName: string): void {
|
||||
const teamDir = path.join(tempTeamsBase, teamName);
|
||||
fs.mkdirSync(teamDir, { recursive: true });
|
||||
|
|
@ -20028,10 +20041,293 @@ describe('TeamProvisioningService', () => {
|
|||
status: 'online',
|
||||
launchState: 'confirmed_alive',
|
||||
bootstrapConfirmed: true,
|
||||
runtimeAlive: false,
|
||||
livenessKind: 'confirmed_bootstrap',
|
||||
hardFailure: false,
|
||||
error: undefined,
|
||||
});
|
||||
expect(result.statuses.jack?.hardFailureReason).toBeUndefined();
|
||||
expect(result.statuses.jack?.runtimeDiagnostic).toBeUndefined();
|
||||
expect(result.statuses.jack?.runtimeDiagnosticSeverity).toBeUndefined();
|
||||
});
|
||||
|
||||
it('heals process-table unavailable failure when Anthropic bootstrap confirmation slightly predates delayed app acceptance', async () => {
|
||||
allowConsoleLogs();
|
||||
const teamName = 'zz-unit-process-table-unavailable-bootstrap-skew-heals';
|
||||
const leadSessionId = 'lead-session';
|
||||
const projectPath = '/Users/test/proj';
|
||||
const bootstrapAttemptAt = '2026-05-24T09:25:33.388Z';
|
||||
const bootstrapConfirmedAt = '2026-05-24T09:25:42.494Z';
|
||||
const appAcceptedAt = '2026-05-24T09:25:45.178Z';
|
||||
const cleanupAt = '2026-05-24T09:31:05.525Z';
|
||||
const runtimePid = 97_255;
|
||||
const bootstrapRunId = 'run-process-table-unavailable-skew';
|
||||
const reason = 'runtime pid could not be verified because process table is unavailable';
|
||||
|
||||
writeLaunchConfig(teamName, projectPath, leadSessionId, ['tom']);
|
||||
writeMemberBootstrapRunId(teamName, 'tom', bootstrapRunId);
|
||||
writeLaunchState(
|
||||
teamName,
|
||||
leadSessionId,
|
||||
{
|
||||
tom: {
|
||||
providerId: 'anthropic',
|
||||
model: 'haiku',
|
||||
laneId: 'primary',
|
||||
laneKind: 'primary',
|
||||
laneOwnerProviderId: 'codex',
|
||||
launchState: 'failed_to_start',
|
||||
agentToolAccepted: true,
|
||||
runtimeAlive: false,
|
||||
runtimePid,
|
||||
bootstrapConfirmed: false,
|
||||
hardFailure: true,
|
||||
hardFailureReason: reason,
|
||||
livenessKind: 'registered_only',
|
||||
runtimeDiagnostic: reason,
|
||||
runtimeDiagnosticSeverity: 'warning',
|
||||
firstSpawnAcceptedAt: appAcceptedAt,
|
||||
runtimeLastSeenAt: cleanupAt,
|
||||
lastEvaluatedAt: cleanupAt,
|
||||
},
|
||||
},
|
||||
{ launchPhase: 'finished', updatedAt: cleanupAt }
|
||||
);
|
||||
writeBootstrapState(
|
||||
teamName,
|
||||
[
|
||||
{
|
||||
name: 'tom',
|
||||
status: 'bootstrap_confirmed',
|
||||
lastAttemptAt: Date.parse(bootstrapAttemptAt),
|
||||
lastObservedAt: Date.parse(bootstrapConfirmedAt),
|
||||
},
|
||||
],
|
||||
cleanupAt,
|
||||
{ runId: bootstrapRunId }
|
||||
);
|
||||
|
||||
const svc = new TeamProvisioningService();
|
||||
privateHarness(svc).getLiveTeamAgentRuntimeMetadata = vi.fn(
|
||||
async () =>
|
||||
new Map([
|
||||
[
|
||||
'tom',
|
||||
{
|
||||
alive: false,
|
||||
backendType: 'process',
|
||||
providerId: 'anthropic',
|
||||
livenessKind: 'registered_only',
|
||||
pidSource: 'persisted_metadata',
|
||||
runtimeDiagnostic: reason,
|
||||
runtimeDiagnosticSeverity: 'warning',
|
||||
metricsPid: runtimePid,
|
||||
model: 'haiku',
|
||||
},
|
||||
],
|
||||
])
|
||||
);
|
||||
|
||||
const result = await svc.getMemberSpawnStatuses(teamName);
|
||||
|
||||
expect(result.teamLaunchState).toBe('clean_success');
|
||||
expect(result.statuses.tom).toMatchObject({
|
||||
status: 'online',
|
||||
launchState: 'confirmed_alive',
|
||||
bootstrapConfirmed: true,
|
||||
runtimeAlive: false,
|
||||
livenessKind: 'confirmed_bootstrap',
|
||||
hardFailure: false,
|
||||
error: undefined,
|
||||
});
|
||||
expect(result.statuses.tom?.hardFailureReason).toBeUndefined();
|
||||
expect(result.statuses.tom?.runtimeDiagnostic).toBeUndefined();
|
||||
expect(result.statuses.tom?.runtimeDiagnosticSeverity).toBeUndefined();
|
||||
});
|
||||
|
||||
it('does not heal rapid relaunch failures from previous bootstrap-state run id', async () => {
  // Scenario: the bootstrap-state file is tagged with a different run id than the one
  // recorded for the member, so its "bootstrap_confirmed" entry is expected to leave the
  // persisted hard failure in place.
  allowConsoleLogs();

  const team = 'zz-unit-process-table-unavailable-stale-rapid-run-ignored';
  const leadSession = 'lead-session';
  const cwd = '/Users/test/proj';
  const memberRunId = 'run-new-process-table-unavailable';
  const bootstrapStateRunId = 'run-old-process-table-unavailable';
  const pid = 97_255;
  const failureReason = 'runtime pid could not be verified because process table is unavailable';

  // Recorded timeline, oldest first. Confirmation predates app acceptance by ~2.7s.
  const attemptedAt = '2026-05-24T09:25:33.388Z';
  const confirmedAt = '2026-05-24T09:25:42.494Z';
  const acceptedAt = '2026-05-24T09:25:45.178Z';
  const finalizedAt = '2026-05-24T09:31:05.525Z';

  // Arrange: launch config, the member's run id, then the persisted failed launch state.
  writeLaunchConfig(team, cwd, leadSession, ['tom']);
  writeMemberBootstrapRunId(team, 'tom', memberRunId);
  writeLaunchState(
    team,
    leadSession,
    {
      tom: {
        providerId: 'anthropic',
        model: 'haiku',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'failed_to_start',
        agentToolAccepted: true,
        runtimeAlive: false,
        runtimePid: pid,
        bootstrapConfirmed: false,
        hardFailure: true,
        hardFailureReason: failureReason,
        livenessKind: 'registered_only',
        runtimeDiagnostic: failureReason,
        runtimeDiagnosticSeverity: 'warning',
        firstSpawnAcceptedAt: acceptedAt,
        runtimeLastSeenAt: finalizedAt,
        lastEvaluatedAt: finalizedAt,
      },
    },
    { launchPhase: 'finished', updatedAt: finalizedAt }
  );

  // Bootstrap-state claims confirmation, but under the other run id.
  writeBootstrapState(
    team,
    [
      {
        name: 'tom',
        status: 'bootstrap_confirmed',
        lastAttemptAt: Date.parse(attemptedAt),
        lastObservedAt: Date.parse(confirmedAt),
      },
    ],
    finalizedAt,
    { runId: bootstrapStateRunId }
  );

  const service = new TeamProvisioningService();
  // Stub the live runtime lookup: tom is not alive and only known via persisted metadata.
  privateHarness(service).getLiveTeamAgentRuntimeMetadata = vi.fn(async () => {
    return new Map([
      [
        'tom',
        {
          alive: false,
          backendType: 'process',
          providerId: 'anthropic',
          livenessKind: 'registered_only',
          pidSource: 'persisted_metadata',
          runtimeDiagnostic: failureReason,
          runtimeDiagnosticSeverity: 'warning',
          metricsPid: pid,
          model: 'haiku',
        },
      ],
    ]);
  });

  // Act
  const { teamLaunchState, statuses } = await service.getMemberSpawnStatuses(team);

  // Assert: the failure is still reported.
  expect(teamLaunchState).toBe('partial_failure');
  expect(statuses.tom).toMatchObject({
    status: 'error',
    launchState: 'failed_to_start',
    bootstrapConfirmed: false,
    hardFailure: true,
  });
});
|
||||
|
||||
it('heals post-stop stale pid diagnostics when bootstrap-state already confirmed the Anthropic member', async () => {
  // Scenario: launch state still holds a hard failure plus a later "pid is not alive"
  // diagnostic, while bootstrap-state (same run id) shows the member as confirmed.
  allowConsoleLogs();

  const team = 'zz-unit-post-stop-stale-pid-bootstrap-skew-heals';
  const leadSession = 'lead-session';
  const cwd = '/Users/test/proj';
  const runId = 'run-post-stop-stale-pid-bootstrap-skew';
  const pid = 97_255;
  const initialFailureReason =
    'runtime pid could not be verified because process table is unavailable';
  const staleDiagnostic = 'persisted runtime pid is not alive';

  // Recorded timeline, oldest first. Confirmation predates app acceptance by ~2.3s.
  const attemptedAt = '2026-05-24T09:25:33.388Z';
  const confirmedAt = '2026-05-24T09:25:42.904Z';
  const acceptedAt = '2026-05-24T09:25:45.178Z';
  const failedAt = '2026-05-24T09:31:05.525Z';
  const refreshedAfterStopAt = '2026-05-24T11:36:56.881Z';

  // Arrange: launch config, the member's run id, then the persisted failed launch state.
  writeLaunchConfig(team, cwd, leadSession, ['tom']);
  writeMemberBootstrapRunId(team, 'tom', runId);
  writeLaunchState(
    team,
    leadSession,
    {
      tom: {
        providerId: 'anthropic',
        model: 'haiku',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'failed_to_start',
        agentToolAccepted: true,
        runtimeAlive: false,
        runtimePid: pid,
        bootstrapConfirmed: false,
        hardFailure: true,
        hardFailureReason: initialFailureReason,
        livenessKind: 'stale_metadata',
        runtimeDiagnostic: staleDiagnostic,
        runtimeDiagnosticSeverity: 'warning',
        firstSpawnAcceptedAt: acceptedAt,
        runtimeLastSeenAt: failedAt,
        lastEvaluatedAt: failedAt,
      },
    },
    { launchPhase: 'finished', updatedAt: refreshedAfterStopAt }
  );

  // Bootstrap-state for the same run id reports tom as confirmed.
  writeBootstrapState(
    team,
    [
      {
        name: 'tom',
        status: 'bootstrap_confirmed',
        lastAttemptAt: Date.parse(attemptedAt),
        lastObservedAt: Date.parse(confirmedAt),
      },
    ],
    '2026-05-24T09:26:08.090Z',
    { runId }
  );

  const service = new TeamProvisioningService();
  // Stub the live runtime lookup: only stale persisted pid metadata is available for tom.
  privateHarness(service).getLiveTeamAgentRuntimeMetadata = vi.fn(async () => {
    return new Map([
      [
        'tom',
        {
          alive: false,
          backendType: 'process',
          providerId: 'anthropic',
          livenessKind: 'stale_metadata',
          pidSource: 'persisted_metadata',
          runtimeDiagnostic: staleDiagnostic,
          runtimeDiagnosticSeverity: 'warning',
          metricsPid: pid,
          model: 'haiku',
        },
      ],
    ]);
  });

  // Act
  const { teamLaunchState, statuses } = await service.getMemberSpawnStatuses(team);

  // Assert: the member is reported as confirmed and every failure/diagnostic field is gone.
  expect(teamLaunchState).toBe('clean_success');
  expect(statuses.tom).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
    runtimeAlive: false,
    livenessKind: 'confirmed_bootstrap',
    hardFailure: false,
    error: undefined,
  });
  expect(statuses.tom?.hardFailureReason).toBeUndefined();
  expect(statuses.tom?.runtimeDiagnostic).toBeUndefined();
  expect(statuses.tom?.runtimeDiagnosticSeverity).toBeUndefined();
});
|
||||
|
||||
it('does not heal cleanup-finalized launch failures from stale bootstrap-state confirmation', async () => {
|
||||
|
|
@@ -25884,6 +26180,292 @@ describe('TeamProvisioningService', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('reconciles mixed launch when Anthropic primary bootstrap confirmation slightly predates delayed app acceptance', async () => {
  // Scenario: four-member mixed team. Persisted launch state marks tom (Anthropic, primary
  // lane) as failed, while bootstrap-state for the same run id lists tom as confirmed with
  // an observation timestamp a few seconds before tom's firstSpawnAcceptedAt.
  const team = 'mixed-anthropic-primary-bootstrap-skew-heals';
  const runId = 'run-mixed-anthropic-primary-bootstrap-skew';
  const failureReason = 'runtime pid could not be verified because process table is unavailable';
  const staleDiagnostic = 'persisted runtime pid is not alive';

  // Timestamps shared between fixtures.
  const tomFailedAt = '2026-05-24T09:31:05.525Z';
  const secondaryHeartbeatAt = '2026-05-24T09:31:39.741Z';

  // Arrange: team and member metadata.
  writeTeamMeta(team, {
    providerId: 'codex',
    providerBackendId: 'codex-native',
    model: 'gpt-5.5',
  });
  writeMembersMeta(team, [
    { name: 'alice', providerId: 'codex', model: 'gpt-5.5' },
    { name: 'tom', providerId: 'anthropic', model: 'haiku' },
    { name: 'bob', providerId: 'opencode', model: 'opencode/deepseek-v4-flash-free' },
    { name: 'jack', providerId: 'opencode', model: 'opencode/big-pickle' },
  ]);

  // Only alice and tom are part of the launch config / bootstrap run.
  writeLaunchConfig(team, '/Users/test/proj', 'lead-session', ['alice', 'tom']);
  writeMemberBootstrapRunId(team, 'alice', runId);
  writeMemberBootstrapRunId(team, 'tom', runId);
  writeBootstrapState(
    team,
    [
      {
        name: 'alice',
        status: 'bootstrap_confirmed',
        lastAttemptAt: Date.parse('2026-05-24T09:25:28.034Z'),
        lastObservedAt: Date.parse('2026-05-24T09:26:07.735Z'),
      },
      {
        name: 'tom',
        status: 'bootstrap_confirmed',
        lastAttemptAt: Date.parse('2026-05-24T09:25:33.388Z'),
        lastObservedAt: Date.parse('2026-05-24T09:25:42.494Z'),
      },
    ],
    '2026-05-24T09:26:08.090Z',
    { runId }
  );

  // Persisted launch snapshot, written directly as pretty-printed JSON plus trailing newline.
  const launchStatePath = getTeamLaunchStatePath(team);
  const launchSnapshot = createPersistedLaunchSnapshot({
    teamName: team,
    leadSessionId: 'lead-session',
    launchPhase: 'finished',
    expectedMembers: ['alice', 'tom', 'bob', 'jack'],
    bootstrapExpectedMembers: ['alice', 'tom'],
    members: {
      alice: {
        name: 'alice',
        providerId: 'codex',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        firstSpawnAcceptedAt: '2026-05-24T09:25:45.176Z',
        lastHeartbeatAt: '2026-05-24T09:26:07.735Z',
        lastEvaluatedAt: '2026-05-24T09:26:09.249Z',
      },
      // The member under test: persisted as a hard failure with a stale-pid diagnostic.
      tom: {
        name: 'tom',
        providerId: 'anthropic',
        model: 'haiku',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'failed_to_start',
        agentToolAccepted: true,
        runtimeAlive: false,
        runtimePid: 97_255,
        bootstrapConfirmed: false,
        hardFailure: true,
        hardFailureReason: failureReason,
        livenessKind: 'stale_metadata',
        runtimeDiagnostic: staleDiagnostic,
        runtimeDiagnosticSeverity: 'warning',
        firstSpawnAcceptedAt: '2026-05-24T09:25:45.178Z',
        runtimeLastSeenAt: tomFailedAt,
        lastEvaluatedAt: tomFailedAt,
      },
      bob: {
        name: 'bob',
        providerId: 'opencode',
        model: 'opencode/deepseek-v4-flash-free',
        laneId: 'secondary:opencode:bob',
        laneKind: 'secondary',
        laneOwnerProviderId: 'opencode',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        runtimePid: 2_756,
        runtimeSessionId: 'ses_bob',
        livenessKind: 'confirmed_bootstrap',
        lastHeartbeatAt: secondaryHeartbeatAt,
        lastEvaluatedAt: secondaryHeartbeatAt,
      },
      jack: {
        name: 'jack',
        providerId: 'opencode',
        model: 'opencode/big-pickle',
        laneId: 'secondary:opencode:jack',
        laneKind: 'secondary',
        laneOwnerProviderId: 'opencode',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        runtimePid: 2_756,
        runtimeSessionId: 'ses_jack',
        livenessKind: 'confirmed_bootstrap',
        lastHeartbeatAt: secondaryHeartbeatAt,
        lastEvaluatedAt: secondaryHeartbeatAt,
      },
    },
    updatedAt: '2026-05-24T11:36:56.881Z',
  });
  fs.writeFileSync(launchStatePath, `${JSON.stringify(launchSnapshot, null, 2)}\n`, 'utf8');

  // Act
  const service = new TeamProvisioningService();
  const { teamLaunchState, statuses } = await service.getMemberSpawnStatuses(team);

  // Assert: tom is reported as confirmed with no leftover diagnostics.
  expect(teamLaunchState).toBe('clean_success');
  expect(statuses.tom).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
    runtimeAlive: false,
    livenessKind: 'confirmed_bootstrap',
    hardFailure: false,
    error: undefined,
  });
  expect(statuses.tom?.runtimeDiagnostic).toBeUndefined();
  expect(statuses.tom?.runtimeDiagnosticSeverity).toBeUndefined();

  // The secondary-lane members stay confirmed.
  expect(statuses.bob).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
  });
  expect(statuses.jack).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
  });
});
|
||||
|
||||
it('cleans stale confirmed primary diagnostics from an already successful mixed launch', async () => {
  // Scenario: every member is already persisted as confirmed_alive, but tom still carries a
  // stale "pid is not alive" warning. Both the returned status and the file on disk are
  // expected to come back without that diagnostic.
  const team = 'mixed-confirmed-primary-stale-diagnostic-cleans';

  // Timestamps shared between fixtures.
  const lastRefreshAt = '2026-05-24T12:04:48.900Z';
  const secondaryHeartbeatAt = '2026-05-24T09:31:39.741Z';

  // Arrange: team and member metadata.
  writeTeamMeta(team, {
    providerId: 'codex',
    providerBackendId: 'codex-native',
    model: 'gpt-5.5',
  });
  writeMembersMeta(team, [
    { name: 'alice', providerId: 'codex', model: 'gpt-5.5' },
    { name: 'tom', providerId: 'anthropic', model: 'haiku' },
    { name: 'bob', providerId: 'opencode', model: 'opencode/deepseek-v4-flash-free' },
    { name: 'jack', providerId: 'opencode', model: 'opencode/big-pickle' },
  ]);
  writeLaunchConfig(team, '/Users/test/proj', 'lead-session', ['alice', 'tom']);

  // Persisted launch snapshot, written directly as pretty-printed JSON plus trailing newline.
  const launchStatePath = getTeamLaunchStatePath(team);
  const launchSnapshot = createPersistedLaunchSnapshot({
    teamName: team,
    leadSessionId: 'lead-session',
    launchPhase: 'finished',
    expectedMembers: ['alice', 'tom', 'bob', 'jack'],
    members: {
      alice: {
        name: 'alice',
        providerId: 'codex',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        lastEvaluatedAt: lastRefreshAt,
      },
      // Confirmed member that still has stale persisted-pid diagnostics attached.
      tom: {
        name: 'tom',
        providerId: 'anthropic',
        model: 'haiku',
        laneId: 'primary',
        laneKind: 'primary',
        laneOwnerProviderId: 'codex',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: false,
        runtimePid: 97_255,
        bootstrapConfirmed: true,
        hardFailure: false,
        livenessKind: 'stale_metadata',
        pidSource: 'persisted_metadata',
        runtimeDiagnostic: 'persisted runtime pid is not alive',
        runtimeDiagnosticSeverity: 'warning',
        firstSpawnAcceptedAt: '2026-05-24T09:25:45.178Z',
        lastHeartbeatAt: '2026-05-24T09:25:42.904Z',
        runtimeLastSeenAt: '2026-05-24T09:31:05.525Z',
        lastEvaluatedAt: lastRefreshAt,
      },
      bob: {
        name: 'bob',
        providerId: 'opencode',
        model: 'opencode/deepseek-v4-flash-free',
        laneId: 'secondary:opencode:bob',
        laneKind: 'secondary',
        laneOwnerProviderId: 'opencode',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        runtimePid: 2_756,
        runtimeSessionId: 'ses_bob',
        livenessKind: 'confirmed_bootstrap',
        lastHeartbeatAt: secondaryHeartbeatAt,
        lastEvaluatedAt: secondaryHeartbeatAt,
      },
      jack: {
        name: 'jack',
        providerId: 'opencode',
        model: 'opencode/big-pickle',
        laneId: 'secondary:opencode:jack',
        laneKind: 'secondary',
        laneOwnerProviderId: 'opencode',
        launchState: 'confirmed_alive',
        agentToolAccepted: true,
        runtimeAlive: true,
        bootstrapConfirmed: true,
        hardFailure: false,
        runtimePid: 2_756,
        runtimeSessionId: 'ses_jack',
        livenessKind: 'confirmed_bootstrap',
        lastHeartbeatAt: secondaryHeartbeatAt,
        lastEvaluatedAt: secondaryHeartbeatAt,
      },
    },
    updatedAt: lastRefreshAt,
  });
  fs.writeFileSync(launchStatePath, `${JSON.stringify(launchSnapshot, null, 2)}\n`, 'utf8');

  // Act
  const service = new TeamProvisioningService();
  const { teamLaunchState, statuses } = await service.getMemberSpawnStatuses(team);

  // Assert: the returned status for tom has no diagnostics.
  expect(teamLaunchState).toBe('clean_success');
  expect(statuses.tom).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
    runtimeAlive: false,
    livenessKind: 'confirmed_bootstrap',
    hardFailure: false,
    error: undefined,
  });
  expect(statuses.tom?.runtimeDiagnostic).toBeUndefined();
  expect(statuses.tom?.runtimeDiagnosticSeverity).toBeUndefined();

  // Assert: the launch-state file on disk was rewritten without the diagnostics as well.
  const persistedText = await fsPromises.readFile(getTeamLaunchStatePath(team), 'utf8');
  const persisted = JSON.parse(persistedText);
  expect(persisted.members.tom).toMatchObject({
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
    livenessKind: 'confirmed_bootstrap',
  });
  expect(persisted.members.tom.runtimeDiagnostic).toBeUndefined();
  expect(persisted.members.tom.runtimeDiagnosticSeverity).toBeUndefined();
});
|
||||
|
||||
it('does not collapse persisted mixed secondary failures when primary bootstrap snapshot is clean and richer', async () => {
|
||||
const teamName = 'mixed-clean-bootstrap-does-not-collapse-secondary-failure';
|
||||
writeMembersMeta(teamName, [
|
||||
|
|
|
|||
|
|
@@ -91,6 +91,38 @@ describe('member launch diagnostics', () => {
|
|||
expect(formatMemberLaunchDiagnosticsPayload(payload)).toContain('"memberCardError"');
|
||||
});
|
||||
|
||||
it('does not surface post-stop stale runtime warnings as confirmed member card errors', () => {
  // The spawn entry says the member is confirmed; the runtime entry carries a stale-pid
  // warning. The payload should keep the diagnostic text but not expose it as an error.
  const member = 'tom';
  const observedAt = '2026-05-24T12:04:48.900Z';
  const staleDiagnostic = 'persisted runtime pid is not alive';

  const diagnostics = buildMemberLaunchDiagnosticsPayload({
    teamName: 'forge-labs-11',
    runId: 'e90c7699-54d7-449e-8a4a-6a3276396926',
    memberName: member,
    spawnEntry: {
      status: 'online',
      launchState: 'confirmed_alive',
      agentToolAccepted: true,
      runtimeAlive: false,
      bootstrapConfirmed: true,
      hardFailure: false,
      livenessKind: 'confirmed_bootstrap',
      updatedAt: observedAt,
    },
    runtimeEntry: {
      memberName: member,
      alive: false,
      restartable: true,
      livenessKind: 'stale_metadata',
      runtimeDiagnostic: staleDiagnostic,
      runtimeDiagnosticSeverity: 'warning',
      updatedAt: observedAt,
    },
  });

  // No error is surfaced through any of the error accessors...
  expect(diagnostics.memberCardError).toBeUndefined();
  expect(hasMemberLaunchDiagnosticsError(diagnostics)).toBe(false);
  expect(getMemberLaunchDiagnosticsErrorMessage(diagnostics)).toBeUndefined();
  // ...while the raw runtime diagnostic is still carried in the payload.
  expect(diagnostics.runtimeDiagnostic).toBe(staleDiagnostic);
});
|
||||
|
||||
it('includes runtime advisory evidence in copy diagnostics', () => {
|
||||
const payload = buildMemberLaunchDiagnosticsPayload({
|
||||
memberName: 'alice',
|
||||
|
|
|
|||
Loading…
Reference in a new issue