From 8ef89eefcead6b3dfb70b9466b12e61dd55fc18c Mon Sep 17 00:00:00 2001 From: iliya Date: Tue, 7 Apr 2026 00:43:38 +0300 Subject: [PATCH] feat(team): add bootstrap recovery and app integration --- src/main/http/teams.ts | 12 +- .../services/team/TeamBootstrapStateReader.ts | 713 ++++++++++++++++++ src/main/services/team/TeamConfigReader.ts | 26 +- .../services/team/TeamProvisioningService.ts | 488 +++++++++++- test/main/http/teams.test.ts | 9 +- .../team/TeamBootstrapStateReader.test.ts | 352 +++++++++ 6 files changed, 1563 insertions(+), 37 deletions(-) create mode 100644 src/main/services/team/TeamBootstrapStateReader.ts create mode 100644 test/main/services/team/TeamBootstrapStateReader.test.ts diff --git a/src/main/http/teams.ts b/src/main/http/teams.ts index 77d9feb9..0166e058 100644 --- a/src/main/http/teams.ts +++ b/src/main/http/teams.ts @@ -175,7 +175,7 @@ export function registerTeamRoutes(app: FastifyInstance, services: HttpServices) const teamProvisioningService = getTeamProvisioningService(services); teamProvisioningService.stopTeam(validatedTeamName.value!); - return reply.send(teamProvisioningService.getRuntimeState(validatedTeamName.value!)); + return reply.send(await teamProvisioningService.getRuntimeState(validatedTeamName.value!)); } catch (error) { if (shouldLogError(error)) { logger.error( @@ -198,7 +198,7 @@ export function registerTeamRoutes(app: FastifyInstance, services: HttpServices) } return reply.send( - getTeamProvisioningService(services).getRuntimeState(validatedTeamName.value!) + await getTeamProvisioningService(services).getRuntimeState(validatedTeamName.value!) ); } catch (error) { if (shouldLogError(error)) { @@ -236,9 +236,11 @@ export function registerTeamRoutes(app: FastifyInstance, services: HttpServices) app.get('/api/teams/runtime/alive', async (_request, reply) => { try { const teamProvisioningService = getTeamProvisioningService(services); - const runtimeStates = teamProvisioningService - .getAliveTeams() - .map((teamName) => teamProvisioningService.getRuntimeState(teamName)); + const runtimeStates = await Promise.all( + teamProvisioningService + .getAliveTeams() + .map((teamName) => teamProvisioningService.getRuntimeState(teamName)) + ); return reply.send(runtimeStates); } catch (error) { if (shouldLogError(error)) { diff --git a/src/main/services/team/TeamBootstrapStateReader.ts b/src/main/services/team/TeamBootstrapStateReader.ts new file mode 100644 index 00000000..1bce8d39 --- /dev/null +++ b/src/main/services/team/TeamBootstrapStateReader.ts @@ -0,0 +1,713 @@ +import { getTeamsBasePath } from '@main/utils/pathDecoder'; +import { createPersistedLaunchSnapshot } from './TeamLaunchStateEvaluator'; +import * as fs from 'fs'; +import * as path from 'path'; + +import type { + PersistedTeamLaunchMemberState, + PersistedTeamLaunchSnapshot, + TeamProvisioningProgress, + TeamRuntimeState, +} from '@shared/types'; + +const TEAM_BOOTSTRAP_STATE_FILE = 'bootstrap-state.json'; +const TEAM_BOOTSTRAP_JOURNAL_FILE = 'bootstrap-journal.jsonl'; +const TEAM_BOOTSTRAP_LOCK_DIR = '.bootstrap.lock'; +const TEAM_BOOTSTRAP_LOCK_METADATA_FILE = 'metadata.json'; +const MAX_BOOTSTRAP_STATE_BYTES = 256 * 1024; +const MAX_BOOTSTRAP_JOURNAL_BYTES = 256 * 1024; +const MAX_BOOTSTRAP_LOCK_METADATA_BYTES = 64 * 1024; +const ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS = 3 * 60 * 1000; + +type RawBootstrapMemberState = { + name?: unknown; + status?: unknown; + lastAttemptAt?: unknown; + lastObservedAt?: unknown; + failureReason?: unknown; +}; + +type RawBootstrapState = { + version?: unknown; + runId?: unknown; + teamName?: unknown; + startedAt?: unknown; + ownerPid?: unknown; + updatedAt?: unknown; + phase?: unknown; + realTaskSubmissionState?: unknown; + members?: unknown; + terminal?: unknown; +}; + +type RawBootstrapJournalRecord = + | { ts?: unknown; type?: 'phase'; phase?: unknown } + | { ts?: unknown; type?: 'lock'; action?: unknown; ownerPid?: unknown; detail?: unknown } + | { ts?: unknown; type?: 'member'; name?: unknown; action?: unknown; detail?: unknown } + | { ts?: unknown; type?: 'terminal'; status?: unknown; reason?: unknown } + | { ts?: unknown; type?: 'real_task'; state?: unknown; detail?: unknown }; + +type RawBootstrapLockMetadata = { + pid?: unknown; + runId?: unknown; + requestHash?: unknown; + ownerStartedAt?: unknown; + createdAt?: unknown; + nonce?: unknown; +}; + +type BootstrapStateInspection = { + raw: RawBootstrapState | null; + issue?: string; +}; + +type BootstrapJournalInspection = { + warnings?: string[]; + issue?: string; +}; + +type BootstrapLockMetadata = { + pid: number; + runId: string; + ownerStartedAt?: number; +}; + +type BootstrapRuntimePhase = + | 'validating_spec' + | 'loading_existing_state' + | 'acquiring_bootstrap_lock' + | 'creating_team' + | 'spawning_members' + | 'auditing_truth' + | 'completed' + | 'failed' + | 'canceled'; + +function isBootstrapPhaseTerminal(phase: BootstrapRuntimePhase): boolean { + return phase === 'completed' || phase === 'failed' || phase === 'canceled'; +} + +function isProcessAlive(pid: number): boolean { + if (!Number.isFinite(pid) || pid <= 0) { + return false; + } + try { + process.kill(pid, 0); + return true; + } catch (error) { + return (error as NodeJS.ErrnoException | undefined)?.code === 'EPERM'; + } +} + +function classifyBootstrapOwnerState(raw: RawBootstrapState): { + ownerDead: boolean; + stale: boolean; + failureReason?: string; +} { + const phase = typeof raw.phase === 'string' ? (raw.phase as BootstrapRuntimePhase) : null; + if (!phase || isBootstrapPhaseTerminal(phase)) { + return { ownerDead: false, stale: false }; + } + + const ownerPid = typeof raw.ownerPid === 'number' ? raw.ownerPid : null; + if (ownerPid === null || isProcessAlive(ownerPid)) { + return { ownerDead: false, stale: false }; + } + + const updatedAtMs = + typeof raw.updatedAt === 'number' + ? raw.updatedAt + : typeof raw.updatedAt === 'string' + ? Date.parse(raw.updatedAt) + : NaN; + const stale = + Number.isFinite(updatedAtMs) && + Date.now() - updatedAtMs >= ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS; + + return { + ownerDead: true, + stale, + failureReason: stale + ? `bootstrap owner pid ${ownerPid} is gone and persisted bootstrap state is stale` + : `bootstrap owner pid ${ownerPid} is gone before bootstrap reached a terminal state`, + }; +} + +async function inspectBootstrapState(teamName: string): Promise { + const targetPath = getTeamBootstrapStatePath(teamName); + try { + const stat = await fs.promises.lstat(targetPath); + if (stat.isSymbolicLink()) { + return { + raw: null, + issue: + 'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is a symlink.', + }; + } + if (!stat.isFile()) { + return { + raw: null, + issue: + 'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is not a regular file.', + }; + } + if (stat.size > MAX_BOOTSTRAP_STATE_BYTES) { + return { + raw: null, + issue: + 'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is oversized.', + }; + } + const raw = JSON.parse(await fs.promises.readFile(targetPath, 'utf8')) as RawBootstrapState; + if (raw.version !== 1) { + return { + raw: null, + issue: + 'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json has an unsupported schema version.', + }; + } + return { raw }; + } catch (error) { + if ((error as NodeJS.ErrnoException | undefined)?.code === 'ENOENT') { + return { raw: null }; + } + return { + raw: null, + issue: + 'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is invalid, truncated, or inaccessible.', + }; + } +} + +async function readRawBootstrapState(teamName: string): Promise { + return (await inspectBootstrapState(teamName)).raw; +} + +function getBootstrapProgressProjection( + phase: BootstrapRuntimePhase, + memberCount: number +): { state: Exclude; message: string } | null { + switch (phase) { + case 'validating_spec': + return { + state: 'validating', + message: 'Validating deterministic bootstrap spec', + }; + case 'loading_existing_state': + return { + state: 'configuring', + message: 'Loading existing team state', + }; + case 'acquiring_bootstrap_lock': + return { + state: 'configuring', + message: 'Acquiring deterministic bootstrap lock', + }; + case 'creating_team': + return { + state: 'assembling', + message: 'Creating team config', + }; + case 'spawning_members': + return { + state: 'assembling', + message: + memberCount > 0 + ? `Spawning teammate runtimes (${memberCount})` + : 'Spawning teammate runtimes', + }; + case 'auditing_truth': + return { + state: 'finalizing', + message: 'Auditing registered teammates and bootstrap truth', + }; + case 'completed': + return { + state: 'ready', + message: 'Deterministic bootstrap completed', + }; + case 'failed': + return { + state: 'failed', + message: 'Deterministic bootstrap failed', + }; + case 'canceled': + return { + state: 'cancelled', + message: 'Deterministic bootstrap cancelled', + }; + default: + return null; + } +} + +function toIso(value: unknown, fallback: string): string { + if (typeof value === 'string' && value.trim().length > 0) { + return value; + } + if (typeof value === 'number' && Number.isFinite(value) && value > 0) { + return new Date(value).toISOString(); + } + return fallback; +} + +function normalizeBootstrapMemberState( + memberName: string, + raw: RawBootstrapMemberState, + updatedAt: string +): PersistedTeamLaunchMemberState { + const status = typeof raw.status === 'string' ? raw.status : 'pending'; + const hardFailure = status === 'failed'; + const bootstrapConfirmed = status === 'bootstrap_confirmed'; + const runtimeAlive = bootstrapConfirmed || status === 'runtime_alive'; + const agentToolAccepted = + bootstrapConfirmed || + runtimeAlive || + status === 'registered' || + status === 'spawn_started' || + hardFailure; + + return { + name: memberName, + launchState: hardFailure + ? 'failed_to_start' + : bootstrapConfirmed + ? 'confirmed_alive' + : runtimeAlive || agentToolAccepted + ? 'runtime_pending_bootstrap' + : 'starting', + agentToolAccepted, + runtimeAlive, + bootstrapConfirmed, + hardFailure, + hardFailureReason: + typeof raw.failureReason === 'string' && raw.failureReason.trim().length > 0 + ? raw.failureReason.trim() + : undefined, + firstSpawnAcceptedAt: agentToolAccepted ? toIso(raw.lastAttemptAt, updatedAt) : undefined, + lastHeartbeatAt: bootstrapConfirmed ? toIso(raw.lastObservedAt, updatedAt) : undefined, + lastRuntimeAliveAt: runtimeAlive ? toIso(raw.lastObservedAt, updatedAt) : undefined, + lastEvaluatedAt: toIso(raw.lastObservedAt, updatedAt), + sources: { + configRegistered: + status === 'registered' || + status === 'runtime_alive' || + status === 'bootstrap_confirmed' || + hardFailure, + processAlive: runtimeAlive || undefined, + hardFailureSignal: hardFailure || undefined, + }, + diagnostics: hardFailure + ? [ + typeof raw.failureReason === 'string' && raw.failureReason.trim().length > 0 + ? raw.failureReason.trim() + : 'deterministic bootstrap failed', + ] + : runtimeAlive + ? bootstrapConfirmed + ? ['late heartbeat received'] + : ['runtime alive', 'waiting for bootstrap'] + : agentToolAccepted + ? ['spawn accepted'] + : undefined, + }; +} + +export function getTeamBootstrapStatePath(teamName: string): string { + return path.join(getTeamsBasePath(), teamName, TEAM_BOOTSTRAP_STATE_FILE); +} + +function getTeamBootstrapJournalPath(teamName: string): string { + return path.join(getTeamsBasePath(), teamName, TEAM_BOOTSTRAP_JOURNAL_FILE); +} + +function getTeamBootstrapLockMetadataPath(teamName: string): string { + return path.join( + getTeamsBasePath(), + teamName, + TEAM_BOOTSTRAP_LOCK_DIR, + TEAM_BOOTSTRAP_LOCK_METADATA_FILE + ); +} + +async function readBootstrapLockMetadata(teamName: string): Promise { + const targetPath = getTeamBootstrapLockMetadataPath(teamName); + try { + const stat = await fs.promises.lstat(targetPath); + if (stat.isSymbolicLink() || !stat.isFile() || stat.size > MAX_BOOTSTRAP_LOCK_METADATA_BYTES) { + return null; + } + const raw = JSON.parse( + await fs.promises.readFile(targetPath, 'utf8') + ) as RawBootstrapLockMetadata; + if ( + typeof raw.pid !== 'number' || + !Number.isFinite(raw.pid) || + raw.pid <= 0 || + typeof raw.runId !== 'string' || + raw.runId.trim().length === 0 + ) { + return null; + } + return { + pid: raw.pid, + runId: raw.runId.trim(), + ownerStartedAt: + typeof raw.ownerStartedAt === 'number' && Number.isFinite(raw.ownerStartedAt) + ? raw.ownerStartedAt + : undefined, + }; + } catch { + return null; + } +} + +async function readBootstrapJournalWarnings(teamName: string): Promise { + return (await inspectBootstrapJournal(teamName)).warnings; +} + +async function inspectBootstrapJournal(teamName: string): Promise { + const targetPath = getTeamBootstrapJournalPath(teamName); + try { + const stat = await fs.promises.lstat(targetPath); + if (stat.isSymbolicLink()) { + return { + issue: + 'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is a symlink.', + }; + } + if (!stat.isFile()) { + return { + issue: + 'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is not a regular file.', + }; + } + if (stat.size > MAX_BOOTSTRAP_JOURNAL_BYTES) { + return { + issue: + 'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is oversized.', + }; + } + + const raw = await fs.promises.readFile(targetPath, 'utf8'); + const lines = raw + .split('\n') + .map((line) => line.trim()) + .filter((line) => line.length > 0) + .slice(-3); + + const messages = lines + .map((line) => { + try { + return JSON.parse(line) as RawBootstrapJournalRecord; + } catch { + return null; + } + }) + .filter((record): record is RawBootstrapJournalRecord => Boolean(record)) + .map((record) => { + if (record.type === 'phase' && typeof record.phase === 'string') { + return `bootstrap phase: ${record.phase}`; + } + if (record.type === 'lock' && typeof record.action === 'string') { + const owner = typeof record.ownerPid === 'number' ? ` (pid ${record.ownerPid})` : ''; + return `bootstrap lock ${record.action}${owner}`; + } + if ( + record.type === 'member' && + typeof record.name === 'string' && + typeof record.action === 'string' + ) { + return typeof record.detail === 'string' && record.detail.trim().length > 0 + ? `${record.name}: ${record.action} (${record.detail.trim()})` + : `${record.name}: ${record.action}`; + } + if (record.type === 'terminal' && typeof record.status === 'string') { + return typeof record.reason === 'string' && record.reason.trim().length > 0 + ? `bootstrap ${record.status}: ${record.reason.trim()}` + : `bootstrap ${record.status}`; + } + if (record.type === 'real_task' && typeof record.state === 'string') { + return typeof record.detail === 'string' && record.detail.trim().length > 0 + ? `first task ${record.state}: ${record.detail.trim()}` + : `first task ${record.state}`; + } + return null; + }) + .filter((item): item is string => Boolean(item)); + + return { + warnings: + messages.length > 0 + ? [`Recent deterministic bootstrap events: ${messages.join(' | ')}`] + : undefined, + }; + } catch (error) { + if ((error as NodeJS.ErrnoException | undefined)?.code === 'ENOENT') { + return {}; + } + return { + issue: + 'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is invalid, truncated, or inaccessible.', + }; + } +} + +async function readDegradedBootstrapRuntimeState( + teamName: string, + stateIssue: string +): Promise { + const lockMetadata = await readBootstrapLockMetadata(teamName); + if (!lockMetadata) { + return null; + } + + const journalInspection = await inspectBootstrapJournal(teamName); + const warnings = [ + stateIssue, + journalInspection.issue, + ...(journalInspection.warnings ?? []), + ].filter((item): item is string => typeof item === 'string' && item.trim().length > 0); + const ownerAlive = isProcessAlive(lockMetadata.pid); + const now = new Date().toISOString(); + + return { + teamName, + isAlive: false, + runId: lockMetadata.runId, + progress: { + runId: lockMetadata.runId, + teamName, + state: ownerAlive ? 'configuring' : 'failed', + message: ownerAlive + ? 'Deterministic bootstrap recovery is degraded because persisted bootstrap state is unreadable' + : 'Deterministic bootstrap recovery failed because persisted bootstrap state is unreadable and the bootstrap owner is gone', + messageSeverity: 'warning', + error: ownerAlive + ? stateIssue + : `${stateIssue} Bootstrap owner pid ${lockMetadata.pid} is not alive.`, + warnings: warnings.length > 0 ? warnings : undefined, + startedAt: + typeof lockMetadata.ownerStartedAt === 'number' && + Number.isFinite(lockMetadata.ownerStartedAt) + ? new Date(lockMetadata.ownerStartedAt).toISOString() + : now, + updatedAt: now, + pid: lockMetadata.pid, + }, + }; +} + +export async function readBootstrapLaunchSnapshot( + teamName: string +): Promise { + const raw = await readRawBootstrapState(teamName); + if (!raw) { + return null; + } + try { + const updatedAt = toIso(raw.updatedAt, new Date().toISOString()); + const rawMembers = Array.isArray(raw.members) ? raw.members : []; + const members: Record = {}; + const expectedMembers: string[] = []; + + for (const item of rawMembers) { + if (!item || typeof item !== 'object') continue; + const rawMember = item as RawBootstrapMemberState; + const memberName = typeof rawMember.name === 'string' ? rawMember.name.trim() : ''; + if (!memberName || memberName === 'team-lead' || memberName === 'user') continue; + expectedMembers.push(memberName); + members[memberName] = normalizeBootstrapMemberState(memberName, rawMember, updatedAt); + } + + const terminal = + raw.terminal && typeof raw.terminal === 'object' + ? (raw.terminal as Record) + : null; + const terminalStatus = typeof terminal?.status === 'string' ? terminal.status : undefined; + const phase = typeof raw.phase === 'string' ? raw.phase : undefined; + const ownerState = classifyBootstrapOwnerState(raw); + const launchPhase = + terminalStatus === 'completed' || + terminalStatus === 'partial_success' || + terminalStatus === 'failed' || + terminalStatus === 'canceled' || + ownerState.ownerDead || + phase === 'completed' || + phase === 'failed' || + phase === 'canceled' + ? 'finished' + : 'active'; + + if (ownerState.ownerDead) { + const diagnostics = ownerState.failureReason ? [ownerState.failureReason] : undefined; + for (const memberName of expectedMembers) { + const entry = members[memberName]; + if ( + !entry || + entry.launchState === 'confirmed_alive' || + entry.launchState === 'failed_to_start' + ) { + continue; + } + members[memberName] = { + ...entry, + launchState: 'failed_to_start', + hardFailure: true, + hardFailureReason: ownerState.failureReason, + diagnostics: diagnostics ?? entry.diagnostics, + sources: { + ...entry.sources, + hardFailureSignal: true, + }, + }; + } + } + + return createPersistedLaunchSnapshot({ + teamName: + typeof raw.teamName === 'string' && raw.teamName.trim().length > 0 + ? raw.teamName.trim() + : teamName, + expectedMembers, + launchPhase, + members, + updatedAt, + }); + } catch { + return null; + } +} + +export async function readBootstrapRealTaskSubmissionState( + teamName: string +): Promise<'not_submitted' | 'submitted' | 'unknown' | null> { + const raw = await readRawBootstrapState(teamName); + if (!raw) { + return null; + } + const state = raw.realTaskSubmissionState; + return state === 'not_submitted' || state === 'submitted' || state === 'unknown' ? state : null; +} + +export async function readBootstrapRuntimeState( + teamName: string +): Promise { + const inspection = await inspectBootstrapState(teamName); + const raw = inspection.raw; + if (!raw) { + return inspection.issue ? readDegradedBootstrapRuntimeState(teamName, inspection.issue) : null; + } + + try { + const journalWarnings = await readBootstrapJournalWarnings(teamName); + const phase = typeof raw.phase === 'string' ? (raw.phase as BootstrapRuntimePhase) : null; + if (!phase) { + return null; + } + const ownerState = classifyBootstrapOwnerState(raw); + if (ownerState.ownerDead) { + const startedAt = toIso(raw.startedAt, new Date().toISOString()); + const updatedAt = toIso(raw.updatedAt, startedAt); + return { + teamName: + typeof raw.teamName === 'string' && raw.teamName.trim().length > 0 + ? raw.teamName.trim() + : teamName, + isAlive: false, + runId: typeof raw.runId === 'string' ? raw.runId : null, + progress: { + runId: typeof raw.runId === 'string' ? raw.runId : teamName, + teamName: + typeof raw.teamName === 'string' && raw.teamName.trim().length > 0 + ? raw.teamName.trim() + : teamName, + state: 'failed', + message: ownerState.stale + ? 'Deterministic bootstrap became stuck after owner process exited' + : 'Deterministic bootstrap owner exited before bootstrap completed', + error: ownerState.failureReason, + warnings: journalWarnings, + startedAt, + updatedAt, + ...(typeof raw.ownerPid === 'number' ? { pid: raw.ownerPid } : {}), + }, + }; + } + const activePhases: BootstrapRuntimePhase[] = [ + 'validating_spec', + 'loading_existing_state', + 'acquiring_bootstrap_lock', + 'creating_team', + 'spawning_members', + 'auditing_truth', + ]; + if (!activePhases.includes(phase)) { + return null; + } + const projection = getBootstrapProgressProjection( + phase, + Array.isArray(raw.members) ? raw.members.length : 0 + ); + if (!projection) { + return null; + } + + const startedAt = toIso(raw.startedAt, new Date().toISOString()); + const updatedAt = toIso(raw.updatedAt, startedAt); + const runId = typeof raw.runId === 'string' && raw.runId.trim().length > 0 ? raw.runId : null; + const pid = + typeof raw.ownerPid === 'number' && Number.isFinite(raw.ownerPid) && raw.ownerPid > 0 + ? raw.ownerPid + : undefined; + + const progress: TeamProvisioningProgress = { + runId: runId ?? `bootstrap:${teamName}`, + teamName: + typeof raw.teamName === 'string' && raw.teamName.trim().length > 0 + ? raw.teamName.trim() + : teamName, + state: projection.state, + message: projection.message, + warnings: journalWarnings, + startedAt, + updatedAt, + ...(pid ? { pid } : {}), + }; + + return { + teamName: + typeof raw.teamName === 'string' && raw.teamName.trim().length > 0 + ? raw.teamName.trim() + : teamName, + isAlive: false, + runId, + progress, + }; + } catch { + return null; + } +} + +export async function clearBootstrapState(teamName: string): Promise { + try { + await fs.promises.rm(getTeamBootstrapStatePath(teamName), { force: true }); + } catch { + // best-effort + } +} + +export function choosePreferredLaunchSnapshot( + bootstrapSnapshot: T | null, + launchSnapshot: T | null +): T | null { + if (!bootstrapSnapshot) return launchSnapshot; + if (!launchSnapshot) return bootstrapSnapshot; + + const bootstrapMs = Date.parse(bootstrapSnapshot.updatedAt ?? ''); + const launchMs = Date.parse(launchSnapshot.updatedAt ?? ''); + if (Number.isFinite(bootstrapMs) && Number.isFinite(launchMs)) { + return bootstrapMs >= launchMs ? bootstrapSnapshot : launchSnapshot; + } + return bootstrapSnapshot; +} diff --git a/src/main/services/team/TeamConfigReader.ts b/src/main/services/team/TeamConfigReader.ts index 58276f94..96195389 100644 --- a/src/main/services/team/TeamConfigReader.ts +++ b/src/main/services/team/TeamConfigReader.ts @@ -12,6 +12,10 @@ import * as path from 'path'; import { getTeamFsWorkerClient } from './TeamFsWorkerClient'; import { TeamMembersMetaStore } from './TeamMembersMetaStore'; import { TeamMetaStore } from './TeamMetaStore'; +import { + choosePreferredLaunchSnapshot, + readBootstrapLaunchSnapshot, +} from './TeamBootstrapStateReader'; import { normalizePersistedLaunchSnapshot } from './TeamLaunchStateEvaluator'; import type { TeamConfig, TeamMember, TeamSummary, TeamSummaryMember } from '@shared/types'; @@ -42,17 +46,27 @@ interface LaunchStateSummary { } async function readLaunchStateSummary(teamDir: string): Promise { + const bootstrapSnapshot = await readBootstrapLaunchSnapshot(path.basename(teamDir)); const launchStatePath = path.join(teamDir, TEAM_LAUNCH_STATE_FILE); + let launchSnapshot = null; try { const stat = await fs.promises.stat(launchStatePath); if (!stat.isFile() || stat.size > MAX_LAUNCH_STATE_BYTES) { - return null; - } - const raw = await readFileUtf8WithTimeout(launchStatePath, PER_TEAM_READ_TIMEOUT_MS); - const snapshot = normalizePersistedLaunchSnapshot(path.basename(teamDir), JSON.parse(raw)); - if (!snapshot) { - return null; + launchSnapshot = null; + } else { + const raw = await readFileUtf8WithTimeout(launchStatePath, PER_TEAM_READ_TIMEOUT_MS); + launchSnapshot = normalizePersistedLaunchSnapshot(path.basename(teamDir), JSON.parse(raw)); } + } catch { + launchSnapshot = null; + } + + const snapshot = choosePreferredLaunchSnapshot(bootstrapSnapshot, launchSnapshot); + if (!snapshot) { + return null; + } + + try { const missingMembers = snapshot.expectedMembers.filter((name) => { const member = snapshot.members[name]; return member?.launchState === 'failed_to_start'; diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index 2f8bdfcc..edac1d53 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -73,6 +73,13 @@ import { TeamMetaStore } from './TeamMetaStore'; import { TeamSentMessagesStore } from './TeamSentMessagesStore'; import { TeamTaskReader } from './TeamTaskReader'; import { TeamLaunchStateStore } from './TeamLaunchStateStore'; +import { + choosePreferredLaunchSnapshot, + clearBootstrapState, + readBootstrapLaunchSnapshot, + readBootstrapRealTaskSubmissionState, + readBootstrapRuntimeState, +} from './TeamBootstrapStateReader'; import { resolveDesktopTeammateModeDecision } from './runtimeTeammateMode'; import { createPersistedLaunchSnapshot, @@ -131,6 +138,7 @@ import type { TeamProvisioningState, TeamRuntimeState, TeamTask, + EffortLevel, ToolActivityEventPayload, ToolApprovalAutoResolved, ToolApprovalEvent, @@ -501,7 +509,12 @@ interface ProvisioningRun { provisioningComplete: boolean; /** Path to the generated MCP config file for later cleanup. */ mcpConfigPath: string | null; + /** Path to the deterministic bootstrap spec file for later cleanup. */ + bootstrapSpecPath: string | null; + /** Path to the deferred first-user-task file consumed by runtime after bootstrap. */ + bootstrapUserPromptPath: string | null; isLaunch: boolean; + deterministicBootstrap: boolean; leadRelayCapture: { leadName: string; startedAt: string; @@ -1138,6 +1151,142 @@ export function buildAddMemberSpawnMessage( ); } +type RuntimeBootstrapMemberSpec = { + name: string; + prompt: string; + cwd?: string; + model?: string; + provider?: TeamProviderId; + effort?: EffortLevel; + agentType?: string; + description?: string; + useSplitPane?: boolean; + planModeRequired?: boolean; +}; + +type RuntimeBootstrapSpec = { + version: 1; + runId: string; + mode: 'create'; + initiator: { + kind: 'app'; + source: 'claude_team_freecode'; + }; + team: { + name: string; + displayName?: string; + description?: string; + color?: string; + cwd: string; + }; + lead: { + providerId?: TeamProviderId; + model?: string; + effort?: EffortLevel; + skipPermissions?: boolean; + worktree?: string | null; + extraCliArgs?: string[]; + }; + members: RuntimeBootstrapMemberSpec[]; + launch?: { + initialUserPrompt?: string | null; + bootstrapTimeoutMs?: number; + continueOnPartialFailure?: boolean; + }; + ui?: { + emitStructuredEvents?: boolean; + }; +}; + +function buildDeterministicBootstrapSpec( + runId: string, + request: TeamCreateRequest, + effectiveMembers: TeamCreateRequest['members'] +): RuntimeBootstrapSpec { + const displayName = request.displayName?.trim() || request.teamName; + const leadName = + effectiveMembers.find((member) => member.role?.toLowerCase().includes('lead'))?.name || + 'team-lead'; + + return { + version: 1, + runId, + mode: 'create', + initiator: { + kind: 'app', + source: 'claude_team_freecode', + }, + team: { + name: request.teamName, + ...(request.displayName?.trim() ? { displayName: request.displayName.trim() } : {}), + ...(request.description?.trim() ? { description: request.description.trim() } : {}), + ...(request.color?.trim() ? { color: request.color.trim() } : {}), + cwd: request.cwd, + }, + lead: { + ...(request.providerId ? { providerId: request.providerId } : {}), + ...(request.model?.trim() ? { model: request.model.trim() } : {}), + ...(request.effort ? { effort: request.effort } : {}), + ...(request.skipPermissions !== undefined + ? { skipPermissions: request.skipPermissions } + : {}), + ...(request.worktree ? { worktree: request.worktree } : {}), + ...(request.extraCliArgs ? { extraCliArgs: parseCliArgs(request.extraCliArgs) } : {}), + }, + members: effectiveMembers.map((member) => ({ + name: member.name, + prompt: buildMemberSpawnPrompt(member, displayName, request.teamName, leadName), + ...(member.role?.trim() ? { role: member.role.trim() } : {}), + ...(member.workflow?.trim() ? { workflow: member.workflow.trim() } : {}), + ...(request.cwd ? { cwd: request.cwd } : {}), + ...(member.model?.trim() ? { model: member.model.trim() } : {}), + ...(member.providerId ? { provider: member.providerId } : {}), + ...(member.effort ? { effort: member.effort } : {}), + ...(member.role?.trim() ? { description: member.role.trim() } : {}), + })), + launch: { + continueOnPartialFailure: true, + }, + ui: { + emitStructuredEvents: true, + }, + }; +} + +async function writeDeterministicBootstrapSpecFile(spec: RuntimeBootstrapSpec): Promise { + const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'agent-teams-bootstrap-')); + const filePath = path.join(tempDir, `${spec.team.name}-${randomUUID()}.json`); + await fs.promises.writeFile(filePath, JSON.stringify(spec), { + encoding: 'utf8', + mode: 0o600, + }); + return filePath; +} + +async function removeDeterministicBootstrapSpecFile(filePath: string | null): Promise { + if (!filePath) return; + await fs.promises.rm(filePath, { force: true }).catch(() => {}); + await fs.promises.rmdir(path.dirname(filePath)).catch(() => {}); +} + +async function writeDeterministicBootstrapUserPromptFile(prompt: string): Promise { + const tempDir = await fs.promises.mkdtemp( + path.join(os.tmpdir(), 'agent-teams-bootstrap-prompt-') + ); + const filePath = path.join(tempDir, `${randomUUID()}.txt`); + await fs.promises.writeFile(filePath, prompt, { + encoding: 'utf8', + mode: 0o600, + }); + return filePath; +} + +async function removeDeterministicBootstrapUserPromptFile(filePath: string | null): Promise { + if (!filePath) return; + await fs.promises.rm(filePath, { force: true }).catch(() => {}); + await fs.promises.rmdir(path.dirname(filePath)).catch(() => {}); +} + function buildTeamCtlOpsInstructions(teamName: string, leadName: string): string { return wrapInAgentBlock( [ @@ -4030,6 +4179,7 @@ export class TeamProvisioningService { // Verify --mcp-config still exists; regenerate if deleted (e.g. by stale GC) const mcpFlagIdx = ctx.args.indexOf('--mcp-config'); + const bootstrapPromptFlagIdx = ctx.args.indexOf('--team-bootstrap-user-prompt-file'); if (mcpFlagIdx !== -1 && mcpFlagIdx + 1 < ctx.args.length) { const existingConfigPath = ctx.args[mcpFlagIdx + 1]; try { @@ -4054,6 +4204,73 @@ export class TeamProvisioningService { } } + if (bootstrapPromptFlagIdx !== -1 && bootstrapPromptFlagIdx + 1 < ctx.args.length) { + const existingPromptPath = ctx.args[bootstrapPromptFlagIdx + 1]; + try { + await fs.promises.access(existingPromptPath, fs.constants.F_OK); + } catch { + const submissionState = await readBootstrapRealTaskSubmissionState(run.teamName); + if (submissionState === 'submitted') { + ctx.args.splice(bootstrapPromptFlagIdx, 2); + ctx.prompt = ''; + run.bootstrapUserPromptPath = null; + } else if (submissionState === 'unknown') { + run.authRetryInProgress = false; + const progress = updateProgress( + run, + 'failed', + 'Unable to safely retry first task after auth failure', + { + error: + 'deterministic bootstrap recorded the first real task as unknown, so retry would risk a duplicate submission', + cliLogsTail: extractCliLogsFromRun(run), + } + ); + run.onProgress(progress); + this.cleanupRun(run); + return; + } else if (ctx.prompt.trim().length === 0) { + run.authRetryInProgress = false; + const progress = updateProgress( + run, + 'failed', + 'Failed to restore deferred first task after auth retry', + { + error: + 'deterministic bootstrap user prompt file was missing and no prompt was available to regenerate it', + cliLogsTail: extractCliLogsFromRun(run), + } + ); + run.onProgress(progress); + this.cleanupRun(run); + return; + } else { + logger.warn( + `[${run.teamName}] Bootstrap user prompt file ${existingPromptPath} missing, regenerating` + ); + try { + const newPromptPath = await writeDeterministicBootstrapUserPromptFile(ctx.prompt); + ctx.args[bootstrapPromptFlagIdx + 1] = newPromptPath; + run.bootstrapUserPromptPath = newPromptPath; + } catch (regenErr) { + run.authRetryInProgress = false; + const progress = updateProgress( + run, + 'failed', + 'Failed to regenerate deferred first task for auth retry', + { + error: regenErr instanceof Error ? regenErr.message : String(regenErr), + cliLogsTail: extractCliLogsFromRun(run), + } + ); + run.onProgress(progress); + this.cleanupRun(run); + return; + } + } + } + } + // Respawn with saved context — CLI handles its own auth refresh. let child: ReturnType; try { @@ -4091,8 +4308,9 @@ export class TeamProvisioningService { }); run.onProgress(run.progress); - // Resend prompt - if (child.stdin?.writable) { + // Resend prompt only for legacy direct-stdin flows. Deterministic bootstrap + // owns the first real task via --team-bootstrap-user-prompt-file. + if (bootstrapPromptFlagIdx === -1 && child.stdin?.writable) { const message = JSON.stringify({ type: 'user', message: { @@ -4352,7 +4570,10 @@ export class TeamProvisioningService { waitingTasksSince: null, provisioningComplete: false, mcpConfigPath: null, + bootstrapSpecPath: null, + bootstrapUserPromptPath: null, isLaunch: false, + deterministicBootstrap: true, fsPhase: 'waiting_config', leadRelayCapture: null, activeCrossTeamReplyHints: [], @@ -4404,24 +4625,41 @@ export class TeamProvisioningService { run.onProgress(run.progress); await this.clearPersistedLaunchState(request.teamName); - const prompt = buildProvisioningPrompt(request, effectiveMemberSpecs); - const promptSize = getPromptSizeSummary(prompt); + const bootstrapSpec = buildDeterministicBootstrapSpec(runId, request, effectiveMemberSpecs); + const initialUserPrompt = request.prompt?.trim() ?? ''; + const promptSize = getPromptSizeSummary(initialUserPrompt); let child: ReturnType; const { env: shellEnv, geminiRuntimeAuth } = await this.buildProvisioningEnv( request.providerId ); + shellEnv.CLAUDE_ENABLE_DETERMINISTIC_TEAM_BOOTSTRAP = '1'; const teammateModeDecision = await resolveDesktopTeammateModeDecision(request.extraCliArgs); if (teammateModeDecision.forceProcessTeammates) { shellEnv.CLAUDE_TEAM_FORCE_PROCESS_TEAMMATES = '1'; } let mcpConfigPath: string; + let bootstrapSpecPath: string; + let bootstrapUserPromptPath: string | null = null; try { + bootstrapSpecPath = await writeDeterministicBootstrapSpecFile(bootstrapSpec); + run.bootstrapSpecPath = bootstrapSpecPath; + if (initialUserPrompt) { + bootstrapUserPromptPath = + await writeDeterministicBootstrapUserPromptFile(initialUserPrompt); + run.bootstrapUserPromptPath = bootstrapUserPromptPath; + } mcpConfigPath = await this.mcpConfigBuilder.writeConfigFile(request.cwd); run.mcpConfigPath = mcpConfigPath; await this.validateAgentTeamsMcpRuntime(claudePath, request.cwd, shellEnv, mcpConfigPath); } catch (error) { this.runs.delete(runId); this.provisioningRunByTeam.delete(request.teamName); + await removeDeterministicBootstrapSpecFile(run.bootstrapSpecPath).catch(() => {}); + run.bootstrapSpecPath = null; + await removeDeterministicBootstrapUserPromptFile(run.bootstrapUserPromptPath).catch( + () => {} + ); + run.bootstrapUserPromptPath = null; throw error; } const spawnArgs = [ @@ -4434,6 +4672,11 @@ export class TeamProvisioningService { 'user,project,local', '--mcp-config', mcpConfigPath, + '--team-bootstrap-spec', + bootstrapSpecPath, + ...(bootstrapUserPromptPath + ? ['--team-bootstrap-user-prompt-file', bootstrapUserPromptPath] + : []), '--disallowedTools', APP_TEAM_RUNTIME_DISALLOWED_TOOLS, // Explicit --permission-mode overrides user's defaultMode in ~/.claude/settings.json @@ -4505,6 +4748,12 @@ export class TeamProvisioningService { const tasksDir = path.join(getTasksBasePath(), request.teamName); await fs.promises.rm(teamDir, { recursive: true, force: true }).catch(() => {}); await fs.promises.rm(tasksDir, { recursive: true, force: true }).catch(() => {}); + await removeDeterministicBootstrapSpecFile(run.bootstrapSpecPath).catch(() => {}); + run.bootstrapSpecPath = null; + await removeDeterministicBootstrapUserPromptFile(run.bootstrapUserPromptPath).catch( + () => {} + ); + run.bootstrapUserPromptPath = null; if (run.mcpConfigPath) { await this.mcpConfigBuilder.removeConfigFile(run.mcpConfigPath).catch(() => {}); run.mcpConfigPath = null; @@ -4525,21 +4774,9 @@ export class TeamProvisioningService { args: spawnArgs, cwd: request.cwd, env: { ...shellEnv }, - prompt, + prompt: initialUserPrompt, }; - // Send provisioning prompt as first stream-json message (SDKUserMessage format) - if (child.stdin?.writable) { - const message = JSON.stringify({ - type: 'user', - message: { - role: 'user', - content: [{ type: 'text', text: prompt }], - }, - }); - child.stdin.write(message + '\n'); - } - this.attachStdoutHandler(run); this.attachStderrHandler(run); @@ -4841,7 +5078,10 @@ export class TeamProvisioningService { waitingTasksSince: null, provisioningComplete: false, mcpConfigPath: null, + bootstrapSpecPath: null, + bootstrapUserPromptPath: null, isLaunch: true, + deterministicBootstrap: false, fsPhase: 'waiting_members', leadRelayCapture: null, activeCrossTeamReplyHints: [], @@ -5840,10 +6080,17 @@ export class TeamProvisioningService { return Array.from(this.aliveRunByTeam.keys()).filter((name) => this.isTeamAlive(name)); } - getRuntimeState(teamName: string): TeamRuntimeState { + async getRuntimeState(teamName: string): Promise { const runId = this.getTrackedRunId(teamName); const run = runId ? (this.runs.get(runId) ?? null) : null; + if (!run) { + const recovered = await readBootstrapRuntimeState(teamName); + if (recovered) { + return recovered; + } + } + return { teamName, isAlive: this.isTeamAlive(teamName), @@ -6175,7 +6422,11 @@ export class TeamProvisioningService { } const current = run.memberSpawnStatuses.get(expected); - if (current?.launchState === 'failed_to_start') { + if ( + current?.launchState === 'failed_to_start' || + current?.bootstrapConfirmed || + current?.runtimeAlive + ) { continue; } @@ -6224,6 +6475,7 @@ export class TeamProvisioningService { private async clearPersistedLaunchState(teamName: string): Promise { await this.launchStateStore.clear(teamName); + await clearBootstrapState(teamName); } private getFailedSpawnMembers( @@ -6338,7 +6590,15 @@ export class TeamProvisioningService { snapshot: ReturnType | null; statuses: Record; }> { + const bootstrapSnapshot = await readBootstrapLaunchSnapshot(teamName); const persisted = await this.launchStateStore.read(teamName); + const preferredSnapshot = choosePreferredLaunchSnapshot(bootstrapSnapshot, persisted); + if (preferredSnapshot) { + return { + snapshot: preferredSnapshot, + statuses: snapshotToMemberSpawnStatuses(preferredSnapshot), + }; + } if (!persisted) { return { snapshot: null, statuses: {} }; } @@ -6998,6 +7258,152 @@ export class TeamProvisioningService { * Process a parsed stream-json message from stdout. * Extracts assistant text for progress reporting and detects turn completion. */ + private handleDeterministicBootstrapEvent( + run: ProvisioningRun, + msg: Record + ): boolean { + if (msg.type !== 'system' || msg.subtype !== 'team_bootstrap') { + return false; + } + + const event = typeof msg.event === 'string' ? msg.event : undefined; + if (!event) { + return true; + } + + if (event === 'started') { + const progress = updateProgress(run, 'configuring', 'Starting deterministic team bootstrap'); + run.onProgress(progress); + return true; + } + + if (event === 'phase_changed') { + const phase = typeof msg.phase === 'string' ? msg.phase : ''; + if (phase === 'loading_existing_state') { + const progress = updateProgress(run, 'configuring', 'Loading existing team state'); + run.onProgress(progress); + } else if (phase === 'acquiring_bootstrap_lock') { + const progress = updateProgress( + run, + 'configuring', + 'Acquiring deterministic bootstrap lock' + ); + run.onProgress(progress); + } else if (phase === 'creating_team') { + const progress = updateProgress(run, 'assembling', 'Creating team config'); + run.onProgress(progress); + } else if (phase === 'spawning_members') { + const progress = updateProgress(run, 'assembling', 'Spawning teammate runtimes'); + run.onProgress(progress); + } else if (phase === 'auditing_truth') { + const progress = updateProgress( + run, + 'finalizing', + 'Auditing registered teammates and bootstrap truth', + { configReady: true } + ); + run.onProgress(progress); + } + return true; + } + + if (event === 'team_created') { + const reused = msg.reused_existing_team === true; + const progress = updateProgress( + run, + 'assembling', + reused + ? 'Attached to existing team, starting teammates' + : 'Team config created, starting teammates', + { configReady: true } + ); + run.onProgress(progress); + return true; + } + + if (event === 'member_spawn_started') { + const memberName = typeof msg.member_name === 'string' ? msg.member_name.trim() : ''; + if (memberName) { + this.setMemberSpawnStatus(run, memberName, 'spawning'); + } + return true; + } + + if (event === 'member_spawn_result') { + const memberName = typeof msg.member_name === 'string' ? msg.member_name.trim() : ''; + const outcome = typeof msg.outcome === 'string' ? msg.outcome : ''; + const reason = typeof msg.reason === 'string' ? msg.reason.trim() : undefined; + if (!memberName) { + return true; + } + + if (outcome === 'failed') { + this.setMemberSpawnStatus( + run, + memberName, + 'error', + reason || 'Deterministic bootstrap failed to spawn teammate.' + ); + return true; + } + + if (outcome === 'already_running') { + this.setMemberSpawnStatus(run, memberName, 'online', undefined, 'process'); + return true; + } + + this.setMemberSpawnStatus(run, memberName, 'waiting'); + return true; + } + + if (event === 'completed') { + const failedMembers = Array.isArray(msg.failed_members) ? msg.failed_members : []; + for (const failed of failedMembers) { + const memberName = typeof failed?.name === 'string' ? failed.name.trim() : ''; + const reason = typeof failed?.reason === 'string' ? failed.reason.trim() : undefined; + if (memberName) { + this.setMemberSpawnStatus( + run, + memberName, + 'error', + reason || 'Deterministic bootstrap failed to spawn teammate.' + ); + } + } + if (!run.provisioningComplete && !run.cancelRequested) { + void this.handleProvisioningTurnComplete(run).catch((error: unknown) => { + logger.error( + `[${run.teamName}] deterministic bootstrap completion handler failed: ${ + error instanceof Error ? error.message : String(error) + }` + ); + }); + } + return true; + } + + if (event === 'failed') { + if (run.progress.state === 'failed' || run.cancelRequested) { + return true; + } + const reason = + typeof msg.reason === 'string' && msg.reason.trim().length > 0 + ? msg.reason.trim() + : 'Deterministic bootstrap failed.'; + const progress = updateProgress(run, 'failed', 'Deterministic bootstrap failed', { + error: reason, + cliLogsTail: extractCliLogsFromRun(run), + }); + run.onProgress(progress); + run.processKilled = true; + killTeamProcess(run.child); + this.cleanupRun(run); + return true; + } + + return true; + } + private handleStreamJsonMessage(run: ProvisioningRun, msg: Record): void { // stream-json output has various message types: // {"type":"assistant","content":[{"type":"text","text":"..."},...]} @@ -7194,6 +7600,10 @@ export class TeamProvisioningService { } } + if (this.handleDeterministicBootstrapEvent(run, msg)) { + return; + } + // Handle control_request — tool approval protocol (only when --dangerously-skip-permissions is NOT set) if (msg.type === 'control_request') { this.handleControlRequest(run, msg); @@ -9334,6 +9744,14 @@ export class TeamProvisioningService { void this.mcpConfigBuilder.removeConfigFile(run.mcpConfigPath); run.mcpConfigPath = null; } + if (run.bootstrapSpecPath) { + void removeDeterministicBootstrapSpecFile(run.bootstrapSpecPath); + run.bootstrapSpecPath = null; + } + if (run.bootstrapUserPromptPath) { + void removeDeterministicBootstrapUserPromptFile(run.bootstrapUserPromptPath); + run.bootstrapUserPromptPath = null; + } // Remove from runs Map to free memory (stdoutBuffer, stderrBuffer, claudeLogLines) this.runs.delete(run.runId); } @@ -9397,10 +9815,32 @@ export class TeamProvisioningService { } if (run.fsPhase === 'waiting_members') { + if (run.deterministicBootstrap) { + const registeredNames = await this.getRegisteredTeamMemberNames(run.teamName); + const registeredMembers = registeredNames + ? request.members.filter((member) => registeredNames.has(member.name)).length + : 0; + + if (registeredMembers >= request.members.length) { + run.fsPhase = 'all_files_found'; + if (!run.provisioningComplete) { + void this.handleProvisioningTurnComplete(run); + } + return; + } + } + if (request.members.length === 0) { - run.fsPhase = 'waiting_tasks'; - const progress = updateProgress(run, 'finalizing', 'Solo team, preparing workspace'); - run.onProgress(progress); + if (run.deterministicBootstrap) { + run.fsPhase = 'all_files_found'; + if (!run.provisioningComplete) { + void this.handleProvisioningTurnComplete(run); + } + } else { + run.fsPhase = 'waiting_tasks'; + const progress = updateProgress(run, 'finalizing', 'Solo team, preparing workspace'); + run.onProgress(progress); + } } else { const teamDir = (await resolveTeamDir()) ?? configuredTeamDir; const inboxDir = path.join(teamDir, 'inboxes'); @@ -9735,6 +10175,10 @@ export class TeamProvisioningService { members: run.effectiveMembers, } ); + await this.refreshMemberSpawnStatusesFromLeadInbox(run); + await this.maybeAuditMemberSpawnStatuses(run, { force: true }); + await this.finalizeMissingRegisteredMembersAsFailed(run); + await this.persistLaunchStateSnapshot(run, 'finished'); // Process was killed by timeout — mark as disconnected, not ready const progress = updateProgress(run, 'disconnected', 'Team provisioned but process timed out', { warnings, diff --git a/test/main/http/teams.test.ts b/test/main/http/teams.test.ts index 45aaeb70..ad5f2a56 100644 --- a/test/main/http/teams.test.ts +++ b/test/main/http/teams.test.ts @@ -15,7 +15,7 @@ describe('HTTP team runtime routes', () => { const launchTeam = vi.fn< (request: TeamLaunchRequest, onProgress: (progress: TeamProvisioningProgress) => void) => Promise >(); - const getRuntimeState = vi.fn<(teamName: string) => TeamRuntimeState>(); + const getRuntimeState = vi.fn<(teamName: string) => Promise>(); const getProvisioningStatus = vi.fn<(runId: string) => Promise>(); const stopTeam = vi.fn<(teamName: string) => void>(); const getAliveTeams = vi.fn<() => string[]>(); @@ -82,6 +82,7 @@ describe('HTTP team runtime routes', () => { teamName: 'demo-team', cwd: '/tmp/project', prompt: 'Resume work', + providerId: 'anthropic', skipPermissions: false, clearContext: true, }, @@ -115,7 +116,7 @@ describe('HTTP team runtime routes', () => { it('returns runtime state, provisioning status, and stop results', async () => { const { app, getRuntimeState, getProvisioningStatus, stopTeam, getAliveTeams } = await createApp(); getRuntimeState - .mockReturnValueOnce({ + .mockResolvedValueOnce({ teamName: 'demo-team', isAlive: true, runId: 'run-2', @@ -128,13 +129,13 @@ describe('HTTP team runtime routes', () => { updatedAt: '2026-03-12T00:00:01.000Z', }, }) - .mockReturnValueOnce({ + .mockResolvedValueOnce({ teamName: 'demo-team', isAlive: false, runId: null, progress: null, }) - .mockReturnValueOnce({ + .mockResolvedValueOnce({ teamName: 'demo-team', isAlive: true, runId: 'run-2', diff --git a/test/main/services/team/TeamBootstrapStateReader.test.ts b/test/main/services/team/TeamBootstrapStateReader.test.ts new file mode 100644 index 00000000..f783fa76 --- /dev/null +++ b/test/main/services/team/TeamBootstrapStateReader.test.ts @@ -0,0 +1,352 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const hoisted = vi.hoisted(() => { + const files = new Map< + string, + { + contents: string; + size?: number; + symbolicLink?: boolean; + } + >(); + + const norm = (p: string): string => p.replace(/\\/g, '/'); + + const lstat = vi.fn(async (filePath: string) => { + const entry = files.get(norm(filePath)); + if (!entry) { + const error = new Error('ENOENT') as NodeJS.ErrnoException; + error.code = 'ENOENT'; + throw error; + } + return { + isFile: () => !entry.symbolicLink, + isSymbolicLink: () => Boolean(entry.symbolicLink), + size: entry.size ?? Buffer.byteLength(entry.contents, 'utf8'), + }; + }); + + const readFile = vi.fn(async (filePath: string) => { + const entry = files.get(norm(filePath)); + if (!entry) { + const error = new Error('ENOENT') as NodeJS.ErrnoException; + error.code = 'ENOENT'; + throw error; + } + return entry.contents; + }); + + const access = vi.fn(async (filePath: string) => { + const entry = files.get(norm(filePath)); + if (!entry) { + const error = new Error('ENOENT') as NodeJS.ErrnoException; + error.code = 'ENOENT'; + throw error; + } + }); + + const rm = vi.fn(async (filePath: string) => { + files.delete(norm(filePath)); + }); + + return { files, lstat, readFile, access, rm }; +}); + +vi.mock('fs', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + promises: { + ...actual.promises, + lstat: hoisted.lstat, + readFile: hoisted.readFile, + access: hoisted.access, + rm: hoisted.rm, + }, + }; +}); + +vi.mock('../../../../src/main/utils/pathDecoder', () => ({ + getTeamsBasePath: () => '/mock/teams', +})); + +import { + choosePreferredLaunchSnapshot, + readBootstrapLaunchSnapshot, + readBootstrapRealTaskSubmissionState, + readBootstrapRuntimeState, +} from '../../../../src/main/services/team/TeamBootstrapStateReader'; + +describe('TeamBootstrapStateReader', () => { + beforeEach(() => { + hoisted.files.clear(); + hoisted.lstat.mockClear(); + hoisted.readFile.mockClear(); + hoisted.access.mockClear(); + hoisted.rm.mockClear(); + }); + + it('rejects symlink bootstrap-state files', async () => { + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: '{}', + symbolicLink: true, + }); + + await expect(readBootstrapLaunchSnapshot('demo')).resolves.toBeNull(); + await expect(readBootstrapRuntimeState('demo')).resolves.toBeNull(); + }); + + it('projects active bootstrap-state into runtime progress', async () => { + const nowSpy = vi.spyOn(Date, 'now').mockReturnValue(1700000001000); + const killSpy = vi.spyOn(process, 'kill').mockImplementation(() => true as never); + + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: JSON.stringify({ + version: 1, + runId: 'run-123', + teamName: 'demo', + ownerPid: 4242, + startedAt: 1700000000000, + updatedAt: 1700000000500, + phase: 'acquiring_bootstrap_lock', + members: [{ name: 'alice', status: 'pending' }], + }), + }); + hoisted.files.set('/mock/teams/demo/bootstrap-journal.jsonl', { + contents: [ + JSON.stringify({ ts: 1, type: 'phase', runId: 'run-123', phase: 'loading_existing_state' }), + JSON.stringify({ ts: 2, type: 'lock', runId: 'run-123', action: 'acquired', ownerPid: 4242 }), + JSON.stringify({ ts: 3, type: 'member', runId: 'run-123', name: 'alice', action: 'spawn_started' }), + ].join('\n'), + }); + + await expect(readBootstrapRuntimeState('demo')).resolves.toEqual({ + teamName: 'demo', + isAlive: false, + runId: 'run-123', + progress: { + runId: 'run-123', + teamName: 'demo', + state: 'configuring', + message: 'Acquiring deterministic bootstrap lock', + warnings: [ + 'Recent deterministic bootstrap events: bootstrap phase: loading_existing_state | bootstrap lock acquired (pid 4242) | alice: spawn_started', + ], + startedAt: '2023-11-14T22:13:20.000Z', + updatedAt: '2023-11-14T22:13:20.500Z', + pid: 4242, + }, + }); + + killSpy.mockRestore(); + nowSpy.mockRestore(); + }); + + it('ignores terminal bootstrap-state for runtime recovery projection', async () => { + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: JSON.stringify({ + version: 1, + runId: 'run-123', + teamName: 'demo', + startedAt: 1700000000000, + updatedAt: 1700000000500, + phase: 'completed', + terminal: { + status: 'completed', + finishedAt: 1700000000500, + }, + members: [{ name: 'alice', status: 'registered' }], + }), + }); + + await expect(readBootstrapRuntimeState('demo')).resolves.toBeNull(); + }); + + it('reads persisted real-task submission state', async () => { + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: JSON.stringify({ + version: 1, + runId: 'run-123', + teamName: 'demo', + startedAt: 1700000000000, + updatedAt: 1700000000500, + phase: 'completed', + realTaskSubmissionState: 'submitted', + members: [], + }), + }); + + await expect(readBootstrapRealTaskSubmissionState('demo')).resolves.toBe('submitted'); + }); + + it('classifies dead bootstrap owner as failed launch snapshot instead of pending', async () => { + const nowSpy = vi.spyOn(Date, 'now').mockReturnValue(1700000300000); + const killSpy = vi + .spyOn(process, 'kill') + .mockImplementation(() => { + const error = new Error('ESRCH') as NodeJS.ErrnoException; + error.code = 'ESRCH'; + throw error; + }); + + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: JSON.stringify({ + version: 1, + runId: 'run-dead', + teamName: 'demo', + ownerPid: 777, + startedAt: 1700000000000, + updatedAt: 1700000000000, + phase: 'spawning_members', + members: [{ name: 'alice', status: 'registered' }], + }), + }); + + await expect(readBootstrapLaunchSnapshot('demo')).resolves.toMatchObject({ + launchPhase: 'finished', + members: { + alice: { + launchState: 'failed_to_start', + hardFailure: true, + hardFailureReason: + 'bootstrap owner pid 777 is gone and persisted bootstrap state is stale', + }, + }, + }); + + killSpy.mockRestore(); + nowSpy.mockRestore(); + }); + + it('projects dead bootstrap owner into failed runtime progress', async () => { + const nowSpy = vi.spyOn(Date, 'now').mockReturnValue(1700000201000); + const killSpy = vi + .spyOn(process, 'kill') + .mockImplementation(() => { + const error = new Error('ESRCH') as NodeJS.ErrnoException; + error.code = 'ESRCH'; + throw error; + }); + + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: JSON.stringify({ + version: 1, + runId: 'run-dead', + teamName: 'demo', + ownerPid: 777, + startedAt: 1700000000000, + updatedAt: 1700000200000, + phase: 'spawning_members', + members: [{ name: 'alice', status: 'registered' }], + }), + }); + + await expect(readBootstrapRuntimeState('demo')).resolves.toMatchObject({ + teamName: 'demo', + isAlive: false, + runId: 'run-dead', + progress: { + state: 'failed', + message: 'Deterministic bootstrap owner exited before bootstrap completed', + error: + 'bootstrap owner pid 777 is gone before bootstrap reached a terminal state', + }, + }); + + killSpy.mockRestore(); + nowSpy.mockRestore(); + }); + + it('projects degraded runtime progress when bootstrap-state is unreadable but lock owner is alive', async () => { + const killSpy = vi.spyOn(process, 'kill').mockImplementation(() => true as never); + + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: '{invalid-json', + }); + hoisted.files.set('/mock/teams/demo/.bootstrap.lock/metadata.json', { + contents: JSON.stringify({ + pid: 4242, + runId: 'run-lock', + requestHash: 'hash-1', + ownerStartedAt: 1700000000000, + createdAt: 1700000000100, + nonce: 'nonce-1', + }), + }); + hoisted.files.set('/mock/teams/demo/bootstrap-journal.jsonl', { + contents: JSON.stringify({ + ts: 3, + type: 'member', + runId: 'run-lock', + name: 'alice', + action: 'spawn_started', + }), + }); + + await expect(readBootstrapRuntimeState('demo')).resolves.toMatchObject({ + teamName: 'demo', + isAlive: false, + runId: 'run-lock', + progress: { + state: 'configuring', + message: + 'Deterministic bootstrap recovery is degraded because persisted bootstrap state is unreadable', + messageSeverity: 'warning', + pid: 4242, + }, + }); + + killSpy.mockRestore(); + }); + + it('projects degraded failed runtime progress when bootstrap-state is unreadable and lock owner is dead', async () => { + const killSpy = vi + .spyOn(process, 'kill') + .mockImplementation(() => { + const error = new Error('ESRCH') as NodeJS.ErrnoException; + error.code = 'ESRCH'; + throw error; + }); + + hoisted.files.set('/mock/teams/demo/bootstrap-state.json', { + contents: '{invalid-json', + }); + hoisted.files.set('/mock/teams/demo/.bootstrap.lock/metadata.json', { + contents: JSON.stringify({ + pid: 7331, + runId: 'run-dead-lock', + requestHash: 'hash-2', + ownerStartedAt: 1700000000000, + createdAt: 1700000000100, + nonce: 'nonce-2', + }), + }); + + await expect(readBootstrapRuntimeState('demo')).resolves.toMatchObject({ + teamName: 'demo', + isAlive: false, + runId: 'run-dead-lock', + progress: { + state: 'failed', + message: + 'Deterministic bootstrap recovery failed because persisted bootstrap state is unreadable and the bootstrap owner is gone', + messageSeverity: 'warning', + pid: 7331, + }, + }); + + killSpy.mockRestore(); + }); + + it('prefers the newer launch snapshot when bootstrap snapshot is stale', () => { + const preferred = choosePreferredLaunchSnapshot( + { updatedAt: '2026-04-06T10:00:00.000Z', kind: 'bootstrap' }, + { updatedAt: '2026-04-06T10:05:00.000Z', kind: 'launch' } + ); + + expect(preferred).toEqual({ + updatedAt: '2026-04-06T10:05:00.000Z', + kind: 'launch', + }); + }); +});