// agent-ecosystem/src/main/services/team/TeamBootstrapStateReader.ts

// 713 lines
// 22 KiB
// TypeScript

import { getTeamsBasePath } from '@main/utils/pathDecoder';
import { createPersistedLaunchSnapshot } from './TeamLaunchStateEvaluator';
import * as fs from 'fs';
import * as path from 'path';
import type {
PersistedTeamLaunchMemberState,
PersistedTeamLaunchSnapshot,
TeamProvisioningProgress,
TeamRuntimeState,
} from '@shared/types';
// Names of the persisted bootstrap artifacts; each is joined under
// `<teams base path>/<teamName>/` by the path helpers in this file.
const TEAM_BOOTSTRAP_STATE_FILE = 'bootstrap-state.json';
const TEAM_BOOTSTRAP_JOURNAL_FILE = 'bootstrap-journal.jsonl';
// The lock is a directory; its metadata file lives inside that directory.
const TEAM_BOOTSTRAP_LOCK_DIR = '.bootstrap.lock';
const TEAM_BOOTSTRAP_LOCK_METADATA_FILE = 'metadata.json';
// Size ceilings checked against `lstat().size` before a file is read. A larger
// state or journal file is reported as "oversized"; a larger lock metadata
// file is treated as absent.
const MAX_BOOTSTRAP_STATE_BYTES = 256 * 1024;
const MAX_BOOTSTRAP_JOURNAL_BYTES = 256 * 1024;
const MAX_BOOTSTRAP_LOCK_METADATA_BYTES = 64 * 1024;
// Minimum age of `updatedAt` (3 minutes) for a non-terminal bootstrap whose
// owner process is gone to be classified as stale.
const ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS = 3 * 60 * 1000;
/**
 * Unvalidated shape of one entry of the `members` array in
 * bootstrap-state.json. Every field is `unknown` because the value comes
 * straight from `JSON.parse`; readers must type-check before use.
 */
type RawBootstrapMemberState = {
// Member name; entries without a non-blank string name are skipped by the snapshot reader.
name?: unknown;
// Status string; the values this file recognises are 'failed', 'bootstrap_confirmed',
// 'runtime_alive', 'registered' and 'spawn_started'. A non-string is treated as 'pending'.
status?: unknown;
// Timestamp (ISO string or number) used for `firstSpawnAcceptedAt`.
lastAttemptAt?: unknown;
// Timestamp (ISO string or number) used for heartbeat / runtime-alive / evaluated times.
lastObservedAt?: unknown;
// Optional failure text; used (trimmed) as the hard-failure reason when non-blank.
failureReason?: unknown;
};
/**
 * Unvalidated top-level shape of bootstrap-state.json as returned by
 * `JSON.parse`. Only `version` is checked when the file is loaded (it must be
 * `1`); every other field is type-checked at the point of use.
 */
type RawBootstrapState = {
version?: unknown;
runId?: unknown;
// Optional team name; when it is a non-blank string it overrides the name the caller passed in.
teamName?: unknown;
startedAt?: unknown;
// PID of the process that owns the bootstrap; probed to detect a dead owner.
ownerPid?: unknown;
// Last update time; read either as a number or as a string passed to `Date.parse`.
updatedAt?: unknown;
// Expected to be a `BootstrapRuntimePhase` string.
phase?: unknown;
// Recognised values: 'not_submitted' | 'submitted' | 'unknown'.
realTaskSubmissionState?: unknown;
// Expected to be an array of `RawBootstrapMemberState`-like objects.
members?: unknown;
// Expected to be an object; its `status` string is consulted when deriving the launch phase.
terminal?: unknown;
};
/**
 * Unvalidated shape of one JSON line of bootstrap-journal.jsonl, discriminated
 * by `type`. Payload fields are `unknown` and are type-checked before being
 * rendered into a human-readable summary.
 */
type RawBootstrapJournalRecord =
| { ts?: unknown; type?: 'phase'; phase?: unknown }
| { ts?: unknown; type?: 'lock'; action?: unknown; ownerPid?: unknown; detail?: unknown }
| { ts?: unknown; type?: 'member'; name?: unknown; action?: unknown; detail?: unknown }
| { ts?: unknown; type?: 'terminal'; status?: unknown; reason?: unknown }
| { ts?: unknown; type?: 'real_task'; state?: unknown; detail?: unknown };
/**
 * Unvalidated shape of the lock directory's metadata.json as returned by
 * `JSON.parse`. Within this file only `pid`, `runId` and `ownerStartedAt` are
 * read; the remaining fields are declared but not consumed here.
 */
type RawBootstrapLockMetadata = {
pid?: unknown;
runId?: unknown;
requestHash?: unknown;
ownerStartedAt?: unknown;
createdAt?: unknown;
nonce?: unknown;
};
/**
 * Result of inspecting bootstrap-state.json.
 * `raw` is the parsed state, or `null` when the file is missing or unusable.
 * `issue` is set only when the file exists but could not be used (symlink,
 * non-regular file, oversized, unsupported version, or unreadable/invalid);
 * a missing file yields `raw: null` with no `issue`.
 */
type BootstrapStateInspection = {
raw: RawBootstrapState | null;
issue?: string;
};
/**
 * Result of inspecting bootstrap-journal.jsonl.
 * `warnings` holds at most one summary line describing the most recent
 * journal events; `issue` explains why the journal could not be read. Both
 * are absent when the journal file does not exist.
 */
type BootstrapJournalInspection = {
warnings?: string[];
issue?: string;
};
/**
 * Validated subset of the lock metadata file: `pid` is a finite positive
 * number, `runId` is a trimmed non-empty string, and `ownerStartedAt` is
 * present only when the file held a finite number for it.
 */
type BootstrapLockMetadata = {
pid: number;
runId: string;
ownerStartedAt?: number;
};
/**
 * Phases a deterministic bootstrap run can report in its persisted state.
 * The last three ('completed', 'failed', 'canceled') are the terminal phases.
 */
type BootstrapRuntimePhase =
| 'validating_spec'
| 'loading_existing_state'
| 'acquiring_bootstrap_lock'
| 'creating_team'
| 'spawning_members'
| 'auditing_truth'
| 'completed'
| 'failed'
| 'canceled';
/**
 * Tells whether a bootstrap phase is one of the end states
 * ('completed', 'failed' or 'canceled').
 *
 * @param phase - Phase to classify.
 * @returns `true` for an end state, `false` for every other value.
 */
function isBootstrapPhaseTerminal(phase: BootstrapRuntimePhase): boolean {
  switch (phase) {
    case 'completed':
    case 'failed':
    case 'canceled':
      return true;
    default:
      return false;
  }
}
/**
 * Probes whether a process with the given PID currently exists.
 *
 * Sends signal `0` through `process.kill`. An `EPERM` error is counted as
 * "alive"; any other error is counted as "not alive".
 *
 * @param pid - Process id to probe; non-finite or non-positive values are rejected up front.
 * @returns `true` when the probe succeeds or fails with `EPERM`, otherwise `false`.
 */
function isProcessAlive(pid: number): boolean {
  const pidLooksValid = Number.isFinite(pid) && pid > 0;
  if (!pidLooksValid) {
    return false;
  }
  try {
    process.kill(pid, 0);
  } catch (error) {
    const errorCode = (error as NodeJS.ErrnoException | undefined)?.code;
    return errorCode === 'EPERM';
  }
  return true;
}
/**
 * Decides whether the process that owns a persisted bootstrap run has died
 * before the run reached a terminal phase.
 *
 * The owner is only ever reported dead when the state has a non-empty,
 * non-terminal `phase` string and a numeric `ownerPid` whose liveness probe
 * fails. `stale` is additionally set when `updatedAt` (number, or string
 * parsed with `Date.parse`) is at least ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS old.
 *
 * @param raw - Parsed bootstrap state.
 * @returns Flags plus, when the owner is dead, a human-readable failure reason.
 */
function classifyBootstrapOwnerState(raw: RawBootstrapState): {
  ownerDead: boolean;
  stale: boolean;
  failureReason?: string;
} {
  const ownerNotDead = { ownerDead: false, stale: false };

  const rawPhase = raw.phase;
  if (typeof rawPhase !== 'string' || rawPhase.length === 0) {
    return ownerNotDead;
  }
  if (isBootstrapPhaseTerminal(rawPhase as BootstrapRuntimePhase)) {
    return ownerNotDead;
  }

  const ownerPid = raw.ownerPid;
  if (typeof ownerPid !== 'number') {
    return ownerNotDead;
  }
  if (isProcessAlive(ownerPid)) {
    return ownerNotDead;
  }

  // Accept either an epoch number or a parseable date string for updatedAt.
  const rawUpdatedAt = raw.updatedAt;
  let updatedAtMs = NaN;
  if (typeof rawUpdatedAt === 'number') {
    updatedAtMs = rawUpdatedAt;
  } else if (typeof rawUpdatedAt === 'string') {
    updatedAtMs = Date.parse(rawUpdatedAt);
  }

  const stale =
    Number.isFinite(updatedAtMs) &&
    Date.now() - updatedAtMs >= ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS;

  let failureReason: string;
  if (stale) {
    failureReason = `bootstrap owner pid ${ownerPid} is gone and persisted bootstrap state is stale`;
  } else {
    failureReason = `bootstrap owner pid ${ownerPid} is gone before bootstrap reached a terminal state`;
  }
  return { ownerDead: true, stale, failureReason };
}
/**
 * Loads a team's bootstrap-state.json and reports why it is unusable, if it is.
 *
 * The file is rejected when it is a symlink, not a regular file, larger than
 * MAX_BOOTSTRAP_STATE_BYTES, or carries a `version` other than `1`. A missing
 * file (`ENOENT`) is not an issue: the result is `{ raw: null }`. Any other
 * failure (I/O error, invalid JSON) is reported as an invalid/inaccessible file.
 *
 * @param teamName - Team whose state file is inspected.
 * @returns The parsed state, or `raw: null` with an optional `issue` description.
 */
async function inspectBootstrapState(teamName: string): Promise<BootstrapStateInspection> {
  const unreadable = (issue: string): BootstrapStateInspection => ({ raw: null, issue });
  const statePath = getTeamBootstrapStatePath(teamName);
  try {
    // lstat (not stat) so a symlink is detected rather than followed.
    const info = await fs.promises.lstat(statePath);
    if (info.isSymbolicLink()) {
      return unreadable(
        'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is a symlink.'
      );
    }
    if (!info.isFile()) {
      return unreadable(
        'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is not a regular file.'
      );
    }
    if (info.size > MAX_BOOTSTRAP_STATE_BYTES) {
      return unreadable(
        'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is oversized.'
      );
    }
    const text = await fs.promises.readFile(statePath, 'utf8');
    const parsed = JSON.parse(text) as RawBootstrapState;
    if (parsed.version === 1) {
      return { raw: parsed };
    }
    return unreadable(
      'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json has an unsupported schema version.'
    );
  } catch (error) {
    const errorCode = (error as NodeJS.ErrnoException | undefined)?.code;
    if (errorCode === 'ENOENT') {
      // No state file at all: nothing to report.
      return { raw: null };
    }
    return unreadable(
      'Persisted deterministic bootstrap state is unreadable because bootstrap-state.json is invalid, truncated, or inaccessible.'
    );
  }
}
/**
 * Returns only the parsed bootstrap state for a team, discarding any
 * inspection issue.
 *
 * @param teamName - Team whose state file is read.
 * @returns The parsed state, or `null` when it is missing or unusable.
 */
async function readRawBootstrapState(teamName: string): Promise<RawBootstrapState | null> {
  const { raw } = await inspectBootstrapState(teamName);
  return raw;
}
/**
 * Maps a bootstrap phase to the provisioning progress state and message that
 * represent it.
 *
 * @param phase - Bootstrap phase to project.
 * @param memberCount - Number of members; appended to the 'spawning_members' message when greater than zero.
 * @returns The state/message pair, or `null` for a phase this function does not recognise.
 */
function getBootstrapProgressProjection(
  phase: BootstrapRuntimePhase,
  memberCount: number
): { state: Exclude<TeamProvisioningProgress['state'], 'idle'>; message: string } | null {
  if (phase === 'validating_spec') {
    return { state: 'validating', message: 'Validating deterministic bootstrap spec' };
  }
  if (phase === 'loading_existing_state') {
    return { state: 'configuring', message: 'Loading existing team state' };
  }
  if (phase === 'acquiring_bootstrap_lock') {
    return { state: 'configuring', message: 'Acquiring deterministic bootstrap lock' };
  }
  if (phase === 'creating_team') {
    return { state: 'assembling', message: 'Creating team config' };
  }
  if (phase === 'spawning_members') {
    let message = 'Spawning teammate runtimes';
    if (memberCount > 0) {
      message = `Spawning teammate runtimes (${memberCount})`;
    }
    return { state: 'assembling', message };
  }
  if (phase === 'auditing_truth') {
    return {
      state: 'finalizing',
      message: 'Auditing registered teammates and bootstrap truth',
    };
  }
  if (phase === 'completed') {
    return { state: 'ready', message: 'Deterministic bootstrap completed' };
  }
  if (phase === 'failed') {
    return { state: 'failed', message: 'Deterministic bootstrap failed' };
  }
  if (phase === 'canceled') {
    // The persisted phase is spelled 'canceled'; the progress state is 'cancelled'.
    return { state: 'cancelled', message: 'Deterministic bootstrap cancelled' };
  }
  return null;
}
/**
 * Coerces a persisted timestamp value into a string timestamp.
 *
 * - A non-blank string is returned unchanged (it is not parsed or validated).
 * - A finite, positive number is passed to `new Date(...)` and formatted with
 *   `toISOString()`.
 * - Anything else yields `fallback`, including numbers too large for `Date`
 *   to represent.
 *
 * @param value - Untrusted value read from persisted JSON.
 * @param fallback - Returned when `value` cannot be used.
 * @returns The string to use as the timestamp.
 */
function toIso(value: unknown, fallback: string): string {
  if (typeof value === 'string' && value.trim().length > 0) {
    return value;
  }
  if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
    const date = new Date(value);
    // A finite number can still exceed Date's representable range (±8.64e15 ms);
    // toISOString() throws RangeError on such an invalid Date, so fall back instead.
    if (!Number.isNaN(date.getTime())) {
      return date.toISOString();
    }
  }
  return fallback;
}
/**
 * Converts one raw member entry from bootstrap-state.json into a
 * `PersistedTeamLaunchMemberState`.
 *
 * The raw `status` string (defaulting to 'pending') drives every derived flag:
 * 'failed' is a hard failure, 'bootstrap_confirmed' implies a live runtime, and
 * 'runtime_alive', 'registered', 'spawn_started' and 'failed' all count as the
 * spawn having been accepted.
 *
 * @param memberName - Name to store on the result.
 * @param raw - Untrusted member entry.
 * @param updatedAt - Fallback timestamp used when the entry lacks a usable one.
 * @returns The normalized member launch state.
 */
function normalizeBootstrapMemberState(
  memberName: string,
  raw: RawBootstrapMemberState,
  updatedAt: string
): PersistedTeamLaunchMemberState {
  const status = typeof raw.status === 'string' ? raw.status : 'pending';

  const hardFailure = status === 'failed';
  const bootstrapConfirmed = status === 'bootstrap_confirmed';
  const runtimeAlive = bootstrapConfirmed || status === 'runtime_alive';
  // runtimeAlive already covers the confirmed case.
  const agentToolAccepted =
    runtimeAlive || status === 'registered' || status === 'spawn_started' || hardFailure;

  const rawReason = raw.failureReason;
  const failureReason =
    typeof rawReason === 'string' && rawReason.trim().length > 0 ? rawReason.trim() : undefined;
  const observedAt = toIso(raw.lastObservedAt, updatedAt);

  let launchState: PersistedTeamLaunchMemberState['launchState'];
  if (hardFailure) {
    launchState = 'failed_to_start';
  } else if (bootstrapConfirmed) {
    launchState = 'confirmed_alive';
  } else if (agentToolAccepted) {
    launchState = 'runtime_pending_bootstrap';
  } else {
    launchState = 'starting';
  }

  let diagnostics: string[] | undefined;
  if (hardFailure) {
    diagnostics = [failureReason ?? 'deterministic bootstrap failed'];
  } else if (bootstrapConfirmed) {
    diagnostics = ['late heartbeat received'];
  } else if (runtimeAlive) {
    diagnostics = ['runtime alive', 'waiting for bootstrap'];
  } else if (agentToolAccepted) {
    diagnostics = ['spawn accepted'];
  }

  return {
    name: memberName,
    launchState,
    agentToolAccepted,
    runtimeAlive,
    bootstrapConfirmed,
    hardFailure,
    hardFailureReason: failureReason,
    firstSpawnAcceptedAt: agentToolAccepted ? toIso(raw.lastAttemptAt, updatedAt) : undefined,
    lastHeartbeatAt: bootstrapConfirmed ? observedAt : undefined,
    lastRuntimeAliveAt: runtimeAlive ? observedAt : undefined,
    lastEvaluatedAt: observedAt,
    sources: {
      configRegistered: runtimeAlive || status === 'registered' || hardFailure,
      // `false` is deliberately stored as `undefined` for these two signals.
      processAlive: runtimeAlive || undefined,
      hardFailureSignal: hardFailure || undefined,
    },
    diagnostics,
  };
}
/**
 * Builds the path of a team's bootstrap-state.json under the teams base
 * directory.
 *
 * NOTE(review): `teamName` is joined as-is; confirm callers pass a validated
 * name, since separators or `..` would not be rejected here.
 *
 * @param teamName - Team directory name.
 * @returns Path to the team's bootstrap state file.
 */
export function getTeamBootstrapStatePath(teamName: string): string {
  const teamsRoot = getTeamsBasePath();
  return path.join(teamsRoot, teamName, TEAM_BOOTSTRAP_STATE_FILE);
}
/**
 * Builds the path of a team's bootstrap-journal.jsonl under the teams base
 * directory.
 *
 * @param teamName - Team directory name.
 * @returns Path to the team's bootstrap journal file.
 */
function getTeamBootstrapJournalPath(teamName: string): string {
  const teamsRoot = getTeamsBasePath();
  return path.join(teamsRoot, teamName, TEAM_BOOTSTRAP_JOURNAL_FILE);
}
/**
 * Builds the path of the metadata file inside a team's bootstrap lock
 * directory.
 *
 * @param teamName - Team directory name.
 * @returns Path to `<teams base>/<teamName>/.bootstrap.lock/metadata.json`.
 */
function getTeamBootstrapLockMetadataPath(teamName: string): string {
  const segments = [
    getTeamsBasePath(),
    teamName,
    TEAM_BOOTSTRAP_LOCK_DIR,
    TEAM_BOOTSTRAP_LOCK_METADATA_FILE,
  ];
  return path.join(...segments);
}
/**
 * Reads and validates the metadata file inside a team's bootstrap lock
 * directory.
 *
 * Every problem collapses to `null`: a symlink, a non-regular or oversized
 * file, an I/O or JSON error, a `pid` that is not a finite positive number, or
 * a `runId` that is not a non-blank string.
 *
 * @param teamName - Team whose lock metadata is read.
 * @returns The validated metadata, or `null` when it is missing or invalid.
 */
async function readBootstrapLockMetadata(teamName: string): Promise<BootstrapLockMetadata | null> {
  const metadataPath = getTeamBootstrapLockMetadataPath(teamName);
  try {
    const info = await fs.promises.lstat(metadataPath);
    const isPlainSmallFile =
      !info.isSymbolicLink() && info.isFile() && info.size <= MAX_BOOTSTRAP_LOCK_METADATA_BYTES;
    if (!isPlainSmallFile) {
      return null;
    }

    const text = await fs.promises.readFile(metadataPath, 'utf8');
    const { pid, runId, ownerStartedAt } = JSON.parse(text) as RawBootstrapLockMetadata;

    if (typeof pid !== 'number' || !Number.isFinite(pid) || pid <= 0) {
      return null;
    }
    if (typeof runId !== 'string') {
      return null;
    }
    const trimmedRunId = runId.trim();
    if (trimmedRunId.length === 0) {
      return null;
    }

    const hasUsableStart = typeof ownerStartedAt === 'number' && Number.isFinite(ownerStartedAt);
    return {
      pid,
      runId: trimmedRunId,
      ownerStartedAt: hasUsableStart ? (ownerStartedAt as number) : undefined,
    };
  } catch {
    // Missing file, unreadable file and malformed JSON are all treated as "no metadata".
    return null;
  }
}
/**
 * Returns only the warning summary produced by inspecting a team's bootstrap
 * journal, discarding any inspection issue.
 *
 * @param teamName - Team whose journal is inspected.
 * @returns The warnings array, or `undefined` when there is nothing to report.
 */
async function readBootstrapJournalWarnings(teamName: string): Promise<string[] | undefined> {
  const { warnings } = await inspectBootstrapJournal(teamName);
  return warnings;
}
/**
 * Reads a team's bootstrap-journal.jsonl and summarises its most recent events.
 *
 * The file is rejected (with an `issue`) when it is a symlink, not a regular
 * file, larger than MAX_BOOTSTRAP_JOURNAL_BYTES, or unreadable. Otherwise the
 * last three non-blank lines are parsed as JSON; lines that fail to parse or
 * that do not match a known record shape are skipped. The remaining
 * descriptions are joined into a single warning string.
 *
 * @param teamName - Team whose journal is inspected.
 * @returns `{}` when the journal is missing, `{ issue }` when unusable, otherwise `{ warnings }`.
 */
async function inspectBootstrapJournal(teamName: string): Promise<BootstrapJournalInspection> {
  // Trimmed text when `value` is a non-blank string, otherwise null.
  const nonBlank = (value: unknown): string | null =>
    typeof value === 'string' && value.trim().length > 0 ? value.trim() : null;

  // Renders one journal record as text; null when the record is not recognised.
  const describeRecord = (record: RawBootstrapJournalRecord): string | null => {
    switch (record.type) {
      case 'phase': {
        return typeof record.phase === 'string' ? `bootstrap phase: ${record.phase}` : null;
      }
      case 'lock': {
        if (typeof record.action !== 'string') {
          return null;
        }
        const owner = typeof record.ownerPid === 'number' ? ` (pid ${record.ownerPid})` : '';
        return `bootstrap lock ${record.action}${owner}`;
      }
      case 'member': {
        if (typeof record.name !== 'string' || typeof record.action !== 'string') {
          return null;
        }
        const detail = nonBlank(record.detail);
        return detail !== null
          ? `${record.name}: ${record.action} (${detail})`
          : `${record.name}: ${record.action}`;
      }
      case 'terminal': {
        if (typeof record.status !== 'string') {
          return null;
        }
        const reason = nonBlank(record.reason);
        return reason !== null
          ? `bootstrap ${record.status}: ${reason}`
          : `bootstrap ${record.status}`;
      }
      case 'real_task': {
        if (typeof record.state !== 'string') {
          return null;
        }
        const detail = nonBlank(record.detail);
        return detail !== null
          ? `first task ${record.state}: ${detail}`
          : `first task ${record.state}`;
      }
      default:
        return null;
    }
  };

  const journalPath = getTeamBootstrapJournalPath(teamName);
  try {
    // lstat (not stat) so a symlink is detected rather than followed.
    const info = await fs.promises.lstat(journalPath);
    if (info.isSymbolicLink()) {
      return {
        issue:
          'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is a symlink.',
      };
    }
    if (!info.isFile()) {
      return {
        issue:
          'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is not a regular file.',
      };
    }
    if (info.size > MAX_BOOTSTRAP_JOURNAL_BYTES) {
      return {
        issue:
          'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is oversized.',
      };
    }

    const contents = await fs.promises.readFile(journalPath, 'utf8');
    const nonBlankLines: string[] = [];
    for (const piece of contents.split('\n')) {
      const trimmed = piece.trim();
      if (trimmed.length > 0) {
        nonBlankLines.push(trimmed);
      }
    }

    const messages: string[] = [];
    // Only the three most recent entries are summarised.
    for (const line of nonBlankLines.slice(-3)) {
      let parsed: RawBootstrapJournalRecord | null;
      try {
        parsed = JSON.parse(line) as RawBootstrapJournalRecord;
      } catch {
        continue;
      }
      if (!parsed) {
        continue;
      }
      const description = describeRecord(parsed);
      if (description) {
        messages.push(description);
      }
    }

    return {
      warnings:
        messages.length > 0
          ? [`Recent deterministic bootstrap events: ${messages.join(' | ')}`]
          : undefined,
    };
  } catch (error) {
    const errorCode = (error as NodeJS.ErrnoException | undefined)?.code;
    if (errorCode === 'ENOENT') {
      // No journal at all: nothing to report.
      return {};
    }
    return {
      issue:
        'Persisted deterministic bootstrap journal is unreadable because bootstrap-journal.jsonl is invalid, truncated, or inaccessible.',
    };
  }
}
/**
 * Builds a best-effort runtime state for a team whose bootstrap-state.json
 * exists but is unusable, using the bootstrap lock metadata instead.
 *
 * Without valid lock metadata there is nothing to report and `null` is
 * returned. Otherwise the lock owner's PID is probed: a live owner yields a
 * 'configuring' progress entry, a dead owner yields 'failed'. The state issue,
 * any journal issue and the recent-journal summary are surfaced as warnings.
 *
 * @param teamName - Team being inspected.
 * @param stateIssue - Description of why the persisted state is unusable.
 * @returns A degraded runtime state, or `null` when no lock metadata is available.
 */
async function readDegradedBootstrapRuntimeState(
  teamName: string,
  stateIssue: string
): Promise<TeamRuntimeState | null> {
  const lockMetadata = await readBootstrapLockMetadata(teamName);
  if (!lockMetadata) {
    return null;
  }
  const journalInspection = await inspectBootstrapJournal(teamName);
  const warnings = [
    stateIssue,
    journalInspection.issue,
    ...(journalInspection.warnings ?? []),
  ].filter((item): item is string => typeof item === 'string' && item.trim().length > 0);
  const ownerAlive = isProcessAlive(lockMetadata.pid);
  const now = new Date().toISOString();

  // ownerStartedAt comes from an on-disk file. A finite number can still lie
  // outside the range Date can represent, in which case toISOString() would
  // throw RangeError; fall back to `now` instead of rejecting.
  const ownerStartedDate =
    typeof lockMetadata.ownerStartedAt === 'number'
      ? new Date(lockMetadata.ownerStartedAt)
      : null;
  const startedAt =
    ownerStartedDate !== null && Number.isFinite(ownerStartedDate.getTime())
      ? ownerStartedDate.toISOString()
      : now;

  return {
    teamName,
    isAlive: false,
    runId: lockMetadata.runId,
    progress: {
      runId: lockMetadata.runId,
      teamName,
      state: ownerAlive ? 'configuring' : 'failed',
      message: ownerAlive
        ? 'Deterministic bootstrap recovery is degraded because persisted bootstrap state is unreadable'
        : 'Deterministic bootstrap recovery failed because persisted bootstrap state is unreadable and the bootstrap owner is gone',
      messageSeverity: 'warning',
      error: ownerAlive
        ? stateIssue
        : `${stateIssue} Bootstrap owner pid ${lockMetadata.pid} is not alive.`,
      warnings: warnings.length > 0 ? warnings : undefined,
      startedAt,
      updatedAt: now,
      pid: lockMetadata.pid,
    },
  };
}
/**
 * Derives a persisted launch snapshot from a team's bootstrap-state.json.
 *
 * Each usable entry of `members` (an object with a non-blank name other than
 * 'team-lead' or 'user') is normalized into a member launch state. The launch
 * phase is 'finished' when the terminal status or phase indicates an end
 * state, or when the owning process is gone; otherwise 'active'. When the
 * owner is gone, members that are neither confirmed alive nor already failed
 * are marked as failed to start with the owner-death reason.
 *
 * @param teamName - Team whose bootstrap state is read; also the fallback team name.
 * @returns The snapshot, or `null` when no usable state exists or building it throws.
 */
export async function readBootstrapLaunchSnapshot(
  teamName: string
): Promise<PersistedTeamLaunchSnapshot | null> {
  const state = await readRawBootstrapState(teamName);
  if (!state) {
    return null;
  }
  try {
    const updatedAt = toIso(state.updatedAt, new Date().toISOString());

    const members: Record<string, PersistedTeamLaunchMemberState> = {};
    const expectedMembers: string[] = [];
    const candidates: unknown[] = Array.isArray(state.members) ? state.members : [];
    for (const candidate of candidates) {
      if (!candidate || typeof candidate !== 'object') {
        continue;
      }
      const rawMember = candidate as RawBootstrapMemberState;
      const name = typeof rawMember.name === 'string' ? rawMember.name.trim() : '';
      // These two names are excluded from the expected-member list.
      if (name === '' || name === 'team-lead' || name === 'user') {
        continue;
      }
      expectedMembers.push(name);
      members[name] = normalizeBootstrapMemberState(name, rawMember, updatedAt);
    }

    const terminalRecord =
      typeof state.terminal === 'object' && state.terminal !== null
        ? (state.terminal as Record<string, unknown>)
        : null;
    const rawTerminalStatus = terminalRecord?.status;
    const terminalStatus = typeof rawTerminalStatus === 'string' ? rawTerminalStatus : undefined;
    const phase = typeof state.phase === 'string' ? state.phase : undefined;
    const ownerState = classifyBootstrapOwnerState(state);

    const finishedStatuses = ['completed', 'partial_success', 'failed', 'canceled'];
    const finishedPhases = ['completed', 'failed', 'canceled'];
    const isFinished =
      ownerState.ownerDead ||
      (terminalStatus !== undefined && finishedStatuses.includes(terminalStatus)) ||
      (phase !== undefined && finishedPhases.includes(phase));
    const launchPhase = isFinished ? 'finished' : 'active';

    if (ownerState.ownerDead) {
      const ownerDiagnostics = ownerState.failureReason ? [ownerState.failureReason] : undefined;
      for (const name of expectedMembers) {
        const current = members[name];
        if (!current) {
          continue;
        }
        const alreadySettled =
          current.launchState === 'confirmed_alive' || current.launchState === 'failed_to_start';
        if (alreadySettled) {
          continue;
        }
        members[name] = {
          ...current,
          launchState: 'failed_to_start',
          hardFailure: true,
          hardFailureReason: ownerState.failureReason,
          diagnostics: ownerDiagnostics ?? current.diagnostics,
          sources: {
            ...current.sources,
            hardFailureSignal: true,
          },
        };
      }
    }

    const persistedTeamName =
      typeof state.teamName === 'string' && state.teamName.trim().length > 0
        ? state.teamName.trim()
        : teamName;
    return createPersistedLaunchSnapshot({
      teamName: persistedTeamName,
      expectedMembers,
      launchPhase,
      members,
      updatedAt,
    });
  } catch {
    // Any failure while deriving the snapshot is reported as "no snapshot".
    return null;
  }
}
/**
 * Reads the `realTaskSubmissionState` field from a team's persisted bootstrap
 * state.
 *
 * @param teamName - Team whose bootstrap state is read.
 * @returns The stored value when it is one of the three recognised strings;
 *          `null` when the state is missing/unusable or the value is anything else.
 */
export async function readBootstrapRealTaskSubmissionState(
  teamName: string
): Promise<'not_submitted' | 'submitted' | 'unknown' | null> {
  const raw = await readRawBootstrapState(teamName);
  if (!raw) {
    return null;
  }
  switch (raw.realTaskSubmissionState) {
    case 'not_submitted':
      return 'not_submitted';
    case 'submitted':
      return 'submitted';
    case 'unknown':
      return 'unknown';
    default:
      return null;
  }
}
/**
 * Projects a team's persisted bootstrap state into a `TeamRuntimeState`.
 *
 * Outcomes, in order:
 * - State file unusable: delegates to the degraded reader (or `null` when the
 *   file is simply missing).
 * - No phase string: `null`.
 * - Owner process gone before a terminal phase: a 'failed' progress entry.
 * - Phase is not one of the six in-progress phases: `null`.
 * - Otherwise: a progress entry built from the phase projection.
 *
 * Any exception raised while building the result yields `null`.
 *
 * NOTE(review): the dead-owner branch falls back to `teamName` for
 * `progress.runId` and keeps an empty-string `runId`, while the active branch
 * falls back to `bootstrap:<teamName>` and treats a blank `runId` as absent.
 * Confirm whether that difference is intentional.
 *
 * @param teamName - Team whose bootstrap state is read; also the fallback team name.
 * @returns The projected runtime state, or `null` when there is nothing to report.
 */
export async function readBootstrapRuntimeState(
  teamName: string
): Promise<TeamRuntimeState | null> {
  const { raw, issue } = await inspectBootstrapState(teamName);
  if (!raw) {
    if (!issue) {
      return null;
    }
    return readDegradedBootstrapRuntimeState(teamName, issue);
  }
  try {
    const journalWarnings = await readBootstrapJournalWarnings(teamName);

    const rawPhase = raw.phase;
    if (typeof rawPhase !== 'string' || rawPhase.length === 0) {
      return null;
    }
    const phase = rawPhase as BootstrapRuntimePhase;

    const resolvedTeamName =
      typeof raw.teamName === 'string' && raw.teamName.trim().length > 0
        ? raw.teamName.trim()
        : teamName;
    // Computed lazily so timestamps are only derived on paths that return a state.
    const resolveTimestamps = (): { startedAt: string; updatedAt: string } => {
      const startedAt = toIso(raw.startedAt, new Date().toISOString());
      return { startedAt, updatedAt: toIso(raw.updatedAt, startedAt) };
    };

    const ownerState = classifyBootstrapOwnerState(raw);
    if (ownerState.ownerDead) {
      const { startedAt, updatedAt } = resolveTimestamps();
      const persistedRunId = typeof raw.runId === 'string' ? raw.runId : null;
      return {
        teamName: resolvedTeamName,
        isAlive: false,
        runId: persistedRunId,
        progress: {
          runId: persistedRunId ?? teamName,
          teamName: resolvedTeamName,
          state: 'failed',
          message: ownerState.stale
            ? 'Deterministic bootstrap became stuck after owner process exited'
            : 'Deterministic bootstrap owner exited before bootstrap completed',
          error: ownerState.failureReason,
          warnings: journalWarnings,
          startedAt,
          updatedAt,
          ...(typeof raw.ownerPid === 'number' ? { pid: raw.ownerPid } : {}),
        },
      };
    }

    const inProgressPhases: BootstrapRuntimePhase[] = [
      'validating_spec',
      'loading_existing_state',
      'acquiring_bootstrap_lock',
      'creating_team',
      'spawning_members',
      'auditing_truth',
    ];
    if (!inProgressPhases.includes(phase)) {
      return null;
    }

    const memberCount = Array.isArray(raw.members) ? raw.members.length : 0;
    const projection = getBootstrapProgressProjection(phase, memberCount);
    if (!projection) {
      return null;
    }

    const { startedAt, updatedAt } = resolveTimestamps();
    const runId = typeof raw.runId === 'string' && raw.runId.trim().length > 0 ? raw.runId : null;
    const ownerPid = raw.ownerPid;
    const pid =
      typeof ownerPid === 'number' && Number.isFinite(ownerPid) && ownerPid > 0
        ? ownerPid
        : undefined;

    const progress: TeamProvisioningProgress = {
      runId: runId ?? `bootstrap:${teamName}`,
      teamName: resolvedTeamName,
      state: projection.state,
      message: projection.message,
      warnings: journalWarnings,
      startedAt,
      updatedAt,
      ...(pid ? { pid } : {}),
    };
    return {
      teamName: resolvedTeamName,
      isAlive: false,
      runId,
      progress,
    };
  } catch {
    return null;
  }
}
/**
 * Removes a team's bootstrap-state.json, if present.
 *
 * Failures are swallowed on purpose: removal is best-effort and this function
 * never rejects.
 *
 * @param teamName - Team whose bootstrap state file should be removed.
 */
export async function clearBootstrapState(teamName: string): Promise<void> {
  try {
    const statePath = getTeamBootstrapStatePath(teamName);
    // `force: true` makes a missing file a no-op rather than an error.
    await fs.promises.rm(statePath, { force: true });
  } catch {
    // best-effort
  }
}
/**
 * Picks which of two snapshots to trust.
 *
 * When only one snapshot is present, that one wins. When both are present and
 * both `updatedAt` values parse as dates, the more recent wins, with ties
 * going to the bootstrap snapshot. If either timestamp cannot be parsed, the
 * bootstrap snapshot wins.
 *
 * @param bootstrapSnapshot - Snapshot derived from bootstrap state, if any.
 * @param launchSnapshot - Snapshot from the launch state, if any.
 * @returns The preferred snapshot, or `null` when neither is present.
 */
export function choosePreferredLaunchSnapshot<T extends { updatedAt?: string }>(
  bootstrapSnapshot: T | null,
  launchSnapshot: T | null
): T | null {
  if (!bootstrapSnapshot || !launchSnapshot) {
    // At most one side is present; return whichever it is.
    return bootstrapSnapshot || launchSnapshot;
  }
  const bootstrapTime = Date.parse(bootstrapSnapshot.updatedAt ?? '');
  const launchTime = Date.parse(launchSnapshot.updatedAt ?? '');
  const launchIsStrictlyNewer =
    Number.isFinite(bootstrapTime) && Number.isFinite(launchTime) && launchTime > bootstrapTime;
  return launchIsStrictlyNewer ? launchSnapshot : bootstrapSnapshot;
}