From 6dab896aeeb7e07f4cf97905ad14c87bebd55ae5 Mon Sep 17 00:00:00 2001 From: 777genius Date: Sat, 9 May 2026 07:33:33 +0300 Subject: [PATCH] fix(team): harden process launch diagnostics --- AGENTS.md | 2 + package.json | 1 + scripts/prove-provider-launch-stress.mjs | 79 ++ .../team/TeamLaunchFailureArtifactPack.ts | 485 ++++++++++ .../services/team/TeamProvisioningService.ts | 505 +++++++++- .../team/MixedProviderTeamLaunch.live.test.ts | 49 +- .../ProviderLaunchStress.live-e2e.test.ts | 860 ++++++++++++++++++ .../TeamLaunchFailureArtifactPack.test.ts | 197 ++++ .../team/TeamProvisioningService.test.ts | 160 ++++ 9 files changed, 2335 insertions(+), 3 deletions(-) create mode 100644 scripts/prove-provider-launch-stress.mjs create mode 100644 src/main/services/team/TeamLaunchFailureArtifactPack.ts create mode 100644 test/main/services/team/ProviderLaunchStress.live-e2e.test.ts create mode 100644 test/main/services/team/TeamLaunchFailureArtifactPack.test.ts diff --git a/AGENTS.md b/AGENTS.md index 41f55f49..53ecb1d3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,6 +17,8 @@ For new features: - Treat regressions in agent team messaging, task lifecycle, session parsing, code review UI, and provider/runtime detection as high priority. - For team launch hangs, OpenCode `registered`/`bootstrap unconfirmed`, missing teammate replies, or suspicious task logs, follow [docs/team-management/debugging-agent-teams.md](docs/team-management/debugging-agent-teams.md) before changing code. +- For launch failures, first inspect the newest artifact pack under `~/.claude/teams/<teamName>/launch-failure-artifacts/latest.json`, then open its `manifest.json`. The manifest includes `classification`, `bootstrapTransportBreadcrumb`, launch diagnostics, member spawn statuses, and redacted copies/tails of launch-state, bootstrap-state, bootstrap-journal, CLI logs, progress trace, and runtime adapter trace. 
+- When running live smoke tests, keep cleanup narrow: stop only the smoke-owned team/run and launch-owned process teammates. Do not kill shared OpenCode hosts, unrelated tmux panes, or user teams while trying to clean stale smoke artifacts. - Verify new medium and large features follow `docs/FEATURE_ARCHITECTURE_STANDARD.md`, especially cross-process boundaries and public feature entrypoints. - Check that Electron main, preload, renderer, and shared code keep their responsibilities separate and use the documented path aliases. - Flag changes that manually concatenate agent block markers instead of using `wrapAgentBlock(text)`. diff --git a/package.json b/package.json index d0affd34..213bc3dc 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "opencode:prove-semantic-model-matrix": "node ./scripts/prove-opencode-semantic-model-matrix.mjs", "opencode:prove-team-provisioning": "node ./scripts/prove-opencode-team-provisioning.mjs", "team:prove-agent-cli-launch": "node ./scripts/prove-agent-cli-launch.mjs", + "team:prove-provider-launch-stress": "node ./scripts/prove-provider-launch-stress.mjs", "team:prove-launch-matrix": "pnpm exec vitest run --maxWorkers 1 --minWorkers 1 test/main/services/team/TeamAgentLaunchMatrix.safe-e2e.test.ts", "prebuild": "tsx scripts/fetch-pricing-data.ts && pnpm --filter agent-teams-controller build && pnpm --filter agent-teams-mcp build", "build": "node --max-old-space-size=8192 ./node_modules/electron-vite/bin/electron-vite.js build", diff --git a/scripts/prove-provider-launch-stress.mjs b/scripts/prove-provider-launch-stress.mjs new file mode 100644 index 00000000..8fda98bf --- /dev/null +++ b/scripts/prove-provider-launch-stress.mjs @@ -0,0 +1,79 @@ +#!/usr/bin/env node + +import { spawnSync } from 'node:child_process'; +import path from 'node:path'; +import process from 'node:process'; +import { fileURLToPath } from 'node:url'; + +import { + exitForSkippedPreflight, + preflightOpenCodeLiveEnvironment, +} from 
'./lib/opencode-live-preflight.mjs'; + +const scriptDir = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(scriptDir, '..'); +const orchestratorRoot = process.env.CLAUDE_DEV_RUNTIME_ROOT?.trim(); +const siblingOrchestrator = path.resolve(repoRoot, '..', 'agent_teams_orchestrator'); +const order = process.env.PROVIDER_LAUNCH_STRESS_ORDER?.trim() || 'anthropic,codex,opencode,mixed'; + +const env = { + ...process.env, + PROVIDER_LAUNCH_STRESS_LIVE: '1', + PROVIDER_LAUNCH_STRESS_ORDER: order, + PROVIDER_LAUNCH_STRESS_MEMBER_COUNT: + process.env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT?.trim() || '5', + PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH: + process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH?.trim() || + (process.env.ANTHROPIC_API_KEY?.trim() ? 'api-key' : 'subscription'), + OPENCODE_E2E: '1', + OPENCODE_E2E_USE_REAL_APP_CREDENTIALS: '1', + OPENCODE_DISABLE_AUTOUPDATE: process.env.OPENCODE_DISABLE_AUTOUPDATE ?? '1', +}; + +if (!env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim()) { + const runtimeRoot = orchestratorRoot ? 
path.resolve(orchestratorRoot) : siblingOrchestrator; + env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH = path.join(runtimeRoot, 'cli'); +} + +console.log('Running provider launch stress live smoke'); +console.log(`Order: ${env.PROVIDER_LAUNCH_STRESS_ORDER}`); +console.log(`Members per scenario: ${env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT}`); +console.log(`Anthropic auth: ${env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH}`); +console.log( + `Models: anthropic=${env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL || 'haiku'}, codex=${ + env.PROVIDER_LAUNCH_STRESS_CODEX_MODEL || 'gpt-5.4-mini' + }, opencode=${env.PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL || 'openai/gpt-5.4-mini'}` +); +console.log(`Orchestrator CLI: ${env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH}`); + +if (order.split(',').some((item) => ['opencode', 'mixed'].includes(item.trim()))) { + const preflight = await preflightOpenCodeLiveEnvironment({ repoRoot }); + exitForSkippedPreflight(preflight); +} + +const result = spawnSync( + 'pnpm', + [ + 'exec', + 'vitest', + 'run', + '--maxWorkers', + '1', + '--minWorkers', + '1', + 'test/main/services/team/ProviderLaunchStress.live-e2e.test.ts', + ], + { + cwd: repoRoot, + env, + stdio: 'inherit', + shell: process.platform === 'win32', + } +); + +if (result.error) { + console.error(`Failed to run provider launch stress smoke: ${result.error.message}`); + process.exit(1); +} + +process.exit(result.status ?? 
1); diff --git a/src/main/services/team/TeamLaunchFailureArtifactPack.ts b/src/main/services/team/TeamLaunchFailureArtifactPack.ts new file mode 100644 index 00000000..51221489 --- /dev/null +++ b/src/main/services/team/TeamLaunchFailureArtifactPack.ts @@ -0,0 +1,485 @@ +import { getTeamsBasePath } from '@main/utils/pathDecoder'; +import { createLogger } from '@shared/utils/logger'; +import * as fs from 'fs'; +import * as path from 'path'; + +import { atomicWriteAsync } from './atomicWrite'; +import { getTeamBootstrapStatePath } from './TeamBootstrapStateReader'; +import { getTeamLaunchStatePath, getTeamLaunchSummaryPath } from './TeamLaunchStateStore'; + +import type { + MemberSpawnStatusEntry, + PersistedTeamLaunchSnapshot, + TeamLaunchDiagnosticItem, + TeamMember, + TeamProviderBackendId, + TeamProviderId, + TeamProvisioningProgress, +} from '@shared/types'; + +const logger = createLogger('Service:TeamLaunchFailureArtifactPack'); + +const ARTIFACTS_DIR_NAME = 'launch-failure-artifacts'; +const LATEST_ARTIFACT_FILE = 'latest.json'; +const MAX_CLI_LOG_CHARS = 256_000; +const MAX_TRACE_CHARS = 128_000; +const MAX_COPIED_FILE_BYTES = 256 * 1024; + +type JsonRecord = Record; + +export interface TeamLaunchFailureArtifactPackInput { + teamName: string; + runId: string; + reason: string; + startedAt?: string; + cwd?: string; + pid?: number | null; + providerId?: TeamProviderId; + providerBackendId?: TeamProviderBackendId; + model?: string; + expectedMembers?: readonly string[]; + effectiveMembers?: readonly TeamMember[]; + progress?: TeamProvisioningProgress | null; + launchSnapshot?: PersistedTeamLaunchSnapshot | null; + launchDiagnostics?: readonly TeamLaunchDiagnosticItem[]; + memberSpawnStatuses?: Record; + cliLogs?: string | null; + progressTraceLines?: readonly string[]; + runtimeAdapterTraceLines?: readonly string[]; + flags?: JsonRecord; +} + +export interface TeamLaunchFailureArtifactPackResult { + directory: string; + manifestPath: string; + files: string[]; 
+} + +export type LaunchFailureArtifactClassificationCode = + | 'transport_rejected' + | 'stdin_missing' + | 'provider_quota' + | 'provider_auth' + | 'model_no_bootstrap' + | 'process_exited' + | 'opencode_protocol' + | 'unknown'; + +export interface LaunchFailureArtifactClassification { + code: LaunchFailureArtifactClassificationCode; + confidence: number; + evidence: string[]; +} + +export interface LaunchBootstrapTransportBreadcrumb { + lastTransportStage: string | null; + submitRejected: boolean; + retryable: boolean | null; + noStdinWarning: boolean; + bootstrapSubmitted: boolean; + evidence: string[]; +} + +interface CopiedArtifactFile { + sourcePath: string; + artifactName: string; + issue?: string; +} + +function sanitizeArtifactNamePart(value: string): string { + const sanitized = value + .trim() + .replace(/[^a-zA-Z0-9._-]+/g, '-') + .replace(/^-+|-+$/g, ''); + return sanitized || 'unknown'; +} + +function artifactTimestamp(now: Date): string { + return now.toISOString().replace(/[:.]/g, '-'); +} + +function assertPathWithin(root: string, target: string): void { + const relative = path.relative(path.resolve(root), path.resolve(target)); + if (relative.startsWith('..') || path.isAbsolute(relative)) { + throw new Error(`Launch artifact path escaped teams root: ${target}`); + } +} + +function truncateTail(text: string, maxChars: number): string { + if (text.length <= maxChars) return text; + return `[truncated to last ${maxChars} chars]\n${text.slice(text.length - maxChars)}`; +} + +export function redactLaunchFailureArtifactText(text: string): string { + return text + .replace(/sk-ant-[A-Za-z0-9_-]{20,}/g, '[REDACTED_ANTHROPIC_API_KEY]') + .replace(/sk-proj-[A-Za-z0-9_-]{20,}/g, '[REDACTED_OPENAI_API_KEY]') + .replace(/sk-[A-Za-z0-9_-]{20,}/g, '[REDACTED_API_KEY]') + .replace( + /\b(ANTHROPIC_API_KEY|OPENAI_API_KEY|CODEX_API_KEY|OPENROUTER_API_KEY|GEMINI_API_KEY)=([^\s"'`]+)/gi, + '$1=[REDACTED]' + ) + 
.replace(/\b(authorization:\s*bearer\s+)([A-Za-z0-9._~+/=-]{20,})/gi, '$1[REDACTED]') + .replace( + /\b(api[_-]?key|token|access[_-]?token|refresh[_-]?token)(["']?\s*[:=]\s*["']?)([A-Za-z0-9._~+/=-]{20,})/gi, + '$1$2[REDACTED]' + ); +} + +function redactJsonLike(value: T): T { + return redactJsonValue(value) as T; +} + +function isSecretJsonKey(key: string): boolean { + return /^(api[_-]?key|token|access[_-]?token|refresh[_-]?token|authorization)$/i.test(key); +} + +function redactJsonValue(value: unknown, key = ''): unknown { + if (isSecretJsonKey(key)) { + return '[REDACTED]'; + } + if (typeof value === 'string') { + return redactLaunchFailureArtifactText(value); + } + if (Array.isArray(value)) { + return value.map((item) => redactJsonValue(item)); + } + if (value && typeof value === 'object') { + return Object.fromEntries( + Object.entries(value as JsonRecord).map(([entryKey, entryValue]) => [ + entryKey, + redactJsonValue(entryValue, entryKey), + ]) + ); + } + return value; +} + +function appendIfString(parts: string[], value: unknown): void { + if (typeof value === 'string' && value.trim()) { + parts.push(value.trim()); + } +} + +function collectLaunchFailureSearchParts(input: TeamLaunchFailureArtifactPackInput): string[] { + const parts: string[] = []; + appendIfString(parts, input.reason); + appendIfString(parts, input.cliLogs); + for (const line of input.progressTraceLines ?? []) appendIfString(parts, line); + for (const line of input.runtimeAdapterTraceLines ?? []) appendIfString(parts, line); + appendIfString(parts, input.progress?.message); + appendIfString(parts, input.progress?.error); + appendIfString(parts, input.progress?.cliLogsTail); + for (const warning of input.progress?.warnings ?? []) appendIfString(parts, warning); + for (const diagnostic of input.launchDiagnostics ?? input.progress?.launchDiagnostics ?? 
[]) { + appendIfString(parts, diagnostic.code); + appendIfString(parts, diagnostic.label); + appendIfString(parts, diagnostic.detail); + } + for (const [memberName, entry] of Object.entries(input.memberSpawnStatuses ?? {})) { + appendIfString(parts, memberName); + appendIfString(parts, entry.status); + appendIfString(parts, entry.launchState); + appendIfString(parts, entry.error); + appendIfString(parts, entry.hardFailureReason); + appendIfString(parts, entry.runtimeDiagnostic); + } + if (input.launchSnapshot) { + appendIfString(parts, input.launchSnapshot.launchPhase); + appendIfString(parts, input.launchSnapshot.teamLaunchState); + for (const [memberName, member] of Object.entries(input.launchSnapshot.members)) { + appendIfString(parts, memberName); + appendIfString(parts, member.launchState); + appendIfString(parts, member.hardFailureReason); + appendIfString(parts, member.runtimeDiagnostic); + for (const diagnostic of member.diagnostics ?? []) appendIfString(parts, diagnostic); + } + } + return parts; +} + +function firstEvidence(parts: readonly string[], pattern: RegExp): string[] { + const evidence: string[] = []; + for (const part of parts) { + if (pattern.test(part)) { + evidence.push(truncateTail(part, 600)); + if (evidence.length >= 3) break; + } + } + return evidence; +} + +export function classifyLaunchFailureArtifact( + input: TeamLaunchFailureArtifactPackInput +): LaunchFailureArtifactClassification { + const parts = collectLaunchFailureSearchParts(input); + const text = parts.join('\n').toLowerCase(); + const candidates: { + code: LaunchFailureArtifactClassificationCode; + confidence: number; + pattern: RegExp; + }[] = [ + { + code: 'transport_rejected', + confidence: 0.95, + pattern: /bootstrap_submit_rejected|submit rejected by local prompt handler/i, + }, + { + code: 'stdin_missing', + confidence: 0.9, + pattern: /no stdin data received|proceeding without it/i, + }, + { + code: 'provider_quota', + confidence: 0.92, + pattern: /quota 
exhausted|insufficient credits|key limit exceeded|total limit|rate limit/i, + }, + { + code: 'provider_auth', + confidence: 0.88, + pattern: + /401 unauthorized|not_logged_in|login required|auth(?:entication)? failed|api key.*(?:missing|invalid)|token refresh failed/i, + }, + { + code: 'opencode_protocol', + confidence: 0.84, + pattern: + /visible_reply_still_required|non_visible_tool_without_task_progress|empty_assistant_turn|runtime_bootstrap_checkin/i, + }, + { + code: 'model_no_bootstrap', + confidence: 0.82, + pattern: + /did not bootstrap-confirm|bootstrap unconfirmed|bootstrap-confirm before timeout|check-in not yet received|bootstrap_stalled/i, + }, + { + code: 'process_exited', + confidence: 0.78, + pattern: /process exited|pid is not alive|pid was not found|stale_metadata|exited before/i, + }, + ]; + + for (const candidate of candidates) { + if (candidate.pattern.test(text)) { + return { + code: candidate.code, + confidence: candidate.confidence, + evidence: firstEvidence(parts, candidate.pattern).map(redactLaunchFailureArtifactText), + }; + } + } + return { + code: 'unknown', + confidence: 0.2, + evidence: firstEvidence(parts, /failed|error|timeout/i).map(redactLaunchFailureArtifactText), + }; +} + +export function extractLaunchBootstrapTransportBreadcrumb( + input: TeamLaunchFailureArtifactPackInput +): LaunchBootstrapTransportBreadcrumb { + const parts = collectLaunchFailureSearchParts(input); + const combined = parts.join('\n'); + const lastStageMatches = [...combined.matchAll(/last transport stage:\s*([^;\n]+)/gi)]; + const retryableMatches = [ + ...combined.matchAll(/bootstrap_submit_rejected[^\n]*(?:retryable[=:]\s*(true|false))/gi), + ]; + const evidence = firstEvidence( + parts, + /bootstrap_submit_|last transport stage|no stdin data received|local prompt handler/i + ).map(redactLaunchFailureArtifactText); + const retryableRaw = retryableMatches.at(-1)?.[1]?.toLowerCase(); + return { + lastTransportStage: lastStageMatches.at(-1)?.[1]?.trim() ?? 
null, + submitRejected: /bootstrap_submit_rejected|submit rejected by local prompt handler/i.test( + combined + ), + retryable: retryableRaw === 'true' ? true : retryableRaw === 'false' ? false : null, + noStdinWarning: /no stdin data received|proceeding without it/i.test(combined), + bootstrapSubmitted: + /(?:event["']?\s*:\s*["']bootstrap_submitted["']|bootstrap_submit_accepted|bootstrap submitted)/i.test( + combined + ), + evidence, + }; +} + +async function readBoundedTextFile(sourcePath: string): Promise<{ text?: string; issue?: string }> { + try { + const stat = await fs.promises.stat(sourcePath); + if (!stat.isFile()) { + return { issue: 'not_regular_file' }; + } + const handle = await fs.promises.open(sourcePath, 'r'); + try { + const start = Math.max(0, stat.size - MAX_COPIED_FILE_BYTES); + const buffer = Buffer.alloc(stat.size - start); + if (buffer.length > 0) { + await handle.read(buffer, 0, buffer.length, start); + } + const prefix = start > 0 ? `[truncated to last ${MAX_COPIED_FILE_BYTES} bytes]\n` : ''; + return { text: `${prefix}${buffer.toString('utf8')}` }; + } finally { + await handle.close().catch(() => undefined); + } + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + return { issue: code === 'ENOENT' ? 
'missing' : 'unreadable' }; + } +} + +function getKnownLaunchArtifactSourceFiles(teamName: string): CopiedArtifactFile[] { + const bootstrapStatePath = getTeamBootstrapStatePath(teamName); + const teamDir = path.dirname(bootstrapStatePath); + return [ + { + sourcePath: getTeamLaunchStatePath(teamName), + artifactName: 'launch-state.json', + }, + { + sourcePath: getTeamLaunchSummaryPath(teamName), + artifactName: 'launch-summary.json', + }, + { + sourcePath: bootstrapStatePath, + artifactName: 'bootstrap-state.json', + }, + { + sourcePath: path.join(teamDir, 'bootstrap-journal.jsonl'), + artifactName: 'bootstrap-journal.tail.jsonl', + }, + { + sourcePath: path.join(teamDir, '.bootstrap.lock', 'metadata.json'), + artifactName: 'bootstrap-lock-metadata.json', + }, + ]; +} + +async function writeArtifactTextFile( + directory: string, + artifactName: string, + rawText: string, + files: string[] +): Promise { + const targetPath = path.join(directory, artifactName); + await atomicWriteAsync(targetPath, `${redactLaunchFailureArtifactText(rawText).trimEnd()}\n`); + files.push(artifactName); +} + +export async function writeTeamLaunchFailureArtifactPack( + input: TeamLaunchFailureArtifactPackInput +): Promise { + const teamsRoot = getTeamsBasePath(); + const teamDir = path.join(teamsRoot, input.teamName); + const artifactsRoot = path.join(teamDir, ARTIFACTS_DIR_NAME); + const createdAt = new Date(); + const directory = path.join( + artifactsRoot, + `${artifactTimestamp(createdAt)}-${sanitizeArtifactNamePart(input.runId)}` + ); + assertPathWithin(teamsRoot, directory); + await fs.promises.mkdir(directory, { recursive: true }); + + const files: string[] = []; + const copiedFiles: CopiedArtifactFile[] = []; + + if (input.cliLogs?.trim()) { + await writeArtifactTextFile( + directory, + 'cli-logs-tail.txt', + truncateTail(input.cliLogs, MAX_CLI_LOG_CHARS), + files + ); + } + if (input.progressTraceLines?.length) { + await writeArtifactTextFile( + directory, + 
'progress-trace.txt', + truncateTail(input.progressTraceLines.join('\n'), MAX_TRACE_CHARS), + files + ); + } + if (input.runtimeAdapterTraceLines?.length) { + await writeArtifactTextFile( + directory, + 'runtime-adapter-trace.txt', + truncateTail(input.runtimeAdapterTraceLines.join('\n'), MAX_TRACE_CHARS), + files + ); + } + + for (const source of getKnownLaunchArtifactSourceFiles(input.teamName)) { + const read = await readBoundedTextFile(source.sourcePath); + if (read.text !== undefined) { + await writeArtifactTextFile(directory, source.artifactName, read.text, files); + copiedFiles.push(source); + } else { + copiedFiles.push({ ...source, issue: read.issue ?? 'unreadable' }); + } + } + + const classification = classifyLaunchFailureArtifact(input); + const bootstrapTransportBreadcrumb = extractLaunchBootstrapTransportBreadcrumb(input); + const manifest = redactJsonLike({ + version: 1, + createdAt: createdAt.toISOString(), + reason: input.reason, + classification, + bootstrapTransportBreadcrumb, + teamName: input.teamName, + runId: input.runId, + startedAt: input.startedAt, + cwd: input.cwd, + pid: input.pid ?? null, + providerId: input.providerId, + providerBackendId: input.providerBackendId, + model: input.model, + expectedMembers: input.expectedMembers ?? [], + effectiveMembers: (input.effectiveMembers ?? []).map((member) => ({ + name: member.name, + role: member.role, + providerId: member.providerId, + providerBackendId: member.providerBackendId, + model: member.model, + agentType: member.agentType, + removedAt: member.removedAt, + })), + progress: input.progress ?? null, + launchDiagnostics: input.launchDiagnostics ?? input.progress?.launchDiagnostics ?? [], + memberSpawnStatuses: input.memberSpawnStatuses ?? {}, + launchSnapshot: input.launchSnapshot ?? null, + flags: input.flags ?? 
{}, + artifactFiles: files, + copiedFiles, + }); + + const manifestPath = path.join(directory, 'manifest.json'); + await atomicWriteAsync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); + files.unshift('manifest.json'); + + await fs.promises.mkdir(artifactsRoot, { recursive: true }); + await atomicWriteAsync( + path.join(artifactsRoot, LATEST_ARTIFACT_FILE), + `${JSON.stringify( + redactJsonLike({ + version: 1, + createdAt: createdAt.toISOString(), + teamName: input.teamName, + runId: input.runId, + reason: input.reason, + directory, + manifestPath, + }), + null, + 2 + )}\n` + ); + + logger.info(`[${input.teamName}] Wrote launch failure artifact pack`, { + runId: input.runId, + reason: input.reason, + directory, + }); + + return { directory, manifestPath, files }; +} diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index aa201676..d59f81d6 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -289,6 +289,7 @@ import { snapshotFromRuntimeMemberStatuses, snapshotToMemberSpawnStatuses, } from './TeamLaunchStateEvaluator'; +import { writeTeamLaunchFailureArtifactPack } from './TeamLaunchFailureArtifactPack'; import { TeamLaunchStateStore } from './TeamLaunchStateStore'; import { TeamMcpConfigBuilder } from './TeamMcpConfigBuilder'; import { TeamMemberLogsFinder } from './TeamMemberLogsFinder'; @@ -407,6 +408,11 @@ type BootstrapTranscriptSuccessSource = 'member_briefing' | 'assistant_text'; const BOOTSTRAP_RUNTIME_PROOF_TAIL_BYTES = 256 * 1024; const BOOTSTRAP_RUNTIME_EVENT_MAX_LINES = 256; const BOOTSTRAP_RUNTIME_EVENT_MAX_LINE_BYTES = 16 * 1024; +const TEAMMATE_RUNTIME_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME'; +const TEAMMATE_RUNTIME_EVENTS_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME_EVENTS_PATH'; +const TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV = 'CLAUDE_CODE_BOOTSTRAP_PROOF_TOKEN'; +const NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV = + 
'CLAUDE_CODE_NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_PATH'; function getTeamRuntimeEventsDir(teamName: string): string { return path.join(getTeamsBasePath(), teamName, 'runtime'); @@ -5519,6 +5525,7 @@ export class TeamProvisioningService { >(); private readonly memberSpawnStatusesCacheGenerationByTeam = new Map(); private readonly launchStateStore = new TeamLaunchStateStore(); + private readonly launchFailureArtifactPackRunIds = new Set(); private readonly launchStateStoreQueue = new Map>(); private readonly launchStateWrittenRunIdByTeam = new Map(); private readonly failedOpenCodeSecondaryRetryInFlightByTeam = new Map< @@ -5624,6 +5631,56 @@ export class TeamProvisioningService { return choosePreferredLaunchSnapshot(bootstrapSnapshot, launchSnapshot); } + private writeLaunchFailureArtifactPackBestEffort( + run: ProvisioningRun, + options: { + reason: string; + launchSnapshot?: PersistedTeamLaunchSnapshot | null; + } + ): void { + const key = `${run.teamName}:${run.runId}`; + if (this.launchFailureArtifactPackRunIds.has(key)) return; + this.launchFailureArtifactPackRunIds.add(key); + + const memberSpawnStatuses = Object.fromEntries(run.memberSpawnStatuses.entries()); + const request = run.request as Partial | undefined; + void writeTeamLaunchFailureArtifactPack({ + teamName: run.teamName, + runId: run.runId, + reason: options.reason, + startedAt: run.startedAt, + cwd: request?.cwd ?? '', + pid: run.child?.pid ?? run.progress.pid ?? null, + providerId: request?.providerId, + providerBackendId: request?.providerBackendId, + model: request?.model, + expectedMembers: run.expectedMembers, + effectiveMembers: run.allEffectiveMembers, + progress: run.progress, + launchSnapshot: options.launchSnapshot ?? null, + launchDiagnostics: run.progress.launchDiagnostics ?? 
buildLaunchDiagnosticsFromRun(run), + memberSpawnStatuses, + cliLogs: extractCliLogsFromRun(run), + progressTraceLines: run.provisioningTraceLines, + runtimeAdapterTraceLines: this.runtimeAdapterTraceLinesByRunId.get(run.runId), + flags: { + isLaunch: run.isLaunch, + provisioningComplete: run.provisioningComplete, + deterministicBootstrap: run.deterministicBootstrap, + processKilled: run.processKilled, + finalizingByTimeout: run.finalizingByTimeout, + cancelRequested: run.cancelRequested, + }, + }).catch((error: unknown) => { + this.launchFailureArtifactPackRunIds.delete(key); + logger.warn( + `[${run.teamName}] Failed to write launch failure artifact pack: ${ + error instanceof Error ? error.message : String(error) + }` + ); + }); + } + async repairStaleTaskActivityIntervalsBeforeSnapshot(teamName: string): Promise { if (this.crashRepairedActivityIntervalsByTeam.has(teamName)) { return; @@ -13787,6 +13844,13 @@ export class TeamProvisioningService { providerId: TeamProviderId; joinedAt: number; bootstrapExpectedAfter: string; + backendType?: 'tmux' | 'process'; + runtimePid?: number; + bootstrapRuntimeEventsPath?: string; + bootstrapProofToken?: string; + bootstrapRunId?: string; + bootstrapContextHash?: string; + bootstrapBriefingHash?: string; }): Promise { const configPath = path.join(getTeamsBasePath(), input.teamName, 'config.json'); const raw = await tryReadRegularFileUtf8(configPath, { @@ -13822,10 +13886,25 @@ export class TeamProvisioningService { color: input.color, joinedAt: input.joinedAt, bootstrapExpectedAfter: input.bootstrapExpectedAfter, + ...(input.bootstrapProofToken ? { bootstrapProofToken: input.bootstrapProofToken } : {}), + ...(input.bootstrapRunId ? { bootstrapRunId: input.bootstrapRunId } : {}), + ...(input.bootstrapRuntimeEventsPath + ? { bootstrapRuntimeEventsPath: input.bootstrapRuntimeEventsPath } + : {}), + ...(input.bootstrapContextHash + ? 
{ + bootstrapProofMode: 'native_app_managed_context', + bootstrapContextHash: input.bootstrapContextHash, + } + : {}), + ...(input.bootstrapBriefingHash + ? { bootstrapBriefingHash: input.bootstrapBriefingHash } + : {}), tmuxPaneId: input.paneId, + ...(typeof input.runtimePid === 'number' ? { runtimePid: input.runtimePid } : {}), cwd: input.cwd, subscriptions: Array.isArray(existing.subscriptions) ? existing.subscriptions : [], - backendType: 'tmux', + backendType: input.backendType ?? 'tmux', }; if (existingIndex >= 0) { @@ -14047,6 +14126,378 @@ export class TeamProvisioningService { this.setMemberSpawnStatus(input.run, input.memberName, 'waiting'); } + private async launchDirectProcessMemberRestart(input: { + run: ProvisioningRun; + teamName: string; + displayName: string; + leadName: string; + memberName: string; + config: TeamConfig; + configuredMember: NonNullable< + ReturnType + >; + persistedRuntimeMembers: readonly PersistedRuntimeMemberLike[]; + }): Promise { + const providerId = resolveTeamProviderId(input.configuredMember.providerId); + const claudePath = input.run.spawnContext?.claudePath ?? 
(await ClaudeBinaryResolver.resolve()); + if (!claudePath) { + throw new Error('Claude CLI not found; install it or provide a valid path'); + } + + const cwd = this.resolveDirectRestartRuntimeCwd({ + configuredMember: input.configuredMember, + persistedRuntimeMembers: input.persistedRuntimeMembers, + config: input.config, + run: input.run, + }); + await ensureCwdExists(cwd); + + const provisioningEnv = await this.buildProvisioningEnv( + providerId, + input.configuredMember.providerBackendId, + { + teamRuntimeAuth: { + teamName: input.teamName, + authMaterialId: `${input.run.runId}-process-restart-${input.configuredMember.name}-${randomUUID()}`, + allowAnthropicApiKeyHelper: true, + }, + } + ); + if (provisioningEnv.warning) { + throw new Error(provisioningEnv.warning); + } + + const mcpConfigPath = await this.mcpConfigBuilder.writeConfigFile(cwd); + const agentId = `${input.configuredMember.name}@${input.teamName}`; + const color = + input.config.members + ?.find((member) => matchesExactTeamMemberName(member.name, input.memberName)) + ?.color?.trim() || getMemberColorByName(input.configuredMember.name); + const parentSessionId = + input.run.detectedSessionId?.trim() || input.config.leadSessionId?.trim() || input.run.runId; + const memberSpec: TeamCreateRequest['members'][number] = { + name: input.configuredMember.name, + ...(input.configuredMember.role ? { role: input.configuredMember.role } : {}), + ...(input.configuredMember.workflow ? { workflow: input.configuredMember.workflow } : {}), + ...(input.configuredMember.providerId + ? { providerId: input.configuredMember.providerId } + : {}), + ...(input.configuredMember.providerBackendId + ? { providerBackendId: input.configuredMember.providerBackendId } + : {}), + ...(input.configuredMember.model ? { model: input.configuredMember.model } : {}), + ...(input.configuredMember.effort ? { effort: input.configuredMember.effort } : {}), + ...(input.configuredMember.agentType ? 
{ agentType: input.configuredMember.agentType } : {}), + ...(input.configuredMember.isolation === 'worktree' + ? { isolation: 'worktree' as const } + : {}), + ...(input.configuredMember.cwd ? { cwd: input.configuredMember.cwd } : {}), + }; + const prompt = buildMemberSpawnPrompt( + memberSpec, + input.displayName, + input.teamName, + input.leadName, + { + restart: true, + } + ); + const bootstrapExpectedAfter = nowIso(); + const bootstrapProofToken = randomUUID(); + const runtimePaths = this.getDirectProcessRestartRuntimePaths( + input.teamName, + input.configuredMember.name + ); + await fs.promises.mkdir(runtimePaths.dir, { recursive: true }); + await fs.promises.writeFile(runtimePaths.eventsPath, '', { encoding: 'utf8', mode: 0o600 }); + + const nativeBootstrapSpec = + ( + await buildNativeAppManagedBootstrapSpecs({ + teamName: input.teamName, + cwd, + members: [memberSpec], + }) + ).get(input.configuredMember.name) ?? null; + const nativeBootstrapEnv = await this.materializeDirectProcessNativeBootstrapContext({ + teamName: input.teamName, + memberName: input.configuredMember.name, + agentId, + providerId, + runId: input.run.runId, + bootstrapProofToken, + spec: nativeBootstrapSpec, + }); + + const runtimeArgsPlan = await this.buildTeamRuntimeLaunchArgsPlan({ + teamName: input.teamName, + providerId, + launchIdentity: null, + envResolution: provisioningEnv, + extraArgs: [], + includeAnthropicHelper: providerId === 'anthropic', + contextLabel: `Direct process teammate restart (${input.configuredMember.name})`, + }); + + const runtimeArgs = mergeJsonSettingsArgs([ + '--teammate-runtime', + 'headless', + '--agent-id', + agentId, + '--agent-name', + input.configuredMember.name, + '--team-name', + input.teamName, + '--agent-color', + color, + '--parent-session-id', + parentSessionId, + ...(input.configuredMember.agentType + ? 
['--agent-type', input.configuredMember.agentType] + : []), + '--mcp-config', + mcpConfigPath, + '--strict-mcp-config', + '--disallowedTools', + APP_TEAM_RUNTIME_DISALLOWED_TOOLS, + ...(input.run.request.skipPermissions !== false + ? ['--dangerously-skip-permissions', '--permission-mode', 'bypassPermissions'] + : ['--permission-prompt-tool', 'stdio', '--permission-mode', 'default']), + ...(input.configuredMember.model ? ['--model', input.configuredMember.model] : []), + ...(input.configuredMember.effort ? ['--effort', input.configuredMember.effort] : []), + ...runtimeArgsPlan.fastModeArgs, + ...runtimeArgsPlan.runtimeTurnSettledHookArgs, + ...runtimeArgsPlan.providerArgs, + ...runtimeArgsPlan.settingsArgs, + ]); + + const stdoutLog = fs.createWriteStream(runtimePaths.stdoutPath, { flags: 'a', mode: 0o600 }); + const stderrLog = fs.createWriteStream(runtimePaths.stderrPath, { flags: 'a', mode: 0o600 }); + const child = spawnCli(claudePath, runtimeArgs, { + cwd, + detached: true, + env: { + ...provisioningEnv.env, + ...nativeBootstrapEnv, + [TEAMMATE_RUNTIME_ENV]: 'headless', + [TEAMMATE_RUNTIME_EVENTS_ENV]: runtimePaths.eventsPath, + [TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV]: bootstrapProofToken, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + if (!child.pid) { + stdoutLog.destroy(); + stderrLog.destroy(); + throw new Error(`Failed to spawn teammate process for ${agentId}: missing pid`); + } + + const runtimePid = child.pid; + const processPaneId = `process:${runtimePid}`; + child.stdout?.pipe(stdoutLog); + child.stderr?.pipe(stderrLog); + child.stdin?.on('error', (error) => { + logger.debug( + `[${input.teamName}] Direct process restart stdin failed for ${agentId}: ${error.message}` + ); + }); + child.once('close', (code, signal) => { + void this.appendDirectProcessRuntimeEvent({ + type: 'exited', + eventsPath: runtimePaths.eventsPath, + pid: runtimePid, + teamName: input.teamName, + agentName: input.configuredMember.name, + agentId, + runId: parentSessionId, + 
bootstrapRunId: input.run.runId, + source: 'TeamProvisioningService.direct_process_restart', + detail: + code !== null + ? `process exited with code ${code}` + : signal + ? `process exited from signal ${signal}` + : 'process exited', + }); + stdoutLog.end(); + stderrLog.end(); + }); + child.once('error', (error) => { + void this.appendDirectProcessRuntimeEvent({ + type: 'failed', + eventsPath: runtimePaths.eventsPath, + pid: runtimePid, + teamName: input.teamName, + agentName: input.configuredMember.name, + agentId, + runId: parentSessionId, + bootstrapRunId: input.run.runId, + source: 'TeamProvisioningService.direct_process_restart', + detail: `process error: ${error.message}`, + }); + }); + (child.stdin as { unref?: () => void } | null)?.unref?.(); + (child.stdout as { unref?: () => void } | null)?.unref?.(); + (child.stderr as { unref?: () => void } | null)?.unref?.(); + child.unref(); + + await this.appendDirectProcessRuntimeEvent({ + type: 'process_spawned', + eventsPath: runtimePaths.eventsPath, + pid: runtimePid, + teamName: input.teamName, + agentName: input.configuredMember.name, + agentId, + runId: parentSessionId, + bootstrapRunId: input.run.runId, + source: 'TeamProvisioningService.direct_process_restart', + detail: 'process spawned', + }); + await this.appendDirectProcessRuntimeEvent({ + type: 'stdout_attached', + eventsPath: runtimePaths.eventsPath, + pid: runtimePid, + teamName: input.teamName, + agentName: input.configuredMember.name, + agentId, + runId: parentSessionId, + bootstrapRunId: input.run.runId, + source: 'TeamProvisioningService.direct_process_restart', + detail: 'stdout and stderr attached', + }); + + await this.updateDirectTmuxRestartMemberConfig({ + teamName: input.teamName, + memberName: input.memberName, + member: input.configuredMember, + agentId, + color, + prompt, + paneId: processPaneId, + cwd, + providerId, + joinedAt: Date.now(), + bootstrapExpectedAfter, + backendType: 'process', + runtimePid, + bootstrapRuntimeEventsPath: 
runtimePaths.eventsPath, + bootstrapProofToken, + bootstrapRunId: input.run.runId, + ...(nativeBootstrapSpec + ? { + bootstrapContextHash: nativeBootstrapSpec.contextHash, + bootstrapBriefingHash: nativeBootstrapSpec.briefingHash, + } + : {}), + }); + this.enqueueDirectRestartPrompt({ + teamName: input.teamName, + memberName: input.configuredMember.name, + leadName: input.leadName, + leadSessionId: parentSessionId, + prompt, + }); + await this.appendDirectProcessRuntimeEvent({ + type: 'mailbox_bootstrap_written', + eventsPath: runtimePaths.eventsPath, + pid: runtimePid, + teamName: input.teamName, + agentName: input.configuredMember.name, + agentId, + runId: parentSessionId, + bootstrapRunId: input.run.runId, + source: 'TeamProvisioningService.direct_process_restart', + }); + this.appendMemberBootstrapDiagnostic( + input.run, + input.memberName, + `restart process spawned with pid ${runtimePid}` + ); + this.setMemberSpawnStatus(input.run, input.memberName, 'waiting'); + } + + private getDirectProcessRestartRuntimePaths( + teamName: string, + memberName: string + ): { dir: string; eventsPath: string; stdoutPath: string; stderrPath: string } { + const dir = getTeamRuntimeEventsDir(teamName); + const filePrefix = sanitizeProcessRuntimeEventFilePrefix(memberName); + return { + dir, + eventsPath: path.join(dir, `${filePrefix}.runtime.jsonl`), + stdoutPath: path.join(dir, `${filePrefix}.stdout.log`), + stderrPath: path.join(dir, `${filePrefix}.stderr.log`), + }; + } + + private async materializeDirectProcessNativeBootstrapContext(input: { + teamName: string; + memberName: string; + agentId: string; + providerId: TeamProviderId; + runId: string; + bootstrapProofToken: string; + spec: NativeAppManagedBootstrapSpec | null; + }): Promise> { + if (!input.spec || (input.providerId !== 'anthropic' && input.providerId !== 'codex')) { + return {}; + } + const context = { + ...input.spec, + kind: 'native_app_managed_bootstrap', + teamName: input.teamName, + memberName: 
input.memberName, + agentId: input.agentId, + runId: input.runId, + provider: input.providerId, + bootstrapProofToken: input.bootstrapProofToken, + }; + const dir = path.join(getTeamRuntimeEventsDir(input.teamName), 'native-bootstrap'); + await fs.promises.mkdir(dir, { recursive: true }); + const finalPath = path.join( + dir, + `${sanitizeProcessRuntimeEventFilePrefix(input.memberName)}-${randomUUID()}.native-bootstrap.json` + ); + const tempPath = `${finalPath}.tmp`; + await fs.promises.writeFile(tempPath, JSON.stringify(context), { + encoding: 'utf8', + mode: 0o600, + }); + await fs.promises.rename(tempPath, finalPath); + return { [NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV]: finalPath }; + } + + private async appendDirectProcessRuntimeEvent(input: { + type: string; + eventsPath: string; + pid: number; + teamName: string; + agentName: string; + agentId: string; + runId: string; + bootstrapRunId: string; + source: string; + detail?: string; + }): Promise { + await fs.promises.mkdir(path.dirname(input.eventsPath), { recursive: true }); + await fs.promises.appendFile( + input.eventsPath, + `${JSON.stringify({ + version: 1, + type: input.type, + timestamp: nowIso(), + pid: input.pid, + teamName: input.teamName, + agentName: input.agentName, + agentId: input.agentId, + runId: input.runId, + bootstrapRunId: input.bootstrapRunId, + source: input.source, + ...(input.detail ? 
{ detail: input.detail } : {}), + })}\n`, + { encoding: 'utf8', mode: 0o600 } + ); + } + private getMemberLifecycleOperationKey(teamName: string, memberName: string): string { return `${teamName.trim().toLowerCase()}\u0000${memberName.trim().toLowerCase()}`; } @@ -14408,6 +14859,38 @@ export class TeamProvisioningService { } } + const shouldDirectProcessRestart = backendTypes.has('process') || livePids.size > 0; + if (shouldDirectProcessRestart) { + try { + await this.launchDirectProcessMemberRestart({ + run, + teamName, + displayName: config?.name?.trim() || teamName, + leadName, + memberName, + config, + configuredMember, + persistedRuntimeMembers, + }); + return; + } catch (error) { + run.pendingMemberRestarts.delete(memberName); + this.setMemberSpawnStatus( + run, + memberName, + 'error', + error instanceof Error ? error.message : String(error) + ); + if (run.isLaunch) { + await this.persistLaunchStateSnapshot( + run, + run.provisioningComplete ? 'finished' : 'active' + ); + } + throw error; + } + } + const restartMessage = buildRestartMemberSpawnMessage( teamName, config?.name?.trim() || teamName, @@ -29518,6 +30001,14 @@ export class TeamProvisioningService { } ); run.onProgress(progress); + if (hasSpawnFailures) { + this.writeLaunchFailureArtifactPackBestEffort(run, { + reason: run.isLaunch + ? 'launch_completed_with_teammate_errors' + : 'provisioning_completed_with_teammate_errors', + launchSnapshot: persistedLaunchSnapshot, + }); + } this.provisioningRunByTeam.delete(run.teamName); this.aliveRunByTeam.set(run.teamName, run.runId); logger.info(`[${run.teamName}] Provisioning complete. 
Process alive for subsequent tasks.`); @@ -30111,6 +30602,18 @@ export class TeamProvisioningService { }); void this.persistLaunchStateSnapshot(run, 'finished'); } + if ( + !hasNewerTrackedRun && + (run.progress.state === 'failed' || + (run.isLaunch && !run.provisioningComplete && !run.cancelRequested)) + ) { + this.writeLaunchFailureArtifactPackBestEffort(run, { + reason: + run.progress.state === 'failed' + ? 'launch_progress_failed' + : 'launch_cleanup_unconfirmed_bootstrap', + }); + } this.resetRuntimeToolActivity(run); this.setLeadActivity(run, 'offline'); run.pendingDirectCrossTeamSendRefresh = false; diff --git a/test/main/services/team/MixedProviderTeamLaunch.live.test.ts b/test/main/services/team/MixedProviderTeamLaunch.live.test.ts index d6515e2b..ab85879f 100644 --- a/test/main/services/team/MixedProviderTeamLaunch.live.test.ts +++ b/test/main/services/team/MixedProviderTeamLaunch.live.test.ts @@ -10,6 +10,7 @@ import { getTeamsBasePath, setClaudeBasePathOverride, } from '../../../../src/main/utils/pathDecoder'; +import { killProcessByPid } from '../../../../src/main/utils/processKill'; import { createOpenCodeLiveHarness, waitForOpenCodeLanesStopped, @@ -131,8 +132,7 @@ liveDescribe('Mixed provider team launch live e2e', () => { afterEach(async () => { const keepProcesses = process.env.MIXED_PROVIDER_TEAM_LIVE_KEEP_PROCESSES === '1'; if (!keepProcesses && harness && teamName) { - await harness.svc.stopTeam(teamName).catch(() => undefined); - await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined); + await cleanupMixedProviderSmokeTeam(harness, teamName); } if (!keepProcesses && usingAnthropicSubscriptionAuth && teamName) { await fs.rm(path.join(getTeamsBasePath(), teamName), { recursive: true, force: true }); @@ -531,6 +531,51 @@ async function removeTempDirWithRetries(dirPath: string): Promise { } } +async function cleanupMixedProviderSmokeTeam( + harness: OpenCodeLiveHarness, + teamName: string +): Promise { + const beforeStopSnapshot 
= await harness.svc + .getTeamAgentRuntimeSnapshot(teamName) + .catch(() => null); + await harness.svc.stopTeam(teamName).catch(() => undefined); + await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined); + await terminateSmokeOwnedProcessBackends(beforeStopSnapshot); + const afterStopSnapshot = await harness.svc + .getTeamAgentRuntimeSnapshot(teamName) + .catch(() => null); + await terminateSmokeOwnedProcessBackends(afterStopSnapshot); +} + +async function terminateSmokeOwnedProcessBackends( + snapshot: Awaited> | null +): Promise { + const pids = new Set(); + for (const member of Object.values(snapshot?.members ?? {})) { + if (member.backendType !== 'process' || member.providerId === 'opencode') { + continue; + } + const pid = member.runtimePid ?? member.pid; + if (typeof pid === 'number' && Number.isFinite(pid) && pid > 0) { + pids.add(pid); + } + } + await Promise.all( + Array.from(pids).map(async (pid) => { + try { + process.kill(pid, 0); + } catch { + return; + } + try { + killProcessByPid(pid); + } catch { + // Best-effort smoke cleanup. The process may have exited between the liveness probe and kill. 
+ } + }) + ); +} + function formatProgressDump(progressEvents: TeamProvisioningProgress[]): string { return redactSecrets( progressEvents diff --git a/test/main/services/team/ProviderLaunchStress.live-e2e.test.ts b/test/main/services/team/ProviderLaunchStress.live-e2e.test.ts new file mode 100644 index 00000000..eb9ecbd4 --- /dev/null +++ b/test/main/services/team/ProviderLaunchStress.live-e2e.test.ts @@ -0,0 +1,860 @@ +// @vitest-environment node +import { constants as fsConstants, promises as fs } from 'node:fs'; +import * as os from 'node:os'; +import * as path from 'node:path'; + +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import { createOpenCodeLiveHarness, waitForOpenCodeLanesStopped, waitUntil } from './openCodeLiveTestHarness'; +import { + getTasksBasePath, + getTeamsBasePath, + setClaudeBasePathOverride, +} from '../../../../src/main/utils/pathDecoder'; +import { killProcessByPid } from '../../../../src/main/utils/processKill'; +import { TeamDataService } from '../../../../src/main/services/team/TeamDataService'; +import { TeamProvisioningService } from '../../../../src/main/services/team/TeamProvisioningService'; +import { TeamTaskReader } from '../../../../src/main/services/team/TeamTaskReader'; + +import type { + TeamAgentRuntimeSnapshot, + TeamCreateRequest, + TeamMember, + TeamProviderId, + TeamProvisioningProgress, +} from '../../../../src/shared/types'; + +vi.mock('../../../../src/main/services/infrastructure/NotificationManager', () => ({ + NotificationManager: { + getInstance: () => ({ + addTeamNotification: vi.fn(async () => undefined), + }), + }, +})); + +const liveDescribe = + process.env.PROVIDER_LAUNCH_STRESS_LIVE === '1' && hasAnthropicAuthConfigured() + ? 
describe + : describe.skip; + +const DEFAULT_ORCHESTRATOR_CLI = '/Users/belief/dev/projects/claude/agent_teams_orchestrator/cli'; +const DEFAULT_ANTHROPIC_MODEL = 'haiku'; +const DEFAULT_CODEX_MODEL = 'gpt-5.4-mini'; +const DEFAULT_CODEX_EFFORT = 'low' as const; +const DEFAULT_OPENCODE_MODEL = 'openai/gpt-5.4-mini'; +const DEFAULT_ORDER: ProviderLaunchStressScenario[] = ['anthropic', 'codex', 'opencode', 'mixed']; +const MEMBER_NAMES = ['alice', 'bob', 'jack', 'tom', 'atlas', 'nova', 'cody', 'oscar']; +const RESTART_CONFIRM_TIMEOUT_MS = 300_000; +const POST_LAUNCH_WORK_TIMEOUT_MS = 300_000; +let currentStressTempDir = ''; +let currentStressProjectPath = ''; + +type ProviderLaunchStressScenario = 'anthropic' | 'codex' | 'opencode' | 'mixed'; + +interface ActiveScenario { + scenario: ProviderLaunchStressScenario; + teamName: string; + svc: TeamProvisioningService; + harness?: Awaited>; + codexCleanup?: () => Promise; + failed: boolean; +} + +liveDescribe('provider launch stress live e2e', () => { + let tempDir: string; + let tempClaudeRoot: string; + let tempHome: string; + let projectPath: string; + let previousCliPath: string | undefined; + let previousCliFlavor: string | undefined; + let previousCodexHome: string | undefined; + let previousHome: string | undefined; + let previousUserProfile: string | undefined; + let previousNodeEnv: string | undefined; + let previousAnthropicApiKey: string | undefined; + let previousAnthropicAuthToken: string | undefined; + let previousClaudeJsonConfig: string | null | undefined; + const activeScenarios: ActiveScenario[] = []; + + beforeEach(async () => { + tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'provider-launch-stress-live-')); + tempClaudeRoot = usingAnthropicSubscriptionAuth() + ? 
os.userInfo().homedir + : path.join(tempDir, '.claude'); + tempHome = path.join(tempDir, 'home'); + projectPath = path.join(tempDir, 'project'); + currentStressTempDir = tempDir; + currentStressProjectPath = projectPath; + await fs.mkdir(tempHome, { recursive: true }); + await fs.mkdir(projectPath, { recursive: true }); + await fs.writeFile( + path.join(projectPath, 'README.md'), + '# Provider launch stress live e2e\n\nKeep this project intentionally tiny.\n', + 'utf8' + ); + + if (usingAnthropicSubscriptionAuth()) { + setClaudeBasePathOverride(null); + previousClaudeJsonConfig = await upsertTrustedClaudeProjectConfig( + tempClaudeRoot, + projectPath + ); + } else { + await fs.mkdir(tempClaudeRoot, { recursive: true }); + await writeTrustedClaudeConfig(tempClaudeRoot, projectPath); + setClaudeBasePathOverride(tempClaudeRoot); + previousClaudeJsonConfig = undefined; + } + + previousCliPath = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH; + previousCliFlavor = process.env.CLAUDE_TEAM_CLI_FLAVOR; + previousCodexHome = process.env.CODEX_HOME; + previousHome = process.env.HOME; + previousUserProfile = process.env.USERPROFILE; + previousNodeEnv = process.env.NODE_ENV; + previousAnthropicApiKey = process.env.ANTHROPIC_API_KEY; + previousAnthropicAuthToken = process.env.ANTHROPIC_AUTH_TOKEN; + + process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH = + process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim() || DEFAULT_ORCHESTRATOR_CLI; + process.env.CLAUDE_TEAM_CLI_FLAVOR = 'agent_teams_orchestrator'; + process.env.CODEX_HOME = resolveConnectedCodexHome(previousCodexHome); + process.env.HOME = usingAnthropicSubscriptionAuth() ? os.userInfo().homedir : tempHome; + process.env.USERPROFILE = usingAnthropicSubscriptionAuth() ? 
os.userInfo().homedir : tempHome; + process.env.NODE_ENV = 'production'; + if (usingAnthropicSubscriptionAuth()) { + delete process.env.ANTHROPIC_API_KEY; + delete process.env.ANTHROPIC_AUTH_TOKEN; + } + }); + + afterEach(async () => { + for (const active of [...activeScenarios].reverse()) { + await cleanupActiveScenario(active, { preserveFiles: active.failed }).catch(() => undefined); + } + activeScenarios.length = 0; + discardKnownProviderLaunchStressWarnings(); + + if (usingAnthropicSubscriptionAuth() && previousClaudeJsonConfig !== undefined) { + await restoreClaudeJsonConfig(tempClaudeRoot, previousClaudeJsonConfig); + } + setClaudeBasePathOverride(null); + + restoreEnv('CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH', previousCliPath); + restoreEnv('CLAUDE_TEAM_CLI_FLAVOR', previousCliFlavor); + restoreEnv('CODEX_HOME', previousCodexHome); + restoreEnv('HOME', previousHome); + restoreEnv('USERPROFILE', previousUserProfile); + restoreEnv('NODE_ENV', previousNodeEnv); + restoreEnv('ANTHROPIC_API_KEY', previousAnthropicApiKey); + restoreEnv('ANTHROPIC_AUTH_TOKEN', previousAnthropicAuthToken); + + if (process.env.PROVIDER_LAUNCH_STRESS_KEEP_TEMP === '1') { + process.stderr.write(`[ProviderLaunchStress.live] preserved temp dir: ${tempDir}\n`); + } else { + await fs.rm(tempDir, { recursive: true, force: true }); + } + currentStressTempDir = ''; + currentStressProjectPath = ''; + }, 240_000); + + it( + 'launches, restarts, and exercises post-launch work for provider teams with five teammates each', + async () => { + const orchestratorCli = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim(); + expect(orchestratorCli).toBeTruthy(); + await assertExecutable(orchestratorCli!); + await assertCodexSubscriptionAuthAvailable(process.env.CODEX_HOME!); + + for (const scenario of getStressOrder()) { + await runProviderStressScenario(scenario, activeScenarios); + } + }, + 30 * 60_000 + ); +}); + +async function runProviderStressScenario( + scenario: 
ProviderLaunchStressScenario, + activeScenarios: ActiveScenario[] +): Promise { + const selected = resolveScenarioSelection(scenario); + const memberCount = getStressMemberCount(); + const teamName = `provider-stress-${scenario}-${Date.now()}`; + const progressEvents: TeamProvisioningProgress[] = []; + process.stderr.write( + `[ProviderLaunchStress.live] starting ${scenario} with ${memberCount} teammates\n` + ); + let codexCleanup: (() => Promise) | undefined; + let harness: Awaited> | undefined; + try { + codexCleanup = + scenario === 'codex' || scenario === 'mixed' ? await installCodexAccountFeature() : undefined; + harness = + scenario === 'opencode' || scenario === 'mixed' + ? await createOpenCodeLiveHarness({ + tempDir: currentStressTempDir, + selectedModel: selected.openCodeModel, + projectPath: projectPathForStress(), + }) + : undefined; + } catch (error) { + await harness?.dispose().catch(() => undefined); + await codexCleanup?.().catch(() => undefined); + throw error; + } + const svc = harness?.svc ?? 
new TeamProvisioningService(); + const active: ActiveScenario = { scenario, teamName, svc, harness, codexCleanup, failed: false }; + activeScenarios.push(active); + + try { + await svc.createTeam( + buildStressCreateRequest({ + scenario, + teamName, + memberCount, + selection: selected, + }), + (progress) => progressEvents.push(progress) + ); + + await waitUntil(async () => { + const last = progressEvents.at(-1); + if (last?.state === 'failed') { + active.failed = true; + throw new Error(await formatStressDiagnostics(svc, teamName, progressEvents)); + } + return last?.state === 'ready'; + }, 420_000); + + const expectedMembers = buildExpectedMemberNames(memberCount); + await waitUntil(async () => { + const statuses = await svc.getMemberSpawnStatuses(teamName); + if (statuses.teamLaunchState === 'partial_failure') { + active.failed = true; + throw new Error(await formatStressDiagnostics(svc, teamName, progressEvents)); + } + return expectedMembers.every((memberName) => { + const entry = statuses.statuses[memberName]; + return ( + entry?.status === 'online' && + entry.launchState === 'confirmed_alive' && + entry.bootstrapConfirmed === true + ); + }); + }, 240_000); + + await waitUntil(async () => { + const snapshot = await svc.getTeamAgentRuntimeSnapshot(teamName); + return expectedMembers.every((memberName) => snapshot.members[memberName]?.alive === true); + }, 180_000); + process.stderr.write(`[ProviderLaunchStress.live] ${scenario} confirmed all teammates\n`); + + await runRestartStressChecks(active, expectedMembers, progressEvents); + await runPostLaunchWorkProofCheck(active, expectedMembers, progressEvents); + } catch (error) { + active.failed = true; + throw error; + } finally { + if (!active.failed) { + await cleanupActiveScenario(active, { preserveFiles: false }); + const index = activeScenarios.indexOf(active); + if (index >= 0) activeScenarios.splice(index, 1); + } + } +} + +async function runRestartStressChecks( + active: ActiveScenario, + expectedMembers: 
string[], + progressEvents: TeamProvisioningProgress[] +): Promise { + const targets = resolveRestartStressTargets(active.scenario, expectedMembers); + for (const memberName of targets) { + process.stderr.write( + `[ProviderLaunchStress.live] restarting ${active.scenario}/${memberName}\n` + ); + try { + await active.svc.restartMember(active.teamName, memberName); + await waitForStressCondition( + `restart ${active.teamName}/${memberName}`, + async () => { + const statuses = await active.svc.getMemberSpawnStatuses(active.teamName); + const entry = statuses.statuses[memberName]; + if (entry?.status === 'error' || entry?.launchState === 'failed_to_start') { + throw new Error( + `restart ${memberName} failed: ${entry.hardFailureReason ?? entry.error ?? 'unknown'}` + ); + } + return ( + entry?.status === 'online' && + entry.launchState === 'confirmed_alive' && + entry.bootstrapConfirmed === true + ); + }, + RESTART_CONFIRM_TIMEOUT_MS, + 2_000, + () => formatStressDiagnostics(active.svc, active.teamName, progressEvents) + ); + await waitForStressCondition( + `runtime alive after restart ${active.teamName}/${memberName}`, + async () => { + const snapshot = await active.svc.getTeamAgentRuntimeSnapshot(active.teamName); + return snapshot.members[memberName]?.alive === true; + }, + 120_000, + 2_000, + () => formatStressDiagnostics(active.svc, active.teamName, progressEvents) + ); + } catch (error) { + throw new Error( + `Restart stress failed for ${active.scenario}/${memberName}: ${error instanceof Error ? 
error.message : String(error)}` + ); + } + } + + await waitForStressCondition( + `all teammates still confirmed after restarts ${active.teamName}`, + async () => { + const statuses = await active.svc.getMemberSpawnStatuses(active.teamName); + return expectedMembers.every((memberName) => { + const entry = statuses.statuses[memberName]; + return ( + entry?.status === 'online' && + entry.launchState === 'confirmed_alive' && + entry.bootstrapConfirmed === true + ); + }); + }, + 120_000, + 2_000, + () => formatStressDiagnostics(active.svc, active.teamName, progressEvents) + ); + process.stderr.write(`[ProviderLaunchStress.live] ${active.scenario} restart checks passed\n`); +} + +async function runPostLaunchWorkProofCheck( + active: ActiveScenario, + expectedMembers: string[], + progressEvents: TeamProvisioningProgress[] +): Promise { + const memberName = resolvePostLaunchWorkTarget(active.scenario, expectedMembers); + const marker = `provider-stress-${active.scenario}-${Date.now()}`; + const teamDataService = new TeamDataService(); + const taskReader = new TeamTaskReader(); + + process.stderr.write( + `[ProviderLaunchStress.live] sending post-launch work probe to ${active.scenario}/${memberName}\n` + ); + const task = await teamDataService.createTask(active.teamName, { + subject: `Provider launch stress proof ${marker}`, + owner: memberName, + startImmediately: true, + prompt: [ + `This is a live provider launch stress validation. Marker: ${marker}.`, + 'Do not edit files.', + 'Add one task comment containing exactly:', + `${marker}:done`, + 'Then mark this task complete.', + 'After that stop. 
Do not send a separate user-visible chat reply.', + ].join('\n'), + }); + + const relay = await active.svc.relayInboxFileToLiveRecipient(active.teamName, memberName); + if (!isAcceptedStressRelayResult(relay)) { + throw new Error( + `Post-launch work probe was not relayed to ${memberName}; relay result: ${JSON.stringify(relay)}` + ); + } + + await waitForStressCondition( + `post-launch work proof ${active.teamName}/${memberName}/${task.id}`, + async () => { + const tasks = await taskReader.getTasks(active.teamName); + const current = tasks.find((candidate) => candidate.id === task.id); + if (!current) return false; + const hasMarkerComment = current.comments?.some((comment) => + comment.text.includes(`${marker}:done`) + ); + return Boolean(hasMarkerComment || current.status === 'completed'); + }, + POST_LAUNCH_WORK_TIMEOUT_MS, + 2_000, + () => formatStressDiagnostics(active.svc, active.teamName, progressEvents) + ); + process.stderr.write(`[ProviderLaunchStress.live] ${active.scenario} post-launch work passed\n`); +} + +function isAcceptedStressRelayResult(relay: Awaited< + ReturnType +>): boolean { + if (relay.kind === 'native_member_noop') return true; + if (relay.relayed > 0) return true; + const lastDelivery = relay.lastDelivery; + return Boolean( + lastDelivery && + (lastDelivery.accepted === true || + lastDelivery.delivered === true || + lastDelivery.responsePending === true) + ); +} + +function resolveRestartStressTargets( + scenario: ProviderLaunchStressScenario, + expectedMembers: string[] +): string[] { + if (expectedMembers.length === 0) return []; + // Pure OpenCode launch can finish without a tracked lead run. Per-member + // restart for OpenCode is covered by the mixed secondary-lane scenario, + // where the app owns the live run and can reattach the OpenCode lane. + if (scenario === 'opencode') return []; + if (scenario !== 'mixed') { + return [expectedMembers[1] ?? 
expectedMembers[0]]; + } + + const targets: string[] = []; + const wantedProviders: TeamProviderId[] = ['anthropic', 'codex', 'opencode']; + for (const providerId of wantedProviders) { + const index = expectedMembers.findIndex( + (_memberName, memberIndex) => resolveStressMemberProvider('mixed', memberIndex) === providerId + ); + if (index >= 0) targets.push(expectedMembers[index]!); + } + return targets; +} + +function resolvePostLaunchWorkTarget( + scenario: ProviderLaunchStressScenario, + expectedMembers: string[] +): string { + if (scenario === 'mixed') { + const openCodeIndex = expectedMembers.findIndex( + (_memberName, memberIndex) => + resolveStressMemberProvider('mixed', memberIndex) === 'opencode' + ); + if (openCodeIndex >= 0) return expectedMembers[openCodeIndex]!; + } + return expectedMembers[1] ?? expectedMembers[0] ?? 'alice'; +} + +async function waitForStressCondition( + label: string, + predicate: () => Promise, + timeoutMs: number, + pollMs: number, + diagnostics: () => Promise +): Promise { + const deadline = Date.now() + timeoutMs; + let lastError: unknown; + while (Date.now() < deadline) { + try { + if (await predicate()) return; + lastError = undefined; + } catch (error) { + lastError = error; + break; + } + await new Promise((resolve) => setTimeout(resolve, pollMs)); + } + + const suffix = lastError + ? `\nLast error: ${lastError instanceof Error ? lastError.message : String(lastError)}` + : ''; + throw new Error( + `Timed out waiting for ${label} after ${timeoutMs}ms${suffix}\n${await diagnostics()}` + ); +} + +function discardKnownProviderLaunchStressWarnings(): void { + const warn = vi.mocked(console.warn); + if (!warn.mock) return; + const calls = warn.mock.calls; + for (let index = calls.length - 1; index >= 0; index -= 1) { + const text = calls[index]?.map((value) => String(value)).join(' ') ?? 
''; + if (text.includes('Failed to resolve login shell env: shell env resolve timeout')) { + calls.splice(index, 1); + } + } +} + +function buildStressCreateRequest(input: { + scenario: ProviderLaunchStressScenario; + teamName: string; + memberCount: number; + selection: ReturnType; +}): TeamCreateRequest { + const members = buildStressMembers(input.scenario, input.memberCount, input.selection); + const providerId: TeamProviderId = input.scenario === 'mixed' ? 'anthropic' : input.scenario; + return { + teamName: input.teamName, + cwd: projectPathForStress(), + providerId, + providerBackendId: providerId === 'codex' ? 'codex-native' : undefined, + model: + providerId === 'codex' + ? input.selection.codexModel + : providerId === 'opencode' + ? input.selection.openCodeModel + : input.selection.anthropicModel, + effort: providerId === 'codex' ? input.selection.codexEffort : undefined, + fastMode: providerId === 'codex' ? 'off' : undefined, + skipPermissions: true, + prompt: 'Keep the team idle after bootstrap. Do not start extra work.', + members, + }; +} + +function buildStressMembers( + scenario: ProviderLaunchStressScenario, + memberCount: number, + selection: ReturnType +): TeamMember[] { + const names = buildExpectedMemberNames(memberCount); + return names.map((name, index) => { + const providerId = resolveStressMemberProvider(scenario, index); + return { + name, + role: index % 2 === 0 ? 'Developer' : 'Reviewer', + providerId, + providerBackendId: providerId === 'codex' ? 'codex-native' : undefined, + model: + providerId === 'codex' + ? selection.codexModel + : providerId === 'opencode' + ? selection.openCodeModel + : selection.anthropicModel, + effort: providerId === 'codex' ? selection.codexEffort : undefined, + fastMode: providerId === 'codex' ? 
'off' : undefined, + }; + }); +} + +function resolveStressMemberProvider( + scenario: ProviderLaunchStressScenario, + index: number +): TeamProviderId { + if (scenario !== 'mixed') return scenario; + const providers: TeamProviderId[] = ['anthropic', 'codex', 'opencode', 'anthropic', 'codex']; + return providers[index % providers.length] ?? 'anthropic'; +} + +function resolveScenarioSelection(scenario: ProviderLaunchStressScenario): { + anthropicModel: string; + codexModel: string; + codexEffort: 'low' | 'medium' | 'high' | 'xhigh'; + openCodeModel: string; +} { + return { + anthropicModel: + process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL?.trim() || DEFAULT_ANTHROPIC_MODEL, + codexModel: process.env.PROVIDER_LAUNCH_STRESS_CODEX_MODEL?.trim() || DEFAULT_CODEX_MODEL, + codexEffort: (process.env.PROVIDER_LAUNCH_STRESS_CODEX_EFFORT?.trim() || + DEFAULT_CODEX_EFFORT) as 'low' | 'medium' | 'high' | 'xhigh', + openCodeModel: + process.env.PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL?.trim() || DEFAULT_OPENCODE_MODEL, + }; +} + +function getStressMemberCount(): number { + const parsed = Number.parseInt(process.env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT ?? '5', 10); + return Number.isFinite(parsed) && parsed > 0 + ? Math.min(parsed, MEMBER_NAMES.length) + : 5; +} + +function buildExpectedMemberNames(memberCount: number): string[] { + return MEMBER_NAMES.slice(0, memberCount); +} + +function getStressOrder(): ProviderLaunchStressScenario[] { + const raw = process.env.PROVIDER_LAUNCH_STRESS_ORDER?.trim(); + if (!raw) return DEFAULT_ORDER; + const parsed = raw + .split(',') + .map((item) => item.trim()) + .filter((item): item is ProviderLaunchStressScenario => + ['anthropic', 'codex', 'opencode', 'mixed'].includes(item) + ); + return parsed.length > 0 ? 
parsed : DEFAULT_ORDER; +} + +function projectPathForStress(): string { + const explicit = process.env.PROVIDER_LAUNCH_STRESS_PROJECT_PATH?.trim(); + if (explicit) return path.resolve(explicit); + if (!currentStressProjectPath) { + throw new Error('Provider launch stress project path requested before test setup'); + } + return currentStressProjectPath; +} + +async function cleanupActiveScenario( + active: ActiveScenario, + options: { preserveFiles: boolean } +): Promise { + const beforeStopSnapshot = await active.svc + .getTeamAgentRuntimeSnapshot(active.teamName) + .catch(() => null); + await active.svc.stopTeam(active.teamName).catch(() => undefined); + if (active.harness) { + await waitForOpenCodeLanesStopped(active.teamName, 90_000).catch(() => undefined); + } + await terminateProcessBackends(beforeStopSnapshot); + const afterStopSnapshot = await active.svc + .getTeamAgentRuntimeSnapshot(active.teamName) + .catch(() => null); + await terminateProcessBackends(afterStopSnapshot); + await active.harness?.dispose().catch(() => undefined); + await active.codexCleanup?.().catch(() => undefined); + if (!options.preserveFiles) { + await fs.rm(path.join(getTeamsBasePath(), active.teamName), { recursive: true, force: true }); + await fs.rm(path.join(getTasksBasePath(), active.teamName), { recursive: true, force: true }); + } +} + +async function terminateProcessBackends(snapshot: TeamAgentRuntimeSnapshot | null): Promise { + const pids = new Set(); + for (const member of Object.values(snapshot?.members ?? {})) { + if (member.backendType !== 'process' || member.providerId === 'opencode') continue; + const pid = member.runtimePid ?? member.pid; + if (typeof pid === 'number' && Number.isFinite(pid) && pid > 0) pids.add(pid); + } + for (const pid of pids) { + try { + process.kill(pid, 0); + killProcessByPid(pid); + } catch { + // Best-effort cleanup; the runtime may already be gone. 
+ } + } +} + +async function installCodexAccountFeature(): Promise<() => Promise> { + const [{ createCodexAccountFeature }, { ProviderConnectionService }] = await Promise.all([ + import('../../../../src/features/codex-account/main/composition/createCodexAccountFeature'), + import('../../../../src/main/services/runtime/ProviderConnectionService'), + ]); + const feature = createCodexAccountFeature({ + logger: { + info: () => undefined, + warn: () => undefined, + error: () => undefined, + }, + configManager: { + getConfig: () => ({ + providerConnections: { + codex: { + preferredAuthMode: 'chatgpt' as const, + }, + }, + }), + }, + }); + const providerConnectionService = ProviderConnectionService.getInstance(); + providerConnectionService.setCodexAccountFeature(feature); + return async () => { + providerConnectionService.setCodexAccountFeature(null); + await feature.dispose().catch(() => undefined); + }; +} + +async function formatStressDiagnostics( + svc: TeamProvisioningService, + teamName: string, + progressEvents: TeamProvisioningProgress[] +): Promise { + const [spawnStatuses, runtimeSnapshot, artifact] = await Promise.all([ + svc.getMemberSpawnStatuses(teamName).catch((error) => ({ error: String(error) })), + svc.getTeamAgentRuntimeSnapshot(teamName).catch((error) => ({ error: String(error) })), + readLatestArtifactManifest(teamName), + ]); + return redactSecrets( + JSON.stringify( + { + progress: progressEvents.map((progress) => ({ + state: progress.state, + message: progress.message, + messageSeverity: progress.messageSeverity, + error: progress.error, + launchDiagnostics: progress.launchDiagnostics, + })), + spawnStatuses, + runtimeSnapshot, + artifact, + }, + null, + 2 + ) + ); +} + +async function readLatestArtifactManifest(teamName: string): Promise { + try { + const latest = JSON.parse( + await fs.readFile( + path.join(getTeamsBasePath(), teamName, 'launch-failure-artifacts', 'latest.json'), + 'utf8' + ) + ) as { manifestPath?: unknown }; + if (typeof 
latest.manifestPath !== 'string') return latest; + return JSON.parse(await fs.readFile(latest.manifestPath, 'utf8')); + } catch { + return null; + } +} + +function hasAnthropicAuthConfigured(): boolean { + return usingAnthropicSubscriptionAuth() || Boolean(process.env.ANTHROPIC_API_KEY?.trim()); +} + +function usingAnthropicSubscriptionAuth(): boolean { + const mode = process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH?.trim().toLowerCase(); + return mode === 'subscription' || mode === 'oauth'; +} + +async function assertExecutable(filePath: string): Promise { + await fs.access(filePath, fsConstants.X_OK); +} + +async function assertCodexSubscriptionAuthAvailable(codexHome: string): Promise { + const legacyAuthPath = path.join(codexHome, 'auth.json'); + if (await pathReadable(legacyAuthPath)) { + const legacyAuth = await readJsonObject(legacyAuthPath); + if (isCodexChatGptSubscriptionAuth(legacyAuth)) return; + } + + const accountsDir = path.join(codexHome, 'accounts'); + const registry = await readJsonObject(path.join(accountsDir, 'registry.json')).catch(() => null); + const activeAccountId = + readStringProperty(registry, 'active_account_id') ?? + readStringProperty(registry, 'activeAccountId') ?? + readStringProperty(registry, 'current_account_id') ?? 
+ readStringProperty(registry, 'currentAccountId'); + + const candidates = new Set(); + if (activeAccountId) { + candidates.add(path.join(accountsDir, `${activeAccountId}.auth.json`)); + candidates.add(path.join(accountsDir, activeAccountId)); + } + const entries = await fs.readdir(accountsDir).catch(() => []); + for (const entry of entries) { + if (entry.endsWith('.auth.json')) candidates.add(path.join(accountsDir, entry)); + } + for (const candidate of candidates) { + const auth = await readJsonObject(candidate).catch(() => null); + if (isCodexChatGptSubscriptionAuth(auth)) return; + } + throw new Error(`Codex subscription auth not found in ${codexHome}`); +} + +async function pathReadable(filePath: string): Promise { + try { + await fs.access(filePath, fsConstants.R_OK); + return true; + } catch { + return false; + } +} + +async function readJsonObject(filePath: string): Promise> { + const parsed = JSON.parse(await fs.readFile(filePath, 'utf8')); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error(`Expected JSON object in ${filePath}`); + } + return parsed as Record; +} + +function readStringProperty(source: Record | null, key: string): string | null { + const value = source?.[key]; + return typeof value === 'string' && value.trim() ? value.trim() : null; +} + +function isCodexChatGptSubscriptionAuth(source: Record | null): boolean { + if (!source) return false; + const direct = readStringProperty(source, 'refresh_token'); + const tokens = source.tokens; + const nested = + tokens && typeof tokens === 'object' && !Array.isArray(tokens) + ? 
readStringProperty(tokens as Record, 'refresh_token') + : null; + return Boolean(direct || nested); +} + +function resolveConnectedCodexHome(previousCodexHome: string | undefined): string { + const explicit = process.env.PROVIDER_LAUNCH_STRESS_CODEX_HOME?.trim(); + if (explicit) return path.resolve(explicit); + const previous = previousCodexHome?.trim(); + if (previous) return path.resolve(previous); + return path.join(os.userInfo().homedir, '.codex'); +} + +async function writeTrustedClaudeConfig(configDir: string, projectPath: string): Promise { + const normalizedProjectPath = path.normalize(await fs.realpath(projectPath)).replace(/\\/g, '/'); + const approvedApiKeySuffix = process.env.ANTHROPIC_API_KEY?.trim().slice(-20); + const config: { + projects: Record; + customApiKeyResponses?: { approved: string[]; rejected: string[] }; + } = { + projects: { + [normalizedProjectPath]: { + hasTrustDialogAccepted: true, + }, + }, + }; + if (approvedApiKeySuffix) { + config.customApiKeyResponses = { approved: [approvedApiKeySuffix], rejected: [] }; + } + await fs.writeFile( + path.join(configDir, '.claude.json'), + `${JSON.stringify(config, null, 2)}\n`, + 'utf8' + ); +} + +async function upsertTrustedClaudeProjectConfig( + configDir: string, + projectPath: string +): Promise { + const configPath = path.join(configDir, '.claude.json'); + const previous = await fs.readFile(configPath, 'utf8').catch((error) => { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') return null; + throw error; + }); + const existing = previous ? (JSON.parse(previous) as Record) : {}; + const normalizedProjectPath = path.normalize(await fs.realpath(projectPath)).replace(/\\/g, '/'); + const projects = + existing.projects && typeof existing.projects === 'object' && !Array.isArray(existing.projects) + ? 
{ ...(existing.projects as Record) } + : {}; + const current = + projects[normalizedProjectPath] && + typeof projects[normalizedProjectPath] === 'object' && + !Array.isArray(projects[normalizedProjectPath]) + ? (projects[normalizedProjectPath] as Record) + : {}; + projects[normalizedProjectPath] = { ...current, hasTrustDialogAccepted: true }; + await fs.writeFile(configPath, `${JSON.stringify({ ...existing, projects }, null, 2)}\n`, 'utf8'); + return previous; +} + +async function restoreClaudeJsonConfig(configDir: string, previous: string | null): Promise { + const configPath = path.join(configDir, '.claude.json'); + if (previous === null) { + await fs.rm(configPath, { force: true }); + } else { + await fs.writeFile(configPath, previous, 'utf8'); + } +} + +function restoreEnv(name: string, previous: string | undefined): void { + if (previous === undefined) { + delete process.env[name]; + } else { + process.env[name] = previous; + } +} + +function redactSecrets(text: string): string { + return text + .replace(/sk-ant-api03-[A-Za-z0-9_-]+/g, '') + .replace(/\b(?:sk|ak)-[A-Za-z0-9_-]{20,}\b/g, ''); +} diff --git a/test/main/services/team/TeamLaunchFailureArtifactPack.test.ts b/test/main/services/team/TeamLaunchFailureArtifactPack.test.ts new file mode 100644 index 00000000..e82bbf18 --- /dev/null +++ b/test/main/services/team/TeamLaunchFailureArtifactPack.test.ts @@ -0,0 +1,197 @@ +import * as fs from 'fs/promises'; +import * as os from 'os'; +import * as path from 'path'; + +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { + classifyLaunchFailureArtifact, + extractLaunchBootstrapTransportBreadcrumb, + redactLaunchFailureArtifactText, + writeTeamLaunchFailureArtifactPack, +} from '../../../../src/main/services/team/TeamLaunchFailureArtifactPack'; +import { + getTeamsBasePath, + setClaudeBasePathOverride, +} from '../../../../src/main/utils/pathDecoder'; + +describe('TeamLaunchFailureArtifactPack', () => { + let tempRoot: string; + + 
beforeEach(async () => { + tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'team-launch-artifact-pack-')); + setClaudeBasePathOverride(path.join(tempRoot, '.claude')); + }); + + afterEach(async () => { + setClaudeBasePathOverride(null); + await fs.rm(tempRoot, { recursive: true, force: true }); + }); + + it('writes a bounded redacted launch failure artifact pack with known launch files', async () => { + const teamName = 'artifact-team'; + const runId = 'run-secret-1'; + const teamDir = path.join(getTeamsBasePath(), teamName); + await fs.mkdir(path.join(teamDir, '.bootstrap.lock'), { recursive: true }); + await fs.writeFile( + path.join(teamDir, 'launch-state.json'), + JSON.stringify({ + teamName, + runId, + secret: 'sk-ant-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', + token: 'abcdefghijklmnopqrstuvwxyz123456', + }), + 'utf8' + ); + await fs.writeFile(path.join(teamDir, 'launch-summary.json'), '{"summary":true}\n', 'utf8'); + await fs.writeFile(path.join(teamDir, 'bootstrap-state.json'), '{"bootstrap":true}\n', 'utf8'); + await fs.writeFile( + path.join(teamDir, 'bootstrap-journal.jsonl'), + '{"event":"started"}\n', + 'utf8' + ); + await fs.writeFile( + path.join(teamDir, '.bootstrap.lock', 'metadata.json'), + '{"pid":123,"runId":"run-secret-1"}\n', + 'utf8' + ); + + const result = await writeTeamLaunchFailureArtifactPack({ + teamName, + runId, + reason: 'launch_progress_failed', + startedAt: '2026-05-09T00:00:00.000Z', + cwd: '/repo', + pid: 123, + providerId: 'anthropic', + model: 'claude-opus', + expectedMembers: ['alice'], + effectiveMembers: [{ name: 'alice', role: 'developer', provider: 'anthropic' } as never], + progress: { + runId, + teamName, + state: 'failed', + message: 'Launch failed', + startedAt: '2026-05-09T00:00:00.000Z', + updatedAt: '2026-05-09T00:01:00.000Z', + error: + 'Authentication failed: ANTHROPIC_API_KEY=sk-ant-bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb', + }, + memberSpawnStatuses: { + alice: { + status: 'error', + launchState: 
'failed_to_start', + hardFailureReason: 'bootstrap timeout', + updatedAt: '2026-05-09T00:01:00.000Z', + }, + }, + cliLogs: 'stderr OPENAI_API_KEY=sk-proj-cccccccccccccccccccccccccccccccccccccccc', + progressTraceLines: ['[failed] launch failed'], + runtimeAdapterTraceLines: ['runtime trace'], + }); + + const manifest = JSON.parse(await fs.readFile(result.manifestPath, 'utf8')) as { + reason: string; + artifactFiles: string[]; + classification: { code: string }; + bootstrapTransportBreadcrumb: { lastTransportStage: string | null }; + progress: { error: string }; + }; + expect(manifest.reason).toBe('launch_progress_failed'); + expect(manifest.classification.code).toBe('provider_auth'); + expect(manifest.artifactFiles).toContain('cli-logs-tail.txt'); + expect(manifest.artifactFiles).toContain('launch-state.json'); + expect(manifest.progress.error).toContain('[REDACTED]'); + + const copiedLaunchState = await fs.readFile(path.join(result.directory, 'launch-state.json'), 'utf8'); + expect(copiedLaunchState).toContain('[REDACTED_ANTHROPIC_API_KEY]'); + expect(() => JSON.parse(copiedLaunchState)).not.toThrow(); + expect(copiedLaunchState).toContain('"token":"[REDACTED]"'); + expect(copiedLaunchState).not.toContain('sk-ant-'); + + const cliLogs = await fs.readFile(path.join(result.directory, 'cli-logs-tail.txt'), 'utf8'); + expect(cliLogs).toContain('OPENAI_API_KEY=[REDACTED]'); + expect(cliLogs).not.toContain('sk-proj-'); + + const latest = JSON.parse( + await fs.readFile(path.join(teamDir, 'launch-failure-artifacts', 'latest.json'), 'utf8') + ) as { manifestPath: string }; + expect(latest.manifestPath).toBe(result.manifestPath); + }); + + it('redacts common bearer and token-shaped secrets', () => { + const redacted = redactLaunchFailureArtifactText( + 'Authorization: Bearer abcdefghijklmnopqrstuvwxyz123456 token: abcdefghijklmnopqrstuvwxyz123456' + ); + expect(redacted).toContain('Authorization: Bearer [REDACTED]'); + expect(redacted).toContain('token: [REDACTED]'); + }); 
+ + it('classifies bootstrap transport rejection and extracts breadcrumb details', () => { + const input = { + teamName: 'artifact-team', + runId: 'run-transport', + reason: 'launch_cleanup_unconfirmed_bootstrap', + progressTraceLines: [ + 'bob did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true', + 'Warning: no stdin data received in 3s, proceeding without it.', + ], + }; + + expect(classifyLaunchFailureArtifact(input).code).toBe('transport_rejected'); + expect(extractLaunchBootstrapTransportBreadcrumb(input)).toMatchObject({ + lastTransportStage: 'bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true', + submitRejected: true, + retryable: true, + noStdinWarning: true, + bootstrapSubmitted: false, + }); + }); + + it('classifies provider quota separately from protocol errors', () => { + expect( + classifyLaunchFailureArtifact({ + teamName: 'artifact-team', + runId: 'run-quota', + reason: + 'OpenCode quota exhausted. This request requires more credits, or fewer max_tokens.', + }).code + ).toBe('provider_quota'); + }); + + it.each([ + { + name: 'stdin warning', + text: 'Warning: no stdin data received in 3s, proceeding without it.', + code: 'stdin_missing', + }, + { + name: 'provider auth', + text: 'Codex API error. Token refresh failed: 401 Unauthorized', + code: 'provider_auth', + }, + { + name: 'model bootstrap timeout', + text: 'bob: Teammate was registered but did not bootstrap-confirm before timeout.', + code: 'model_no_bootstrap', + }, + { + name: 'process stale pid', + text: 'persisted runtime pid is not alive; persisted runtime pid was not found in process table', + code: 'process_exited', + }, + { + name: 'opencode protocol', + text: 'OpenCode API error. 
non_visible_tool_without_task_progress', + code: 'opencode_protocol', + }, + ])('classifies production-like failure string: $name', ({ text, code }) => { + expect( + classifyLaunchFailureArtifact({ + teamName: 'artifact-team', + runId: `run-${code}`, + reason: text, + }).code + ).toBe(code); + }); +}); diff --git a/test/main/services/team/TeamProvisioningService.test.ts b/test/main/services/team/TeamProvisioningService.test.ts index 0e69b86e..c7eb7c45 100644 --- a/test/main/services/team/TeamProvisioningService.test.ts +++ b/test/main/services/team/TeamProvisioningService.test.ts @@ -615,6 +615,17 @@ function createClaudeLogsRun(overrides: Record = {}) { } as any; } +async function waitForFile(filePath: string, timeoutMs = 2_000): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + if (fs.existsSync(filePath)) { + return; + } + await new Promise((resolve) => setTimeout(resolve, 25)); + } + throw new Error(`Timed out waiting for file: ${filePath}`); +} + describe('TeamProvisioningService', () => { beforeEach(() => { vi.clearAllMocks(); @@ -1137,6 +1148,102 @@ describe('TeamProvisioningService', () => { }); }); + it('writes a launch failure artifact pack when cleanup finalizes failed launch state', async () => { + allowConsoleLogs(); + const svc = new TeamProvisioningService(); + const teamName = 'launch-artifact-cleanup-team'; + const runId = 'run-launch-artifact-cleanup'; + const startedAt = '2026-05-09T00:25:00.000Z'; + const run = createClaudeLogsRun({ + runId, + teamName, + startedAt, + isLaunch: true, + provisioningComplete: false, + cancelRequested: false, + deterministicBootstrap: true, + expectedMembers: ['bob'], + allEffectiveMembers: [ + { + name: 'bob', + role: 'Developer', + providerId: 'anthropic', + model: 'opus', + }, + ], + request: { + cwd: '/repo', + providerId: 'anthropic', + model: 'opus', + members: [ + { + name: 'bob', + role: 'Developer', + providerId: 'anthropic', + model: 'opus', + }, + ], + }, + 
memberSpawnStatuses: new Map([ + [ + 'bob', + createMemberSpawnStatusEntry({ + status: 'spawning', + launchState: 'runtime_pending_bootstrap', + runtimeAlive: true, + firstSpawnAcceptedAt: '2026-05-09T00:25:05.000Z', + updatedAt: '2026-05-09T00:25:05.000Z', + }), + ], + ]), + progress: { + runId, + teamName, + state: 'failed', + message: 'Launch failed', + startedAt, + updatedAt: '2026-05-09T00:26:00.000Z', + error: + 'Teammate process bob@signal-ops did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true Last stderr: Warning: no stdin data received in 3s, proceeding without it.', + }, + claudeLogLines: [ + '[stderr]', + 'Warning: no stdin data received in 3s, proceeding without it.', + ], + provisioningOutputParts: [], + }); + + (svc as any).runs.set(run.runId, run); + (svc as any).aliveRunByTeam.set(run.teamName, run.runId); + (svc as any).cleanupRun(run); + + const latestPath = path.join( + tempTeamsBase, + teamName, + 'launch-failure-artifacts', + 'latest.json' + ); + await waitForFile(latestPath); + const latest = JSON.parse(fs.readFileSync(latestPath, 'utf8')) as { manifestPath: string }; + const manifest = JSON.parse(fs.readFileSync(latest.manifestPath, 'utf8')) as { + reason: string; + classification: { code: string }; + bootstrapTransportBreadcrumb: { + submitRejected: boolean; + noStdinWarning: boolean; + retryable: boolean | null; + }; + }; + + expect(manifest.reason).toBe('launch_progress_failed'); + expect(manifest.classification.code).toBe('transport_rejected'); + expect(manifest.bootstrapTransportBreadcrumb).toMatchObject({ + submitRejected: true, + noStdinWarning: true, + retryable: true, + }); + }); + it('falls back to the persisted lead transcript when no live run exists', async () => { const svc = new TeamProvisioningService(); const teamName = 'offline-logs-team'; @@ -11591,6 +11698,59 @@ describe('TeamProvisioningService', 
() => { expect(sendMessageToRun).not.toHaveBeenCalled(); }); + it('restarts a process backend teammate directly without asking the lead to respawn it', async () => { + const svc = new TeamProvisioningService(); + const run = createMemberSpawnRun({ + teamName: 'process-team', + expectedMembers: ['forge'], + memberSpawnStatuses: new Map(), + }); + run.child = { pid: 111 }; + run.processKilled = false; + run.cancelRequested = false; + + const sendMessageToRun = vi.fn(async () => {}); + const directProcessRestart = vi.fn(async () => {}); + (svc as any).sendMessageToRun = sendMessageToRun; + (svc as any).launchDirectProcessMemberRestart = directProcessRestart; + (svc as any).configReader = { + getConfig: vi.fn(async () => ({ + name: 'Process Team', + members: [{ name: 'team-lead', agentType: 'team-lead' }], + })), + }; + (svc as any).membersMetaStore = { + getMembers: vi.fn(async () => [ + { + name: 'forge', + role: 'Developer', + providerId: 'codex', + model: 'gpt-5.4', + effort: 'medium', + agentType: 'general-purpose', + }, + ]), + }; + (svc as any).readPersistedRuntimeMembers = vi.fn(() => [ + { + name: 'forge', + agentId: 'forge@process-team', + backendType: 'process', + tmuxPaneId: 'process:1234', + runtimePid: 1234, + }, + ]); + (svc as any).getLiveTeamAgentRuntimeMetadata = vi.fn(async () => new Map()); + (svc as any).aliveRunByTeam.set('process-team', run.runId); + (svc as any).runs.set(run.runId, run); + + await svc.restartMember('process-team', 'forge'); + + expect(directProcessRestart).toHaveBeenCalledTimes(1); + expect(sendMessageToRun).not.toHaveBeenCalled(); + expect(run.pendingMemberRestarts.has('forge')).toBe(true); + }); + it('rejects a second restart request while the first restart is still in flight', async () => { const svc = new TeamProvisioningService(); const run = createMemberSpawnRun({