fix(team): harden process launch diagnostics

This commit is contained in:
777genius 2026-05-09 07:33:33 +03:00
parent 81700a1a14
commit 6dab896aee
9 changed files with 2335 additions and 3 deletions

View file

@ -17,6 +17,8 @@ For new features:
- Treat regressions in agent team messaging, task lifecycle, session parsing, code review UI, and provider/runtime detection as high priority.
- For team launch hangs, OpenCode `registered`/`bootstrap unconfirmed`, missing teammate replies, or suspicious task logs, follow [docs/team-management/debugging-agent-teams.md](docs/team-management/debugging-agent-teams.md) before changing code.
- For launch failures, first inspect the newest artifact pack under `~/.claude/teams/<team>/launch-failure-artifacts/latest.json`, then open its `manifest.json`. The manifest includes `classification`, `bootstrapTransportBreadcrumb`, launch diagnostics, member spawn statuses, and redacted copies/tails of launch-state, bootstrap-state, bootstrap-journal, CLI logs, progress trace, and runtime adapter trace.
- When running live smoke tests, keep cleanup narrow: stop only the smoke-owned team/run and launch-owned process teammates. Do not kill shared OpenCode hosts, unrelated tmux panes, or user teams while trying to clean stale smoke artifacts.
- Verify new medium and large features follow `docs/FEATURE_ARCHITECTURE_STANDARD.md`, especially cross-process boundaries and public feature entrypoints.
- Check that Electron main, preload, renderer, and shared code keep their responsibilities separate and use the documented path aliases.
- Flag changes that manually concatenate agent block markers instead of using `wrapAgentBlock(text)`.

View file

@ -27,6 +27,7 @@
"opencode:prove-semantic-model-matrix": "node ./scripts/prove-opencode-semantic-model-matrix.mjs",
"opencode:prove-team-provisioning": "node ./scripts/prove-opencode-team-provisioning.mjs",
"team:prove-agent-cli-launch": "node ./scripts/prove-agent-cli-launch.mjs",
"team:prove-provider-launch-stress": "node ./scripts/prove-provider-launch-stress.mjs",
"team:prove-launch-matrix": "pnpm exec vitest run --maxWorkers 1 --minWorkers 1 test/main/services/team/TeamAgentLaunchMatrix.safe-e2e.test.ts",
"prebuild": "tsx scripts/fetch-pricing-data.ts && pnpm --filter agent-teams-controller build && pnpm --filter agent-teams-mcp build",
"build": "node --max-old-space-size=8192 ./node_modules/electron-vite/bin/electron-vite.js build",

View file

@ -0,0 +1,79 @@
#!/usr/bin/env node
import { spawnSync } from 'node:child_process';
import path from 'node:path';
import process from 'node:process';
import { fileURLToPath } from 'node:url';
import {
exitForSkippedPreflight,
preflightOpenCodeLiveEnvironment,
} from './lib/opencode-live-preflight.mjs';
// Resolve the repository root relative to this script so the smoke can be started from any cwd.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(scriptDir, '..');
// Optional override for the orchestrator checkout; falls back to a sibling `agent_teams_orchestrator` folder.
const orchestratorRoot = process.env.CLAUDE_DEV_RUNTIME_ROOT?.trim();
const siblingOrchestrator = path.resolve(repoRoot, '..', 'agent_teams_orchestrator');
// Comma-separated scenario order; defaults to all four scenarios.
const order = process.env.PROVIDER_LAUNCH_STRESS_ORDER?.trim() || 'anthropic,codex,opencode,mixed';
// Environment handed to vitest: the caller's environment plus the switches that enable the live test.
const env = {
...process.env,
PROVIDER_LAUNCH_STRESS_LIVE: '1',
PROVIDER_LAUNCH_STRESS_ORDER: order,
PROVIDER_LAUNCH_STRESS_MEMBER_COUNT:
process.env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT?.trim() || '5',
// Unless set explicitly, use 'api-key' when ANTHROPIC_API_KEY is non-empty and 'subscription' otherwise.
PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH:
process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH?.trim() ||
(process.env.ANTHROPIC_API_KEY?.trim() ? 'api-key' : 'subscription'),
OPENCODE_E2E: '1',
OPENCODE_E2E_USE_REAL_APP_CREDENTIALS: '1',
// Only defaulted with `??`, so an explicitly set value (even an empty string) is preserved.
OPENCODE_DISABLE_AUTOUPDATE: process.env.OPENCODE_DISABLE_AUTOUPDATE ?? '1',
};
// Derive the orchestrator CLI path only when the caller did not provide one.
if (!env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim()) {
const runtimeRoot = orchestratorRoot ? path.resolve(orchestratorRoot) : siblingOrchestrator;
env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH = path.join(runtimeRoot, 'cli');
}
// Print the effective configuration up front so a failed run can be reproduced from its log.
console.log('Running provider launch stress live smoke');
console.log(`Order: ${env.PROVIDER_LAUNCH_STRESS_ORDER}`);
console.log(`Members per scenario: ${env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT}`);
console.log(`Anthropic auth: ${env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH}`);
// NOTE(review): the model names after `||` are display fallbacks only; this script does not write them
// into `env`, so the test is presumably expected to apply the same defaults — confirm.
console.log(
`Models: anthropic=${env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL || 'haiku'}, codex=${
env.PROVIDER_LAUNCH_STRESS_CODEX_MODEL || 'gpt-5.4-mini'
}, opencode=${env.PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL || 'openai/gpt-5.4-mini'}`
);
console.log(`Orchestrator CLI: ${env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH}`);
// The OpenCode preflight is run only when the order includes an `opencode` or `mixed` scenario.
if (order.split(',').some((item) => ['opencode', 'mixed'].includes(item.trim()))) {
const preflight = await preflightOpenCodeLiveEnvironment({ repoRoot });
exitForSkippedPreflight(preflight);
}
// Run the live test file with a single vitest worker, inheriting stdio so output streams through.
const result = spawnSync(
'pnpm',
[
'exec',
'vitest',
'run',
'--maxWorkers',
'1',
'--minWorkers',
'1',
'test/main/services/team/ProviderLaunchStress.live-e2e.test.ts',
],
{
cwd: repoRoot,
env,
stdio: 'inherit',
// A shell is used only on Windows.
shell: process.platform === 'win32',
}
);
// `result.error` is set when the child could not be run at all (as opposed to running and failing).
if (result.error) {
console.error(`Failed to run provider launch stress smoke: ${result.error.message}`);
process.exit(1);
}
// Propagate the child's exit code; a null status falls back to 1.
process.exit(result.status ?? 1);

View file

@ -0,0 +1,485 @@
import { getTeamsBasePath } from '@main/utils/pathDecoder';
import { createLogger } from '@shared/utils/logger';
import * as fs from 'fs';
import * as path from 'path';
import { atomicWriteAsync } from './atomicWrite';
import { getTeamBootstrapStatePath } from './TeamBootstrapStateReader';
import { getTeamLaunchStatePath, getTeamLaunchSummaryPath } from './TeamLaunchStateStore';
import type {
MemberSpawnStatusEntry,
PersistedTeamLaunchSnapshot,
TeamLaunchDiagnosticItem,
TeamMember,
TeamProviderBackendId,
TeamProviderId,
TeamProvisioningProgress,
} from '@shared/types';
// Module-scoped logger; used below to report each artifact pack that gets written.
const logger = createLogger('Service:TeamLaunchFailureArtifactPack');
// Directory (inside a team's folder) that holds one sub-directory per artifact pack.
const ARTIFACTS_DIR_NAME = 'launch-failure-artifacts';
// Pointer file inside ARTIFACTS_DIR_NAME that is rewritten to reference the newest pack.
const LATEST_ARTIFACT_FILE = 'latest.json';
// Character budget for the CLI log tail written to cli-logs-tail.txt.
const MAX_CLI_LOG_CHARS = 256_000;
// Character budget applied to both progress-trace.txt and runtime-adapter-trace.txt.
const MAX_TRACE_CHARS = 128_000;
// Byte budget for the tail of each on-disk state file copied into a pack.
const MAX_COPIED_FILE_BYTES = 256 * 1024;
/** Loosely typed object with string keys; used for free-form flags and for walking values during redaction. */
type JsonRecord = Record<string, unknown>;
/**
 * Everything a caller knows about a failed launch when an artifact pack is written.
 * Only `teamName`, `runId` and `reason` are required; every other field is optional
 * context that is copied into the manifest and/or searched by the classifier.
 */
export interface TeamLaunchFailureArtifactPackInput {
/** Team folder name; joined onto the teams base path to locate the team directory. */
teamName: string;
/** Run identifier; sanitized into the pack directory name and recorded in the manifest. */
runId: string;
/** Failure description; stored in the manifest and `latest.json`, and searched by the classifier. */
reason: string;
startedAt?: string;
cwd?: string;
/** Recorded in the manifest; a missing value is written as `null`. */
pid?: number | null;
providerId?: TeamProviderId;
providerBackendId?: TeamProviderBackendId;
model?: string;
expectedMembers?: readonly string[];
/** Only a fixed subset of each member's fields is copied into the manifest. */
effectiveMembers?: readonly TeamMember[];
progress?: TeamProvisioningProgress | null;
launchSnapshot?: PersistedTeamLaunchSnapshot | null;
/** When omitted, `progress.launchDiagnostics` is used instead. */
launchDiagnostics?: readonly TeamLaunchDiagnosticItem[];
memberSpawnStatuses?: Record<string, MemberSpawnStatusEntry>;
/** Written (tail-truncated and redacted) to `cli-logs-tail.txt` when non-blank. */
cliLogs?: string | null;
/** Joined with newlines and written to `progress-trace.txt` when non-empty. */
progressTraceLines?: readonly string[];
/** Joined with newlines and written to `runtime-adapter-trace.txt` when non-empty. */
runtimeAdapterTraceLines?: readonly string[];
/** Free-form key/value context copied into the manifest. */
flags?: JsonRecord;
}
/** Locations produced by `writeTeamLaunchFailureArtifactPack`. */
export interface TeamLaunchFailureArtifactPackResult {
/** Absolute path of the directory created for this pack. */
directory: string;
/** Path of `manifest.json` inside `directory`. */
manifestPath: string;
/** Artifact file names (relative to `directory`), with `manifest.json` first. */
files: string[];
}
/**
 * Codes the classifier can assign to a launch failure. `'unknown'` is the fallback
 * returned when none of the known text patterns match.
 */
export type LaunchFailureArtifactClassificationCode =
| 'transport_rejected'
| 'stdin_missing'
| 'provider_quota'
| 'provider_auth'
| 'model_no_bootstrap'
| 'process_exited'
| 'opencode_protocol'
| 'unknown';
/** Result of the pattern-based failure classifier. */
export interface LaunchFailureArtifactClassification {
code: LaunchFailureArtifactClassificationCode;
/** Fixed per-code score assigned by the classifier (0.2 for `'unknown'`). */
confidence: number;
/** Up to three redacted text snippets that matched the winning pattern. */
evidence: string[];
}
/** Bootstrap-transport facts extracted from the failure text by regular expressions. */
export interface LaunchBootstrapTransportBreadcrumb {
/** Value of the last `last transport stage: …` occurrence, or `null` when none was found. */
lastTransportStage: string | null;
/** True when the text mentions a rejected bootstrap submit. */
submitRejected: boolean;
/** `retryable` value from the last rejected-submit line that carries one; `null` when absent. */
retryable: boolean | null;
/** True when the text contains the "no stdin data received" warning. */
noStdinWarning: boolean;
/** True when the text shows the bootstrap prompt was submitted/accepted. */
bootstrapSubmitted: boolean;
/** Up to three redacted snippets that mention the bootstrap transport. */
evidence: string[];
}
/** Describes one on-disk state file that the pack tries to copy. */
interface CopiedArtifactFile {
/** Absolute path the file is read from. */
sourcePath: string;
/** File name used for the copy inside the pack directory. */
artifactName: string;
/** Set when the copy was skipped: `'missing'`, `'unreadable'` or `'not_regular_file'`. */
issue?: string;
}
/**
 * Turns an arbitrary identifier into a string that is safe to embed in a directory name.
 *
 * Runs of characters outside `[a-zA-Z0-9._-]` collapse into a single dash and
 * leading/trailing dashes are dropped.
 *
 * @param value Raw identifier (for example a run id).
 * @returns The sanitized text, or `'unknown'` when nothing usable remains.
 */
function sanitizeArtifactNamePart(value: string): string {
  const dashed = value.trim().replace(/[^a-zA-Z0-9._-]+/g, '-');
  const stripped = dashed.replace(/^-+|-+$/g, '');
  return stripped.length > 0 ? stripped : 'unknown';
}
/**
 * Formats a date as an ISO-8601 string with every `:` and `.` replaced by `-`.
 *
 * @param now Moment to format.
 * @returns For example `2026-05-09T04-33-33-123Z`.
 */
function artifactTimestamp(now: Date): string {
  const iso = now.toISOString();
  return iso.split(/[:.]/).join('-');
}
/**
 * Throws when `target` does not resolve to `root` itself or to a location beneath it.
 *
 * The relative path is compared by whole path segments: only a leading `..` segment
 * counts as an escape, so an in-root entry whose name merely begins with two dots
 * (for example `..hidden`) is accepted.
 *
 * @param root Directory that must contain `target`.
 * @param target Path to validate; resolved before comparison.
 * @throws Error when `target` resolves outside `root`.
 */
function assertPathWithin(root: string, target: string): void {
  const relative = path.relative(path.resolve(root), path.resolve(target));
  const climbsOut = relative === '..' || relative.startsWith(`..${path.sep}`);
  // On Windows a target on another drive yields an absolute "relative" path.
  if (climbsOut || path.isAbsolute(relative)) {
    throw new Error(`Launch artifact path escaped teams root: ${target}`);
  }
}
/**
 * Keeps only the last `maxChars` characters of `text`.
 *
 * @param text Text to bound.
 * @param maxChars Maximum number of trailing characters to keep.
 * @returns `text` unchanged when it already fits; otherwise a one-line truncation
 *   notice followed by the retained tail.
 */
function truncateTail(text: string, maxChars: number): string {
  const overflow = text.length - maxChars;
  if (overflow <= 0) {
    return text;
  }
  const notice = `[truncated to last ${maxChars} chars]`;
  return [notice, text.slice(overflow)].join('\n');
}
/**
 * Masks credentials in free-form text before it is written into an artifact pack.
 *
 * Replacements are applied in this order:
 * 1. `sk-ant-…`, `sk-proj-…` and other `sk-…` API keys (20+ key characters).
 * 2. Assignments to well-known provider env vars (`ANTHROPIC_API_KEY=…` etc.). The value
 *    may optionally start with a quote (`"`, `'` or a backtick), which is preserved, so
 *    quoted assignments such as `OPENAI_API_KEY="…"` are masked as well.
 * 3. `Authorization: Bearer …` header values (20+ characters).
 * 4. Generic `api_key` / `token` / `access_token` / `refresh_token` pairs whose value is
 *    20+ characters long.
 *
 * @param text Raw text that may contain secrets.
 * @returns The text with every recognized secret replaced by a `[REDACTED…]` marker.
 */
export function redactLaunchFailureArtifactText(text: string): string {
  return text
    .replace(/sk-ant-[A-Za-z0-9_-]{20,}/g, '[REDACTED_ANTHROPIC_API_KEY]')
    .replace(/sk-proj-[A-Za-z0-9_-]{20,}/g, '[REDACTED_OPENAI_API_KEY]')
    .replace(/sk-[A-Za-z0-9_-]{20,}/g, '[REDACTED_API_KEY]')
    .replace(
      // $2 captures an optional opening quote so `KEY="value"` is redacted too; the
      // generic pattern below cannot catch it because `\b` fails after the `_` in `…_API_KEY`.
      /\b(ANTHROPIC_API_KEY|OPENAI_API_KEY|CODEX_API_KEY|OPENROUTER_API_KEY|GEMINI_API_KEY)=(["'`]?)([^\s"'`]+)/gi,
      '$1=$2[REDACTED]'
    )
    .replace(/\b(authorization:\s*bearer\s+)([A-Za-z0-9._~+/=-]{20,})/gi, '$1[REDACTED]')
    .replace(
      /\b(api[_-]?key|token|access[_-]?token|refresh[_-]?token)(["']?\s*[:=]\s*["']?)([A-Za-z0-9._~+/=-]{20,})/gi,
      '$1$2[REDACTED]'
    );
}
/**
 * Typed convenience wrapper around `redactJsonValue`.
 *
 * @param value JSON-like value to redact.
 * @returns The redacted result, asserted back to the caller's type `T`.
 */
function redactJsonLike<T>(value: T): T {
  const redacted: unknown = redactJsonValue(value);
  return redacted as T;
}
/**
 * Reports whether a JSON property name denotes a credential.
 *
 * The whole key must match (case-insensitively) one of: `apiKey`/`api_key`/`api-key`,
 * `token`, `accessToken`/`access_token`/`access-token`, the `refresh` equivalents,
 * or `authorization`.
 *
 * @param key Property name to test.
 */
function isSecretJsonKey(key: string): boolean {
  const secretKeyPattern =
    /^(api[_-]?key|token|access[_-]?token|refresh[_-]?token|authorization)$/i;
  return secretKeyPattern.test(key);
}
/**
 * Recursively redacts a JSON-like value.
 *
 * - A value stored under a secret key (see `isSecretJsonKey`) is replaced wholesale.
 * - Strings are passed through `redactLaunchFailureArtifactText`.
 * - Array items are redacted individually (without a key).
 * - Object properties are redacted with their own property name as the key.
 * - Anything else is returned unchanged.
 *
 * @param value Value to redact.
 * @param key Property name `value` was found under; empty for roots and array items.
 */
function redactJsonValue(value: unknown, key = ''): unknown {
  if (isSecretJsonKey(key)) return '[REDACTED]';
  if (typeof value === 'string') return redactLaunchFailureArtifactText(value);
  if (Array.isArray(value)) return value.map((item) => redactJsonValue(item));
  if (typeof value !== 'object' || value === null) return value;

  const redactedEntries = Object.entries(value as JsonRecord).map(
    ([propertyName, propertyValue]) => [propertyName, redactJsonValue(propertyValue, propertyName)]
  );
  return Object.fromEntries(redactedEntries);
}
/**
 * Pushes the trimmed form of `value` onto `parts` when it is a non-blank string.
 * Non-strings and whitespace-only strings are ignored.
 *
 * @param parts Accumulator that is mutated in place.
 * @param value Candidate value of any type.
 */
function appendIfString(parts: string[], value: unknown): void {
  if (typeof value !== 'string') return;
  const trimmed = value.trim();
  if (trimmed) {
    parts.push(trimmed);
  }
}
/**
 * Gathers every piece of free text attached to a failed launch into one ordered list
 * so the classifier and the breadcrumb extractor can search it.
 *
 * Order: reason, CLI logs, progress trace lines, runtime adapter trace lines, progress
 * message/error/log tail/warnings, launch diagnostics (falling back to the ones on
 * `progress`), member spawn statuses, and finally the launch snapshot. Blank and
 * non-string values are skipped.
 *
 * @param input Failure context supplied by the caller.
 * @returns Trimmed, non-empty text fragments.
 */
function collectLaunchFailureSearchParts(input: TeamLaunchFailureArtifactPackInput): string[] {
  const collected: string[] = [];
  const add = (...values: unknown[]): void => {
    for (const value of values) appendIfString(collected, value);
  };

  add(input.reason, input.cliLogs);
  (input.progressTraceLines ?? []).forEach((line) => add(line));
  (input.runtimeAdapterTraceLines ?? []).forEach((line) => add(line));

  add(input.progress?.message, input.progress?.error, input.progress?.cliLogsTail);
  (input.progress?.warnings ?? []).forEach((warning) => add(warning));

  const diagnostics = input.launchDiagnostics ?? input.progress?.launchDiagnostics ?? [];
  for (const diagnostic of diagnostics) {
    add(diagnostic.code, diagnostic.label, diagnostic.detail);
  }

  for (const [memberName, entry] of Object.entries(input.memberSpawnStatuses ?? {})) {
    add(
      memberName,
      entry.status,
      entry.launchState,
      entry.error,
      entry.hardFailureReason,
      entry.runtimeDiagnostic
    );
  }

  const snapshot = input.launchSnapshot;
  if (!snapshot) {
    return collected;
  }
  add(snapshot.launchPhase, snapshot.teamLaunchState);
  for (const [memberName, member] of Object.entries(snapshot.members)) {
    add(memberName, member.launchState, member.hardFailureReason, member.runtimeDiagnostic);
    (member.diagnostics ?? []).forEach((diagnostic) => add(diagnostic));
  }
  return collected;
}
/**
 * Returns at most three fragments that match `pattern`, in their original order.
 * Each fragment is bounded to its last 600 characters via `truncateTail`.
 *
 * @param parts Text fragments to scan.
 * @param pattern Expression a fragment must match to count as evidence.
 */
function firstEvidence(parts: readonly string[], pattern: RegExp): string[] {
  const matches: string[] = [];
  for (const fragment of parts) {
    // Stop scanning as soon as three pieces of evidence have been collected.
    if (matches.length >= 3) break;
    if (!pattern.test(fragment)) continue;
    matches.push(truncateTail(fragment, 600));
  }
  return matches;
}
/**
 * Assigns a coarse failure category to a launch failure by pattern-matching its text.
 *
 * Candidates are checked in a fixed priority order and the first pattern that matches
 * the combined text wins, regardless of the confidence values of later candidates.
 * When nothing matches, the result is `'unknown'` with generic
 * "failed / error / timeout" evidence.
 *
 * @param input Failure context supplied by the caller.
 * @returns The winning code, its fixed confidence, and up to three redacted evidence snippets.
 */
export function classifyLaunchFailureArtifact(
  input: TeamLaunchFailureArtifactPackInput
): LaunchFailureArtifactClassification {
  const parts = collectLaunchFailureSearchParts(input);
  const text = parts.join('\n').toLowerCase();
  const redactedEvidence = (pattern: RegExp): string[] =>
    firstEvidence(parts, pattern).map(redactLaunchFailureArtifactText);

  // Listed in priority order: the first matching entry decides the classification.
  const candidates: {
    code: LaunchFailureArtifactClassificationCode;
    confidence: number;
    pattern: RegExp;
  }[] = [
    {
      code: 'transport_rejected',
      confidence: 0.95,
      pattern: /bootstrap_submit_rejected|submit rejected by local prompt handler/i,
    },
    {
      code: 'stdin_missing',
      confidence: 0.9,
      pattern: /no stdin data received|proceeding without it/i,
    },
    {
      code: 'provider_quota',
      confidence: 0.92,
      pattern: /quota exhausted|insufficient credits|key limit exceeded|total limit|rate limit/i,
    },
    {
      code: 'provider_auth',
      confidence: 0.88,
      pattern:
        /401 unauthorized|not_logged_in|login required|auth(?:entication)? failed|api key.*(?:missing|invalid)|token refresh failed/i,
    },
    {
      code: 'opencode_protocol',
      confidence: 0.84,
      pattern:
        /visible_reply_still_required|non_visible_tool_without_task_progress|empty_assistant_turn|runtime_bootstrap_checkin/i,
    },
    {
      code: 'model_no_bootstrap',
      confidence: 0.82,
      pattern:
        /did not bootstrap-confirm|bootstrap unconfirmed|bootstrap-confirm before timeout|check-in not yet received|bootstrap_stalled/i,
    },
    {
      code: 'process_exited',
      confidence: 0.78,
      pattern: /process exited|pid is not alive|pid was not found|stale_metadata|exited before/i,
    },
  ];

  const winner = candidates.find((candidate) => candidate.pattern.test(text));
  if (winner) {
    return {
      code: winner.code,
      confidence: winner.confidence,
      evidence: redactedEvidence(winner.pattern),
    };
  }

  return {
    code: 'unknown',
    confidence: 0.2,
    evidence: redactedEvidence(/failed|error|timeout/i),
  };
}
/**
 * Extracts bootstrap-transport facts from the failure text.
 *
 * When a marker occurs several times, the last occurrence is used for
 * `lastTransportStage` and `retryable`.
 *
 * @param input Failure context supplied by the caller.
 * @returns Flags describing what happened to the bootstrap prompt, plus up to three
 *   redacted evidence snippets.
 */
export function extractLaunchBootstrapTransportBreadcrumb(
  input: TeamLaunchFailureArtifactPackInput
): LaunchBootstrapTransportBreadcrumb {
  const parts = collectLaunchFailureSearchParts(input);
  const combined = parts.join('\n');

  const stageMatches = Array.from(combined.matchAll(/last transport stage:\s*([^;\n]+)/gi));
  const lastTransportStage = stageMatches.at(-1)?.[1]?.trim() ?? null;

  const retryableMatches = Array.from(
    combined.matchAll(/bootstrap_submit_rejected[^\n]*(?:retryable[=:]\s*(true|false))/gi)
  );
  const retryableRaw = retryableMatches.at(-1)?.[1]?.toLowerCase();
  let retryable: boolean | null = null;
  if (retryableRaw === 'true') {
    retryable = true;
  } else if (retryableRaw === 'false') {
    retryable = false;
  }

  const submitRejected =
    /bootstrap_submit_rejected|submit rejected by local prompt handler/i.test(combined);
  const noStdinWarning = /no stdin data received|proceeding without it/i.test(combined);
  const bootstrapSubmitted =
    /(?:event["']?\s*:\s*["']bootstrap_submitted["']|bootstrap_submit_accepted|bootstrap submitted)/i.test(
      combined
    );

  const evidence = firstEvidence(
    parts,
    /bootstrap_submit_|last transport stage|no stdin data received|local prompt handler/i
  ).map(redactLaunchFailureArtifactText);

  return {
    lastTransportStage,
    submitRejected,
    retryable,
    noStdinWarning,
    bootstrapSubmitted,
    evidence,
  };
}
/**
 * Reads at most the last `MAX_COPIED_FILE_BYTES` bytes of a regular file as UTF-8 text.
 *
 * The file is stat'ed first so that non-regular files (directories, FIFOs, …) are never
 * opened. Because the source files may still be written to while a launch is failing:
 * - only the bytes actually read are decoded, so a file that shrank after `stat` does
 *   not leave NUL padding in the result;
 * - when the head of the file is cut off, leading UTF-8 continuation bytes are skipped
 *   so the text does not start with a broken character.
 *
 * @param sourcePath Absolute path of the file to read.
 * @returns `{ text }` on success (prefixed with a truncation notice when the head was
 *   dropped), or `{ issue }` set to `'not_regular_file'`, `'missing'` or `'unreadable'`.
 *   Never rejects.
 */
async function readBoundedTextFile(sourcePath: string): Promise<{ text?: string; issue?: string }> {
  try {
    const stat = await fs.promises.stat(sourcePath);
    if (!stat.isFile()) {
      return { issue: 'not_regular_file' };
    }
    const handle = await fs.promises.open(sourcePath, 'r');
    try {
      const start = Math.max(0, stat.size - MAX_COPIED_FILE_BYTES);
      const buffer = Buffer.alloc(stat.size - start);

      // A single read() may return fewer bytes than requested; loop until full or EOF.
      let filled = 0;
      while (filled < buffer.length) {
        const { bytesRead } = await handle.read(
          buffer,
          filled,
          buffer.length - filled,
          start + filled
        );
        if (bytesRead === 0) break;
        filled += bytesRead;
      }

      let tail = buffer.subarray(0, filled);
      if (start > 0) {
        // The cut is byte-based and may land inside a multi-byte sequence. A UTF-8
        // character has at most three continuation bytes (0b10xxxxxx), so skip up to three.
        let skip = 0;
        while (skip < 3 && skip < tail.length && (tail.readUInt8(skip) & 0xc0) === 0x80) {
          skip += 1;
        }
        tail = tail.subarray(skip);
      }

      const prefix = start > 0 ? `[truncated to last ${MAX_COPIED_FILE_BYTES} bytes]\n` : '';
      return { text: `${prefix}${tail.toString('utf8')}` };
    } finally {
      // Closing is best-effort; a close failure must not mask the read result.
      await handle.close().catch(() => undefined);
    }
  } catch (error) {
    const code = (error as NodeJS.ErrnoException).code;
    return { issue: code === 'ENOENT' ? 'missing' : 'unreadable' };
  }
}
/**
 * Lists the on-disk state files that are copied into every artifact pack, paired with
 * the file name each copy receives.
 *
 * The team directory is taken to be the folder that contains the bootstrap state file.
 *
 * @param teamName Team whose state files should be collected.
 * @returns Source path / artifact name pairs, in the order they are copied.
 */
function getKnownLaunchArtifactSourceFiles(teamName: string): CopiedArtifactFile[] {
  const bootstrapStatePath = getTeamBootstrapStatePath(teamName);
  const teamDir = path.dirname(bootstrapStatePath);

  const sources: [string, string][] = [
    [getTeamLaunchStatePath(teamName), 'launch-state.json'],
    [getTeamLaunchSummaryPath(teamName), 'launch-summary.json'],
    [bootstrapStatePath, 'bootstrap-state.json'],
    [path.join(teamDir, 'bootstrap-journal.jsonl'), 'bootstrap-journal.tail.jsonl'],
    [path.join(teamDir, '.bootstrap.lock', 'metadata.json'), 'bootstrap-lock-metadata.json'],
  ];

  return sources.map(([sourcePath, artifactName]) => ({ sourcePath, artifactName }));
}
/**
 * Writes one redacted text artifact into the pack directory and records its name.
 *
 * The text is redacted, stripped of trailing whitespace and terminated with exactly
 * one newline before being written atomically.
 *
 * @param directory Pack directory to write into.
 * @param artifactName File name of the artifact inside `directory`.
 * @param rawText Unredacted content.
 * @param files Accumulator of written artifact names; appended to on success.
 */
async function writeArtifactTextFile(
  directory: string,
  artifactName: string,
  rawText: string,
  files: string[]
): Promise<void> {
  const redacted = redactLaunchFailureArtifactText(rawText);
  const contents = `${redacted.trimEnd()}\n`;
  await atomicWriteAsync(path.join(directory, artifactName), contents);
  files.push(artifactName);
}
/**
 * Writes a self-contained, redacted diagnostics bundle for a failed team launch.
 *
 * A new directory named `<timestamp>-<sanitized runId>` is created under
 * `<teams root>/<teamName>/launch-failure-artifacts/`. It receives the in-memory log and
 * trace tails, bounded copies of the known launch/bootstrap state files, and a
 * `manifest.json` with the classification, the bootstrap transport breadcrumb and the
 * caller-supplied context. Finally `latest.json` in the artifacts root is rewritten to
 * point at the new pack.
 *
 * @param input Failure context supplied by the caller.
 * @returns The pack directory, the manifest path and the written file names.
 * @throws Error when the pack directory would resolve outside the teams root, or when
 *   creating the directory or writing a file fails. Unreadable source state files do
 *   not throw; they are recorded in the manifest with an `issue`.
 */
export async function writeTeamLaunchFailureArtifactPack(
input: TeamLaunchFailureArtifactPackInput
): Promise<TeamLaunchFailureArtifactPackResult> {
const teamsRoot = getTeamsBasePath();
const teamDir = path.join(teamsRoot, input.teamName);
const artifactsRoot = path.join(teamDir, ARTIFACTS_DIR_NAME);
const createdAt = new Date();
const directory = path.join(
artifactsRoot,
`${artifactTimestamp(createdAt)}-${sanitizeArtifactNamePart(input.runId)}`
);
// Guard against a team name that would move the pack outside the teams root.
assertPathWithin(teamsRoot, directory);
await fs.promises.mkdir(directory, { recursive: true });
const files: string[] = [];
const copiedFiles: CopiedArtifactFile[] = [];
// In-memory tails are written only when they contain something.
if (input.cliLogs?.trim()) {
await writeArtifactTextFile(
directory,
'cli-logs-tail.txt',
truncateTail(input.cliLogs, MAX_CLI_LOG_CHARS),
files
);
}
if (input.progressTraceLines?.length) {
await writeArtifactTextFile(
directory,
'progress-trace.txt',
truncateTail(input.progressTraceLines.join('\n'), MAX_TRACE_CHARS),
files
);
}
if (input.runtimeAdapterTraceLines?.length) {
await writeArtifactTextFile(
directory,
'runtime-adapter-trace.txt',
truncateTail(input.runtimeAdapterTraceLines.join('\n'), MAX_TRACE_CHARS),
files
);
}
// Copy the known state files; a file that cannot be read is still listed, with its issue.
for (const source of getKnownLaunchArtifactSourceFiles(input.teamName)) {
const read = await readBoundedTextFile(source.sourcePath);
if (read.text !== undefined) {
await writeArtifactTextFile(directory, source.artifactName, read.text, files);
copiedFiles.push(source);
} else {
copiedFiles.push({ ...source, issue: read.issue ?? 'unreadable' });
}
}
const classification = classifyLaunchFailureArtifact(input);
const bootstrapTransportBreadcrumb = extractLaunchBootstrapTransportBreadcrumb(input);
// The whole manifest goes through redaction, so secrets inside nested caller data are masked.
const manifest = redactJsonLike({
version: 1,
createdAt: createdAt.toISOString(),
reason: input.reason,
classification,
bootstrapTransportBreadcrumb,
teamName: input.teamName,
runId: input.runId,
startedAt: input.startedAt,
cwd: input.cwd,
pid: input.pid ?? null,
providerId: input.providerId,
providerBackendId: input.providerBackendId,
model: input.model,
expectedMembers: input.expectedMembers ?? [],
// Only a fixed subset of each member's fields is recorded.
effectiveMembers: (input.effectiveMembers ?? []).map((member) => ({
name: member.name,
role: member.role,
providerId: member.providerId,
providerBackendId: member.providerBackendId,
model: member.model,
agentType: member.agentType,
removedAt: member.removedAt,
})),
progress: input.progress ?? null,
launchDiagnostics: input.launchDiagnostics ?? input.progress?.launchDiagnostics ?? [],
memberSpawnStatuses: input.memberSpawnStatuses ?? {},
launchSnapshot: input.launchSnapshot ?? null,
flags: input.flags ?? {},
// Captured before 'manifest.json' is added below, so the manifest does not list itself.
artifactFiles: files,
copiedFiles,
});
const manifestPath = path.join(directory, 'manifest.json');
await atomicWriteAsync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
files.unshift('manifest.json');
await fs.promises.mkdir(artifactsRoot, { recursive: true });
// Rewrite the pointer file last so it only ever references a pack whose manifest exists.
await atomicWriteAsync(
path.join(artifactsRoot, LATEST_ARTIFACT_FILE),
`${JSON.stringify(
redactJsonLike({
version: 1,
createdAt: createdAt.toISOString(),
teamName: input.teamName,
runId: input.runId,
reason: input.reason,
directory,
manifestPath,
}),
null,
2
)}\n`
);
logger.info(`[${input.teamName}] Wrote launch failure artifact pack`, {
runId: input.runId,
reason: input.reason,
directory,
});
return { directory, manifestPath, files };
}

View file

@ -289,6 +289,7 @@ import {
snapshotFromRuntimeMemberStatuses,
snapshotToMemberSpawnStatuses,
} from './TeamLaunchStateEvaluator';
import { writeTeamLaunchFailureArtifactPack } from './TeamLaunchFailureArtifactPack';
import { TeamLaunchStateStore } from './TeamLaunchStateStore';
import { TeamMcpConfigBuilder } from './TeamMcpConfigBuilder';
import { TeamMemberLogsFinder } from './TeamMemberLogsFinder';
@ -407,6 +408,11 @@ type BootstrapTranscriptSuccessSource = 'member_briefing' | 'assistant_text';
const BOOTSTRAP_RUNTIME_PROOF_TAIL_BYTES = 256 * 1024;
const BOOTSTRAP_RUNTIME_EVENT_MAX_LINES = 256;
const BOOTSTRAP_RUNTIME_EVENT_MAX_LINE_BYTES = 16 * 1024;
// Names of environment variables passed to a directly spawned teammate process.
// Set to 'headless' for direct process restarts.
const TEAMMATE_RUNTIME_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME';
// Carries the path of the runtime events file for the spawned teammate.
const TEAMMATE_RUNTIME_EVENTS_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME_EVENTS_PATH';
// Carries the per-spawn random bootstrap proof token.
const TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV = 'CLAUDE_CODE_BOOTSTRAP_PROOF_TOKEN';
// NOTE(review): presumably points the teammate at the materialized native bootstrap context file;
// its use is not visible in this hunk — confirm.
const NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV =
'CLAUDE_CODE_NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_PATH';
function getTeamRuntimeEventsDir(teamName: string): string {
return path.join(getTeamsBasePath(), teamName, 'runtime');
@ -5519,6 +5525,7 @@ export class TeamProvisioningService {
>();
private readonly memberSpawnStatusesCacheGenerationByTeam = new Map<string, number>();
private readonly launchStateStore = new TeamLaunchStateStore();
private readonly launchFailureArtifactPackRunIds = new Set<string>();
private readonly launchStateStoreQueue = new Map<string, Promise<unknown>>();
private readonly launchStateWrittenRunIdByTeam = new Map<string, string>();
private readonly failedOpenCodeSecondaryRetryInFlightByTeam = new Map<
@ -5624,6 +5631,56 @@ export class TeamProvisioningService {
return choosePreferredLaunchSnapshot(bootstrapSnapshot, launchSnapshot);
}
/**
 * Fire-and-forget: writes a launch failure artifact pack for `run`, at most once per
 * team/run pair.
 *
 * The pair is marked as handled before the write starts. If the write rejects, the mark
 * is removed (so a later call may try again) and a warning is logged; the rejection is
 * never propagated to the caller.
 *
 * @param run Provisioning run that failed.
 * @param options.reason Human-readable failure reason recorded in the pack.
 * @param options.launchSnapshot Optional launch snapshot to embed in the manifest.
 */
private writeLaunchFailureArtifactPackBestEffort(
  run: ProvisioningRun,
  options: {
    reason: string;
    launchSnapshot?: PersistedTeamLaunchSnapshot | null;
  }
): void {
  const dedupeKey = `${run.teamName}:${run.runId}`;
  if (this.launchFailureArtifactPackRunIds.has(dedupeKey)) {
    return;
  }
  this.launchFailureArtifactPackRunIds.add(dedupeKey);

  const spawnStatusRecord = Object.fromEntries(run.memberSpawnStatuses.entries());
  const createRequest = run.request as Partial<TeamCreateRequest> | undefined;

  const onWriteFailure = (error: unknown): void => {
    // Forget the run so that a later failure report may retry the write.
    this.launchFailureArtifactPackRunIds.delete(dedupeKey);
    const detail = error instanceof Error ? error.message : String(error);
    logger.warn(`[${run.teamName}] Failed to write launch failure artifact pack: ${detail}`);
  };

  void writeTeamLaunchFailureArtifactPack({
    teamName: run.teamName,
    runId: run.runId,
    reason: options.reason,
    startedAt: run.startedAt,
    cwd: createRequest?.cwd ?? '',
    pid: run.child?.pid ?? run.progress.pid ?? null,
    providerId: createRequest?.providerId,
    providerBackendId: createRequest?.providerBackendId,
    model: createRequest?.model,
    expectedMembers: run.expectedMembers,
    effectiveMembers: run.allEffectiveMembers,
    progress: run.progress,
    launchSnapshot: options.launchSnapshot ?? null,
    launchDiagnostics: run.progress.launchDiagnostics ?? buildLaunchDiagnosticsFromRun(run),
    memberSpawnStatuses: spawnStatusRecord,
    cliLogs: extractCliLogsFromRun(run),
    progressTraceLines: run.provisioningTraceLines,
    runtimeAdapterTraceLines: this.runtimeAdapterTraceLinesByRunId.get(run.runId),
    flags: {
      isLaunch: run.isLaunch,
      provisioningComplete: run.provisioningComplete,
      deterministicBootstrap: run.deterministicBootstrap,
      processKilled: run.processKilled,
      finalizingByTimeout: run.finalizingByTimeout,
      cancelRequested: run.cancelRequested,
    },
  }).catch(onWriteFailure);
}
async repairStaleTaskActivityIntervalsBeforeSnapshot(teamName: string): Promise<void> {
if (this.crashRepairedActivityIntervalsByTeam.has(teamName)) {
return;
@ -13787,6 +13844,13 @@ export class TeamProvisioningService {
providerId: TeamProviderId;
joinedAt: number;
bootstrapExpectedAfter: string;
backendType?: 'tmux' | 'process';
runtimePid?: number;
bootstrapRuntimeEventsPath?: string;
bootstrapProofToken?: string;
bootstrapRunId?: string;
bootstrapContextHash?: string;
bootstrapBriefingHash?: string;
}): Promise<void> {
const configPath = path.join(getTeamsBasePath(), input.teamName, 'config.json');
const raw = await tryReadRegularFileUtf8(configPath, {
@ -13822,10 +13886,25 @@ export class TeamProvisioningService {
color: input.color,
joinedAt: input.joinedAt,
bootstrapExpectedAfter: input.bootstrapExpectedAfter,
...(input.bootstrapProofToken ? { bootstrapProofToken: input.bootstrapProofToken } : {}),
...(input.bootstrapRunId ? { bootstrapRunId: input.bootstrapRunId } : {}),
...(input.bootstrapRuntimeEventsPath
? { bootstrapRuntimeEventsPath: input.bootstrapRuntimeEventsPath }
: {}),
...(input.bootstrapContextHash
? {
bootstrapProofMode: 'native_app_managed_context',
bootstrapContextHash: input.bootstrapContextHash,
}
: {}),
...(input.bootstrapBriefingHash
? { bootstrapBriefingHash: input.bootstrapBriefingHash }
: {}),
tmuxPaneId: input.paneId,
...(typeof input.runtimePid === 'number' ? { runtimePid: input.runtimePid } : {}),
cwd: input.cwd,
subscriptions: Array.isArray(existing.subscriptions) ? existing.subscriptions : [],
backendType: 'tmux',
backendType: input.backendType ?? 'tmux',
};
if (existingIndex >= 0) {
@ -14047,6 +14126,378 @@ export class TeamProvisioningService {
this.setMemberSpawnStatus(input.run, input.memberName, 'waiting');
}
private async launchDirectProcessMemberRestart(input: {
run: ProvisioningRun;
teamName: string;
displayName: string;
leadName: string;
memberName: string;
config: TeamConfig;
configuredMember: NonNullable<
ReturnType<TeamProvisioningService['resolveEffectiveConfiguredMember']>
>;
persistedRuntimeMembers: readonly PersistedRuntimeMemberLike[];
}): Promise<void> {
const providerId = resolveTeamProviderId(input.configuredMember.providerId);
const claudePath = input.run.spawnContext?.claudePath ?? (await ClaudeBinaryResolver.resolve());
if (!claudePath) {
throw new Error('Claude CLI not found; install it or provide a valid path');
}
const cwd = this.resolveDirectRestartRuntimeCwd({
configuredMember: input.configuredMember,
persistedRuntimeMembers: input.persistedRuntimeMembers,
config: input.config,
run: input.run,
});
await ensureCwdExists(cwd);
const provisioningEnv = await this.buildProvisioningEnv(
providerId,
input.configuredMember.providerBackendId,
{
teamRuntimeAuth: {
teamName: input.teamName,
authMaterialId: `${input.run.runId}-process-restart-${input.configuredMember.name}-${randomUUID()}`,
allowAnthropicApiKeyHelper: true,
},
}
);
if (provisioningEnv.warning) {
throw new Error(provisioningEnv.warning);
}
const mcpConfigPath = await this.mcpConfigBuilder.writeConfigFile(cwd);
const agentId = `${input.configuredMember.name}@${input.teamName}`;
const color =
input.config.members
?.find((member) => matchesExactTeamMemberName(member.name, input.memberName))
?.color?.trim() || getMemberColorByName(input.configuredMember.name);
const parentSessionId =
input.run.detectedSessionId?.trim() || input.config.leadSessionId?.trim() || input.run.runId;
const memberSpec: TeamCreateRequest['members'][number] = {
name: input.configuredMember.name,
...(input.configuredMember.role ? { role: input.configuredMember.role } : {}),
...(input.configuredMember.workflow ? { workflow: input.configuredMember.workflow } : {}),
...(input.configuredMember.providerId
? { providerId: input.configuredMember.providerId }
: {}),
...(input.configuredMember.providerBackendId
? { providerBackendId: input.configuredMember.providerBackendId }
: {}),
...(input.configuredMember.model ? { model: input.configuredMember.model } : {}),
...(input.configuredMember.effort ? { effort: input.configuredMember.effort } : {}),
...(input.configuredMember.agentType ? { agentType: input.configuredMember.agentType } : {}),
...(input.configuredMember.isolation === 'worktree'
? { isolation: 'worktree' as const }
: {}),
...(input.configuredMember.cwd ? { cwd: input.configuredMember.cwd } : {}),
};
const prompt = buildMemberSpawnPrompt(
memberSpec,
input.displayName,
input.teamName,
input.leadName,
{
restart: true,
}
);
const bootstrapExpectedAfter = nowIso();
const bootstrapProofToken = randomUUID();
const runtimePaths = this.getDirectProcessRestartRuntimePaths(
input.teamName,
input.configuredMember.name
);
await fs.promises.mkdir(runtimePaths.dir, { recursive: true });
await fs.promises.writeFile(runtimePaths.eventsPath, '', { encoding: 'utf8', mode: 0o600 });
const nativeBootstrapSpec =
(
await buildNativeAppManagedBootstrapSpecs({
teamName: input.teamName,
cwd,
members: [memberSpec],
})
).get(input.configuredMember.name) ?? null;
const nativeBootstrapEnv = await this.materializeDirectProcessNativeBootstrapContext({
teamName: input.teamName,
memberName: input.configuredMember.name,
agentId,
providerId,
runId: input.run.runId,
bootstrapProofToken,
spec: nativeBootstrapSpec,
});
const runtimeArgsPlan = await this.buildTeamRuntimeLaunchArgsPlan({
teamName: input.teamName,
providerId,
launchIdentity: null,
envResolution: provisioningEnv,
extraArgs: [],
includeAnthropicHelper: providerId === 'anthropic',
contextLabel: `Direct process teammate restart (${input.configuredMember.name})`,
});
const runtimeArgs = mergeJsonSettingsArgs([
'--teammate-runtime',
'headless',
'--agent-id',
agentId,
'--agent-name',
input.configuredMember.name,
'--team-name',
input.teamName,
'--agent-color',
color,
'--parent-session-id',
parentSessionId,
...(input.configuredMember.agentType
? ['--agent-type', input.configuredMember.agentType]
: []),
'--mcp-config',
mcpConfigPath,
'--strict-mcp-config',
'--disallowedTools',
APP_TEAM_RUNTIME_DISALLOWED_TOOLS,
...(input.run.request.skipPermissions !== false
? ['--dangerously-skip-permissions', '--permission-mode', 'bypassPermissions']
: ['--permission-prompt-tool', 'stdio', '--permission-mode', 'default']),
...(input.configuredMember.model ? ['--model', input.configuredMember.model] : []),
...(input.configuredMember.effort ? ['--effort', input.configuredMember.effort] : []),
...runtimeArgsPlan.fastModeArgs,
...runtimeArgsPlan.runtimeTurnSettledHookArgs,
...runtimeArgsPlan.providerArgs,
...runtimeArgsPlan.settingsArgs,
]);
const stdoutLog = fs.createWriteStream(runtimePaths.stdoutPath, { flags: 'a', mode: 0o600 });
const stderrLog = fs.createWriteStream(runtimePaths.stderrPath, { flags: 'a', mode: 0o600 });
const child = spawnCli(claudePath, runtimeArgs, {
cwd,
detached: true,
env: {
...provisioningEnv.env,
...nativeBootstrapEnv,
[TEAMMATE_RUNTIME_ENV]: 'headless',
[TEAMMATE_RUNTIME_EVENTS_ENV]: runtimePaths.eventsPath,
[TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV]: bootstrapProofToken,
},
stdio: ['pipe', 'pipe', 'pipe'],
});
if (!child.pid) {
stdoutLog.destroy();
stderrLog.destroy();
throw new Error(`Failed to spawn teammate process for ${agentId}: missing pid`);
}
const runtimePid = child.pid;
const processPaneId = `process:${runtimePid}`;
child.stdout?.pipe(stdoutLog);
child.stderr?.pipe(stderrLog);
child.stdin?.on('error', (error) => {
logger.debug(
`[${input.teamName}] Direct process restart stdin failed for ${agentId}: ${error.message}`
);
});
child.once('close', (code, signal) => {
void this.appendDirectProcessRuntimeEvent({
type: 'exited',
eventsPath: runtimePaths.eventsPath,
pid: runtimePid,
teamName: input.teamName,
agentName: input.configuredMember.name,
agentId,
runId: parentSessionId,
bootstrapRunId: input.run.runId,
source: 'TeamProvisioningService.direct_process_restart',
detail:
code !== null
? `process exited with code ${code}`
: signal
? `process exited from signal ${signal}`
: 'process exited',
});
stdoutLog.end();
stderrLog.end();
});
child.once('error', (error) => {
void this.appendDirectProcessRuntimeEvent({
type: 'failed',
eventsPath: runtimePaths.eventsPath,
pid: runtimePid,
teamName: input.teamName,
agentName: input.configuredMember.name,
agentId,
runId: parentSessionId,
bootstrapRunId: input.run.runId,
source: 'TeamProvisioningService.direct_process_restart',
detail: `process error: ${error.message}`,
});
});
(child.stdin as { unref?: () => void } | null)?.unref?.();
(child.stdout as { unref?: () => void } | null)?.unref?.();
(child.stderr as { unref?: () => void } | null)?.unref?.();
child.unref();
await this.appendDirectProcessRuntimeEvent({
type: 'process_spawned',
eventsPath: runtimePaths.eventsPath,
pid: runtimePid,
teamName: input.teamName,
agentName: input.configuredMember.name,
agentId,
runId: parentSessionId,
bootstrapRunId: input.run.runId,
source: 'TeamProvisioningService.direct_process_restart',
detail: 'process spawned',
});
await this.appendDirectProcessRuntimeEvent({
type: 'stdout_attached',
eventsPath: runtimePaths.eventsPath,
pid: runtimePid,
teamName: input.teamName,
agentName: input.configuredMember.name,
agentId,
runId: parentSessionId,
bootstrapRunId: input.run.runId,
source: 'TeamProvisioningService.direct_process_restart',
detail: 'stdout and stderr attached',
});
await this.updateDirectTmuxRestartMemberConfig({
teamName: input.teamName,
memberName: input.memberName,
member: input.configuredMember,
agentId,
color,
prompt,
paneId: processPaneId,
cwd,
providerId,
joinedAt: Date.now(),
bootstrapExpectedAfter,
backendType: 'process',
runtimePid,
bootstrapRuntimeEventsPath: runtimePaths.eventsPath,
bootstrapProofToken,
bootstrapRunId: input.run.runId,
...(nativeBootstrapSpec
? {
bootstrapContextHash: nativeBootstrapSpec.contextHash,
bootstrapBriefingHash: nativeBootstrapSpec.briefingHash,
}
: {}),
});
this.enqueueDirectRestartPrompt({
teamName: input.teamName,
memberName: input.configuredMember.name,
leadName: input.leadName,
leadSessionId: parentSessionId,
prompt,
});
await this.appendDirectProcessRuntimeEvent({
type: 'mailbox_bootstrap_written',
eventsPath: runtimePaths.eventsPath,
pid: runtimePid,
teamName: input.teamName,
agentName: input.configuredMember.name,
agentId,
runId: parentSessionId,
bootstrapRunId: input.run.runId,
source: 'TeamProvisioningService.direct_process_restart',
});
this.appendMemberBootstrapDiagnostic(
input.run,
input.memberName,
`restart process spawned with pid ${runtimePid}`
);
this.setMemberSpawnStatus(input.run, input.memberName, 'waiting');
}
/**
 * Computes the file locations used by a directly restarted process teammate.
 *
 * All three files live in the team's runtime-events directory and share a
 * prefix derived from the member name.
 *
 * @param teamName - Team whose runtime-events directory is used.
 * @param memberName - Member name, sanitized into the shared file prefix.
 * @returns The directory plus the runtime event journal, stdout log and stderr log paths.
 */
private getDirectProcessRestartRuntimePaths(
  teamName: string,
  memberName: string
): { dir: string; eventsPath: string; stdoutPath: string; stderrPath: string } {
  const runtimeDir = getTeamRuntimeEventsDir(teamName);
  const prefix = sanitizeProcessRuntimeEventFilePrefix(memberName);
  const inRuntimeDir = (suffix: string): string => path.join(runtimeDir, `${prefix}${suffix}`);
  return {
    dir: runtimeDir,
    eventsPath: inRuntimeDir('.runtime.jsonl'),
    stdoutPath: inRuntimeDir('.stdout.log'),
    stderrPath: inRuntimeDir('.stderr.log'),
  };
}
/**
 * Persists the native app-managed bootstrap context for a process teammate and
 * returns the environment entry that points at it.
 *
 * Nothing is written (and an empty record is returned) when no spec is given or
 * the provider is neither `anthropic` nor `codex`.
 *
 * The context is written to a `.tmp` sibling first and then renamed into place,
 * so a reader never observes a partially written file. Because the payload
 * contains the bootstrap proof token, the temp file is removed again when the
 * write or rename fails instead of being left behind.
 *
 * @param input - Identity of the teammate/run plus the bootstrap spec to persist.
 * @returns A single-entry record keyed by `NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV`
 *   whose value is the final context file path, or `{}` when nothing was written.
 * @throws Re-throws any filesystem error from creating the directory, writing, or renaming.
 */
private async materializeDirectProcessNativeBootstrapContext(input: {
  teamName: string;
  memberName: string;
  agentId: string;
  providerId: TeamProviderId;
  runId: string;
  bootstrapProofToken: string;
  spec: NativeAppManagedBootstrapSpec | null;
}): Promise<Record<string, string>> {
  if (!input.spec || (input.providerId !== 'anthropic' && input.providerId !== 'codex')) {
    return {};
  }
  // Explicit fields come after the spread so they win over same-named spec keys.
  const context = {
    ...input.spec,
    kind: 'native_app_managed_bootstrap',
    teamName: input.teamName,
    memberName: input.memberName,
    agentId: input.agentId,
    runId: input.runId,
    provider: input.providerId,
    bootstrapProofToken: input.bootstrapProofToken,
  };
  const dir = path.join(getTeamRuntimeEventsDir(input.teamName), 'native-bootstrap');
  await fs.promises.mkdir(dir, { recursive: true });
  // A random suffix keeps each restart's context file distinct.
  const finalPath = path.join(
    dir,
    `${sanitizeProcessRuntimeEventFilePrefix(input.memberName)}-${randomUUID()}.native-bootstrap.json`
  );
  const tempPath = `${finalPath}.tmp`;
  try {
    await fs.promises.writeFile(tempPath, JSON.stringify(context), {
      encoding: 'utf8',
      mode: 0o600,
    });
    await fs.promises.rename(tempPath, finalPath);
  } catch (error) {
    // Best-effort: do not leave a token-bearing temp file around; the original error still wins.
    await fs.promises.unlink(tempPath).catch(() => undefined);
    throw error;
  }
  return { [NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV]: finalPath };
}
/**
 * Appends one JSON line describing a process-runtime event to the member's event journal.
 *
 * The parent directory of `eventsPath` is created on demand. Each record carries
 * `version: 1`, the current ISO timestamp, and the identifying fields from `input`;
 * `detail` is included only when it is a non-empty string.
 *
 * @param input - Event type, journal path, and the identifiers stamped on the record.
 * @throws Propagates filesystem errors from creating the directory or appending.
 */
private async appendDirectProcessRuntimeEvent(input: {
  type: string;
  eventsPath: string;
  pid: number;
  teamName: string;
  agentName: string;
  agentId: string;
  runId: string;
  bootstrapRunId: string;
  source: string;
  detail?: string;
}): Promise<void> {
  const { eventsPath, detail } = input;
  await fs.promises.mkdir(path.dirname(eventsPath), { recursive: true });
  // Built after mkdir so the timestamp is taken right before the append.
  const record: Record<string, unknown> = {
    version: 1,
    type: input.type,
    timestamp: nowIso(),
    pid: input.pid,
    teamName: input.teamName,
    agentName: input.agentName,
    agentId: input.agentId,
    runId: input.runId,
    bootstrapRunId: input.bootstrapRunId,
    source: input.source,
  };
  if (detail) {
    record.detail = detail;
  }
  const line = `${JSON.stringify(record)}\n`;
  await fs.promises.appendFile(eventsPath, line, { encoding: 'utf8', mode: 0o600 });
}
/**
 * Builds a case- and whitespace-insensitive key for a (team, member) pair.
 *
 * Both names are trimmed and lower-cased, then joined with a NUL separator.
 */
private getMemberLifecycleOperationKey(teamName: string, memberName: string): string {
  const normalize = (value: string): string => value.trim().toLowerCase();
  return [normalize(teamName), normalize(memberName)].join('\u0000');
}
@ -14408,6 +14859,38 @@ export class TeamProvisioningService {
}
}
const shouldDirectProcessRestart = backendTypes.has('process') || livePids.size > 0;
if (shouldDirectProcessRestart) {
try {
await this.launchDirectProcessMemberRestart({
run,
teamName,
displayName: config?.name?.trim() || teamName,
leadName,
memberName,
config,
configuredMember,
persistedRuntimeMembers,
});
return;
} catch (error) {
run.pendingMemberRestarts.delete(memberName);
this.setMemberSpawnStatus(
run,
memberName,
'error',
error instanceof Error ? error.message : String(error)
);
if (run.isLaunch) {
await this.persistLaunchStateSnapshot(
run,
run.provisioningComplete ? 'finished' : 'active'
);
}
throw error;
}
}
const restartMessage = buildRestartMemberSpawnMessage(
teamName,
config?.name?.trim() || teamName,
@ -29518,6 +30001,14 @@ export class TeamProvisioningService {
}
);
run.onProgress(progress);
if (hasSpawnFailures) {
this.writeLaunchFailureArtifactPackBestEffort(run, {
reason: run.isLaunch
? 'launch_completed_with_teammate_errors'
: 'provisioning_completed_with_teammate_errors',
launchSnapshot: persistedLaunchSnapshot,
});
}
this.provisioningRunByTeam.delete(run.teamName);
this.aliveRunByTeam.set(run.teamName, run.runId);
logger.info(`[${run.teamName}] Provisioning complete. Process alive for subsequent tasks.`);
@ -30111,6 +30602,18 @@ export class TeamProvisioningService {
});
void this.persistLaunchStateSnapshot(run, 'finished');
}
if (
!hasNewerTrackedRun &&
(run.progress.state === 'failed' ||
(run.isLaunch && !run.provisioningComplete && !run.cancelRequested))
) {
this.writeLaunchFailureArtifactPackBestEffort(run, {
reason:
run.progress.state === 'failed'
? 'launch_progress_failed'
: 'launch_cleanup_unconfirmed_bootstrap',
});
}
this.resetRuntimeToolActivity(run);
this.setLeadActivity(run, 'offline');
run.pendingDirectCrossTeamSendRefresh = false;

View file

@ -10,6 +10,7 @@ import {
getTeamsBasePath,
setClaudeBasePathOverride,
} from '../../../../src/main/utils/pathDecoder';
import { killProcessByPid } from '../../../../src/main/utils/processKill';
import {
createOpenCodeLiveHarness,
waitForOpenCodeLanesStopped,
@ -131,8 +132,7 @@ liveDescribe('Mixed provider team launch live e2e', () => {
afterEach(async () => {
const keepProcesses = process.env.MIXED_PROVIDER_TEAM_LIVE_KEEP_PROCESSES === '1';
if (!keepProcesses && harness && teamName) {
await harness.svc.stopTeam(teamName).catch(() => undefined);
await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined);
await cleanupMixedProviderSmokeTeam(harness, teamName);
}
if (!keepProcesses && usingAnthropicSubscriptionAuth && teamName) {
await fs.rm(path.join(getTeamsBasePath(), teamName), { recursive: true, force: true });
@ -531,6 +531,51 @@ async function removeTempDirWithRetries(dirPath: string): Promise<void> {
}
}
/**
 * Tears down a mixed-provider smoke team.
 *
 * A runtime snapshot is captured before the stop request so process teammates
 * known at that point can still be terminated afterwards; a second snapshot
 * after the stop covers anything still reported then. Snapshot, stop and
 * lane-wait failures are swallowed because this is best-effort cleanup.
 *
 * @param harness - Live harness whose service owns the team.
 * @param teamName - Name of the smoke team to stop.
 */
async function cleanupMixedProviderSmokeTeam(
  harness: OpenCodeLiveHarness,
  teamName: string
): Promise<void> {
  const { svc } = harness;
  const captureSnapshot = () => svc.getTeamAgentRuntimeSnapshot(teamName).catch(() => null);

  const snapshotBeforeStop = await captureSnapshot();
  await svc.stopTeam(teamName).catch(() => undefined);
  await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined);
  await terminateSmokeOwnedProcessBackends(snapshotBeforeStop);

  const snapshotAfterStop = await captureSnapshot();
  await terminateSmokeOwnedProcessBackends(snapshotAfterStop);
}
/**
 * Kills the process-backed, non-OpenCode teammates listed in a runtime snapshot.
 *
 * Only members with `backendType === 'process'` and a provider other than
 * `opencode` are considered. Each distinct positive pid is probed with signal 0
 * and killed only when the probe succeeds.
 *
 * @param snapshot - Runtime snapshot to read pids from; `null` results in a no-op.
 */
async function terminateSmokeOwnedProcessBackends(
  snapshot: Awaited<ReturnType<OpenCodeLiveHarness['svc']['getTeamAgentRuntimeSnapshot']>> | null
): Promise<void> {
  const targetPids = new Set<number>();
  Object.values(snapshot?.members ?? {})
    .filter((member) => member.backendType === 'process' && member.providerId !== 'opencode')
    .forEach((member) => {
      const candidate = member.runtimePid ?? member.pid;
      if (typeof candidate === 'number' && Number.isFinite(candidate) && candidate > 0) {
        targetPids.add(candidate);
      }
    });

  const isAlive = (pid: number): boolean => {
    try {
      process.kill(pid, 0);
      return true;
    } catch {
      return false;
    }
  };

  const killTasks = [...targetPids].map(async (pid) => {
    if (!isAlive(pid)) {
      return;
    }
    try {
      killProcessByPid(pid);
    } catch {
      // Best-effort smoke cleanup. The process may have exited between the liveness probe and kill.
    }
  });
  await Promise.all(killTasks);
}
function formatProgressDump(progressEvents: TeamProvisioningProgress[]): string {
return redactSecrets(
progressEvents

View file

@ -0,0 +1,860 @@
// @vitest-environment node
import { constants as fsConstants, promises as fs } from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { createOpenCodeLiveHarness, waitForOpenCodeLanesStopped, waitUntil } from './openCodeLiveTestHarness';
import {
getTasksBasePath,
getTeamsBasePath,
setClaudeBasePathOverride,
} from '../../../../src/main/utils/pathDecoder';
import { killProcessByPid } from '../../../../src/main/utils/processKill';
import { TeamDataService } from '../../../../src/main/services/team/TeamDataService';
import { TeamProvisioningService } from '../../../../src/main/services/team/TeamProvisioningService';
import { TeamTaskReader } from '../../../../src/main/services/team/TeamTaskReader';
import type {
TeamAgentRuntimeSnapshot,
TeamCreateRequest,
TeamMember,
TeamProviderId,
TeamProvisioningProgress,
} from '../../../../src/shared/types';
// Replace the notification manager with a stub whose `addTeamNotification`
// is a no-op async mock.
vi.mock('../../../../src/main/services/infrastructure/NotificationManager', () => ({
NotificationManager: {
getInstance: () => ({
addTeamNotification: vi.fn(async () => undefined),
}),
},
}));
// The suite is opt-in: it runs only when PROVIDER_LAUNCH_STRESS_LIVE=1 and
// Anthropic auth (subscription mode or an API key) is configured; otherwise it is skipped.
const liveDescribe =
process.env.PROVIDER_LAUNCH_STRESS_LIVE === '1' && hasAnthropicAuthConfigured()
? describe
: describe.skip;
// NOTE(review): machine-specific absolute path; it is only a fallback for
// CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH, which other machines must set.
const DEFAULT_ORCHESTRATOR_CLI = '/Users/belief/dev/projects/claude/agent_teams_orchestrator/cli';
// Default model/effort selections; each can be overridden through a
// PROVIDER_LAUNCH_STRESS_* environment variable (see resolveScenarioSelection).
const DEFAULT_ANTHROPIC_MODEL = 'haiku';
const DEFAULT_CODEX_MODEL = 'gpt-5.4-mini';
const DEFAULT_CODEX_EFFORT = 'low' as const;
const DEFAULT_OPENCODE_MODEL = 'openai/gpt-5.4-mini';
// Scenario order used when PROVIDER_LAUNCH_STRESS_ORDER is unset or has no valid entries.
const DEFAULT_ORDER: ProviderLaunchStressScenario[] = ['anthropic', 'codex', 'opencode', 'mixed'];
// Pool of teammate names; the configured member count is capped at this length.
const MEMBER_NAMES = ['alice', 'bob', 'jack', 'tom', 'atlas', 'nova', 'cody', 'oscar'];
// Timeouts (ms) for restart confirmation and for the post-launch work probe.
const RESTART_CONFIRM_TIMEOUT_MS = 300_000;
const POST_LAUNCH_WORK_TIMEOUT_MS = 300_000;
// Set in beforeEach and cleared in afterEach so module-level helpers can reach
// the current test's temp directory and project path.
let currentStressTempDir = '';
let currentStressProjectPath = '';
type ProviderLaunchStressScenario = 'anthropic' | 'codex' | 'opencode' | 'mixed';
/** Bookkeeping for one launched scenario so it can be cleaned up later. */
interface ActiveScenario {
scenario: ProviderLaunchStressScenario;
teamName: string;
svc: TeamProvisioningService;
// Present only for scenarios that created an OpenCode live harness.
harness?: Awaited<ReturnType<typeof createOpenCodeLiveHarness>>;
// Present only for scenarios that installed the Codex account feature.
codexCleanup?: () => Promise<void>;
// When true, cleanup keeps the team/task files on disk.
failed: boolean;
}
// Live end-to-end stress suite: launches real provider teams, restarts
// teammates, and sends a post-launch work probe for each configured scenario.
liveDescribe('provider launch stress live e2e', () => {
let tempDir: string;
let tempClaudeRoot: string;
let tempHome: string;
let projectPath: string;
// Environment values captured in beforeEach and restored in afterEach.
let previousCliPath: string | undefined;
let previousCliFlavor: string | undefined;
let previousCodexHome: string | undefined;
let previousHome: string | undefined;
let previousUserProfile: string | undefined;
let previousNodeEnv: string | undefined;
let previousAnthropicApiKey: string | undefined;
let previousAnthropicAuthToken: string | undefined;
// `undefined` means there is nothing to restore (non-subscription mode).
let previousClaudeJsonConfig: string | null | undefined;
// Scenarios that still need cleanup; failed scenarios stay here until afterEach.
const activeScenarios: ActiveScenario[] = [];
beforeEach(async () => {
tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'provider-launch-stress-live-'));
// In subscription mode the "Claude root" is the real user home directory,
// not a temp directory, despite the variable name.
tempClaudeRoot = usingAnthropicSubscriptionAuth()
? os.userInfo().homedir
: path.join(tempDir, '.claude');
tempHome = path.join(tempDir, 'home');
projectPath = path.join(tempDir, 'project');
currentStressTempDir = tempDir;
currentStressProjectPath = projectPath;
await fs.mkdir(tempHome, { recursive: true });
await fs.mkdir(projectPath, { recursive: true });
await fs.writeFile(
path.join(projectPath, 'README.md'),
'# Provider launch stress live e2e\n\nKeep this project intentionally tiny.\n',
'utf8'
);
if (usingAnthropicSubscriptionAuth()) {
// Subscription mode: no base path override; the previous config value is
// kept so afterEach can restore it.
setClaudeBasePathOverride(null);
previousClaudeJsonConfig = await upsertTrustedClaudeProjectConfig(
tempClaudeRoot,
projectPath
);
} else {
// Otherwise: point the app at an isolated temp Claude root.
await fs.mkdir(tempClaudeRoot, { recursive: true });
await writeTrustedClaudeConfig(tempClaudeRoot, projectPath);
setClaudeBasePathOverride(tempClaudeRoot);
previousClaudeJsonConfig = undefined;
}
// Snapshot the environment before overriding it.
previousCliPath = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH;
previousCliFlavor = process.env.CLAUDE_TEAM_CLI_FLAVOR;
previousCodexHome = process.env.CODEX_HOME;
previousHome = process.env.HOME;
previousUserProfile = process.env.USERPROFILE;
previousNodeEnv = process.env.NODE_ENV;
previousAnthropicApiKey = process.env.ANTHROPIC_API_KEY;
previousAnthropicAuthToken = process.env.ANTHROPIC_AUTH_TOKEN;
process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH =
process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim() || DEFAULT_ORCHESTRATOR_CLI;
process.env.CLAUDE_TEAM_CLI_FLAVOR = 'agent_teams_orchestrator';
process.env.CODEX_HOME = resolveConnectedCodexHome(previousCodexHome);
process.env.HOME = usingAnthropicSubscriptionAuth() ? os.userInfo().homedir : tempHome;
process.env.USERPROFILE = usingAnthropicSubscriptionAuth() ? os.userInfo().homedir : tempHome;
process.env.NODE_ENV = 'production';
if (usingAnthropicSubscriptionAuth()) {
// Subscription mode removes the API key and auth token from the environment.
delete process.env.ANTHROPIC_API_KEY;
delete process.env.ANTHROPIC_AUTH_TOKEN;
}
});
afterEach(async () => {
// Clean up whatever is still tracked, newest first; failed scenarios keep their files.
for (const active of [...activeScenarios].reverse()) {
await cleanupActiveScenario(active, { preserveFiles: active.failed }).catch(() => undefined);
}
activeScenarios.length = 0;
discardKnownProviderLaunchStressWarnings();
if (usingAnthropicSubscriptionAuth() && previousClaudeJsonConfig !== undefined) {
await restoreClaudeJsonConfig(tempClaudeRoot, previousClaudeJsonConfig);
}
setClaudeBasePathOverride(null);
restoreEnv('CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH', previousCliPath);
restoreEnv('CLAUDE_TEAM_CLI_FLAVOR', previousCliFlavor);
restoreEnv('CODEX_HOME', previousCodexHome);
restoreEnv('HOME', previousHome);
restoreEnv('USERPROFILE', previousUserProfile);
restoreEnv('NODE_ENV', previousNodeEnv);
restoreEnv('ANTHROPIC_API_KEY', previousAnthropicApiKey);
restoreEnv('ANTHROPIC_AUTH_TOKEN', previousAnthropicAuthToken);
// PROVIDER_LAUNCH_STRESS_KEEP_TEMP=1 keeps the temp dir for inspection.
if (process.env.PROVIDER_LAUNCH_STRESS_KEEP_TEMP === '1') {
process.stderr.write(`[ProviderLaunchStress.live] preserved temp dir: ${tempDir}\n`);
} else {
await fs.rm(tempDir, { recursive: true, force: true });
}
currentStressTempDir = '';
currentStressProjectPath = '';
}, 240_000);
it(
// The title says five teammates, but the count actually comes from
// getStressMemberCount(), which honours PROVIDER_LAUNCH_STRESS_MEMBER_COUNT.
'launches, restarts, and exercises post-launch work for provider teams with five teammates each',
async () => {
// Fail fast if the orchestrator CLI or Codex subscription auth is missing.
const orchestratorCli = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim();
expect(orchestratorCli).toBeTruthy();
await assertExecutable(orchestratorCli!);
await assertCodexSubscriptionAuthAvailable(process.env.CODEX_HOME!);
// Scenarios run one after another, never in parallel.
for (const scenario of getStressOrder()) {
await runProviderStressScenario(scenario, activeScenarios);
}
},
30 * 60_000
);
});
/**
 * Runs one full stress scenario: create the team, wait until every teammate is
 * confirmed and alive, then run the restart checks and the post-launch work probe.
 *
 * The scenario is registered in `activeScenarios` before the team is created.
 * On success it is cleaned up here and removed from the list; on failure it is
 * marked `failed` and left in the list for the caller to clean up.
 *
 * @param scenario - Provider scenario to exercise.
 * @param activeScenarios - Shared list of scenarios that still need cleanup.
 * @throws Re-throws any setup, launch, restart, or work-probe failure.
 */
async function runProviderStressScenario(
scenario: ProviderLaunchStressScenario,
activeScenarios: ActiveScenario[]
): Promise<void> {
const selected = resolveScenarioSelection(scenario);
const memberCount = getStressMemberCount();
const teamName = `provider-stress-${scenario}-${Date.now()}`;
const progressEvents: TeamProvisioningProgress[] = [];
process.stderr.write(
`[ProviderLaunchStress.live] starting ${scenario} with ${memberCount} teammates\n`
);
let codexCleanup: (() => Promise<void>) | undefined;
let harness: Awaited<ReturnType<typeof createOpenCodeLiveHarness>> | undefined;
try {
// The Codex account feature is only needed when Codex teammates are involved.
codexCleanup =
scenario === 'codex' || scenario === 'mixed' ? await installCodexAccountFeature() : undefined;
// The OpenCode harness is only needed when OpenCode teammates are involved.
harness =
scenario === 'opencode' || scenario === 'mixed'
? await createOpenCodeLiveHarness({
tempDir: currentStressTempDir,
selectedModel: selected.openCodeModel,
projectPath: projectPathForStress(),
})
: undefined;
} catch (error) {
// Setup failed part-way: undo whatever was created before rethrowing.
await harness?.dispose().catch(() => undefined);
await codexCleanup?.().catch(() => undefined);
throw error;
}
// Reuse the harness-owned service when there is one; otherwise create a fresh service.
const svc = harness?.svc ?? new TeamProvisioningService();
const active: ActiveScenario = { scenario, teamName, svc, harness, codexCleanup, failed: false };
activeScenarios.push(active);
try {
await svc.createTeam(
buildStressCreateRequest({
scenario,
teamName,
memberCount,
selection: selected,
}),
(progress) => progressEvents.push(progress)
);
// Phase 1: the latest progress event reports `ready` (a `failed` event aborts).
// NOTE(review): assumes waitUntil propagates an error thrown by the predicate — confirm.
await waitUntil(async () => {
const last = progressEvents.at(-1);
if (last?.state === 'failed') {
active.failed = true;
throw new Error(await formatStressDiagnostics(svc, teamName, progressEvents));
}
return last?.state === 'ready';
}, 420_000);
const expectedMembers = buildExpectedMemberNames(memberCount);
// Phase 2: every expected teammate is online, confirmed alive, and bootstrap-confirmed.
await waitUntil(async () => {
const statuses = await svc.getMemberSpawnStatuses(teamName);
if (statuses.teamLaunchState === 'partial_failure') {
active.failed = true;
throw new Error(await formatStressDiagnostics(svc, teamName, progressEvents));
}
return expectedMembers.every((memberName) => {
const entry = statuses.statuses[memberName];
return (
entry?.status === 'online' &&
entry.launchState === 'confirmed_alive' &&
entry.bootstrapConfirmed === true
);
});
}, 240_000);
// Phase 3: the runtime snapshot reports every expected teammate as alive.
await waitUntil(async () => {
const snapshot = await svc.getTeamAgentRuntimeSnapshot(teamName);
return expectedMembers.every((memberName) => snapshot.members[memberName]?.alive === true);
}, 180_000);
process.stderr.write(`[ProviderLaunchStress.live] ${scenario} confirmed all teammates\n`);
await runRestartStressChecks(active, expectedMembers, progressEvents);
await runPostLaunchWorkProofCheck(active, expectedMembers, progressEvents);
} catch (error) {
active.failed = true;
throw error;
} finally {
// Only successful scenarios are cleaned up here; failed ones stay registered
// in activeScenarios.
if (!active.failed) {
await cleanupActiveScenario(active, { preserveFiles: false });
const index = activeScenarios.indexOf(active);
if (index >= 0) activeScenarios.splice(index, 1);
}
}
}
/**
 * Restarts the scenario's selected teammates one at a time and verifies that
 * each comes back confirmed and alive, then re-checks that the whole team is
 * still confirmed.
 *
 * @param active - Scenario whose service and team are exercised.
 * @param expectedMembers - Names of every teammate expected in the team.
 * @param progressEvents - Progress events collected so far, used for diagnostics.
 * @throws Error prefixed with "Restart stress failed for …" when a restart or its
 *   confirmation fails, or the error from the final team-wide check.
 */
async function runRestartStressChecks(
active: ActiveScenario,
expectedMembers: string[],
progressEvents: TeamProvisioningProgress[]
): Promise<void> {
const targets = resolveRestartStressTargets(active.scenario, expectedMembers);
for (const memberName of targets) {
process.stderr.write(
`[ProviderLaunchStress.live] restarting ${active.scenario}/${memberName}\n`
);
try {
await active.svc.restartMember(active.teamName, memberName);
// First wait: spawn status returns to online/confirmed; an error status aborts the wait.
await waitForStressCondition(
`restart ${active.teamName}/${memberName}`,
async () => {
const statuses = await active.svc.getMemberSpawnStatuses(active.teamName);
const entry = statuses.statuses[memberName];
if (entry?.status === 'error' || entry?.launchState === 'failed_to_start') {
throw new Error(
`restart ${memberName} failed: ${entry.hardFailureReason ?? entry.error ?? 'unknown'}`
);
}
return (
entry?.status === 'online' &&
entry.launchState === 'confirmed_alive' &&
entry.bootstrapConfirmed === true
);
},
RESTART_CONFIRM_TIMEOUT_MS,
2_000,
() => formatStressDiagnostics(active.svc, active.teamName, progressEvents)
);
// Second wait: the runtime snapshot reports the restarted teammate as alive.
await waitForStressCondition(
`runtime alive after restart ${active.teamName}/${memberName}`,
async () => {
const snapshot = await active.svc.getTeamAgentRuntimeSnapshot(active.teamName);
return snapshot.members[memberName]?.alive === true;
},
120_000,
2_000,
() => formatStressDiagnostics(active.svc, active.teamName, progressEvents)
);
} catch (error) {
// Add scenario/member context; the new Error carries only the original message.
throw new Error(
`Restart stress failed for ${active.scenario}/${memberName}: ${error instanceof Error ? error.message : String(error)}`
);
}
}
// Restarting one teammate must not have knocked the others out of the confirmed state.
await waitForStressCondition(
`all teammates still confirmed after restarts ${active.teamName}`,
async () => {
const statuses = await active.svc.getMemberSpawnStatuses(active.teamName);
return expectedMembers.every((memberName) => {
const entry = statuses.statuses[memberName];
return (
entry?.status === 'online' &&
entry.launchState === 'confirmed_alive' &&
entry.bootstrapConfirmed === true
);
});
},
120_000,
2_000,
() => formatStressDiagnostics(active.svc, active.teamName, progressEvents)
);
process.stderr.write(`[ProviderLaunchStress.live] ${active.scenario} restart checks passed\n`);
}
/**
 * Proves that a launched teammate can still do work: creates a task owned by
 * one teammate, relays that teammate's inbox, and waits until the task carries
 * the marker comment or is completed.
 *
 * @param active - Scenario whose service and team are exercised.
 * @param expectedMembers - Names of every teammate expected in the team.
 * @param progressEvents - Progress events collected so far, used for diagnostics.
 * @throws Error when the relay result is not accepted or the proof does not
 *   appear within POST_LAUNCH_WORK_TIMEOUT_MS.
 */
async function runPostLaunchWorkProofCheck(
active: ActiveScenario,
expectedMembers: string[],
progressEvents: TeamProvisioningProgress[]
): Promise<void> {
const memberName = resolvePostLaunchWorkTarget(active.scenario, expectedMembers);
// Timestamped marker embedded in the task subject, the prompt, and the expected comment.
const marker = `provider-stress-${active.scenario}-${Date.now()}`;
const teamDataService = new TeamDataService();
const taskReader = new TeamTaskReader();
process.stderr.write(
`[ProviderLaunchStress.live] sending post-launch work probe to ${active.scenario}/${memberName}\n`
);
const task = await teamDataService.createTask(active.teamName, {
subject: `Provider launch stress proof ${marker}`,
owner: memberName,
startImmediately: true,
prompt: [
`This is a live provider launch stress validation. Marker: ${marker}.`,
'Do not edit files.',
'Add one task comment containing exactly:',
`${marker}:done`,
'Then mark this task complete.',
'After that stop. Do not send a separate user-visible chat reply.',
].join('\n'),
});
// Relay the teammate's inbox and require a result that counts as accepted.
const relay = await active.svc.relayInboxFileToLiveRecipient(active.teamName, memberName);
if (!isAcceptedStressRelayResult(relay)) {
throw new Error(
`Post-launch work probe was not relayed to ${memberName}; relay result: ${JSON.stringify(relay)}`
);
}
await waitForStressCondition(
`post-launch work proof ${active.teamName}/${memberName}/${task.id}`,
async () => {
const tasks = await taskReader.getTasks(active.teamName);
const current = tasks.find((candidate) => candidate.id === task.id);
if (!current) return false;
// Either signal is accepted as proof: the marker comment or a completed status.
const hasMarkerComment = current.comments?.some((comment) =>
comment.text.includes(`${marker}:done`)
);
return Boolean(hasMarkerComment || current.status === 'completed');
},
POST_LAUNCH_WORK_TIMEOUT_MS,
2_000,
() => formatStressDiagnostics(active.svc, active.teamName, progressEvents)
);
process.stderr.write(`[ProviderLaunchStress.live] ${active.scenario} post-launch work passed\n`);
}
/**
 * Decides whether an inbox relay result counts as "the probe reached the teammate".
 *
 * Accepted when the relay was a native-member no-op, when at least one message
 * was relayed, or when the last delivery is flagged accepted, delivered, or
 * response-pending.
 *
 * @param relay - Result of `relayInboxFileToLiveRecipient`.
 * @returns `true` when any acceptance signal is present.
 */
function isAcceptedStressRelayResult(relay: Awaited<
  ReturnType<TeamProvisioningService['relayInboxFileToLiveRecipient']>
>): boolean {
  if (relay.kind === 'native_member_noop' || relay.relayed > 0) {
    return true;
  }
  const delivery = relay.lastDelivery;
  if (!delivery) {
    return false;
  }
  return (
    delivery.accepted === true || delivery.delivered === true || delivery.responsePending === true
  );
}
/**
 * Picks which teammates to restart for a scenario.
 *
 * - No members, or the pure `opencode` scenario: nothing is restarted.
 * - Single-provider scenarios: the second teammate (or the first if there is only one).
 * - `mixed`: the first teammate of each provider, in anthropic → codex → opencode order.
 *
 * @param scenario - Scenario being exercised.
 * @param expectedMembers - Teammate names in launch order.
 * @returns Names of the teammates to restart.
 */
function resolveRestartStressTargets(
  scenario: ProviderLaunchStressScenario,
  expectedMembers: string[]
): string[] {
  // Pure OpenCode launch can finish without a tracked lead run. Per-member
  // restart for OpenCode is covered by the mixed secondary-lane scenario,
  // where the app owns the live run and can reattach the OpenCode lane.
  if (expectedMembers.length === 0 || scenario === 'opencode') {
    return [];
  }
  if (scenario !== 'mixed') {
    return [expectedMembers[1] ?? expectedMembers[0]];
  }
  const providerOrder: TeamProviderId[] = ['anthropic', 'codex', 'opencode'];
  return providerOrder.flatMap((providerId) => {
    const firstOfProvider = expectedMembers.find(
      (_name, position) => resolveStressMemberProvider('mixed', position) === providerId
    );
    return firstOfProvider === undefined ? [] : [firstOfProvider];
  });
}
/**
 * Chooses the teammate that receives the post-launch work probe.
 *
 * In the `mixed` scenario the first OpenCode teammate is preferred. Otherwise,
 * or when there is none, the second teammate is used, then the first, then the
 * literal name `alice`.
 *
 * @param scenario - Scenario being exercised.
 * @param expectedMembers - Teammate names in launch order.
 * @returns Name of the teammate to probe.
 */
function resolvePostLaunchWorkTarget(
  scenario: ProviderLaunchStressScenario,
  expectedMembers: string[]
): string {
  const fallback = expectedMembers[1] ?? expectedMembers[0] ?? 'alice';
  if (scenario !== 'mixed') {
    return fallback;
  }
  const firstOpenCodeMember = expectedMembers.find(
    (_name, position) => resolveStressMemberProvider('mixed', position) === 'opencode'
  );
  return firstOpenCodeMember ?? fallback;
}
/**
 * Polls `predicate` until it resolves to `true`, the deadline passes, or the
 * predicate throws.
 *
 * A throwing predicate aborts the wait immediately and is reported as a failed
 * condition rather than as a timeout, so the error states what actually
 * happened. If `diagnostics` itself throws, the primary failure is still
 * reported, with a placeholder in place of the diagnostics text.
 *
 * @param label - Human-readable name of the awaited condition, embedded in the error message.
 * @param predicate - Async check; resolving `true` ends the wait, throwing aborts it.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 * @param pollMs - Delay between predicate evaluations, in milliseconds.
 * @param diagnostics - Produces extra context appended to the error message.
 * @throws Error "Timed out waiting for …" when the deadline passes, or
 *   "Condition failed while waiting for …" when the predicate throws.
 */
async function waitForStressCondition(
  label: string,
  predicate: () => Promise<boolean>,
  timeoutMs: number,
  pollMs: number,
  diagnostics: () => Promise<string>
): Promise<void> {
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);
  const deadline = Date.now() + timeoutMs;
  // Stays a timeout message unless the predicate throws.
  let headline = `Timed out waiting for ${label} after ${timeoutMs}ms`;
  while (Date.now() < deadline) {
    try {
      if (await predicate()) return;
    } catch (error) {
      // A predicate failure is terminal: stop polling and report it as a failure.
      headline = `Condition failed while waiting for ${label}: ${describeError(error)}`;
      break;
    }
    await new Promise((resolve) => setTimeout(resolve, pollMs));
  }
  let details: string;
  try {
    details = await diagnostics();
  } catch (error) {
    // Never let a diagnostics failure hide the condition that actually failed.
    details = `(diagnostics unavailable: ${describeError(error)})`;
  }
  throw new Error(`${headline}\n${details}`);
}
/**
 * Removes recorded `console.warn` calls that mention the known login-shell
 * environment timeout from the mock's call log.
 *
 * Does nothing when `console.warn` exposes no mock state.
 */
function discardKnownProviderLaunchStressWarnings(): void {
  const warnSpy = vi.mocked(console.warn);
  if (!warnSpy.mock) return;
  const recordedCalls = warnSpy.mock.calls;
  // Walk backwards so splicing never shifts an entry that has not been inspected yet.
  let index = recordedCalls.length;
  while (index-- > 0) {
    const rendered = recordedCalls[index]?.map((value) => String(value)).join(' ') ?? '';
    const isKnownNoise = rendered.includes(
      'Failed to resolve login shell env: shell env resolve timeout'
    );
    if (isKnownNoise) {
      recordedCalls.splice(index, 1);
    }
  }
}
/**
 * Builds the team-creation request for a stress scenario.
 *
 * The lead uses the scenario's own provider, except that `mixed` teams are led
 * by `anthropic`. Codex-only fields (`providerBackendId`, `effort`, `fastMode`)
 * are set only for a Codex lead and are `undefined` otherwise.
 *
 * @param input - Scenario, team name, member count, and the resolved model selection.
 * @returns The request passed to `createTeam`.
 */
function buildStressCreateRequest(input: {
  scenario: ProviderLaunchStressScenario;
  teamName: string;
  memberCount: number;
  selection: ReturnType<typeof resolveScenarioSelection>;
}): TeamCreateRequest {
  const { scenario, selection } = input;
  const members = buildStressMembers(scenario, input.memberCount, selection);
  const providerId: TeamProviderId = scenario === 'mixed' ? 'anthropic' : scenario;
  const leadIsCodex = providerId === 'codex';

  let leadModel = selection.anthropicModel;
  if (leadIsCodex) {
    leadModel = selection.codexModel;
  } else if (providerId === 'opencode') {
    leadModel = selection.openCodeModel;
  }

  return {
    teamName: input.teamName,
    cwd: projectPathForStress(),
    providerId,
    providerBackendId: leadIsCodex ? 'codex-native' : undefined,
    model: leadModel,
    effort: leadIsCodex ? selection.codexEffort : undefined,
    fastMode: leadIsCodex ? 'off' : undefined,
    skipPermissions: true,
    prompt: 'Keep the team idle after bootstrap. Do not start extra work.',
    members,
  };
}
/**
 * Builds the teammate definitions for a stress scenario.
 *
 * Roles alternate Developer/Reviewer by position. Each teammate's provider
 * comes from `resolveStressMemberProvider`, and the model is the selection's
 * entry for that provider. Codex-only fields are `undefined` for other providers.
 *
 * @param scenario - Scenario being exercised.
 * @param memberCount - Number of teammates to build.
 * @param selection - Resolved model/effort selection.
 * @returns One member definition per expected name, in order.
 */
function buildStressMembers(
  scenario: ProviderLaunchStressScenario,
  memberCount: number,
  selection: ReturnType<typeof resolveScenarioSelection>
): TeamMember[] {
  const pickModel = (providerId: TeamProviderId): string => {
    if (providerId === 'codex') return selection.codexModel;
    if (providerId === 'opencode') return selection.openCodeModel;
    return selection.anthropicModel;
  };

  return buildExpectedMemberNames(memberCount).map((name, position) => {
    const providerId = resolveStressMemberProvider(scenario, position);
    const isCodex = providerId === 'codex';
    return {
      name,
      role: position % 2 === 0 ? 'Developer' : 'Reviewer',
      providerId,
      providerBackendId: isCodex ? 'codex-native' : undefined,
      model: pickModel(providerId),
      effort: isCodex ? selection.codexEffort : undefined,
      fastMode: isCodex ? 'off' : undefined,
    };
  });
}
/**
 * Maps a teammate's position to its provider.
 *
 * Single-provider scenarios use that provider for everyone. The `mixed`
 * scenario cycles through anthropic, codex, opencode, anthropic, codex, and
 * falls back to `anthropic` when the index does not address the cycle.
 *
 * @param scenario - Scenario being exercised.
 * @param index - Zero-based teammate position.
 * @returns Provider for the teammate at `index`.
 */
function resolveStressMemberProvider(
  scenario: ProviderLaunchStressScenario,
  index: number
): TeamProviderId {
  if (scenario === 'mixed') {
    const rotation: TeamProviderId[] = ['anthropic', 'codex', 'opencode', 'anthropic', 'codex'];
    return rotation[index % rotation.length] ?? 'anthropic';
  }
  return scenario;
}
/**
 * Resolves the model and effort selection for a stress run from the
 * environment, falling back to the module defaults.
 *
 * Each `PROVIDER_LAUNCH_STRESS_*` override is trimmed; empty values count as
 * unset. The Codex effort is validated against the supported values instead of
 * being cast, so a typo in the environment fails here with a clear message
 * rather than being handed to the launcher as an invalid effort.
 *
 * @param scenario - Scenario being exercised. Currently unused; the selection
 *   is the same for every scenario.
 * @returns The Anthropic, Codex and OpenCode models plus the Codex effort.
 * @throws Error when `PROVIDER_LAUNCH_STRESS_CODEX_EFFORT` is set to an unsupported value.
 */
function resolveScenarioSelection(scenario: ProviderLaunchStressScenario): {
  anthropicModel: string;
  codexModel: string;
  codexEffort: 'low' | 'medium' | 'high' | 'xhigh';
  openCodeModel: string;
} {
  const supportedEfforts = ['low', 'medium', 'high', 'xhigh'] as const;
  const readOverride = (name: string): string | undefined =>
    process.env[name]?.trim() || undefined;

  const requestedEffort: string =
    readOverride('PROVIDER_LAUNCH_STRESS_CODEX_EFFORT') ?? DEFAULT_CODEX_EFFORT;
  const codexEffort = supportedEfforts.find((effort) => effort === requestedEffort);
  if (!codexEffort) {
    throw new Error(
      `Unsupported PROVIDER_LAUNCH_STRESS_CODEX_EFFORT "${requestedEffort}"; expected one of ${supportedEfforts.join(', ')}`
    );
  }

  return {
    anthropicModel: readOverride('PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL') ?? DEFAULT_ANTHROPIC_MODEL,
    codexModel: readOverride('PROVIDER_LAUNCH_STRESS_CODEX_MODEL') ?? DEFAULT_CODEX_MODEL,
    codexEffort,
    openCodeModel: readOverride('PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL') ?? DEFAULT_OPENCODE_MODEL,
  };
}
/**
 * Reads the teammate count from `PROVIDER_LAUNCH_STRESS_MEMBER_COUNT`.
 *
 * Defaults to 5 when the variable is unset, not a number, or not positive, and
 * never exceeds the number of available member names.
 *
 * @returns Number of teammates to launch per scenario.
 */
function getStressMemberCount(): number {
  const requested = Number.parseInt(process.env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT ?? '5', 10);
  if (!Number.isFinite(requested) || requested <= 0) {
    return 5;
  }
  return Math.min(requested, MEMBER_NAMES.length);
}
/**
 * Returns the first `memberCount` names from the member-name pool, in pool order.
 *
 * @param memberCount - Number of names to take.
 * @returns A new array holding the selected names.
 */
function buildExpectedMemberNames(memberCount: number): string[] {
  const selectedNames = MEMBER_NAMES.slice(0, memberCount);
  return selectedNames;
}
/**
 * Reads the scenario order from `PROVIDER_LAUNCH_STRESS_ORDER`, a comma-separated list.
 *
 * Unknown entries are dropped. The default order is used when the variable is
 * unset, blank, or contains no known scenario.
 *
 * @returns Scenarios to run, in order.
 */
function getStressOrder(): ProviderLaunchStressScenario[] {
  const configured = process.env.PROVIDER_LAUNCH_STRESS_ORDER?.trim();
  if (!configured) return DEFAULT_ORDER;

  const isKnownScenario = (item: string): item is ProviderLaunchStressScenario =>
    ['anthropic', 'codex', 'opencode', 'mixed'].includes(item);

  const requested: ProviderLaunchStressScenario[] = [];
  for (const token of configured.split(',')) {
    const candidate = token.trim();
    if (isKnownScenario(candidate)) {
      requested.push(candidate);
    }
  }
  if (requested.length === 0) {
    return DEFAULT_ORDER;
  }
  return requested;
}
/**
 * Returns the project directory that stress teams run in.
 *
 * `PROVIDER_LAUNCH_STRESS_PROJECT_PATH` takes precedence and is resolved to an
 * absolute path; otherwise the path prepared by the test setup is used.
 *
 * @returns Absolute override path or the current setup-provided project path.
 * @throws Error when no override is set and the test setup has not run yet.
 */
function projectPathForStress(): string {
  const override = process.env.PROVIDER_LAUNCH_STRESS_PROJECT_PATH?.trim();
  if (override) {
    return path.resolve(override);
  }
  if (currentStressProjectPath) {
    return currentStressProjectPath;
  }
  throw new Error('Provider launch stress project path requested before test setup');
}
/**
 * Stops a scenario's team and releases everything the scenario created.
 *
 * Order: snapshot → stop team → (with a harness) wait for OpenCode lanes →
 * kill process teammates from the pre-stop snapshot → snapshot again and kill
 * what is still listed → dispose the harness → undo the Codex feature →
 * optionally delete the team and task directories.
 *
 * Service, harness and Codex cleanup failures are swallowed; directory removal
 * errors are not.
 *
 * @param active - Scenario to clean up.
 * @param options - `preserveFiles: true` keeps the team/task directories on disk.
 */
async function cleanupActiveScenario(
  active: ActiveScenario,
  options: { preserveFiles: boolean }
): Promise<void> {
  const { svc, teamName, harness } = active;
  const captureSnapshot = () => svc.getTeamAgentRuntimeSnapshot(teamName).catch(() => null);

  const snapshotBeforeStop = await captureSnapshot();
  await svc.stopTeam(teamName).catch(() => undefined);
  if (harness) {
    await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined);
  }
  await terminateProcessBackends(snapshotBeforeStop);

  const snapshotAfterStop = await captureSnapshot();
  await terminateProcessBackends(snapshotAfterStop);

  await harness?.dispose().catch(() => undefined);
  await active.codexCleanup?.().catch(() => undefined);

  if (options.preserveFiles) {
    return;
  }
  await fs.rm(path.join(getTeamsBasePath(), teamName), { recursive: true, force: true });
  await fs.rm(path.join(getTasksBasePath(), teamName), { recursive: true, force: true });
}
/**
 * Kills the process-backed, non-OpenCode teammates listed in a runtime snapshot.
 *
 * Each distinct positive pid is probed with signal 0 first; a failed probe or a
 * failed kill is ignored.
 *
 * @param snapshot - Runtime snapshot to read pids from; `null` results in a no-op.
 */
async function terminateProcessBackends(snapshot: TeamAgentRuntimeSnapshot | null): Promise<void> {
  const targetPids = new Set<number>();
  Object.values(snapshot?.members ?? {}).forEach((member) => {
    const isOwnedProcess = member.backendType === 'process' && member.providerId !== 'opencode';
    if (!isOwnedProcess) return;
    const candidate = member.runtimePid ?? member.pid;
    if (typeof candidate === 'number' && Number.isFinite(candidate) && candidate > 0) {
      targetPids.add(candidate);
    }
  });
  targetPids.forEach((pid) => {
    try {
      // Signal 0 only checks that the process exists; it throws when it does not.
      process.kill(pid, 0);
      killProcessByPid(pid);
    } catch {
      // Best-effort cleanup; the runtime may already be gone.
    }
  });
}
/**
 * Creates a Codex account feature preferring ChatGPT auth and registers it on
 * the shared provider connection service.
 *
 * The feature receives a silent logger and a config manager that always reports
 * `preferredAuthMode: 'chatgpt'` for Codex.
 *
 * @returns An async cleanup function that unregisters the feature and disposes
 *   it, ignoring disposal errors.
 */
async function installCodexAccountFeature(): Promise<() => Promise<void>> {
  const [featureModule, connectionModule] = await Promise.all([
    import('../../../../src/features/codex-account/main/composition/createCodexAccountFeature'),
    import('../../../../src/main/services/runtime/ProviderConnectionService'),
  ]);
  const silent = (): undefined => undefined;
  const codexFeature = featureModule.createCodexAccountFeature({
    logger: { info: silent, warn: silent, error: silent },
    configManager: {
      getConfig: () => ({
        providerConnections: {
          codex: {
            preferredAuthMode: 'chatgpt' as const,
          },
        },
      }),
    },
  });
  const connections = connectionModule.ProviderConnectionService.getInstance();
  connections.setCodexAccountFeature(codexFeature);

  const uninstall = async (): Promise<void> => {
    connections.setCodexAccountFeature(null);
    await codexFeature.dispose().catch(() => undefined);
  };
  return uninstall;
}
/**
 * Collects a redacted, pretty-printed JSON report for failure messages.
 *
 * The report contains a trimmed view of every progress event, the current
 * spawn statuses, the runtime snapshot, and the latest launch-failure artifact
 * manifest. A failed status or snapshot lookup is replaced with `{ error }`
 * instead of aborting the report.
 *
 * @param svc - Service that owns the team.
 * @param teamName - Team to describe.
 * @param progressEvents - Progress events collected during the launch.
 * @returns The JSON report after passing through `redactSecrets`.
 */
async function formatStressDiagnostics(
  svc: TeamProvisioningService,
  teamName: string,
  progressEvents: TeamProvisioningProgress[]
): Promise<string> {
  const asErrorRecord = (error: unknown): { error: string } => ({ error: String(error) });
  const [spawnStatuses, runtimeSnapshot, artifact] = await Promise.all([
    svc.getMemberSpawnStatuses(teamName).catch(asErrorRecord),
    svc.getTeamAgentRuntimeSnapshot(teamName).catch(asErrorRecord),
    readLatestArtifactManifest(teamName),
  ]);
  const progress = progressEvents.map(
    ({ state, message, messageSeverity, error, launchDiagnostics }) => ({
      state,
      message,
      messageSeverity,
      error,
      launchDiagnostics,
    })
  );
  const report = { progress, spawnStatuses, runtimeSnapshot, artifact };
  return redactSecrets(JSON.stringify(report, null, 2));
}
/**
 * Loads the newest launch-failure artifact manifest for a team.
 *
 * Reads `<teams base>/<teamName>/launch-failure-artifacts/latest.json` and follows its
 * `manifestPath` when that is a string; otherwise the parsed `latest.json` content itself
 * is returned.
 *
 * @returns The parsed manifest (or pointer file), or `null` when anything fails to read or parse.
 */
async function readLatestArtifactManifest(teamName: string): Promise<unknown> {
  try {
    const pointerPath = path.join(
      getTeamsBasePath(),
      teamName,
      'launch-failure-artifacts',
      'latest.json'
    );
    const pointerText = await fs.readFile(pointerPath, 'utf8');
    const pointer = JSON.parse(pointerText) as { manifestPath?: unknown };
    if (typeof pointer.manifestPath === 'string') {
      const manifestText = await fs.readFile(pointer.manifestPath, 'utf8');
      return JSON.parse(manifestText);
    }
    return pointer;
  } catch {
    return null;
  }
}
/**
 * Reports whether some form of Anthropic auth is configured: either subscription/OAuth
 * mode is selected, or `ANTHROPIC_API_KEY` holds a non-blank value.
 */
function hasAnthropicAuthConfigured(): boolean {
  if (usingAnthropicSubscriptionAuth()) {
    return true;
  }
  const apiKey = process.env.ANTHROPIC_API_KEY?.trim();
  return Boolean(apiKey);
}
/**
 * Reports whether `PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH` selects subscription auth.
 * The value is trimmed and compared case-insensitively against `subscription` and `oauth`.
 */
function usingAnthropicSubscriptionAuth(): boolean {
  const normalized = process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH?.trim().toLowerCase();
  switch (normalized) {
    case 'subscription':
    case 'oauth':
      return true;
    default:
      return false;
  }
}
/**
 * Resolves when `filePath` passes an `X_OK` access check; rejects with the underlying
 * `fs.access` error otherwise.
 */
async function assertExecutable(filePath: string): Promise<void> {
  const executeMode = fsConstants.X_OK;
  await fs.access(filePath, executeMode);
}
/**
 * Verifies that a Codex ChatGPT-subscription credential exists under `codexHome`.
 *
 * Two layouts are probed, in order:
 *  1. the legacy `<codexHome>/auth.json` file;
 *  2. `<codexHome>/accounts/`: the active account named in `registry.json` (looked up
 *     under several key spellings) and every `*.auth.json` file in the directory.
 *
 * Any file that is missing, unreadable, malformed, or not a JSON object is treated as
 * "no credential here" so the remaining candidates are still checked.
 *
 * @param codexHome Codex home directory to inspect.
 * @throws Error when no candidate file contains a refresh token.
 */
async function assertCodexSubscriptionAuthAvailable(codexHome: string): Promise<void> {
  // Read the legacy file directly instead of access()-then-read: the pre-check is racy,
  // and a malformed legacy file must not prevent the accounts/ layout from being probed.
  const legacyAuth = await readJsonObject(path.join(codexHome, 'auth.json')).catch(() => null);
  if (isCodexChatGptSubscriptionAuth(legacyAuth)) return;

  const accountsDir = path.join(codexHome, 'accounts');
  const registry = await readJsonObject(path.join(accountsDir, 'registry.json')).catch(() => null);
  const activeAccountId =
    readStringProperty(registry, 'active_account_id') ??
    readStringProperty(registry, 'activeAccountId') ??
    readStringProperty(registry, 'current_account_id') ??
    readStringProperty(registry, 'currentAccountId');

  // A Set removes duplicates when the active account also shows up in the directory listing.
  const candidates = new Set<string>();
  if (activeAccountId) {
    candidates.add(path.join(accountsDir, `${activeAccountId}.auth.json`));
    candidates.add(path.join(accountsDir, activeAccountId));
  }
  const entries = await fs.readdir(accountsDir).catch(() => []);
  for (const entry of entries) {
    if (entry.endsWith('.auth.json')) candidates.add(path.join(accountsDir, entry));
  }

  for (const candidate of candidates) {
    const auth = await readJsonObject(candidate).catch(() => null);
    if (isCodexChatGptSubscriptionAuth(auth)) return;
  }
  throw new Error(`Codex subscription auth not found in ${codexHome}`);
}
/**
 * Reports whether `filePath` passes an `R_OK` access check. Never rejects: any access
 * failure maps to `false`.
 */
async function pathReadable(filePath: string): Promise<boolean> {
  return fs.access(filePath, fsConstants.R_OK).then(
    () => true,
    () => false
  );
}
/**
 * Reads a UTF-8 file and parses it as JSON, requiring the top-level value to be an object.
 *
 * @throws The underlying read or `JSON.parse` error, or an `Error` when the parsed value
 *         is `null`, a primitive, or an array.
 */
async function readJsonObject(filePath: string): Promise<Record<string, unknown>> {
  const rawText = await fs.readFile(filePath, 'utf8');
  const value: unknown = JSON.parse(rawText);
  const isObjectValue = typeof value === 'object' && value !== null && !Array.isArray(value);
  if (isObjectValue) {
    return value as Record<string, unknown>;
  }
  throw new Error(`Expected JSON object in ${filePath}`);
}
/**
 * Returns the trimmed string stored under `key`, or `null` when `source` is absent, the
 * value is not a string, or the string is blank.
 */
function readStringProperty(source: Record<string, unknown> | null, key: string): string | null {
  if (source == null) {
    return null;
  }
  const candidate = source[key];
  if (typeof candidate !== 'string') {
    return null;
  }
  const trimmed = candidate.trim();
  return trimmed.length > 0 ? trimmed : null;
}
/**
 * Reports whether an auth record carries a non-blank `refresh_token`, either at the top
 * level or nested under a plain-object `tokens` property.
 */
function isCodexChatGptSubscriptionAuth(source: Record<string, unknown> | null): boolean {
  if (!source) {
    return false;
  }
  if (readStringProperty(source, 'refresh_token') !== null) {
    return true;
  }
  const { tokens } = source;
  const tokensIsRecord = typeof tokens === 'object' && tokens !== null && !Array.isArray(tokens);
  if (!tokensIsRecord) {
    return false;
  }
  return readStringProperty(tokens as Record<string, unknown>, 'refresh_token') !== null;
}
/**
 * Picks the Codex home directory to use, in priority order:
 *  1. `PROVIDER_LAUNCH_STRESS_CODEX_HOME` (trimmed, resolved to an absolute path);
 *  2. `previousCodexHome` (trimmed, resolved to an absolute path);
 *  3. `.codex` inside the home directory reported by `os.userInfo()`.
 * Blank values are skipped.
 */
function resolveConnectedCodexHome(previousCodexHome: string | undefined): string {
  const overrides = [process.env.PROVIDER_LAUNCH_STRESS_CODEX_HOME, previousCodexHome];
  for (const override of overrides) {
    const trimmed = override?.trim();
    if (trimmed) {
      return path.resolve(trimmed);
    }
  }
  return path.join(os.userInfo().homedir, '.codex');
}
/**
 * Writes a fresh `<configDir>/.claude.json` that marks `projectPath` as trusted.
 *
 * The project key is the real path of `projectPath`, normalized and with backslashes
 * converted to forward slashes. When `ANTHROPIC_API_KEY` is set to a non-blank value, the
 * last 20 characters of the trimmed key are recorded under
 * `customApiKeyResponses.approved`.
 *
 * Any existing file is overwritten.
 */
async function writeTrustedClaudeConfig(configDir: string, projectPath: string): Promise<void> {
  const resolvedProjectPath = await fs.realpath(projectPath);
  const projectKey = path.normalize(resolvedProjectPath).replace(/\\/g, '/');
  const apiKeySuffix = process.env.ANTHROPIC_API_KEY?.trim().slice(-20);

  const approvedKeySection = apiKeySuffix
    ? { customApiKeyResponses: { approved: [apiKeySuffix], rejected: [] as string[] } }
    : {};
  const config = {
    projects: {
      [projectKey]: { hasTrustDialogAccepted: true },
    },
    ...approvedKeySection,
  };

  const targetPath = path.join(configDir, '.claude.json');
  const serialized = `${JSON.stringify(config, null, 2)}\n`;
  await fs.writeFile(targetPath, serialized, 'utf8');
}
/**
 * Marks `projectPath` as trusted inside an existing `<configDir>/.claude.json`, keeping
 * every other top-level key, every other project, and any other fields already stored
 * for this project.
 *
 * @returns The previous raw file content, or `null` when the file did not exist.
 * @throws Read errors other than `ENOENT`, JSON parse errors, and `realpath`/write errors.
 */
async function upsertTrustedClaudeProjectConfig(
  configDir: string,
  projectPath: string
): Promise<string | null> {
  const isPlainRecord = (value: unknown): value is Record<string, unknown> =>
    Boolean(value) && typeof value === 'object' && !Array.isArray(value);

  const configPath = path.join(configDir, '.claude.json');
  let previous: string | null;
  try {
    previous = await fs.readFile(configPath, 'utf8');
  } catch (error) {
    // A missing file simply means there is nothing to merge with.
    if ((error as NodeJS.ErrnoException).code !== 'ENOENT') throw error;
    previous = null;
  }

  const existing = previous ? (JSON.parse(previous) as Record<string, unknown>) : {};
  const resolvedProjectPath = await fs.realpath(projectPath);
  const projectKey = path.normalize(resolvedProjectPath).replace(/\\/g, '/');

  const projects: Record<string, unknown> = isPlainRecord(existing.projects)
    ? { ...existing.projects }
    : {};
  const storedEntry = projects[projectKey];
  const currentEntry = isPlainRecord(storedEntry) ? storedEntry : {};
  projects[projectKey] = { ...currentEntry, hasTrustDialogAccepted: true };

  const merged = { ...existing, projects };
  await fs.writeFile(configPath, `${JSON.stringify(merged, null, 2)}\n`, 'utf8');
  return previous;
}
/**
 * Puts `<configDir>/.claude.json` back to a previously captured state.
 *
 * @param previous Raw content to write back, or `null` to remove the file (a missing
 *                 file is not an error).
 */
async function restoreClaudeJsonConfig(configDir: string, previous: string | null): Promise<void> {
  const configPath = path.join(configDir, '.claude.json');
  if (previous !== null) {
    await fs.writeFile(configPath, previous, 'utf8');
    return;
  }
  await fs.rm(configPath, { force: true });
}
/**
 * Restores an environment variable to a captured value; `undefined` means the variable
 * was not set and is therefore deleted.
 */
function restoreEnv(name: string, previous: string | undefined): void {
  if (previous !== undefined) {
    process.env[name] = previous;
    return;
  }
  delete process.env[name];
}
/**
 * Masks API-key-shaped substrings in diagnostic text.
 *
 * Anthropic `sk-ant-api03-…` keys are replaced first with their own marker; afterwards any
 * remaining `sk-`/`ak-` token followed by at least 20 key characters gets a generic marker.
 */
function redactSecrets(text: string): string {
  const anthropicKeyPattern = /sk-ant-api03-[A-Za-z0-9_-]+/g;
  const genericKeyPattern = /\b(?:sk|ak)-[A-Za-z0-9_-]{20,}\b/g;
  const withoutAnthropicKeys = text.replace(anthropicKeyPattern, '<redacted-anthropic-key>');
  return withoutAnthropicKeys.replace(genericKeyPattern, '<redacted-api-key>');
}

View file

@ -0,0 +1,197 @@
import * as fs from 'fs/promises';
import * as os from 'os';
import * as path from 'path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
classifyLaunchFailureArtifact,
extractLaunchBootstrapTransportBreadcrumb,
redactLaunchFailureArtifactText,
writeTeamLaunchFailureArtifactPack,
} from '../../../../src/main/services/team/TeamLaunchFailureArtifactPack';
import {
getTeamsBasePath,
setClaudeBasePathOverride,
} from '../../../../src/main/utils/pathDecoder';
// Tests for TeamLaunchFailureArtifactPack: writing the artifact pack, redacting
// secret-shaped text, classifying failures and extracting the bootstrap transport breadcrumb.
describe('TeamLaunchFailureArtifactPack', () => {
let tempRoot: string;
// Point the Claude base path at a fresh temp directory for every test.
beforeEach(async () => {
tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'team-launch-artifact-pack-'));
setClaudeBasePathOverride(path.join(tempRoot, '.claude'));
});
// Clear the override and remove the temp tree.
afterEach(async () => {
setClaudeBasePathOverride(null);
await fs.rm(tempRoot, { recursive: true, force: true });
});
it('writes a bounded redacted launch failure artifact pack with known launch files', async () => {
const teamName = 'artifact-team';
const runId = 'run-secret-1';
const teamDir = path.join(getTeamsBasePath(), teamName);
// Creating the lock directory recursively also creates the team directory itself.
await fs.mkdir(path.join(teamDir, '.bootstrap.lock'), { recursive: true });
// Seed launch-state.json with secret-shaped values; the copy is checked for redaction below.
await fs.writeFile(
path.join(teamDir, 'launch-state.json'),
JSON.stringify({
teamName,
runId,
secret: 'sk-ant-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
token: 'abcdefghijklmnopqrstuvwxyz123456',
}),
'utf8'
);
await fs.writeFile(path.join(teamDir, 'launch-summary.json'), '{"summary":true}\n', 'utf8');
await fs.writeFile(path.join(teamDir, 'bootstrap-state.json'), '{"bootstrap":true}\n', 'utf8');
await fs.writeFile(
path.join(teamDir, 'bootstrap-journal.jsonl'),
'{"event":"started"}\n',
'utf8'
);
await fs.writeFile(
path.join(teamDir, '.bootstrap.lock', 'metadata.json'),
'{"pid":123,"runId":"run-secret-1"}\n',
'utf8'
);
// The progress error and CLI logs also embed key-shaped strings.
const result = await writeTeamLaunchFailureArtifactPack({
teamName,
runId,
reason: 'launch_progress_failed',
startedAt: '2026-05-09T00:00:00.000Z',
cwd: '/repo',
pid: 123,
providerId: 'anthropic',
model: 'claude-opus',
expectedMembers: ['alice'],
effectiveMembers: [{ name: 'alice', role: 'developer', provider: 'anthropic' } as never],
progress: {
runId,
teamName,
state: 'failed',
message: 'Launch failed',
startedAt: '2026-05-09T00:00:00.000Z',
updatedAt: '2026-05-09T00:01:00.000Z',
error:
'Authentication failed: ANTHROPIC_API_KEY=sk-ant-bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
},
memberSpawnStatuses: {
alice: {
status: 'error',
launchState: 'failed_to_start',
hardFailureReason: 'bootstrap timeout',
updatedAt: '2026-05-09T00:01:00.000Z',
},
},
cliLogs: 'stderr OPENAI_API_KEY=sk-proj-cccccccccccccccccccccccccccccccccccccccc',
progressTraceLines: ['[failed] launch failed'],
runtimeAdapterTraceLines: ['runtime trace'],
});
// Manifest: reason, classification and artifact list, with the progress error redacted.
const manifest = JSON.parse(await fs.readFile(result.manifestPath, 'utf8')) as {
reason: string;
artifactFiles: string[];
classification: { code: string };
bootstrapTransportBreadcrumb: { lastTransportStage: string | null };
progress: { error: string };
};
expect(manifest.reason).toBe('launch_progress_failed');
expect(manifest.classification.code).toBe('provider_auth');
expect(manifest.artifactFiles).toContain('cli-logs-tail.txt');
expect(manifest.artifactFiles).toContain('launch-state.json');
expect(manifest.progress.error).toContain('[REDACTED]');
// The copied launch state must stay valid JSON after redaction.
const copiedLaunchState = await fs.readFile(path.join(result.directory, 'launch-state.json'), 'utf8');
expect(copiedLaunchState).toContain('[REDACTED_ANTHROPIC_API_KEY]');
expect(() => JSON.parse(copiedLaunchState)).not.toThrow();
expect(copiedLaunchState).toContain('"token":"[REDACTED]"');
expect(copiedLaunchState).not.toContain('sk-ant-');
const cliLogs = await fs.readFile(path.join(result.directory, 'cli-logs-tail.txt'), 'utf8');
expect(cliLogs).toContain('OPENAI_API_KEY=[REDACTED]');
expect(cliLogs).not.toContain('sk-proj-');
// latest.json in the team directory points at the manifest that was just written.
const latest = JSON.parse(
await fs.readFile(path.join(teamDir, 'launch-failure-artifacts', 'latest.json'), 'utf8')
) as { manifestPath: string };
expect(latest.manifestPath).toBe(result.manifestPath);
});
it('redacts common bearer and token-shaped secrets', () => {
const redacted = redactLaunchFailureArtifactText(
'Authorization: Bearer abcdefghijklmnopqrstuvwxyz123456 token: abcdefghijklmnopqrstuvwxyz123456'
);
expect(redacted).toContain('Authorization: Bearer [REDACTED]');
expect(redacted).toContain('token: [REDACTED]');
});
it('classifies bootstrap transport rejection and extracts breadcrumb details', () => {
// The same input is used for both the classification and the breadcrumb extraction.
const input = {
teamName: 'artifact-team',
runId: 'run-transport',
reason: 'launch_cleanup_unconfirmed_bootstrap',
progressTraceLines: [
'bob did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true',
'Warning: no stdin data received in 3s, proceeding without it.',
],
};
expect(classifyLaunchFailureArtifact(input).code).toBe('transport_rejected');
expect(extractLaunchBootstrapTransportBreadcrumb(input)).toMatchObject({
lastTransportStage: 'bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true',
submitRejected: true,
retryable: true,
noStdinWarning: true,
bootstrapSubmitted: false,
});
});
it('classifies provider quota separately from protocol errors', () => {
expect(
classifyLaunchFailureArtifact({
teamName: 'artifact-team',
runId: 'run-quota',
reason:
'OpenCode quota exhausted. This request requires more credits, or fewer max_tokens.',
}).code
).toBe('provider_quota');
});
// Table-driven cases: each failure text is passed as `reason` and must map to `code`.
it.each([
{
name: 'stdin warning',
text: 'Warning: no stdin data received in 3s, proceeding without it.',
code: 'stdin_missing',
},
{
name: 'provider auth',
text: 'Codex API error. Token refresh failed: 401 Unauthorized',
code: 'provider_auth',
},
{
name: 'model bootstrap timeout',
text: 'bob: Teammate was registered but did not bootstrap-confirm before timeout.',
code: 'model_no_bootstrap',
},
{
name: 'process stale pid',
text: 'persisted runtime pid is not alive; persisted runtime pid was not found in process table',
code: 'process_exited',
},
{
name: 'opencode protocol',
text: 'OpenCode API error. non_visible_tool_without_task_progress',
code: 'opencode_protocol',
},
])('classifies production-like failure string: $name', ({ text, code }) => {
expect(
classifyLaunchFailureArtifact({
teamName: 'artifact-team',
runId: `run-${code}`,
reason: text,
}).code
).toBe(code);
});
});

View file

@ -615,6 +615,17 @@ function createClaudeLogsRun(overrides: Record<string, unknown> = {}) {
} as any;
}
/**
 * Polls until `filePath` exists.
 *
 * The path is checked every 25 ms until `timeoutMs` has elapsed, then once more after
 * the deadline. The final check means a file that appears during the last sleep, or a
 * call made with a zero or negative budget, is not misreported as a timeout.
 *
 * @param filePath Path to wait for.
 * @param timeoutMs Maximum time to keep polling, in milliseconds (default 2000).
 * @throws Error when the file still does not exist after the deadline.
 */
async function waitForFile(filePath: string, timeoutMs = 2_000): Promise<void> {
  const pollIntervalMs = 25;
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (fs.existsSync(filePath)) {
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  // Last look: the file may have been created while the loop was sleeping.
  if (fs.existsSync(filePath)) {
    return;
  }
  throw new Error(`Timed out waiting for file: ${filePath}`);
}
describe('TeamProvisioningService', () => {
beforeEach(() => {
vi.clearAllMocks();
@ -1137,6 +1148,102 @@ describe('TeamProvisioningService', () => {
});
});
// Verifies that cleaning up a launch run whose progress is already `failed` produces a
// launch-failure artifact pack with the expected classification and transport breadcrumb.
it('writes a launch failure artifact pack when cleanup finalizes failed launch state', async () => {
allowConsoleLogs();
const svc = new TeamProvisioningService();
const teamName = 'launch-artifact-cleanup-team';
const runId = 'run-launch-artifact-cleanup';
const startedAt = '2026-05-09T00:25:00.000Z';
// A launch run that has not completed provisioning, with one member still pending bootstrap.
const run = createClaudeLogsRun({
runId,
teamName,
startedAt,
isLaunch: true,
provisioningComplete: false,
cancelRequested: false,
deterministicBootstrap: true,
expectedMembers: ['bob'],
allEffectiveMembers: [
{
name: 'bob',
role: 'Developer',
providerId: 'anthropic',
model: 'opus',
},
],
request: {
cwd: '/repo',
providerId: 'anthropic',
model: 'opus',
members: [
{
name: 'bob',
role: 'Developer',
providerId: 'anthropic',
model: 'opus',
},
],
},
memberSpawnStatuses: new Map([
[
'bob',
createMemberSpawnStatusEntry({
status: 'spawning',
launchState: 'runtime_pending_bootstrap',
runtimeAlive: true,
firstSpawnAcceptedAt: '2026-05-09T00:25:05.000Z',
updatedAt: '2026-05-09T00:25:05.000Z',
}),
],
]),
// The failure text carries the submit rejection, the retryable flag and the no-stdin warning.
progress: {
runId,
teamName,
state: 'failed',
message: 'Launch failed',
startedAt,
updatedAt: '2026-05-09T00:26:00.000Z',
error:
'Teammate process bob@signal-ops did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true Last stderr: Warning: no stdin data received in 3s, proceeding without it.',
},
claudeLogLines: [
'[stderr]',
'Warning: no stdin data received in 3s, proceeding without it.',
],
provisioningOutputParts: [],
});
// Register the run through the service's private maps, then trigger cleanup directly.
(svc as any).runs.set(run.runId, run);
(svc as any).aliveRunByTeam.set(run.teamName, run.runId);
// cleanupRun is not awaited here; the test polls for latest.json instead.
(svc as any).cleanupRun(run);
const latestPath = path.join(
tempTeamsBase,
teamName,
'launch-failure-artifacts',
'latest.json'
);
await waitForFile(latestPath);
// latest.json points at the manifest for the newest artifact pack.
const latest = JSON.parse(fs.readFileSync(latestPath, 'utf8')) as { manifestPath: string };
const manifest = JSON.parse(fs.readFileSync(latest.manifestPath, 'utf8')) as {
reason: string;
classification: { code: string };
bootstrapTransportBreadcrumb: {
submitRejected: boolean;
noStdinWarning: boolean;
retryable: boolean | null;
};
};
expect(manifest.reason).toBe('launch_progress_failed');
expect(manifest.classification.code).toBe('transport_rejected');
expect(manifest.bootstrapTransportBreadcrumb).toMatchObject({
submitRejected: true,
noStdinWarning: true,
retryable: true,
});
});
it('falls back to the persisted lead transcript when no live run exists', async () => {
const svc = new TeamProvisioningService();
const teamName = 'offline-logs-team';
@ -11591,6 +11698,59 @@ describe('TeamProvisioningService', () => {
expect(sendMessageToRun).not.toHaveBeenCalled();
});
// Verifies that restartMember uses the direct process restart path, and does not message
// the lead, when the persisted runtime metadata marks the member as a process backend.
it('restarts a process backend teammate directly without asking the lead to respawn it', async () => {
const svc = new TeamProvisioningService();
const run = createMemberSpawnRun({
teamName: 'process-team',
expectedMembers: ['forge'],
memberSpawnStatuses: new Map(),
});
// Give the run a child process that is neither killed nor cancelled.
run.child = { pid: 111 };
run.processKilled = false;
run.cancelRequested = false;
// Stub both restart paths so the test can observe which one is taken.
const sendMessageToRun = vi.fn(async () => {});
const directProcessRestart = vi.fn(async () => {});
(svc as any).sendMessageToRun = sendMessageToRun;
(svc as any).launchDirectProcessMemberRestart = directProcessRestart;
(svc as any).configReader = {
getConfig: vi.fn(async () => ({
name: 'Process Team',
members: [{ name: 'team-lead', agentType: 'team-lead' }],
})),
};
(svc as any).membersMetaStore = {
getMembers: vi.fn(async () => [
{
name: 'forge',
role: 'Developer',
providerId: 'codex',
model: 'gpt-5.4',
effort: 'medium',
agentType: 'general-purpose',
},
]),
};
// Persisted runtime metadata reports forge with backendType 'process'.
(svc as any).readPersistedRuntimeMembers = vi.fn(() => [
{
name: 'forge',
agentId: 'forge@process-team',
backendType: 'process',
tmuxPaneId: 'process:1234',
runtimePid: 1234,
},
]);
// No live runtime metadata is available for this team.
(svc as any).getLiveTeamAgentRuntimeMetadata = vi.fn(async () => new Map());
(svc as any).aliveRunByTeam.set('process-team', run.runId);
(svc as any).runs.set(run.runId, run);
await svc.restartMember('process-team', 'forge');
expect(directProcessRestart).toHaveBeenCalledTimes(1);
expect(sendMessageToRun).not.toHaveBeenCalled();
// The member is still tracked as a pending restart after restartMember resolves.
expect(run.pendingMemberRestarts.has('forge')).toBe(true);
});
it('rejects a second restart request while the first restart is still in flight', async () => {
const svc = new TeamProvisioningService();
const run = createMemberSpawnRun({