fix(team): harden process launch diagnostics
This commit is contained in:
parent
81700a1a14
commit
6dab896aee
9 changed files with 2335 additions and 3 deletions
|
|
@ -17,6 +17,8 @@ For new features:
|
|||
|
||||
- Treat regressions in agent team messaging, task lifecycle, session parsing, code review UI, and provider/runtime detection as high priority.
|
||||
- For team launch hangs, OpenCode `registered`/`bootstrap unconfirmed`, missing teammate replies, or suspicious task logs, follow [docs/team-management/debugging-agent-teams.md](docs/team-management/debugging-agent-teams.md) before changing code.
|
||||
- For launch failures, first read the pointer file `~/.claude/teams/<team>/launch-failure-artifacts/latest.json` to locate the newest artifact pack, then open the `manifest.json` it references. The manifest includes `classification`, `bootstrapTransportBreadcrumb`, launch diagnostics, and member spawn statuses; next to it, the pack directory holds redacted copies/tails of launch-state, bootstrap-state, bootstrap-journal, CLI logs, progress trace, and runtime adapter trace.
|
||||
- When running live smoke tests, keep cleanup narrow: stop only the smoke-owned team/run and launch-owned process teammates. Do not kill shared OpenCode hosts, unrelated tmux panes, or user teams while trying to clean stale smoke artifacts.
|
||||
- Verify new medium and large features follow `docs/FEATURE_ARCHITECTURE_STANDARD.md`, especially cross-process boundaries and public feature entrypoints.
|
||||
- Check that Electron main, preload, renderer, and shared code keep their responsibilities separate and use the documented path aliases.
|
||||
- Flag changes that manually concatenate agent block markers instead of using `wrapAgentBlock(text)`.
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@
|
|||
"opencode:prove-semantic-model-matrix": "node ./scripts/prove-opencode-semantic-model-matrix.mjs",
|
||||
"opencode:prove-team-provisioning": "node ./scripts/prove-opencode-team-provisioning.mjs",
|
||||
"team:prove-agent-cli-launch": "node ./scripts/prove-agent-cli-launch.mjs",
|
||||
"team:prove-provider-launch-stress": "node ./scripts/prove-provider-launch-stress.mjs",
|
||||
"team:prove-launch-matrix": "pnpm exec vitest run --maxWorkers 1 --minWorkers 1 test/main/services/team/TeamAgentLaunchMatrix.safe-e2e.test.ts",
|
||||
"prebuild": "tsx scripts/fetch-pricing-data.ts && pnpm --filter agent-teams-controller build && pnpm --filter agent-teams-mcp build",
|
||||
"build": "node --max-old-space-size=8192 ./node_modules/electron-vite/bin/electron-vite.js build",
|
||||
|
|
|
|||
79
scripts/prove-provider-launch-stress.mjs
Normal file
79
scripts/prove-provider-launch-stress.mjs
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
import { spawnSync } from 'node:child_process';
|
||||
import path from 'node:path';
|
||||
import process from 'node:process';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
import {
|
||||
exitForSkippedPreflight,
|
||||
preflightOpenCodeLiveEnvironment,
|
||||
} from './lib/opencode-live-preflight.mjs';
|
||||
|
||||
// Locate the repository root relative to this script (scripts/ -> repo root).
const here = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(here, '..');

/** Returns the trimmed value of an environment variable, or '' when it is unset or blank. */
function envValue(name) {
  return process.env[name]?.trim() || '';
}

const scenarioOrder = envValue('PROVIDER_LAUNCH_STRESS_ORDER') || 'anthropic,codex,opencode,mixed';
// Without an explicit auth mode, an Anthropic API key in the environment selects 'api-key'.
const defaultAnthropicAuth = envValue('ANTHROPIC_API_KEY') ? 'api-key' : 'subscription';

// Environment handed to the vitest child process: the caller's environment plus stress defaults.
const env = {
  ...process.env,
  PROVIDER_LAUNCH_STRESS_LIVE: '1',
  PROVIDER_LAUNCH_STRESS_ORDER: scenarioOrder,
  PROVIDER_LAUNCH_STRESS_MEMBER_COUNT: envValue('PROVIDER_LAUNCH_STRESS_MEMBER_COUNT') || '5',
  PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH:
    envValue('PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH') || defaultAnthropicAuth,
  OPENCODE_E2E: '1',
  OPENCODE_E2E_USE_REAL_APP_CREDENTIALS: '1',
  OPENCODE_DISABLE_AUTOUPDATE: process.env.OPENCODE_DISABLE_AUTOUPDATE ?? '1',
};

// Default the orchestrator CLI path to <runtime root>/cli, where the runtime root is either
// CLAUDE_DEV_RUNTIME_ROOT or the sibling agent_teams_orchestrator checkout.
if (!env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim()) {
  const devRuntimeRoot = envValue('CLAUDE_DEV_RUNTIME_ROOT');
  const runtimeRoot = devRuntimeRoot
    ? path.resolve(devRuntimeRoot)
    : path.resolve(repoRoot, '..', 'agent_teams_orchestrator');
  env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH = path.join(runtimeRoot, 'cli');
}

const anthropicModel = env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL || 'haiku';
const codexModel = env.PROVIDER_LAUNCH_STRESS_CODEX_MODEL || 'gpt-5.4-mini';
const openCodeModel = env.PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL || 'openai/gpt-5.4-mini';

const bannerLines = [
  'Running provider launch stress live smoke',
  `Order: ${env.PROVIDER_LAUNCH_STRESS_ORDER}`,
  `Members per scenario: ${env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT}`,
  `Anthropic auth: ${env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH}`,
  `Models: anthropic=${anthropicModel}, codex=${codexModel}, opencode=${openCodeModel}`,
  `Orchestrator CLI: ${env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH}`,
];
for (const line of bannerLines) {
  console.log(line);
}

// The OpenCode preflight runs only when the requested order includes an OpenCode-backed scenario.
const needsOpenCode = scenarioOrder
  .split(',')
  .map((item) => item.trim())
  .some((item) => item === 'opencode' || item === 'mixed');
if (needsOpenCode) {
  const preflight = await preflightOpenCodeLiveEnvironment({ repoRoot });
  exitForSkippedPreflight(preflight);
}

const vitestArgs = [
  'exec',
  'vitest',
  'run',
  '--maxWorkers',
  '1',
  '--minWorkers',
  '1',
  'test/main/services/team/ProviderLaunchStress.live-e2e.test.ts',
];
const { error: spawnError, status: exitStatus } = spawnSync('pnpm', vitestArgs, {
  cwd: repoRoot,
  env,
  stdio: 'inherit',
  shell: process.platform === 'win32',
});

if (spawnError) {
  console.error(`Failed to run provider launch stress smoke: ${spawnError.message}`);
  process.exit(1);
}

// A null status (no exit code reported) is treated as failure.
process.exit(exitStatus ?? 1);
|
||||
485
src/main/services/team/TeamLaunchFailureArtifactPack.ts
Normal file
485
src/main/services/team/TeamLaunchFailureArtifactPack.ts
Normal file
|
|
@ -0,0 +1,485 @@
|
|||
import { getTeamsBasePath } from '@main/utils/pathDecoder';
|
||||
import { createLogger } from '@shared/utils/logger';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
import { atomicWriteAsync } from './atomicWrite';
|
||||
import { getTeamBootstrapStatePath } from './TeamBootstrapStateReader';
|
||||
import { getTeamLaunchStatePath, getTeamLaunchSummaryPath } from './TeamLaunchStateStore';
|
||||
|
||||
import type {
|
||||
MemberSpawnStatusEntry,
|
||||
PersistedTeamLaunchSnapshot,
|
||||
TeamLaunchDiagnosticItem,
|
||||
TeamMember,
|
||||
TeamProviderBackendId,
|
||||
TeamProviderId,
|
||||
TeamProvisioningProgress,
|
||||
} from '@shared/types';
|
||||
|
||||
/** Logger for this module. */
const logger = createLogger('Service:TeamLaunchFailureArtifactPack');

/** Name of the per-team directory that holds every launch failure artifact pack. */
const ARTIFACTS_DIR_NAME = 'launch-failure-artifacts';

/** Pointer file, written inside the artifacts directory, that names the most recent pack. */
const LATEST_ARTIFACT_FILE = 'latest.json';

/** Maximum number of trailing characters of the CLI logs written into a pack. */
const MAX_CLI_LOG_CHARS = 256_000;

/** Maximum number of trailing characters kept for each trace file (progress / runtime adapter). */
const MAX_TRACE_CHARS = 128_000;

/** Maximum number of trailing bytes read from each on-disk state file copied into a pack. */
const MAX_COPIED_FILE_BYTES = 256 * 1024;

/** Loosely-typed JSON object. */
type JsonRecord = Record<string, unknown>;
|
||||
|
||||
/**
 * Everything a caller can supply when capturing a launch failure artifact pack.
 *
 * Only `teamName`, `runId` and `reason` are required; every other field is recorded in the
 * manifest (after redaction) when present.
 */
export interface TeamLaunchFailureArtifactPackInput {
  /** Team whose directory receives the pack. */
  teamName: string;
  /** Run identifier; a sanitized form becomes part of the pack directory name. */
  runId: string;
  /** Failure reason; stored in the manifest and scanned by the classifier. */
  reason: string;
  startedAt?: string;
  cwd?: string;
  pid?: number | null;
  providerId?: TeamProviderId;
  providerBackendId?: TeamProviderBackendId;
  model?: string;
  expectedMembers?: readonly string[];
  /** Members of the run; only a fixed subset of each member's fields is copied to the manifest. */
  effectiveMembers?: readonly TeamMember[];
  progress?: TeamProvisioningProgress | null;
  launchSnapshot?: PersistedTeamLaunchSnapshot | null;
  /** When omitted, `progress.launchDiagnostics` is used instead. */
  launchDiagnostics?: readonly TeamLaunchDiagnosticItem[];
  /** Spawn status entries keyed by member name. */
  memberSpawnStatuses?: Record<string, MemberSpawnStatusEntry>;
  /** Raw CLI log text; its tail is written to `cli-logs-tail.txt`. */
  cliLogs?: string | null;
  /** Lines joined with newlines and written (tail only) to `progress-trace.txt`. */
  progressTraceLines?: readonly string[];
  /** Lines joined with newlines and written (tail only) to `runtime-adapter-trace.txt`. */
  runtimeAdapterTraceLines?: readonly string[];
  /** Free-form flags copied into the manifest. */
  flags?: JsonRecord;
}
|
||||
|
||||
/** Locations of a freshly written artifact pack. */
export interface TeamLaunchFailureArtifactPackResult {
  /** Absolute path of the pack directory. */
  directory: string;
  /** Path of the pack's `manifest.json`. */
  manifestPath: string;
  /** Names of the files written into the pack, `manifest.json` first. */
  files: string[];
}
|
||||
|
||||
/**
 * Coarse failure categories assigned by the classifier.
 * `unknown` is the fallback when no known pattern matches.
 */
export type LaunchFailureArtifactClassificationCode =
  | 'transport_rejected'
  | 'stdin_missing'
  | 'provider_quota'
  | 'provider_auth'
  | 'model_no_bootstrap'
  | 'process_exited'
  | 'opencode_protocol'
  | 'unknown';
|
||||
|
||||
/** Result of classifying a launch failure from its collected diagnostic text. */
export interface LaunchFailureArtifactClassification {
  /** Matched failure category. */
  code: LaunchFailureArtifactClassificationCode;
  /** Fixed per-category score assigned by the classifier. */
  confidence: number;
  /** Redacted text snippets that matched the category's pattern. */
  evidence: string[];
}
|
||||
|
||||
/** Summary of bootstrap-transport signals found in the collected diagnostic text. */
export interface LaunchBootstrapTransportBreadcrumb {
  /** Value of the last `last transport stage: …` marker, or null when none was found. */
  lastTransportStage: string | null;
  /** True when a bootstrap submit rejection marker is present. */
  submitRejected: boolean;
  /** Parsed `retryable` flag of the last rejection line, or null when absent. */
  retryable: boolean | null;
  /** True when a "no stdin data received" style warning is present. */
  noStdinWarning: boolean;
  /** True when a bootstrap-submitted / submit-accepted marker is present. */
  bootstrapSubmitted: boolean;
  /** Redacted snippets that mention transport-related markers. */
  evidence: string[];
}
|
||||
|
||||
/** One on-disk state file considered for copying into a pack. */
interface CopiedArtifactFile {
  /** Path the file is read from. */
  sourcePath: string;
  /** File name used for the copy inside the pack directory. */
  artifactName: string;
  /** Set when the file could not be copied (e.g. `missing`, `unreadable`, `not_regular_file`). */
  issue?: string;
}
|
||||
|
||||
/**
 * Turns an arbitrary string into a safe file-name fragment.
 *
 * Runs of characters other than letters, digits, `.`, `_` and `-` collapse into a single `-`,
 * and leading/trailing dashes are dropped.
 *
 * @param value Raw text (surrounding whitespace is ignored).
 * @returns The sanitized fragment, or `'unknown'` when nothing usable remains.
 */
function sanitizeArtifactNamePart(value: string): string {
  const dashed = value.trim().replace(/[^a-zA-Z0-9._-]+/g, '-');
  const stripped = dashed.replace(/^-+|-+$/g, '');
  if (stripped.length === 0) {
    return 'unknown';
  }
  return stripped;
}
|
||||
|
||||
/**
 * Formats a date as an ISO-8601 string with `:` and `.` replaced by `-`.
 *
 * @param now Instant to format.
 * @returns For example `1970-01-01T00-00-00-000Z`.
 */
function artifactTimestamp(now: Date): string {
  const iso = now.toISOString();
  return iso.replace(/:/g, '-').replace(/\./g, '-');
}
|
||||
|
||||
/**
 * Throws when `target` does not resolve to `root` itself or to a location beneath it.
 *
 * The check looks at whole path segments: only a relative path that IS `..` or that starts
 * with a `..` segment counts as an escape, so in-root names that merely begin with two dots
 * (for example `..cache`) are accepted.
 *
 * @param root Directory that must contain the target.
 * @param target Path to validate; resolved before comparison.
 * @throws Error when the resolved target lies outside the resolved root.
 */
function assertPathWithin(root: string, target: string): void {
  const relative = path.relative(path.resolve(root), path.resolve(target));
  const climbsOut = relative === '..' || relative.startsWith(`..${path.sep}`);
  // path.relative returns an absolute path when no relative route exists (e.g. another drive).
  if (climbsOut || path.isAbsolute(relative)) {
    throw new Error(`Launch artifact path escaped teams root: ${target}`);
  }
}
|
||||
|
||||
/**
 * Keeps only the last `maxChars` characters of `text`.
 *
 * @param text Text to shorten.
 * @param maxChars Number of trailing characters to keep.
 * @returns `text` unchanged when it already fits; otherwise a `[truncated to last N chars]`
 *   header line followed by the kept tail.
 */
function truncateTail(text: string, maxChars: number): string {
  const overflow = text.length - maxChars;
  if (overflow <= 0) {
    return text;
  }
  const tail = text.slice(overflow);
  return `[truncated to last ${maxChars} chars]\n${tail}`;
}
|
||||
|
||||
/**
 * Masks credentials that may appear in launch diagnostics before they are written to disk.
 *
 * Rules are applied in order:
 * 1. `sk-ant-…`, `sk-proj-…` and generic `sk-…` API keys (20+ key characters).
 * 2. `NAME=value` assignments for the known provider API-key variables. The value may be
 *    wrapped in a single/double/back quote; the opening quote is preserved so that
 *    `OPENAI_API_KEY="secret"` becomes `OPENAI_API_KEY="[REDACTED]"`.
 * 3. `Authorization: Bearer <token>` headers.
 * 4. `api_key` / `token` / `access_token` / `refresh_token` followed by `:` or `=` and a
 *    value of 20+ token characters.
 *
 * @param text Raw diagnostic text.
 * @returns The text with every matched secret replaced by a `[REDACTED…]` placeholder.
 */
export function redactLaunchFailureArtifactText(text: string): string {
  return text
    .replace(/sk-ant-[A-Za-z0-9_-]{20,}/g, '[REDACTED_ANTHROPIC_API_KEY]')
    .replace(/sk-proj-[A-Za-z0-9_-]{20,}/g, '[REDACTED_OPENAI_API_KEY]')
    .replace(/sk-[A-Za-z0-9_-]{20,}/g, '[REDACTED_API_KEY]')
    .replace(
      // The optional quote group lets quoted shell/env-file values be redacted as well.
      /\b(ANTHROPIC_API_KEY|OPENAI_API_KEY|CODEX_API_KEY|OPENROUTER_API_KEY|GEMINI_API_KEY)=(["'`]?)[^\s"'`]+/gi,
      '$1=$2[REDACTED]'
    )
    .replace(/\b(authorization:\s*bearer\s+)([A-Za-z0-9._~+/=-]{20,})/gi, '$1[REDACTED]')
    .replace(
      /\b(api[_-]?key|token|access[_-]?token|refresh[_-]?token)(["']?\s*[:=]\s*["']?)([A-Za-z0-9._~+/=-]{20,})/gi,
      '$1$2[REDACTED]'
    );
}
|
||||
|
||||
/**
 * Typed convenience wrapper around `redactJsonValue`.
 *
 * @param value JSON-like value to redact.
 * @returns A redacted value, typed as the input type for the caller's convenience.
 */
function redactJsonLike<T>(value: T): T {
  const redacted: unknown = redactJsonValue(value);
  return redacted as T;
}
|
||||
|
||||
/**
 * Tells whether an object key names a credential whose value must always be masked.
 *
 * The whole key must match (case-insensitively) one of: `apiKey`/`api_key`/`api-key`,
 * `token`, `accessToken`/`access_token`/`access-token`, the `refresh` equivalents, or
 * `authorization`.
 */
function isSecretJsonKey(key: string): boolean {
  const secretKeyPattern =
    /^(api[_-]?key|token|access[_-]?token|refresh[_-]?token|authorization)$/i;
  return secretKeyPattern.test(key);
}
|
||||
|
||||
/**
 * Recursively produces a redacted copy of a JSON-like value.
 *
 * - A value stored under a secret key (see `isSecretJsonKey`) is replaced wholesale.
 * - Strings go through `redactLaunchFailureArtifactText`.
 * - Arrays and plain objects are rebuilt element by element; array items carry no key.
 * - Everything else (numbers, booleans, null, undefined, …) is returned as-is.
 *
 * @param value Value to redact.
 * @param key Object key the value was found under; empty for roots and array items.
 */
function redactJsonValue(value: unknown, key = ''): unknown {
  if (isSecretJsonKey(key)) return '[REDACTED]';
  if (typeof value === 'string') return redactLaunchFailureArtifactText(value);
  if (Array.isArray(value)) return value.map((item) => redactJsonValue(item));
  if (value === null || typeof value !== 'object') return value;

  const redactedEntries: [string, unknown][] = [];
  for (const [entryKey, entryValue] of Object.entries(value as JsonRecord)) {
    redactedEntries.push([entryKey, redactJsonValue(entryValue, entryKey)]);
  }
  return Object.fromEntries(redactedEntries);
}
|
||||
|
||||
/**
 * Pushes the trimmed form of `value` onto `parts` when it is a non-blank string.
 * Non-string and whitespace-only values are ignored.
 */
function appendIfString(parts: string[], value: unknown): void {
  if (typeof value !== 'string') {
    return;
  }
  const trimmed = value.trim();
  if (trimmed) {
    parts.push(trimmed);
  }
}
|
||||
|
||||
/**
 * Gathers every piece of free text from the input that may describe the failure.
 *
 * Order of collection: reason, CLI logs, progress trace lines, runtime adapter trace lines,
 * progress message/error/log tail/warnings, launch diagnostics (explicit ones, else those on
 * `progress`), member spawn statuses, and finally the launch snapshot with its members.
 * Blank and non-string values are skipped; kept values are trimmed.
 *
 * @returns The collected text fragments in the order above.
 */
function collectLaunchFailureSearchParts(input: TeamLaunchFailureArtifactPackInput): string[] {
  const collected: string[] = [];
  const add = (value: unknown): void => appendIfString(collected, value);
  const addAll = (values: readonly unknown[] | undefined): void => {
    for (const value of values ?? []) add(value);
  };

  add(input.reason);
  add(input.cliLogs);
  addAll(input.progressTraceLines);
  addAll(input.runtimeAdapterTraceLines);

  const progress = input.progress;
  add(progress?.message);
  add(progress?.error);
  add(progress?.cliLogsTail);
  addAll(progress?.warnings);

  const diagnostics = input.launchDiagnostics ?? progress?.launchDiagnostics ?? [];
  for (const { code, label, detail } of diagnostics) {
    add(code);
    add(label);
    add(detail);
  }

  for (const [memberName, entry] of Object.entries(input.memberSpawnStatuses ?? {})) {
    add(memberName);
    add(entry.status);
    add(entry.launchState);
    add(entry.error);
    add(entry.hardFailureReason);
    add(entry.runtimeDiagnostic);
  }

  const snapshot = input.launchSnapshot;
  if (snapshot) {
    add(snapshot.launchPhase);
    add(snapshot.teamLaunchState);
    for (const [memberName, member] of Object.entries(snapshot.members)) {
      add(memberName);
      add(member.launchState);
      add(member.hardFailureReason);
      add(member.runtimeDiagnostic);
      addAll(member.diagnostics);
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Returns up to three fragments that match `pattern`, each shortened to its last 600 characters.
 *
 * @param parts Text fragments, scanned in order.
 * @param pattern Pattern a fragment must match to count as evidence.
 */
function firstEvidence(parts: readonly string[], pattern: RegExp): string[] {
  const snippets: string[] = [];
  for (const fragment of parts) {
    if (snippets.length >= 3) break;
    if (!pattern.test(fragment)) continue;
    snippets.push(truncateTail(fragment, 600));
  }
  return snippets;
}
|
||||
|
||||
/**
 * Assigns a coarse failure category to a launch failure.
 *
 * All diagnostic text from the input is searched against an ordered rule list; the first
 * rule whose pattern matches wins, so earlier rules take precedence over later ones. When no
 * rule matches the result is `unknown` with low confidence and generic failure/error/timeout
 * snippets as evidence.
 *
 * @param input Launch failure details.
 * @returns The matched category, its fixed confidence, and redacted evidence snippets.
 */
export function classifyLaunchFailureArtifact(
  input: TeamLaunchFailureArtifactPackInput
): LaunchFailureArtifactClassification {
  const searchParts = collectLaunchFailureSearchParts(input);
  const haystack = searchParts.join('\n').toLowerCase();

  // [code, confidence, pattern], listed in precedence order.
  const rules: readonly (readonly [LaunchFailureArtifactClassificationCode, number, RegExp])[] = [
    [
      'transport_rejected',
      0.95,
      /bootstrap_submit_rejected|submit rejected by local prompt handler/i,
    ],
    ['stdin_missing', 0.9, /no stdin data received|proceeding without it/i],
    [
      'provider_quota',
      0.92,
      /quota exhausted|insufficient credits|key limit exceeded|total limit|rate limit/i,
    ],
    [
      'provider_auth',
      0.88,
      /401 unauthorized|not_logged_in|login required|auth(?:entication)? failed|api key.*(?:missing|invalid)|token refresh failed/i,
    ],
    [
      'opencode_protocol',
      0.84,
      /visible_reply_still_required|non_visible_tool_without_task_progress|empty_assistant_turn|runtime_bootstrap_checkin/i,
    ],
    [
      'model_no_bootstrap',
      0.82,
      /did not bootstrap-confirm|bootstrap unconfirmed|bootstrap-confirm before timeout|check-in not yet received|bootstrap_stalled/i,
    ],
    [
      'process_exited',
      0.78,
      /process exited|pid is not alive|pid was not found|stale_metadata|exited before/i,
    ],
  ];

  const matchedRule = rules.find(([, , pattern]) => pattern.test(haystack));
  if (matchedRule) {
    const [code, confidence, pattern] = matchedRule;
    return {
      code,
      confidence,
      evidence: firstEvidence(searchParts, pattern).map(redactLaunchFailureArtifactText),
    };
  }

  return {
    code: 'unknown',
    confidence: 0.2,
    evidence: firstEvidence(searchParts, /failed|error|timeout/i).map(
      redactLaunchFailureArtifactText
    ),
  };
}
|
||||
|
||||
/**
 * Summarizes bootstrap-transport markers found in the launch failure's diagnostic text.
 *
 * When a marker occurs several times, the LAST occurrence supplies `lastTransportStage` and
 * `retryable`. Evidence snippets are redacted before being returned.
 *
 * @param input Launch failure details.
 */
export function extractLaunchBootstrapTransportBreadcrumb(
  input: TeamLaunchFailureArtifactPackInput
): LaunchBootstrapTransportBreadcrumb {
  const searchParts = collectLaunchFailureSearchParts(input);
  const haystack = searchParts.join('\n');

  // First capture group of the final match of a global pattern, if any.
  const lastCapture = (pattern: RegExp): string | undefined =>
    [...haystack.matchAll(pattern)].at(-1)?.[1];

  const stageRaw = lastCapture(/last transport stage:\s*([^;\n]+)/gi);
  const retryableRaw = lastCapture(
    /bootstrap_submit_rejected[^\n]*(?:retryable[=:]\s*(true|false))/gi
  )?.toLowerCase();
  const evidence = firstEvidence(
    searchParts,
    /bootstrap_submit_|last transport stage|no stdin data received|local prompt handler/i
  ).map(redactLaunchFailureArtifactText);

  let retryable: boolean | null = null;
  if (retryableRaw === 'true') {
    retryable = true;
  } else if (retryableRaw === 'false') {
    retryable = false;
  }

  return {
    lastTransportStage: stageRaw?.trim() ?? null,
    submitRejected: /bootstrap_submit_rejected|submit rejected by local prompt handler/i.test(
      haystack
    ),
    retryable,
    noStdinWarning: /no stdin data received|proceeding without it/i.test(haystack),
    bootstrapSubmitted:
      /(?:event["']?\s*:\s*["']bootstrap_submitted["']|bootstrap_submit_accepted|bootstrap submitted)/i.test(
        haystack
      ),
    evidence,
  };
}
|
||||
|
||||
/**
 * Reads at most the last `MAX_COPIED_FILE_BYTES` bytes of a regular file as UTF-8 text.
 *
 * Only bytes actually read are decoded, so a short read or a file that shrank after `stat`
 * cannot leave NUL padding in the result. When the file is truncated, the cut may land in the
 * middle of a multi-byte UTF-8 character; the orphaned continuation bytes at the start of the
 * tail are skipped so the text does not begin with replacement characters.
 *
 * @param sourcePath File to read.
 * @returns `{ text }` on success (prefixed with a truncation header when the file was larger
 *   than the limit), or `{ issue }` with `not_regular_file`, `missing` (ENOENT) or
 *   `unreadable` (any other error). Never rejects.
 */
async function readBoundedTextFile(sourcePath: string): Promise<{ text?: string; issue?: string }> {
  try {
    const stat = await fs.promises.stat(sourcePath);
    if (!stat.isFile()) {
      return { issue: 'not_regular_file' };
    }
    const handle = await fs.promises.open(sourcePath, 'r');
    try {
      const start = Math.max(0, stat.size - MAX_COPIED_FILE_BYTES);
      const buffer = Buffer.alloc(stat.size - start);

      // A single read() may return fewer bytes than requested; loop until full or EOF.
      let filled = 0;
      while (filled < buffer.length) {
        const { bytesRead } = await handle.read(
          buffer,
          filled,
          buffer.length - filled,
          start + filled
        );
        if (bytesRead === 0) break;
        filled += bytesRead;
      }

      // Skip UTF-8 continuation bytes (10xxxxxx) orphaned by the byte-offset cut; a code
      // point has at most three of them.
      let textStart = 0;
      if (start > 0) {
        while (
          textStart < filled &&
          textStart < 3 &&
          (buffer.readUInt8(textStart) & 0xc0) === 0x80
        ) {
          textStart += 1;
        }
      }

      const prefix = start > 0 ? `[truncated to last ${MAX_COPIED_FILE_BYTES} bytes]\n` : '';
      return { text: `${prefix}${buffer.toString('utf8', textStart, filled)}` };
    } finally {
      // Closing is best-effort; a close failure must not mask the read result.
      await handle.close().catch(() => undefined);
    }
  } catch (error) {
    const code = (error as NodeJS.ErrnoException).code;
    return { issue: code === 'ENOENT' ? 'missing' : 'unreadable' };
  }
}
|
||||
|
||||
/**
 * Lists the on-disk state files that are copied into a pack for the given team.
 *
 * The bootstrap journal and the bootstrap lock metadata are looked up in the directory that
 * contains the team's bootstrap state file.
 *
 * @param teamName Team whose state files are wanted.
 * @returns Source path / pack file name pairs, in copy order.
 */
function getKnownLaunchArtifactSourceFiles(teamName: string): CopiedArtifactFile[] {
  const bootstrapStatePath = getTeamBootstrapStatePath(teamName);
  const teamDir = path.dirname(bootstrapStatePath);

  const sources: [sourcePath: string, artifactName: string][] = [
    [getTeamLaunchStatePath(teamName), 'launch-state.json'],
    [getTeamLaunchSummaryPath(teamName), 'launch-summary.json'],
    [bootstrapStatePath, 'bootstrap-state.json'],
    [path.join(teamDir, 'bootstrap-journal.jsonl'), 'bootstrap-journal.tail.jsonl'],
    [path.join(teamDir, '.bootstrap.lock', 'metadata.json'), 'bootstrap-lock-metadata.json'],
  ];

  return sources.map(([sourcePath, artifactName]) => ({ sourcePath, artifactName }));
}
|
||||
|
||||
/**
 * Redacts `rawText`, writes it atomically as `<directory>/<artifactName>` with exactly one
 * trailing newline, and records the file name in `files`.
 *
 * @param directory Pack directory.
 * @param artifactName File name inside the pack.
 * @param rawText Unredacted content.
 * @param files Running list of written file names; appended to only after the write succeeds.
 */
async function writeArtifactTextFile(
  directory: string,
  artifactName: string,
  rawText: string,
  files: string[]
): Promise<void> {
  const redacted = redactLaunchFailureArtifactText(rawText);
  const contents = `${redacted.trimEnd()}\n`;
  await atomicWriteAsync(path.join(directory, artifactName), contents);
  files.push(artifactName);
}
|
||||
|
||||
/**
 * Captures a launch failure as a self-contained, redacted artifact pack on disk.
 *
 * Steps, in order:
 * 1. Create `<teams root>/<team>/launch-failure-artifacts/<timestamp>-<run id>/` after checking
 *    that the directory stays inside the teams root.
 * 2. Write the tails of the CLI logs, progress trace and runtime adapter trace, when provided.
 * 3. Copy the bounded tails of the known launch/bootstrap state files; files that cannot be
 *    read are recorded with an `issue` instead.
 * 4. Write `manifest.json` with the classification, transport breadcrumb and run details.
 * 5. Update `latest.json` in the artifacts directory to point at this pack.
 *
 * @param input Launch failure details.
 * @returns The pack directory, manifest path and written file names (`manifest.json` first).
 * @throws Error when the pack directory would escape the teams root, or when a write fails.
 */
export async function writeTeamLaunchFailureArtifactPack(
  input: TeamLaunchFailureArtifactPackInput
): Promise<TeamLaunchFailureArtifactPackResult> {
  const teamsRoot = getTeamsBasePath();
  const artifactsRoot = path.join(teamsRoot, input.teamName, ARTIFACTS_DIR_NAME);
  const createdAt = new Date();
  const packDirName = `${artifactTimestamp(createdAt)}-${sanitizeArtifactNamePart(input.runId)}`;
  const directory = path.join(artifactsRoot, packDirName);
  assertPathWithin(teamsRoot, directory);
  await fs.promises.mkdir(directory, { recursive: true });

  const files: string[] = [];
  const copiedFiles: CopiedArtifactFile[] = [];

  // Writes the tail of an in-memory text source into the pack.
  const writeTail = (artifactName: string, rawText: string, maxChars: number): Promise<void> =>
    writeArtifactTextFile(directory, artifactName, truncateTail(rawText, maxChars), files);

  const cliLogs = input.cliLogs;
  if (cliLogs?.trim()) {
    await writeTail('cli-logs-tail.txt', cliLogs, MAX_CLI_LOG_CHARS);
  }
  const progressLines = input.progressTraceLines;
  if (progressLines?.length) {
    await writeTail('progress-trace.txt', progressLines.join('\n'), MAX_TRACE_CHARS);
  }
  const adapterLines = input.runtimeAdapterTraceLines;
  if (adapterLines?.length) {
    await writeTail('runtime-adapter-trace.txt', adapterLines.join('\n'), MAX_TRACE_CHARS);
  }

  for (const source of getKnownLaunchArtifactSourceFiles(input.teamName)) {
    const { text, issue } = await readBoundedTextFile(source.sourcePath);
    if (text === undefined) {
      copiedFiles.push({ ...source, issue: issue ?? 'unreadable' });
      continue;
    }
    await writeArtifactTextFile(directory, source.artifactName, text, files);
    copiedFiles.push(source);
  }

  const classification = classifyLaunchFailureArtifact(input);
  const bootstrapTransportBreadcrumb = extractLaunchBootstrapTransportBreadcrumb(input);
  // Only a fixed subset of member fields is recorded.
  const effectiveMembers = (input.effectiveMembers ?? []).map((member) => ({
    name: member.name,
    role: member.role,
    providerId: member.providerId,
    providerBackendId: member.providerBackendId,
    model: member.model,
    agentType: member.agentType,
    removedAt: member.removedAt,
  }));
  const createdAtIso = createdAt.toISOString();

  const manifest = redactJsonLike({
    version: 1,
    createdAt: createdAtIso,
    reason: input.reason,
    classification,
    bootstrapTransportBreadcrumb,
    teamName: input.teamName,
    runId: input.runId,
    startedAt: input.startedAt,
    cwd: input.cwd,
    pid: input.pid ?? null,
    providerId: input.providerId,
    providerBackendId: input.providerBackendId,
    model: input.model,
    expectedMembers: input.expectedMembers ?? [],
    effectiveMembers,
    progress: input.progress ?? null,
    launchDiagnostics: input.launchDiagnostics ?? input.progress?.launchDiagnostics ?? [],
    memberSpawnStatuses: input.memberSpawnStatuses ?? {},
    launchSnapshot: input.launchSnapshot ?? null,
    flags: input.flags ?? {},
    artifactFiles: files,
    copiedFiles,
  });

  const manifestPath = path.join(directory, 'manifest.json');
  await atomicWriteAsync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
  files.unshift('manifest.json');

  // Pointer file that lets readers find the newest pack without listing the directory.
  const latestPointer = redactJsonLike({
    version: 1,
    createdAt: createdAtIso,
    teamName: input.teamName,
    runId: input.runId,
    reason: input.reason,
    directory,
    manifestPath,
  });
  await fs.promises.mkdir(artifactsRoot, { recursive: true });
  await atomicWriteAsync(
    path.join(artifactsRoot, LATEST_ARTIFACT_FILE),
    `${JSON.stringify(latestPointer, null, 2)}\n`
  );

  logger.info(`[${input.teamName}] Wrote launch failure artifact pack`, {
    runId: input.runId,
    reason: input.reason,
    directory,
  });

  return { directory, manifestPath, files };
}
|
||||
|
|
@ -289,6 +289,7 @@ import {
|
|||
snapshotFromRuntimeMemberStatuses,
|
||||
snapshotToMemberSpawnStatuses,
|
||||
} from './TeamLaunchStateEvaluator';
|
||||
import { writeTeamLaunchFailureArtifactPack } from './TeamLaunchFailureArtifactPack';
|
||||
import { TeamLaunchStateStore } from './TeamLaunchStateStore';
|
||||
import { TeamMcpConfigBuilder } from './TeamMcpConfigBuilder';
|
||||
import { TeamMemberLogsFinder } from './TeamMemberLogsFinder';
|
||||
|
|
@ -407,6 +408,11 @@ type BootstrapTranscriptSuccessSource = 'member_briefing' | 'assistant_text';
|
|||
const BOOTSTRAP_RUNTIME_PROOF_TAIL_BYTES = 256 * 1024;
|
||||
const BOOTSTRAP_RUNTIME_EVENT_MAX_LINES = 256;
|
||||
const BOOTSTRAP_RUNTIME_EVENT_MAX_LINE_BYTES = 16 * 1024;
|
||||
const TEAMMATE_RUNTIME_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME';
|
||||
const TEAMMATE_RUNTIME_EVENTS_ENV = 'CLAUDE_CODE_TEAMMATE_RUNTIME_EVENTS_PATH';
|
||||
const TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV = 'CLAUDE_CODE_BOOTSTRAP_PROOF_TOKEN';
|
||||
const NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV =
|
||||
'CLAUDE_CODE_NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_PATH';
|
||||
|
||||
function getTeamRuntimeEventsDir(teamName: string): string {
|
||||
return path.join(getTeamsBasePath(), teamName, 'runtime');
|
||||
|
|
@ -5519,6 +5525,7 @@ export class TeamProvisioningService {
|
|||
>();
|
||||
private readonly memberSpawnStatusesCacheGenerationByTeam = new Map<string, number>();
|
||||
private readonly launchStateStore = new TeamLaunchStateStore();
|
||||
private readonly launchFailureArtifactPackRunIds = new Set<string>();
|
||||
private readonly launchStateStoreQueue = new Map<string, Promise<unknown>>();
|
||||
private readonly launchStateWrittenRunIdByTeam = new Map<string, string>();
|
||||
private readonly failedOpenCodeSecondaryRetryInFlightByTeam = new Map<
|
||||
|
|
@ -5624,6 +5631,56 @@ export class TeamProvisioningService {
|
|||
return choosePreferredLaunchSnapshot(bootstrapSnapshot, launchSnapshot);
|
||||
}
|
||||
|
||||
/**
 * Fire-and-forget capture of a launch failure artifact pack for a run.
 *
 * At most one pack is started per `teamName:runId`. The key is reserved before the write
 * starts and released again if the write fails, so a later call may retry. This method never
 * throws: both asynchronous write failures and synchronous errors raised while assembling
 * the pack input are logged as warnings.
 *
 * @param run Provisioning run that failed.
 * @param options.reason Failure reason recorded in the pack.
 * @param options.launchSnapshot Launch snapshot to record, if one is available.
 */
private writeLaunchFailureArtifactPackBestEffort(
  run: ProvisioningRun,
  options: {
    reason: string;
    launchSnapshot?: PersistedTeamLaunchSnapshot | null;
  }
): void {
  const key = `${run.teamName}:${run.runId}`;
  if (this.launchFailureArtifactPackRunIds.has(key)) return;
  this.launchFailureArtifactPackRunIds.add(key);

  // Shared by the sync and async failure paths: release the key and log.
  const handleFailure = (error: unknown): void => {
    this.launchFailureArtifactPackRunIds.delete(key);
    logger.warn(
      `[${run.teamName}] Failed to write launch failure artifact pack: ${
        error instanceof Error ? error.message : String(error)
      }`
    );
  };

  try {
    const memberSpawnStatuses = Object.fromEntries(run.memberSpawnStatuses.entries());
    const request = run.request as Partial<TeamCreateRequest> | undefined;
    void writeTeamLaunchFailureArtifactPack({
      teamName: run.teamName,
      runId: run.runId,
      reason: options.reason,
      startedAt: run.startedAt,
      cwd: request?.cwd ?? '',
      pid: run.child?.pid ?? run.progress.pid ?? null,
      providerId: request?.providerId,
      providerBackendId: request?.providerBackendId,
      model: request?.model,
      expectedMembers: run.expectedMembers,
      effectiveMembers: run.allEffectiveMembers,
      progress: run.progress,
      launchSnapshot: options.launchSnapshot ?? null,
      launchDiagnostics: run.progress.launchDiagnostics ?? buildLaunchDiagnosticsFromRun(run),
      memberSpawnStatuses,
      cliLogs: extractCliLogsFromRun(run),
      progressTraceLines: run.provisioningTraceLines,
      runtimeAdapterTraceLines: this.runtimeAdapterTraceLinesByRunId.get(run.runId),
      flags: {
        isLaunch: run.isLaunch,
        provisioningComplete: run.provisioningComplete,
        deterministicBootstrap: run.deterministicBootstrap,
        processKilled: run.processKilled,
        finalizingByTimeout: run.finalizingByTimeout,
        cancelRequested: run.cancelRequested,
      },
    }).catch(handleFailure);
  } catch (error: unknown) {
    // Assembling the input threw synchronously; keep the best-effort contract.
    handleFailure(error);
  }
}
|
||||
|
||||
async repairStaleTaskActivityIntervalsBeforeSnapshot(teamName: string): Promise<void> {
|
||||
if (this.crashRepairedActivityIntervalsByTeam.has(teamName)) {
|
||||
return;
|
||||
|
|
@ -13787,6 +13844,13 @@ export class TeamProvisioningService {
|
|||
providerId: TeamProviderId;
|
||||
joinedAt: number;
|
||||
bootstrapExpectedAfter: string;
|
||||
backendType?: 'tmux' | 'process';
|
||||
runtimePid?: number;
|
||||
bootstrapRuntimeEventsPath?: string;
|
||||
bootstrapProofToken?: string;
|
||||
bootstrapRunId?: string;
|
||||
bootstrapContextHash?: string;
|
||||
bootstrapBriefingHash?: string;
|
||||
}): Promise<void> {
|
||||
const configPath = path.join(getTeamsBasePath(), input.teamName, 'config.json');
|
||||
const raw = await tryReadRegularFileUtf8(configPath, {
|
||||
|
|
@ -13822,10 +13886,25 @@ export class TeamProvisioningService {
|
|||
color: input.color,
|
||||
joinedAt: input.joinedAt,
|
||||
bootstrapExpectedAfter: input.bootstrapExpectedAfter,
|
||||
...(input.bootstrapProofToken ? { bootstrapProofToken: input.bootstrapProofToken } : {}),
|
||||
...(input.bootstrapRunId ? { bootstrapRunId: input.bootstrapRunId } : {}),
|
||||
...(input.bootstrapRuntimeEventsPath
|
||||
? { bootstrapRuntimeEventsPath: input.bootstrapRuntimeEventsPath }
|
||||
: {}),
|
||||
...(input.bootstrapContextHash
|
||||
? {
|
||||
bootstrapProofMode: 'native_app_managed_context',
|
||||
bootstrapContextHash: input.bootstrapContextHash,
|
||||
}
|
||||
: {}),
|
||||
...(input.bootstrapBriefingHash
|
||||
? { bootstrapBriefingHash: input.bootstrapBriefingHash }
|
||||
: {}),
|
||||
tmuxPaneId: input.paneId,
|
||||
...(typeof input.runtimePid === 'number' ? { runtimePid: input.runtimePid } : {}),
|
||||
cwd: input.cwd,
|
||||
subscriptions: Array.isArray(existing.subscriptions) ? existing.subscriptions : [],
|
||||
backendType: 'tmux',
|
||||
backendType: input.backendType ?? 'tmux',
|
||||
};
|
||||
|
||||
if (existingIndex >= 0) {
|
||||
|
|
@ -14047,6 +14126,378 @@ export class TeamProvisioningService {
|
|||
this.setMemberSpawnStatus(input.run, input.memberName, 'waiting');
|
||||
}
|
||||
|
||||
/**
 * Restarts one configured teammate as a detached, headless CLI process.
 *
 * In order, this method: resolves the CLI binary and working directory, builds
 * the provisioning environment, writes the MCP config, materializes the native
 * bootstrap context, spawns the process with stdout/stderr piped into
 * per-member log files, appends runtime events to the member's events file,
 * persists the new runtime metadata, enqueues the restart prompt, and finally
 * marks the member spawn status as `waiting`.
 *
 * @param input Restart context: the owning run, team/lead/member names, the
 *   team config, the effective configured member and persisted runtime members.
 * @throws Error when the CLI binary cannot be resolved, when the provisioning
 *   environment reports a warning, or when the spawned child has no pid.
 */
private async launchDirectProcessMemberRestart(input: {
  run: ProvisioningRun;
  teamName: string;
  displayName: string;
  leadName: string;
  memberName: string;
  config: TeamConfig;
  configuredMember: NonNullable<
    ReturnType<TeamProvisioningService['resolveEffectiveConfiguredMember']>
  >;
  persistedRuntimeMembers: readonly PersistedRuntimeMemberLike[];
}): Promise<void> {
  const providerId = resolveTeamProviderId(input.configuredMember.providerId);
  const claudePath = input.run.spawnContext?.claudePath ?? (await ClaudeBinaryResolver.resolve());
  if (!claudePath) {
    throw new Error('Claude CLI not found; install it or provide a valid path');
  }

  const cwd = this.resolveDirectRestartRuntimeCwd({
    configuredMember: input.configuredMember,
    persistedRuntimeMembers: input.persistedRuntimeMembers,
    config: input.config,
    run: input.run,
  });
  await ensureCwdExists(cwd);

  const provisioningEnv = await this.buildProvisioningEnv(
    providerId,
    input.configuredMember.providerBackendId,
    {
      teamRuntimeAuth: {
        teamName: input.teamName,
        authMaterialId: `${input.run.runId}-process-restart-${input.configuredMember.name}-${randomUUID()}`,
        allowAnthropicApiKeyHelper: true,
      },
    }
  );
  // A provisioning warning is treated as fatal for a restart.
  if (provisioningEnv.warning) {
    throw new Error(provisioningEnv.warning);
  }

  const mcpConfigPath = await this.mcpConfigBuilder.writeConfigFile(cwd);
  const agentId = `${input.configuredMember.name}@${input.teamName}`;
  // Prefer the color stored in the team config; fall back to the name-derived color.
  const color =
    input.config.members
      ?.find((member) => matchesExactTeamMemberName(member.name, input.memberName))
      ?.color?.trim() || getMemberColorByName(input.configuredMember.name);
  const parentSessionId =
    input.run.detectedSessionId?.trim() || input.config.leadSessionId?.trim() || input.run.runId;
  // Only copy the optional member fields that are actually set.
  const memberSpec: TeamCreateRequest['members'][number] = {
    name: input.configuredMember.name,
    ...(input.configuredMember.role ? { role: input.configuredMember.role } : {}),
    ...(input.configuredMember.workflow ? { workflow: input.configuredMember.workflow } : {}),
    ...(input.configuredMember.providerId
      ? { providerId: input.configuredMember.providerId }
      : {}),
    ...(input.configuredMember.providerBackendId
      ? { providerBackendId: input.configuredMember.providerBackendId }
      : {}),
    ...(input.configuredMember.model ? { model: input.configuredMember.model } : {}),
    ...(input.configuredMember.effort ? { effort: input.configuredMember.effort } : {}),
    ...(input.configuredMember.agentType ? { agentType: input.configuredMember.agentType } : {}),
    ...(input.configuredMember.isolation === 'worktree'
      ? { isolation: 'worktree' as const }
      : {}),
    ...(input.configuredMember.cwd ? { cwd: input.configuredMember.cwd } : {}),
  };
  const prompt = buildMemberSpawnPrompt(
    memberSpec,
    input.displayName,
    input.teamName,
    input.leadName,
    {
      restart: true,
    }
  );
  const bootstrapExpectedAfter = nowIso();
  const bootstrapProofToken = randomUUID();
  const runtimePaths = this.getDirectProcessRestartRuntimePaths(
    input.teamName,
    input.configuredMember.name
  );
  await fs.promises.mkdir(runtimePaths.dir, { recursive: true });
  // Start this restart with an empty events file.
  await fs.promises.writeFile(runtimePaths.eventsPath, '', { encoding: 'utf8', mode: 0o600 });

  const nativeBootstrapSpec =
    (
      await buildNativeAppManagedBootstrapSpecs({
        teamName: input.teamName,
        cwd,
        members: [memberSpec],
      })
    ).get(input.configuredMember.name) ?? null;
  const nativeBootstrapEnv = await this.materializeDirectProcessNativeBootstrapContext({
    teamName: input.teamName,
    memberName: input.configuredMember.name,
    agentId,
    providerId,
    runId: input.run.runId,
    bootstrapProofToken,
    spec: nativeBootstrapSpec,
  });

  const runtimeArgsPlan = await this.buildTeamRuntimeLaunchArgsPlan({
    teamName: input.teamName,
    providerId,
    launchIdentity: null,
    envResolution: provisioningEnv,
    extraArgs: [],
    includeAnthropicHelper: providerId === 'anthropic',
    contextLabel: `Direct process teammate restart (${input.configuredMember.name})`,
  });

  const runtimeArgs = mergeJsonSettingsArgs([
    '--teammate-runtime',
    'headless',
    '--agent-id',
    agentId,
    '--agent-name',
    input.configuredMember.name,
    '--team-name',
    input.teamName,
    '--agent-color',
    color,
    '--parent-session-id',
    parentSessionId,
    ...(input.configuredMember.agentType
      ? ['--agent-type', input.configuredMember.agentType]
      : []),
    '--mcp-config',
    mcpConfigPath,
    '--strict-mcp-config',
    '--disallowedTools',
    APP_TEAM_RUNTIME_DISALLOWED_TOOLS,
    ...(input.run.request.skipPermissions !== false
      ? ['--dangerously-skip-permissions', '--permission-mode', 'bypassPermissions']
      : ['--permission-prompt-tool', 'stdio', '--permission-mode', 'default']),
    ...(input.configuredMember.model ? ['--model', input.configuredMember.model] : []),
    ...(input.configuredMember.effort ? ['--effort', input.configuredMember.effort] : []),
    ...runtimeArgsPlan.fastModeArgs,
    ...runtimeArgsPlan.runtimeTurnSettledHookArgs,
    ...runtimeArgsPlan.providerArgs,
    ...runtimeArgsPlan.settingsArgs,
  ]);

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Appends one runtime event; every event of this restart shares the same identity fields.
  const recordRuntimeEvent = (type: string, pid: number, detail?: string): Promise<void> =>
    this.appendDirectProcessRuntimeEvent({
      type,
      eventsPath: runtimePaths.eventsPath,
      pid,
      teamName: input.teamName,
      agentName: input.configuredMember.name,
      agentId,
      runId: parentSessionId,
      bootstrapRunId: input.run.runId,
      source: 'TeamProvisioningService.direct_process_restart',
      ...(detail ? { detail } : {}),
    });

  // Variant for event-emitter callbacks: a failed write must never become an
  // unhandled promise rejection, so it is logged and dropped.
  const recordRuntimeEventBestEffort = (type: string, pid: number, detail?: string): void => {
    void recordRuntimeEvent(type, pid, detail).catch((error: unknown) => {
      logger.debug(
        `[${input.teamName}] Direct process restart failed to record ${type} event for ${agentId}: ${describeError(error)}`
      );
    });
  };

  const stdoutLog = fs.createWriteStream(runtimePaths.stdoutPath, { flags: 'a', mode: 0o600 });
  const stderrLog = fs.createWriteStream(runtimePaths.stderrPath, { flags: 'a', mode: 0o600 });
  // A stream 'error' event without a listener is thrown as an uncaught exception,
  // so log-file failures are reported instead of taking down the host process.
  stdoutLog.on('error', (error) => {
    logger.debug(
      `[${input.teamName}] Direct process restart stdout log failed for ${agentId}: ${error.message}`
    );
  });
  stderrLog.on('error', (error) => {
    logger.debug(
      `[${input.teamName}] Direct process restart stderr log failed for ${agentId}: ${error.message}`
    );
  });

  const child = spawnCli(claudePath, runtimeArgs, {
    cwd,
    detached: true,
    env: {
      ...provisioningEnv.env,
      ...nativeBootstrapEnv,
      [TEAMMATE_RUNTIME_ENV]: 'headless',
      [TEAMMATE_RUNTIME_EVENTS_ENV]: runtimePaths.eventsPath,
      [TEAMMATE_BOOTSTRAP_PROOF_TOKEN_ENV]: bootstrapProofToken,
    },
    stdio: ['pipe', 'pipe', 'pipe'],
  });
  if (!child.pid) {
    // NOTE(review): assumes spawnCli returns a Node ChildProcess. Node reports a
    // failed spawn through an asynchronous 'error' event; without a listener that
    // event would surface as an uncaught exception after this method has thrown.
    child.once('error', (error) => {
      logger.debug(
        `[${input.teamName}] Direct process restart spawn failed for ${agentId}: ${error.message}`
      );
    });
    stdoutLog.destroy();
    stderrLog.destroy();
    throw new Error(`Failed to spawn teammate process for ${agentId}: missing pid`);
  }

  const runtimePid = child.pid;
  // Process-backed teammates reuse the pane id slot with a `process:<pid>` marker.
  const processPaneId = `process:${runtimePid}`;
  child.stdout?.pipe(stdoutLog);
  child.stderr?.pipe(stderrLog);
  child.stdin?.on('error', (error) => {
    logger.debug(
      `[${input.teamName}] Direct process restart stdin failed for ${agentId}: ${error.message}`
    );
  });
  child.once('close', (code, signal) => {
    const detail =
      code !== null
        ? `process exited with code ${code}`
        : signal
          ? `process exited from signal ${signal}`
          : 'process exited';
    recordRuntimeEventBestEffort('exited', runtimePid, detail);
    stdoutLog.end();
    stderrLog.end();
  });
  child.once('error', (error) => {
    recordRuntimeEventBestEffort('failed', runtimePid, `process error: ${error.message}`);
  });
  // Unref the pipes and the child so they do not keep this process's event loop alive.
  (child.stdin as { unref?: () => void } | null)?.unref?.();
  (child.stdout as { unref?: () => void } | null)?.unref?.();
  (child.stderr as { unref?: () => void } | null)?.unref?.();
  child.unref();

  await recordRuntimeEvent('process_spawned', runtimePid, 'process spawned');
  await recordRuntimeEvent('stdout_attached', runtimePid, 'stdout and stderr attached');

  await this.updateDirectTmuxRestartMemberConfig({
    teamName: input.teamName,
    memberName: input.memberName,
    member: input.configuredMember,
    agentId,
    color,
    prompt,
    paneId: processPaneId,
    cwd,
    providerId,
    joinedAt: Date.now(),
    bootstrapExpectedAfter,
    backendType: 'process',
    runtimePid,
    bootstrapRuntimeEventsPath: runtimePaths.eventsPath,
    bootstrapProofToken,
    bootstrapRunId: input.run.runId,
    ...(nativeBootstrapSpec
      ? {
          bootstrapContextHash: nativeBootstrapSpec.contextHash,
          bootstrapBriefingHash: nativeBootstrapSpec.briefingHash,
        }
      : {}),
  });
  this.enqueueDirectRestartPrompt({
    teamName: input.teamName,
    memberName: input.configuredMember.name,
    leadName: input.leadName,
    leadSessionId: parentSessionId,
    prompt,
  });
  await recordRuntimeEvent('mailbox_bootstrap_written', runtimePid);
  this.appendMemberBootstrapDiagnostic(
    input.run,
    input.memberName,
    `restart process spawned with pid ${runtimePid}`
  );
  this.setMemberSpawnStatus(input.run, input.memberName, 'waiting');
}
|
||||
|
||||
/**
 * Computes where a process-backed teammate keeps its runtime artifacts.
 *
 * All three files live in the team's runtime events directory and share a
 * file prefix derived from the sanitized member name.
 *
 * @param teamName Team whose runtime events directory is used.
 * @param memberName Member name used to derive the file prefix.
 * @returns The directory plus the events, stdout and stderr file paths.
 */
private getDirectProcessRestartRuntimePaths(
  teamName: string,
  memberName: string
): { dir: string; eventsPath: string; stdoutPath: string; stderrPath: string } {
  const runtimeDir = getTeamRuntimeEventsDir(teamName);
  const stem = sanitizeProcessRuntimeEventFilePrefix(memberName);
  const inRuntimeDir = (fileName: string): string => path.join(runtimeDir, fileName);

  const eventsPath = inRuntimeDir(`${stem}.runtime.jsonl`);
  const stdoutPath = inRuntimeDir(`${stem}.stdout.log`);
  const stderrPath = inRuntimeDir(`${stem}.stderr.log`);
  return { dir: runtimeDir, eventsPath, stdoutPath, stderrPath };
}
|
||||
|
||||
/**
 * Writes the native app-managed bootstrap context for a process-backed
 * teammate and returns the environment entry pointing at it.
 *
 * Nothing is written (and `{}` is returned) when there is no bootstrap spec or
 * the provider is neither `anthropic` nor `codex`.
 *
 * The context is written to a `.tmp` sibling first and then renamed, so the
 * final path only ever holds a completely written file. If the write or the
 * rename fails, the temp file is removed best-effort before the error is
 * rethrown, because it contains the bootstrap proof token.
 *
 * @param input Identity of the teammate, the proof token and the bootstrap spec.
 * @returns An env map with the context file path, or an empty map.
 * @throws Whatever the underlying mkdir/writeFile/rename call rejects with.
 */
private async materializeDirectProcessNativeBootstrapContext(input: {
  teamName: string;
  memberName: string;
  agentId: string;
  providerId: TeamProviderId;
  runId: string;
  bootstrapProofToken: string;
  spec: NativeAppManagedBootstrapSpec | null;
}): Promise<Record<string, string>> {
  if (!input.spec || (input.providerId !== 'anthropic' && input.providerId !== 'codex')) {
    return {};
  }
  // Explicit fields are listed after the spread so they win over same-named spec fields.
  const context = {
    ...input.spec,
    kind: 'native_app_managed_bootstrap',
    teamName: input.teamName,
    memberName: input.memberName,
    agentId: input.agentId,
    runId: input.runId,
    provider: input.providerId,
    bootstrapProofToken: input.bootstrapProofToken,
  };
  const dir = path.join(getTeamRuntimeEventsDir(input.teamName), 'native-bootstrap');
  await fs.promises.mkdir(dir, { recursive: true });
  // The random suffix gives every restart its own context file.
  const finalPath = path.join(
    dir,
    `${sanitizeProcessRuntimeEventFilePrefix(input.memberName)}-${randomUUID()}.native-bootstrap.json`
  );
  const tempPath = `${finalPath}.tmp`;
  try {
    await fs.promises.writeFile(tempPath, JSON.stringify(context), {
      encoding: 'utf8',
      mode: 0o600,
    });
    await fs.promises.rename(tempPath, finalPath);
  } catch (error) {
    // Do not leave a partially written, token-bearing temp file behind.
    await fs.promises.rm(tempPath, { force: true }).catch(() => undefined);
    throw error;
  }
  return { [NATIVE_APP_MANAGED_BOOTSTRAP_CONTEXT_ENV]: finalPath };
}
|
||||
|
||||
/**
 * Appends one JSON line describing a runtime event to the member's events file.
 *
 * The parent directory is created if needed. Each record carries `version: 1`
 * and a timestamp taken at write time; `detail` is only included when it is a
 * non-empty string.
 *
 * @param input Event type, target file, process id and identity fields.
 * @throws Whatever the underlying mkdir/appendFile call rejects with.
 */
private async appendDirectProcessRuntimeEvent(input: {
  type: string;
  eventsPath: string;
  pid: number;
  teamName: string;
  agentName: string;
  agentId: string;
  runId: string;
  bootstrapRunId: string;
  source: string;
  detail?: string;
}): Promise<void> {
  await fs.promises.mkdir(path.dirname(input.eventsPath), { recursive: true });

  // Key order is kept stable so the serialized line layout does not change.
  const record: Record<string, unknown> = {
    version: 1,
    type: input.type,
    timestamp: nowIso(),
    pid: input.pid,
    teamName: input.teamName,
    agentName: input.agentName,
    agentId: input.agentId,
    runId: input.runId,
    bootstrapRunId: input.bootstrapRunId,
    source: input.source,
  };
  if (input.detail) {
    record.detail = input.detail;
  }

  const line = `${JSON.stringify(record)}\n`;
  await fs.promises.appendFile(input.eventsPath, line, { encoding: 'utf8', mode: 0o600 });
}
|
||||
|
||||
/**
 * Builds the key identifying a member lifecycle operation: the trimmed,
 * lower-cased team and member names joined by a NUL character.
 */
private getMemberLifecycleOperationKey(teamName: string, memberName: string): string {
  const normalize = (value: string): string => value.trim().toLowerCase();
  return [normalize(teamName), normalize(memberName)].join('\u0000');
}
|
||||
|
|
@ -14408,6 +14859,38 @@ export class TeamProvisioningService {
|
|||
}
|
||||
}
|
||||
|
||||
const shouldDirectProcessRestart = backendTypes.has('process') || livePids.size > 0;
|
||||
if (shouldDirectProcessRestart) {
|
||||
try {
|
||||
await this.launchDirectProcessMemberRestart({
|
||||
run,
|
||||
teamName,
|
||||
displayName: config?.name?.trim() || teamName,
|
||||
leadName,
|
||||
memberName,
|
||||
config,
|
||||
configuredMember,
|
||||
persistedRuntimeMembers,
|
||||
});
|
||||
return;
|
||||
} catch (error) {
|
||||
run.pendingMemberRestarts.delete(memberName);
|
||||
this.setMemberSpawnStatus(
|
||||
run,
|
||||
memberName,
|
||||
'error',
|
||||
error instanceof Error ? error.message : String(error)
|
||||
);
|
||||
if (run.isLaunch) {
|
||||
await this.persistLaunchStateSnapshot(
|
||||
run,
|
||||
run.provisioningComplete ? 'finished' : 'active'
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
const restartMessage = buildRestartMemberSpawnMessage(
|
||||
teamName,
|
||||
config?.name?.trim() || teamName,
|
||||
|
|
@ -29518,6 +30001,14 @@ export class TeamProvisioningService {
|
|||
}
|
||||
);
|
||||
run.onProgress(progress);
|
||||
if (hasSpawnFailures) {
|
||||
this.writeLaunchFailureArtifactPackBestEffort(run, {
|
||||
reason: run.isLaunch
|
||||
? 'launch_completed_with_teammate_errors'
|
||||
: 'provisioning_completed_with_teammate_errors',
|
||||
launchSnapshot: persistedLaunchSnapshot,
|
||||
});
|
||||
}
|
||||
this.provisioningRunByTeam.delete(run.teamName);
|
||||
this.aliveRunByTeam.set(run.teamName, run.runId);
|
||||
logger.info(`[${run.teamName}] Provisioning complete. Process alive for subsequent tasks.`);
|
||||
|
|
@ -30111,6 +30602,18 @@ export class TeamProvisioningService {
|
|||
});
|
||||
void this.persistLaunchStateSnapshot(run, 'finished');
|
||||
}
|
||||
if (
|
||||
!hasNewerTrackedRun &&
|
||||
(run.progress.state === 'failed' ||
|
||||
(run.isLaunch && !run.provisioningComplete && !run.cancelRequested))
|
||||
) {
|
||||
this.writeLaunchFailureArtifactPackBestEffort(run, {
|
||||
reason:
|
||||
run.progress.state === 'failed'
|
||||
? 'launch_progress_failed'
|
||||
: 'launch_cleanup_unconfirmed_bootstrap',
|
||||
});
|
||||
}
|
||||
this.resetRuntimeToolActivity(run);
|
||||
this.setLeadActivity(run, 'offline');
|
||||
run.pendingDirectCrossTeamSendRefresh = false;
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import {
|
|||
getTeamsBasePath,
|
||||
setClaudeBasePathOverride,
|
||||
} from '../../../../src/main/utils/pathDecoder';
|
||||
import { killProcessByPid } from '../../../../src/main/utils/processKill';
|
||||
import {
|
||||
createOpenCodeLiveHarness,
|
||||
waitForOpenCodeLanesStopped,
|
||||
|
|
@ -131,8 +132,7 @@ liveDescribe('Mixed provider team launch live e2e', () => {
|
|||
afterEach(async () => {
|
||||
const keepProcesses = process.env.MIXED_PROVIDER_TEAM_LIVE_KEEP_PROCESSES === '1';
|
||||
if (!keepProcesses && harness && teamName) {
|
||||
await harness.svc.stopTeam(teamName).catch(() => undefined);
|
||||
await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined);
|
||||
await cleanupMixedProviderSmokeTeam(harness, teamName);
|
||||
}
|
||||
if (!keepProcesses && usingAnthropicSubscriptionAuth && teamName) {
|
||||
await fs.rm(path.join(getTeamsBasePath(), teamName), { recursive: true, force: true });
|
||||
|
|
@ -531,6 +531,51 @@ async function removeTempDirWithRetries(dirPath: string): Promise<void> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Stops the smoke team and terminates its process-backed teammates.
 *
 * The runtime snapshot is captured twice: once before stopping the team and
 * once after the stop and lane wait have finished. Both snapshots are passed
 * to the process terminator. Snapshot, stop and wait failures are all ignored.
 */
async function cleanupMixedProviderSmokeTeam(
  harness: OpenCodeLiveHarness,
  teamName: string
): Promise<void> {
  const captureSnapshot = () =>
    harness.svc.getTeamAgentRuntimeSnapshot(teamName).catch(() => null);

  const snapshotBeforeStop = await captureSnapshot();
  await harness.svc.stopTeam(teamName).catch(() => undefined);
  await waitForOpenCodeLanesStopped(teamName, 90_000).catch(() => undefined);
  await terminateSmokeOwnedProcessBackends(snapshotBeforeStop);

  const snapshotAfterStop = await captureSnapshot();
  await terminateSmokeOwnedProcessBackends(snapshotAfterStop);
}
|
||||
|
||||
/**
 * Kills the still-running processes of process-backed, non-OpenCode members
 * listed in the given runtime snapshot.
 *
 * Members with another backend type or with the `opencode` provider are
 * skipped. A `null` snapshot results in no work.
 */
async function terminateSmokeOwnedProcessBackends(
  snapshot: Awaited<ReturnType<OpenCodeLiveHarness['svc']['getTeamAgentRuntimeSnapshot']>> | null
): Promise<void> {
  const ownedPids = new Set<number>();
  Object.values(snapshot?.members ?? {}).forEach((member) => {
    const isSmokeOwnedProcess = member.backendType === 'process' && member.providerId !== 'opencode';
    if (!isSmokeOwnedProcess) {
      return;
    }
    const candidate = member.runtimePid ?? member.pid;
    if (typeof candidate !== 'number' || !Number.isFinite(candidate) || candidate <= 0) {
      return;
    }
    ownedPids.add(candidate);
  });

  for (const pid of ownedPids) {
    // Signal 0 only probes the process; a throw is treated as "nothing to kill".
    try {
      process.kill(pid, 0);
    } catch {
      continue;
    }
    try {
      killProcessByPid(pid);
    } catch {
      // Best-effort smoke cleanup. The process may have exited between the liveness probe and kill.
    }
  }
}
|
||||
|
||||
function formatProgressDump(progressEvents: TeamProvisioningProgress[]): string {
|
||||
return redactSecrets(
|
||||
progressEvents
|
||||
|
|
|
|||
860
test/main/services/team/ProviderLaunchStress.live-e2e.test.ts
Normal file
860
test/main/services/team/ProviderLaunchStress.live-e2e.test.ts
Normal file
|
|
@ -0,0 +1,860 @@
|
|||
// @vitest-environment node
|
||||
import { constants as fsConstants, promises as fs } from 'node:fs';
|
||||
import * as os from 'node:os';
|
||||
import * as path from 'node:path';
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { createOpenCodeLiveHarness, waitForOpenCodeLanesStopped, waitUntil } from './openCodeLiveTestHarness';
|
||||
import {
|
||||
getTasksBasePath,
|
||||
getTeamsBasePath,
|
||||
setClaudeBasePathOverride,
|
||||
} from '../../../../src/main/utils/pathDecoder';
|
||||
import { killProcessByPid } from '../../../../src/main/utils/processKill';
|
||||
import { TeamDataService } from '../../../../src/main/services/team/TeamDataService';
|
||||
import { TeamProvisioningService } from '../../../../src/main/services/team/TeamProvisioningService';
|
||||
import { TeamTaskReader } from '../../../../src/main/services/team/TeamTaskReader';
|
||||
|
||||
import type {
|
||||
TeamAgentRuntimeSnapshot,
|
||||
TeamCreateRequest,
|
||||
TeamMember,
|
||||
TeamProviderId,
|
||||
TeamProvisioningProgress,
|
||||
} from '../../../../src/shared/types';
|
||||
|
||||
// Replace the NotificationManager module so `getInstance()` hands back a stub
// whose `addTeamNotification` is an async mock that resolves to undefined.
vi.mock('../../../../src/main/services/infrastructure/NotificationManager', () => {
  const getInstance = () => ({
    addTeamNotification: vi.fn(async () => undefined),
  });
  return {
    NotificationManager: { getInstance },
  };
});
|
||||
|
||||
/** Provider combinations the stress suite can launch. */
type ProviderLaunchStressScenario = 'anthropic' | 'codex' | 'opencode' | 'mixed';

/** Bookkeeping for one launched scenario so it can be cleaned up later. */
interface ActiveScenario {
  scenario: ProviderLaunchStressScenario;
  teamName: string;
  svc: TeamProvisioningService;
  harness?: Awaited<ReturnType<typeof createOpenCodeLiveHarness>>;
  codexCleanup?: () => Promise<void>;
  /** Set to true right before a scenario failure is thrown. */
  failed: boolean;
}

// The suite only runs when explicitly enabled AND Anthropic auth is configured;
// the auth probe is not evaluated unless the env flag is set.
const providerLaunchStressLiveEnabled =
  process.env.PROVIDER_LAUNCH_STRESS_LIVE === '1' && hasAnthropicAuthConfigured();
const liveDescribe = providerLaunchStressLiveEnabled ? describe : describe.skip;

// Fallback orchestrator CLI path, used when the CLI path env variable is not set.
const DEFAULT_ORCHESTRATOR_CLI = '/Users/belief/dev/projects/claude/agent_teams_orchestrator/cli';
const DEFAULT_ANTHROPIC_MODEL = 'haiku';
const DEFAULT_CODEX_MODEL = 'gpt-5.4-mini';
const DEFAULT_CODEX_EFFORT = 'low' as const;
const DEFAULT_OPENCODE_MODEL = 'openai/gpt-5.4-mini';
const DEFAULT_ORDER: ProviderLaunchStressScenario[] = ['anthropic', 'codex', 'opencode', 'mixed'];
const MEMBER_NAMES = ['alice', 'bob', 'jack', 'tom', 'atlas', 'nova', 'cody', 'oscar'];
// Both timeouts are five minutes, expressed in milliseconds.
const RESTART_CONFIRM_TIMEOUT_MS = 300_000;
const POST_LAUNCH_WORK_TIMEOUT_MS = 300_000;
// Module-level mirrors of the per-test temp paths, for helpers outside the suite closure.
let currentStressTempDir = '';
let currentStressProjectPath = '';
|
||||
|
||||
/**
 * Live stress suite: launches provider teams, restarts teammates and checks
 * post-launch work for every scenario returned by `getStressOrder()`.
 *
 * Each test gets a fresh temp directory and a controlled process environment;
 * `afterEach` cleans up remaining scenarios and restores the environment.
 */
liveDescribe('provider launch stress live e2e', () => {
  let tempDir: string;
  let tempClaudeRoot: string;
  let tempHome: string;
  let projectPath: string;
  let previousCliPath: string | undefined;
  let previousCliFlavor: string | undefined;
  let previousCodexHome: string | undefined;
  let previousHome: string | undefined;
  let previousUserProfile: string | undefined;
  let previousNodeEnv: string | undefined;
  let previousAnthropicApiKey: string | undefined;
  let previousAnthropicAuthToken: string | undefined;
  let previousClaudeJsonConfig: string | null | undefined;
  const activeScenarios: ActiveScenario[] = [];

  beforeEach(async () => {
    // Snapshot the environment BEFORE any fallible setup. If a later step of this
    // hook throws, afterEach restores from these values; capturing them late would
    // make it "restore" never-captured (undefined) values and wipe HOME and friends.
    previousCliPath = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH;
    previousCliFlavor = process.env.CLAUDE_TEAM_CLI_FLAVOR;
    previousCodexHome = process.env.CODEX_HOME;
    previousHome = process.env.HOME;
    previousUserProfile = process.env.USERPROFILE;
    previousNodeEnv = process.env.NODE_ENV;
    previousAnthropicApiKey = process.env.ANTHROPIC_API_KEY;
    previousAnthropicAuthToken = process.env.ANTHROPIC_AUTH_TOKEN;
    // Never carry a config backup over from an earlier test run of this hook.
    previousClaudeJsonConfig = undefined;

    tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'provider-launch-stress-live-'));
    tempClaudeRoot = usingAnthropicSubscriptionAuth()
      ? os.userInfo().homedir
      : path.join(tempDir, '.claude');
    tempHome = path.join(tempDir, 'home');
    projectPath = path.join(tempDir, 'project');
    currentStressTempDir = tempDir;
    currentStressProjectPath = projectPath;
    await fs.mkdir(tempHome, { recursive: true });
    await fs.mkdir(projectPath, { recursive: true });
    await fs.writeFile(
      path.join(projectPath, 'README.md'),
      '# Provider launch stress live e2e\n\nKeep this project intentionally tiny.\n',
      'utf8'
    );

    if (usingAnthropicSubscriptionAuth()) {
      // Subscription auth works against the real user home; keep the returned
      // previous config so afterEach can put it back.
      setClaudeBasePathOverride(null);
      previousClaudeJsonConfig = await upsertTrustedClaudeProjectConfig(
        tempClaudeRoot,
        projectPath
      );
    } else {
      await fs.mkdir(tempClaudeRoot, { recursive: true });
      await writeTrustedClaudeConfig(tempClaudeRoot, projectPath);
      setClaudeBasePathOverride(tempClaudeRoot);
      previousClaudeJsonConfig = undefined;
    }

    process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH =
      process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim() || DEFAULT_ORCHESTRATOR_CLI;
    process.env.CLAUDE_TEAM_CLI_FLAVOR = 'agent_teams_orchestrator';
    process.env.CODEX_HOME = resolveConnectedCodexHome(previousCodexHome);
    process.env.HOME = usingAnthropicSubscriptionAuth() ? os.userInfo().homedir : tempHome;
    process.env.USERPROFILE = usingAnthropicSubscriptionAuth() ? os.userInfo().homedir : tempHome;
    process.env.NODE_ENV = 'production';
    if (usingAnthropicSubscriptionAuth()) {
      delete process.env.ANTHROPIC_API_KEY;
      delete process.env.ANTHROPIC_AUTH_TOKEN;
    }
  });

  afterEach(async () => {
    try {
      // Clean up in reverse launch order; files of failed scenarios are preserved.
      for (const active of [...activeScenarios].reverse()) {
        await cleanupActiveScenario(active, { preserveFiles: active.failed }).catch(() => undefined);
      }
      activeScenarios.length = 0;
      discardKnownProviderLaunchStressWarnings();

      if (usingAnthropicSubscriptionAuth() && previousClaudeJsonConfig !== undefined) {
        await restoreClaudeJsonConfig(tempClaudeRoot, previousClaudeJsonConfig);
      }
    } finally {
      // Environment restoration must happen even when a cleanup step above throws,
      // otherwise the modified env leaks into whatever runs next in this process.
      setClaudeBasePathOverride(null);

      restoreEnv('CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH', previousCliPath);
      restoreEnv('CLAUDE_TEAM_CLI_FLAVOR', previousCliFlavor);
      restoreEnv('CODEX_HOME', previousCodexHome);
      restoreEnv('HOME', previousHome);
      restoreEnv('USERPROFILE', previousUserProfile);
      restoreEnv('NODE_ENV', previousNodeEnv);
      restoreEnv('ANTHROPIC_API_KEY', previousAnthropicApiKey);
      restoreEnv('ANTHROPIC_AUTH_TOKEN', previousAnthropicAuthToken);

      try {
        // tempDir is unset when beforeEach failed before creating it.
        if (tempDir) {
          if (process.env.PROVIDER_LAUNCH_STRESS_KEEP_TEMP === '1') {
            process.stderr.write(`[ProviderLaunchStress.live] preserved temp dir: ${tempDir}\n`);
          } else {
            await fs.rm(tempDir, { recursive: true, force: true });
          }
        }
      } finally {
        currentStressTempDir = '';
        currentStressProjectPath = '';
      }
    }
  }, 240_000);

  it(
    'launches, restarts, and exercises post-launch work for provider teams with five teammates each',
    async () => {
      const orchestratorCli = process.env.CLAUDE_AGENT_TEAMS_ORCHESTRATOR_CLI_PATH?.trim();
      expect(orchestratorCli).toBeTruthy();
      await assertExecutable(orchestratorCli!);
      await assertCodexSubscriptionAuthAvailable(process.env.CODEX_HOME!);

      // Scenarios run strictly one after another.
      for (const scenario of getStressOrder()) {
        await runProviderStressScenario(scenario, activeScenarios);
      }
    },
    30 * 60_000
  );
});
|
||||
|
||||
/**
 * Launches one stress scenario end to end: creates the team, waits until every
 * expected teammate is confirmed online and alive, then runs the restart and
 * post-launch work checks.
 *
 * The scenario is pushed onto `activeScenarios` as soon as its service exists.
 * On success it is cleaned up and removed from that list here; on failure it is
 * flagged `failed` and left in the list for the caller to clean up.
 */
async function runProviderStressScenario(
  scenario: ProviderLaunchStressScenario,
  activeScenarios: ActiveScenario[]
): Promise<void> {
  const selected = resolveScenarioSelection(scenario);
  const memberCount = getStressMemberCount();
  const teamName = `provider-stress-${scenario}-${Date.now()}`;
  const progressEvents: TeamProvisioningProgress[] = [];
  process.stderr.write(
    `[ProviderLaunchStress.live] starting ${scenario} with ${memberCount} teammates\n`
  );

  const needsCodex = scenario === 'codex' || scenario === 'mixed';
  const needsOpenCode = scenario === 'opencode' || scenario === 'mixed';
  let codexCleanup: (() => Promise<void>) | undefined;
  let harness: Awaited<ReturnType<typeof createOpenCodeLiveHarness>> | undefined;
  try {
    if (needsCodex) {
      codexCleanup = await installCodexAccountFeature();
    }
    if (needsOpenCode) {
      harness = await createOpenCodeLiveHarness({
        tempDir: currentStressTempDir,
        selectedModel: selected.openCodeModel,
        projectPath: projectPathForStress(),
      });
    }
  } catch (error) {
    // Undo whatever part of the provider setup already succeeded.
    await harness?.dispose().catch(() => undefined);
    await codexCleanup?.().catch(() => undefined);
    throw error;
  }

  const svc = harness?.svc ?? new TeamProvisioningService();
  const active: ActiveScenario = { scenario, teamName, svc, harness, codexCleanup, failed: false };
  activeScenarios.push(active);

  // Flags the scenario as failed and throws with the collected diagnostics.
  const failWithDiagnostics = async (): Promise<never> => {
    active.failed = true;
    throw new Error(await formatStressDiagnostics(svc, teamName, progressEvents));
  };

  try {
    const createRequest = buildStressCreateRequest({
      scenario,
      teamName,
      memberCount,
      selection: selected,
    });
    await svc.createTeam(createRequest, (progress) => progressEvents.push(progress));

    // Phase 1: provisioning progress reaches `ready`.
    await waitUntil(async () => {
      const latestState = progressEvents.at(-1)?.state;
      if (latestState === 'failed') {
        await failWithDiagnostics();
      }
      return latestState === 'ready';
    }, 420_000);

    // Phase 2: every expected teammate is online with a confirmed bootstrap.
    const expectedMembers = buildExpectedMemberNames(memberCount);
    await waitUntil(async () => {
      const statuses = await svc.getMemberSpawnStatuses(teamName);
      if (statuses.teamLaunchState === 'partial_failure') {
        await failWithDiagnostics();
      }
      for (const memberName of expectedMembers) {
        const entry = statuses.statuses[memberName];
        const confirmed =
          entry?.status === 'online' &&
          entry.launchState === 'confirmed_alive' &&
          entry.bootstrapConfirmed === true;
        if (!confirmed) {
          return false;
        }
      }
      return true;
    }, 240_000);

    // Phase 3: the runtime snapshot reports every expected teammate as alive.
    await waitUntil(async () => {
      const snapshot = await svc.getTeamAgentRuntimeSnapshot(teamName);
      return expectedMembers.every((memberName) => snapshot.members[memberName]?.alive === true);
    }, 180_000);
    process.stderr.write(`[ProviderLaunchStress.live] ${scenario} confirmed all teammates\n`);

    await runRestartStressChecks(active, expectedMembers, progressEvents);
    await runPostLaunchWorkProofCheck(active, expectedMembers, progressEvents);
  } catch (error) {
    active.failed = true;
    throw error;
  }

  // Only reached when nothing above threw, i.e. the scenario did not fail.
  await cleanupActiveScenario(active, { preserveFiles: false });
  const position = activeScenarios.indexOf(active);
  if (position >= 0) {
    activeScenarios.splice(position, 1);
  }
}
|
||||
|
||||
/**
 * Restarts each selected teammate in turn and waits for it to come back.
 *
 * For every target the restart must be confirmed (online, confirmed alive,
 * bootstrap confirmed) and the runtime snapshot must report the member alive.
 * Afterwards all expected teammates must still be confirmed.
 *
 * @throws Error prefixed with `Restart stress failed for <scenario>/<member>`
 *   when restarting or re-confirming a single member fails.
 */
async function runRestartStressChecks(
  active: ActiveScenario,
  expectedMembers: string[],
  progressEvents: TeamProvisioningProgress[]
): Promise<void> {
  type SpawnStatuses = Awaited<ReturnType<TeamProvisioningService['getMemberSpawnStatuses']>>;

  const { svc, teamName, scenario } = active;
  const describeFailure = () => formatStressDiagnostics(svc, teamName, progressEvents);
  const isConfirmed = (statuses: SpawnStatuses, memberName: string): boolean => {
    const entry = statuses.statuses[memberName];
    return (
      entry?.status === 'online' &&
      entry.launchState === 'confirmed_alive' &&
      entry.bootstrapConfirmed === true
    );
  };

  const targets = resolveRestartStressTargets(scenario, expectedMembers);
  for (const memberName of targets) {
    process.stderr.write(`[ProviderLaunchStress.live] restarting ${scenario}/${memberName}\n`);
    try {
      await svc.restartMember(teamName, memberName);

      await waitForStressCondition(
        `restart ${teamName}/${memberName}`,
        async () => {
          const statuses = await svc.getMemberSpawnStatuses(teamName);
          const entry = statuses.statuses[memberName];
          // A hard failure aborts the wait immediately instead of timing out.
          if (entry?.status === 'error' || entry?.launchState === 'failed_to_start') {
            throw new Error(
              `restart ${memberName} failed: ${entry.hardFailureReason ?? entry.error ?? 'unknown'}`
            );
          }
          return isConfirmed(statuses, memberName);
        },
        RESTART_CONFIRM_TIMEOUT_MS,
        2_000,
        describeFailure
      );

      await waitForStressCondition(
        `runtime alive after restart ${teamName}/${memberName}`,
        async () => {
          const snapshot = await svc.getTeamAgentRuntimeSnapshot(teamName);
          return snapshot.members[memberName]?.alive === true;
        },
        120_000,
        2_000,
        describeFailure
      );
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Restart stress failed for ${scenario}/${memberName}: ${reason}`);
    }
  }

  // Restarting some members must not have knocked any other teammate offline.
  await waitForStressCondition(
    `all teammates still confirmed after restarts ${teamName}`,
    async () => {
      const statuses = await svc.getMemberSpawnStatuses(teamName);
      return expectedMembers.every((memberName) => isConfirmed(statuses, memberName));
    },
    120_000,
    2_000,
    describeFailure
  );
  process.stderr.write(`[ProviderLaunchStress.live] ${scenario} restart checks passed\n`);
}
|
||||
|
||||
/**
 * Proves a launched teammate can still do work: creates a task owned by the
 * chosen member, relays the member's inbox, and waits until the task either
 * carries the `<marker>:done` comment or is marked completed.
 *
 * @throws Error when the relay result is not accepted.
 */
async function runPostLaunchWorkProofCheck(
  active: ActiveScenario,
  expectedMembers: string[],
  progressEvents: TeamProvisioningProgress[]
): Promise<void> {
  const memberName = resolvePostLaunchWorkTarget(active.scenario, expectedMembers);
  const marker = `provider-stress-${active.scenario}-${Date.now()}`;
  const teamDataService = new TeamDataService();
  const taskReader = new TeamTaskReader();
  const completionToken = `${marker}:done`;

  process.stderr.write(
    `[ProviderLaunchStress.live] sending post-launch work probe to ${active.scenario}/${memberName}\n`
  );
  const promptLines = [
    `This is a live provider launch stress validation. Marker: ${marker}.`,
    'Do not edit files.',
    'Add one task comment containing exactly:',
    completionToken,
    'Then mark this task complete.',
    'After that stop. Do not send a separate user-visible chat reply.',
  ];
  const task = await teamDataService.createTask(active.teamName, {
    subject: `Provider launch stress proof ${marker}`,
    owner: memberName,
    startImmediately: true,
    prompt: promptLines.join('\n'),
  });

  const relay = await active.svc.relayInboxFileToLiveRecipient(active.teamName, memberName);
  if (!isAcceptedStressRelayResult(relay)) {
    throw new Error(
      `Post-launch work probe was not relayed to ${memberName}; relay result: ${JSON.stringify(relay)}`
    );
  }

  await waitForStressCondition(
    `post-launch work proof ${active.teamName}/${memberName}/${task.id}`,
    async () => {
      const tasks = await taskReader.getTasks(active.teamName);
      const current = tasks.find((candidate) => candidate.id === task.id);
      if (!current) {
        return false;
      }
      // Either signal counts as proof: task completed, or the marker comment exists.
      if (current.status === 'completed') {
        return true;
      }
      return current.comments?.some((comment) => comment.text.includes(completionToken)) ?? false;
    },
    POST_LAUNCH_WORK_TIMEOUT_MS,
    2_000,
    () => formatStressDiagnostics(active.svc, active.teamName, progressEvents)
  );
  process.stderr.write(`[ProviderLaunchStress.live] ${active.scenario} post-launch work passed\n`);
}
|
||||
|
||||
/**
 * Decides whether a relay result counts as "the probe reached the teammate".
 *
 * Accepted when the result kind is `native_member_noop`, when at least one
 * message was relayed, or when the last delivery is flagged accepted,
 * delivered, or response-pending.
 *
 * @param relay - Result of `relayInboxFileToLiveRecipient`.
 * @returns `true` when any of the acceptance signals is present.
 */
function isAcceptedStressRelayResult(
  relay: Awaited<ReturnType<TeamProvisioningService['relayInboxFileToLiveRecipient']>>
): boolean {
  if (relay.kind === 'native_member_noop' || relay.relayed > 0) {
    return true;
  }
  const delivery = relay.lastDelivery;
  if (!delivery) {
    return false;
  }
  if (delivery.accepted === true) return true;
  if (delivery.delivered === true) return true;
  return delivery.responsePending === true;
}
|
||||
|
||||
/**
 * Chooses which members the restart stress check should restart.
 *
 * - No members, or the `opencode` scenario: nothing is restarted.
 * - Any other non-mixed scenario: the second member, or the first if there is only one.
 * - `mixed`: the first member of each of anthropic, codex and opencode, in that order.
 *
 * @param scenario - Scenario being exercised.
 * @param expectedMembers - Member names in launch order.
 * @returns Names of the members to restart.
 */
function resolveRestartStressTargets(
  scenario: ProviderLaunchStressScenario,
  expectedMembers: string[]
): string[] {
  // Pure OpenCode launch can finish without a tracked lead run. Per-member
  // restart for OpenCode is covered by the mixed secondary-lane scenario,
  // where the app owns the live run and can reattach the OpenCode lane.
  if (expectedMembers.length === 0 || scenario === 'opencode') {
    return [];
  }

  if (scenario !== 'mixed') {
    const [first, second] = expectedMembers;
    return [second ?? first];
  }

  const wantedProviders: TeamProviderId[] = ['anthropic', 'codex', 'opencode'];
  return wantedProviders.flatMap((providerId) => {
    const firstMatch = expectedMembers.find(
      (_memberName, memberIndex) => resolveStressMemberProvider('mixed', memberIndex) === providerId
    );
    return firstMatch === undefined ? [] : [firstMatch];
  });
}
|
||||
|
||||
/**
 * Picks the member that receives the post-launch work probe.
 *
 * In the `mixed` scenario the first OpenCode member is preferred. Otherwise
 * (or when no OpenCode member exists) the second member is used, then the
 * first, then the literal fallback name `alice`.
 *
 * @param scenario - Scenario being exercised.
 * @param expectedMembers - Member names in launch order.
 * @returns Name of the member to probe.
 */
function resolvePostLaunchWorkTarget(
  scenario: ProviderLaunchStressScenario,
  expectedMembers: string[]
): string {
  if (scenario === 'mixed') {
    const openCodeMember = expectedMembers.find(
      (_memberName, memberIndex) => resolveStressMemberProvider('mixed', memberIndex) === 'opencode'
    );
    if (openCodeMember !== undefined) {
      return openCodeMember;
    }
  }

  const [first, second] = expectedMembers;
  return second ?? first ?? 'alice';
}
|
||||
|
||||
/**
 * Polls `predicate` until it resolves to `true` or the timeout elapses.
 *
 * A predicate that throws is treated as "not ready yet": the error is
 * remembered and polling continues. Previously the first exception aborted
 * the loop and was reported as a timeout even though no time had elapsed.
 * Only the error from the most recent poll is reported; a later successful
 * (but false) poll clears it.
 *
 * @param label - Human-readable description of the awaited condition, used in the error.
 * @param predicate - Async check; resolving to `true` ends the wait.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 * @param pollMs - Delay between polls, in milliseconds.
 * @param diagnostics - Produces extra context appended to the timeout error.
 * @throws Error when the condition is not met before the deadline.
 */
async function waitForStressCondition(
  label: string,
  predicate: () => Promise<boolean>,
  timeoutMs: number,
  pollMs: number,
  diagnostics: () => Promise<string>
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  let lastError: unknown;
  while (Date.now() < deadline) {
    try {
      if (await predicate()) return;
      lastError = undefined;
    } catch (error) {
      // Transient failure: remember it for the timeout message and keep polling.
      lastError = error;
    }
    await new Promise((resolve) => setTimeout(resolve, pollMs));
  }

  const suffix =
    lastError !== undefined
      ? `\nLast error: ${lastError instanceof Error ? lastError.message : String(lastError)}`
      : '';
  // A failing diagnostics collector must not hide the timeout itself.
  const details = await diagnostics().catch(
    (error: unknown) =>
      `<diagnostics unavailable: ${error instanceof Error ? error.message : String(error)}>`
  );
  throw new Error(`Timed out waiting for ${label} after ${timeoutMs}ms${suffix}\n${details}`);
}
|
||||
|
||||
/**
 * Removes the known login-shell-env timeout warning from the recorded
 * `console.warn` mock calls.
 *
 * The recorded call list is edited in place, walking from the last call to the
 * first so removals do not shift entries that are still to be inspected.
 * Does nothing when `console.warn` carries no `mock` state.
 */
function discardKnownProviderLaunchStressWarnings(): void {
  const warnSpy = vi.mocked(console.warn);
  if (!warnSpy.mock) return;

  const recordedCalls = warnSpy.mock.calls;
  const knownNoise = 'Failed to resolve login shell env: shell env resolve timeout';
  let cursor = recordedCalls.length;
  while (cursor > 0) {
    cursor -= 1;
    const rendered = recordedCalls[cursor]?.map((value) => String(value)).join(' ') ?? '';
    if (rendered.includes(knownNoise)) {
      recordedCalls.splice(cursor, 1);
    }
  }
}
|
||||
|
||||
/**
 * Builds the team creation request for one stress scenario.
 *
 * The lead provider is the scenario itself, except for `mixed`, which uses
 * `anthropic`. Codex-only fields (`providerBackendId`, `effort`, `fastMode`)
 * are set only when the lead provider is `codex`.
 *
 * @param input - Scenario, team name, member count and the resolved model selection.
 * @returns The request passed to team creation.
 */
function buildStressCreateRequest(input: {
  scenario: ProviderLaunchStressScenario;
  teamName: string;
  memberCount: number;
  selection: ReturnType<typeof resolveScenarioSelection>;
}): TeamCreateRequest {
  const { scenario, teamName, memberCount, selection } = input;
  const members = buildStressMembers(scenario, memberCount, selection);
  const providerId: TeamProviderId = scenario === 'mixed' ? 'anthropic' : scenario;
  const leadIsCodex = providerId === 'codex';

  let model: string;
  if (leadIsCodex) {
    model = selection.codexModel;
  } else if (providerId === 'opencode') {
    model = selection.openCodeModel;
  } else {
    model = selection.anthropicModel;
  }

  return {
    teamName,
    cwd: projectPathForStress(),
    providerId,
    providerBackendId: leadIsCodex ? 'codex-native' : undefined,
    model,
    effort: leadIsCodex ? selection.codexEffort : undefined,
    fastMode: leadIsCodex ? 'off' : undefined,
    skipPermissions: true,
    prompt: 'Keep the team idle after bootstrap. Do not start extra work.',
    members,
  };
}
|
||||
|
||||
/**
 * Builds the member definitions for a stress team.
 *
 * Roles alternate Developer / Reviewer by position. Each member's provider
 * comes from `resolveStressMemberProvider`, and Codex-only fields are set only
 * for Codex members.
 *
 * @param scenario - Scenario being exercised.
 * @param memberCount - Number of members to create.
 * @param selection - Resolved model/effort selection per provider.
 * @returns One member definition per expected member name.
 */
function buildStressMembers(
  scenario: ProviderLaunchStressScenario,
  memberCount: number,
  selection: ReturnType<typeof resolveScenarioSelection>
): TeamMember[] {
  const toMember = (name: string, index: number): TeamMember => {
    const providerId = resolveStressMemberProvider(scenario, index);
    const memberIsCodex = providerId === 'codex';

    let model: string;
    if (memberIsCodex) {
      model = selection.codexModel;
    } else if (providerId === 'opencode') {
      model = selection.openCodeModel;
    } else {
      model = selection.anthropicModel;
    }

    return {
      name,
      role: index % 2 === 0 ? 'Developer' : 'Reviewer',
      providerId,
      providerBackendId: memberIsCodex ? 'codex-native' : undefined,
      model,
      effort: memberIsCodex ? selection.codexEffort : undefined,
      fastMode: memberIsCodex ? 'off' : undefined,
    };
  };

  return buildExpectedMemberNames(memberCount).map(toMember);
}
|
||||
|
||||
/**
 * Maps a member position to its provider.
 *
 * Non-mixed scenarios use the scenario's own provider for every member. The
 * `mixed` scenario cycles through a fixed five-entry rotation and falls back
 * to `anthropic` when the computed slot has no entry (e.g. a negative index).
 *
 * @param scenario - Scenario being exercised.
 * @param index - Zero-based member position.
 * @returns Provider id for that member.
 */
function resolveStressMemberProvider(
  scenario: ProviderLaunchStressScenario,
  index: number
): TeamProviderId {
  if (scenario === 'mixed') {
    const rotation: TeamProviderId[] = ['anthropic', 'codex', 'opencode', 'anthropic', 'codex'];
    const slot = index % rotation.length;
    return rotation[slot] ?? 'anthropic';
  }
  return scenario;
}
|
||||
|
||||
/**
 * Resolves the model and effort used per provider.
 *
 * Each value comes from its `PROVIDER_LAUNCH_STRESS_*` environment variable
 * (trimmed) and falls back to the matching default constant when the variable
 * is unset or blank. The `scenario` argument is not consulted by this body.
 *
 * @param scenario - Scenario being exercised (unused here).
 * @returns Model names for each provider plus the Codex effort level.
 */
function resolveScenarioSelection(scenario: ProviderLaunchStressScenario): {
  anthropicModel: string;
  codexModel: string;
  codexEffort: 'low' | 'medium' | 'high' | 'xhigh';
  openCodeModel: string;
} {
  // Blank and missing variables both collapse to '' so `||` picks the default.
  const trimmedEnv = (name: string): string => process.env[name]?.trim() ?? '';

  const anthropicModel =
    trimmedEnv('PROVIDER_LAUNCH_STRESS_ANTHROPIC_MODEL') || DEFAULT_ANTHROPIC_MODEL;
  const codexModel = trimmedEnv('PROVIDER_LAUNCH_STRESS_CODEX_MODEL') || DEFAULT_CODEX_MODEL;
  const codexEffort = (trimmedEnv('PROVIDER_LAUNCH_STRESS_CODEX_EFFORT') ||
    DEFAULT_CODEX_EFFORT) as 'low' | 'medium' | 'high' | 'xhigh';
  const openCodeModel =
    trimmedEnv('PROVIDER_LAUNCH_STRESS_OPENCODE_MODEL') || DEFAULT_OPENCODE_MODEL;

  return { anthropicModel, codexModel, codexEffort, openCodeModel };
}
|
||||
|
||||
/**
 * Reads the requested member count from `PROVIDER_LAUNCH_STRESS_MEMBER_COUNT`.
 *
 * A positive integer is capped at the number of available member names.
 * A value that does not parse to a positive integer yields 5.
 *
 * @returns Number of members to launch.
 */
function getStressMemberCount(): number {
  const raw = process.env.PROVIDER_LAUNCH_STRESS_MEMBER_COUNT ?? '5';
  const requested = Number.parseInt(raw, 10);
  if (!Number.isFinite(requested) || requested <= 0) {
    return 5;
  }
  return Math.min(requested, MEMBER_NAMES.length);
}
|
||||
|
||||
/**
 * Returns the first `memberCount` entries of the member name list.
 *
 * @param memberCount - How many names to take from the start of the list.
 * @returns A new array holding the selected names.
 */
function buildExpectedMemberNames(memberCount: number): string[] {
  const selectedNames = MEMBER_NAMES.slice(0, memberCount);
  return selectedNames;
}
|
||||
|
||||
/**
 * Resolves the order in which scenarios run.
 *
 * `PROVIDER_LAUNCH_STRESS_ORDER` is read as a comma-separated list; entries are
 * trimmed and anything that is not a known scenario is dropped. The default
 * order is used when the variable is unset, blank, or holds no known scenario.
 *
 * @returns Scenarios in execution order.
 */
function getStressOrder(): ProviderLaunchStressScenario[] {
  const raw = process.env.PROVIDER_LAUNCH_STRESS_ORDER?.trim();
  if (!raw) return DEFAULT_ORDER;

  const knownScenarios: readonly string[] = ['anthropic', 'codex', 'opencode', 'mixed'];
  const requestedOrder: ProviderLaunchStressScenario[] = [];
  for (const piece of raw.split(',')) {
    const candidate = piece.trim();
    if (knownScenarios.includes(candidate)) {
      requestedOrder.push(candidate as ProviderLaunchStressScenario);
    }
  }

  if (requestedOrder.length === 0) return DEFAULT_ORDER;
  return requestedOrder;
}
|
||||
|
||||
/**
 * Returns the project directory used as the team's working directory.
 *
 * `PROVIDER_LAUNCH_STRESS_PROJECT_PATH` wins when set (resolved to an absolute
 * path); otherwise the module-level `currentStressProjectPath` is used.
 *
 * @returns Absolute or previously prepared project path.
 * @throws Error when neither source provides a path.
 */
function projectPathForStress(): string {
  const override = process.env.PROVIDER_LAUNCH_STRESS_PROJECT_PATH?.trim();
  if (override) {
    return path.resolve(override);
  }
  if (currentStressProjectPath) {
    return currentStressProjectPath;
  }
  throw new Error('Provider launch stress project path requested before test setup');
}
|
||||
|
||||
/**
 * Tears down everything a scenario started.
 *
 * Order: snapshot the runtime, stop the team, wait for OpenCode lanes (only
 * when a harness exists), terminate process backends seen before the stop,
 * snapshot again and terminate what that shows, dispose the harness, run the
 * Codex cleanup, and finally delete the team and task directories unless
 * `preserveFiles` is set. Snapshot, stop, wait, dispose and Codex-cleanup
 * failures are swallowed; directory removal failures are not.
 *
 * @param active - Scenario to clean up.
 * @param options - `preserveFiles` keeps the team/task directories on disk.
 */
async function cleanupActiveScenario(
  active: ActiveScenario,
  options: { preserveFiles: boolean }
): Promise<void> {
  const ignoreFailure = (): undefined => undefined;
  const snapshotOrNull = () =>
    active.svc.getTeamAgentRuntimeSnapshot(active.teamName).catch(() => null);

  const beforeStopSnapshot = await snapshotOrNull();
  await active.svc.stopTeam(active.teamName).catch(ignoreFailure);
  if (active.harness) {
    await waitForOpenCodeLanesStopped(active.teamName, 90_000).catch(ignoreFailure);
  }
  await terminateProcessBackends(beforeStopSnapshot);

  // Second pass covers runtimes that only show up after the stop.
  const afterStopSnapshot = await snapshotOrNull();
  await terminateProcessBackends(afterStopSnapshot);

  await active.harness?.dispose().catch(ignoreFailure);
  await active.codexCleanup?.().catch(ignoreFailure);

  if (options.preserveFiles) {
    return;
  }
  const removeTree = (directory: string) => fs.rm(directory, { recursive: true, force: true });
  await removeTree(path.join(getTeamsBasePath(), active.teamName));
  await removeTree(path.join(getTasksBasePath(), active.teamName));
}
|
||||
|
||||
/**
 * Kills the process-backed, non-OpenCode runtimes listed in a snapshot.
 *
 * Members whose backend is not `process`, or whose provider is `opencode`, are
 * skipped. Each remaining pid (`runtimePid`, else `pid`) is probed with signal
 * 0 and then passed to `killProcessByPid`; any error is ignored.
 *
 * @param snapshot - Runtime snapshot, or `null` when none could be read.
 */
async function terminateProcessBackends(snapshot: TeamAgentRuntimeSnapshot | null): Promise<void> {
  const targetPids = new Set<number>();
  Object.values(snapshot?.members ?? {}).forEach((member) => {
    const killable = member.backendType === 'process' && member.providerId !== 'opencode';
    if (!killable) return;
    const candidate = member.runtimePid ?? member.pid;
    if (typeof candidate !== 'number' || !Number.isFinite(candidate) || candidate <= 0) return;
    targetPids.add(candidate);
  });

  targetPids.forEach((pid) => {
    try {
      // Signal 0 throws when the pid cannot be signalled, which skips the kill.
      process.kill(pid, 0);
      killProcessByPid(pid);
    } catch {
      // Best-effort cleanup; the runtime may already be gone.
    }
  });
}
|
||||
|
||||
/**
 * Creates a Codex account feature and registers it on the
 * `ProviderConnectionService` singleton.
 *
 * The feature is built with a silent logger and a config that prefers the
 * `chatgpt` auth mode for Codex.
 *
 * @returns Cleanup callback that unregisters the feature and disposes it,
 *   ignoring dispose failures.
 */
async function installCodexAccountFeature(): Promise<() => Promise<void>> {
  const featureModule = import(
    '../../../../src/features/codex-account/main/composition/createCodexAccountFeature'
  );
  const connectionModule = import('../../../../src/main/services/runtime/ProviderConnectionService');
  const [{ createCodexAccountFeature }, { ProviderConnectionService }] = await Promise.all([
    featureModule,
    connectionModule,
  ]);

  const silent = (): undefined => undefined;
  const feature = createCodexAccountFeature({
    logger: { info: silent, warn: silent, error: silent },
    configManager: {
      getConfig: () => ({
        providerConnections: {
          codex: { preferredAuthMode: 'chatgpt' as const },
        },
      }),
    },
  });

  const providerConnectionService = ProviderConnectionService.getInstance();
  providerConnectionService.setCodexAccountFeature(feature);

  const uninstall = async (): Promise<void> => {
    providerConnectionService.setCodexAccountFeature(null);
    await feature.dispose().catch(() => undefined);
  };
  return uninstall;
}
|
||||
|
||||
/**
 * Collects a redacted JSON dump used in stress-test failure messages.
 *
 * Gathers member spawn statuses, the runtime snapshot and the latest launch
 * failure artifact manifest in parallel. A failing status or snapshot lookup
 * is replaced by `{ error: "<text>" }`. The result is pretty-printed and
 * passed through `redactSecrets`.
 *
 * @param svc - Provisioning service to query.
 * @param teamName - Team whose state is dumped.
 * @param progressEvents - Progress events recorded so far.
 * @returns Redacted, pretty-printed JSON text.
 */
async function formatStressDiagnostics(
  svc: TeamProvisioningService,
  teamName: string,
  progressEvents: TeamProvisioningProgress[]
): Promise<string> {
  const asErrorRecord = (error: unknown) => ({ error: String(error) });

  const [spawnStatuses, runtimeSnapshot, artifact] = await Promise.all([
    svc.getMemberSpawnStatuses(teamName).catch(asErrorRecord),
    svc.getTeamAgentRuntimeSnapshot(teamName).catch(asErrorRecord),
    readLatestArtifactManifest(teamName),
  ]);

  // Only the fields useful for triage are kept from each progress event.
  const progress = progressEvents.map(
    ({ state, message, messageSeverity, error, launchDiagnostics }) => ({
      state,
      message,
      messageSeverity,
      error,
      launchDiagnostics,
    })
  );

  const report = { progress, spawnStatuses, runtimeSnapshot, artifact };
  return redactSecrets(JSON.stringify(report, null, 2));
}
|
||||
|
||||
/**
 * Loads the newest launch failure artifact manifest for a team.
 *
 * Reads `launch-failure-artifacts/latest.json` under the team directory. When
 * it names a string `manifestPath`, that manifest is parsed and returned;
 * otherwise the parsed `latest.json` itself is returned.
 *
 * @param teamName - Team whose artifacts are inspected.
 * @returns The parsed manifest, the parsed pointer file, or `null` on any read/parse failure.
 */
async function readLatestArtifactManifest(teamName: string): Promise<unknown> {
  const pointerPath = path.join(
    getTeamsBasePath(),
    teamName,
    'launch-failure-artifacts',
    'latest.json'
  );
  try {
    const pointerText = await fs.readFile(pointerPath, 'utf8');
    const latest = JSON.parse(pointerText) as { manifestPath?: unknown };
    if (typeof latest.manifestPath === 'string') {
      const manifestText = await fs.readFile(latest.manifestPath, 'utf8');
      return JSON.parse(manifestText);
    }
    return latest;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Reports whether Anthropic credentials are available: either subscription
 * auth is selected, or `ANTHROPIC_API_KEY` holds a non-blank value.
 *
 * @returns `true` when one of the two auth sources is configured.
 */
function hasAnthropicAuthConfigured(): boolean {
  if (usingAnthropicSubscriptionAuth()) {
    return true;
  }
  const apiKey = process.env.ANTHROPIC_API_KEY?.trim();
  return Boolean(apiKey);
}
|
||||
|
||||
/**
 * Reports whether `PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH` selects subscription
 * auth. The value is trimmed and compared case-insensitively against
 * `subscription` and `oauth`.
 *
 * @returns `true` for either of the two accepted modes.
 */
function usingAnthropicSubscriptionAuth(): boolean {
  const requestedMode = process.env.PROVIDER_LAUNCH_STRESS_ANTHROPIC_AUTH?.trim().toLowerCase();
  switch (requestedMode) {
    case 'subscription':
    case 'oauth':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Rejects unless `filePath` passes an execute-permission access check.
 *
 * @param filePath - Path to check.
 * @throws The error from `fs.access` when the check fails.
 */
async function assertExecutable(filePath: string): Promise<void> {
  const executePermission = fsConstants.X_OK;
  await fs.access(filePath, executePermission);
}
|
||||
|
||||
/**
 * Verifies that a Codex home directory holds subscription credentials.
 *
 * Checks, in order: the legacy `auth.json`; then, under `accounts/`, the
 * files for the active account named in `registry.json` (several key
 * spellings are tried); then every `*.auth.json` entry. The first file that
 * satisfies `isCodexChatGptSubscriptionAuth` ends the search.
 *
 * @param codexHome - Codex home directory to inspect.
 * @throws Error when no candidate qualifies, or when a readable legacy
 *   `auth.json` cannot be parsed as a JSON object.
 */
async function assertCodexSubscriptionAuthAvailable(codexHome: string): Promise<void> {
  const legacyAuthPath = path.join(codexHome, 'auth.json');
  const legacyReadable = await pathReadable(legacyAuthPath);
  if (legacyReadable && isCodexChatGptSubscriptionAuth(await readJsonObject(legacyAuthPath))) {
    return;
  }

  const accountsDir = path.join(codexHome, 'accounts');
  const registry = await readJsonObject(path.join(accountsDir, 'registry.json')).catch(() => null);

  // First registry key that carries a non-blank string wins.
  const activeIdKeys = [
    'active_account_id',
    'activeAccountId',
    'current_account_id',
    'currentAccountId',
  ];
  let activeAccountId: string | null = null;
  for (const key of activeIdKeys) {
    activeAccountId = readStringProperty(registry, key);
    if (activeAccountId !== null) break;
  }

  // Insertion order is the probe order: active-account files first.
  const candidates = new Set<string>(
    activeAccountId
      ? [
          path.join(accountsDir, `${activeAccountId}.auth.json`),
          path.join(accountsDir, activeAccountId),
        ]
      : []
  );
  const entries: string[] = await fs.readdir(accountsDir).catch(() => []);
  for (const entry of entries) {
    if (!entry.endsWith('.auth.json')) continue;
    candidates.add(path.join(accountsDir, entry));
  }

  for (const candidate of candidates) {
    const auth = await readJsonObject(candidate).catch(() => null);
    if (isCodexChatGptSubscriptionAuth(auth)) {
      return;
    }
  }
  throw new Error(`Codex subscription auth not found in ${codexHome}`);
}
|
||||
|
||||
/**
 * Reports whether `filePath` passes a read-permission access check.
 *
 * @param filePath - Path to check.
 * @returns `true` when the check succeeds, `false` when it fails for any reason.
 */
async function pathReadable(filePath: string): Promise<boolean> {
  return fs.access(filePath, fsConstants.R_OK).then(
    () => true,
    () => false
  );
}
|
||||
|
||||
/**
 * Reads a UTF-8 file and parses it as a JSON object.
 *
 * @param filePath - File to read.
 * @returns The parsed object.
 * @throws Error when the parsed value is falsy, an array, or not an object;
 *   read and JSON syntax errors propagate unchanged.
 */
async function readJsonObject(filePath: string): Promise<Record<string, unknown>> {
  const text = await fs.readFile(filePath, 'utf8');
  const parsed = JSON.parse(text);
  const isPlainObject = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
  if (isPlainObject) {
    return parsed as Record<string, unknown>;
  }
  throw new Error(`Expected JSON object in ${filePath}`);
}
|
||||
|
||||
/**
 * Reads a property as a trimmed, non-empty string.
 *
 * @param source - Object to read from; `null` yields `null`.
 * @param key - Property name.
 * @returns The trimmed string, or `null` when the value is not a string or is blank.
 */
function readStringProperty(source: Record<string, unknown> | null, key: string): string | null {
  const raw = source?.[key];
  if (typeof raw !== 'string') {
    return null;
  }
  const trimmed = raw.trim();
  return trimmed.length > 0 ? trimmed : null;
}
|
||||
|
||||
/**
 * Reports whether an auth object carries a refresh token, either at the top
 * level (`refresh_token`) or nested under a plain-object `tokens` property.
 *
 * @param source - Parsed auth file contents, or `null`.
 * @returns `true` when a non-blank refresh token is found in either place.
 */
function isCodexChatGptSubscriptionAuth(source: Record<string, unknown> | null): boolean {
  if (!source) {
    return false;
  }
  if (readStringProperty(source, 'refresh_token')) {
    return true;
  }

  const tokens = source.tokens;
  const tokensIsPlainObject =
    Boolean(tokens) && typeof tokens === 'object' && !Array.isArray(tokens);
  if (!tokensIsPlainObject) {
    return false;
  }
  return Boolean(readStringProperty(tokens as Record<string, unknown>, 'refresh_token'));
}
|
||||
|
||||
/**
 * Resolves the Codex home directory to use for the stress run.
 *
 * Precedence: `PROVIDER_LAUNCH_STRESS_CODEX_HOME`, then the supplied previous
 * value, then `.codex` under the home directory reported by `os.userInfo()`.
 * Blank values are skipped; the first two sources are resolved to absolute paths.
 *
 * @param previousCodexHome - Codex home that was in effect before the test, if any.
 * @returns Path of the Codex home directory.
 */
function resolveConnectedCodexHome(previousCodexHome: string | undefined): string {
  const preferredSources = [process.env.PROVIDER_LAUNCH_STRESS_CODEX_HOME, previousCodexHome];
  for (const source of preferredSources) {
    const trimmed = source?.trim();
    if (trimmed) {
      return path.resolve(trimmed);
    }
  }
  return path.join(os.userInfo().homedir, '.codex');
}
|
||||
|
||||
/**
 * Writes a fresh `.claude.json` that marks one project as trusted.
 *
 * The project key is the real path of `projectPath`, normalized and with
 * backslashes turned into forward slashes. When `ANTHROPIC_API_KEY` is set,
 * its last 20 characters are listed under `customApiKeyResponses.approved`.
 * Any existing file is overwritten.
 *
 * @param configDir - Directory that receives `.claude.json`.
 * @param projectPath - Project directory to trust; must exist.
 */
async function writeTrustedClaudeConfig(configDir: string, projectPath: string): Promise<void> {
  const resolvedProject = await fs.realpath(projectPath);
  const projectKey = path.normalize(resolvedProject).replace(/\\/g, '/');
  const approvedApiKeySuffix = process.env.ANTHROPIC_API_KEY?.trim().slice(-20);

  const config: {
    projects: Record<string, { hasTrustDialogAccepted: true }>;
    customApiKeyResponses?: { approved: string[]; rejected: string[] };
  } = {
    projects: { [projectKey]: { hasTrustDialogAccepted: true } },
    ...(approvedApiKeySuffix
      ? { customApiKeyResponses: { approved: [approvedApiKeySuffix], rejected: [] } }
      : {}),
  };

  const serialized = `${JSON.stringify(config, null, 2)}\n`;
  await fs.writeFile(path.join(configDir, '.claude.json'), serialized, 'utf8');
}
|
||||
|
||||
/**
 * Marks a project as trusted inside an existing `.claude.json`, keeping all
 * other content.
 *
 * A missing file is treated as an empty config. Existing project entries and
 * existing fields of the target entry are preserved; only
 * `hasTrustDialogAccepted` is forced to `true`.
 *
 * @param configDir - Directory that holds `.claude.json`.
 * @param projectPath - Project directory to trust; must exist.
 * @returns The previous file text, or `null` when the file did not exist.
 * @throws Read errors other than `ENOENT`, and JSON parse errors.
 */
async function upsertTrustedClaudeProjectConfig(
  configDir: string,
  projectPath: string
): Promise<string | null> {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    Boolean(value) && typeof value === 'object' && !Array.isArray(value);

  const configPath = path.join(configDir, '.claude.json');
  let previous: string | null;
  try {
    previous = await fs.readFile(configPath, 'utf8');
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code !== 'ENOENT') throw error;
    previous = null;
  }

  // An empty file is handled like a missing one: start from an empty config.
  const existing: Record<string, unknown> = previous
    ? (JSON.parse(previous) as Record<string, unknown>)
    : {};
  const projectKey = path.normalize(await fs.realpath(projectPath)).replace(/\\/g, '/');

  const projects: Record<string, unknown> = isPlainObject(existing.projects)
    ? { ...existing.projects }
    : {};
  const existingEntry = projects[projectKey];
  const currentEntry = isPlainObject(existingEntry) ? existingEntry : {};
  projects[projectKey] = { ...currentEntry, hasTrustDialogAccepted: true };

  const serialized = `${JSON.stringify({ ...existing, projects }, null, 2)}\n`;
  await fs.writeFile(configPath, serialized, 'utf8');
  return previous;
}
|
||||
|
||||
/**
 * Puts `.claude.json` back to a previously captured state.
 *
 * @param configDir - Directory that holds `.claude.json`.
 * @param previous - Earlier file text to write back, or `null` to remove the file.
 */
async function restoreClaudeJsonConfig(configDir: string, previous: string | null): Promise<void> {
  const configPath = path.join(configDir, '.claude.json');
  if (previous !== null) {
    await fs.writeFile(configPath, previous, 'utf8');
    return;
  }
  // `force` keeps removal quiet when the file is already absent.
  await fs.rm(configPath, { force: true });
}
|
||||
|
||||
/**
 * Restores one environment variable to an earlier value.
 *
 * @param name - Variable name.
 * @param previous - Value to restore; `undefined` removes the variable.
 */
function restoreEnv(name: string, previous: string | undefined): void {
  if (previous !== undefined) {
    process.env[name] = previous;
    return;
  }
  delete process.env[name];
}
|
||||
|
||||
/**
 * Masks API-key-shaped substrings in diagnostic text.
 *
 * Anthropic `sk-ant-api03-…` keys are replaced first, then any remaining
 * `sk-` / `ak-` token with at least 20 key characters.
 *
 * @param text - Text that may contain secrets.
 * @returns The text with matching keys replaced by placeholders.
 */
function redactSecrets(text: string): string {
  const anthropicKeyPattern = /sk-ant-api03-[A-Za-z0-9_-]+/g;
  const genericKeyPattern = /\b(?:sk|ak)-[A-Za-z0-9_-]{20,}\b/g;
  const withoutAnthropicKeys = text.replace(anthropicKeyPattern, '<redacted-anthropic-key>');
  return withoutAnthropicKeys.replace(genericKeyPattern, '<redacted-api-key>');
}
|
||||
197
test/main/services/team/TeamLaunchFailureArtifactPack.test.ts
Normal file
197
test/main/services/team/TeamLaunchFailureArtifactPack.test.ts
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
import * as fs from 'fs/promises';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
|
||||
import {
|
||||
classifyLaunchFailureArtifact,
|
||||
extractLaunchBootstrapTransportBreadcrumb,
|
||||
redactLaunchFailureArtifactText,
|
||||
writeTeamLaunchFailureArtifactPack,
|
||||
} from '../../../../src/main/services/team/TeamLaunchFailureArtifactPack';
|
||||
import {
|
||||
getTeamsBasePath,
|
||||
setClaudeBasePathOverride,
|
||||
} from '../../../../src/main/utils/pathDecoder';
|
||||
|
||||
// Covers writing the artifact pack, text redaction, failure classification,
// and bootstrap transport breadcrumb extraction.
describe('TeamLaunchFailureArtifactPack', () => {
  // Per-test scratch directory; the Claude base path override points inside it.
  let tempRoot: string;

  beforeEach(async () => {
    const scratchPrefix = path.join(os.tmpdir(), 'team-launch-artifact-pack-');
    tempRoot = await fs.mkdtemp(scratchPrefix);
    setClaudeBasePathOverride(path.join(tempRoot, '.claude'));
  });

  afterEach(async () => {
    setClaudeBasePathOverride(null);
    await fs.rm(tempRoot, { recursive: true, force: true });
  });

  it('writes a bounded redacted launch failure artifact pack with known launch files', async () => {
    const teamName = 'artifact-team';
    const runId = 'run-secret-1';
    const launchStartedAt = '2026-05-09T00:00:00.000Z';
    const launchFailedAt = '2026-05-09T00:01:00.000Z';
    const teamDir = path.join(getTeamsBasePath(), teamName);
    const lockDir = path.join(teamDir, '.bootstrap.lock');
    await fs.mkdir(lockDir, { recursive: true });

    // Launch files seeded on disk before the pack is written, in write order.
    const seededFiles: Array<[string, string]> = [
      [
        path.join(teamDir, 'launch-state.json'),
        JSON.stringify({
          teamName,
          runId,
          secret: 'sk-ant-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
          token: 'abcdefghijklmnopqrstuvwxyz123456',
        }),
      ],
      [path.join(teamDir, 'launch-summary.json'), '{"summary":true}\n'],
      [path.join(teamDir, 'bootstrap-state.json'), '{"bootstrap":true}\n'],
      [path.join(teamDir, 'bootstrap-journal.jsonl'), '{"event":"started"}\n'],
      [path.join(lockDir, 'metadata.json'), '{"pid":123,"runId":"run-secret-1"}\n'],
    ];
    for (const [filePath, contents] of seededFiles) {
      await fs.writeFile(filePath, contents, 'utf8');
    }

    const result = await writeTeamLaunchFailureArtifactPack({
      teamName,
      runId,
      reason: 'launch_progress_failed',
      startedAt: launchStartedAt,
      cwd: '/repo',
      pid: 123,
      providerId: 'anthropic',
      model: 'claude-opus',
      expectedMembers: ['alice'],
      effectiveMembers: [{ name: 'alice', role: 'developer', provider: 'anthropic' } as never],
      progress: {
        runId,
        teamName,
        state: 'failed',
        message: 'Launch failed',
        startedAt: launchStartedAt,
        updatedAt: launchFailedAt,
        error:
          'Authentication failed: ANTHROPIC_API_KEY=sk-ant-bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
      },
      memberSpawnStatuses: {
        alice: {
          status: 'error',
          launchState: 'failed_to_start',
          hardFailureReason: 'bootstrap timeout',
          updatedAt: launchFailedAt,
        },
      },
      cliLogs: 'stderr OPENAI_API_KEY=sk-proj-cccccccccccccccccccccccccccccccccccccccc',
      progressTraceLines: ['[failed] launch failed'],
      runtimeAdapterTraceLines: ['runtime trace'],
    });

    const manifestText = await fs.readFile(result.manifestPath, 'utf8');
    const manifest = JSON.parse(manifestText) as {
      reason: string;
      artifactFiles: string[];
      classification: { code: string };
      bootstrapTransportBreadcrumb: { lastTransportStage: string | null };
      progress: { error: string };
    };
    expect(manifest.reason).toBe('launch_progress_failed');
    expect(manifest.classification.code).toBe('provider_auth');
    expect(manifest.artifactFiles).toContain('cli-logs-tail.txt');
    expect(manifest.artifactFiles).toContain('launch-state.json');
    expect(manifest.progress.error).toContain('[REDACTED]');

    const copiedLaunchStatePath = path.join(result.directory, 'launch-state.json');
    const copiedLaunchState = await fs.readFile(copiedLaunchStatePath, 'utf8');
    expect(copiedLaunchState).toContain('[REDACTED_ANTHROPIC_API_KEY]');
    // The copy must still parse as JSON after redaction.
    expect(() => JSON.parse(copiedLaunchState)).not.toThrow();
    expect(copiedLaunchState).toContain('"token":"[REDACTED]"');
    expect(copiedLaunchState).not.toContain('sk-ant-');

    const cliLogsPath = path.join(result.directory, 'cli-logs-tail.txt');
    const cliLogs = await fs.readFile(cliLogsPath, 'utf8');
    expect(cliLogs).toContain('OPENAI_API_KEY=[REDACTED]');
    expect(cliLogs).not.toContain('sk-proj-');

    const latestPath = path.join(teamDir, 'launch-failure-artifacts', 'latest.json');
    const latest = JSON.parse(await fs.readFile(latestPath, 'utf8')) as { manifestPath: string };
    expect(latest.manifestPath).toBe(result.manifestPath);
  });

  it('redacts common bearer and token-shaped secrets', () => {
    const rawText =
      'Authorization: Bearer abcdefghijklmnopqrstuvwxyz123456 token: abcdefghijklmnopqrstuvwxyz123456';
    const redacted = redactLaunchFailureArtifactText(rawText);
    expect(redacted).toContain('Authorization: Bearer [REDACTED]');
    expect(redacted).toContain('token: [REDACTED]');
  });

  it('classifies bootstrap transport rejection and extracts breadcrumb details', () => {
    const launchInput = {
      teamName: 'artifact-team',
      runId: 'run-transport',
      reason: 'launch_cleanup_unconfirmed_bootstrap',
      progressTraceLines: [
        'bob did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true',
        'Warning: no stdin data received in 3s, proceeding without it.',
      ],
    };

    const classification = classifyLaunchFailureArtifact(launchInput);
    expect(classification.code).toBe('transport_rejected');

    const breadcrumb = extractLaunchBootstrapTransportBreadcrumb(launchInput);
    expect(breadcrumb).toMatchObject({
      lastTransportStage:
        'bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true',
      submitRejected: true,
      retryable: true,
      noStdinWarning: true,
      bootstrapSubmitted: false,
    });
  });

  it('classifies provider quota separately from protocol errors', () => {
    const classification = classifyLaunchFailureArtifact({
      teamName: 'artifact-team',
      runId: 'run-quota',
      reason: 'OpenCode quota exhausted. This request requires more credits, or fewer max_tokens.',
    });
    expect(classification.code).toBe('provider_quota');
  });

  // Failure text paired with the classification code it must produce.
  const productionLikeFailures = [
    {
      name: 'stdin warning',
      text: 'Warning: no stdin data received in 3s, proceeding without it.',
      code: 'stdin_missing',
    },
    {
      name: 'provider auth',
      text: 'Codex API error. Token refresh failed: 401 Unauthorized',
      code: 'provider_auth',
    },
    {
      name: 'model bootstrap timeout',
      text: 'bob: Teammate was registered but did not bootstrap-confirm before timeout.',
      code: 'model_no_bootstrap',
    },
    {
      name: 'process stale pid',
      text: 'persisted runtime pid is not alive; persisted runtime pid was not found in process table',
      code: 'process_exited',
    },
    {
      name: 'opencode protocol',
      text: 'OpenCode API error. non_visible_tool_without_task_progress',
      code: 'opencode_protocol',
    },
  ];

  it.each(productionLikeFailures)(
    'classifies production-like failure string: $name',
    ({ text, code }) => {
      const classification = classifyLaunchFailureArtifact({
        teamName: 'artifact-team',
        runId: `run-${code}`,
        reason: text,
      });
      expect(classification.code).toBe(code);
    }
  );
});
|
||||
|
|
@ -615,6 +615,17 @@ function createClaudeLogsRun(overrides: Record<string, unknown> = {}) {
|
|||
} as any;
|
||||
}
|
||||
|
||||
/**
 * Waits until a path exists, checking every 25 ms.
 *
 * @param filePath - Path to wait for.
 * @param timeoutMs - Maximum time to wait, in milliseconds (default 2000).
 * @throws Error when the path does not appear before the deadline.
 */
async function waitForFile(filePath: string, timeoutMs = 2_000): Promise<void> {
  const giveUpAt = Date.now() + timeoutMs;
  for (let now = Date.now(); now < giveUpAt; now = Date.now()) {
    if (fs.existsSync(filePath)) {
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, 25));
  }
  throw new Error(`Timed out waiting for file: ${filePath}`);
}
|
||||
|
||||
describe('TeamProvisioningService', () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
|
|
@ -1137,6 +1148,102 @@ describe('TeamProvisioningService', () => {
|
|||
});
|
||||
});
|
||||
|
||||
// Registers a failed launch run on the service, triggers cleanupRun, and
// checks the artifact pack that appears under the team directory.
it('writes a launch failure artifact pack when cleanup finalizes failed launch state', async () => {
  allowConsoleLogs();
  const svc = new TeamProvisioningService();
  const teamName = 'launch-artifact-cleanup-team';
  const runId = 'run-launch-artifact-cleanup';
  const startedAt = '2026-05-09T00:25:00.000Z';
  const spawnAcceptedAt = '2026-05-09T00:25:05.000Z';
  const stdinWarning = 'Warning: no stdin data received in 3s, proceeding without it.';

  // Fresh object per use so the run's member list and the request never share a reference.
  const buildBob = () => ({
    name: 'bob',
    role: 'Developer',
    providerId: 'anthropic',
    model: 'opus',
  });

  const bobSpawnStatus = createMemberSpawnStatusEntry({
    status: 'spawning',
    launchState: 'runtime_pending_bootstrap',
    runtimeAlive: true,
    firstSpawnAcceptedAt: spawnAcceptedAt,
    updatedAt: spawnAcceptedAt,
  });

  const run = createClaudeLogsRun({
    runId,
    teamName,
    startedAt,
    isLaunch: true,
    provisioningComplete: false,
    cancelRequested: false,
    deterministicBootstrap: true,
    expectedMembers: ['bob'],
    allEffectiveMembers: [buildBob()],
    request: {
      cwd: '/repo',
      providerId: 'anthropic',
      model: 'opus',
      members: [buildBob()],
    },
    memberSpawnStatuses: new Map([['bob', bobSpawnStatus]]),
    progress: {
      runId,
      teamName,
      state: 'failed',
      message: 'Launch failed',
      startedAt,
      updatedAt: '2026-05-09T00:26:00.000Z',
      error: `Teammate process bob@signal-ops did not submit bootstrap prompt: timed out waiting for bootstrap_submitted; last transport stage: bootstrap_submit_rejected: submit rejected by local prompt handler retryable=true Last stderr: ${stdinWarning}`,
    },
    claudeLogLines: ['[stderr]', stdinWarning],
    provisioningOutputParts: [],
  });

  // Private state is reached through an untyped view of the service.
  const internals = svc as any;
  internals.runs.set(run.runId, run);
  internals.aliveRunByTeam.set(run.teamName, run.runId);
  internals.cleanupRun(run);

  const latestPath = path.join(tempTeamsBase, teamName, 'launch-failure-artifacts', 'latest.json');
  // The test polls for the pointer file instead of awaiting cleanupRun.
  await waitForFile(latestPath);

  const latest = JSON.parse(fs.readFileSync(latestPath, 'utf8')) as { manifestPath: string };
  const manifestText = fs.readFileSync(latest.manifestPath, 'utf8');
  const manifest = JSON.parse(manifestText) as {
    reason: string;
    classification: { code: string };
    bootstrapTransportBreadcrumb: {
      submitRejected: boolean;
      noStdinWarning: boolean;
      retryable: boolean | null;
    };
  };

  expect(manifest.reason).toBe('launch_progress_failed');
  expect(manifest.classification.code).toBe('transport_rejected');
  expect(manifest.bootstrapTransportBreadcrumb).toMatchObject({
    submitRejected: true,
    noStdinWarning: true,
    retryable: true,
  });
});
|
||||
|
||||
it('falls back to the persisted lead transcript when no live run exists', async () => {
|
||||
const svc = new TeamProvisioningService();
|
||||
const teamName = 'offline-logs-team';
|
||||
|
|
@ -11591,6 +11698,59 @@ describe('TeamProvisioningService', () => {
|
|||
expect(sendMessageToRun).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
/**
 * Restarting a teammate whose persisted runtime entry has `backendType: 'process'`
 * must go through the direct process restart hook rather than a message to the lead.
 */
it('restarts a process backend teammate directly without asking the lead to respawn it', async () => {
  const TEAM_NAME = 'process-team';
  const MEMBER_NAME = 'forge';

  const svc = new TeamProvisioningService();
  // Private collaborators are replaced through an untyped view of the service.
  const internals = svc as any;

  // A live run with an attached child process that is neither killed nor cancelled.
  const run = createMemberSpawnRun({
    teamName: TEAM_NAME,
    expectedMembers: [MEMBER_NAME],
    memberSpawnStatuses: new Map(),
  });
  Object.assign(run, {
    child: { pid: 111 },
    processKilled: false,
    cancelRequested: false,
  });

  // Spies for the two possible restart routes: messaging the lead vs. direct process restart.
  const leadMessageSpy = vi.fn(async () => {});
  const directRestartSpy = vi.fn(async () => {});

  Object.assign(internals, {
    sendMessageToRun: leadMessageSpy,
    launchDirectProcessMemberRestart: directRestartSpy,
    // Team config only lists the lead; the teammate comes from the members meta store.
    configReader: {
      getConfig: vi.fn(async () => ({
        name: 'Process Team',
        members: [{ name: 'team-lead', agentType: 'team-lead' }],
      })),
    },
    membersMetaStore: {
      getMembers: vi.fn(async () => [
        {
          name: 'forge',
          role: 'Developer',
          providerId: 'codex',
          model: 'gpt-5.4',
          effort: 'medium',
          agentType: 'general-purpose',
        },
      ]),
    },
    // Persisted runtime record marks the teammate as a process-backed member.
    readPersistedRuntimeMembers: vi.fn(() => [
      {
        name: 'forge',
        agentId: 'forge@process-team',
        backendType: 'process',
        tmuxPaneId: 'process:1234',
        runtimePid: 1234,
      },
    ]),
    // No live runtime metadata is reported for the team.
    getLiveTeamAgentRuntimeMetadata: vi.fn(async () => new Map()),
  });

  // Register the run as the team's alive run.
  internals.aliveRunByTeam.set(TEAM_NAME, run.runId);
  internals.runs.set(run.runId, run);

  await svc.restartMember(TEAM_NAME, MEMBER_NAME);

  expect(directRestartSpy).toHaveBeenCalledTimes(1);
  expect(leadMessageSpy).not.toHaveBeenCalled();
  expect(run.pendingMemberRestarts.has(MEMBER_NAME)).toBe(true);
});
|
||||
|
||||
it('rejects a second restart request while the first restart is still in flight', async () => {
|
||||
const svc = new TeamProvisioningService();
|
||||
const run = createMemberSpawnRun({
|
||||
|
|
|
|||
Loading…
Reference in a new issue