From e6e3ae9f54fce8fa5daabffa08394704269bb322 Mon Sep 17 00:00:00 2001 From: 777genius Date: Fri, 24 Apr 2026 23:25:53 +0300 Subject: [PATCH] fix(team): harden launch failure recovery copy --- .../services/team/TeamProvisioningService.ts | 26 +++- .../utils/teamProvisioningPresentation.ts | 18 +-- .../team/TeamProvisioningService.test.ts | 125 +++++++++++++++++- .../TeamProvisioningServicePrepare.test.ts | 1 + .../teamProvisioningPresentation.test.ts | 56 ++++++++ 5 files changed, 207 insertions(+), 19 deletions(-) diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index d375803e..d5bd3091 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -1560,7 +1560,10 @@ function isConfigRegistrationFailureReason(reason?: string): boolean { function isTmuxNoServerRunningError(error: unknown): boolean { const text = error instanceof Error ? error.message : String(error ?? ''); - return /no server running on /i.test(text); + return ( + /no server running on /i.test(text) || + /error connecting to .*no such file or directory/i.test(text) + ); } function isAutoClearableLaunchFailureReason(reason?: string): boolean { @@ -2194,6 +2197,7 @@ function extractHeartbeatTimestamp(text: string, fallback?: string): string | un function extractBootstrapFailureReason(text: string): string | null { const trimmed = text.trim(); if (!trimmed) return null; + if (isBootstrapInstructionPrompt(trimmed)) return null; const lower = trimmed.toLowerCase(); const looksLikeBootstrapFailure = lower.includes('bootstrap failed') || @@ -2241,6 +2245,17 @@ function extractBootstrapFailureReason(text: string): string | null { return trimmed.slice(0, 280); } +function isBootstrapInstructionPrompt(text: string): boolean { + const normalized = text.replace(/\s+/g, ' ').trim().toLowerCase(); + if (!normalized.startsWith('you are bootstrapping into team ')) { + return false; + } + return ( + normalized.includes('your first action is to call the mcp tool') && + (normalized.includes('member_briefing') || normalized.includes('lead_briefing')) + ); +} + function isBootstrapTranscriptSuccessText( text: string, teamName: string, @@ -14942,7 +14957,7 @@ export class TeamProvisioningService { const expectedMembers = this.getPersistedLaunchMemberNames(snapshot); for (const expected of expectedMembers) { const current = snapshot.members[expected]; - if (!current || current.bootstrapConfirmed || current.hardFailure) { + if (!current || current.bootstrapConfirmed) { continue; } const acceptedAtMs = @@ -15128,7 +15143,7 @@ export class TeamProvisioningService { current.hardFailure = false; current.hardFailureReason = undefined; } - if (!current.bootstrapConfirmed && !current.hardFailure) { + if (!current.bootstrapConfirmed) { const transcriptOutcome = await this.findBootstrapTranscriptOutcome( teamName, expected, @@ -15139,7 +15154,10 @@ export class TeamProvisioningService { current.lastHeartbeatAt = current.lastHeartbeatAt ?? transcriptOutcome.observedAt; current.hardFailure = false; current.hardFailureReason = undefined; - } else if (transcriptOutcome?.kind === 'failure') { + if (current.sources) { + current.sources.hardFailureSignal = undefined; + } + } else if (transcriptOutcome?.kind === 'failure' && !current.hardFailure) { current.hardFailure = true; current.hardFailureReason = transcriptOutcome.reason; current.sources.hardFailureSignal = true; diff --git a/src/renderer/utils/teamProvisioningPresentation.ts b/src/renderer/utils/teamProvisioningPresentation.ts index ee4c6fda..eae8ac56 100644 --- a/src/renderer/utils/teamProvisioningPresentation.ts +++ b/src/renderer/utils/teamProvisioningPresentation.ts @@ -390,12 +390,8 @@ function getSkippedSpawnDetails(params: { .sort((left, right) => left.name.localeCompare(right.name)); } -function truncateFailureReason(reason: string, maxLength = 160): string { - const normalized = reason.replace(/\s+/g, ' ').trim(); - if (normalized.length <= maxLength) { - return normalized; - } - return `${normalized.slice(0, Math.max(0, maxLength - 3)).trimEnd()}...`; +function normalizeFailureReason(reason: string): string { + return reason.replace(/\s+/g, ' ').trim(); } function buildFailedSpawnPanelMessage( @@ -407,13 +403,13 @@ function buildFailedSpawnPanelMessage( if (failedSpawnDetails.length === 1) { const [failed] = failedSpawnDetails; return failed.reason - ? `${failed.name} failed to start - ${truncateFailureReason(failed.reason, 220)}` + ? `${failed.name} failed to start - ${normalizeFailureReason(failed.reason)}` : `${failed.name} failed to start`; } const listedFailures = failedSpawnDetails .slice(0, 2) .map((failed) => - failed.reason ? `${failed.name} - ${truncateFailureReason(failed.reason, 120)}` : failed.name + failed.reason ? `${failed.name} - ${normalizeFailureReason(failed.reason)}` : failed.name ) .join('; '); const remainingCount = failedSpawnDetails.length - Math.min(failedSpawnDetails.length, 2); @@ -454,15 +450,13 @@ function buildSkippedSpawnPanelMessage( if (skippedSpawnDetails.length === 1) { const [skipped] = skippedSpawnDetails; return skipped.reason - ? `${skipped.name} skipped for this launch - ${truncateFailureReason(skipped.reason, 220)}` + ? `${skipped.name} skipped for this launch - ${normalizeFailureReason(skipped.reason)}` : `${skipped.name} skipped for this launch`; } const listedSkipped = skippedSpawnDetails .slice(0, 3) .map((skipped) => - skipped.reason - ? `${skipped.name} - ${truncateFailureReason(skipped.reason, 100)}` - : skipped.name + skipped.reason ? `${skipped.name} - ${normalizeFailureReason(skipped.reason)}` : skipped.name ) .join('; '); const remainingCount = skippedSpawnDetails.length - Math.min(skippedSpawnDetails.length, 3); diff --git a/test/main/services/team/TeamProvisioningService.test.ts b/test/main/services/team/TeamProvisioningService.test.ts index dfae9753..35fb063e 100644 --- a/test/main/services/team/TeamProvisioningService.test.ts +++ b/test/main/services/team/TeamProvisioningService.test.ts @@ -4747,7 +4747,7 @@ describe('TeamProvisioningService', () => { (svc as any).runs.set(run.runId, run); vi.mocked(listTmuxPanePidsForCurrentPlatform).mockRejectedValue( - new Error('no server running on /private/tmp/tmux-501/default') + new Error('error connecting to /private/tmp/tmux-501/default (No such file or directory)') ); await svc.restartMember('tmux-team', 'forge'); @@ -6558,12 +6558,13 @@ describe('TeamProvisioningService', () => { firstSpawnAcceptedAt: acceptedAt, }, bob: { - launchState: 'starting', - agentToolAccepted: false, + launchState: 'runtime_pending_bootstrap', + agentToolAccepted: true, runtimeAlive: false, bootstrapConfirmed: false, hardFailure: false, hardFailureReason: undefined, + firstSpawnAcceptedAt: acceptedAt, }, }); @@ -6639,6 +6640,124 @@ describe('TeamProvisioningService', () => { expect(result.statuses.alice?.error).toBeUndefined(); }); + it('does not classify the bootstrap instruction prompt as a member launch failure', async () => { + allowConsoleLogs(); + const teamName = 'zz-unit-bootstrap-prompt-not-failure'; + const leadSessionId = 'lead-session'; + const memberSessionId = 'alice-session'; + const projectPath = '/Users/test/proj'; + const projectId = '-Users-test-proj'; + const acceptedAt = new Date(Date.now() - 5_000).toISOString(); + + writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice']); + + const projectRoot = path.join(tempProjectsBase, projectId); + fs.mkdirSync(projectRoot, { recursive: true }); + fs.writeFileSync( + path.join(projectRoot, `${memberSessionId}.jsonl`), + `${JSON.stringify({ + timestamp: acceptedAt, + teamName, + agentName: 'alice', + type: 'user', + message: { + role: 'user', + content: `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf member_briefing is still unavailable after that one retry, send exactly one short SendMessage to "team-lead" with the exact error text, then stop this turn and wait.`, + }, + })}\n`, + 'utf8' + ); + + const svc = new TeamProvisioningService(); + const reason = await (svc as any).findBootstrapTranscriptFailureReason( + teamName, + 'alice', + Date.parse(acceptedAt) - 1 + ); + + expect(reason).toBeNull(); + }); + + it('clears a stale persisted bootstrap-prompt failure when member_briefing later succeeds', async () => { + allowConsoleLogs(); + const teamName = 'zz-unit-bootstrap-stale-prompt-failure'; + const leadSessionId = 'lead-session'; + const memberSessionId = 'alice-session'; + const projectPath = '/Users/test/proj'; + const projectId = '-Users-test-proj'; + const acceptedAt = new Date(Date.now() - 5_000).toISOString(); + const successAt = new Date(Date.now() - 4_000).toISOString(); + const staleReason = `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.`; + + writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice', 'bob']); + writeLaunchState(teamName, leadSessionId, { + alice: { + launchState: 'failed_to_start', + agentToolAccepted: true, + runtimeAlive: false, + bootstrapConfirmed: false, + hardFailure: true, + hardFailureReason: staleReason, + firstSpawnAcceptedAt: acceptedAt, + }, + bob: { + launchState: 'starting', + agentToolAccepted: false, + runtimeAlive: false, + bootstrapConfirmed: false, + hardFailure: false, + hardFailureReason: undefined, + }, + }); + + const projectRoot = path.join(tempProjectsBase, projectId); + fs.mkdirSync(projectRoot, { recursive: true }); + fs.writeFileSync( + path.join(projectRoot, `${memberSessionId}.jsonl`), + [ + JSON.stringify({ + timestamp: acceptedAt, + teamName, + agentName: 'alice', + type: 'user', + message: { + role: 'user', + content: staleReason, + }, + }), + JSON.stringify({ + timestamp: successAt, + teamName, + agentName: 'alice', + type: 'user', + message: { + role: 'user', + content: [ + { + type: 'tool_result', + tool_use_id: 'item_1', + content: `Member briefing for alice on team "${teamName}" (${teamName}).\nTask briefing for alice:\nNo actionable tasks.`, + is_error: false, + }, + ], + }, + }), + ].join('\n') + '\n', + 'utf8' + ); + + const svc = new TeamProvisioningService(); + const result = await svc.getMemberSpawnStatuses(teamName); + + expect(result.statuses.alice).toMatchObject({ + status: 'online', + launchState: 'confirmed_alive', + bootstrapConfirmed: true, + hardFailure: false, + }); + expect(result.statuses.alice?.hardFailureReason).toBeUndefined(); + }); + it('marks an online teammate bootstrap as failed when transcript shows model unavailability', async () => { allowConsoleLogs(); const teamName = 'zz-live-bootstrap-model-unavailable'; diff --git a/test/main/services/team/TeamProvisioningServicePrepare.test.ts b/test/main/services/team/TeamProvisioningServicePrepare.test.ts index 8628a372..5505c405 100644 --- a/test/main/services/team/TeamProvisioningServicePrepare.test.ts +++ b/test/main/services/team/TeamProvisioningServicePrepare.test.ts @@ -172,6 +172,7 @@ const REQUIRED_MOCK_AGENT_TEAMS_TOOLS = [ 'task_get_comment', 'task_link', 'task_list', + 'task_restore', 'task_set_clarification', 'task_set_owner', 'task_set_status', diff --git a/test/renderer/utils/teamProvisioningPresentation.test.ts b/test/renderer/utils/teamProvisioningPresentation.test.ts index 0bd0c2f8..664ddc94 100644 --- a/test/renderer/utils/teamProvisioningPresentation.test.ts +++ b/test/renderer/utils/teamProvisioningPresentation.test.ts @@ -91,6 +91,62 @@ describe('buildTeamProvisioningPresentation', () => { expect(presentation?.defaultLiveOutputOpen).toBe(false); }); + it('does not truncate long failed teammate reasons in the panel message', () => { + const reason = + 'You are bootstrapping into team "relay-works-10" as member "alice". Your first action is to call the MCP tool member_briefing on the agent-teams server with teamName="relay-works-10" and memberName="alice". If tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.'; + const presentation = buildTeamProvisioningPresentation({ + progress: { + runId: 'run-long-failure', + teamName: 'relay-works-10', + state: 'ready', + startedAt: '2026-04-13T10:00:00.000Z', + updatedAt: '2026-04-13T10:00:08.000Z', + message: 'Launch completed with teammate errors', + messageSeverity: 'warning', + pid: 4321, + cliLogsTail: '', + assistantOutput: '', + }, + members: [ + { + name: 'team-lead', + agentType: 'team-lead', + status: 'active', + currentTaskId: null, + taskCount: 0, + lastActiveAt: null, + messageCount: 0, + }, + { + name: 'alice', + agentType: 'reviewer', + status: 'unknown', + currentTaskId: null, + taskCount: 0, + lastActiveAt: null, + messageCount: 0, + }, + ], + memberSpawnStatuses: { + alice: { + status: 'error', + launchState: 'failed_to_start', + error: reason, + hardFailureReason: reason, + updatedAt: '2026-04-13T10:00:03.000Z', + runtimeAlive: false, + bootstrapConfirmed: false, + hardFailure: true, + agentToolAccepted: true, + firstSpawnAcceptedAt: '2026-04-13T10:00:01.000Z', + }, + }, + memberSpawnSnapshot: undefined, + }); + + expect(presentation?.panelMessage).toBe(`alice failed to start - ${reason}`); + }); + it('surfaces the failed teammate reason after launch completes with errors', () => { const presentation = buildTeamProvisioningPresentation({ progress: {