fix(team): harden launch failure recovery copy

This commit is contained in:
777genius 2026-04-24 23:25:53 +03:00
parent 5c5b5c7afc
commit e6e3ae9f54
5 changed files with 207 additions and 19 deletions

View file

@ -1560,7 +1560,10 @@ function isConfigRegistrationFailureReason(reason?: string): boolean {
/**
 * Reports whether an error message indicates that no tmux server could be
 * reached.
 *
 * Two message shapes are recognized, both case-insensitively:
 * - `no server running on <path>`
 * - `error connecting to <path> ... no such file or directory`
 *
 * Both patterns are evaluated in a single return so that the second shape is
 * actually reachable.
 *
 * @param error - Any thrown value. `Error` instances contribute their
 *   `message`; other values are stringified, with `null`/`undefined` treated
 *   as an empty message.
 * @returns `true` when the message matches either pattern, otherwise `false`.
 */
function isTmuxNoServerRunningError(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error ?? '');
  return (
    /no server running on /i.test(text) ||
    /error connecting to .*no such file or directory/i.test(text)
  );
}
function isAutoClearableLaunchFailureReason(reason?: string): boolean {
@ -2194,6 +2197,7 @@ function extractHeartbeatTimestamp(text: string, fallback?: string): string | un
function extractBootstrapFailureReason(text: string): string | null {
const trimmed = text.trim();
if (!trimmed) return null;
if (isBootstrapInstructionPrompt(trimmed)) return null;
const lower = trimmed.toLowerCase();
const looksLikeBootstrapFailure =
lower.includes('bootstrap failed') ||
@ -2241,6 +2245,17 @@ function extractBootstrapFailureReason(text: string): string | null {
return trimmed.slice(0, 280);
}
/**
 * Recognizes the bootstrap instruction prompt text itself.
 *
 * The input is whitespace-collapsed, trimmed and lower-cased before matching.
 * It counts as the instruction prompt only when all of the following hold:
 * - it begins with `you are bootstrapping into team `
 * - it contains `your first action is to call the mcp tool`
 * - it mentions `member_briefing` or `lead_briefing`
 *
 * @param text - Raw text to classify.
 * @returns `true` when the text matches the prompt shape described above.
 */
function isBootstrapInstructionPrompt(text: string): boolean {
  const collapsed = text.replace(/\s+/g, ' ').trim().toLowerCase();

  const hasPromptOpening = collapsed.startsWith('you are bootstrapping into team ');
  const hasFirstActionDirective = collapsed.includes('your first action is to call the mcp tool');
  const mentionsBriefingTool = ['member_briefing', 'lead_briefing'].some((toolName) =>
    collapsed.includes(toolName)
  );

  return hasPromptOpening && hasFirstActionDirective && mentionsBriefingTool;
}
function isBootstrapTranscriptSuccessText(
text: string,
teamName: string,
@ -14942,7 +14957,7 @@ export class TeamProvisioningService {
const expectedMembers = this.getPersistedLaunchMemberNames(snapshot);
for (const expected of expectedMembers) {
const current = snapshot.members[expected];
if (!current || current.bootstrapConfirmed || current.hardFailure) {
if (!current || current.bootstrapConfirmed) {
continue;
}
const acceptedAtMs =
@ -15128,7 +15143,7 @@ export class TeamProvisioningService {
current.hardFailure = false;
current.hardFailureReason = undefined;
}
if (!current.bootstrapConfirmed && !current.hardFailure) {
if (!current.bootstrapConfirmed) {
const transcriptOutcome = await this.findBootstrapTranscriptOutcome(
teamName,
expected,
@ -15139,7 +15154,10 @@ export class TeamProvisioningService {
current.lastHeartbeatAt = current.lastHeartbeatAt ?? transcriptOutcome.observedAt;
current.hardFailure = false;
current.hardFailureReason = undefined;
} else if (transcriptOutcome?.kind === 'failure') {
if (current.sources) {
current.sources.hardFailureSignal = undefined;
}
} else if (transcriptOutcome?.kind === 'failure' && !current.hardFailure) {
current.hardFailure = true;
current.hardFailureReason = transcriptOutcome.reason;
current.sources.hardFailureSignal = true;

View file

@ -390,12 +390,8 @@ function getSkippedSpawnDetails(params: {
.sort((left, right) => left.name.localeCompare(right.name));
}
function truncateFailureReason(reason: string, maxLength = 160): string {
const normalized = reason.replace(/\s+/g, ' ').trim();
if (normalized.length <= maxLength) {
return normalized;
}
return `${normalized.slice(0, Math.max(0, maxLength - 3)).trimEnd()}...`;
/**
 * Flattens a failure reason for display: every run of whitespace (including
 * line breaks) becomes a single space, and leading/trailing whitespace is
 * removed. The text is never shortened.
 *
 * @param reason - Raw failure reason text.
 * @returns The single-spaced, trimmed reason.
 */
function normalizeFailureReason(reason: string): string {
  const singleSpaced = reason.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
function buildFailedSpawnPanelMessage(
@ -407,13 +403,13 @@ function buildFailedSpawnPanelMessage(
if (failedSpawnDetails.length === 1) {
const [failed] = failedSpawnDetails;
return failed.reason
? `${failed.name} failed to start - ${truncateFailureReason(failed.reason, 220)}`
? `${failed.name} failed to start - ${normalizeFailureReason(failed.reason)}`
: `${failed.name} failed to start`;
}
const listedFailures = failedSpawnDetails
.slice(0, 2)
.map((failed) =>
failed.reason ? `${failed.name} - ${truncateFailureReason(failed.reason, 120)}` : failed.name
failed.reason ? `${failed.name} - ${normalizeFailureReason(failed.reason)}` : failed.name
)
.join('; ');
const remainingCount = failedSpawnDetails.length - Math.min(failedSpawnDetails.length, 2);
@ -454,15 +450,13 @@ function buildSkippedSpawnPanelMessage(
if (skippedSpawnDetails.length === 1) {
const [skipped] = skippedSpawnDetails;
return skipped.reason
? `${skipped.name} skipped for this launch - ${truncateFailureReason(skipped.reason, 220)}`
? `${skipped.name} skipped for this launch - ${normalizeFailureReason(skipped.reason)}`
: `${skipped.name} skipped for this launch`;
}
const listedSkipped = skippedSpawnDetails
.slice(0, 3)
.map((skipped) =>
skipped.reason
? `${skipped.name} - ${truncateFailureReason(skipped.reason, 100)}`
: skipped.name
skipped.reason ? `${skipped.name} - ${normalizeFailureReason(skipped.reason)}` : skipped.name
)
.join('; ');
const remainingCount = skippedSpawnDetails.length - Math.min(skippedSpawnDetails.length, 3);

View file

@ -4747,7 +4747,7 @@ describe('TeamProvisioningService', () => {
(svc as any).runs.set(run.runId, run);
vi.mocked(listTmuxPanePidsForCurrentPlatform).mockRejectedValue(
new Error('no server running on /private/tmp/tmux-501/default')
new Error('error connecting to /private/tmp/tmux-501/default (No such file or directory)')
);
await svc.restartMember('tmux-team', 'forge');
@ -6558,12 +6558,13 @@ describe('TeamProvisioningService', () => {
firstSpawnAcceptedAt: acceptedAt,
},
bob: {
launchState: 'starting',
agentToolAccepted: false,
launchState: 'runtime_pending_bootstrap',
agentToolAccepted: true,
runtimeAlive: false,
bootstrapConfirmed: false,
hardFailure: false,
hardFailureReason: undefined,
firstSpawnAcceptedAt: acceptedAt,
},
});
@ -6639,6 +6640,124 @@ describe('TeamProvisioningService', () => {
expect(result.statuses.alice?.error).toBeUndefined();
});
// Regression test: a member transcript whose only entry is the bootstrap
// instruction prompt (a user message) must not yield a failure reason.
it('does not classify the bootstrap instruction prompt as a member launch failure', async () => {
allowConsoleLogs();
const teamName = 'zz-unit-bootstrap-prompt-not-failure';
const leadSessionId = 'lead-session';
const memberSessionId = 'alice-session';
const projectPath = '/Users/test/proj';
const projectId = '-Users-test-proj';
// Timestamp of the transcript entry: five seconds before now.
const acceptedAt = new Date(Date.now() - 5_000).toISOString();
writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice']);
const projectRoot = path.join(tempProjectsBase, projectId);
fs.mkdirSync(projectRoot, { recursive: true });
// Single-line JSONL transcript for alice containing only the prompt text.
fs.writeFileSync(
path.join(projectRoot, `${memberSessionId}.jsonl`),
`${JSON.stringify({
timestamp: acceptedAt,
teamName,
agentName: 'alice',
type: 'user',
message: {
role: 'user',
content: `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf member_briefing is still unavailable after that one retry, send exactly one short SendMessage to "team-lead" with the exact error text, then stop this turn and wait.`,
},
})}\n`,
'utf8'
);
const svc = new TeamProvisioningService();
// Third argument is one millisecond before the entry's timestamp
// (presumably a lower time bound for the scan; confirm against the method).
const reason = await (svc as any).findBootstrapTranscriptFailureReason(
teamName,
'alice',
Date.parse(acceptedAt) - 1
);
expect(reason).toBeNull();
});
// Regression test: alice's persisted launch state carries a hard failure whose
// reason is the bootstrap prompt text, while her transcript shows that prompt
// followed by a non-error member_briefing tool result. The reported status is
// expected to be online and confirmed, with the failure cleared.
it('clears a stale persisted bootstrap-prompt failure when member_briefing later succeeds', async () => {
allowConsoleLogs();
const teamName = 'zz-unit-bootstrap-stale-prompt-failure';
const leadSessionId = 'lead-session';
const memberSessionId = 'alice-session';
const projectPath = '/Users/test/proj';
const projectId = '-Users-test-proj';
// The prompt entry is stamped 5s ago; the successful tool result 4s ago.
const acceptedAt = new Date(Date.now() - 5_000).toISOString();
const successAt = new Date(Date.now() - 4_000).toISOString();
// The same text is used as the persisted failure reason and as the first transcript entry.
const staleReason = `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.`;
writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice', 'bob']);
// alice starts out as a hard failure; bob is an untouched second member.
writeLaunchState(teamName, leadSessionId, {
alice: {
launchState: 'failed_to_start',
agentToolAccepted: true,
runtimeAlive: false,
bootstrapConfirmed: false,
hardFailure: true,
hardFailureReason: staleReason,
firstSpawnAcceptedAt: acceptedAt,
},
bob: {
launchState: 'starting',
agentToolAccepted: false,
runtimeAlive: false,
bootstrapConfirmed: false,
hardFailure: false,
hardFailureReason: undefined,
},
});
const projectRoot = path.join(tempProjectsBase, projectId);
fs.mkdirSync(projectRoot, { recursive: true });
// Two-line JSONL transcript: the prompt, then the member_briefing tool result.
fs.writeFileSync(
path.join(projectRoot, `${memberSessionId}.jsonl`),
[
JSON.stringify({
timestamp: acceptedAt,
teamName,
agentName: 'alice',
type: 'user',
message: {
role: 'user',
content: staleReason,
},
}),
JSON.stringify({
timestamp: successAt,
teamName,
agentName: 'alice',
type: 'user',
message: {
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: 'item_1',
content: `Member briefing for alice on team "${teamName}" (${teamName}).\nTask briefing for alice:\nNo actionable tasks.`,
is_error: false,
},
],
},
}),
].join('\n') + '\n',
'utf8'
);
const svc = new TeamProvisioningService();
const result = await svc.getMemberSpawnStatuses(teamName);
expect(result.statuses.alice).toMatchObject({
status: 'online',
launchState: 'confirmed_alive',
bootstrapConfirmed: true,
hardFailure: false,
});
expect(result.statuses.alice?.hardFailureReason).toBeUndefined();
});
it('marks an online teammate bootstrap as failed when transcript shows model unavailability', async () => {
allowConsoleLogs();
const teamName = 'zz-live-bootstrap-model-unavailable';

View file

@ -172,6 +172,7 @@ const REQUIRED_MOCK_AGENT_TEAMS_TOOLS = [
'task_get_comment',
'task_link',
'task_list',
'task_restore',
'task_set_clarification',
'task_set_owner',
'task_set_status',

View file

@ -91,6 +91,62 @@ describe('buildTeamProvisioningPresentation', () => {
expect(presentation?.defaultLiveOutputOpen).toBe(false);
});
// Verifies that a single failed teammate's reason appears in the panel message
// in full, with no shortening or trailing ellipsis.
it('does not truncate long failed teammate reasons in the panel message', () => {
// A long single-line reason, already single-spaced, so it must come through unchanged.
const reason =
'You are bootstrapping into team "relay-works-10" as member "alice". Your first action is to call the MCP tool member_briefing on the agent-teams server with teamName="relay-works-10" and memberName="alice". If tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.';
const presentation = buildTeamProvisioningPresentation({
progress: {
runId: 'run-long-failure',
teamName: 'relay-works-10',
state: 'ready',
startedAt: '2026-04-13T10:00:00.000Z',
updatedAt: '2026-04-13T10:00:08.000Z',
message: 'Launch completed with teammate errors',
messageSeverity: 'warning',
pid: 4321,
cliLogsTail: '',
assistantOutput: '',
},
members: [
{
name: 'team-lead',
agentType: 'team-lead',
status: 'active',
currentTaskId: null,
taskCount: 0,
lastActiveAt: null,
messageCount: 0,
},
{
name: 'alice',
agentType: 'reviewer',
status: 'unknown',
currentTaskId: null,
taskCount: 0,
lastActiveAt: null,
messageCount: 0,
},
],
// alice is the only member with a spawn status, and it is a hard failure.
memberSpawnStatuses: {
alice: {
status: 'error',
launchState: 'failed_to_start',
error: reason,
hardFailureReason: reason,
updatedAt: '2026-04-13T10:00:03.000Z',
runtimeAlive: false,
bootstrapConfirmed: false,
hardFailure: true,
agentToolAccepted: true,
firstSpawnAcceptedAt: '2026-04-13T10:00:01.000Z',
},
},
memberSpawnSnapshot: undefined,
});
expect(presentation?.panelMessage).toBe(`alice failed to start - ${reason}`);
});
it('surfaces the failed teammate reason after launch completes with errors', () => {
const presentation = buildTeamProvisioningPresentation({
progress: {