fix(team): harden launch failure recovery copy
This commit is contained in:
parent
5c5b5c7afc
commit
e6e3ae9f54
5 changed files with 207 additions and 19 deletions
|
|
@ -1560,7 +1560,10 @@ function isConfigRegistrationFailureReason(reason?: string): boolean {
|
|||
|
||||
/**
 * Reports whether a thrown value reads like tmux saying that no server is reachable.
 *
 * Two message shapes are recognised, both case-insensitively:
 * - `no server running on <socket>`
 * - `error connecting to <socket> ... No such file or directory`
 *
 * @param error - Any thrown value. `Error` instances contribute their `message`;
 *   everything else is stringified, with `null`/`undefined` becoming `''`.
 * @returns `true` when the text matches either message shape, otherwise `false`.
 */
function isTmuxNoServerRunningError(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error ?? '');
  // Both patterns must be evaluated in a single return: an earlier, narrower
  // `return` ahead of this expression would make the second pattern unreachable.
  return (
    /no server running on /i.test(text) ||
    /error connecting to .*no such file or directory/i.test(text)
  );
}
|
||||
|
||||
function isAutoClearableLaunchFailureReason(reason?: string): boolean {
|
||||
|
|
@ -2194,6 +2197,7 @@ function extractHeartbeatTimestamp(text: string, fallback?: string): string | un
|
|||
function extractBootstrapFailureReason(text: string): string | null {
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) return null;
|
||||
if (isBootstrapInstructionPrompt(trimmed)) return null;
|
||||
const lower = trimmed.toLowerCase();
|
||||
const looksLikeBootstrapFailure =
|
||||
lower.includes('bootstrap failed') ||
|
||||
|
|
@ -2241,6 +2245,17 @@ function extractBootstrapFailureReason(text: string): string | null {
|
|||
return trimmed.slice(0, 280);
|
||||
}
|
||||
|
||||
/**
 * Recognises the bootstrap instruction prompt itself, as opposed to text that
 * merely mentions bootstrapping.
 *
 * The input is whitespace-collapsed, trimmed and lower-cased before matching, so
 * line breaks and letter case in the prompt do not matter.
 *
 * @param text - Raw text to classify.
 * @returns `true` only when the text starts with the "you are bootstrapping into
 *   team" opening, contains the "your first action is to call the mcp tool"
 *   directive, and names `member_briefing` or `lead_briefing`.
 */
function isBootstrapInstructionPrompt(text: string): boolean {
  const normalized = text.replace(/\s+/g, ' ').trim().toLowerCase();
  // The opening must be at the very start; a prompt quoted inside other text does not count.
  if (!normalized.startsWith('you are bootstrapping into team ')) {
    return false;
  }
  return (
    normalized.includes('your first action is to call the mcp tool') &&
    (normalized.includes('member_briefing') || normalized.includes('lead_briefing'))
  );
}
|
||||
|
||||
function isBootstrapTranscriptSuccessText(
|
||||
text: string,
|
||||
teamName: string,
|
||||
|
|
@ -14942,7 +14957,7 @@ export class TeamProvisioningService {
|
|||
const expectedMembers = this.getPersistedLaunchMemberNames(snapshot);
|
||||
for (const expected of expectedMembers) {
|
||||
const current = snapshot.members[expected];
|
||||
if (!current || current.bootstrapConfirmed || current.hardFailure) {
|
||||
if (!current || current.bootstrapConfirmed) {
|
||||
continue;
|
||||
}
|
||||
const acceptedAtMs =
|
||||
|
|
@ -15128,7 +15143,7 @@ export class TeamProvisioningService {
|
|||
current.hardFailure = false;
|
||||
current.hardFailureReason = undefined;
|
||||
}
|
||||
if (!current.bootstrapConfirmed && !current.hardFailure) {
|
||||
if (!current.bootstrapConfirmed) {
|
||||
const transcriptOutcome = await this.findBootstrapTranscriptOutcome(
|
||||
teamName,
|
||||
expected,
|
||||
|
|
@ -15139,7 +15154,10 @@ export class TeamProvisioningService {
|
|||
current.lastHeartbeatAt = current.lastHeartbeatAt ?? transcriptOutcome.observedAt;
|
||||
current.hardFailure = false;
|
||||
current.hardFailureReason = undefined;
|
||||
} else if (transcriptOutcome?.kind === 'failure') {
|
||||
if (current.sources) {
|
||||
current.sources.hardFailureSignal = undefined;
|
||||
}
|
||||
} else if (transcriptOutcome?.kind === 'failure' && !current.hardFailure) {
|
||||
current.hardFailure = true;
|
||||
current.hardFailureReason = transcriptOutcome.reason;
|
||||
current.sources.hardFailureSignal = true;
|
||||
|
|
|
|||
|
|
@ -390,12 +390,8 @@ function getSkippedSpawnDetails(params: {
|
|||
.sort((left, right) => left.name.localeCompare(right.name));
|
||||
}
|
||||
|
||||
/**
 * Normalizes `reason` and shortens it when it is longer than `maxLength`.
 *
 * @param reason - Raw failure text; whitespace runs are collapsed first.
 * @param maxLength - Length threshold, 160 by default. Text at or below it is
 *   returned whole.
 * @returns The normalized text, or its first `maxLength - 3` characters (trailing
 *   whitespace removed) followed by `...` when it exceeds `maxLength`.
 */
function truncateFailureReason(reason: string, maxLength = 160): string {
  const normalized = normalizeFailureReason(reason);
  if (normalized.length <= maxLength) {
    return normalized;
  }
  // Reserve three characters for the ellipsis; never pass a negative end index to slice.
  return `${normalized.slice(0, Math.max(0, maxLength - 3)).trimEnd()}...`;
}

/**
 * Collapses every run of whitespace in `reason` to a single space and trims both ends.
 *
 * @param reason - Raw failure text.
 * @returns The single-line form of `reason`, at full length.
 */
function normalizeFailureReason(reason: string): string {
  return reason.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
function buildFailedSpawnPanelMessage(
|
||||
|
|
@ -407,13 +403,13 @@ function buildFailedSpawnPanelMessage(
|
|||
if (failedSpawnDetails.length === 1) {
|
||||
const [failed] = failedSpawnDetails;
|
||||
return failed.reason
|
||||
? `${failed.name} failed to start - ${truncateFailureReason(failed.reason, 220)}`
|
||||
? `${failed.name} failed to start - ${normalizeFailureReason(failed.reason)}`
|
||||
: `${failed.name} failed to start`;
|
||||
}
|
||||
const listedFailures = failedSpawnDetails
|
||||
.slice(0, 2)
|
||||
.map((failed) =>
|
||||
failed.reason ? `${failed.name} - ${truncateFailureReason(failed.reason, 120)}` : failed.name
|
||||
failed.reason ? `${failed.name} - ${normalizeFailureReason(failed.reason)}` : failed.name
|
||||
)
|
||||
.join('; ');
|
||||
const remainingCount = failedSpawnDetails.length - Math.min(failedSpawnDetails.length, 2);
|
||||
|
|
@ -454,15 +450,13 @@ function buildSkippedSpawnPanelMessage(
|
|||
if (skippedSpawnDetails.length === 1) {
|
||||
const [skipped] = skippedSpawnDetails;
|
||||
return skipped.reason
|
||||
? `${skipped.name} skipped for this launch - ${truncateFailureReason(skipped.reason, 220)}`
|
||||
? `${skipped.name} skipped for this launch - ${normalizeFailureReason(skipped.reason)}`
|
||||
: `${skipped.name} skipped for this launch`;
|
||||
}
|
||||
const listedSkipped = skippedSpawnDetails
|
||||
.slice(0, 3)
|
||||
.map((skipped) =>
|
||||
skipped.reason
|
||||
? `${skipped.name} - ${truncateFailureReason(skipped.reason, 100)}`
|
||||
: skipped.name
|
||||
skipped.reason ? `${skipped.name} - ${normalizeFailureReason(skipped.reason)}` : skipped.name
|
||||
)
|
||||
.join('; ');
|
||||
const remainingCount = skippedSpawnDetails.length - Math.min(skippedSpawnDetails.length, 3);
|
||||
|
|
|
|||
|
|
@ -4747,7 +4747,7 @@ describe('TeamProvisioningService', () => {
|
|||
(svc as any).runs.set(run.runId, run);
|
||||
|
||||
vi.mocked(listTmuxPanePidsForCurrentPlatform).mockRejectedValue(
|
||||
new Error('no server running on /private/tmp/tmux-501/default')
|
||||
new Error('error connecting to /private/tmp/tmux-501/default (No such file or directory)')
|
||||
);
|
||||
|
||||
await svc.restartMember('tmux-team', 'forge');
|
||||
|
|
@ -6558,12 +6558,13 @@ describe('TeamProvisioningService', () => {
|
|||
firstSpawnAcceptedAt: acceptedAt,
|
||||
},
|
||||
bob: {
|
||||
launchState: 'starting',
|
||||
agentToolAccepted: false,
|
||||
launchState: 'runtime_pending_bootstrap',
|
||||
agentToolAccepted: true,
|
||||
runtimeAlive: false,
|
||||
bootstrapConfirmed: false,
|
||||
hardFailure: false,
|
||||
hardFailureReason: undefined,
|
||||
firstSpawnAcceptedAt: acceptedAt,
|
||||
},
|
||||
});
|
||||
|
||||
|
|
@ -6639,6 +6640,124 @@ describe('TeamProvisioningService', () => {
|
|||
expect(result.statuses.alice?.error).toBeUndefined();
|
||||
});
|
||||
|
||||
// Writes a member transcript whose only entry is the bootstrap instruction prompt
// and expects the transcript failure lookup to report no failure reason.
it('does not classify the bootstrap instruction prompt as a member launch failure', async () => {
  allowConsoleLogs();
  const teamName = 'zz-unit-bootstrap-prompt-not-failure';
  const leadSessionId = 'lead-session';
  const memberSessionId = 'alice-session';
  const projectPath = '/Users/test/proj';
  const projectId = '-Users-test-proj';
  const acceptedAt = new Date(Date.now() - 5_000).toISOString();

  writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice']);

  const projectRoot = path.join(tempProjectsBase, projectId);
  fs.mkdirSync(projectRoot, { recursive: true });
  // One JSONL line: a user-role entry carrying the instruction prompt text.
  fs.writeFileSync(
    path.join(projectRoot, `${memberSessionId}.jsonl`),
    `${JSON.stringify({
      timestamp: acceptedAt,
      teamName,
      agentName: 'alice',
      type: 'user',
      message: {
        role: 'user',
        content: `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf member_briefing is still unavailable after that one retry, send exactly one short SendMessage to "team-lead" with the exact error text, then stop this turn and wait.`,
      },
    })}\n`,
    'utf8'
  );

  const svc = new TeamProvisioningService();
  // The third argument is one millisecond earlier than the entry's timestamp.
  const reason = await (svc as any).findBootstrapTranscriptFailureReason(
    teamName,
    'alice',
    Date.parse(acceptedAt) - 1
  );

  expect(reason).toBeNull();
});
|
||||
|
||||
// Persists a hard failure whose reason is the bootstrap prompt text, then writes a
// transcript containing that prompt followed by a non-error member briefing tool
// result, and expects the reported status for alice to be online with no failure.
it('clears a stale persisted bootstrap-prompt failure when member_briefing later succeeds', async () => {
  allowConsoleLogs();
  const teamName = 'zz-unit-bootstrap-stale-prompt-failure';
  const leadSessionId = 'lead-session';
  const memberSessionId = 'alice-session';
  const projectPath = '/Users/test/proj';
  const projectId = '-Users-test-proj';
  const acceptedAt = new Date(Date.now() - 5_000).toISOString();
  // The tool result is stamped one second after the prompt entry.
  const successAt = new Date(Date.now() - 4_000).toISOString();
  const staleReason = `You are bootstrapping into team "${teamName}" as member "alice".\nYour first action is to call the MCP tool member_briefing on the agent-teams server with teamName="${teamName}" and memberName="alice".\nIf tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.`;

  writeLaunchConfig(teamName, projectPath, leadSessionId, ['alice', 'bob']);
  writeLaunchState(teamName, leadSessionId, {
    alice: {
      launchState: 'failed_to_start',
      agentToolAccepted: true,
      runtimeAlive: false,
      bootstrapConfirmed: false,
      hardFailure: true,
      hardFailureReason: staleReason,
      firstSpawnAcceptedAt: acceptedAt,
    },
    bob: {
      launchState: 'starting',
      agentToolAccepted: false,
      runtimeAlive: false,
      bootstrapConfirmed: false,
      hardFailure: false,
      hardFailureReason: undefined,
    },
  });

  const projectRoot = path.join(tempProjectsBase, projectId);
  fs.mkdirSync(projectRoot, { recursive: true });
  // Two JSONL lines: the prompt entry, then the successful tool result.
  fs.writeFileSync(
    path.join(projectRoot, `${memberSessionId}.jsonl`),
    [
      JSON.stringify({
        timestamp: acceptedAt,
        teamName,
        agentName: 'alice',
        type: 'user',
        message: {
          role: 'user',
          content: staleReason,
        },
      }),
      JSON.stringify({
        timestamp: successAt,
        teamName,
        agentName: 'alice',
        type: 'user',
        message: {
          role: 'user',
          content: [
            {
              type: 'tool_result',
              tool_use_id: 'item_1',
              content: `Member briefing for alice on team "${teamName}" (${teamName}).\nTask briefing for alice:\nNo actionable tasks.`,
              is_error: false,
            },
          ],
        },
      }),
    ].join('\n') + '\n',
    'utf8'
  );

  const svc = new TeamProvisioningService();
  const result = await svc.getMemberSpawnStatuses(teamName);

  expect(result.statuses.alice).toMatchObject({
    status: 'online',
    launchState: 'confirmed_alive',
    bootstrapConfirmed: true,
    hardFailure: false,
  });
  expect(result.statuses.alice?.hardFailureReason).toBeUndefined();
});
|
||||
|
||||
it('marks an online teammate bootstrap as failed when transcript shows model unavailability', async () => {
|
||||
allowConsoleLogs();
|
||||
const teamName = 'zz-live-bootstrap-model-unavailable';
|
||||
|
|
|
|||
|
|
@ -172,6 +172,7 @@ const REQUIRED_MOCK_AGENT_TEAMS_TOOLS = [
|
|||
'task_get_comment',
|
||||
'task_link',
|
||||
'task_list',
|
||||
'task_restore',
|
||||
'task_set_clarification',
|
||||
'task_set_owner',
|
||||
'task_set_status',
|
||||
|
|
|
|||
|
|
@ -91,6 +91,62 @@ describe('buildTeamProvisioningPresentation', () => {
|
|||
expect(presentation?.defaultLiveOutputOpen).toBe(false);
|
||||
});
|
||||
|
||||
// Feeds a single failed teammate whose reason is a long single-line string and
// expects the panel message to contain that reason in full.
it('does not truncate long failed teammate reasons in the panel message', () => {
  const reason =
    'You are bootstrapping into team "relay-works-10" as member "alice". Your first action is to call the MCP tool member_briefing on the agent-teams server with teamName="relay-works-10" and memberName="alice". If tool search shows only the prefixed MCP name, use mcp__agent-teams__member_briefing.';
  const presentation = buildTeamProvisioningPresentation({
    progress: {
      runId: 'run-long-failure',
      teamName: 'relay-works-10',
      state: 'ready',
      startedAt: '2026-04-13T10:00:00.000Z',
      updatedAt: '2026-04-13T10:00:08.000Z',
      message: 'Launch completed with teammate errors',
      messageSeverity: 'warning',
      pid: 4321,
      cliLogsTail: '',
      assistantOutput: '',
    },
    members: [
      {
        name: 'team-lead',
        agentType: 'team-lead',
        status: 'active',
        currentTaskId: null,
        taskCount: 0,
        lastActiveAt: null,
        messageCount: 0,
      },
      {
        name: 'alice',
        agentType: 'reviewer',
        status: 'unknown',
        currentTaskId: null,
        taskCount: 0,
        lastActiveAt: null,
        messageCount: 0,
      },
    ],
    memberSpawnStatuses: {
      alice: {
        status: 'error',
        launchState: 'failed_to_start',
        error: reason,
        hardFailureReason: reason,
        updatedAt: '2026-04-13T10:00:03.000Z',
        runtimeAlive: false,
        bootstrapConfirmed: false,
        hardFailure: true,
        agentToolAccepted: true,
        firstSpawnAcceptedAt: '2026-04-13T10:00:01.000Z',
      },
    },
    memberSpawnSnapshot: undefined,
  });

  // Exact equality: the reason must be neither shortened nor given an ellipsis.
  expect(presentation?.panelMessage).toBe(`alice failed to start - ${reason}`);
});
|
||||
|
||||
it('surfaces the failed teammate reason after launch completes with errors', () => {
|
||||
const presentation = buildTeamProvisioningPresentation({
|
||||
progress: {
|
||||
|
|
|
|||
Loading…
Reference in a new issue