fix(team): resolve stuck "reconciling" state and skip resume when teammates never spawned (#55)
* fix(team): resolve stuck "reconciling" state and skip resume when teammates never spawned Addresses #54. When a team launch fails to bootstrap teammates, the team gets stuck showing "Last launch is still reconciling" indefinitely, and retrying with --resume reconnects the lead but does not re-spawn the dead teammates. The only workaround was enabling "Clear context (fresh session)", which loses the lead's prior conversation context. Two root causes addressed: 1. createPersistedLaunchSnapshot counted members still in 'starting' state (agentToolAccepted=false) as 'pending' regardless of launchPhase. When launchPhase was 'finished' with never-spawned members, the aggregate state stayed as 'partial_pending' forever, rendered as "still reconciling". Fix: when launchPhase != 'active', promote such members to 'failed_to_start' so the aggregate becomes 'partial_failure' ("Launch failed partway"), which correctly signals a terminal state. 2. TeamProvisioningService._launchTeamInner always used --resume when a previous leadSessionId existed, even if the previous launch had no teammates successfully spawned. The CLI's deterministic reconnect path restores lead context but does not re-spawn dead teammates, so the team stays broken across relaunches. Fix: before adding --resume, read the persisted launch state. If every expected teammate is 'starting' (never spawned) or 'failed_to_start', skip --resume so the CLI performs a full fresh bootstrap that spawns all teammates. Verified manually on Linux: a team stuck in "still reconciling" correctly transitions to "failed partway" after the first fix, and the next Launch (without "Clear context") fully bootstraps and brings teammates online. * fix(team): narrow skip resume to never-spawned teammates --------- Co-authored-by: 777genius <quantjumppro@gmail.com>
This commit is contained in:
parent
f819dd0c27
commit
080e0af55a
3 changed files with 249 additions and 18 deletions
|
|
@ -213,13 +213,39 @@ export function createPersistedLaunchSnapshot(params: {
|
|||
)
|
||||
);
|
||||
const members = params.members ?? {};
|
||||
const launchPhase = params.launchPhase ?? 'active';
|
||||
|
||||
// When the launch is over (finished/reconciled), members still in 'starting' state
|
||||
// (never spawned — agentToolAccepted is false) are unreachable and should be marked
|
||||
// as failed. Without this, they stay as 'pending' forever, causing the UI to show
|
||||
// "Last launch is still reconciling" indefinitely after a crash or incomplete launch.
|
||||
if (launchPhase !== 'active') {
|
||||
for (const name of expectedMembers) {
|
||||
const member = members[name];
|
||||
if (
|
||||
member &&
|
||||
member.launchState === 'starting' &&
|
||||
!member.agentToolAccepted &&
|
||||
!member.runtimeAlive &&
|
||||
!member.bootstrapConfirmed &&
|
||||
!member.hardFailure
|
||||
) {
|
||||
member.hardFailure = true;
|
||||
member.hardFailureReason =
|
||||
member.hardFailureReason ?? 'Teammate was never spawned during launch.';
|
||||
member.launchState = deriveMemberLaunchState(member);
|
||||
member.diagnostics = buildDiagnostics(member);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const summary = summarizePersistedLaunchMembers(expectedMembers, members);
|
||||
return {
|
||||
version: 2,
|
||||
teamName: params.teamName,
|
||||
updatedAt,
|
||||
...(params.leadSessionId ? { leadSessionId: params.leadSessionId } : {}),
|
||||
launchPhase: params.launchPhase ?? 'active',
|
||||
launchPhase,
|
||||
expectedMembers,
|
||||
members,
|
||||
summary,
|
||||
|
|
|
|||
|
|
@ -4860,11 +4860,59 @@ export class TeamProvisioningService {
|
|||
// so the lead retains full context of prior work.
|
||||
// When clearContext is true, skip resume entirely to start a fresh session.
|
||||
let previousSessionId: string | undefined;
|
||||
let skipResume = false;
|
||||
if (request.clearContext) {
|
||||
skipResume = true;
|
||||
logger.info(
|
||||
`[${request.teamName}] clearContext requested — skipping session resume, starting fresh`
|
||||
);
|
||||
} else {
|
||||
// Check persisted launch state: if the previous launch ended with no teammates
|
||||
// ever spawned (no accepted spawn, no live runtime, no confirmed bootstrap), resuming would reconnect the lead but
|
||||
// the CLI's deterministic bootstrap won't re-spawn dead teammates in reconnect
|
||||
// mode. Skip resume so the CLI creates a fresh session that fully bootstraps.
|
||||
const persistedLaunchState = await this.launchStateStore.read(request.teamName);
|
||||
if (persistedLaunchState) {
|
||||
const {
|
||||
expectedMembers: prevExpected,
|
||||
members: prevMembers,
|
||||
launchPhase,
|
||||
} = persistedLaunchState;
|
||||
// Decides whether a persisted teammate record shows no sign that the teammate
// was ever spawned. A missing record counts as never spawned. Any of the
// following counts as evidence of a spawn: an accepted agent tool call, a
// non-blank first-spawn timestamp, a live runtime, or a confirmed bootstrap.
const teammateWasNeverSpawned = (
  member:
    | {
        agentToolAccepted?: boolean;
        firstSpawnAcceptedAt?: string;
        runtimeAlive?: boolean;
        bootstrapConfirmed?: boolean;
      }
    | undefined
): boolean => {
  if (!member) {
    return true;
  }
  if (member.agentToolAccepted === true) {
    return false;
  }
  // A whitespace-only timestamp is treated the same as an absent one.
  const acceptedAt = member.firstSpawnAcceptedAt;
  if (typeof acceptedAt === 'string' && acceptedAt.trim().length > 0) {
    return false;
  }
  if (member.runtimeAlive === true) {
    return false;
  }
  return member.bootstrapConfirmed !== true;
};
|
||||
const allTeammatesNeverSpawned =
|
||||
launchPhase !== 'active' &&
|
||||
prevExpected.length > 0 &&
|
||||
prevExpected.every((name) => teammateWasNeverSpawned(prevMembers[name]));
|
||||
if (allTeammatesNeverSpawned) {
|
||||
skipResume = true;
|
||||
logger.info(
|
||||
`[${request.teamName}] Previous launch had no teammates successfully spawned — ` +
|
||||
`skipping session resume to allow full bootstrap`
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!skipResume) {
|
||||
try {
|
||||
const configParsed = JSON.parse(configRaw) as Record<string, unknown>;
|
||||
const resumeGuard = shouldSkipResumeForProviderRuntimeChange(request, configParsed);
|
||||
|
|
|
|||
|
|
@ -51,6 +51,8 @@ vi.mock('@main/utils/pathDecoder', async (importOriginal) => {
|
|||
});
|
||||
|
||||
import { TeamProvisioningService } from '@main/services/team/TeamProvisioningService';
|
||||
import { createPersistedLaunchSnapshot } from '@main/services/team/TeamLaunchStateEvaluator';
|
||||
import { getTeamLaunchStatePath } from '@main/services/team/TeamLaunchStateStore';
|
||||
import { ClaudeBinaryResolver } from '@main/services/team/ClaudeBinaryResolver';
|
||||
import { spawnCli } from '@main/utils/childProcess';
|
||||
import { AGENT_TEAMS_NAMESPACED_TEAMMATE_OPERATIONAL_TOOL_NAMES } from 'agent-teams-controller';
|
||||
|
|
@ -84,6 +86,63 @@ function createRunningChild() {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Test fixture: writes `config.json` for a team under `tempTeamsBase`.
 *
 * The team directory is created if needed. The written member list always
 * starts with a `team-lead` entry, followed by one `{ name }` entry per
 * teammate in the order given.
 *
 * @param teamName - Team name; also used as the directory name.
 * @param projectPath - Value stored as `projectPath` in the config.
 * @param leadSessionId - Value stored as `leadSessionId` in the config.
 * @param members - Teammate names to list after the lead.
 */
function writeLaunchConfig(
  teamName: string,
  projectPath: string,
  leadSessionId: string,
  members: string[]
): void {
  const teamDir = path.join(tempTeamsBase, teamName);
  const teammateEntries = members.map((name) => ({ name }));
  const config = {
    name: teamName,
    projectPath,
    leadSessionId,
    members: [{ name: 'team-lead', agentType: 'team-lead' }, ...teammateEntries],
  };

  fs.mkdirSync(teamDir, { recursive: true });
  fs.writeFileSync(path.join(teamDir, 'config.json'), JSON.stringify(config), 'utf8');
}
|
||||
|
||||
/**
 * Test fixture: persists a launch-state snapshot for a team.
 *
 * Each member starts from a default "never spawned, hard-failed" record, and
 * the caller's fields are spread on top so they can override any default. The
 * snapshot is built with `launchPhase: 'finished'` and written as
 * pretty-printed JSON with a trailing newline to the team's launch-state path.
 *
 * @param teamName - Team whose launch state is written.
 * @param leadSessionId - Lead session id recorded in the snapshot.
 * @param members - Per-member overrides keyed by member name; the keys also
 *   become the snapshot's expected member list.
 */
function writeLaunchState(
  teamName: string,
  leadSessionId: string,
  members: Record<string, Record<string, unknown>>
): void {
  const memberNames = Object.keys(members);
  const memberRecords = Object.fromEntries(
    memberNames.map((name) => {
      const record = {
        name,
        launchState: 'failed_to_start',
        agentToolAccepted: false,
        runtimeAlive: false,
        bootstrapConfirmed: false,
        hardFailure: true,
        hardFailureReason: 'Teammate was never spawned during launch.',
        lastEvaluatedAt: new Date().toISOString(),
        // Caller-supplied fields win over the defaults above.
        ...members[name],
      };
      return [name, record];
    })
  );

  const snapshot = createPersistedLaunchSnapshot({
    teamName,
    leadSessionId,
    launchPhase: 'finished',
    expectedMembers: memberNames,
    members: memberRecords as any,
  });

  const serialized = `${JSON.stringify(snapshot, null, 2)}\n`;
  fs.writeFileSync(getTeamLaunchStatePath(teamName), serialized, 'utf8');
}
|
||||
|
||||
describe('TeamProvisioningService', () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
|
|
@ -100,7 +159,6 @@ describe('TeamProvisioningService', () => {
|
|||
fs.mkdirSync(tempProjectsBase, { recursive: true });
|
||||
});
|
||||
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
try {
|
||||
|
|
@ -389,14 +447,12 @@ describe('TeamProvisioningService', () => {
|
|||
|
||||
it('expands teammate permission suggestions to the operational tool set only', async () => {
|
||||
allowConsoleLogs();
|
||||
const svc = new TeamProvisioningService(
|
||||
{
|
||||
getConfig: vi.fn(async () => ({
|
||||
projectPath: tempClaudeRoot,
|
||||
members: [{ cwd: tempClaudeRoot }],
|
||||
})),
|
||||
} as any
|
||||
);
|
||||
const svc = new TeamProvisioningService({
|
||||
getConfig: vi.fn(async () => ({
|
||||
projectPath: tempClaudeRoot,
|
||||
members: [{ cwd: tempClaudeRoot }],
|
||||
})),
|
||||
} as any);
|
||||
|
||||
await (svc as any).respondToTeammatePermission(
|
||||
{ teamName: 'ops-team' },
|
||||
|
|
@ -427,14 +483,12 @@ describe('TeamProvisioningService', () => {
|
|||
|
||||
it('does not broaden admin/runtime teammate permission suggestions', async () => {
|
||||
allowConsoleLogs();
|
||||
const svc = new TeamProvisioningService(
|
||||
{
|
||||
getConfig: vi.fn(async () => ({
|
||||
projectPath: tempClaudeRoot,
|
||||
members: [{ cwd: tempClaudeRoot }],
|
||||
})),
|
||||
} as any
|
||||
);
|
||||
const svc = new TeamProvisioningService({
|
||||
getConfig: vi.fn(async () => ({
|
||||
projectPath: tempClaudeRoot,
|
||||
members: [{ cwd: tempClaudeRoot }],
|
||||
})),
|
||||
} as any);
|
||||
|
||||
await (svc as any).respondToTeammatePermission(
|
||||
{ teamName: 'ops-team' },
|
||||
|
|
@ -516,4 +570,107 @@ describe('TeamProvisioningService', () => {
|
|||
})
|
||||
).toBe('Questions (2): First question with extra spacing.');
|
||||
});
|
||||
|
||||
// Shared driver for the two --resume tests below. It stubs the launch
// collaborators on a fresh TeamProvisioningService, makes `spawnCli` throw so
// the launch rejects at the spawn call, asserts that rejection, and returns the
// argument list passed to the first `spawnCli` call. The fixture files
// (config and launch state) must be written before calling this.
const launchAndCaptureSpawnArgs = async (
  teamName: string,
  leadSessionId: string,
  expectedMemberNames: string[]
): Promise<string[]> => {
  vi.mocked(ClaudeBinaryResolver.resolve).mockResolvedValue('/mock/claude');
  vi.mocked(spawnCli).mockImplementation(() => {
    throw new Error('launch spawn EINVAL');
  });

  const svc = new TeamProvisioningService(undefined, undefined, undefined, undefined, {
    writeConfigFile: vi.fn(async () => '/mock/mcp-config-launch.json'),
    removeConfigFile: vi.fn(async () => {}),
  } as any);
  (svc as any).buildProvisioningEnv = vi.fn(async () => ({
    env: { ANTHROPIC_API_KEY: 'test' },
    authSource: 'anthropic_api_key',
  }));
  (svc as any).resolveLaunchExpectedMembers = vi.fn(async () => ({
    members: expectedMemberNames.map((name) => ({ name })),
    source: 'members-meta',
    warning: undefined,
  }));
  (svc as any).normalizeTeamConfigForLaunch = vi.fn(async () => {});
  (svc as any).assertConfigLeadOnlyForLaunch = vi.fn(async () => {});
  (svc as any).updateConfigProjectPath = vi.fn(async () => {});
  (svc as any).restorePrelaunchConfig = vi.fn(async () => {});
  (svc as any).validateAgentTeamsMcpRuntime = vi.fn(async () => {});
  // Only paths ending in `<leadSessionId>.jsonl` are reported as existing.
  (svc as any).pathExists = vi.fn(async (targetPath: string) =>
    targetPath.endsWith(`${leadSessionId}.jsonl`)
  );

  await expect(svc.launchTeam({ teamName, cwd: tempClaudeRoot }, () => {})).rejects.toThrow(
    'launch spawn EINVAL'
  );

  return vi.mocked(spawnCli).mock.calls[0]?.[1] as string[];
};

it('skips --resume when the persisted launch state shows no teammate ever spawned', async () => {
  allowConsoleLogs();
  const teamName = 'resume-skip-team';
  const leadSessionId = 'lead-session-skip';
  writeLaunchConfig(teamName, tempClaudeRoot, leadSessionId, ['alice', 'bob']);
  // Neither teammate has an accepted spawn, a live runtime or a confirmed bootstrap.
  writeLaunchState(teamName, leadSessionId, {
    alice: {
      launchState: 'failed_to_start',
    },
    bob: {
      launchState: 'starting',
      hardFailure: false,
    },
  });

  const launchArgs = await launchAndCaptureSpawnArgs(teamName, leadSessionId, ['alice', 'bob']);

  expect(launchArgs).toBeTruthy();
  expect(launchArgs).not.toContain('--resume');
  expect(launchArgs).not.toContain(leadSessionId);
});

it('keeps --resume when a teammate had an accepted spawn before failing bootstrap', async () => {
  allowConsoleLogs();
  const teamName = 'resume-keep-team';
  const leadSessionId = 'lead-session-keep';
  const acceptedAt = '2026-04-14T12:00:00.000Z';
  writeLaunchConfig(teamName, tempClaudeRoot, leadSessionId, ['alice']);
  // alice failed to start, but her spawn was accepted, so resume must be kept.
  writeLaunchState(teamName, leadSessionId, {
    alice: {
      launchState: 'failed_to_start',
      agentToolAccepted: true,
      firstSpawnAcceptedAt: acceptedAt,
      hardFailureReason: 'Teammate did not join within the launch grace window.',
    },
  });

  const launchArgs = await launchAndCaptureSpawnArgs(teamName, leadSessionId, ['alice']);

  expect(launchArgs).toContain('--resume');
  expect(launchArgs).toContain(leadSessionId);
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Reference in a new issue