fix(team): harden restart and session refresh flows

This commit is contained in:
777genius 2026-04-19 22:34:43 +03:00
parent 4cab3052eb
commit cb62acb4fb
5 changed files with 178 additions and 9 deletions

View file

@ -885,6 +885,15 @@ function buildRestartStillRunningReason(memberName: string): string {
);
}
function buildRestartDuplicateUnconfirmedReason(memberName: string, rawReason?: string): string {
const suffix = rawReason?.trim()
? ` Agent returned duplicate_skipped with unrecognized reason "${rawReason.trim()}".`
: ' Agent returned duplicate_skipped without a reason.';
return (
`Restart for teammate "${memberName}" could not be confirmed and may not have applied.` + suffix
);
}
function buildRestartGraceTimeoutReason(memberName: string): string {
return `Teammate "${memberName}" did not rejoin within the restart grace window.`;
}
@ -4048,8 +4057,25 @@ export class TeamProvisioningService {
const detail =
parsedStatus.reason === 'already_running'
? 'duplicate spawn skipped - already running'
: 'duplicate spawn skipped - teammate bootstrap still pending';
: parsedStatus.reason === 'bootstrap_pending'
? 'duplicate spawn skipped - teammate bootstrap still pending'
: parsedStatus.rawReason
? `duplicate spawn skipped - unrecognized reason: ${parsedStatus.rawReason}`
: 'duplicate spawn skipped - reason unavailable';
this.appendMemberBootstrapDiagnostic(run, spawnedMemberName, detail);
if (pendingRestart && !parsedStatus.reason) {
logger.warn(
`[${run.teamName}] Restart for teammate "${spawnedMemberName}" returned duplicate_skipped without a recognized reason`
);
run.pendingMemberRestarts.delete(spawnedMemberName);
this.setMemberSpawnStatus(
run,
spawnedMemberName,
'error',
buildRestartDuplicateUnconfirmedReason(spawnedMemberName, parsedStatus.rawReason)
);
return;
}
if (parsedStatus.reason === 'already_running') {
if (pendingRestart) {
run.pendingMemberRestarts.delete(spawnedMemberName);

View file

@ -10,6 +10,16 @@ import type { Session, SessionSortMode } from '@renderer/types/data';
import type { StateCreator } from 'zustand';
const logger = createLogger('Store:session');
const SESSION_IN_PLACE_RETRY_DELAY_MS = 150;
function isTransientSessionsPaginatedIpcError(error: unknown): boolean {
const message = error instanceof Error ? error.message : String(error ?? '');
return (
message.includes(
"Error invoking remote method 'get-sessions-paginated': reply was never sent"
) || message.includes("No handler registered for 'get-sessions-paginated'")
);
}
/**
* Tracks the latest in-place refresh generation per project.
@ -257,20 +267,21 @@ export const createSessionSlice: StateCreator<AppState, [], [], SessionSlice> =
const generation = (projectRefreshGeneration.get(projectId) ?? 0) + 1;
projectRefreshGeneration.set(projectId, generation);
try {
const fetchPage = async () => {
const { connectionMode } = get();
const result = await api.getSessionsPaginated(projectId, null, 20, {
return api.getSessionsPaginated(projectId, null, 20, {
includeTotalCount: false,
prefilterAll: false,
metadataLevel: connectionMode === 'ssh' ? 'light' : 'deep',
});
};
const applyResult = (result: Awaited<ReturnType<typeof api.getSessionsPaginated>>) => {
// Drop stale responses from older in-flight refreshes
if (projectRefreshGeneration.get(projectId) !== generation) {
return;
}
// Update sessions without loading state
set({
sessions: result.sessions,
sessionsCursor: result.nextCursor,
@ -278,7 +289,27 @@ export const createSessionSlice: StateCreator<AppState, [], [], SessionSlice> =
sessionsTotalCount: result.totalCount,
// Don't touch sessionsLoading - keep it as-is
});
};
try {
const result = await fetchPage();
applyResult(result);
} catch (error) {
if (isTransientSessionsPaginatedIpcError(error) && get().selectedProjectId === projectId) {
logger.warn('refreshSessionsInPlace transient IPC error - retrying once');
try {
await new Promise((resolve) => setTimeout(resolve, SESSION_IN_PLACE_RETRY_DELAY_MS));
if (get().selectedProjectId !== projectId) {
return;
}
const retried = await fetchPage();
applyResult(retried);
return;
} catch (retryError) {
logger.error('refreshSessionsInPlace retry error:', retryError);
return;
}
}
logger.error('refreshSessionsInPlace error:', error);
// Don't set error state - this is a background refresh
}

View file

@ -9,7 +9,8 @@ export type AgentToolDuplicateSkipReason = 'already_running' | 'bootstrap_pendin
export interface ParsedAgentToolResultStatus {
status: 'duplicate_skipped';
reason: AgentToolDuplicateSkipReason;
reason?: AgentToolDuplicateSkipReason;
rawReason?: string;
name?: string;
teamName?: string;
}
@ -262,14 +263,14 @@ export function parseAgentToolResultStatus(content: unknown): ParsedAgentToolRes
return null;
}
const reason = fields.get('reason');
if (reason !== 'already_running' && reason !== 'bootstrap_pending') {
return null;
}
const rawReason = fields.get('reason');
const reason =
rawReason === 'already_running' || rawReason === 'bootstrap_pending' ? rawReason : undefined;
return {
status: 'duplicate_skipped',
reason,
...(rawReason ? { rawReason } : {}),
name: fields.get('name'),
teamName: fields.get('team_name'),
};

View file

@ -1064,6 +1064,88 @@ describe('TeamProvisioningService', () => {
expect(run.pendingMemberRestarts.has('bob')).toBe(true);
});
it('fails a codex teammate restart immediately when Agent returns duplicate_skipped without a reason', async () => {
allowConsoleLogs();
const svc = new TeamProvisioningService();
const run = createMemberSpawnRun({
teamName: 'codex-team',
expectedMembers: ['jack'],
memberSpawnStatuses: new Map([
[
'jack',
createMemberSpawnStatusEntry({
launchState: 'failed_to_start',
hardFailure: true,
hardFailureReason: 'Teammate was never spawned during launch.',
error: 'Teammate was never spawned during launch.',
}),
],
]),
});
run.child = { pid: 111 };
run.processKilled = false;
run.cancelRequested = false;
(svc as any).sendMessageToRun = vi.fn(async () => {});
(svc as any).configReader = {
getConfig: vi.fn(async () => ({
name: 'Codex Team',
members: [{ name: 'team-lead', agentType: 'team-lead' }],
})),
};
(svc as any).membersMetaStore = {
getMembers: vi.fn(async () => [
{
name: 'jack',
role: 'Developer',
providerId: 'codex',
model: 'gpt-5.4',
effort: 'medium',
agentType: 'general-purpose',
},
]),
};
(svc as any).readPersistedRuntimeMembers = vi.fn(() => []);
(svc as any).getLiveTeamAgentRuntimeMetadata = vi.fn(async () => new Map());
(svc as any).aliveRunByTeam.set('codex-team', run.runId);
(svc as any).runs.set(run.runId, run);
await svc.restartMember('codex-team', 'jack');
run.activeToolCalls.set('tool-agent-1', {
memberName: 'jack',
toolUseId: 'tool-agent-1',
toolName: 'Agent',
preview: 'Spawn teammate jack',
startedAt: new Date().toISOString(),
state: 'running',
source: 'runtime',
});
run.memberSpawnToolUseIds.set('tool-agent-1', 'jack');
(svc as any).finishRuntimeToolActivity(
run,
'tool-agent-1',
[
{
type: 'text',
text: 'status: duplicate_skipped\nname: jack\nteam_name: codex-team',
},
],
false
);
expect(run.pendingMemberRestarts.has('jack')).toBe(false);
expect(run.memberSpawnStatuses.get('jack')).toMatchObject({
status: 'error',
launchState: 'failed_to_start',
runtimeAlive: false,
hardFailure: true,
hardFailureReason:
'Restart for teammate "jack" could not be confirmed and may not have applied. Agent returned duplicate_skipped without a reason.',
});
});
it('waits for a killed tmux pane to disappear before sending a restart request', async () => {
vi.useFakeTimers();

View file

@ -270,6 +270,35 @@ describe('sessionSlice', () => {
await Promise.all([first, second]);
expect(store.getState().sessions[0]?.id).toBe('newest');
});
it('should retry once on transient invoke lifecycle errors', async () => {
vi.useFakeTimers();
store.setState({
selectedProjectId: 'project-1',
sessions: [{ id: 'seed' }] as never[],
});
mockAPI.getSessionsPaginated
.mockRejectedValueOnce(
new Error(
"Error invoking remote method 'get-sessions-paginated': reply was never sent"
)
)
.mockResolvedValueOnce({
sessions: [{ id: 'recovered' }] as never[],
nextCursor: null,
hasMore: false,
totalCount: 1,
});
const refreshPromise = store.getState().refreshSessionsInPlace('project-1');
await vi.advanceTimersByTimeAsync(150);
await refreshPromise;
expect(mockAPI.getSessionsPaginated).toHaveBeenCalledTimes(2);
expect(store.getState().sessions[0]?.id).toBe('recovered');
vi.useRealTimers();
});
});
describe('fetchSessionDetail', () => {