fix(team): harden teammate restart lifecycle
This commit is contained in:
parent
cd52660f88
commit
1d3d7e1f1f
6 changed files with 1332 additions and 30 deletions
|
|
@ -1311,6 +1311,7 @@ export class TeamDataService {
|
|||
}
|
||||
nextByName.add(name.toLowerCase());
|
||||
const prev = existingByName.get(name.toLowerCase());
|
||||
const isSameActiveMember = Boolean(prev && prev.removedAt == null);
|
||||
return {
|
||||
name,
|
||||
role: member.role?.trim() || undefined,
|
||||
|
|
@ -1322,6 +1323,7 @@ export class TeamDataService {
|
|||
? member.effort
|
||||
: undefined,
|
||||
agentType: prev?.agentType ?? 'general-purpose',
|
||||
agentId: isSameActiveMember ? prev?.agentId : undefined,
|
||||
color: prev?.color,
|
||||
joinedAt: prev?.joinedAt ?? joinedAt,
|
||||
removedAt: undefined,
|
||||
|
|
|
|||
|
|
@ -1065,19 +1065,57 @@ function sleep(ms: number): Promise<void> {
|
|||
async function waitForPidsToExit(
|
||||
pids: readonly number[],
|
||||
opts: { timeoutMs: number; pollMs: number }
|
||||
): Promise<void> {
|
||||
): Promise<number[]> {
|
||||
if (pids.length === 0) {
|
||||
return;
|
||||
return [];
|
||||
}
|
||||
|
||||
const deadline = Date.now() + opts.timeoutMs;
|
||||
let remainingPids = [...new Set(pids)];
|
||||
while (Date.now() < deadline) {
|
||||
const remaining = pids.filter((pid) => isProcessAlive(pid));
|
||||
if (remaining.length === 0) {
|
||||
return;
|
||||
remainingPids = remainingPids.filter((pid) => isProcessAlive(pid));
|
||||
if (remainingPids.length === 0) {
|
||||
return [];
|
||||
}
|
||||
await sleep(opts.pollMs);
|
||||
}
|
||||
|
||||
return remainingPids;
|
||||
}
|
||||
|
||||
async function waitForTmuxPanesToExit(
|
||||
paneIds: readonly string[],
|
||||
opts: { timeoutMs: number; pollMs: number }
|
||||
): Promise<string[]> {
|
||||
const normalizedPaneIds = [...new Set(paneIds.map((paneId) => paneId.trim()).filter(Boolean))];
|
||||
if (normalizedPaneIds.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const deadline = Date.now() + opts.timeoutMs;
|
||||
let remainingPaneIds = normalizedPaneIds;
|
||||
let lastError: unknown = null;
|
||||
while (Date.now() < deadline) {
|
||||
let livePanePidById: Map<string, number>;
|
||||
try {
|
||||
livePanePidById = await listTmuxPanePidsForCurrentPlatform(remainingPaneIds);
|
||||
lastError = null;
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
await sleep(opts.pollMs);
|
||||
continue;
|
||||
}
|
||||
remainingPaneIds = remainingPaneIds.filter((paneId) => livePanePidById.has(paneId));
|
||||
if (remainingPaneIds.length === 0) {
|
||||
return [];
|
||||
}
|
||||
await sleep(opts.pollMs);
|
||||
}
|
||||
|
||||
if (lastError) {
|
||||
throw lastError;
|
||||
}
|
||||
return remainingPaneIds;
|
||||
}
|
||||
|
||||
async function waitForChildProcessToExit(
|
||||
|
|
@ -4100,6 +4138,23 @@ export class TeamProvisioningService {
|
|||
}
|
||||
}
|
||||
|
||||
private clearMemberSpawnToolTracking(run: ProvisioningRun, memberName: string): void {
|
||||
let removed = false;
|
||||
for (const [toolUseId, trackedMemberName] of run.memberSpawnToolUseIds.entries()) {
|
||||
if (trackedMemberName !== memberName) continue;
|
||||
run.memberSpawnToolUseIds.delete(toolUseId);
|
||||
removed = true;
|
||||
}
|
||||
|
||||
if (removed) {
|
||||
this.appendMemberBootstrapDiagnostic(
|
||||
run,
|
||||
memberName,
|
||||
'cleared stale spawn tool tracking before manual restart'
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update spawn status for a specific team member and emit a change event.
|
||||
*/
|
||||
|
|
@ -4182,6 +4237,11 @@ export class TeamProvisioningService {
|
|||
next.launchState = 'failed_to_start';
|
||||
} else if (status === 'offline') {
|
||||
Object.assign(next, createInitialMemberSpawnStatusEntry(), { updatedAt });
|
||||
next.error = undefined;
|
||||
next.hardFailureReason = undefined;
|
||||
next.livenessSource = undefined;
|
||||
next.firstSpawnAcceptedAt = undefined;
|
||||
next.lastHeartbeatAt = undefined;
|
||||
}
|
||||
|
||||
next.launchState = deriveMemberLaunchState(next);
|
||||
|
|
@ -4202,8 +4262,12 @@ export class TeamProvisioningService {
|
|||
}
|
||||
|
||||
run.memberSpawnStatuses.set(memberName, next);
|
||||
if ((status === 'online' && next.bootstrapConfirmed) || status === 'offline') {
|
||||
run.pendingMemberRestarts.delete(memberName);
|
||||
if (
|
||||
(status === 'online' && (next.bootstrapConfirmed || livenessSource === 'process')) ||
|
||||
status === 'offline' ||
|
||||
status === 'error'
|
||||
) {
|
||||
run.pendingMemberRestarts?.delete(memberName);
|
||||
}
|
||||
this.syncMemberLaunchGraceCheck(run, memberName, next);
|
||||
|
||||
|
|
@ -4441,20 +4505,38 @@ export class TeamProvisioningService {
|
|||
throw new Error(`Team "${teamName}" is not currently running`);
|
||||
}
|
||||
|
||||
const config = await this.configReader.getConfig(teamName);
|
||||
const configuredMembers = config?.members ?? [];
|
||||
let metaMembers: Awaited<ReturnType<TeamMembersMetaStore['getMembers']>> = [];
|
||||
try {
|
||||
metaMembers = await this.membersMetaStore.getMembers(teamName);
|
||||
} catch {
|
||||
metaMembers = [];
|
||||
}
|
||||
const readCurrentConfiguredMember = async (): Promise<{
|
||||
config: TeamConfig | null;
|
||||
configuredMembers: TeamConfig['members'];
|
||||
metaMembers: Awaited<ReturnType<TeamMembersMetaStore['getMembers']>>;
|
||||
configuredMember: ReturnType<TeamProvisioningService['resolveEffectiveConfiguredMember']>;
|
||||
}> => {
|
||||
const config = await this.configReader.getConfig(teamName);
|
||||
const configuredMembers = config?.members ?? [];
|
||||
let metaMembers: Awaited<ReturnType<TeamMembersMetaStore['getMembers']>> = [];
|
||||
try {
|
||||
metaMembers = await this.membersMetaStore.getMembers(teamName);
|
||||
} catch {
|
||||
metaMembers = [];
|
||||
}
|
||||
|
||||
const configuredMember = this.resolveEffectiveConfiguredMember(
|
||||
configuredMembers,
|
||||
metaMembers,
|
||||
memberName
|
||||
);
|
||||
return {
|
||||
config,
|
||||
configuredMembers,
|
||||
metaMembers,
|
||||
configuredMember: this.resolveEffectiveConfiguredMember(
|
||||
configuredMembers,
|
||||
metaMembers,
|
||||
memberName
|
||||
),
|
||||
};
|
||||
};
|
||||
|
||||
let { config, configuredMembers, metaMembers, configuredMember } =
|
||||
await readCurrentConfiguredMember();
|
||||
if (!config) {
|
||||
throw new Error(`Team "${teamName}" configuration is no longer available`);
|
||||
}
|
||||
if (!configuredMember) {
|
||||
throw new Error(`Member "${memberName}" is not configured in team "${teamName}"`);
|
||||
}
|
||||
|
|
@ -4484,6 +4566,8 @@ export class TeamProvisioningService {
|
|||
);
|
||||
}
|
||||
|
||||
this.agentRuntimeSnapshotCache.delete(teamName);
|
||||
this.liveTeamAgentRuntimeMetadataCache.delete(teamName);
|
||||
const liveRuntimeByMember = await this.getLiveTeamAgentRuntimeMetadata(teamName);
|
||||
const livePids = new Set<number>();
|
||||
let hasAliveRuntimeWithoutPid = false;
|
||||
|
|
@ -4506,6 +4590,7 @@ export class TeamProvisioningService {
|
|||
);
|
||||
}
|
||||
|
||||
const tmuxPaneIdsToVerify: string[] = [];
|
||||
for (const persistedRuntimeMember of persistedRuntimeMembers) {
|
||||
const paneId =
|
||||
typeof persistedRuntimeMember.tmuxPaneId === 'string'
|
||||
|
|
@ -4515,6 +4600,7 @@ export class TeamProvisioningService {
|
|||
if (!paneId || backendType !== 'tmux') {
|
||||
continue;
|
||||
}
|
||||
tmuxPaneIdsToVerify.push(paneId);
|
||||
try {
|
||||
killTmuxPaneForCurrentPlatformSync(paneId);
|
||||
logger.info(
|
||||
|
|
@ -4542,15 +4628,73 @@ export class TeamProvisioningService {
|
|||
}
|
||||
|
||||
if (livePids.size > 0) {
|
||||
await waitForPidsToExit(Array.from(livePids), {
|
||||
const lingeringPids = await waitForPidsToExit(Array.from(livePids), {
|
||||
timeoutMs: 1_500,
|
||||
pollMs: 100,
|
||||
});
|
||||
if (lingeringPids.length > 0) {
|
||||
throw new Error(
|
||||
`Restart for teammate "${memberName}" is still waiting for the previous process to exit (${lingeringPids.join(', ')}).`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (tmuxPaneIdsToVerify.length > 0) {
|
||||
let lingeringPaneIds: string[];
|
||||
try {
|
||||
lingeringPaneIds = await waitForTmuxPanesToExit(tmuxPaneIdsToVerify, {
|
||||
timeoutMs: 1_500,
|
||||
pollMs: 100,
|
||||
});
|
||||
} catch (error) {
|
||||
throw new Error(
|
||||
`Restart for teammate "${memberName}" could not verify that the previous tmux pane exited: ${
|
||||
error instanceof Error ? error.message : String(error)
|
||||
}`
|
||||
);
|
||||
}
|
||||
if (lingeringPaneIds.length > 0) {
|
||||
throw new Error(
|
||||
`Restart for teammate "${memberName}" is still waiting for the previous tmux pane to exit (${lingeringPaneIds.join(', ')}).`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
this.setMemberSpawnStatus(run, memberName, 'offline');
|
||||
|
||||
const latestRunId = this.getAliveRunId(teamName);
|
||||
const currentRun = this.runs.get(runId);
|
||||
if (
|
||||
latestRunId !== runId ||
|
||||
!currentRun ||
|
||||
currentRun !== run ||
|
||||
currentRun.processKilled ||
|
||||
currentRun.cancelRequested
|
||||
) {
|
||||
throw new Error(`Team "${teamName}" is not currently running`);
|
||||
}
|
||||
|
||||
({ config, configuredMembers, metaMembers, configuredMember } =
|
||||
await readCurrentConfiguredMember());
|
||||
if (!config) {
|
||||
throw new Error(`Team "${teamName}" configuration disappeared while restart was in progress`);
|
||||
}
|
||||
if (!configuredMember) {
|
||||
throw new Error(
|
||||
`Member "${memberName}" is no longer configured in team "${teamName}" after restart preparation`
|
||||
);
|
||||
}
|
||||
if (configuredMember.removedAt) {
|
||||
throw new Error(`Member "${memberName}" was removed while restart was in progress`);
|
||||
}
|
||||
if (isLeadMember({ name: memberName, agentType: configuredMember.agentType })) {
|
||||
throw new Error('Lead restart is not supported from member controls');
|
||||
}
|
||||
|
||||
this.agentRuntimeSnapshotCache.delete(teamName);
|
||||
this.liveTeamAgentRuntimeMetadataCache.delete(teamName);
|
||||
this.setMemberSpawnStatus(run, memberName, 'offline');
|
||||
this.resetRuntimeToolActivity(run, memberName);
|
||||
this.clearMemberSpawnToolTracking(run, memberName);
|
||||
this.setMemberSpawnStatus(run, memberName, 'spawning');
|
||||
this.appendMemberBootstrapDiagnostic(run, memberName, 'manual restart requested from UI');
|
||||
run.pendingMemberRestarts.set(memberName, {
|
||||
|
|
@ -4613,6 +4757,10 @@ export class TeamProvisioningService {
|
|||
return;
|
||||
}
|
||||
if (!entry.firstSpawnAcceptedAt) {
|
||||
if (existing) {
|
||||
clearTimeout(existing);
|
||||
this.pendingTimeouts.delete(key);
|
||||
}
|
||||
return;
|
||||
}
|
||||
const remainingMs =
|
||||
|
|
@ -8402,6 +8550,23 @@ export class TeamProvisioningService {
|
|||
});
|
||||
}
|
||||
|
||||
for (const member of metaMembers) {
|
||||
const memberName = typeof member?.name === 'string' ? member.name.trim() : '';
|
||||
if (!memberName || isLeadMember({ name: memberName, agentType: member.agentType })) {
|
||||
continue;
|
||||
}
|
||||
const runtimeModel =
|
||||
member.model?.trim() ||
|
||||
this.findConfiguredMemberModel(configuredMembers, memberName) ||
|
||||
this.findEffectiveRunMemberModel(run, memberName);
|
||||
upsertMetadata(memberName, {
|
||||
...(runtimeModel ? { model: runtimeModel } : {}),
|
||||
...(typeof member.agentId === 'string' && member.agentId.trim()
|
||||
? { agentId: member.agentId.trim() }
|
||||
: {}),
|
||||
});
|
||||
}
|
||||
|
||||
for (const member of run?.effectiveMembers ?? []) {
|
||||
const memberName = member.name?.trim() ?? '';
|
||||
if (!memberName || isLeadMember(member) || memberName.toLowerCase() === 'user') {
|
||||
|
|
@ -8448,12 +8613,14 @@ export class TeamProvisioningService {
|
|||
? processPid
|
||||
: undefined;
|
||||
const status = this.findTrackedMemberSpawnStatus(run, memberName);
|
||||
const mayInferAliveFromStatusOnly = status?.launchState !== 'failed_to_start';
|
||||
const alive =
|
||||
typeof resolvedPid === 'number' && resolvedPid > 0
|
||||
? true
|
||||
: backendType === 'tmux'
|
||||
? false
|
||||
: Boolean(status?.runtimeAlive || status?.bootstrapConfirmed);
|
||||
: mayInferAliveFromStatusOnly &&
|
||||
Boolean(status?.runtimeAlive || status?.bootstrapConfirmed);
|
||||
metadataByMember.set(memberName, {
|
||||
...metadata,
|
||||
alive,
|
||||
|
|
@ -9670,6 +9837,16 @@ export class TeamProvisioningService {
|
|||
}
|
||||
|
||||
if (outcome === 'already_running') {
|
||||
if (run.pendingMemberRestarts.has(memberName)) {
|
||||
run.pendingMemberRestarts.delete(memberName);
|
||||
this.setMemberSpawnStatus(
|
||||
run,
|
||||
memberName,
|
||||
'error',
|
||||
buildRestartStillRunningReason(memberName)
|
||||
);
|
||||
return true;
|
||||
}
|
||||
this.setMemberSpawnStatus(run, memberName, 'online', undefined, 'process');
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4179,11 +4179,14 @@ export const createTeamSlice: StateCreator<AppState, [], [], TeamSlice> = (set,
|
|||
},
|
||||
|
||||
restartMember: async (teamName: string, memberName: string) => {
|
||||
await unwrapIpc('team:restartMember', () => api.teams.restartMember(teamName, memberName));
|
||||
await Promise.all([
|
||||
get().fetchMemberSpawnStatuses(teamName),
|
||||
get().fetchTeamAgentRuntime(teamName),
|
||||
]);
|
||||
try {
|
||||
await unwrapIpc('team:restartMember', () => api.teams.restartMember(teamName, memberName));
|
||||
} finally {
|
||||
await Promise.allSettled([
|
||||
get().fetchMemberSpawnStatuses(teamName),
|
||||
get().fetchTeamAgentRuntime(teamName),
|
||||
]);
|
||||
}
|
||||
},
|
||||
|
||||
removeMember: async (teamName: string, memberName: string) => {
|
||||
|
|
|
|||
|
|
@ -528,6 +528,128 @@ describe('TeamDataService', () => {
|
|||
expect(writeMembers).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('preserves agentId for existing members during replaceMembers', async () => {
|
||||
const writeMembers = vi.fn(async () => {});
|
||||
const membersMetaStore = {
|
||||
getMembers: vi.fn(async () => [
|
||||
{
|
||||
name: 'alice',
|
||||
role: 'Developer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.4-mini',
|
||||
effort: 'medium',
|
||||
agentType: 'general-purpose',
|
||||
agentId: 'alice@runtime-team',
|
||||
joinedAt: 1710000000000,
|
||||
},
|
||||
]),
|
||||
writeMembers,
|
||||
} as never;
|
||||
|
||||
const service = new TeamDataService(
|
||||
{ getConfig: vi.fn(), listTeams: vi.fn() } as never,
|
||||
{ getTasks: vi.fn(async () => []) } as never,
|
||||
{ listInboxNames: vi.fn(async () => []), getMessages: vi.fn(async () => []) } as never,
|
||||
{} as never,
|
||||
{} as never,
|
||||
{ resolveMembers: vi.fn(() => []) } as never,
|
||||
{
|
||||
getState: vi.fn(async () => ({ teamName: 'runtime-team', reviewers: [], tasks: {} })),
|
||||
} as never,
|
||||
{} as never,
|
||||
membersMetaStore,
|
||||
{ readMessages: vi.fn(async () => []) } as never
|
||||
);
|
||||
|
||||
await service.replaceMembers('runtime-team', {
|
||||
members: [
|
||||
{
|
||||
name: 'alice',
|
||||
role: 'Reviewer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.2',
|
||||
effort: 'high',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(writeMembers).toHaveBeenCalledWith(
|
||||
'runtime-team',
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
name: 'alice',
|
||||
role: 'Reviewer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.2',
|
||||
effort: 'high',
|
||||
agentId: 'alice@runtime-team',
|
||||
}),
|
||||
])
|
||||
);
|
||||
});
|
||||
|
||||
it('does not carry over agentId from a previously removed member with the same name', async () => {
|
||||
const writeMembers = vi.fn(async () => {});
|
||||
const membersMetaStore = {
|
||||
getMembers: vi.fn(async () => [
|
||||
{
|
||||
name: 'alice',
|
||||
role: 'Developer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.4-mini',
|
||||
effort: 'medium',
|
||||
agentType: 'general-purpose',
|
||||
agentId: 'alice@old-runtime-team',
|
||||
joinedAt: 1710000000000,
|
||||
removedAt: 1715000000000,
|
||||
},
|
||||
]),
|
||||
writeMembers,
|
||||
} as never;
|
||||
|
||||
const service = new TeamDataService(
|
||||
{ getConfig: vi.fn(), listTeams: vi.fn() } as never,
|
||||
{ getTasks: vi.fn(async () => []) } as never,
|
||||
{ listInboxNames: vi.fn(async () => []), getMessages: vi.fn(async () => []) } as never,
|
||||
{} as never,
|
||||
{} as never,
|
||||
{ resolveMembers: vi.fn(() => []) } as never,
|
||||
{
|
||||
getState: vi.fn(async () => ({ teamName: 'runtime-team', reviewers: [], tasks: {} })),
|
||||
} as never,
|
||||
{} as never,
|
||||
membersMetaStore,
|
||||
{ readMessages: vi.fn(async () => []) } as never
|
||||
);
|
||||
|
||||
await service.replaceMembers('runtime-team', {
|
||||
members: [
|
||||
{
|
||||
name: 'alice',
|
||||
role: 'Reviewer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.2',
|
||||
effort: 'high',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(writeMembers).toHaveBeenCalledWith(
|
||||
'runtime-team',
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
name: 'alice',
|
||||
role: 'Reviewer',
|
||||
providerId: 'codex',
|
||||
model: 'gpt-5.2',
|
||||
effort: 'high',
|
||||
agentId: undefined,
|
||||
removedAt: undefined,
|
||||
}),
|
||||
])
|
||||
);
|
||||
});
|
||||
|
||||
it('keeps getTeamData read-only and skips kanban garbage-collect', async () => {
|
||||
const order: string[] = [];
|
||||
const tasks: TeamTask[] = [
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -2421,6 +2421,22 @@ describe('teamSlice actions', () => {
|
|||
expect(store.getState().teamAgentRuntimeByTeam['my-team']).toEqual(createRuntimeSnapshot());
|
||||
});
|
||||
|
||||
it('restartMember refreshes spawn statuses and runtime snapshot even when restart fails', async () => {
|
||||
const store = createSliceStore();
|
||||
const refreshSpawnStatuses = vi.fn(async (_teamName: string) => undefined);
|
||||
const refreshRuntimeSnapshot = vi.fn(async (_teamName: string) => undefined);
|
||||
store.setState({
|
||||
fetchMemberSpawnStatuses: refreshSpawnStatuses,
|
||||
fetchTeamAgentRuntime: refreshRuntimeSnapshot,
|
||||
});
|
||||
hoisted.restartMember.mockRejectedValueOnce(new Error('restart failed'));
|
||||
|
||||
await expect(store.getState().restartMember('my-team', 'alice')).rejects.toThrow('restart failed');
|
||||
|
||||
expect(refreshSpawnStatuses).toHaveBeenCalledWith('my-team');
|
||||
expect(refreshRuntimeSnapshot).toHaveBeenCalledWith('my-team');
|
||||
});
|
||||
|
||||
it('clears stale runtime snapshots on delete', async () => {
|
||||
const store = createSliceStore();
|
||||
store.setState({
|
||||
|
|
|
|||
Loading…
Reference in a new issue