feat: implement auth status check with retry mechanism

- Added a retry mechanism for checking authentication status in the CliInstallerService to handle stale locks after Ctrl+C interruptions.
- Introduced constants for maximum retries and delay between attempts to improve robustness.
- Updated logging to provide detailed feedback on retry attempts and outcomes.
This commit is contained in:
iliya 2026-03-01 18:51:11 +02:00
parent 0190ae77e8
commit 47d979a43d
3 changed files with 393 additions and 163 deletions

View file

@ -66,6 +66,12 @@ const EBUSY_MAX_RETRIES = 3;
/** Delay between EBUSY retries (multiplied by attempt number) */
const EBUSY_RETRY_DELAY_MS = 2000;
/**
 * Max attempts for the auth status check (covers stale locks after Ctrl+C).
 * The retry loop counts attempts from 1 up to and including this value, so it is the
 * total number of tries (first try included), not the number of additional retries.
 */
const AUTH_STATUS_MAX_RETRIES = 2;
/** Delay before retrying auth status check (ms) — gives previous process time to clean up */
const AUTH_STATUS_RETRY_DELAY_MS = 1500;
/**
* Build env for child processes with correct HOME.
* On Windows with non-ASCII usernames, process.env may have a broken HOME/USERPROFILE.
@ -284,19 +290,38 @@ export class CliInstallerService {
logger.warn('Failed to get CLI version:', getErrorMessage(err));
}
// Check auth status
try {
const { stdout: authStdout } = await execCli(binaryPath, ['auth', 'status'], {
timeout: VERSION_TIMEOUT_MS,
env: buildChildEnv(),
});
const auth = JSON.parse(authStdout.trim()) as { loggedIn?: boolean; authMethod?: string };
r.authLoggedIn = auth.loggedIn === true;
r.authMethod = auth.authMethod ?? null;
logger.info(`Auth status: loggedIn=${r.authLoggedIn}, method=${r.authMethod ?? 'null'}`);
} catch (err) {
logger.warn('Failed to check auth status:', getErrorMessage(err));
r.authLoggedIn = false;
// Check auth status with retry — covers stale lock files after Ctrl+C interruption
for (let authAttempt = 1; authAttempt <= AUTH_STATUS_MAX_RETRIES; authAttempt++) {
try {
const { stdout: authStdout } = await execCli(binaryPath, ['auth', 'status'], {
timeout: VERSION_TIMEOUT_MS,
env: buildChildEnv(),
});
const auth = JSON.parse(authStdout.trim()) as {
loggedIn?: boolean;
authMethod?: string;
};
r.authLoggedIn = auth.loggedIn === true;
r.authMethod = auth.authMethod ?? null;
logger.info(
`Auth status: loggedIn=${r.authLoggedIn}, method=${r.authMethod ?? 'null'}` +
(authAttempt > 1 ? ` (attempt ${authAttempt})` : '')
);
break;
} catch (err) {
if (authAttempt < AUTH_STATUS_MAX_RETRIES) {
logger.warn(
`Auth status check failed (attempt ${authAttempt}/${AUTH_STATUS_MAX_RETRIES}), ` +
`retrying in ${AUTH_STATUS_RETRY_DELAY_MS}ms: ${getErrorMessage(err)}`
);
await new Promise((resolve) => setTimeout(resolve, AUTH_STATUS_RETRY_DELAY_MS));
} else {
logger.warn(
`Auth status check failed after ${AUTH_STATUS_MAX_RETRIES} attempts: ${getErrorMessage(err)}`
);
r.authLoggedIn = false;
}
}
}
}

View file

@ -146,14 +146,16 @@ export class TeamConfigReader {
return null;
}
// Case-insensitive dedup: key is lowercase name, value keeps the original casing
const memberMap = new Map<string, TeamSummaryMember>();
const addMember = (m: TeamMember): void => {
const mergeMember = (m: TeamMember): void => {
const name = m.name?.trim();
if (!name) return;
const existing = memberMap.get(name);
memberMap.set(name, {
name,
const key = name.toLowerCase();
const existing = memberMap.get(key);
memberMap.set(key, {
name: existing?.name ?? name,
role: m.role?.trim() || existing?.role,
color: m.color?.trim() || existing?.color,
});
@ -162,7 +164,7 @@ export class TeamConfigReader {
if (config && Array.isArray(config.members)) {
for (const member of config.members) {
if (member && typeof member.name === 'string') {
addMember(member);
mergeMember(member);
}
}
}
@ -173,7 +175,7 @@ export class TeamConfigReader {
const metaMembers = await this.membersMetaStore.getMembers(teamName);
for (const member of metaMembers) {
if (!member.removedAt) {
addMember(member);
mergeMember(member);
}
}
} catch {

View file

@ -59,6 +59,8 @@ const UI_LOGS_TAIL_LIMIT = 128 * 1024;
const SHELL_ENV_TIMEOUT_MS = 12000;
const CLI_PREPARE_TIMEOUT_MS = 10000;
const PREFLIGHT_TIMEOUT_MS = 30000;
/**
 * Pause (ms) between preflight ping attempts; the same delay is also used before
 * respawning the CLI after an auth failure is detected during provisioning.
 */
const PREFLIGHT_AUTH_RETRY_DELAY_MS = 2000;
/**
 * Total number of preflight ping attempts. The preflight loop counts attempts from 1 up to
 * and including this value, so the first try is included in the count.
 */
const PREFLIGHT_AUTH_MAX_RETRIES = 2;
const KEYCHAIN_TIMEOUT_MS = 5000;
const FS_MONITOR_POLL_MS = 2000;
const TASK_WAIT_FALLBACK_MS = 15_000;
@ -144,6 +146,18 @@ interface ProvisioningRun {
detectedSessionId: string | null;
/** Lead process activity: 'active' during turn processing, 'idle' waiting for input, 'offline' after exit. */
leadActivityState: LeadActivityState;
/** Whether an auth failure retry was already attempted for this run. */
authFailureRetried: boolean;
/** Set to true while auth-failure respawn is in progress to prevent duplicate handling. */
authRetryInProgress: boolean;
/** Saved spawn context for auth-failure respawn. */
spawnContext: {
claudePath: string;
args: string[];
cwd: string;
env: NodeJS.ProcessEnv;
prompt: string;
} | null;
}
type LeadActivityState = 'active' | 'idle' | 'offline';
@ -872,6 +886,231 @@ export class TeamProvisioningService {
);
}
/**
 * Detects auth failure keywords in stderr/stdout during provisioning.
 *
 * - First detection: marks the retry as in progress and starts an asynchronous
 *   kill-and-respawn via `respawnAfterAuthFailure`.
 * - Detection after a retry was already made: kills the process, reports a `failed`
 *   progress update with login instructions, and cleans up the run.
 *
 * Does nothing when provisioning already completed, the process was already killed,
 * or a respawn is currently in flight.
 *
 * @param run - Provisioning run whose output is being inspected.
 * @param text - Output fragment that is tested with `isAuthFailureWarning`.
 * @param source - Label of the stream the text came from; only used in log messages.
 */
private handleAuthFailureInOutput(run: ProvisioningRun, text: string, source: string): void {
  if (run.provisioningComplete || run.processKilled || run.authRetryInProgress) return;
  if (!this.isAuthFailureWarning(text)) return;
  if (!run.authFailureRetried) {
    logger.warn(
      `[${run.teamName}] Auth failure detected in ${source} during provisioning — ` +
        `will kill process and retry after ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms`
    );
    run.authRetryInProgress = true;
    // The respawn is fire-and-forget, so a rejection must be handled here. Otherwise
    // authRetryInProgress would stay true after the run timeout has already been cleared,
    // and the run would hang without ever reporting a result.
    void this.respawnAfterAuthFailure(run).catch((error: unknown) => {
      const detail = error instanceof Error ? error.message : String(error);
      logger.error(`[${run.teamName}] Respawn after auth failure threw: ${detail}`);
      run.authRetryInProgress = false;
      run.processKilled = true;
      // Kill whatever process is currently attached (may be the freshly spawned one).
      killProcessTree(run.child);
      const progress = updateProgress(run, 'failed', 'Failed to respawn Claude CLI', {
        error: detail,
        cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
      });
      run.onProgress(progress);
      this.cleanupRun(run);
    });
  } else {
    logger.error(`[${run.teamName}] Auth failure detected in ${source} after retry — giving up`);
    // Mark as killed first so later output from the dying process is ignored by the guard above.
    run.processKilled = true;
    killProcessTree(run.child);
    const progress = updateProgress(run, 'failed', 'Authentication failed — CLI requires login', {
      error:
        'Claude CLI is not authenticated. Run `claude` in a terminal to complete login, ' +
        'or set ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN and try again.',
      cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
    });
    run.onProgress(progress);
    this.cleanupRun(run);
  }
}
/**
 * Kills the current process, waits `PREFLIGHT_AUTH_RETRY_DELAY_MS`, and respawns the CLI
 * with the saved spawn context. Reattaches the stream listeners, resends the prompt,
 * restarts the filesystem monitor (non-launch runs only) and re-arms the run timeout.
 *
 * Always clears `run.authRetryInProgress` before returning. Returns early without
 * respawning when no spawn context was saved or when cancellation was requested
 * during the delay. A synchronous spawn failure is reported as a `failed` progress
 * update followed by `cleanupRun`.
 *
 * @param run - Provisioning run whose child process is replaced.
 */
private async respawnAfterAuthFailure(run: ProvisioningRun): Promise<void> {
  const ctx = run.spawnContext;
  if (!ctx) {
    logger.error(`[${run.teamName}] Cannot respawn — no spawn context saved`);
    run.authRetryInProgress = false;
    return;
  }

  // Tear down current process without full cleanupRun (keep run alive)
  if (run.timeoutHandle) {
    clearTimeout(run.timeoutHandle);
    run.timeoutHandle = null;
  }
  this.stopFilesystemMonitor(run);
  const previousChild = run.child;
  if (previousChild) {
    previousChild.stdout?.removeAllListeners('data');
    previousChild.stderr?.removeAllListeners('data');
    previousChild.removeAllListeners('error');
    previousChild.removeAllListeners('exit');
    // An EventEmitter with no 'error' listener throws when 'error' is emitted. The old
    // process may still emit one (e.g. if it cannot be killed), so keep a listener that
    // only logs instead of letting that become an uncaught exception.
    previousChild.on('error', (error: Error) => {
      logger.warn(`[${run.teamName}] Ignoring error from replaced CLI process: ${error.message}`);
    });
    killProcessTree(previousChild);
    run.child = null;
  }

  // Reset buffers for fresh attempt
  run.stdoutBuffer = '';
  run.stderrBuffer = '';
  // From here on a second auth failure is treated as final by the detection logic.
  run.authFailureRetried = true;
  updateProgress(run, 'spawning', 'Auth failed — retrying after short delay');
  run.onProgress(run.progress);

  await sleep(PREFLIGHT_AUTH_RETRY_DELAY_MS);

  if (run.cancelRequested) {
    run.authRetryInProgress = false;
    return;
  }

  // Respawn
  let child: ReturnType<typeof spawn>;
  try {
    child = spawnCli(ctx.claudePath, ctx.args, {
      cwd: ctx.cwd,
      env: { ...ctx.env },
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (error) {
    run.authRetryInProgress = false;
    const progress = updateProgress(run, 'failed', 'Failed to respawn Claude CLI', {
      error: error instanceof Error ? error.message : String(error),
    });
    run.onProgress(progress);
    this.cleanupRun(run);
    return;
  }
  logger.info(
    `[${run.teamName}] Respawned CLI process after auth failure (pid=${child.pid ?? '?'})`
  );
  run.child = child;
  run.authRetryInProgress = false;
  updateProgress(run, 'spawning', 'CLI respawned — sending prompt', {
    pid: child.pid ?? undefined,
  });
  run.onProgress(run.progress);

  // Resend prompt
  if (child.stdin?.writable) {
    const message = JSON.stringify({
      type: 'user',
      message: {
        role: 'user',
        content: [{ type: 'text', text: ctx.prompt }],
      },
    });
    child.stdin.write(message + '\n');
  }

  // Reattach stdout handler
  this.attachStdoutHandler(run);
  // Reattach stderr handler
  this.attachStderrHandler(run);

  // Restart filesystem monitor for createTeam (launch skips it)
  if (!run.isLaunch) {
    this.startFilesystemMonitor(run, run.request);
  } else {
    updateProgress(run, 'monitoring', 'CLI running — reconnecting with teammates');
    run.onProgress(run.progress);
  }

  // Restart timeout
  run.timeoutHandle = setTimeout(() => {
    if (!run.processKilled && !run.provisioningComplete) {
      run.processKilled = true;
      run.finalizingByTimeout = true;
      void (async () => {
        // Give the run a chance to be accepted as complete before reporting a timeout.
        const readyOnTimeout = await this.tryCompleteAfterTimeout(run);
        run.child?.stdin?.end();
        killProcessTree(run.child);
        if (readyOnTimeout) return;
        const hint = run.isLaunch ? ' (launch)' : '';
        const progress = updateProgress(run, 'failed', `Timed out waiting for CLI${hint}`, {
          error: `Timed out waiting for CLI${hint}.`,
          cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
        });
        run.onProgress(progress);
        this.cleanupRun(run);
      })();
    }
  }, RUN_TIMEOUT_MS);

  child.once('error', (error) => {
    const hint = run.isLaunch ? ' (launch)' : '';
    const progress = updateProgress(run, 'failed', `Failed to start Claude CLI${hint}`, {
      error: error.message,
      cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
    });
    run.onProgress(progress);
    this.cleanupRun(run);
  });
  child.once('exit', (code) => {
    void this.handleProcessExit(run, code);
  });
}
/**
 * Attaches the stdout stream-json parser to the current child process.
 *
 * Each chunk is appended to `run.stdoutBuffer` (trimmed to the last `STDOUT_RING_LIMIT`
 * characters) and split into newline-delimited lines. Lines that parse as JSON go to
 * `handleStreamJsonMessage`; lines that do not are checked for auth-failure text.
 * Log progress is emitted at most once per `LOG_PROGRESS_THROTTLE_MS`.
 *
 * Does nothing when the run has no child or the child has no stdout stream.
 *
 * @param run - Provisioning run whose current child's stdout is consumed.
 */
private attachStdoutHandler(run: ProvisioningRun): void {
  const child = run.child;
  if (!child?.stdout) return;
  // Streaming decoder: bytes of a multi-byte UTF-8 character that is split across two
  // chunks are held back until the rest arrives, instead of being turned into U+FFFD
  // replacement characters by decoding each chunk on its own.
  const decoder = new TextDecoder('utf-8');
  // Holds the trailing partial line between 'data' events.
  let stdoutLineBuf = '';
  child.stdout.on('data', (chunk: Buffer | string) => {
    // A string chunk means an encoding was already set on the stream; use it as-is.
    const text = typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
    run.stdoutBuffer += text;
    if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
      run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
    }
    // Parse stream-json lines (newline-delimited JSON)
    stdoutLineBuf += text;
    const lines = stdoutLineBuf.split('\n');
    // The last element is an incomplete line (or '' if the chunk ended on a newline).
    stdoutLineBuf = lines.pop() ?? '';
    for (const line of lines) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const msg = JSON.parse(trimmed) as Record<string, unknown>;
        this.handleStreamJsonMessage(run, msg);
      } catch {
        // Not valid JSON — check for auth failure in raw text output
        this.handleAuthFailureInOutput(run, trimmed, 'stdout');
      }
    }
    const currentTs = Date.now();
    if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
      run.lastLogProgressAt = currentTs;
      emitLogsProgress(run);
    }
  });
}
/**
 * Attaches the stderr handler with auth failure detection.
 *
 * Each chunk is appended to `run.stderrBuffer` (trimmed to the last `STDERR_RING_LIMIT`
 * characters) and passed to `handleAuthFailureInOutput`. Log progress is emitted at most
 * once per `LOG_PROGRESS_THROTTLE_MS`.
 *
 * Does nothing when the run has no child or the child has no stderr stream.
 *
 * @param run - Provisioning run whose current child's stderr is consumed.
 */
private attachStderrHandler(run: ProvisioningRun): void {
  const child = run.child;
  if (!child?.stderr) return;
  // Streaming decoder: keeps the bytes of a multi-byte UTF-8 character that straddles a
  // chunk boundary until it is complete, so the logs do not get U+FFFD replacement
  // characters from decoding each chunk in isolation.
  const decoder = new TextDecoder('utf-8');
  child.stderr.on('data', (chunk: Buffer | string) => {
    // A string chunk means an encoding was already set on the stream; use it as-is.
    const text = typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
    run.stderrBuffer += text;
    if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
      run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
    }
    // Detect auth failure early instead of waiting for 5-minute timeout
    this.handleAuthFailureInOutput(run, text, 'stderr');
    const currentTs = Date.now();
    if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
      run.lastLogProgressAt = currentTs;
      emitLogsProgress(run);
    }
  });
}
async createTeam(
request: TeamCreateRequest,
onProgress: (progress: TeamProvisioningProgress) => void
@ -924,6 +1163,9 @@ export class TeamProvisioningService {
provisioningOutputParts: [],
detectedSessionId: null,
leadActivityState: 'active',
authFailureRetried: false,
authRetryInProgress: false,
spawnContext: null,
progress: {
runId,
teamName: request.teamName,
@ -948,30 +1190,25 @@ export class TeamProvisioningService {
'Attempting spawn anyway — CLI may authenticate via apiKeyHelper, SSO, or other mechanism.'
);
}
const spawnArgs = [
'--input-format',
'stream-json',
'--output-format',
'stream-json',
'--verbose',
'--setting-sources',
'user,project,local',
'--disallowedTools',
'TeamDelete,TodoWrite',
'--dangerously-skip-permissions',
...(request.model ? ['--model', request.model] : []),
];
try {
child = spawnCli(
claudePath,
[
'--input-format',
'stream-json',
'--output-format',
'stream-json',
'--verbose',
'--setting-sources',
'user,project,local',
'--disallowedTools',
'TeamDelete,TodoWrite',
'--dangerously-skip-permissions',
...(request.model ? ['--model', request.model] : []),
],
{
cwd: request.cwd,
env: {
...shellEnv,
},
stdio: ['pipe', 'pipe', 'pipe'],
}
);
child = spawnCli(claudePath, spawnArgs, {
cwd: request.cwd,
env: { ...shellEnv },
stdio: ['pipe', 'pipe', 'pipe'],
});
} catch (error) {
this.runs.delete(runId);
this.activeByTeam.delete(request.teamName);
@ -981,6 +1218,13 @@ export class TeamProvisioningService {
updateProgress(run, 'spawning', 'Starting Claude CLI process', { pid: child.pid ?? undefined });
run.onProgress(run.progress);
run.child = child;
run.spawnContext = {
claudePath,
args: spawnArgs,
cwd: request.cwd,
env: { ...shellEnv },
prompt,
};
// Send provisioning prompt as first stream-json message (SDKUserMessage format)
if (child.stdin?.writable) {
@ -994,51 +1238,8 @@ export class TeamProvisioningService {
child.stdin.write(message + '\n');
}
if (child.stdout) {
let stdoutLineBuf = '';
child.stdout.on('data', (chunk: Buffer) => {
const text = chunk.toString('utf8');
run.stdoutBuffer += text;
if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
}
// Parse stream-json lines (newline-delimited JSON)
stdoutLineBuf += text;
const lines = stdoutLineBuf.split('\n');
stdoutLineBuf = lines.pop() ?? '';
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const msg = JSON.parse(trimmed) as Record<string, unknown>;
this.handleStreamJsonMessage(run, msg);
} catch {
// Not valid JSON — raw text output, ignore
}
}
const currentTs = Date.now();
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
run.lastLogProgressAt = currentTs;
emitLogsProgress(run);
}
});
}
if (child.stderr) {
child.stderr.on('data', (chunk: Buffer) => {
run.stderrBuffer += chunk.toString('utf8');
if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
}
const currentTs = Date.now();
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
run.lastLogProgressAt = currentTs;
emitLogsProgress(run);
}
});
}
this.attachStdoutHandler(run);
this.attachStderrHandler(run);
// Filesystem-based progress monitor: actively polls team files instead
// of relying on stdout (which only arrives at the end in text mode).
@ -1207,6 +1408,9 @@ export class TeamProvisioningService {
provisioningOutputParts: [],
detectedSessionId: null,
leadActivityState: 'active',
authFailureRetried: false,
authRetryInProgress: false,
spawnContext: null,
progress: {
runId,
teamName: request.teamName,
@ -1291,6 +1495,13 @@ export class TeamProvisioningService {
});
run.onProgress(run.progress);
run.child = child;
run.spawnContext = {
claudePath,
args: launchArgs,
cwd: request.cwd,
env: { ...shellEnv },
prompt,
};
// Send launch prompt
if (child.stdin?.writable) {
@ -1304,50 +1515,8 @@ export class TeamProvisioningService {
child.stdin.write(message + '\n');
}
if (child.stdout) {
let stdoutLineBuf = '';
child.stdout.on('data', (chunk: Buffer) => {
const text = chunk.toString('utf8');
run.stdoutBuffer += text;
if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
}
stdoutLineBuf += text;
const lines = stdoutLineBuf.split('\n');
stdoutLineBuf = lines.pop() ?? '';
for (const line of lines) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const msg = JSON.parse(trimmed) as Record<string, unknown>;
this.handleStreamJsonMessage(run, msg);
} catch {
// Not valid JSON
}
}
const currentTs = Date.now();
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
run.lastLogProgressAt = currentTs;
emitLogsProgress(run);
}
});
}
if (child.stderr) {
child.stderr.on('data', (chunk: Buffer) => {
run.stderrBuffer += chunk.toString('utf8');
if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
}
const currentTs = Date.now();
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
run.lastLogProgressAt = currentTs;
emitLogsProgress(run);
}
});
}
this.attachStdoutHandler(run);
this.attachStderrHandler(run);
// For launch, skip the filesystem monitor — files (config, inboxes, tasks)
// already exist from the previous run and would trigger immediate false
@ -2217,6 +2386,13 @@ export class TeamProvisioningService {
if (run.progress.state === 'failed' || run.cancelRequested) {
return;
}
// Skip if respawn after auth failure is in progress — the old process is being replaced
if (run.authRetryInProgress) {
logger.info(
`[${run.teamName}] Process exited (code ${code ?? '?'}) during auth-failure respawn — ignoring`
);
return;
}
// === Process exited AFTER provisioning completed ===
// This means the team went offline (crash, kill, or natural exit).
@ -3119,40 +3295,67 @@ export class TeamProvisioningService {
throw new Error(`Failed to warm up Claude CLI: ${errorText}`);
}
// Stage 2: verify `-p` mode auth actually works
let pingProbe: { exitCode: number | null; stdout: string; stderr: string } | null = null;
try {
pingProbe = await this.spawnProbe(
claudePath,
['-p', 'Reply with the single word PONG and nothing else', '--output-format', 'text'],
cwd,
env,
PREFLIGHT_TIMEOUT_MS
);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return {
warning:
'Preflight check for `claude -p` did not complete. ' +
`Proceeding anyway. Details: ${message}`,
};
}
// Stage 2: verify `-p` mode auth actually works (with retry for stale locks after Ctrl+C)
for (let attempt = 1; attempt <= PREFLIGHT_AUTH_MAX_RETRIES; attempt++) {
let pingProbe: { exitCode: number | null; stdout: string; stderr: string } | null = null;
try {
pingProbe = await this.spawnProbe(
claudePath,
['-p', 'Reply with the single word PONG and nothing else', '--output-format', 'text'],
cwd,
env,
PREFLIGHT_TIMEOUT_MS
);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
if (attempt < PREFLIGHT_AUTH_MAX_RETRIES) {
logger.warn(
`Preflight ping failed (attempt ${attempt}/${PREFLIGHT_AUTH_MAX_RETRIES}), ` +
`retrying in ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms: ${message}`
);
await new Promise((resolve) => setTimeout(resolve, PREFLIGHT_AUTH_RETRY_DELAY_MS));
continue;
}
return {
warning:
'Preflight check for `claude -p` did not complete. ' +
`Proceeding anyway. Details: ${message}`,
};
}
const combinedOutput = buildCombinedLogs(pingProbe.stdout, pingProbe.stderr);
const lowerOutput = combinedOutput.toLowerCase();
const isAuthFailure =
lowerOutput.includes('not logged in') ||
lowerOutput.includes('please run /login') ||
lowerOutput.includes('missing api key') ||
lowerOutput.includes('invalid api key');
const combinedOutput = buildCombinedLogs(pingProbe.stdout, pingProbe.stderr);
const lowerOutput = combinedOutput.toLowerCase();
const isAuthFailure =
lowerOutput.includes('not logged in') ||
lowerOutput.includes('please run /login') ||
lowerOutput.includes('missing api key') ||
lowerOutput.includes('invalid api key');
if (isAuthFailure || pingProbe.exitCode !== 0) {
const hint = isAuthFailure
? 'Claude CLI `-p` mode is not authenticated. ' +
'Set ANTHROPIC_API_KEY, or run `claude setup-token` to generate a long-lived OAuth token, ' +
'then export it as CLAUDE_CODE_OAUTH_TOKEN.'
: `Claude CLI preflight check failed (exit code ${pingProbe.exitCode ?? 'unknown'}).`;
return { warning: hint };
if (isAuthFailure && attempt < PREFLIGHT_AUTH_MAX_RETRIES) {
logger.warn(
`Preflight auth failure detected (attempt ${attempt}/${PREFLIGHT_AUTH_MAX_RETRIES}), ` +
`retrying in ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms — likely stale locks from interrupted process`
);
await new Promise((resolve) => setTimeout(resolve, PREFLIGHT_AUTH_RETRY_DELAY_MS));
continue;
}
if (isAuthFailure || pingProbe.exitCode !== 0) {
const hint = isAuthFailure
? 'Claude CLI `-p` mode is not authenticated. ' +
'Set ANTHROPIC_API_KEY, or run `claude setup-token` to generate a long-lived OAuth token, ' +
'then export it as CLAUDE_CODE_OAUTH_TOKEN.' +
(attempt > 1 ? ` (failed after ${attempt} attempts)` : '')
: `Claude CLI preflight check failed (exit code ${pingProbe.exitCode ?? 'unknown'}).`;
return { warning: hint };
}
if (attempt > 1) {
logger.info(
`Preflight auth succeeded on attempt ${attempt} (previous attempt had auth failure)`
);
}
return {};
}
return {};