feat: implement auth status check with retry mechanism
- Added a retry mechanism for checking authentication status in the CliInstallerService to handle stale locks after Ctrl+C interruptions. - Introduced constants for maximum retries and delay between attempts to improve robustness. - Updated logging to provide detailed feedback on retry attempts and outcomes.
This commit is contained in:
parent
0190ae77e8
commit
47d979a43d
3 changed files with 393 additions and 163 deletions
|
|
@ -66,6 +66,12 @@ const EBUSY_MAX_RETRIES = 3;
|
|||
/** Delay between EBUSY retries (multiplied by attempt number) */
|
||||
const EBUSY_RETRY_DELAY_MS = 2000;
|
||||
|
||||
/** Max retries for auth status check (covers stale locks after Ctrl+C) */
|
||||
const AUTH_STATUS_MAX_RETRIES = 2;
|
||||
|
||||
/** Delay before retrying auth status check (ms) — gives previous process time to clean up */
|
||||
const AUTH_STATUS_RETRY_DELAY_MS = 1500;
|
||||
|
||||
/**
|
||||
* Build env for child processes with correct HOME.
|
||||
* On Windows with non-ASCII usernames, process.env may have a broken HOME/USERPROFILE.
|
||||
|
|
@ -284,19 +290,38 @@ export class CliInstallerService {
|
|||
logger.warn('Failed to get CLI version:', getErrorMessage(err));
|
||||
}
|
||||
|
||||
// Check auth status
|
||||
try {
|
||||
const { stdout: authStdout } = await execCli(binaryPath, ['auth', 'status'], {
|
||||
timeout: VERSION_TIMEOUT_MS,
|
||||
env: buildChildEnv(),
|
||||
});
|
||||
const auth = JSON.parse(authStdout.trim()) as { loggedIn?: boolean; authMethod?: string };
|
||||
r.authLoggedIn = auth.loggedIn === true;
|
||||
r.authMethod = auth.authMethod ?? null;
|
||||
logger.info(`Auth status: loggedIn=${r.authLoggedIn}, method=${r.authMethod ?? 'null'}`);
|
||||
} catch (err) {
|
||||
logger.warn('Failed to check auth status:', getErrorMessage(err));
|
||||
r.authLoggedIn = false;
|
||||
// Check auth status with retry — covers stale lock files after Ctrl+C interruption
|
||||
for (let authAttempt = 1; authAttempt <= AUTH_STATUS_MAX_RETRIES; authAttempt++) {
|
||||
try {
|
||||
const { stdout: authStdout } = await execCli(binaryPath, ['auth', 'status'], {
|
||||
timeout: VERSION_TIMEOUT_MS,
|
||||
env: buildChildEnv(),
|
||||
});
|
||||
const auth = JSON.parse(authStdout.trim()) as {
|
||||
loggedIn?: boolean;
|
||||
authMethod?: string;
|
||||
};
|
||||
r.authLoggedIn = auth.loggedIn === true;
|
||||
r.authMethod = auth.authMethod ?? null;
|
||||
logger.info(
|
||||
`Auth status: loggedIn=${r.authLoggedIn}, method=${r.authMethod ?? 'null'}` +
|
||||
(authAttempt > 1 ? ` (attempt ${authAttempt})` : '')
|
||||
);
|
||||
break;
|
||||
} catch (err) {
|
||||
if (authAttempt < AUTH_STATUS_MAX_RETRIES) {
|
||||
logger.warn(
|
||||
`Auth status check failed (attempt ${authAttempt}/${AUTH_STATUS_MAX_RETRIES}), ` +
|
||||
`retrying in ${AUTH_STATUS_RETRY_DELAY_MS}ms: ${getErrorMessage(err)}`
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, AUTH_STATUS_RETRY_DELAY_MS));
|
||||
} else {
|
||||
logger.warn(
|
||||
`Auth status check failed after ${AUTH_STATUS_MAX_RETRIES} attempts: ${getErrorMessage(err)}`
|
||||
);
|
||||
r.authLoggedIn = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -146,14 +146,16 @@ export class TeamConfigReader {
|
|||
return null;
|
||||
}
|
||||
|
||||
// Case-insensitive dedup: key is lowercase name, value keeps the original casing
|
||||
const memberMap = new Map<string, TeamSummaryMember>();
|
||||
|
||||
const addMember = (m: TeamMember): void => {
|
||||
const mergeMember = (m: TeamMember): void => {
|
||||
const name = m.name?.trim();
|
||||
if (!name) return;
|
||||
const existing = memberMap.get(name);
|
||||
memberMap.set(name, {
|
||||
name,
|
||||
const key = name.toLowerCase();
|
||||
const existing = memberMap.get(key);
|
||||
memberMap.set(key, {
|
||||
name: existing?.name ?? name,
|
||||
role: m.role?.trim() || existing?.role,
|
||||
color: m.color?.trim() || existing?.color,
|
||||
});
|
||||
|
|
@ -162,7 +164,7 @@ export class TeamConfigReader {
|
|||
if (config && Array.isArray(config.members)) {
|
||||
for (const member of config.members) {
|
||||
if (member && typeof member.name === 'string') {
|
||||
addMember(member);
|
||||
mergeMember(member);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -173,7 +175,7 @@ export class TeamConfigReader {
|
|||
const metaMembers = await this.membersMetaStore.getMembers(teamName);
|
||||
for (const member of metaMembers) {
|
||||
if (!member.removedAt) {
|
||||
addMember(member);
|
||||
mergeMember(member);
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
|
|
|
|||
|
|
@ -59,6 +59,8 @@ const UI_LOGS_TAIL_LIMIT = 128 * 1024;
|
|||
const SHELL_ENV_TIMEOUT_MS = 12000;
|
||||
const CLI_PREPARE_TIMEOUT_MS = 10000;
|
||||
const PREFLIGHT_TIMEOUT_MS = 30000;
|
||||
const PREFLIGHT_AUTH_RETRY_DELAY_MS = 2000;
|
||||
const PREFLIGHT_AUTH_MAX_RETRIES = 2;
|
||||
const KEYCHAIN_TIMEOUT_MS = 5000;
|
||||
const FS_MONITOR_POLL_MS = 2000;
|
||||
const TASK_WAIT_FALLBACK_MS = 15_000;
|
||||
|
|
@ -144,6 +146,18 @@ interface ProvisioningRun {
|
|||
detectedSessionId: string | null;
|
||||
/** Lead process activity: 'active' during turn processing, 'idle' waiting for input, 'offline' after exit. */
|
||||
leadActivityState: LeadActivityState;
|
||||
/** Whether an auth failure retry was already attempted for this run. */
|
||||
authFailureRetried: boolean;
|
||||
/** Set to true while auth-failure respawn is in progress to prevent duplicate handling. */
|
||||
authRetryInProgress: boolean;
|
||||
/** Saved spawn context for auth-failure respawn. */
|
||||
spawnContext: {
|
||||
claudePath: string;
|
||||
args: string[];
|
||||
cwd: string;
|
||||
env: NodeJS.ProcessEnv;
|
||||
prompt: string;
|
||||
} | null;
|
||||
}
|
||||
|
||||
type LeadActivityState = 'active' | 'idle' | 'offline';
|
||||
|
|
@ -872,6 +886,231 @@ export class TeamProvisioningService {
|
|||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects auth failure keywords in stderr/stdout during provisioning.
|
||||
* On first detection: kills process, waits, and respawns automatically.
|
||||
* On second detection (after retry): fails fast with a clear error.
|
||||
*/
|
||||
private handleAuthFailureInOutput(run: ProvisioningRun, text: string, source: string): void {
|
||||
if (run.provisioningComplete || run.processKilled || run.authRetryInProgress) return;
|
||||
if (!this.isAuthFailureWarning(text)) return;
|
||||
|
||||
if (!run.authFailureRetried) {
|
||||
logger.warn(
|
||||
`[${run.teamName}] Auth failure detected in ${source} during provisioning — ` +
|
||||
`will kill process and retry after ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms`
|
||||
);
|
||||
run.authRetryInProgress = true;
|
||||
void this.respawnAfterAuthFailure(run);
|
||||
} else {
|
||||
logger.error(`[${run.teamName}] Auth failure detected in ${source} after retry — giving up`);
|
||||
run.processKilled = true;
|
||||
killProcessTree(run.child);
|
||||
const progress = updateProgress(run, 'failed', 'Authentication failed — CLI requires login', {
|
||||
error:
|
||||
'Claude CLI is not authenticated. Run `claude` in a terminal to complete login, ' +
|
||||
'or set ANTHROPIC_API_KEY / CLAUDE_CODE_OAUTH_TOKEN and try again.',
|
||||
cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
|
||||
});
|
||||
run.onProgress(progress);
|
||||
this.cleanupRun(run);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Kills the current process, waits for lock release, and respawns with saved context.
|
||||
* Reattaches all stream listeners and resends the prompt.
|
||||
*/
|
||||
private async respawnAfterAuthFailure(run: ProvisioningRun): Promise<void> {
|
||||
const ctx = run.spawnContext;
|
||||
if (!ctx) {
|
||||
logger.error(`[${run.teamName}] Cannot respawn — no spawn context saved`);
|
||||
run.authRetryInProgress = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Tear down current process without full cleanupRun (keep run alive)
|
||||
if (run.timeoutHandle) {
|
||||
clearTimeout(run.timeoutHandle);
|
||||
run.timeoutHandle = null;
|
||||
}
|
||||
this.stopFilesystemMonitor(run);
|
||||
if (run.child) {
|
||||
run.child.stdout?.removeAllListeners('data');
|
||||
run.child.stderr?.removeAllListeners('data');
|
||||
run.child.removeAllListeners('error');
|
||||
run.child.removeAllListeners('exit');
|
||||
killProcessTree(run.child);
|
||||
run.child = null;
|
||||
}
|
||||
|
||||
// Reset buffers for fresh attempt
|
||||
run.stdoutBuffer = '';
|
||||
run.stderrBuffer = '';
|
||||
run.authFailureRetried = true;
|
||||
|
||||
updateProgress(run, 'spawning', 'Auth failed — retrying after short delay');
|
||||
run.onProgress(run.progress);
|
||||
|
||||
await sleep(PREFLIGHT_AUTH_RETRY_DELAY_MS);
|
||||
|
||||
if (run.cancelRequested) {
|
||||
run.authRetryInProgress = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Respawn
|
||||
let child: ReturnType<typeof spawn>;
|
||||
try {
|
||||
child = spawnCli(ctx.claudePath, ctx.args, {
|
||||
cwd: ctx.cwd,
|
||||
env: { ...ctx.env },
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
});
|
||||
} catch (error) {
|
||||
run.authRetryInProgress = false;
|
||||
const progress = updateProgress(run, 'failed', 'Failed to respawn Claude CLI', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
run.onProgress(progress);
|
||||
this.cleanupRun(run);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info(
|
||||
`[${run.teamName}] Respawned CLI process after auth failure (pid=${child.pid ?? '?'})`
|
||||
);
|
||||
run.child = child;
|
||||
run.authRetryInProgress = false;
|
||||
|
||||
updateProgress(run, 'spawning', 'CLI respawned — sending prompt', {
|
||||
pid: child.pid ?? undefined,
|
||||
});
|
||||
run.onProgress(run.progress);
|
||||
|
||||
// Resend prompt
|
||||
if (child.stdin?.writable) {
|
||||
const message = JSON.stringify({
|
||||
type: 'user',
|
||||
message: {
|
||||
role: 'user',
|
||||
content: [{ type: 'text', text: ctx.prompt }],
|
||||
},
|
||||
});
|
||||
child.stdin.write(message + '\n');
|
||||
}
|
||||
|
||||
// Reattach stdout handler
|
||||
this.attachStdoutHandler(run);
|
||||
|
||||
// Reattach stderr handler
|
||||
this.attachStderrHandler(run);
|
||||
|
||||
// Restart filesystem monitor for createTeam (launch skips it)
|
||||
if (!run.isLaunch) {
|
||||
this.startFilesystemMonitor(run, run.request);
|
||||
} else {
|
||||
updateProgress(run, 'monitoring', 'CLI running — reconnecting with teammates');
|
||||
run.onProgress(run.progress);
|
||||
}
|
||||
|
||||
// Restart timeout
|
||||
run.timeoutHandle = setTimeout(() => {
|
||||
if (!run.processKilled && !run.provisioningComplete) {
|
||||
run.processKilled = true;
|
||||
run.finalizingByTimeout = true;
|
||||
void (async () => {
|
||||
const readyOnTimeout = await this.tryCompleteAfterTimeout(run);
|
||||
run.child?.stdin?.end();
|
||||
killProcessTree(run.child);
|
||||
if (readyOnTimeout) return;
|
||||
|
||||
const hint = run.isLaunch ? ' (launch)' : '';
|
||||
const progress = updateProgress(run, 'failed', `Timed out waiting for CLI${hint}`, {
|
||||
error: `Timed out waiting for CLI${hint}.`,
|
||||
cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
|
||||
});
|
||||
run.onProgress(progress);
|
||||
this.cleanupRun(run);
|
||||
})();
|
||||
}
|
||||
}, RUN_TIMEOUT_MS);
|
||||
|
||||
child.once('error', (error) => {
|
||||
const hint = run.isLaunch ? ' (launch)' : '';
|
||||
const progress = updateProgress(run, 'failed', `Failed to start Claude CLI${hint}`, {
|
||||
error: error.message,
|
||||
cliLogsTail: extractLogsTail(run.stdoutBuffer, run.stderrBuffer),
|
||||
});
|
||||
run.onProgress(progress);
|
||||
this.cleanupRun(run);
|
||||
});
|
||||
|
||||
child.once('exit', (code) => {
|
||||
void this.handleProcessExit(run, code);
|
||||
});
|
||||
}
|
||||
|
||||
/** Attaches the stdout stream-json parser to the current child process. */
|
||||
private attachStdoutHandler(run: ProvisioningRun): void {
|
||||
const child = run.child;
|
||||
if (!child?.stdout) return;
|
||||
|
||||
let stdoutLineBuf = '';
|
||||
child.stdout.on('data', (chunk: Buffer) => {
|
||||
const text = chunk.toString('utf8');
|
||||
run.stdoutBuffer += text;
|
||||
if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
|
||||
run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
|
||||
}
|
||||
|
||||
// Parse stream-json lines (newline-delimited JSON)
|
||||
stdoutLineBuf += text;
|
||||
const lines = stdoutLineBuf.split('\n');
|
||||
stdoutLineBuf = lines.pop() ?? '';
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) continue;
|
||||
try {
|
||||
const msg = JSON.parse(trimmed) as Record<string, unknown>;
|
||||
this.handleStreamJsonMessage(run, msg);
|
||||
} catch {
|
||||
// Not valid JSON — check for auth failure in raw text output
|
||||
this.handleAuthFailureInOutput(run, trimmed, 'stdout');
|
||||
}
|
||||
}
|
||||
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/** Attaches the stderr handler with auth failure detection. */
|
||||
private attachStderrHandler(run: ProvisioningRun): void {
|
||||
const child = run.child;
|
||||
if (!child?.stderr) return;
|
||||
|
||||
child.stderr.on('data', (chunk: Buffer) => {
|
||||
const text = chunk.toString('utf8');
|
||||
run.stderrBuffer += text;
|
||||
if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
|
||||
run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
|
||||
}
|
||||
|
||||
// Detect auth failure early instead of waiting for 5-minute timeout
|
||||
this.handleAuthFailureInOutput(run, text, 'stderr');
|
||||
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
async createTeam(
|
||||
request: TeamCreateRequest,
|
||||
onProgress: (progress: TeamProvisioningProgress) => void
|
||||
|
|
@ -924,6 +1163,9 @@ export class TeamProvisioningService {
|
|||
provisioningOutputParts: [],
|
||||
detectedSessionId: null,
|
||||
leadActivityState: 'active',
|
||||
authFailureRetried: false,
|
||||
authRetryInProgress: false,
|
||||
spawnContext: null,
|
||||
progress: {
|
||||
runId,
|
||||
teamName: request.teamName,
|
||||
|
|
@ -948,30 +1190,25 @@ export class TeamProvisioningService {
|
|||
'Attempting spawn anyway — CLI may authenticate via apiKeyHelper, SSO, or other mechanism.'
|
||||
);
|
||||
}
|
||||
const spawnArgs = [
|
||||
'--input-format',
|
||||
'stream-json',
|
||||
'--output-format',
|
||||
'stream-json',
|
||||
'--verbose',
|
||||
'--setting-sources',
|
||||
'user,project,local',
|
||||
'--disallowedTools',
|
||||
'TeamDelete,TodoWrite',
|
||||
'--dangerously-skip-permissions',
|
||||
...(request.model ? ['--model', request.model] : []),
|
||||
];
|
||||
try {
|
||||
child = spawnCli(
|
||||
claudePath,
|
||||
[
|
||||
'--input-format',
|
||||
'stream-json',
|
||||
'--output-format',
|
||||
'stream-json',
|
||||
'--verbose',
|
||||
'--setting-sources',
|
||||
'user,project,local',
|
||||
'--disallowedTools',
|
||||
'TeamDelete,TodoWrite',
|
||||
'--dangerously-skip-permissions',
|
||||
...(request.model ? ['--model', request.model] : []),
|
||||
],
|
||||
{
|
||||
cwd: request.cwd,
|
||||
env: {
|
||||
...shellEnv,
|
||||
},
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
}
|
||||
);
|
||||
child = spawnCli(claudePath, spawnArgs, {
|
||||
cwd: request.cwd,
|
||||
env: { ...shellEnv },
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
});
|
||||
} catch (error) {
|
||||
this.runs.delete(runId);
|
||||
this.activeByTeam.delete(request.teamName);
|
||||
|
|
@ -981,6 +1218,13 @@ export class TeamProvisioningService {
|
|||
updateProgress(run, 'spawning', 'Starting Claude CLI process', { pid: child.pid ?? undefined });
|
||||
run.onProgress(run.progress);
|
||||
run.child = child;
|
||||
run.spawnContext = {
|
||||
claudePath,
|
||||
args: spawnArgs,
|
||||
cwd: request.cwd,
|
||||
env: { ...shellEnv },
|
||||
prompt,
|
||||
};
|
||||
|
||||
// Send provisioning prompt as first stream-json message (SDKUserMessage format)
|
||||
if (child.stdin?.writable) {
|
||||
|
|
@ -994,51 +1238,8 @@ export class TeamProvisioningService {
|
|||
child.stdin.write(message + '\n');
|
||||
}
|
||||
|
||||
if (child.stdout) {
|
||||
let stdoutLineBuf = '';
|
||||
child.stdout.on('data', (chunk: Buffer) => {
|
||||
const text = chunk.toString('utf8');
|
||||
run.stdoutBuffer += text;
|
||||
if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
|
||||
run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
|
||||
}
|
||||
|
||||
// Parse stream-json lines (newline-delimited JSON)
|
||||
stdoutLineBuf += text;
|
||||
const lines = stdoutLineBuf.split('\n');
|
||||
stdoutLineBuf = lines.pop() ?? '';
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) continue;
|
||||
try {
|
||||
const msg = JSON.parse(trimmed) as Record<string, unknown>;
|
||||
this.handleStreamJsonMessage(run, msg);
|
||||
} catch {
|
||||
// Not valid JSON — raw text output, ignore
|
||||
}
|
||||
}
|
||||
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (child.stderr) {
|
||||
child.stderr.on('data', (chunk: Buffer) => {
|
||||
run.stderrBuffer += chunk.toString('utf8');
|
||||
if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
|
||||
run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
|
||||
}
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
this.attachStdoutHandler(run);
|
||||
this.attachStderrHandler(run);
|
||||
|
||||
// Filesystem-based progress monitor: actively polls team files instead
|
||||
// of relying on stdout (which only arrives at the end in text mode).
|
||||
|
|
@ -1207,6 +1408,9 @@ export class TeamProvisioningService {
|
|||
provisioningOutputParts: [],
|
||||
detectedSessionId: null,
|
||||
leadActivityState: 'active',
|
||||
authFailureRetried: false,
|
||||
authRetryInProgress: false,
|
||||
spawnContext: null,
|
||||
progress: {
|
||||
runId,
|
||||
teamName: request.teamName,
|
||||
|
|
@ -1291,6 +1495,13 @@ export class TeamProvisioningService {
|
|||
});
|
||||
run.onProgress(run.progress);
|
||||
run.child = child;
|
||||
run.spawnContext = {
|
||||
claudePath,
|
||||
args: launchArgs,
|
||||
cwd: request.cwd,
|
||||
env: { ...shellEnv },
|
||||
prompt,
|
||||
};
|
||||
|
||||
// Send launch prompt
|
||||
if (child.stdin?.writable) {
|
||||
|
|
@ -1304,50 +1515,8 @@ export class TeamProvisioningService {
|
|||
child.stdin.write(message + '\n');
|
||||
}
|
||||
|
||||
if (child.stdout) {
|
||||
let stdoutLineBuf = '';
|
||||
child.stdout.on('data', (chunk: Buffer) => {
|
||||
const text = chunk.toString('utf8');
|
||||
run.stdoutBuffer += text;
|
||||
if (run.stdoutBuffer.length > STDOUT_RING_LIMIT) {
|
||||
run.stdoutBuffer = run.stdoutBuffer.slice(run.stdoutBuffer.length - STDOUT_RING_LIMIT);
|
||||
}
|
||||
|
||||
stdoutLineBuf += text;
|
||||
const lines = stdoutLineBuf.split('\n');
|
||||
stdoutLineBuf = lines.pop() ?? '';
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) continue;
|
||||
try {
|
||||
const msg = JSON.parse(trimmed) as Record<string, unknown>;
|
||||
this.handleStreamJsonMessage(run, msg);
|
||||
} catch {
|
||||
// Not valid JSON
|
||||
}
|
||||
}
|
||||
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (child.stderr) {
|
||||
child.stderr.on('data', (chunk: Buffer) => {
|
||||
run.stderrBuffer += chunk.toString('utf8');
|
||||
if (run.stderrBuffer.length > STDERR_RING_LIMIT) {
|
||||
run.stderrBuffer = run.stderrBuffer.slice(run.stderrBuffer.length - STDERR_RING_LIMIT);
|
||||
}
|
||||
const currentTs = Date.now();
|
||||
if (currentTs - run.lastLogProgressAt >= LOG_PROGRESS_THROTTLE_MS) {
|
||||
run.lastLogProgressAt = currentTs;
|
||||
emitLogsProgress(run);
|
||||
}
|
||||
});
|
||||
}
|
||||
this.attachStdoutHandler(run);
|
||||
this.attachStderrHandler(run);
|
||||
|
||||
// For launch, skip the filesystem monitor — files (config, inboxes, tasks)
|
||||
// already exist from the previous run and would trigger immediate false
|
||||
|
|
@ -2217,6 +2386,13 @@ export class TeamProvisioningService {
|
|||
if (run.progress.state === 'failed' || run.cancelRequested) {
|
||||
return;
|
||||
}
|
||||
// Skip if respawn after auth failure is in progress — the old process is being replaced
|
||||
if (run.authRetryInProgress) {
|
||||
logger.info(
|
||||
`[${run.teamName}] Process exited (code ${code ?? '?'}) during auth-failure respawn — ignoring`
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// === Process exited AFTER provisioning completed ===
|
||||
// This means the team went offline (crash, kill, or natural exit).
|
||||
|
|
@ -3119,40 +3295,67 @@ export class TeamProvisioningService {
|
|||
throw new Error(`Failed to warm up Claude CLI: ${errorText}`);
|
||||
}
|
||||
|
||||
// Stage 2: verify `-p` mode auth actually works
|
||||
let pingProbe: { exitCode: number | null; stdout: string; stderr: string } | null = null;
|
||||
try {
|
||||
pingProbe = await this.spawnProbe(
|
||||
claudePath,
|
||||
['-p', 'Reply with the single word PONG and nothing else', '--output-format', 'text'],
|
||||
cwd,
|
||||
env,
|
||||
PREFLIGHT_TIMEOUT_MS
|
||||
);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
return {
|
||||
warning:
|
||||
'Preflight check for `claude -p` did not complete. ' +
|
||||
`Proceeding anyway. Details: ${message}`,
|
||||
};
|
||||
}
|
||||
// Stage 2: verify `-p` mode auth actually works (with retry for stale locks after Ctrl+C)
|
||||
for (let attempt = 1; attempt <= PREFLIGHT_AUTH_MAX_RETRIES; attempt++) {
|
||||
let pingProbe: { exitCode: number | null; stdout: string; stderr: string } | null = null;
|
||||
try {
|
||||
pingProbe = await this.spawnProbe(
|
||||
claudePath,
|
||||
['-p', 'Reply with the single word PONG and nothing else', '--output-format', 'text'],
|
||||
cwd,
|
||||
env,
|
||||
PREFLIGHT_TIMEOUT_MS
|
||||
);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
if (attempt < PREFLIGHT_AUTH_MAX_RETRIES) {
|
||||
logger.warn(
|
||||
`Preflight ping failed (attempt ${attempt}/${PREFLIGHT_AUTH_MAX_RETRIES}), ` +
|
||||
`retrying in ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms: ${message}`
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, PREFLIGHT_AUTH_RETRY_DELAY_MS));
|
||||
continue;
|
||||
}
|
||||
return {
|
||||
warning:
|
||||
'Preflight check for `claude -p` did not complete. ' +
|
||||
`Proceeding anyway. Details: ${message}`,
|
||||
};
|
||||
}
|
||||
|
||||
const combinedOutput = buildCombinedLogs(pingProbe.stdout, pingProbe.stderr);
|
||||
const lowerOutput = combinedOutput.toLowerCase();
|
||||
const isAuthFailure =
|
||||
lowerOutput.includes('not logged in') ||
|
||||
lowerOutput.includes('please run /login') ||
|
||||
lowerOutput.includes('missing api key') ||
|
||||
lowerOutput.includes('invalid api key');
|
||||
const combinedOutput = buildCombinedLogs(pingProbe.stdout, pingProbe.stderr);
|
||||
const lowerOutput = combinedOutput.toLowerCase();
|
||||
const isAuthFailure =
|
||||
lowerOutput.includes('not logged in') ||
|
||||
lowerOutput.includes('please run /login') ||
|
||||
lowerOutput.includes('missing api key') ||
|
||||
lowerOutput.includes('invalid api key');
|
||||
|
||||
if (isAuthFailure || pingProbe.exitCode !== 0) {
|
||||
const hint = isAuthFailure
|
||||
? 'Claude CLI `-p` mode is not authenticated. ' +
|
||||
'Set ANTHROPIC_API_KEY, or run `claude setup-token` to generate a long-lived OAuth token, ' +
|
||||
'then export it as CLAUDE_CODE_OAUTH_TOKEN.'
|
||||
: `Claude CLI preflight check failed (exit code ${pingProbe.exitCode ?? 'unknown'}).`;
|
||||
return { warning: hint };
|
||||
if (isAuthFailure && attempt < PREFLIGHT_AUTH_MAX_RETRIES) {
|
||||
logger.warn(
|
||||
`Preflight auth failure detected (attempt ${attempt}/${PREFLIGHT_AUTH_MAX_RETRIES}), ` +
|
||||
`retrying in ${PREFLIGHT_AUTH_RETRY_DELAY_MS}ms — likely stale locks from interrupted process`
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, PREFLIGHT_AUTH_RETRY_DELAY_MS));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isAuthFailure || pingProbe.exitCode !== 0) {
|
||||
const hint = isAuthFailure
|
||||
? 'Claude CLI `-p` mode is not authenticated. ' +
|
||||
'Set ANTHROPIC_API_KEY, or run `claude setup-token` to generate a long-lived OAuth token, ' +
|
||||
'then export it as CLAUDE_CODE_OAUTH_TOKEN.' +
|
||||
(attempt > 1 ? ` (failed after ${attempt} attempts)` : '')
|
||||
: `Claude CLI preflight check failed (exit code ${pingProbe.exitCode ?? 'unknown'}).`;
|
||||
return { warning: hint };
|
||||
}
|
||||
|
||||
if (attempt > 1) {
|
||||
logger.info(
|
||||
`Preflight auth succeeded on attempt ${attempt} (previous attempt had auth failure)`
|
||||
);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
return {};
|
||||
|
|
|
|||
Loading…
Reference in a new issue