diff --git a/src/main/services/team/TeamDataService.ts b/src/main/services/team/TeamDataService.ts index 880ac553..aeb48a46 100644 --- a/src/main/services/team/TeamDataService.ts +++ b/src/main/services/team/TeamDataService.ts @@ -230,7 +230,27 @@ export class TeamDataService { needsClarification: task.needsClarification, deletedAt: task.deletedAt, reviewState, - // Intentionally omit description/comments/activeForm/workIntervals/links to keep payload small + // IMPORTANT: comments MUST be included here (at least lightweight metadata). + // + // Previously comments were omitted from GlobalTask payload to keep IPC small. + // This silently broke task comment notifications in the renderer: the store's + // detectTaskCommentNotifications() compares oldTask.comments vs newTask.comments + // to find new comments and fire native OS toasts. Without comments in the payload, + // both counts were always 0 → newCommentCount <= oldCommentCount → every comment + // was silently skipped → "Task comment notifications" toggle had no effect. + // + // Fix: include lightweight comment metadata (id, author, truncated text for toast + // preview, createdAt, type). Full text and attachments are still omitted — those + // are loaded on-demand by the task detail view via team:getData. + comments: Array.isArray(task.comments) + ? task.comments.map((c) => ({ + id: c.id, + author: c.author, + text: c.text.slice(0, 120), + createdAt: c.createdAt, + type: c.type, + })) + : undefined, kanbanColumn, teamName: task.teamName, teamDisplayName: info.displayName, diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index 9804c48f..4b215949 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -112,6 +112,9 @@ const PREFLIGHT_AUTH_RETRY_DELAY_MS = 2000; const PREFLIGHT_AUTH_MAX_RETRIES = 2; const FS_MONITOR_POLL_MS = 2000; const TASK_WAIT_FALLBACK_MS = 15_000; +const STALL_CHECK_INTERVAL_MS = 10_000; +const STALL_WARNING_THRESHOLD_MS = 45_000; +const STALL_WARNING_REPEAT_MS = 30_000; const TEAM_JSON_READ_TIMEOUT_MS = 5_000; const TEAM_CONFIG_MAX_BYTES = 10 * 1024 * 1024; const TEAM_INBOX_MAX_BYTES = 2 * 1024 * 1024; @@ -120,6 +123,13 @@ const CROSS_TEAM_TOOL_RECIPIENT_NAMES = new Set([ 'cross_team_list_targets', 'cross_team_get_outbox', ]); +const HANDLED_STREAM_JSON_TYPES = new Set([ + 'user', + 'assistant', + 'control_request', + 'result', + 'system', +]); const PREFLIGHT_PING_PROMPT = 'Output only the single word PONG.'; const PREFLIGHT_PING_ARGS = [ '-p', @@ -217,6 +227,12 @@ interface ProvisioningRun { expectedMembers: string[]; request: TeamCreateRequest; lastLogProgressAt: number; + /** Monotonic ms timestamp of last stdout/stderr data. For stall detection. */ + lastDataReceivedAt: number; + /** Stall watchdog interval handle. Cleared in cleanupRun(). */ + stallCheckHandle: NodeJS.Timeout | null; + /** True after emitApiErrorWarning() fires once — prevents duplicate warnings and pre-complete false positives. */ + apiErrorWarningEmitted: boolean; fsPhase: 'waiting_config' | 'waiting_members' | 'waiting_tasks' | 'all_files_found'; waitingTasksSince: number | null; provisioningComplete: boolean; @@ -2358,6 +2374,127 @@ export class TeamProvisioningService { this.cleanupRun(run); } + /** + * Shows a non-fatal API error warning in the Live output section. + * Unlike failProvisioningWithApiError, does NOT kill the process — lets the SDK retry. + * Deduplicates: only the first warning per run is shown. + */ + private emitApiErrorWarning(run: ProvisioningRun, text: string): void { + if (run.provisioningComplete || run.processKilled || run.authRetryInProgress) return; + if (run.progress.state === 'failed' || run.cancelRequested) return; + if (run.apiErrorWarningEmitted) return; + + run.apiErrorWarningEmitted = true; + + const snippet = this.extractApiErrorSnippet(text); + const status = /api error:\s*(\d{3})\b/i.exec(text)?.[1] ?? null; + const label = status ? `API Error ${status}` : 'API Error'; + + const warningText = snippet + ? `**${label} — SDK is retrying**\n\n\`\`\`\n${snippet}\n\`\`\`\n\nОжидаем повторной попытки...` + : `**${label} — SDK is retrying**\n\nОжидаем повторной попытки...`; + + run.provisioningOutputParts.push(warningText); + run.progress.message = `${label} — SDK retrying...`; + emitLogsProgress(run); + // Prevent double-emit: the calling stderr/stdout handler will also try throttled emitLogsProgress + // after this returns. Updating lastLogProgressAt ensures the throttle check rejects it. + run.lastLogProgressAt = Date.now(); + } + + /** + * Starts a periodic watchdog that detects when the CLI process has produced + * no stdout/stderr data for an extended period. Pushes progressive warnings + * into provisioningOutputParts so they appear in the Live output section. + */ + private startStallWatchdog(run: ProvisioningRun): void { + if (run.stallCheckHandle) return; + + let lastWarningAt = 0; + + run.stallCheckHandle = setInterval(() => { + // try/catch: Node.js does NOT catch errors in setInterval callbacks — + // without this, an exception would silently kill the watchdog. + try { + if ( + run.provisioningComplete || + run.processKilled || + run.cancelRequested || + run.authRetryInProgress + ) { + this.stopStallWatchdog(run); + return; + } + + const now = Date.now(); + const silenceMs = now - run.lastDataReceivedAt; + + if (silenceMs < STALL_WARNING_THRESHOLD_MS) return; + if (lastWarningAt > 0 && now - lastWarningAt < STALL_WARNING_REPEAT_MS) return; + + lastWarningAt = now; + const silenceSec = Math.round(silenceMs / 1000); + + run.provisioningOutputParts.push(this.buildStallWarningText(silenceSec)); + const mins = Math.floor(silenceSec / 60); + const secs = silenceSec % 60; + const elapsed = mins > 0 ? `${mins} мин ${secs > 0 ? `${secs} сек` : ''}` : `${secs} сек`; + run.progress.message = `CLI не отвечает ${elapsed} — возможен rate limit`; + emitLogsProgress(run); + } catch (err) { + logger.error( + `[${run.teamName}] Stall watchdog error: ${ + err instanceof Error ? err.message : String(err) + }` + ); + } + }, STALL_CHECK_INTERVAL_MS); + } + + private stopStallWatchdog(run: ProvisioningRun): void { + if (run.stallCheckHandle) { + clearInterval(run.stallCheckHandle); + run.stallCheckHandle = null; + } + } + + private buildStallWarningText(silenceSec: number): string { + const mins = Math.floor(silenceSec / 60); + const secs = silenceSec % 60; + const elapsed = mins > 0 ? `${mins} мин ${secs > 0 ? `${secs} сек` : ''}` : `${secs} сек`; + + if (silenceSec < 60) { + return ( + `---\n\n` + + `**Ожидание ответа CLI** (тишина ${elapsed})\n\n` + + `Процесс запущен, но пока не выдаёт данных. ` + + `Это может быть вызвано задержкой API (rate limit / model cooldown) — ` + + `SDK выполняет повторные попытки автоматически.\n\n` + + `Ожидаем...` + ); + } + + if (silenceSec < 120) { + return ( + `---\n\n` + + `**Ожидание ответа CLI** (тишина ${elapsed})\n\n` + + `Процесс по-прежнему не отвечает. Вероятна задержка из-за rate limiting ` + + `(ошибка 429 / model cooldown). SDK автоматически повторяет запрос — ` + + `обычно это проходит в течение 1-3 минут.\n\n` + + `Можно отменить и попробовать позже, если ожидание затянется.` + ); + } + + return ( + `---\n\n` + + `**Длительное ожидание CLI** (тишина ${elapsed})\n\n` + + `Процесс молчит уже более ${mins} минут. Вероятные причины:\n` + + `- Rate limiting / model cooldown (429) — SDK повторяет автоматически\n` + + `- Перегрузка API сервера\n\n` + + `Рекомендуем отменить и попробовать через несколько минут.` + ); + } + /** * Detects auth failure keywords in stderr/stdout during provisioning. * On first detection: kills process, waits, and respawns automatically. @@ -2411,6 +2548,7 @@ export class TeamProvisioningService { run.timeoutHandle = null; } this.stopFilesystemMonitor(run); + this.stopStallWatchdog(run); if (run.child) { run.child.stdout?.removeAllListeners('data'); run.child.stderr?.removeAllListeners('data'); @@ -2429,6 +2567,7 @@ export class TeamProvisioningService { run.stderrLogLineBuf = ''; run.claudeLogsUpdatedAt = undefined; run.authFailureRetried = true; + run.apiErrorWarningEmitted = false; updateProgress(run, 'spawning', 'Auth failed — retrying after short delay'); run.onProgress(run.progress); @@ -2487,6 +2626,9 @@ export class TeamProvisioningService { // Reattach stderr handler this.attachStderrHandler(run); + run.lastDataReceivedAt = Date.now(); + this.startStallWatchdog(run); + // Restart filesystem monitor for createTeam (launch skips it) if (!run.isLaunch) { this.startFilesystemMonitor(run, run.request); @@ -2538,6 +2680,8 @@ export class TeamProvisioningService { let stdoutLineBuf = ''; child.stdout.on('data', (chunk: Buffer) => { + // Reset stall watchdog FIRST — any data (even partial JSON) means the CLI is alive. + run.lastDataReceivedAt = Date.now(); const text = chunk.toString('utf8'); this.appendCliLogs(run, 'stdout', text); run.stdoutBuffer += text; @@ -2573,7 +2717,9 @@ export class TeamProvisioningService { // Not valid JSON — check for auth failure in raw text output this.handleAuthFailureInOutput(run, trimmed, 'stdout'); if (this.hasApiError(trimmed) && !this.isAuthFailureWarning(trimmed, 'stdout')) { - this.failProvisioningWithApiError(run, trimmed); + // Show warning but do NOT kill — the SDK may be retrying internally (e.g. 429 model_cooldown). + // If all retries fail, result.subtype="error" will catch it and kill then. + this.emitApiErrorWarning(run, trimmed); } } } @@ -2592,6 +2738,8 @@ export class TeamProvisioningService { if (!child?.stderr) return; child.stderr.on('data', (chunk: Buffer) => { + // Reset stall watchdog FIRST — any data (even partial JSON) means the CLI is alive. + run.lastDataReceivedAt = Date.now(); const text = chunk.toString('utf8'); this.appendCliLogs(run, 'stderr', text); run.stderrBuffer += text; @@ -2602,7 +2750,9 @@ export class TeamProvisioningService { // Detect auth failure early instead of waiting for 5-minute timeout this.handleAuthFailureInOutput(run, text, 'stderr'); if (this.hasApiError(text) && !this.isAuthFailureWarning(text, 'stderr')) { - this.failProvisioningWithApiError(run, text); + // Show warning but do NOT kill — the SDK may be retrying internally (e.g. 429 model_cooldown). + // If all retries fail, result.subtype="error" will catch it and kill then. + this.emitApiErrorWarning(run, text); } const currentTs = Date.now(); @@ -2679,6 +2829,9 @@ export class TeamProvisioningService { expectedMembers: request.members.map((member) => member.name), request, lastLogProgressAt: 0, + lastDataReceivedAt: 0, // intentionally 0 — real reset happens after spawn (see startStallWatchdog call sites) + stallCheckHandle: null, + apiErrorWarningEmitted: false, waitingTasksSince: null, provisioningComplete: false, isLaunch: false, @@ -2792,6 +2945,11 @@ export class TeamProvisioningService { this.attachStdoutHandler(run); this.attachStderrHandler(run); + // Reset AFTER spawn — not at run init — because async operations (buildProvisioningEnv, + // writeConfigFile) between init and spawn can take seconds, causing false stall warnings. + run.lastDataReceivedAt = Date.now(); + this.startStallWatchdog(run); + // Filesystem-based progress monitor: actively polls team files instead // of relying on stdout (which only arrives at the end in text mode). // When config + members + tasks are all present, kill the process early @@ -3043,6 +3201,9 @@ export class TeamProvisioningService { expectedMembers, request: syntheticRequest, lastLogProgressAt: 0, + lastDataReceivedAt: 0, // intentionally 0 — real reset happens after spawn (see startStallWatchdog call sites) + stallCheckHandle: null, + apiErrorWarningEmitted: false, waitingTasksSince: null, provisioningComplete: false, isLaunch: true, @@ -3196,6 +3357,11 @@ export class TeamProvisioningService { this.attachStdoutHandler(run); this.attachStderrHandler(run); + // Reset AFTER spawn — not at run init — because async operations between init + // and spawn can take seconds, causing false stall warnings. + run.lastDataReceivedAt = Date.now(); + this.startStallWatchdog(run); + // For launch, skip the filesystem monitor — files (config, inboxes, tasks) // already exist from the previous run and would trigger immediate false // completion on the first poll. Rely on stream-json result.success instead. @@ -4865,6 +5031,23 @@ export class TeamProvisioningService { } } } + + // Catch-all: detect API errors in unrecognised message types. + // Guards against future protocol additions that carry error payloads + // (e.g. type: "error") which would otherwise be silently dropped. + if (typeof msg.type === 'string' && !HANDLED_STREAM_JSON_TYPES.has(msg.type)) { + const raw = JSON.stringify(msg); + logger.warn( + `[${run.teamName}] Unhandled stream-json type "${msg.type}": ${raw.slice(0, 300)}` + ); + if ( + !run.provisioningComplete && + this.hasApiError(raw) && + !this.isAuthFailureWarning(raw, 'stdout') + ) { + this.emitApiErrorWarning(run, raw); + } + } } /** @@ -5333,7 +5516,10 @@ export class TeamProvisioningService { if ( preCompleteText && this.hasApiError(preCompleteText) && - !this.isAuthFailureWarning(preCompleteText, 'pre-complete') + !this.isAuthFailureWarning(preCompleteText, 'pre-complete') && + // Skip if we already showed a warning for this error — the SDK had a chance to retry + // and the CLI reported success. Killing now would be a false positive. + !run.apiErrorWarningEmitted ) { this.failProvisioningWithApiError(run, preCompleteText); return; @@ -5352,6 +5538,7 @@ export class TeamProvisioningService { run.timeoutHandle = null; } this.stopFilesystemMonitor(run); + this.stopStallWatchdog(run); if (run.isLaunch) { await this.updateConfigPostLaunch( @@ -5811,6 +5998,7 @@ export class TeamProvisioningService { clearTimeout(run.timeoutHandle); run.timeoutHandle = null; } + this.stopStallWatchdog(run); if (run.silentUserDmForwardClearHandle) { clearTimeout(run.silentUserDmForwardClearHandle); run.silentUserDmForwardClearHandle = null; @@ -6016,6 +6204,12 @@ export class TeamProvisioningService { return; } + // IMPORTANT: stopStallWatchdog MUST be AFTER authRetryInProgress guard above! + // During respawn, the old process exit fires but run.stallCheckHandle already + // points to the NEW process's watchdog. Stopping it here would kill the wrong timer. + // The authRetryInProgress guard returns early, keeping the new watchdog alive. + this.stopStallWatchdog(run); + // === Process exited AFTER provisioning completed === // This means the team went offline (crash, kill, or natural exit). if (run.provisioningComplete) { diff --git a/src/renderer/components/team/dialogs/CreateTeamDialog.tsx b/src/renderer/components/team/dialogs/CreateTeamDialog.tsx index 36dfd18b..106a137e 100644 --- a/src/renderer/components/team/dialogs/CreateTeamDialog.tsx +++ b/src/renderer/components/team/dialogs/CreateTeamDialog.tsx @@ -255,7 +255,8 @@ export const CreateTeamDialog = ({ const [teamColor, setTeamColor] = useState(''); const [conflictDismissed, setConflictDismissed] = useState(false); const [selectedModel, setSelectedModelRaw] = useState(() => { - const stored = localStorage.getItem('team:lastSelectedModel') ?? ''; + const stored = localStorage.getItem('team:lastSelectedModel'); + if (stored === null) return 'opus'; return stored === '__default__' ? '' : stored; }); const [limitContext, setLimitContextRaw] = useState( @@ -264,9 +265,10 @@ export const CreateTeamDialog = ({ const [skipPermissions, setSkipPermissionsRaw] = useState( () => localStorage.getItem('team:lastSkipPermissions') !== 'false' ); - const [selectedEffort, setSelectedEffortRaw] = useState( - () => localStorage.getItem('team:lastSelectedEffort') ?? '' - ); + const [selectedEffort, setSelectedEffortRaw] = useState(() => { + const stored = localStorage.getItem('team:lastSelectedEffort'); + return stored === null ? 'medium' : stored; + }); // Advanced CLI section state (use teamName-derived key for localStorage) const advancedKey = sanitizeTeamName(teamName.trim()) || '_new_'; diff --git a/src/renderer/components/team/dialogs/LaunchTeamDialog.tsx b/src/renderer/components/team/dialogs/LaunchTeamDialog.tsx index c00c79e7..1947db81 100644 --- a/src/renderer/components/team/dialogs/LaunchTeamDialog.tsx +++ b/src/renderer/components/team/dialogs/LaunchTeamDialog.tsx @@ -157,15 +157,17 @@ export const LaunchTeamDialog = (props: LaunchTeamDialogProps): React.JSX.Elemen const [isSubmitting, setIsSubmitting] = useState(false); const [selectedModel, setSelectedModelRaw] = useState(() => { - const stored = localStorage.getItem('team:lastSelectedModel') ?? ''; + const stored = localStorage.getItem('team:lastSelectedModel'); + if (stored === null) return 'opus'; return stored === '__default__' ? '' : stored; }); const [skipPermissions, setSkipPermissionsRaw] = useState( () => localStorage.getItem('team:lastSkipPermissions') !== 'false' ); - const [selectedEffort, setSelectedEffortRaw] = useState( - () => localStorage.getItem('team:lastSelectedEffort') ?? '' - ); + const [selectedEffort, setSelectedEffortRaw] = useState(() => { + const stored = localStorage.getItem('team:lastSelectedEffort'); + return stored === null ? 'medium' : stored; + }); // --------------------------------------------------------------------------- // Launch-only state @@ -323,6 +325,8 @@ export const LaunchTeamDialog = (props: LaunchTeamDialogProps): React.JSX.Elemen setCwdMode('project'); setSelectedProjectPath(''); setCustomCwd(''); + setSelectedModelRaw('opus'); + setSelectedEffortRaw('medium'); } setLocalError(null);