From d517f2b320829f4c63db7290ac227cee56db1f4d Mon Sep 17 00:00:00 2001 From: 777genius Date: Fri, 24 Apr 2026 16:18:12 +0300 Subject: [PATCH] feat(team): harden member runtime liveness --- .../member-liveness-hardening-plan.md | 2336 +++++++++++++++++ .../agent-graph/src/canvas/draw-agents.ts | 15 +- packages/agent-graph/src/ports/types.ts | 4 + scripts/dev-with-runtime.mjs | 34 +- .../main/composition/runtimeSupport.ts | 18 +- src/features/tmux-installer/main/index.ts | 6 + .../runtime/TmuxPlatformCommandExecutor.ts | 114 +- .../TmuxPlatformCommandExecutor.test.ts | 10 +- .../main/infrastructure/wsl/TmuxWslService.ts | 17 + .../services/team/TeamLaunchStateEvaluator.ts | 86 +- .../team/TeamLaunchSummaryProjection.ts | 31 + .../services/team/TeamMcpConfigBuilder.ts | 22 +- .../services/team/TeamMemberLivenessMode.ts | 14 + .../services/team/TeamProvisioningService.ts | 723 +++-- .../team/TeamRuntimeLivenessResolver.ts | 350 +++ src/main/services/team/progressPayload.ts | 30 + .../runtime/OpenCodeTeamRuntimeAdapter.ts | 29 +- .../team/runtime/TeamRuntimeAdapter.ts | 5 + .../team/ProvisioningProgressBlock.tsx | 41 + .../components/team/TeamProvisioningPanel.tsx | 1 + .../components/team/members/MemberCard.tsx | 37 +- .../team/members/MemberDetailDialog.tsx | 2 + .../team/members/MemberDetailHeader.tsx | 11 +- .../team/members/MemberHoverCard.tsx | 12 +- .../components/team/members/MemberList.tsx | 12 +- .../components/team/provisioningSteps.ts | 5 +- src/renderer/store/index.ts | 17 + src/renderer/store/slices/teamSlice.ts | 18 +- src/renderer/utils/memberHelpers.ts | 43 +- .../utils/teamProvisioningPresentation.ts | 25 +- src/shared/types/team.ts | 80 + .../team/TeamLaunchStateEvaluator.test.ts | 29 +- .../team/TeamMcpConfigBuilder.test.ts | 107 +- .../team/TeamProvisioningService.test.ts | 99 +- 34 files changed, 4089 insertions(+), 294 deletions(-) create mode 100644 docs/team-management/member-liveness-hardening-plan.md create mode 100644 
src/main/services/team/TeamMemberLivenessMode.ts create mode 100644 src/main/services/team/TeamRuntimeLivenessResolver.ts diff --git a/docs/team-management/member-liveness-hardening-plan.md b/docs/team-management/member-liveness-hardening-plan.md new file mode 100644 index 00000000..d78a8d97 --- /dev/null +++ b/docs/team-management/member-liveness-hardening-plan.md @@ -0,0 +1,2336 @@ +# Member Liveness Hardening Plan + +## Коротко + +Нужно исправить кейс, где launch UI висит на `Members joining`, участники выглядят как `starting`, а runtime memory показывает около `2 MB`. По текущему коду это почти наверняка значит, что UI видит tmux pane/shell PID, а не реальный teammate runtime. + +Главное изменение: разделить "что-то зарегистрировано", "pane/shell жив", "процесс runtime реально найден" и "member сделал bootstrap/check-in". Сейчас эти сигналы частично смешаны через `runtimeAlive`. + +Рекомендуемый путь: **UI + строгая liveness-модель**. +🎯 9/10 🛡️ 9/10 🧠 7/10 Примерно 650-950 строк production-кода + 350-550 строк тестов. + +## Почему не UI-only + +Топ 3 вариантов: + +1. UI-only diagnostics + 🎯 7 🛡️ 4 🧠 3 Примерно 180-260 строк. + Покажет, что происходит, но backend все равно сможет считать shell живым runtime. Зависание станет понятнее, но не надежнее. + +2. UI + строгая liveness-модель + 🎯 9 🛡️ 9 🧠 7 Примерно 650-950 строк. + Исправляет причину: weak evidence больше не маскирует timeout, UI получает понятные причины, self-heal остается только для надежных сигналов. + +3. Полный lease/heartbeat runtime manager + 🎯 8 🛡️ 10 🧠 9 Примерно 1200-1800 строк. + Самый надежный вариант, но слишком большой для первого фикса. Его лучше делать после варианта 2, когда станут видны реальные runtime-команды и частота edge cases. + +## Что проверено в коде + +Факты, которые важны для плана: + +- `mcp-server/src/tools/runtimeTools.ts` уже содержит `runtime_bootstrap_checkin` и `runtime_heartbeat`. Это сильный сигнал, его надо сделать главным источником подтверждения. 
+- `agent-teams-controller/src/internal/runtime.js` уже прокидывает `runtimeBootstrapCheckin()` в desktop runtime. +- `src/main/services/team/TeamBootstrapStateReader.ts` уже читает `bootstrap-state.json`, `bootstrap-journal.jsonl` и классифицирует stuck bootstrap. Там уже есть важные тайминги: `ACTIVE_BOOTSTRAP_STUCK_CLASSIFICATION_MS = 3 min` и `TERMINAL_BOOTSTRAP_ONLY_PENDING_GRACE_MS = 5 min`. +- `TeamProvisioningService.getLiveTeamAgentRuntimeMetadata()` сейчас собирает evidence из config/meta/persisted runtime/tmux/process table. +- Для tmux сейчас читается только `#{pane_id}\t#{pane_pid}` через `listTmuxPanePidsForCurrentPlatform()`. `pane_pid` часто является shell (`zsh`, `bash`, `sh`), поэтому `2 MB` выглядит логично. +- `attachLiveRuntimeMetadataToStatuses()` превращает `metadata.alive` в `runtimeAlive: true` и `livenessSource: "process"`. +- `reevaluateMemberLaunchStatus()` не fail-ит member после grace timeout, если `runtimeAlive === true`. +- `OpenCodeTeamRuntimeAdapter.mapBridgeMemberToRuntimeEvidence()` сейчас может выставить `runtimeAlive: true`, если bridge просто вернул member в состоянии `created` или `permission_blocked`. Это полезный материализационный сигнал, но он слабее реального `runtimePid` и слабее bootstrap. +- `recordOpenCodeRuntimeBootstrapCheckin()` и `recordOpenCodeRuntimeHeartbeat()` уже пишут `confirmed_alive`, `runtimeAlive: true`, `bootstrapConfirmed: true`, `nativeHeartbeat: true` через `updateOpenCodeRuntimeMemberLiveness()`. Значит confirmed state уже есть, надо не дать слабым сигналам выглядеть как он. +- `OpenCodeLaunchTransactionStore.canMarkOpenCodeRunReady()` уже требует `member_session_recorded`, `required_tools_proven` и `bootstrap_confirmed`. Это strict readiness precedent, который надо сохранить. +- Renderer уже получает оба источника: `memberSpawnStatuses` и `teamAgentRuntimeByTeam`. Но `MemberCard` сейчас получает только `runtimeSummary` строкой, а не сам `TeamAgentRuntimeEntry`. 
+- `teamSlice.areTeamAgentRuntimeEntriesEqual()` сейчас сравнивает только `memberName`, `alive`, `restartable`, `backendType`, `pid`, `runtimeModel`, `rssBytes`. Если добавить `livenessKind`, `pidSource`, `diagnostics`, но не обновить comparator, UI может не перерендериться. +- `teamSlice.areMemberSpawnStatusEntriesEqual()` сейчас намеренно игнорирует timing fields и сравнивает только visible spawn fields. Если добавить `livenessKind/runtimeDiagnostic`, comparator тоже надо обновить. +- `areLaunchSummaryCountsEqual()` сейчас знает только `confirmedCount`, `pendingCount`, `failedCount`, `runtimeAlivePendingCount`. Новые aggregate diagnostic counts не будут обновлять UI без расширения comparator. +- `TeamAgentRuntimeWatcher` обновляет runtime snapshot раз в 5 секунд, а spawn statuses раз в 2.5 секунды. Диагностические поля должны попадать либо в оба snapshot слоя, либо UX должен быть устойчив к задержке runtime snapshot. +- Renderer `member-spawn` event сейчас вызывает refresh spawn statuses, но не runtime snapshot. Если tooltip/detail зависят от `TeamAgentRuntimeSnapshot`, event handler тоже должен запланировать runtime refresh. +- Runtime tools принимают `metadata`, но `recordOpenCodeRuntimeBootstrapCheckin()` и `recordOpenCodeRuntimeHeartbeat()` сейчас используют только `diagnostics`. Если runtime присылает PID/version/command в `metadata`, эта информация теряется. +- `handleMemberSpawnToolResult()` при reason `already_running` сейчас делает `setMemberSpawnStatus(..., "online", ..., "process")`. В strict model это нельзя оставлять как strong liveness без проверки актуального runtime identity. +- `waitForTmuxPanesToExit()` использует `listTmuxPanePidsForCurrentPlatform()` только как "pane exists" check. Поэтому старый `listPanePids()` wrapper должен остаться ровно pane-existence helper, а не получить новую liveness-семантику. +- В проекте уже есть env-mode precedent: `CLAUDE_TEAM_OPENCODE_LAUNCH_MODE` с `dogfood`/`production`/`disabled`. 
Для liveness rollout лучше использовать такой же явный режим, а не скрытый boolean. +- `src/shared/types/api.ts`, `src/preload/index.ts` и `src/renderer/api/httpClient.ts` уже прокидывают `getMemberSpawnStatuses()` и `getTeamAgentRuntime()` через shared snapshot types. Новый контракт можно добавить optional fields без нового IPC channel, но browser HTTP fallback должен возвращать валидный старый shape. +- `TeamProvisioningService.readUnixProcessTableRows()` сейчас приватный, sync и читает только `pid,command`. Для надежного liveness нужен `ppid`, WSL-aware execution и unit-test seam. Это не должно оставаться приватным ad hoc helper внутри огромного service. +- `getLiveTeamAgentRuntimeMetadata()` сейчас читает tmux panes и process table внутри одного метода. После strict model там станет слишком много правил, поэтому план должен вынести pure resolution в отдельный helper/module. + +## Главная проблема + +Текущий `runtimeAlive` слишком широкий: + +```text +tmux pane exists +-> pane_pid is zsh/bash with low RSS +-> metadata.alive = true +-> MemberSpawnStatusEntry.runtimeAlive = true +-> grace timeout does not fail +-> UI shows starting/joining for minutes +``` + +Нужно прекратить использовать один boolean для разных уровней доверия. + +## Целевой контракт + +### Evidence ladder + +Сигналы должны оцениваться сверху вниз: + +1. `confirmed_bootstrap` + Member сделал `member_briefing`, `runtime_bootstrap_checkin`, `runtime_heartbeat`, meaningful inbox heartbeat или успешный bootstrap transcript. Это самый сильный сигнал. + +2. `runtime_process` + Найден процесс runtime с надежной идентичностью: `--team-name <teamName>` + `--agent-id <agentId>`, или OpenCode bridge вернул валидный `runtimePid`/`sessionId`, и PID жив. + +3. `runtime_process_candidate` + Найден non-shell descendant под tmux pane, но без строгого identity match. Это diagnostic signal, не strong alive signal в первой реализации. + +4. `permission_blocked` + Runtime/bridge явно говорит, что требуется permission approval. + +5. 
`shell_only` + Tmux pane жив, но foreground command или root pane process выглядит как shell, и runtime child не найден. + +6. `registered_only` + Member есть в `config.json`/`members.meta.json`, но live process не найден. + +7. `stale_metadata` + Есть persisted `agentId`, `tmuxPaneId` или `runtimePid`, но live evidence не подтвержден. + +8. `not_found` + Нет полезных runtime данных. + +### Strong vs weak + +Только эти состояния ставят `runtimeAlive: true`: + +- `confirmed_bootstrap` +- `runtime_process` + +Эти состояния не ставят `runtimeAlive: true`: + +- `runtime_process_candidate` +- `permission_blocked` +- `shell_only` +- `registered_only` +- `stale_metadata` +- `not_found` + +Почему `runtime_process_candidate` не strong: non-shell child может быть `node`, `script`, `sleep`, wrapper или одноразовая команда. Без `teamName/agentId/sessionId` это слишком рискованно для снятия failure. + +## Тайминги + +Оставить текущий `MEMBER_LAUNCH_GRACE_MS = 90_000` как короткий timeout для отсутствия strong evidence. + +Добавить отдельный bootstrap stall deadline: + +```ts +const MEMBER_BOOTSTRAP_STALL_MS = 5 * 60_000; +``` + +Правила: + +- После 90 секунд: + - `shell_only`, `registered_only`, `stale_metadata`, `not_found` -> `failed_to_start`. + - `permission_blocked` -> не hard fail, показать permission UI. + - `runtime_process_candidate` -> warning, но не считать ready. + - `runtime_process` -> warning `waiting for bootstrap`, но не hard fail на 90 сек. + +- После 5 минут: + - `runtime_process_candidate` без bootstrap -> `failed_to_start`. + - `runtime_process` без bootstrap -> `runtimeDiagnosticSeverity: "warning"` и launch banner должен перестать быть мутным: `runtime alive but no bootstrap after 5 min`. + +Важно: verified runtime process не надо сразу убивать или hard fail-ить только потому, что bootstrap не пришел. Но UI не должен продолжать generic `starting`. + +## Rollout mode + +Строгая модель меняет поведение launch timeout, поэтому ее надо включать контролируемо. 
+ +Топ 3 rollout вариантов: + +1. Diagnostics-only default, strict behind env flag + 🎯 9 🛡️ 9 🧠 5 Примерно 80-140 строк. + По умолчанию UI получает новые diagnostics, но `runtimeAlive` behavior остается старым. Strict включается через env для dogfood. Это самый безопасный путь для первого PR. + +2. Strict default сразу + 🎯 6 🛡️ 6 🧠 4 Примерно 40-80 строк. + Быстрее исправляет проблему, но риск false negative выше, если реальные teammate processes не содержат ожидаемые identity args. + +3. Полный app setting + env override + 🎯 8 🛡️ 8 🧠 7 Примерно 180-260 строк. + Удобно для пользователей, но это больше surface area: settings UI, persistence, migration, tests. Лучше после dogfood данных. + +Рекомендация: вариант 1. + +Добавить mode resolver рядом с team runtime кодом: + +```ts +export type TeamMemberLivenessMode = 'diagnostics' | 'strict'; + +export const CLAUDE_TEAM_MEMBER_LIVENESS_MODE_ENV = 'CLAUDE_TEAM_MEMBER_LIVENESS_MODE'; + +export function resolveTeamMemberLivenessModeFromEnv( + env: NodeJS.ProcessEnv = process.env +): TeamMemberLivenessMode { + const raw = env[CLAUDE_TEAM_MEMBER_LIVENESS_MODE_ENV]?.trim().toLowerCase(); + if (raw === 'strict') return 'strict'; + return 'diagnostics'; +} +``` + +Behavior by mode: + +| Area | `diagnostics` | `strict` | +| ------------------------------ | ---------------------------------------- | --------------------------- | +| `livenessKind` | filled | filled | +| UI labels | enabled | enabled | +| `runtimeAlive` from shell-only | old behavior may remain temporarily | always false | +| `already_running` shortcut | warning diagnostic, old fallback allowed | must verify strong evidence | +| timeout self-heal | old behavior | strong evidence only | +| launchDiagnostics | enabled | enabled | + +Important default: + +- In local dogfood, run with `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=strict`. +- Production default can stay `diagnostics` for one release if Phase 0 data is unknown. 
+- After manual scenarios pass, flip default to `strict` and keep env as rollback: `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=diagnostics`. + +This gives an emergency fallback without reverting the UI diagnostics work. + +## Structured launch diagnostics + +Файлы: + +- `src/shared/types/team.ts` +- `src/main/services/team/TeamProvisioningService.ts` +- `src/main/services/team/progressPayload.ts` +- `src/renderer/components/team/ProvisioningProgressBlock.tsx` + +`TeamProvisioningProgress` сейчас почти полностью строковый: + +- `message` +- `warnings` +- `cliLogsTail` +- `assistantOutput` + +`cliLogsTail` и `assistantOutput` уже специально ограничены (`PROGRESS_LOG_TAIL_LINES`, `PROGRESS_OUTPUT_TAIL_PARTS`), чтобы не провоцировать renderer OOM. Поэтому нельзя решать проблему "непонятно что происходит" простым расширением логов. + +Добавить маленький структурированный payload: + +```ts +export interface TeamLaunchDiagnosticItem { + id: string; + memberName?: string; + severity: 'info' | 'warning' | 'error'; + code: + | 'spawn_accepted' + | 'runtime_process_detected' + | 'runtime_process_candidate' + | 'tmux_shell_only' + | 'runtime_not_found' + | 'permission_pending' + | 'bootstrap_confirmed' + | 'bootstrap_stalled' + | 'stale_runtime_event_rejected' + | 'process_table_unavailable'; + label: string; + detail?: string; + observedAt: string; +} + +export interface TeamProvisioningProgress { + // existing fields... + launchDiagnostics?: TeamLaunchDiagnosticItem[]; +} +``` + +Bounded contract: + +- максимум 20 diagnostic items в progress payload; +- newest-first или stable sorted by severity/member; +- no raw unbounded command strings; +- process command must be sanitized/truncated; +- member-level details live in `MemberSpawnStatusEntry`/`TeamAgentRuntimeEntry`, progress diagnostics are only summary. + +Renderer: + +- `ProvisioningProgressBlock` can render a compact "Diagnostics" disclosure above Live output. 
+- It should show code-specific rows like `bob - shell only - tmux pane foreground command is zsh`. +- It should not require opening CLI logs to understand common stuck states. + +Recommended UI rows: + +```text +bob shell only tmux pane foreground command is zsh +jack waiting for bootstrap verified runtime process, no check-in yet +tom no runtime found spawn accepted 94s ago +``` + +This is separate from `Copy diagnostics`, which can include full sanitized JSON. + +## Типы + +Файл: `src/shared/types/team.ts` + +```ts +export type TeamAgentRuntimeLivenessKind = + | 'confirmed_bootstrap' + | 'runtime_process' + | 'runtime_process_candidate' + | 'permission_blocked' + | 'shell_only' + | 'registered_only' + | 'stale_metadata' + | 'not_found'; + +export type TeamAgentRuntimePidSource = + | 'lead_process' + | 'tmux_pane' + | 'tmux_child' + | 'agent_process_table' + | 'opencode_bridge' + | 'runtime_bootstrap' + | 'persisted_metadata'; + +export type TeamAgentRuntimeDiagnosticSeverity = 'info' | 'warning' | 'error'; + +export interface TeamAgentRuntimeEntry { + memberName: string; + alive: boolean; + restartable: boolean; + backendType?: TeamAgentRuntimeBackendType; + providerId?: TeamProviderId; + providerBackendId?: TeamProviderBackendId; + laneId?: string; + laneKind?: 'primary' | 'secondary'; + pid?: number; + runtimeModel?: string; + rssBytes?: number; + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + processCommand?: string; + paneId?: string; + panePid?: number; + paneCurrentCommand?: string; + runtimePid?: number; + runtimeSessionId?: string; + runtimeLeaseExpiresAt?: string; + runtimeLastSeenAt?: string; + runtimeDiagnostic?: string; + runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; + diagnostics?: string[]; + updatedAt: string; +} +``` + +В `MemberSpawnStatusEntry` добавить только компактные поля для launch UI: + +```ts +export interface MemberSpawnStatusEntry { + // existing fields + runtimeDiagnostic?: 
string; + runtimeDiagnosticSeverity?: 'info' | 'warning' | 'error'; + livenessKind?: TeamAgentRuntimeLivenessKind; + livenessLastCheckedAt?: string; +} +``` + +Почему `runtimeSessionId` и `runtimeLastSeenAt` важны: + +- OpenCode runtime tools всегда передают `runtimeSessionId`. +- `runtime_bootstrap_checkin` и `runtime_heartbeat` уже являются lease-like сигналом. +- Без `runtimeLastSeenAt` UI не сможет отличить "процесс подтвержден 10 секунд назад" от "persisted state висит со вчера". +- `runtimeLeaseExpiresAt` можно не включать в Phase 0, но тип стоит заложить сразу, если lease/heartbeat manager будет Phase 5. + +## Runtime tool metadata + +Файлы: + +- `mcp-server/src/tools/runtimeTools.ts` +- `src/main/services/team/TeamProvisioningService.ts` + +`runtime_bootstrap_checkin` и `runtime_heartbeat` уже принимают `metadata`, но main сейчас не извлекает из нее ничего. Это упущение: OpenCode/runtime может передать полезные low-level детали, которые не стоит парсить из logs. + +Поддержать bounded metadata: + +```ts +interface RuntimeToolMetadata { + runtimePid?: number; + processCommand?: string; + runtimeVersion?: string; + hostPid?: number; + cwd?: string; +} +``` + +Parser: + +```ts +function parseRuntimeToolMetadata(value: unknown): RuntimeToolMetadata { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return {}; + } + const raw = value as Record<string, unknown>; + const runtimePid = + typeof raw.runtimePid === 'number' && Number.isFinite(raw.runtimePid) && raw.runtimePid > 0 + ? Math.trunc(raw.runtimePid) + : undefined; + const processCommand = + typeof raw.processCommand === 'string' ? raw.processCommand.slice(0, 500) : undefined; + return { + ...(runtimePid ? { runtimePid } : {}), + ...(processCommand ? { processCommand } : {}), + }; +} +``` + +Security/robustness: + +- bound string lengths; +- ignore nested objects except allowlisted fields; +- never put raw metadata into logs/UI; +- include sanitized fields in copy diagnostics. 
+ +`updateOpenCodeRuntimeMemberLiveness()` should accept sanitized metadata: + +```ts +await this.updateOpenCodeRuntimeMemberLiveness({ + teamName, + runId, + memberName, + runtimeSessionId, + observedAt, + diagnostics: payload.diagnostics, + metadata: parseRuntimeToolMetadata(payload.metadata), + reason: 'OpenCode runtime bootstrap check-in accepted', +}); +``` + +If metadata has `runtimePid`, still verify it: + +- PID must be alive now; +- command must still look like the expected runtime, if command info is available; +- runId/teamName/sessionId must match current tombstone/launch state. + +Do not trust metadata PID by itself. + +## Internal metadata + +Файл: `src/main/services/team/TeamProvisioningService.ts` + +Расширить внутренний тип: + +```ts +interface LiveTeamAgentRuntimeMetadata { + alive: boolean; + backendType?: TeamAgentRuntimeBackendType; + agentId?: string; + pid?: number; + metricsPid?: number; + model?: string; + tmuxPaneId?: string; + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + processCommand?: string; + panePid?: number; + paneCurrentCommand?: string; + runtimeSessionId?: string; + diagnostics?: string[]; +} +``` + +Helper: + +```ts +function isStrongRuntimeEvidence(metadata: LiveTeamAgentRuntimeMetadata | undefined): boolean { + return ( + metadata?.livenessKind === 'confirmed_bootstrap' || metadata?.livenessKind === 'runtime_process' + ); +} + +function isWeakRuntimeEvidence(metadata: LiveTeamAgentRuntimeMetadata | undefined): boolean { + return ( + metadata?.livenessKind === 'runtime_process_candidate' || + metadata?.livenessKind === 'permission_blocked' || + metadata?.livenessKind === 'shell_only' || + metadata?.livenessKind === 'registered_only' || + metadata?.livenessKind === 'stale_metadata' || + metadata?.livenessKind === 'not_found' + ); +} +``` + +## Liveness resolver seam + +Файл: `src/main/services/team/TeamRuntimeLivenessResolver.ts` + +Не стоит держать весь liveness algorithm внутри 
`TeamProvisioningService`. Там уже смешаны launch state, persistence, progress, tmux, OpenCode, inbox audit и runtime snapshot. Для надежности и тестов лучше вынести pure resolver. + +Варианты: + +1. Вынести только pure helpers + 🎯 8 🛡️ 7 🧠 4 Примерно 120-180 строк. + Быстро, но `getLiveTeamAgentRuntimeMetadata()` останется большим orchestration методом. + +2. Вынести resolver с input/output контрактом + 🎯 9 🛡️ 9 🧠 6 Примерно 220-340 строк. + Лучший баланс: service собирает raw facts, resolver принимает facts и возвращает `LiveTeamAgentRuntimeMetadata`. + +3. Вынести полноценный runtime monitor service + 🎯 8 🛡️ 10 🧠 8 Примерно 500-800 строк. + Архитектурно чище, но слишком большой шаг для текущего фикса. + +Рекомендация: вариант 2. + +Resolver input: + +```ts +export interface ResolveTeamMemberRuntimeLivenessInput { + teamName: string; + memberName: string; + agentId?: string; + backendType?: TeamAgentRuntimeBackendType; + providerId?: TeamProviderId; + tmuxPaneId?: string; + persistedRuntimePid?: number; + persistedRuntimeSessionId?: string; + trackedSpawnStatus?: MemberSpawnStatusEntry; + openCodeEvidence?: TeamRuntimeMemberLaunchEvidence; + pane?: TmuxPaneRuntimeInfo; + processRows: readonly RuntimeProcessTableRow[]; + nowIso: string; +} +``` + +Resolver output: + +```ts +export interface ResolvedTeamMemberRuntimeLiveness { + alive: boolean; + livenessKind: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + pid?: number; + metricsPid?: number; + panePid?: number; + paneCurrentCommand?: string; + processCommand?: string; + runtimeSessionId?: string; + runtimeDiagnostic: string; + runtimeDiagnosticSeverity: TeamAgentRuntimeDiagnosticSeverity; + diagnostics: string[]; +} +``` + +`TeamProvisioningService` responsibilities after extraction: + +- read config/meta/persisted launch/runtime state; +- batch-read tmux pane runtime info once; +- batch-read process table once; +- call resolver per member; +- cache and expose the resolved metadata; +- 
invalidate caches on check-in/heartbeat/restart/stop/pane kill. + +Resolver responsibilities: + +- classify shell-only vs runtime process vs candidate; +- enforce strong/weak evidence rules; +- choose `pidSource`; +- sanitize diagnostics; +- never read filesystem, tmux, process table or stores directly. + +This seam makes the hardest rules unit-testable without spawning tmux or fake processes. + +## Tmux runtime info + +Файл: `src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts` + +Сейчас читается только pane PID. Нужно получать больше контекста: + +```ts +export interface TmuxPaneRuntimeInfo { + paneId: string; + panePid: number; + currentCommand?: string; + currentPath?: string; + sessionName?: string; + windowName?: string; +} + +async listPaneRuntimeInfo(paneIds: readonly string[]): Promise<Map<string, TmuxPaneRuntimeInfo>> { + const normalizedPaneIds = [...new Set(paneIds.map((paneId) => paneId.trim()).filter(Boolean))]; + if (normalizedPaneIds.length === 0) return new Map(); + + const format = [ + '#{pane_id}', + '#{pane_pid}', + '#{pane_current_command}', + '#{pane_current_path}', + '#{session_name}', + '#{window_name}', + ].join('\t'); + + const result = await this.execTmux(['list-panes', '-a', '-F', format], 3_000); + if (result.exitCode !== 0) { + throw new Error(result.stderr || 'Failed to list tmux panes'); + } + + const wanted = new Set(normalizedPaneIds); + const infoByPaneId = new Map<string, TmuxPaneRuntimeInfo>(); + + for (const line of result.stdout.split('\n')) { + const [paneId = '', rawPid = '', currentCommand = '', currentPath = '', sessionName = '', windowName = ''] = + line.split('\t'); + const normalizedPaneId = paneId.trim(); + if (!wanted.has(normalizedPaneId)) continue; + + const panePid = Number.parseInt(rawPid.trim(), 10); + if (!Number.isFinite(panePid) || panePid <= 0) continue; + + infoByPaneId.set(normalizedPaneId, { + paneId: normalizedPaneId, + panePid, + currentCommand: currentCommand.trim() || undefined, + currentPath: currentPath.trim() || undefined, + 
sessionName: sessionName.trim() || undefined, + windowName: windowName.trim() || undefined, + }); + } + + return infoByPaneId; +} +``` + +Оставить старый метод как wrapper: + +```ts +async listPanePids(paneIds: readonly string[]): Promise<Map<string, number>> { + const info = await this.listPaneRuntimeInfo(paneIds); + return new Map([...info.entries()].map(([paneId, pane]) => [paneId, pane.panePid])); +} +``` + +Compatibility rule: + +- `listPanePids()` remains "does this pane exist and what is its root pane PID". +- It must not imply teammate runtime liveness. +- Existing callers like `waitForTmuxPanesToExit()` should keep working without knowing about `livenessKind`. + +## Process table + +Нужен `ppid`, иначе невозможно понять, есть ли runtime child под tmux pane. + +```ts +interface RuntimeProcessTableRow { + pid: number; + ppid: number; + command: string; +} +``` + +Do not implement this as `readUnixProcessTableRows()` inside `TeamProvisioningService`. The current helper is private, sync and native-only. The strict model needs a testable, platform-aware provider. + +Recommended shape: + +```ts +export interface RuntimeProcessTableProvider { + listRuntimeProcesses(): Promise<RuntimeProcessTableRow[]>; +} +``` + +`TmuxPlatformCommandExecutor` can implement it because it already knows whether the current tmux runtime is native or WSL-backed. + +На macOS/Linux: + +```ts +ps -ax -o pid=,ppid=,command= +``` + +На Windows/WSL важно: `ps` должен выполняться внутри той же WSL distro, где выполняется tmux. Host-side Windows `ps` не увидит Linux children. + +Практичный вариант: + +- добавить в `TmuxPlatformCommandExecutor` метод `listRuntimeProcesses()`; +- внутри Windows ветки использовать `TmuxWslService` и запускать `wsl -d <distro> -e ps -ax -o pid=,ppid=,command=`; +- на native платформах использовать обычный `execFile('ps', ...)`. 
+ +Пример парсинга: + +```ts +function parseRuntimeProcessTable(output: string): RuntimeProcessTableRow[] { + const rows: RuntimeProcessTableRow[] = []; + + for (const line of output.split('\n')) { + const match = /^\s*(\d+)\s+(\d+)\s+(.*)$/.exec(line); + if (!match) continue; + + const pid = Number.parseInt(match[1], 10); + const ppid = Number.parseInt(match[2], 10); + const command = match[3]?.trim() ?? ''; + + if (Number.isFinite(pid) && pid > 0 && Number.isFinite(ppid) && command) { + rows.push({ pid, ppid, command }); + } + } + + return rows; +} +``` + +Performance contract: + +- read process table once per runtime snapshot, not once per member; +- reuse the same rows for every member resolver call; +- respect the existing backend cache TTL around 2 seconds; +- if process table read fails, return an explicit diagnostic and do not mark shell-only as strong alive. + +Failure contract: + +- `process_table_unavailable` is a warning, not an immediate hard fail by itself; +- if tmux pane info is available but process table is unavailable, classify as `shell_only` only when `pane_current_command` is shell-like; +- if both tmux and process table are unavailable, classify as `stale_metadata` or `not_found` based on persisted evidence; +- do not self-clear a previous failure on provider failure. + +### PID freshness and reuse + +PID alone is not identity. A stale persisted `runtimePid` can be reused by the OS for another process. + +Rules: + +- Never treat persisted PID as strong evidence without reading the current process table. +- A PID match is strong only if current command identity also matches expected runtime identity. +- If possible later, add process start time to the table and compare it with `firstSpawnAcceptedAt`/`runtimeLastSeenAt`. +- If process start time is unavailable, use command identity and current run/session identity as the minimum. 
+ +Optional future row: + +```ts +interface RuntimeProcessTableRow { + pid: number; + ppid: number; + command: string; + startedAtMs?: number; +} +``` + +Do not block Phase 1 on `startedAtMs`; block it on "no PID-only strong evidence". + +## Shell detection + +```ts +const SHELL_COMMAND_NAMES = new Set(['sh', 'bash', 'zsh', 'fish', 'dash', 'login', 'tmux']); + +function basenameCommand(command: string | undefined): string { + const firstToken = command?.trim().split(/\s+/, 1)[0] ?? ''; + const base = firstToken.split(/[\\/]/).pop() ?? firstToken; + return base.replace(/^-/, '').toLowerCase(); +} + +function isShellLikeCommand(command: string | undefined): boolean { + return SHELL_COMMAND_NAMES.has(basenameCommand(command)); +} +``` + +## Runtime identity matching + +Текущий `commandContainsCliArgValue()` лучше заменить на helper, который поддерживает оба вида: + +- `--agent-id abc` +- `--agent-id=abc` +- quoted values + +Минимально: + +```ts +function extractCliArgValues(command: string, argName: string): string[] { + const escapedArg = argName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const pattern = new RegExp( + `(?:^|\\s)${escapedArg}(?:=|\\s+)("([^"]*)"|'([^']*)'|([^\\s]+))`, + 'g' + ); + + const values: string[] = []; + for (const match of command.matchAll(pattern)) { + const value = (match[2] ?? match[3] ?? match[4] ?? 
'').trim(); + if (value) values.push(value); + } + return values; +} + +function commandArgEquals(command: string, argName: string, expected: string | undefined): boolean { + if (!expected?.trim()) return false; + return extractCliArgValues(command, argName).some((value) => value === expected.trim()); +} +``` + +Strong process match: + +```ts +function isVerifiedRuntimeProcess(params: { + row: RuntimeProcessTableRow; + teamName: string; + agentId?: string; +}): boolean { + return ( + commandArgEquals(params.row.command, '--team-name', params.teamName) && + commandArgEquals(params.row.command, '--agent-id', params.agentId) + ); +} +``` + +Sanitize any command before it reaches UI/logs/copy diagnostics: + +```ts +const SECRET_FLAG_PATTERN = + /(--(?:api-key|token|password|secret|authorization|auth-token)(?:=|\s+))("[^"]*"|'[^']*'|\S+)/gi; + +function sanitizeProcessCommandForDiagnostics(command: string | undefined): string | undefined { + const trimmed = command?.trim(); + if (!trimmed) return undefined; + return trimmed.replace(SECRET_FLAG_PATTERN, '$1[redacted]').slice(0, 500); +} +``` + +Do not use sanitized commands for identity matching. Match on the raw process table row inside main process memory, then only expose sanitized/truncated command text. + +## Descendant resolution + +```ts +function collectDescendants( + rows: readonly RuntimeProcessTableRow[], + rootPid: number +): RuntimeProcessTableRow[] { + const childrenByParent = new Map<number, RuntimeProcessTableRow[]>(); + + for (const row of rows) { + const bucket = childrenByParent.get(row.ppid) ?? []; + bucket.push(row); + childrenByParent.set(row.ppid, bucket); + } + + const result: RuntimeProcessTableRow[] = []; + const queue = [...(childrenByParent.get(rootPid) ?? [])]; + + while (queue.length > 0) { + const next = queue.shift(); + if (!next) continue; + result.push(next); + queue.push(...(childrenByParent.get(next.pid) ?? 
[])); + } + + return result; +} +``` + +Resolution: + +```ts +interface ResolvedRuntimeProcess { + kind: TeamAgentRuntimeLivenessKind; + pid?: number; + command?: string; + pidSource?: TeamAgentRuntimePidSource; + diagnostics: string[]; +} + +function resolveTmuxRuntimeProcess(params: { + teamName: string; + agentId?: string; + pane: TmuxPaneRuntimeInfo; + rows: readonly RuntimeProcessTableRow[]; +}): ResolvedRuntimeProcess { + const descendants = collectDescendants(params.rows, params.pane.panePid); + + const verified = descendants.find((row) => + isVerifiedRuntimeProcess({ + row, + teamName: params.teamName, + agentId: params.agentId, + }) + ); + + if (verified) { + return { + kind: 'runtime_process', + pid: verified.pid, + command: verified.command, + pidSource: 'tmux_child', + diagnostics: ['matched tmux descendant by team-name and agent-id'], + }; + } + + const candidate = descendants.find((row) => !isShellLikeCommand(row.command)); + if (candidate) { + return { + kind: 'runtime_process_candidate', + pid: candidate.pid, + command: candidate.command, + pidSource: 'tmux_child', + diagnostics: ['found non-shell descendant without team/member identity'], + }; + } + + if (isShellLikeCommand(params.pane.currentCommand)) { + return { + kind: 'shell_only', + pid: params.pane.panePid, + command: params.pane.currentCommand, + pidSource: 'tmux_pane', + diagnostics: [ + `tmux pane is alive, but foreground command is ${params.pane.currentCommand}`, + 'no verified runtime descendant process was found', + ], + }; + } + + return { + kind: 'not_found', + diagnostics: ['tmux pane exists, but no runtime process could be identified'], + }; +} +``` + +## OpenCode bridge correction + +Файл: `src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts` + +Сейчас `pendingRuntimeObserved = createdOrBlocked && runtimeMaterialized`, а `runtimeMaterialized` фактически означает "bridge вернул member". Это не равно live runtime. 
+ +Надо разделить: + +- `agentToolAccepted`: bridge принял/создал member. +- `runtimeAlive`: есть подтвержденный live runtime signal или confirmed bootstrap. +- `bootstrapConfirmed`: `launchState === "confirmed_alive"`. + +Пример: + +```ts +function mapBridgeMemberToRuntimeEvidence( + memberName: string, + launchState: OpenCodeTeamMemberLaunchBridgeState, + sessionId: string | undefined, + runtimePid: number | undefined, + pendingPermissionRequestIds: string[] | undefined, + runtimeMaterialized: boolean, + diagnostics: string[] +): TeamRuntimeMemberLaunchEvidence { + const confirmed = launchState === 'confirmed_alive'; + const failed = launchState === 'failed'; + const permissionBlocked = launchState === 'permission_blocked'; + const validRuntimePid = + typeof runtimePid === 'number' && Number.isFinite(runtimePid) && runtimePid > 0; + const hasRuntimeSession = typeof sessionId === 'string' && sessionId.trim().length > 0; + const runtimeLivenessKind = confirmed + ? 'confirmed_bootstrap' + : validRuntimePid + ? 'runtime_process' + : permissionBlocked + ? 'permission_blocked' + : hasRuntimeSession + ? 'runtime_process_candidate' + : undefined; + + return { + memberName, + providerId: 'opencode', + launchState: failed + ? 'failed_to_start' + : confirmed + ? 'confirmed_alive' + : permissionBlocked + ? 'runtime_pending_permission' + : 'runtime_pending_bootstrap', + agentToolAccepted: confirmed || runtimeMaterialized, + runtimeAlive: confirmed || validRuntimePid, + bootstrapConfirmed: confirmed, + hardFailure: failed, + hardFailureReason: failed ? 'OpenCode bridge reported member launch failure' : undefined, + pendingPermissionRequestIds: + pendingPermissionRequestIds && pendingPermissionRequestIds.length > 0 + ? [...new Set(pendingPermissionRequestIds)] + : undefined, + sessionId, + ...(validRuntimePid ? { runtimePid } : {}), + ...(runtimeLivenessKind ? 
{ livenessKind: runtimeLivenessKind } : {}), + diagnostics, + }; +} +``` + +Важно: `sessionId` без `runtimePid` лучше считать candidate, а не strong live process. Session id полезен для delivery/permission correlation, но сам по себе не доказывает, что процесс сейчас жив. + +Также `toOpenCodePersistedLaunchMember()` должен сохранять `runtimePid` и `sessionId`, если они есть. Сейчас для primary OpenCode launch evidence это легко потерять. + +### OpenCode transaction/readiness invariant + +`canMarkOpenCodeRunReady()` уже требует `bootstrap_confirmed`, поэтому новая liveness-модель не должна поднимать aggregate state в `clean_success`, если есть только: + +- bridge `created`; +- `sessionId` без bootstrap; +- permission request; +- stale launch-state member. + +Regression test: + +```ts +expect( + canMarkOpenCodeRunReady({ + members: [{ name: 'bob', launchState: 'runtime_pending_bootstrap' }], + // checkpoints exist except bootstrap + }).ok +).toBe(false); +``` + +### Stale runtime events + +`assertOpenCodeRuntimeEvidenceAccepted()` already checks tombstones/current run ownership before accepting bootstrap/heartbeat/delivery evidence. This must remain the gate for all strong OpenCode liveness. + +Rules: + +- `runtime_bootstrap_checkin` from an old `runId` must not revive a stopped/relaunched member. +- `runtime_heartbeat` from an old lane must not refresh `runtimeLastSeenAt`. +- Runtime metadata from rejected evidence must not be written to persisted launch state. +- UI copy diagnostics should include `runId` and `runtimeSessionId` only after accepted evidence. + +Regression tests: + +```ts +await expect( + service.recordOpenCodeRuntimeHeartbeat({ + teamName, + runId: oldRunId, + memberName: 'bob', + runtimeSessionId: oldSessionId, + }) +).rejects.toThrow(); +``` + +## `getLiveTeamAgentRuntimeMetadata()` + +Новая логика: + +1. Сначала читать durable status: + - `bootstrapConfirmed` + - `lastHeartbeatAt` + - `runtime_bootstrap_checkin` + - transcript success + +2. 
Потом читать verified runtime: + - process table match by `--team-name` + `--agent-id` + - OpenCode runtimePid/sessionId + - tmux descendant with verified identity + +3. Потом diagnostic-only: + - tmux pane shell + - config/meta registration + - stale persisted metadata + +Sketch: + +```ts +const status = this.findTrackedMemberSpawnStatus(run, memberName); +const diagnostics: string[] = []; + +let livenessKind: TeamAgentRuntimeLivenessKind = 'not_found'; +let pid: number | undefined; +let pidSource: TeamAgentRuntimePidSource | undefined; +let processCommand: string | undefined; + +if (status?.bootstrapConfirmed === true) { + livenessKind = 'confirmed_bootstrap'; + diagnostics.push('bootstrap was confirmed by member check-in or heartbeat'); +} + +if (livenessKind !== 'confirmed_bootstrap' && metadata.agentId) { + const processPid = processPidByAgentId.get(metadata.agentId); + if (processPid) { + livenessKind = 'runtime_process'; + pid = processPid; + pidSource = 'agent_process_table'; + diagnostics.push('matched process table by team-name and agent-id'); + } +} + +if (livenessKind !== 'runtime_process' && paneInfo) { + const resolved = resolveTmuxRuntimeProcess({ + teamName, + agentId: metadata.agentId, + pane: paneInfo, + rows: processRows, + }); + + livenessKind = resolved.kind; + pid = resolved.pid; + pidSource = resolved.pidSource; + processCommand = resolved.command; + diagnostics.push(...resolved.diagnostics); +} + +if (livenessKind === 'not_found' && metadata.agentId) { + livenessKind = 'stale_metadata'; + diagnostics.push('persisted agent id exists, but no live process matched it'); +} + +const alive = livenessKind === 'confirmed_bootstrap' || livenessKind === 'runtime_process'; + +metadataByMember.set(memberName, { + ...metadata, + alive, + livenessKind, + ...(pid ? { pid } : {}), + ...(pidSource ? { pidSource } : {}), + ...(processCommand ? { processCommand } : {}), + ...(paneInfo + ? 
{ + panePid: paneInfo.panePid, + paneCurrentCommand: paneInfo.currentCommand, + } + : {}), + diagnostics, +}); +``` + +Fallback policy: + +- Если enhanced tmux info failed, не возвращать `alive: true` только из старого `panePid`. +- Если `ps` failed, показывать diagnostic `process table unavailable`; не self-clear failure. +- Если cached metadata есть, сохранять `model/backendType`, но не сохранять stale `alive`. +- Если `previousMember.bootstrapConfirmed === true`, persisted launch state может оставаться confirmed для истории, но runtime snapshot должен показывать `alive` отдельно от historical `bootstrapConfirmed`. Иначе UI может считать старого member live после stop/relaunch. + +## Persisted launch state + +Файл: `src/main/services/team/TeamLaunchStateEvaluator.ts` + +Сейчас `RuntimeMemberSpawnState` и persisted member normalization не знают про новые diagnostic поля. Нужно расширить аккуратно, чтобы старые snapshots читались без migration. + +Добавить в `PersistedTeamLaunchMemberState`: + +```ts +runtimeSessionId?: string; +livenessKind?: TeamAgentRuntimeLivenessKind; +pidSource?: TeamAgentRuntimePidSource; +runtimeDiagnostic?: string; +runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; +``` + +Правило: + +- persisted `livenessKind` можно использовать для UI explanation; +- persisted `livenessKind` нельзя использовать как live proof без свежего `lastRuntimeAliveAt` или live runtime check. + +Normalize: + +```ts +function normalizeLivenessKind(value: unknown): TeamAgentRuntimeLivenessKind | undefined { + return value === 'confirmed_bootstrap' || + value === 'runtime_process' || + value === 'runtime_process_candidate' || + value === 'permission_blocked' || + value === 'shell_only' || + value === 'registered_only' || + value === 'stale_metadata' || + value === 'not_found' + ? 
value + : undefined; +} +``` + +`updateOpenCodeRuntimeMemberLiveness()` должен сохранять: + +```ts +livenessKind: 'confirmed_bootstrap', +pidSource: 'runtime_bootstrap', +runtimeSessionId: input.runtimeSessionId, +runtimeDiagnostic: undefined, +runtimeDiagnosticSeverity: undefined, +``` + +`toOpenCodePersistedLaunchMember()` должен сохранять: + +```ts +runtimePid: evidence?.runtimePid, +runtimeSessionId: evidence?.sessionId, +livenessKind: evidence?.bootstrapConfirmed + ? 'confirmed_bootstrap' + : evidence?.runtimeAlive + ? 'runtime_process' + : evidence?.pendingPermissionRequestIds?.length + ? 'permission_blocked' + : undefined, +``` + +Mapping functions that must be updated: + +- `RuntimeMemberSpawnState` pick list must include `livenessKind`, `runtimeDiagnostic`, `runtimeDiagnosticSeverity`. +- `snapshotFromRuntimeMemberStatuses()` must copy those fields into `PersistedTeamLaunchMemberState`. +- `snapshotToMemberSpawnStatuses()` must copy them back into `MemberSpawnStatusEntry`. +- `normalizePersistedLaunchSnapshot()` must normalize unknown old files without dropping valid new fields. + +Example: + +```ts +statuses[memberName] = { + status, + launchState: entry.launchState, + error: entry.hardFailure ? 
entry.hardFailureReason : undefined, + hardFailureReason: entry.hardFailureReason, + livenessSource, + agentToolAccepted: entry.agentToolAccepted, + runtimeAlive: entry.runtimeAlive, + bootstrapConfirmed: entry.bootstrapConfirmed, + hardFailure: entry.hardFailure, + pendingPermissionRequestIds: entry.pendingPermissionRequestIds, + firstSpawnAcceptedAt: entry.firstSpawnAcceptedAt, + lastHeartbeatAt: entry.lastHeartbeatAt, + livenessKind: entry.livenessKind, + runtimeDiagnostic: entry.runtimeDiagnostic, + runtimeDiagnosticSeverity: entry.runtimeDiagnosticSeverity, + updatedAt: entry.lastEvaluatedAt, +}; +``` + +Backward compatibility: + +- old snapshots without these fields should behave exactly as today; +- new optional summary counts should default to `0` at presentation time; +- do not bump snapshot `version` unless a required field is introduced. For this plan, keep `version: 2`. + +## `attachLiveRuntimeMetadataToStatuses()` + +Текущий behavior: + +```ts +if (metadata.alive) { + nextEntry.runtimeAlive = true; + nextEntry.livenessSource = 'process'; +} +``` + +Новый behavior: + +```ts +const strongRuntimeAlive = isStrongRuntimeEvidence(metadata); +const weakRuntimeEvidence = isWeakRuntimeEvidence(metadata); + +if ( + strongRuntimeAlive && + current.hardFailure !== true && + current.launchState !== 'failed_to_start' +) { + nextEntry.status = 'online'; + nextEntry.agentToolAccepted = true; + nextEntry.runtimeAlive = true; + nextEntry.hardFailure = false; + nextEntry.hardFailureReason = undefined; + nextEntry.error = undefined; + nextEntry.livenessSource = current.bootstrapConfirmed ? 
current.livenessSource : 'process'; + nextEntry.livenessKind = metadata.livenessKind; + nextEntry.runtimeDiagnostic = undefined; + nextEntry.runtimeDiagnosticSeverity = undefined; + nextEntry.launchState = deriveMemberLaunchState(nextEntry); +} + +if (weakRuntimeEvidence && current.bootstrapConfirmed !== true) { + nextEntry.runtimeAlive = false; + nextEntry.livenessKind = metadata.livenessKind; + nextEntry.runtimeDiagnostic = buildRuntimeDiagnostic(metadata); + nextEntry.runtimeDiagnosticSeverity = metadata.livenessKind === 'shell_only' ? 'warning' : 'info'; +} +``` + +Self-heal из `failed_to_start` оставить только для strong evidence: + +```ts +if ( + strongRuntimeAlive && + current.launchState === 'failed_to_start' && + isAutoClearableLaunchFailureReason(failureReason) +) { + // clear auto failure +} +``` + +## Spawn tool result handling + +Файл: `src/main/services/team/TeamProvisioningService.ts` + +`handleMemberSpawnToolResult()` сейчас содержит shortcut: + +```ts +if (parsedStatus.reason === 'already_running') { + this.setMemberSpawnStatus(run, spawnedMemberName, 'online', undefined, 'process'); +} +``` + +В strict liveness модели это опасно: `already_running` доказывает, что runtime/CLI отказался дублировать spawn, но не доказывает, что нужный teammate сейчас прошел bootstrap или что текущий pane PID является runtime процессом. + +Новая логика: + +```ts +if (parsedStatus.reason === 'already_running') { + this.agentRuntimeSnapshotCache.delete(run.teamName); + this.liveTeamAgentRuntimeMetadataCache.delete(run.teamName); + const runtime = await this.findStrongRuntimeEvidenceForMember(run.teamName, spawnedMemberName); + if (isStrongRuntimeEvidence(runtime)) { + this.setMemberSpawnStatus(run, spawnedMemberName, 'online', undefined, 'process'); + } else { + this.setMemberSpawnStatus(run, spawnedMemberName, 'waiting'); + this.setMemberRuntimeDiagnostic(run, spawnedMemberName, { + livenessKind: runtime?.livenessKind ?? 
'registered_only', + message: 'Runtime reported already running, but no verified member process was found yet.', + severity: 'warning', + }); + } +} +``` + +Tests: + +- `already_running` + shell-only pane -> stays pending/warning, no `runtimeAlive`. +- `already_running` + verified process -> can become `runtime_pending_bootstrap`. +- `already_running` + confirmed bootstrap -> confirmed alive. + +## `reevaluateMemberLaunchStatus()` + +Текущий early return по `refreshed.runtimeAlive` слишком широкий. + +Новый sketch: + +```ts +await this.refreshMemberSpawnStatusesFromLeadInbox(run); +await this.maybeAuditMemberSpawnStatuses(run, { force: true }); + +const refreshed = run.memberSpawnStatuses.get(memberName); +if (!refreshed) return; + +const runtimeByMember = await this.getLiveTeamAgentRuntimeMetadata(run.teamName); +const runtime = findRuntimeMetadataForMember(runtimeByMember, memberName); +const strongRuntimeAlive = isStrongRuntimeEvidence(runtime); + +if (refreshed.launchState === 'failed_to_start' || refreshed.launchState === 'confirmed_alive') { + return; +} + +if (strongRuntimeAlive) { + this.setMemberRuntimeDiagnostic(run, memberName, { + livenessKind: runtime?.livenessKind, + message: 'Runtime process is alive, waiting for teammate bootstrap/check-in.', + severity: 'warning', + }); + return; +} + +if (runtime?.livenessKind === 'permission_blocked') { + return; +} + +const reason = + runtime?.livenessKind === 'shell_only' + ? `Teammate did not join within the launch grace window. Tmux pane is alive, but only shell command "${runtime.paneCurrentCommand ?? 'unknown'}" was detected.` + : runtime?.livenessKind === 'runtime_process_candidate' + ? 'Teammate did not confirm bootstrap. Only an unverified runtime process candidate was found.' 
+ : 'Teammate did not join within the launch grace window.'; + +this.setMemberSpawnStatus(run, memberName, 'error', reason); +``` + +Для `runtime_process_candidate` лучше использовать 5 минут, не 90 секунд: + +```ts +const acceptedAtMs = Date.parse(refreshed.firstSpawnAcceptedAt ?? ''); +const elapsedMs = Number.isFinite(acceptedAtMs) ? Date.now() - acceptedAtMs : 0; +if ( + runtime?.livenessKind === 'runtime_process_candidate' && + elapsedMs < MEMBER_BOOTSTRAP_STALL_MS +) { + return; +} +``` + +## Runtime snapshot and memory display + +`getTeamAgentRuntimeSnapshot()` сейчас выбирает `rssPid = liveRuntimeMember?.pid ?? liveRuntimeMember?.metricsPid`. Это нормально для сбора метрики, но UI должен знать источник. + +Правило: + +- `pidSource = tmux_pane` + `livenessKind = shell_only` -> memory is shell/pane RSS, не runtime RSS. +- `pidSource = tmux_child` или `agent_process_table` -> memory is runtime process RSS. +- OpenCode shared host `metricsPid` -> показать как shared host, не как member-owned runtime. +- `launchSnapshotAlive` сейчас может сделать `alive: true`, если persisted launch member был `runtimeAlive` или `bootstrapConfirmed`. После изменения это надо разделить: + - `historicallyConfirmedBootstrap` - для display/history. + - `alive` - только свежий live runtime или свежий heartbeat lease. + +Добавить в `TeamAgentRuntimeEntry`: + +```ts +runtimeDiagnostic?: string; +pidSource?: TeamAgentRuntimePidSource; +paneCurrentCommand?: string; +historicalBootstrapConfirmed?: boolean; +runtimeLastSeenAt?: string; +``` + +UI tooltip может объяснить: + +```text +RSS source: tmux pane shell +PID: 26691 +Command: zsh +Runtime process: not found +Bootstrap: no check-in yet +``` + +## Restartability semantics + +Файлы: + +- `src/main/services/team/TeamProvisioningService.ts` +- `src/renderer/components/team/members/MemberDetailDialog.tsx` + +Важно не смешать `alive` и `restartable`. 
+ +`shell_only` должен быть `alive: false`, но часто должен оставаться `restartable: true`, если есть `tmuxPaneId`. Иначе пользователь увидит `shell only`, но не сможет нажать Restart. + +Rules: + +- `confirmed_bootstrap` / `runtime_process` with member-owned PID -> `alive: true`, `restartable: true`. +- `shell_only` with `tmuxPaneId` -> `alive: false`, `restartable: true`, restart kills pane. +- `registered_only` without PID/pane -> `alive: false`, `restartable: false`. +- OpenCode shared host `metricsPid` -> `restartable: false` unless adapter owns a member lane stop/restart path. +- `in-process` -> keep `restartable: false`. + +`restartMember()` already kills persisted tmux panes via `killTmuxPaneForCurrentPlatformSync(paneId)`, so strict liveness should not remove pane ids from runtime snapshot just because they are weak evidence. + +Test: + +```ts +expect(shellOnlyRuntimeEntry).toMatchObject({ + alive: false, + restartable: true, + livenessKind: 'shell_only', + pidSource: 'tmux_pane', +}); +``` + +## IPC/store implications + +Файлы: + +- `src/main/ipc/teams.ts` +- `src/renderer/store/index.ts` +- `src/renderer/store/slices/teamSlice.ts` +- `src/renderer/components/team/TeamDetailView.tsx` + +IPC уже возвращает `TeamAgentRuntimeSnapshot`, значит новый контракт проходит без нового channel. 
Но store equality обязательно надо обновить: + +```ts +function areTeamAgentRuntimeEntriesEqual( + left: TeamAgentRuntimeEntry | undefined, + right: TeamAgentRuntimeEntry | undefined +): boolean { + if (left === right) return true; + if (!left || !right) return left === right; + return ( + left.memberName === right.memberName && + left.alive === right.alive && + left.restartable === right.restartable && + left.backendType === right.backendType && + left.pid === right.pid && + left.runtimeModel === right.runtimeModel && + left.rssBytes === right.rssBytes && + left.livenessKind === right.livenessKind && + left.pidSource === right.pidSource && + left.paneCurrentCommand === right.paneCurrentCommand && + left.runtimeDiagnostic === right.runtimeDiagnostic && + left.runtimeDiagnosticSeverity === right.runtimeDiagnosticSeverity && + left.runtimeLastSeenAt === right.runtimeLastSeenAt + ); +} +``` + +Если не сделать это, backend может правильно вычислять `shell_only`, а UI продолжит показывать старую карточку из-за suppressed store update. 
+ +Нужно обновить и spawn equality: + +```ts +function areMemberSpawnStatusEntriesEqual( + left: MemberSpawnStatusEntry | undefined, + right: MemberSpawnStatusEntry | undefined +): boolean { + if (left === right) return true; + if (!left || !right) return left === right; + return ( + // existing visible fields + left.status === right.status && + left.launchState === right.launchState && + left.error === right.error && + left.hardFailureReason === right.hardFailureReason && + left.livenessSource === right.livenessSource && + left.runtimeAlive === right.runtimeAlive && + left.runtimeModel === right.runtimeModel && + left.bootstrapConfirmed === right.bootstrapConfirmed && + left.hardFailure === right.hardFailure && + // new visible diagnostic fields + left.livenessKind === right.livenessKind && + left.runtimeDiagnostic === right.runtimeDiagnostic && + left.runtimeDiagnosticSeverity === right.runtimeDiagnosticSeverity + ); +} +``` + +Summary equality: + +```ts +function areLaunchSummaryCountsEqual( + left: PersistedTeamLaunchSummary | undefined, + right: PersistedTeamLaunchSummary | undefined +): boolean { + if (left === right) return true; + if (!left || !right) return left === right; + return ( + left.confirmedCount === right.confirmedCount && + left.pendingCount === right.pendingCount && + left.failedCount === right.failedCount && + left.runtimeAlivePendingCount === right.runtimeAlivePendingCount && + left.shellOnlyPendingCount === right.shellOnlyPendingCount && + left.runtimeProcessPendingCount === right.runtimeProcessPendingCount && + left.runtimeCandidatePendingCount === right.runtimeCandidatePendingCount && + left.noRuntimePendingCount === right.noRuntimePendingCount && + left.permissionPendingCount === right.permissionPendingCount + ); +} +``` + +Event handling: + +```ts +if (event.type === 'member-spawn') { + if (isStaleRuntimeEvent) return; + seedCurrentRunIdIfMissing(); + scheduleMemberSpawnStatusesRefresh(event.teamName); + 
scheduleTeamAgentRuntimeRefresh(event.teamName); + return; +} +``` + +If `scheduleTeamAgentRuntimeRefresh()` does not exist, add a small debounced variant mirroring `scheduleMemberSpawnStatusesRefresh()`. + +Polling: + +- `TeamSpawnStatusWatcher` - 2.5 sec. +- `TeamAgentRuntimeWatcher` - 5 sec. +- Backend runtime metadata cache TTL is 2 sec. + +Для launch UI лучше продублировать короткий `livenessKind/runtimeDiagnostic` в `MemberSpawnStatusEntry`, а подробные PID/command детали оставить в runtime snapshot. Тогда badge меняется быстро, tooltip догоняет через runtime snapshot. + +Cache invalidation checklist: + +- invalidate `agentRuntimeSnapshotCache` and `liveTeamAgentRuntimeMetadataCache` on runtime check-in; +- invalidate on heartbeat; +- invalidate on member restart/stop/remove; +- invalidate when tmux pane kill succeeds; +- invalidate when launch state store writes a new liveness diagnostic. + +Without this, a member can remain visually `shell only` for up to the polling interval after a valid check-in, which is acceptable, but not after an explicit check-in event. + +## API/preload propagation + +No new IPC channel is needed, but the type propagation still has sharp edges. + +Files to verify: + +- `src/shared/types/team.ts` +- `src/shared/types/api.ts` +- `src/preload/index.ts` +- `src/renderer/api/httpClient.ts` +- `src/renderer/store/slices/teamSlice.ts` + +Rules: + +- New fields on `TeamAgentRuntimeEntry`, `MemberSpawnStatusEntry` and `PersistedTeamLaunchSummary` must be optional at first. +- `src/preload/index.ts` can keep the same `invokeIpcWithResult()` calls. +- `src/shared/types/api.ts` should not need method signature changes, but typecheck must prove it. +- `src/renderer/api/httpClient.ts` browser fallback must still return valid snapshots when new fields are absent. +- Renderer helpers must tolerate `undefined` `livenessKind` and map it to current behavior. 
+ +Recommended type compatibility test: + +```ts +const snapshot: TeamAgentRuntimeSnapshot = { + teamName: 'demo', + updatedAt: new Date().toISOString(), + runId: null, + members: { + bob: { + memberName: 'bob', + alive: false, + restartable: true, + livenessKind: 'shell_only', + pidSource: 'tmux_pane', + paneCurrentCommand: 'zsh', + updatedAt: new Date().toISOString(), + }, + }, +}; +``` + +This catches accidental required fields before runtime. + +## Progress diagnostics update path + +`updateProgress()` currently accepts only: + +```ts +Pick< + TeamProvisioningProgress, + 'pid' | 'error' | 'warnings' | 'cliLogsTail' | 'configReady' | 'messageSeverity' +>; +``` + +If `launchDiagnostics` is added to `TeamProvisioningProgress`, `updateProgress()` must accept it explicitly: + +```ts +extras?: Pick< + TeamProvisioningProgress, + | 'pid' + | 'error' + | 'warnings' + | 'cliLogsTail' + | 'configReady' + | 'messageSeverity' + | 'launchDiagnostics' +> +``` + +And keep it bounded: + +```ts +launchDiagnostics: boundLaunchDiagnostics( + extras?.launchDiagnostics ?? run.progress.launchDiagnostics +), +``` + +Do not store this as `assistantOutput`. `assistantOutput` is rendered as markdown and is the wrong surface for machine-produced liveness facts. 
+ +## Renderer UX + +### Member card labels + +Файлы: + +- `src/renderer/utils/memberHelpers.ts` +- `src/renderer/components/team/members/MemberCard.tsx` +- `src/renderer/utils/memberRuntimeSummary.ts` + +Новые visual states: + +```ts +export type MemberLaunchVisualState = + | 'waiting' + | 'spawning' + | 'permission_pending' + | 'waiting_bootstrap' + | 'shell_only' + | 'runtime_candidate' + | 'registered_only' + | 'stale_runtime' + | 'error' + | null; +``` + +Mapping: + +```ts +function resolveLaunchVisualState(params: { + spawnStatus?: MemberSpawnStatusEntry; + runtimeEntry?: TeamAgentRuntimeEntry; +}): MemberLaunchVisualState { + const { spawnStatus, runtimeEntry } = params; + + if (spawnStatus?.launchState === 'failed_to_start') return 'error'; + if (spawnStatus?.launchState === 'runtime_pending_permission') return 'permission_pending'; + + if (runtimeEntry?.livenessKind === 'shell_only') return 'shell_only'; + if (runtimeEntry?.livenessKind === 'runtime_process_candidate') return 'runtime_candidate'; + if (runtimeEntry?.livenessKind === 'registered_only') return 'registered_only'; + if (runtimeEntry?.livenessKind === 'stale_metadata') return 'stale_runtime'; + + if ( + spawnStatus?.launchState === 'runtime_pending_bootstrap' && + runtimeEntry?.livenessKind === 'runtime_process' + ) { + return 'waiting_bootstrap'; + } + + return spawnStatus?.status === 'spawning' ? 
'spawning' : 'waiting'; +} +``` + +Labels: + +```ts +const MEMBER_LAUNCH_LABELS: Record<NonNullable<MemberLaunchVisualState>, string> = { + waiting: 'starting', + spawning: 'starting', + permission_pending: 'permission', + waiting_bootstrap: 'waiting for bootstrap', + shell_only: 'shell only', + runtime_candidate: 'process candidate', + registered_only: 'registered', + stale_runtime: 'stale runtime', + error: 'spawn failed', +}; +``` + +Текущий `MemberCard` не принимает `runtimeEntry`, поэтому надо изменить props: + +```ts +interface MemberCardProps { + // existing + runtimeEntry?: TeamAgentRuntimeEntry; + spawnEntry?: MemberSpawnStatusEntry; +} +``` + +И передавать из `MemberList`: + +```tsx +<MemberCard + member={member} + runtimeEntry={runtimeEntries?.[member.name]} + spawnEntry={memberSpawnStatuses?.[member.name]} +/> +``` + +Затем `buildMemberLaunchPresentation()` должен принимать `runtimeEntry` или хотя бы `livenessKind`: + +```ts +const launchPresentation = buildMemberLaunchPresentation({ + member, + spawnStatus, + spawnLaunchState, + spawnLivenessSource, + spawnRuntimeAlive, + runtimeEntry, + runtimeAdvisory: member.runtimeAdvisory, + isLaunchSettling, + isTeamAlive, + isTeamProvisioning, + leadActivity, +}); +``` + +То же нужно для `MemberDetailHeader` и `MemberHoverCard`, иначе список и detail view будут расходиться по labels. + +### Tooltip + +Tooltip examples: + +```text +bob +Spawn accepted: yes +Registered in config: yes +Runtime: tmux pane alive, foreground command is zsh +Runtime process: not found +PID source: tmux pane +Bootstrap: no member_briefing/check-in yet +``` + +```text +alice +Spawn accepted: yes +Runtime: verified process detected +PID source: tmux child +Bootstrap: waiting for member_briefing/check-in +``` + +```text +tom +Spawn accepted: yes +Runtime: not found after 90s +Bootstrap: no check-in +Last error: Teammate did not join within the launch grace window. 
+``` + +### Launch banner + +Файл: `src/renderer/utils/teamProvisioningPresentation.ts` + +Generic: + +```text +4 teammates still joining +``` + +Заменить на aggregate detail: + +```text +4 teammates still joining - 3 shell-only, 1 waiting for bootstrap +``` + +Helper: + +```ts +function summarizePendingLaunchDiagnostics(params: { + statuses: Record<string, MemberSpawnStatusEntry>; + runtimeEntries: Record<string, TeamAgentRuntimeEntry> | undefined; +}): string | null { + const counts = { + shellOnly: 0, + waitingBootstrap: 0, + candidate: 0, + permission: 0, + noRuntime: 0, + }; + + for (const [memberName, status] of Object.entries(params.statuses)) { + if (status.launchState === 'confirmed_alive' || status.launchState === 'failed_to_start') { + continue; + } + + const runtimeEntry = params.runtimeEntries?.[memberName]; + if (status.launchState === 'runtime_pending_permission') counts.permission += 1; + else if (runtimeEntry?.livenessKind === 'shell_only') counts.shellOnly += 1; + else if (runtimeEntry?.livenessKind === 'runtime_process') counts.waitingBootstrap += 1; + else if (runtimeEntry?.livenessKind === 'runtime_process_candidate') counts.candidate += 1; + else counts.noRuntime += 1; + } + + const parts = [ + counts.shellOnly ? `${counts.shellOnly} shell-only` : '', + counts.waitingBootstrap ? `${counts.waitingBootstrap} waiting for bootstrap` : '', + counts.candidate ? `${counts.candidate} process candidates` : '', + counts.permission ? `${counts.permission} awaiting permission` : '', + counts.noRuntime ? `${counts.noRuntime} no runtime found` : '', + ].filter(Boolean); + + return parts.length > 0 ? parts.join(', ') : null; +} +``` + +Сейчас `buildTeamProvisioningPresentation()` принимает только spawn statuses/snapshot, не runtime entries. Есть три варианта: + +1. Добавить `runtimeSnapshot?: TeamAgentRuntimeSnapshot` в `buildTeamProvisioningPresentation()`. + 🎯 8 🛡️ 8 🧠 5 Примерно 80-130 строк. + +2. 
Дублировать aggregate diagnostic counts в `MemberSpawnStatusesSnapshot.summary`. + 🎯 9 🛡️ 9 🧠 6 Примерно 120-190 строк. 
+ +3. Использовать только `progress.message`. + 🎯 6 🛡️ 5 🧠 3 Примерно 30-60 строк. + +Рекомендую 2: backend уже лучше знает truth model и может атомарно отдать `shellOnlyPendingCount`, `runtimeProcessPendingCount`, `runtimeCandidatePendingCount`, `noRuntimePendingCount`. UI тогда не зависит от race между 2.5 sec spawn polling и 5 sec runtime polling. + +Расширить summary: + +```ts +export interface PersistedTeamLaunchSummary { + confirmedCount: number; + pendingCount: number; + failedCount: number; + runtimeAlivePendingCount: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; +} +``` + +### Stepper semantics + +Файл: `src/renderer/components/team/provisioningSteps.ts` + +The current stepper uses: + +- `heartbeatConfirmedCount` +- `processOnlyAliveCount` +- `pendingSpawnCount` +- `failedSpawnCount` + +After strict liveness, `processOnlyAliveCount` must mean **strong runtime process only**. It must not include: + +- `shell_only` +- `runtime_process_candidate` +- `registered_only` +- `stale_metadata` +- `permission_blocked` + +Mapping: + +```ts +if (entry.launchState === 'runtime_pending_bootstrap') { + if (entry.runtimeAlive === true && entry.livenessKind === 'runtime_process') { + processOnlyAliveCount += 1; + } else { + pendingSpawnCount += 1; + } +} +``` + +Why this matters: the screenshot problem is exactly the UI being stuck on "Members joining". Shell-only should remain in joining until it fails, while verified process can move toward finalizing but still show `waiting for bootstrap`. + +### Copy diagnostics + +Добавить в launch details или member tooltip маленькое действие `Copy diagnostics`. 
+ +Payload: + +```ts +interface MemberLaunchDiagnosticsPayload { + teamName: string; + memberName: string; + launchState?: MemberLaunchState; + spawnStatus?: MemberSpawnStatus; + livenessKind?: TeamAgentRuntimeLivenessKind; + livenessSource?: MemberSpawnLivenessSource; + pid?: number; + pidSource?: TeamAgentRuntimePidSource; + paneId?: string; + panePid?: number; + paneCurrentCommand?: string; + processCommand?: string; + runtimeDiagnostic?: string; + diagnostics?: string[]; + updatedAt?: string; +} +``` + +Это поможет быстро понять проблему на скрине друга без доступа к его машине. + +## Файлы для изменения + +Backend/shared: + +- `src/shared/types/team.ts` + - добавить liveness/pid source типы; + - расширить `TeamAgentRuntimeEntry`; + - добавить компактные diagnostic fields в `MemberSpawnStatusEntry`. + - добавить bounded `TeamLaunchDiagnosticItem` и `TeamProvisioningProgress.launchDiagnostics`. + +- `src/main/services/team/TeamMemberLivenessMode.ts` + - добавить `CLAUDE_TEAM_MEMBER_LIVENESS_MODE`; + - добавить resolver `diagnostics`/`strict`; + - использовать как dogfood/rollback lever. + +- `src/main/services/team/TeamRuntimeLivenessResolver.ts` + - вынести pure liveness classification; + - принимать tmux/process/OpenCode/persisted facts; + - возвращать strong/weak classification и sanitized diagnostics. + +- `src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts` + - добавить `listPaneRuntimeInfo()`; + - добавить `listRuntimeProcesses()` или equivalent; + - оставить `listPanePids()` совместимым wrapper. + +- `src/features/tmux-installer/main/composition/runtimeSupport.ts` + - экспортировать `listTmuxPaneRuntimeInfoForCurrentPlatform()`; + - экспортировать process table helper, если он живет в tmux runtime executor. 
+ +- `src/main/services/team/TeamProvisioningService.ts` + - расширить `LiveTeamAgentRuntimeMetadata`; + - parse sanitized runtime tool `metadata`; + - добавить strict evidence helpers; + - подключить `TeamMemberLivenessMode`; + - использовать `TeamRuntimeLivenessResolver`; + - обновить `updateProgress()` extras для `launchDiagnostics`; + - переписать tmux/process resolution; + - убрать strong `online/process` shortcut из `already_running`; + - исправить `attachLiveRuntimeMetadataToStatuses()`; + - исправить `reevaluateMemberLaunchStatus()`; + - invalidate runtime caches на check-in/heartbeat/restart/stop; + - прокинуть diagnostics в `getTeamAgentRuntimeSnapshot()`. + +- `src/main/services/team/TeamLaunchStateEvaluator.ts` + - нормализовать persisted liveness diagnostic fields; + - считать optional diagnostic counts в summary; + - не превращать stale persisted `runtimeAlive` в live proof. + +- `src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts` + - не считать `created` bridge member strong alive без `runtimePid`; + - сохранить `runtimePid` и `sessionId` в persisted launch state. + +- `src/main/services/team/runtime/TeamRuntimeAdapter.ts` + - расширить `TeamRuntimeMemberLaunchEvidence` полями `livenessKind`, `pidSource`, `runtimeDiagnostic`; + - сохранить backward compatibility для существующих adapters. + +- `src/main/services/team/progressPayload.ts` + - добавить `boundLaunchDiagnostics()` и не расширять raw log tails. + +- `src/shared/types/api.ts` + - проверить, что existing `getMemberSpawnStatuses()` и `getTeamAgentRuntime()` contracts не требуют нового channel. + +- `src/preload/index.ts` + - оставить существующие IPC methods, убедиться typecheck проходит с optional fields. + +- `src/renderer/api/httpClient.ts` + - browser fallback должен оставаться valid при отсутствующих diagnostic fields. 
+ +- `src/renderer/store/slices/teamSlice.ts` + - обновить `areTeamAgentRuntimeEntriesEqual()`; + - обновить `areMemberSpawnStatusEntriesEqual()`; + - обновить `areLaunchSummaryCountsEqual()`; + - убедиться, что runtime diagnostic changes не suppress-ятся. + +- `src/renderer/store/index.ts` + - на `member-spawn` event обновлять и spawn statuses, и runtime snapshot. + +Renderer: + +- `src/renderer/utils/memberHelpers.ts` + - добавить visual states и labels. + +- `src/renderer/utils/memberRuntimeSummary.ts` + - memory summary должен учитывать `pidSource`. + +- `src/renderer/components/team/members/MemberList.tsx` + - передать `runtimeEntry` и `spawnEntry` в presentation/member card layer. + +- `src/renderer/components/team/members/MemberCard.tsx` + - badge + tooltip + copy diagnostics. + +- `src/renderer/components/team/members/MemberDetailHeader.tsx` + - использовать тот же launch presentation contract, что и card. + +- `src/renderer/components/team/members/MemberHoverCard.tsx` + - не отставать от list/card labels. + +- `src/renderer/utils/teamProvisioningPresentation.ts` + - aggregate launch diagnostics. + +- `src/renderer/components/team/provisioningSteps.ts` + - `processOnlyAliveCount` считать только для strong runtime process. + +- `src/renderer/components/team/ProvisioningProgressBlock.tsx` + - добавить компактный Diagnostics disclosure для `launchDiagnostics`. + +## Tests + +Backend: + +- `TeamMemberLivenessMode.test.ts` + - default mode is `diagnostics`; + - `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=strict` enables strict; + - unknown values fall back to `diagnostics`. 
+ +- `TeamRuntimeLivenessResolver.test.ts` + - tmux foreground shell + no child -> `shell_only`; + - verified process row by `--team-name` + `--agent-id` -> `runtime_process`; + - non-shell descendant without identity -> `runtime_process_candidate`; + - persisted PID without current process identity -> `stale_metadata`; + - process command secrets are redacted in diagnostics; + - provider failure diagnostic does not produce strong alive. + +- `TeamProvisioningService.test.ts` + - tmux shell-only pane не ставит `runtimeAlive`; + - shell-only после 90 секунд становится `failed_to_start`; + - stale persisted `tmuxPaneId` не self-clear-ит failure; + - verified process by `--team-name` + `--agent-id` ставит `runtimeAlive`; + - runtime process candidate не считается strong alive; + - OpenCode `created` без `runtimePid` не ставит `runtimeAlive`; + - OpenCode `created` с `runtimePid` ставит `runtimeAlive`; + - OpenCode `sessionId` без `runtimePid` становится `runtime_process_candidate`, а не strong alive; + - `runtime_bootstrap_checkin` сохраняет `runtimeSessionId`, `livenessKind: "confirmed_bootstrap"`; + - stale runtime heartbeat от old `runId` rejected и не меняет launch state; + - runtime metadata PID без process identity не становится strong alive; + - `already_running` + shell-only не ставит `runtimeAlive`; + - permission blocked остается pending permission, не hard fail. + +- `TmuxPlatformCommandExecutor.test.ts` + - `listPaneRuntimeInfo()` парсит `pane_current_command`; + - `listPanePids()` остается совместимым pane-existence helper; + - process table parser поддерживает `pid`, `ppid`, `command`; + - WSL branch не использует host process table. + +Renderer: + +- `memberHelpers.test.ts` + - `shell_only` -> `shell only`; + - `runtime_process` + pending bootstrap -> `waiting for bootstrap`; + - `runtime_process_candidate` -> `process candidate`; + - permission state не затирается runtime diagnostics. 
+ +- `memberRuntimeSummary.test.ts` + - `2 MB` с `pidSource=tmux_pane` получает tooltip/source `tmux pane shell`; + - runtime child показывает обычный runtime memory. + +- `teamSlice.test.ts` + - изменение `livenessKind` обновляет `teamAgentRuntimeByTeam`; + - изменение `runtimeDiagnostic` обновляет `teamAgentRuntimeByTeam`. + - изменение spawn `livenessKind/runtimeDiagnostic` обновляет `memberSpawnStatusesByTeam`; + - изменение optional summary diagnostic counts обновляет presentation. + - `member-spawn` event schedules both spawn status refresh and runtime snapshot refresh. + +- `httpClient.test.ts` + - browser fallback `getTeamAgentRuntime()` remains valid without diagnostic fields; + - browser fallback `getMemberSpawnStatuses()` remains valid without summary diagnostic counts. + +- `teamProvisioningPresentation.test.ts` + - banner показывает `3 shell-only, 1 waiting for bootstrap`; + - pending permission получает отдельный count. + +- `provisioningSteps.test.ts` + - `shell_only` не увеличивает `processOnlyAliveCount`; + - `runtime_process_candidate` не увеличивает `processOnlyAliveCount`; + - `runtime_process` увеличивает `processOnlyAliveCount`. + +- `ProvisioningProgressBlock.test.tsx` + - renders bounded `launchDiagnostics`; + - does not require opening CLI logs to see `shell only`; + - long process command is truncated/sanitized. + +## Phases + +### Phase 0 - Diagnostics without behavior change + +🎯 10 🛡️ 10 🧠 4 Примерно 180-260 строк. + +Добавить новые optional fields и заполнить `livenessKind`, `pidSource`, `paneCurrentCommand`, `diagnostics`, но пока не менять timeout behavior. + +Цель: увидеть на реальном launch, что именно определяется у друга: shell-only, process candidate, stale metadata или OpenCode bridge claim. + +Add: + +- `TeamMemberLivenessMode` with default `diagnostics`; +- `TeamRuntimeLivenessResolver` pure tests; +- process table/tmux providers, but strict behavior disabled by default. 
+ +Verification: + +```bash +pnpm typecheck +pnpm exec vitest run test/main/features/tmux-installer test/main/services/team/TeamProvisioningService.test.ts +``` + +### Phase 1 - Strict strong evidence + +🎯 9 🛡️ 9 🧠 7 Примерно 220-320 строк. + +Переключить `attachLiveRuntimeMetadataToStatuses()` на strong evidence only when `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=strict`. Shell/pane/candidate больше не выставляют `runtimeAlive` в strict mode. + +Keep diagnostics mode as rollback until manual launch scenarios pass. + +Verification: + +```bash +CLAUDE_TEAM_MEMBER_LIVENESS_MODE=strict pnpm exec vitest run test/main/services/team/TeamProvisioningService.test.ts +pnpm exec vitest run test/main/services/team/TeamProvisioningService.test.ts +``` + +### Phase 2 - Timeout and self-heal hardening + +🎯 9 🛡️ 9 🧠 6 Примерно 120-180 строк. + +Исправить `reevaluateMemberLaunchStatus()`: + +- shell-only/no-runtime/stale -> fail after 90s; +- permission -> stay pending permission; +- candidate -> warning, fail after 5 min; +- verified runtime -> warning, no false hard fail at 90s; +- auto-clear failure только по strong evidence. + +### Phase 3 - UI visibility + +🎯 9 🛡️ 8 🧠 6 Примерно 220-320 строк. + +Добавить: + +- labels `shell only`, `waiting for bootstrap`, `process candidate`; +- tooltip; +- aggregate banner detail; +- copy diagnostics. + +### Phase 4 - Real launch validation + +🎯 8 🛡️ 9 🧠 6 Примерно 100-180 строк тестовых fixtures/scripts. + +Manual checks: + +```bash +tmux list-panes -a -F '#{pane_id} #{pane_pid} #{pane_current_command}' +ps -ax -o pid=,ppid=,command= | rg '<team-name>|<agent-id>|claude|codex|opencode' +``` + +Scenarios: + +1. Успешный Anthropic tmux launch. +2. Shell-only pane. +3. Missing MCP/member_briefing. +4. Permission pending. +5. OpenCode bridge member without `runtimePid`. +6. OpenCode bridge member with `runtimePid`. +7. Restart member while old pane exists. + +## Acceptance criteria + +1. 
Tmux pane жив, foreground command `zsh/bash/sh`, runtime child не найден: + - `TeamAgentRuntimeEntry.alive === false` + - `livenessKind === "shell_only"` + - `pidSource === "tmux_pane"` + - UI показывает `shell only` + - после 90 секунд member становится `failed_to_start` + +2. Найден process с `--team-name <team-name> --agent-id <agent-id>`: + - `TeamAgentRuntimeEntry.alive === true` + - `livenessKind === "runtime_process"` + - `MemberSpawnStatusEntry.runtimeAlive === true` + - UI показывает `waiting for bootstrap`, если bootstrap еще не пришел + +3. Member сделал check-in: + - `bootstrapConfirmed === true` + - `livenessKind === "confirmed_bootstrap"` + - `launchState === "confirmed_alive"` + - UI показывает `ready` + +4. Persisted metadata есть, process не найден: + - не self-clear failure; + - не `runtimeAlive`; + - UI показывает `stale runtime` или `registered`. + +5. OpenCode bridge вернул member без `runtimePid`: + - `agentToolAccepted === true`; + - `runtimeAlive === false`; + - UI показывает pending/bridge diagnostics, не `online`. + +6. `2.0 MB` больше не выглядит как полноценный runtime: + - tooltip объясняет `RSS source: tmux pane shell`; + - launch badge показывает `shell only`. + +7. Launch details объясняет stuck state без открытия logs: + - `launchDiagnostics` содержит bounded rows; + - UI показывает хотя бы `shell only`, `waiting for bootstrap`, `no runtime found`; + - `cliLogsTail` и `assistantOutput` остаются bounded. + +8. Store suppression не скрывает диагностику: + - изменение `livenessKind` меняет renderer state; + - изменение summary diagnostic counts меняет presentation; + - `member-spawn` event refreshes runtime snapshot. + +9. Rollout безопасен: + - default `diagnostics` mode не меняет hard timeout behavior до включения strict; + - `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=strict` включает strong-only behavior; + - `CLAUDE_TEAM_MEMBER_LIVENESS_MODE=diagnostics` работает как rollback без удаления UI diagnostics. + +10. 
Provider failures не создают ложный ready: + +- process table failure дает `process_table_unavailable`; +- tmux/process provider failure не self-clear-ит failure; +- command diagnostics sanitized and truncated. + +## Main risks + +### False negative для реального runtime + +Если реальный teammate не содержит `--team-name`/`--agent-id` в command, strict model может понизить его до candidate. + +Mitigation: + +- Phase 0 сначала собирает diagnostics без behavior change. +- Candidate не fail-ится за 90 секунд. +- Allowlist runtime command markers добавлять только после реальных данных. + +### Windows/WSL process tree + +Host-side process table не увидит Linux tmux descendants. + +Mitigation: + +- process table должен жить рядом с tmux executor; +- Windows branch должен запускать `ps` внутри WSL distro. + +### OpenCode shared host + +Один OpenCode host PID может обслуживать несколько members. + +Mitigation: + +- `runtimePid` хранить как `metricsPid`, если это shared host; +- `restartable=false`, если PID не member-owned; +- UI label `shared OpenCode host`, не "member runtime". + +### UI overload + +Слишком много деталей в карточке сделают интерфейс шумным. + +Mitigation: + +- короткий badge в карточке; +- детали в tooltip; +- aggregate counts в banner; +- полный JSON только через copy diagnostics. + +### Process command privacy + +`ps` command can include cwd, file paths, API keys or tokens. + +Mitigation: + +- identity matching uses raw command only inside main process memory; +- UI/logs/copy diagnostics receive sanitized command only; +- redact common secret flags; +- truncate command strings to 500 chars; +- do not include raw runtime tool metadata. + +### Process table overhead + +Reading `ps` per member would be wasteful and flaky on large systems. 
+ +Mitigation: + +- read process table once per runtime snapshot; +- keep existing 2 sec backend cache TTL; +- do not call `pidusage` for weak shell-only rows unless UI needs memory display; +- cap diagnostics to 20 progress rows. + +## Minimal safe patch order + +1. Добавить типы и optional fields. +2. Добавить `TeamMemberLivenessMode` default `diagnostics`. +3. Добавить sanitized runtime tool metadata parser. +4. Добавить tmux `listPaneRuntimeInfo()` и сохранить wrapper `listPanePids()`. +5. Добавить process table provider/parser с `ppid`. +6. Вынести `TeamRuntimeLivenessResolver`. +7. Заполнить `livenessKind` без behavior change. +8. Написать backend tests на shell-only, verified runtime, stale event, metadata PID. +9. Переключить `attachLiveRuntimeMetadataToStatuses()` на strong evidence behind strict mode. +10. Исправить `already_running` shortcut behind strict mode. +11. Переключить timeout/self-heal logic behind strict mode. +12. Исправить OpenCode bridge mapping. +13. Обновить persisted summary diagnostics и store equality. +14. Добавить `launchDiagnostics` в progress payload и UI disclosure. +15. Добавить renderer labels/tooltips/banner. +16. Добавить copy diagnostics. +17. После manual validation включить strict default или оставить env rollback на один release. 
+ +## Expected UX + +Before: + +```text +bob starting 2.0 MB +jack starting 2.0 MB +tom starting 2.0 MB +``` + +After: + +```text +bob shell only Anthropic · Opus 4.7 · 2.0 MB +jack waiting for bootstrap Anthropic · Opus 4.7 · 418 MB +tom spawn failed no runtime process after 90s +``` + +Launch banner: + +```text +4 teammates still joining - 3 shell-only, 1 waiting for bootstrap +``` + +Tooltip for shell-only: + +```text +Spawn accepted: yes +Registered in config: yes +Runtime process: not found +Tmux pane: alive +Foreground command: zsh +PID source: tmux pane +Bootstrap: no member_briefing/check-in yet +``` diff --git a/packages/agent-graph/src/canvas/draw-agents.ts b/packages/agent-graph/src/canvas/draw-agents.ts index 0e844e9d..e530b971 100644 --- a/packages/agent-graph/src/canvas/draw-agents.ts +++ b/packages/agent-graph/src/canvas/draw-agents.ts @@ -279,7 +279,13 @@ function drawLaunchStage( for (let index = 0; index < 3; index += 1) { const angle = time * 1.2 + (Math.PI * 2 * index) / 3; ctx.beginPath(); - ctx.arc(x + Math.cos(angle) * dotOrbit, y + Math.sin(angle) * dotOrbit, 1.7, 0, Math.PI * 2); + ctx.arc( + x + Math.cos(angle) * dotOrbit, + y + Math.sin(angle) * dotOrbit, + 1.7, + 0, + Math.PI * 2 + ); ctx.fillStyle = hexWithAlpha('#e4e4e7', 0.72); ctx.fill(); } @@ -736,6 +742,13 @@ function getLaunchStatusColor(visualState: GraphNode['launchVisualState']): stri return hexWithAlpha('#f59e0b', 0.92); case 'runtime_pending': return hexWithAlpha('#67e8f9', 0.9); + case 'shell_only': + case 'runtime_candidate': + return hexWithAlpha('#f97316', 0.9); + case 'registered_only': + return hexWithAlpha('#a1a1aa', 0.82); + case 'stale_runtime': + return hexWithAlpha('#ef4444', 0.82); case 'settling': return hexWithAlpha('#22c55e', 0.9); case 'error': diff --git a/packages/agent-graph/src/ports/types.ts b/packages/agent-graph/src/ports/types.ts index fa7461bc..9b1a254c 100644 --- a/packages/agent-graph/src/ports/types.ts +++ b/packages/agent-graph/src/ports/types.ts 
@@ -22,6 +22,10 @@ export type GraphLaunchVisualState = | 'spawning' | 'permission_pending' | 'runtime_pending' + | 'shell_only' + | 'runtime_candidate' + | 'registered_only' + | 'stale_runtime' | 'settling' | 'error'; diff --git a/scripts/dev-with-runtime.mjs b/scripts/dev-with-runtime.mjs index b0baa0d1..ebd5db07 100644 --- a/scripts/dev-with-runtime.mjs +++ b/scripts/dev-with-runtime.mjs @@ -19,6 +19,7 @@ const runtimeCacheRoot = process.env.CLAUDE_DEV_RUNTIME_CACHE_ROOT?.trim() ? path.resolve(process.env.CLAUDE_DEV_RUNTIME_CACHE_ROOT.trim()) : defaultRuntimeCacheRoot; const shouldPrintRuntimePath = process.argv.includes('--print-runtime-path'); +const runtimeDisplayName = 'teams orchestrator'; const WINDOWS_SHELL_COMMANDS = new Set(['pnpm', 'npm', 'npx', 'yarn', 'yarnpkg', 'corepack']); function shouldUseWindowsShell(cmd) { @@ -108,9 +109,10 @@ function getPlatformAssetKey() { } function getReleaseAssetUrl(runtimeLock, asset) { - const releaseTag = typeof runtimeLock.releaseTag === 'string' && runtimeLock.releaseTag.trim().length > 0 - ? runtimeLock.releaseTag.trim() - : runtimeLock.sourceRef; + const releaseTag = + typeof runtimeLock.releaseTag === 'string' && runtimeLock.releaseTag.trim().length > 0 + ? runtimeLock.releaseTag.trim() + : runtimeLock.sourceRef; return `https://github.com/${runtimeLock.releaseRepository}/releases/download/${releaseTag}/${encodeURIComponent(asset.file)}`; } @@ -152,9 +154,7 @@ function truncateMiddle(value, maxLength) { function buildProgressBar(progressRatio, width) { const safeWidth = Math.max(10, width); - const clampedRatio = Number.isFinite(progressRatio) - ? Math.min(1, Math.max(0, progressRatio)) - : 0; + const clampedRatio = Number.isFinite(progressRatio) ? 
Math.min(1, Math.max(0, progressRatio)) : 0; const filledWidth = Math.round(safeWidth * clampedRatio); return `${'='.repeat(filledWidth)}${'-'.repeat(safeWidth - filledWidth)}`; } @@ -164,7 +164,8 @@ function supportsProgressRedraw() { } function formatProgressLine(label, writtenBytes, totalBytes, hasTotal) { - const columns = process.stdout.columns && process.stdout.columns > 0 ? process.stdout.columns : 100; + const columns = + process.stdout.columns && process.stdout.columns > 0 ? process.stdout.columns : 100; const ratio = hasTotal ? writtenBytes / totalBytes : 0; const percentText = hasTotal ? ` ${Math.floor(ratio * 100)}%` : ''; const bytesText = hasTotal @@ -196,6 +197,16 @@ function readBinaryVersion(binaryPath) { return runAndCapture(binaryPath, ['--version']); } +function formatRuntimeVersionForDisplay(versionText) { + const trimmed = versionText.trim(); + if (!trimmed) { + return runtimeDisplayName; + } + + const versionOnly = trimmed.replace(/\s*\([^)]*\)\s*$/, ''); + return `${versionOnly} (${runtimeDisplayName})`; +} + function isExecutable(filePath) { if (!fs.existsSync(filePath)) { return false; @@ -305,7 +316,10 @@ async function downloadWithProgress(url, destinationPath) { readline.clearLine(process.stdout, 0); readline.cursorTo(process.stdout, 0); process.stdout.write(`${formatProgressLine(label, writtenBytes, totalBytes, hasTotal)}\n`); - } else if ((hasTotal && lastLoggedPercent < 100) || (!hasTotal && writtenBytes !== lastLoggedBytes)) { + } else if ( + (hasTotal && lastLoggedPercent < 100) || + (!hasTotal && writtenBytes !== lastLoggedBytes) + ) { process.stdout.write(`${formatProgressSummary(writtenBytes, totalBytes, hasTotal)}\n`); } } @@ -511,7 +525,9 @@ async function main() { if ('cacheDir' in resolvedRuntime && resolvedRuntime.cacheDir) { process.stdout.write(`Runtime cache: ${resolvedRuntime.cacheDir}\n`); } - process.stdout.write(`Runtime version: ${resolvedRuntime.versionText}\n`); + process.stdout.write( + `Runtime version: 
${formatRuntimeVersionForDisplay(resolvedRuntime.versionText)}\n` + ); const uiEnv = { ...process.env, diff --git a/src/features/tmux-installer/main/composition/runtimeSupport.ts b/src/features/tmux-installer/main/composition/runtimeSupport.ts index 183b918b..7300608b 100644 --- a/src/features/tmux-installer/main/composition/runtimeSupport.ts +++ b/src/features/tmux-installer/main/composition/runtimeSupport.ts @@ -1,5 +1,9 @@ import { TmuxStatusSourceAdapter } from '../adapters/output/sources/TmuxStatusSourceAdapter'; -import { TmuxPlatformCommandExecutor } from '../infrastructure/runtime/TmuxPlatformCommandExecutor'; +import { + TmuxPlatformCommandExecutor, + type RuntimeProcessTableRow, + type TmuxPaneRuntimeInfo, +} from '../infrastructure/runtime/TmuxPlatformCommandExecutor'; const runtimeStatusSource = new TmuxStatusSourceAdapter(); const runtimeCommandExecutor = new TmuxPlatformCommandExecutor(); @@ -24,6 +28,18 @@ export async function listTmuxPanePidsForCurrentPlatform( return runtimeCommandExecutor.listPanePids(paneIds); } +export async function listTmuxPaneRuntimeInfoForCurrentPlatform( + paneIds: readonly string[] +): Promise> { + return runtimeCommandExecutor.listPaneRuntimeInfo(paneIds); +} + +export async function listRuntimeProcessesForCurrentTmuxPlatform(): Promise< + RuntimeProcessTableRow[] +> { + return runtimeCommandExecutor.listRuntimeProcesses(); +} + export function killTmuxPaneForCurrentPlatformSync(paneId: string): void { runtimeCommandExecutor.killPaneSync(paneId); invalidateTmuxRuntimeStatusCache(); diff --git a/src/features/tmux-installer/main/index.ts b/src/features/tmux-installer/main/index.ts index 21c41e9f..3d133b99 100644 --- a/src/features/tmux-installer/main/index.ts +++ b/src/features/tmux-installer/main/index.ts @@ -9,5 +9,11 @@ export { isTmuxRuntimeReadyForCurrentPlatform, killTmuxPaneForCurrentPlatform, killTmuxPaneForCurrentPlatformSync, + listRuntimeProcessesForCurrentTmuxPlatform, listTmuxPanePidsForCurrentPlatform, + 
listTmuxPaneRuntimeInfoForCurrentPlatform, } from './composition/runtimeSupport'; +export type { + RuntimeProcessTableRow, + TmuxPaneRuntimeInfo, +} from './infrastructure/runtime/TmuxPlatformCommandExecutor'; diff --git a/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts b/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts index 4b062134..0500d252 100644 --- a/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts +++ b/src/features/tmux-installer/main/infrastructure/runtime/TmuxPlatformCommandExecutor.ts @@ -12,6 +12,43 @@ interface ExecResult { stderr: string; } +export interface TmuxPaneRuntimeInfo { + paneId: string; + panePid: number; + currentCommand?: string; + currentPath?: string; + sessionName?: string; + windowName?: string; +} + +export interface RuntimeProcessTableRow { + pid: number; + ppid: number; + command: string; +} + +export function parseRuntimeProcessTable(output: string): RuntimeProcessTableRow[] { + const rows: RuntimeProcessTableRow[] = []; + for (const line of output.split('\n')) { + const match = /^\s*(\d+)\s+(\d+)\s+(.*)$/.exec(line); + if (!match) continue; + + const pid = Number.parseInt(match[1], 10); + const ppid = Number.parseInt(match[2], 10); + const command = match[3]?.trim() ?? 
''; + if ( + Number.isFinite(pid) && + pid > 0 && + Number.isFinite(ppid) && + ppid >= 0 && + command.length > 0 + ) { + rows.push({ pid, ppid, command }); + } + } + return rows; +} + export class TmuxPlatformCommandExecutor { readonly #wslService: TmuxWslService; readonly #packageManagerResolver: TmuxPackageManagerResolver; @@ -54,34 +91,70 @@ export class TmuxPlatformCommandExecutor { } } - async listPanePids(paneIds: readonly string[]): Promise> { + async listPaneRuntimeInfo(paneIds: readonly string[]): Promise> { const normalizedPaneIds = [...new Set(paneIds.map((paneId) => paneId.trim()).filter(Boolean))]; if (normalizedPaneIds.length === 0) { return new Map(); } - const result = await this.execTmux( - ['list-panes', '-a', '-F', '#{pane_id}\t#{pane_pid}'], - 3_000 - ); + const format = [ + '#{pane_id}', + '#{pane_pid}', + '#{pane_current_command}', + '#{pane_current_path}', + '#{session_name}', + '#{window_name}', + ].join('\t'); + + const result = await this.execTmux(['list-panes', '-a', '-F', format], 3_000); if (result.exitCode !== 0) { throw new Error(result.stderr || 'Failed to list tmux panes'); } const wanted = new Set(normalizedPaneIds); - const panePidById = new Map(); + const paneInfoById = new Map(); for (const line of result.stdout.split('\n')) { const trimmed = line.trim(); if (!trimmed) continue; - const [paneId = '', rawPid = ''] = trimmed.split('\t'); + const [ + paneId = '', + rawPid = '', + currentCommand = '', + currentPath = '', + sessionName = '', + windowName = '', + ] = trimmed.split('\t'); const normalizedPaneId = paneId.trim(); if (!wanted.has(normalizedPaneId)) continue; const pid = Number.parseInt(rawPid.trim(), 10); if (Number.isFinite(pid) && pid > 0) { - panePidById.set(normalizedPaneId, pid); + paneInfoById.set(normalizedPaneId, { + paneId: normalizedPaneId, + panePid: pid, + currentCommand: currentCommand.trim() || undefined, + currentPath: currentPath.trim() || undefined, + sessionName: sessionName.trim() || undefined, + 
windowName: windowName.trim() || undefined, + }); } } - return panePidById; + return paneInfoById; + } + + async listPanePids(paneIds: readonly string[]): Promise> { + const info = await this.listPaneRuntimeInfo(paneIds); + return new Map([...info.entries()].map(([paneId, pane]) => [paneId, pane.panePid])); + } + + async listRuntimeProcesses(): Promise { + const result = + process.platform === 'win32' + ? await this.#wslService.execInPreferredDistro(['ps', '-ax', '-o', 'pid=,ppid=,command=']) + : await this.#execNativePs(); + if (result.exitCode !== 0) { + throw new Error(result.stderr || 'Failed to list runtime processes'); + } + return parseRuntimeProcessTable(result.stdout); } killPaneSync(paneId: string): void { @@ -125,6 +198,29 @@ export class TmuxPlatformCommandExecutor { return [...candidates]; } + async #execNativePs(): Promise { + await resolveInteractiveShellEnv(); + const env = buildEnrichedEnv(); + return new Promise((resolve) => { + execFile( + 'ps', + ['-ax', '-o', 'pid=,ppid=,command='], + { env, timeout: 3_000, maxBuffer: 2 * 1024 * 1024 }, + (error, stdout, stderr) => { + const errorCode = + typeof error === 'object' && error !== null && 'code' in error + ? (error as NodeJS.ErrnoException).code + : undefined; + resolve({ + exitCode: typeof errorCode === 'number' ? errorCode : error ? 1 : 0, + stdout: String(stdout), + stderr: String(stderr) || (error instanceof Error ? 
error.message : ''), + }); + } + ); + }); + } + async #resolveNativeTmuxExecutable(env: NodeJS.ProcessEnv): Promise { const platform = process.platform === 'darwin' || process.platform === 'linux' || process.platform === 'win32' diff --git a/src/features/tmux-installer/main/infrastructure/runtime/__tests__/TmuxPlatformCommandExecutor.test.ts b/src/features/tmux-installer/main/infrastructure/runtime/__tests__/TmuxPlatformCommandExecutor.test.ts index 376afc79..9514d8ba 100644 --- a/src/features/tmux-installer/main/infrastructure/runtime/__tests__/TmuxPlatformCommandExecutor.test.ts +++ b/src/features/tmux-installer/main/infrastructure/runtime/__tests__/TmuxPlatformCommandExecutor.test.ts @@ -78,7 +78,8 @@ describe('TmuxPlatformCommandExecutor', () => { ); vi.spyOn(executor, 'execTmux').mockResolvedValue({ exitCode: 0, - stdout: '%1\t111\n%2\t222\n%3\tnot-a-pid\n', + stdout: + '%1\t111\tzsh\t/tmp\tteam\tmain\n%2\t222\tnode\t/project\tteam\tworker\n%3\tnot-a-pid\tzsh\t/tmp\tteam\tmain\n', stderr: '', }); @@ -86,7 +87,12 @@ describe('TmuxPlatformCommandExecutor', () => { new Map([['%2', 222]]) ); expect(executor.execTmux).toHaveBeenCalledWith( - ['list-panes', '-a', '-F', '#{pane_id}\t#{pane_pid}'], + [ + 'list-panes', + '-a', + '-F', + '#{pane_id}\t#{pane_pid}\t#{pane_current_command}\t#{pane_current_path}\t#{session_name}\t#{window_name}', + ], 3_000 ); }); diff --git a/src/features/tmux-installer/main/infrastructure/wsl/TmuxWslService.ts b/src/features/tmux-installer/main/infrastructure/wsl/TmuxWslService.ts index 335a5e69..84cc73a1 100644 --- a/src/features/tmux-installer/main/infrastructure/wsl/TmuxWslService.ts +++ b/src/features/tmux-installer/main/infrastructure/wsl/TmuxWslService.ts @@ -268,6 +268,23 @@ export class TmuxWslService { return this.#run(['-d', distroName, '-e', 'tmux', ...args], timeout); } + async execInPreferredDistro( + args: string[], + preferredDistroName?: string | null, + timeout = 5_000 + ): Promise { + const distroName = 
preferredDistroName ?? (await this.probe()).preference?.preferredDistroName; + if (!distroName) { + return { + exitCode: 1, + stdout: '', + stderr: 'No WSL distribution is available.', + }; + } + + return this.#run(['-d', distroName, '-e', ...args], timeout); + } + getPersistedPreferredDistroSync(): string | null { return this.#preferenceStore.getPreferredDistroSync(); } diff --git a/src/main/services/team/TeamLaunchStateEvaluator.ts b/src/main/services/team/TeamLaunchStateEvaluator.ts index 6171c037..9a74353c 100644 --- a/src/main/services/team/TeamLaunchStateEvaluator.ts +++ b/src/main/services/team/TeamLaunchStateEvaluator.ts @@ -12,6 +12,9 @@ import type { PersistedTeamLaunchSnapshot, PersistedTeamLaunchSummary, ProviderModelLaunchIdentity, + TeamAgentRuntimeDiagnosticSeverity, + TeamAgentRuntimeLivenessKind, + TeamAgentRuntimePidSource, TeamLaunchAggregateState, } from '@shared/types'; @@ -37,8 +40,13 @@ type RuntimeMemberSpawnState = Pick< | 'bootstrapConfirmed' | 'hardFailure' | 'pendingPermissionRequestIds' + | 'livenessKind' + | 'runtimeDiagnostic' + | 'runtimeDiagnosticSeverity' + | 'livenessLastCheckedAt' | 'firstSpawnAcceptedAt' | 'lastHeartbeatAt' + | 'runtimeModel' | 'updatedAt' >; @@ -59,6 +67,41 @@ function normalizeRuntimePid(value: unknown): number | undefined { : undefined; } +function normalizeLivenessKind(value: unknown): TeamAgentRuntimeLivenessKind | undefined { + return value === 'confirmed_bootstrap' || + value === 'runtime_process' || + value === 'runtime_process_candidate' || + value === 'permission_blocked' || + value === 'shell_only' || + value === 'registered_only' || + value === 'stale_metadata' || + value === 'not_found' + ? 
value + : undefined; +} + +function normalizePidSource(value: unknown): TeamAgentRuntimePidSource | undefined { + return value === 'lead_process' || + value === 'tmux_pane' || + value === 'tmux_child' || + value === 'agent_process_table' || + value === 'opencode_bridge' || + value === 'runtime_bootstrap' || + value === 'persisted_metadata' + ? value + : undefined; +} + +function normalizeDiagnosticSeverity( + value: unknown +): TeamAgentRuntimeDiagnosticSeverity | undefined { + return value === 'info' || value === 'warning' || value === 'error' ? value : undefined; +} + +function normalizeOptionalString(value: unknown): string | undefined { + return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined; +} + function normalizeMemberName(name: string): string { return name.trim(); } @@ -110,6 +153,11 @@ export function summarizePersistedLaunchMembers( let pendingCount = 0; let failedCount = 0; let runtimeAlivePendingCount = 0; + let shellOnlyPendingCount = 0; + let runtimeProcessPendingCount = 0; + let runtimeCandidatePendingCount = 0; + let noRuntimePendingCount = 0; + let permissionPendingCount = 0; const normalizedExpected = expectedMembers.map(normalizeMemberName).filter(Boolean); const memberNames = Array.from( new Set([ @@ -136,9 +184,31 @@ export function summarizePersistedLaunchMembers( if (entry.runtimeAlive) { runtimeAlivePendingCount += 1; } + if (entry.launchState === 'runtime_pending_permission') { + permissionPendingCount += 1; + } + if (entry.livenessKind === 'shell_only') { + shellOnlyPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process') { + runtimeProcessPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process_candidate') { + runtimeCandidatePendingCount += 1; + } else if (entry.livenessKind === 'not_found' || entry.livenessKind === 'stale_metadata') { + noRuntimePendingCount += 1; + } } - return { confirmedCount, pendingCount, failedCount, runtimeAlivePendingCount }; + return { + 
confirmedCount, + pendingCount, + failedCount, + runtimeAlivePendingCount, + shellOnlyPendingCount, + runtimeProcessPendingCount, + runtimeCandidatePendingCount, + noRuntimePendingCount, + permissionPendingCount, + }; } export function hasMixedPersistedLaunchMetadata( @@ -340,6 +410,12 @@ function normalizePersistedMemberState( parsed.pendingPermissionRequestIds ), runtimePid: normalizeRuntimePid(parsed.runtimePid), + runtimeSessionId: normalizeOptionalString(parsed.runtimeSessionId), + livenessKind: normalizeLivenessKind(parsed.livenessKind), + pidSource: normalizePidSource(parsed.pidSource), + runtimeDiagnostic: normalizeOptionalString(parsed.runtimeDiagnostic), + runtimeDiagnosticSeverity: normalizeDiagnosticSeverity(parsed.runtimeDiagnosticSeverity), + runtimeLastSeenAt: normalizeOptionalString(parsed.runtimeLastSeenAt), firstSpawnAcceptedAt: typeof parsed.firstSpawnAcceptedAt === 'string' ? parsed.firstSpawnAcceptedAt : undefined, lastHeartbeatAt: @@ -492,6 +568,10 @@ export function snapshotFromRuntimeMemberStatuses(params: { pendingPermissionRequestIds: runtime?.pendingPermissionRequestIds?.length ? [...new Set(runtime.pendingPermissionRequestIds)] : undefined, + livenessKind: runtime?.livenessKind, + runtimeDiagnostic: runtime?.runtimeDiagnostic, + runtimeDiagnosticSeverity: runtime?.runtimeDiagnosticSeverity, + runtimeLastSeenAt: runtime?.livenessLastCheckedAt, firstSpawnAcceptedAt: runtime?.firstSpawnAcceptedAt, lastHeartbeatAt: runtime?.lastHeartbeatAt, lastRuntimeAliveAt: runtime?.runtimeAlive ? updatedAt : undefined, @@ -555,6 +635,10 @@ export function snapshotToMemberSpawnStatuses( bootstrapConfirmed: entry.bootstrapConfirmed, hardFailure: entry.hardFailure, pendingPermissionRequestIds: entry.pendingPermissionRequestIds, + livenessKind: entry.livenessKind, + runtimeDiagnostic: entry.runtimeDiagnostic, + runtimeDiagnosticSeverity: entry.runtimeDiagnosticSeverity, + livenessLastCheckedAt: entry.runtimeLastSeenAt ?? 
entry.lastEvaluatedAt, firstSpawnAcceptedAt: entry.firstSpawnAcceptedAt, lastHeartbeatAt: entry.lastHeartbeatAt, updatedAt: entry.lastEvaluatedAt, diff --git a/src/main/services/team/TeamLaunchSummaryProjection.ts b/src/main/services/team/TeamLaunchSummaryProjection.ts index da1fe435..1a69ffe9 100644 --- a/src/main/services/team/TeamLaunchSummaryProjection.ts +++ b/src/main/services/team/TeamLaunchSummaryProjection.ts @@ -19,6 +19,11 @@ export interface LaunchStateSummary { pendingCount?: number; failedCount?: number; runtimeAlivePendingCount?: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; } export interface PersistedTeamLaunchSummaryProjection extends LaunchStateSummary { @@ -73,6 +78,11 @@ export function createLaunchStateSummary( pendingCount: snapshot.summary.pendingCount, failedCount: snapshot.summary.failedCount, runtimeAlivePendingCount: snapshot.summary.runtimeAlivePendingCount, + shellOnlyPendingCount: snapshot.summary.shellOnlyPendingCount, + runtimeProcessPendingCount: snapshot.summary.runtimeProcessPendingCount, + runtimeCandidatePendingCount: snapshot.summary.runtimeCandidatePendingCount, + noRuntimePendingCount: snapshot.summary.noRuntimePendingCount, + permissionPendingCount: snapshot.summary.permissionPendingCount, }; } @@ -147,6 +157,27 @@ export function normalizePersistedLaunchSummaryProjection( if (typeof record.runtimeAlivePendingCount === 'number' && record.runtimeAlivePendingCount >= 0) { normalized.runtimeAlivePendingCount = record.runtimeAlivePendingCount; } + if (typeof record.shellOnlyPendingCount === 'number' && record.shellOnlyPendingCount >= 0) { + normalized.shellOnlyPendingCount = record.shellOnlyPendingCount; + } + if ( + typeof record.runtimeProcessPendingCount === 'number' && + record.runtimeProcessPendingCount >= 0 + ) { + normalized.runtimeProcessPendingCount = 
record.runtimeProcessPendingCount; + } + if ( + typeof record.runtimeCandidatePendingCount === 'number' && + record.runtimeCandidatePendingCount >= 0 + ) { + normalized.runtimeCandidatePendingCount = record.runtimeCandidatePendingCount; + } + if (typeof record.noRuntimePendingCount === 'number' && record.noRuntimePendingCount >= 0) { + normalized.noRuntimePendingCount = record.noRuntimePendingCount; + } + if (typeof record.permissionPendingCount === 'number' && record.permissionPendingCount >= 0) { + normalized.permissionPendingCount = record.permissionPendingCount; + } normalized.launchUpdatedAt = updatedAt; return normalized; } diff --git a/src/main/services/team/TeamMcpConfigBuilder.ts b/src/main/services/team/TeamMcpConfigBuilder.ts index 5251334c..6f91ea84 100644 --- a/src/main/services/team/TeamMcpConfigBuilder.ts +++ b/src/main/services/team/TeamMcpConfigBuilder.ts @@ -227,17 +227,7 @@ export async function resolveAgentTeamsMcpLaunchSpec(): Promise { logger.warn(`Packaged MCP entry not found at ${packagedEntry}, falling back to workspace`); } - // 2. Dev mode — prefer built dist for reliable direct execution - const builtEntry = getBuiltServerEntry(); - checked.push(builtEntry); - if (await pathExists(builtEntry)) { - return { - command: await resolveNodePath(), - args: [builtEntry], - }; - } - - // 3. Dev mode fallback — run source directly through a local tsx binary + // 2. Dev mode — prefer source so pnpm dev always sees current MCP tools const sourceEntry = getSourceServerEntry(); checked.push(sourceEntry); if (await pathExists(sourceEntry)) { @@ -252,6 +242,16 @@ export async function resolveAgentTeamsMcpLaunchSpec(): Promise { } } + // 3. Dev mode fallback — use built dist when source execution is unavailable + const builtEntry = getBuiltServerEntry(); + checked.push(builtEntry); + if (await pathExists(builtEntry)) { + return { + command: await resolveNodePath(), + args: [builtEntry], + }; + } + throw new Error( `agent-teams-mcp entrypoint not found. 
Checked paths:\n${checked.map((p) => ` - ${p}`).join('\n')}` ); diff --git a/src/main/services/team/TeamMemberLivenessMode.ts b/src/main/services/team/TeamMemberLivenessMode.ts new file mode 100644 index 00000000..6817d7f7 --- /dev/null +++ b/src/main/services/team/TeamMemberLivenessMode.ts @@ -0,0 +1,14 @@ +export type TeamMemberLivenessMode = 'diagnostics' | 'strict'; + +export const CLAUDE_TEAM_MEMBER_LIVENESS_MODE_ENV = 'CLAUDE_TEAM_MEMBER_LIVENESS_MODE'; + +export function resolveTeamMemberLivenessModeFromEnv( + env: NodeJS.ProcessEnv = process.env +): TeamMemberLivenessMode { + const raw = env[CLAUDE_TEAM_MEMBER_LIVENESS_MODE_ENV]?.trim().toLowerCase(); + return raw === 'strict' ? 'strict' : 'diagnostics'; +} + +export function isStrictTeamMemberLivenessMode(env: NodeJS.ProcessEnv = process.env): boolean { + return resolveTeamMemberLivenessModeFromEnv(env) === 'strict'; +} diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index b09f1ed5..a285af1c 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -16,7 +16,10 @@ import { import { createTeamRuntimeLaneCoordinator } from '@features/team-runtime-lanes/main'; import { killTmuxPaneForCurrentPlatformSync, + listRuntimeProcessesForCurrentTmuxPlatform, listTmuxPanePidsForCurrentPlatform, + listTmuxPaneRuntimeInfoForCurrentPlatform, + type TmuxPaneRuntimeInfo, } from '@features/tmux-installer/main'; import { ConfigManager } from '@main/services/infrastructure/ConfigManager'; import { NotificationManager } from '@main/services/infrastructure/NotificationManager'; @@ -149,7 +152,11 @@ import { } from './idleNotificationMainProcessSemantics'; import { withInboxLock } from './inboxLock'; import { getEffectiveInboxMessageId } from './inboxMessageIdentity'; -import { buildProgressAssistantOutput, buildProgressLogsTail } from './progressPayload'; +import { + boundLaunchDiagnostics, + 
buildProgressAssistantOutput, + buildProgressLogsTail, +} from './progressPayload'; import { resolveDesktopTeammateModeDecision } from './runtimeTeammateMode'; import { choosePreferredLaunchSnapshot, @@ -170,6 +177,11 @@ import { } from './TeamLaunchStateEvaluator'; import { TeamLaunchStateStore } from './TeamLaunchStateStore'; import { TeamMcpConfigBuilder } from './TeamMcpConfigBuilder'; +import { + isStrongRuntimeEvidence, + resolveTeamMemberRuntimeLiveness, +} from './TeamRuntimeLivenessResolver'; +import { isStrictTeamMemberLivenessMode } from './TeamMemberLivenessMode'; import { TeamMemberLogsFinder } from './TeamMemberLogsFinder'; import { TeamMembersMetaStore } from './TeamMembersMetaStore'; import { TeamMetaStore } from './TeamMetaStore'; @@ -219,6 +231,9 @@ interface PersistedRuntimeMemberLike { agentId?: string; tmuxPaneId?: string; backendType?: string; + providerId?: string; + runtimePid?: number; + runtimeSessionId?: string; } type RelayInboxMessage = InboxMessage & { messageId: string }; @@ -273,7 +288,10 @@ import type { PersistedTeamLaunchSummary, ProviderModelLaunchIdentity, TeamAgentRuntimeBackendType, + TeamAgentRuntimeDiagnosticSeverity, TeamAgentRuntimeEntry, + TeamAgentRuntimeLivenessKind, + TeamAgentRuntimePidSource, TeamAgentRuntimeSnapshot, TeamChangeEvent, TeamConfig, @@ -281,6 +299,7 @@ import type { TeamCreateResponse, TeamFastMode, TeamLaunchAggregateState, + TeamLaunchDiagnosticItem, TeamLaunchRequest, TeamLaunchResponse, TeamMember, @@ -362,6 +381,50 @@ function normalizeRuntimeStringArray(value: unknown): string[] { : []; } +interface RuntimeToolMetadata { + runtimePid?: number; + processCommand?: string; + runtimeVersion?: string; + hostPid?: number; + cwd?: string; +} + +function normalizeRuntimePositiveInteger(value: unknown): number | undefined { + return typeof value === 'number' && Number.isFinite(value) && value > 0 + ? 
Math.trunc(value) + : undefined; +} + +function normalizeRuntimeMetadataString(value: unknown, maxLength: number): string | undefined { + return typeof value === 'string' && value.trim().length > 0 + ? value.trim().slice(0, maxLength) + : undefined; +} + +function parseRuntimeToolMetadata(value: unknown): RuntimeToolMetadata { + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return {}; + } + const raw = value as Record; + return { + ...(normalizeRuntimePositiveInteger(raw.runtimePid) + ? { runtimePid: normalizeRuntimePositiveInteger(raw.runtimePid) } + : {}), + ...(normalizeRuntimeMetadataString(raw.processCommand, 500) + ? { processCommand: normalizeRuntimeMetadataString(raw.processCommand, 500) } + : {}), + ...(normalizeRuntimeMetadataString(raw.runtimeVersion, 80) + ? { runtimeVersion: normalizeRuntimeMetadataString(raw.runtimeVersion, 80) } + : {}), + ...(normalizeRuntimePositiveInteger(raw.hostPid) + ? { hostPid: normalizeRuntimePositiveInteger(raw.hostPid) } + : {}), + ...(normalizeRuntimeMetadataString(raw.cwd, 500) + ? { cwd: normalizeRuntimeMetadataString(raw.cwd, 500) } + : {}), + }; +} + function runtimeTaskRefs(teamName: string, value: unknown): InboxMessage['taskRefs'] | undefined { const refs = normalizeRuntimeStringArray(value); return refs.length > 0 @@ -1210,23 +1273,7 @@ interface ProvisioningRun { geminiPostLaunchHydrationSent: boolean; suppressGeminiPostLaunchHydrationOutput: boolean; /** Per-member spawn lifecycle statuses tracked from stream-json output. 
*/ - memberSpawnStatuses: Map< - string, - { - status: MemberSpawnStatus; - launchState: MemberLaunchState; - error?: string; - hardFailureReason?: string; - livenessSource?: MemberSpawnLivenessSource; - agentToolAccepted?: boolean; - runtimeAlive?: boolean; - bootstrapConfirmed?: boolean; - hardFailure?: boolean; - firstSpawnAcceptedAt?: string; - lastHeartbeatAt?: string; - updatedAt: string; - } - >; + memberSpawnStatuses: Map; /** Agent tool_use_id -> teammate name for persistent teammate spawns. */ memberSpawnToolUseIds: Map; /** Explicit restart requests awaiting teammate rejoin or failure. */ @@ -1307,6 +1354,7 @@ interface PromptSizeSummary { } const MEMBER_LAUNCH_GRACE_MS = 90_000; +const MEMBER_BOOTSTRAP_STALL_MS = 5 * 60_000; export function shouldWarnOnUnreadableMemberAuditConfig(params: { nowMs: number; @@ -1360,27 +1408,22 @@ function createInitialMemberSpawnStatusEntry(): MemberSpawnStatusEntry { interface LiveTeamAgentRuntimeMetadata { alive: boolean; backendType?: TeamAgentRuntimeBackendType; + providerId?: TeamProviderId; agentId?: string; pid?: number; metricsPid?: number; model?: string; tmuxPaneId?: string; -} - -function escapeRegexLiteral(value: string): string { - return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); -} - -function commandContainsCliArgValue(command: string, argName: string, value: string): boolean { - const normalizedCommand = command.trim(); - const normalizedValue = value.trim(); - if (!normalizedCommand || !normalizedValue) { - return false; - } - const pattern = new RegExp( - `(?:^|\\s)${escapeRegexLiteral(argName)}(?:=|\\s+)${escapeRegexLiteral(normalizedValue)}(?:\\s|$)` - ); - return pattern.test(normalizedCommand); + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + processCommand?: string; + panePid?: number; + paneCurrentCommand?: string; + runtimeSessionId?: string; + runtimeLastSeenAt?: string; + runtimeDiagnostic?: string; + runtimeDiagnosticSeverity?: 
TeamAgentRuntimeDiagnosticSeverity; + diagnostics?: string[]; } function isNeverSpawnedDuringLaunchReason(reason?: string): boolean { @@ -1460,6 +1503,11 @@ function summarizeMemberSpawnStatusRecord( let pendingCount = 0; let failedCount = 0; let runtimeAlivePendingCount = 0; + let shellOnlyPendingCount = 0; + let runtimeProcessPendingCount = 0; + let runtimeCandidatePendingCount = 0; + let noRuntimePendingCount = 0; + let permissionPendingCount = 0; const memberNames = Array.from(new Set([...expectedMembers, ...Object.keys(statuses)])); for (const memberName of memberNames) { @@ -1480,9 +1528,31 @@ function summarizeMemberSpawnStatusRecord( if (entry.runtimeAlive) { runtimeAlivePendingCount += 1; } + if (entry.launchState === 'runtime_pending_permission') { + permissionPendingCount += 1; + } + if (entry.livenessKind === 'shell_only') { + shellOnlyPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process') { + runtimeProcessPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process_candidate') { + runtimeCandidatePendingCount += 1; + } else if (entry.livenessKind === 'not_found' || entry.livenessKind === 'stale_metadata') { + noRuntimePendingCount += 1; + } } - return { confirmedCount, pendingCount, failedCount, runtimeAlivePendingCount }; + return { + confirmedCount, + pendingCount, + failedCount, + runtimeAlivePendingCount, + shellOnlyPendingCount, + runtimeProcessPendingCount, + runtimeCandidatePendingCount, + noRuntimePendingCount, + permissionPendingCount, + }; } function buildRestartStillRunningReason(memberName: string): string { @@ -3136,7 +3206,13 @@ function updateProgress( message: string, extras?: Pick< TeamProvisioningProgress, - 'pid' | 'error' | 'warnings' | 'cliLogsTail' | 'configReady' | 'messageSeverity' + | 'pid' + | 'error' + | 'warnings' + | 'cliLogsTail' + | 'configReady' + | 'messageSeverity' + | 'launchDiagnostics' > ): TeamProvisioningProgress { // Cap assistant output on every progress tick. 
`updateProgress` is invoked @@ -3157,10 +3233,110 @@ function updateProgress( assistantOutput, configReady: extras?.configReady ?? run.progress.configReady, messageSeverity: extras?.messageSeverity, + launchDiagnostics: boundLaunchDiagnostics( + extras?.launchDiagnostics ?? + buildLaunchDiagnosticsFromRun(run) ?? + run.progress.launchDiagnostics + ), }; return run.progress; } +function buildLaunchDiagnosticsFromRun( + run: ProvisioningRun +): TeamLaunchDiagnosticItem[] | undefined { + if (!run.isLaunch || run.memberSpawnStatuses.size === 0) { + return undefined; + } + const observedAt = nowIso(); + const items: TeamLaunchDiagnosticItem[] = []; + for (const [memberName, entry] of run.memberSpawnStatuses.entries()) { + if (entry.launchState === 'confirmed_alive') { + items.push({ + id: `${memberName}:bootstrap_confirmed`, + memberName, + severity: 'info', + code: 'bootstrap_confirmed', + label: `${memberName} - bootstrap confirmed`, + observedAt, + }); + continue; + } + if (entry.launchState === 'failed_to_start') { + items.push({ + id: `${memberName}:bootstrap_stalled`, + memberName, + severity: 'error', + code: 'bootstrap_stalled', + label: `${memberName} - failed to start`, + detail: entry.hardFailureReason ?? 
entry.error, + observedAt, + }); + continue; + } + if (entry.launchState === 'runtime_pending_permission') { + items.push({ + id: `${memberName}:permission_pending`, + memberName, + severity: 'warning', + code: 'permission_pending', + label: `${memberName} - awaiting permission`, + detail: entry.runtimeDiagnostic, + observedAt, + }); + continue; + } + if (entry.livenessKind === 'shell_only') { + items.push({ + id: `${memberName}:tmux_shell_only`, + memberName, + severity: 'warning', + code: 'tmux_shell_only', + label: `${memberName} - shell only`, + detail: entry.runtimeDiagnostic, + observedAt, + }); + continue; + } + if (entry.livenessKind === 'runtime_process_candidate') { + items.push({ + id: `${memberName}:runtime_process_candidate`, + memberName, + severity: 'warning', + code: 'runtime_process_candidate', + label: `${memberName} - process candidate`, + detail: entry.runtimeDiagnostic, + observedAt, + }); + continue; + } + if (entry.livenessKind === 'runtime_process') { + items.push({ + id: `${memberName}:runtime_process_detected`, + memberName, + severity: 'info', + code: 'runtime_process_detected', + label: `${memberName} - waiting for bootstrap`, + detail: entry.runtimeDiagnostic, + observedAt, + }); + continue; + } + if (entry.agentToolAccepted) { + items.push({ + id: `${memberName}:spawn_accepted`, + memberName, + severity: 'info', + code: 'spawn_accepted', + label: `${memberName} - spawn accepted`, + detail: entry.runtimeDiagnostic, + observedAt, + }); + } + } + return items.length > 0 ? 
items : undefined; +} + function buildCombinedLogs( stdoutBuffer: string | undefined, stderrBuffer: string | undefined @@ -5488,6 +5664,7 @@ export class TeamProvisioningService { runtimeSessionId, observedAt, diagnostics: payload.diagnostics, + metadata: parseRuntimeToolMetadata(payload.metadata), reason: 'OpenCode runtime bootstrap check-in accepted', }); @@ -5618,6 +5795,7 @@ export class TeamProvisioningService { runtimeSessionId, observedAt, diagnostics: undefined, + metadata: parseRuntimeToolMetadata(payload.metadata), reason: `OpenCode runtime heartbeat accepted${optionalRuntimeString(payload.status) ? ` (${optionalRuntimeString(payload.status)})` : ''}`, }); @@ -5662,6 +5840,7 @@ export class TeamProvisioningService { runtimeSessionId: string; observedAt: string; diagnostics: unknown; + metadata?: RuntimeToolMetadata; reason: string; }): Promise { const previous = await this.launchStateStore.read(input.teamName); @@ -5685,6 +5864,13 @@ export class TeamProvisioningService { runtimeAlive: true, bootstrapConfirmed: true, hardFailure: false, + ...(input.metadata?.runtimePid ? { runtimePid: input.metadata.runtimePid } : {}), + runtimeSessionId: input.runtimeSessionId, + livenessKind: 'confirmed_bootstrap', + ...(input.metadata?.runtimePid ? { pidSource: 'runtime_bootstrap' as const } : {}), + runtimeDiagnostic: input.reason, + runtimeDiagnosticSeverity: 'info', + runtimeLastSeenAt: input.observedAt, firstSpawnAcceptedAt: previousMember?.firstSpawnAcceptedAt ?? 
input.observedAt, lastHeartbeatAt: input.observedAt, lastRuntimeAliveAt: input.observedAt, @@ -5712,6 +5898,8 @@ export class TeamProvisioningService { updatedAt: input.observedAt, }); await this.launchStateStore.write(input.teamName, snapshot); + this.agentRuntimeSnapshotCache.delete(input.teamName); + this.liveTeamAgentRuntimeMetadataCache.delete(input.teamName); this.teamChangeEmitter?.({ type: 'member-spawn', teamName: input.teamName, @@ -6310,6 +6498,16 @@ export class TeamProvisioningService { } this.agentRuntimeSnapshotCache.delete(run.teamName); this.liveTeamAgentRuntimeMetadataCache.delete(run.teamName); + if (isStrictTeamMemberLivenessMode()) { + this.setMemberSpawnStatus(run, spawnedMemberName, 'waiting'); + this.appendMemberBootstrapDiagnostic( + run, + spawnedMemberName, + 'already_running requires strong runtime verification' + ); + void this.reevaluateMemberLaunchStatus(run, spawnedMemberName); + return; + } this.setMemberSpawnStatus(run, spawnedMemberName, 'online', undefined, 'process'); } else { this.setMemberSpawnStatus(run, spawnedMemberName, 'waiting'); @@ -6452,6 +6650,10 @@ export class TeamProvisioningService { next.error = undefined; next.hardFailureReason = undefined; next.livenessSource = undefined; + next.livenessKind = undefined; + next.runtimeDiagnostic = undefined; + next.runtimeDiagnosticSeverity = undefined; + next.livenessLastCheckedAt = undefined; next.firstSpawnAcceptedAt = undefined; next.lastHeartbeatAt = undefined; next.launchState = 'starting'; @@ -6463,6 +6665,10 @@ export class TeamProvisioningService { next.error = undefined; next.hardFailureReason = undefined; next.livenessSource = undefined; + next.livenessKind = undefined; + next.runtimeDiagnostic = undefined; + next.runtimeDiagnosticSeverity = undefined; + next.livenessLastCheckedAt = undefined; next.firstSpawnAcceptedAt = prev.firstSpawnAcceptedAt ?? 
updatedAt; next.lastHeartbeatAt = undefined; next.launchState = 'runtime_pending_bootstrap'; @@ -6495,6 +6701,10 @@ export class TeamProvisioningService { next.error = undefined; next.hardFailureReason = undefined; next.livenessSource = undefined; + next.livenessKind = undefined; + next.runtimeDiagnostic = undefined; + next.runtimeDiagnosticSeverity = undefined; + next.livenessLastCheckedAt = undefined; next.firstSpawnAcceptedAt = undefined; next.lastHeartbeatAt = undefined; } @@ -6510,6 +6720,9 @@ export class TeamProvisioningService { prev.runtimeAlive === next.runtimeAlive && prev.bootstrapConfirmed === next.bootstrapConfirmed && prev.hardFailure === next.hardFailure && + prev.livenessKind === next.livenessKind && + prev.runtimeDiagnostic === next.runtimeDiagnostic && + prev.runtimeDiagnosticSeverity === next.runtimeDiagnosticSeverity && prev.firstSpawnAcceptedAt === next.firstSpawnAcceptedAt && prev.lastHeartbeatAt === next.lastHeartbeatAt ) { @@ -6525,6 +6738,15 @@ export class TeamProvisioningService { run.pendingMemberRestarts?.delete(memberName); } this.syncMemberLaunchGraceCheck(run, memberName, next); + const launchDiagnostics = boundLaunchDiagnostics(buildLaunchDiagnosticsFromRun(run)); + if (launchDiagnostics) { + run.progress = { + ...run.progress, + updatedAt: nowIso(), + launchDiagnostics, + }; + run.onProgress(run.progress); + } if (status === 'spawning') { this.appendMemberBootstrapDiagnostic(run, memberName, 'Agent tool invoked'); @@ -6702,6 +6924,7 @@ export class TeamProvisioningService { } const updatedAt = nowIso(); + const strictLiveness = isStrictTeamMemberLivenessMode(); const run = runId ? (this.runs.get(runId) ?? 
null) : null; const persistedTeamMeta = await this.teamMetaStore.getMeta(teamName).catch(() => null); @@ -6815,11 +7038,9 @@ export class TeamProvisioningService { const persistedRuntimeMember = getPersistedRuntimeMember(memberName); const liveRuntimeMember = getLiveRuntimeMember(memberName); const launchMember = launchSnapshot?.members[memberName]; - const backendType = normalizeTeamAgentRuntimeBackendType( - persistedRuntimeMember?.backendType, - false - ); - const rssPid = liveRuntimeMember?.pid ?? liveRuntimeMember?.metricsPid; + const backendType = + liveRuntimeMember?.backendType ?? + normalizeTeamAgentRuntimeBackendType(persistedRuntimeMember?.backendType, false); const runtimeModel = liveRuntimeMember?.model ?? launchMember?.model?.trim() ?? @@ -6832,22 +7053,27 @@ export class TeamProvisioningService { inferTeamProviderIdFromModel(launchMember?.model) ?? inferTeamProviderIdFromModel(member.model); const isOpenCodeMember = memberProviderId === 'opencode'; + const metricsPid = liveRuntimeMember?.metricsPid; const isSharedOpenCodeHost = isOpenCodeMember && - !liveRuntimeMember?.pid && - typeof liveRuntimeMember?.metricsPid === 'number' && - liveRuntimeMember.metricsPid > 0; - const displayPid = liveRuntimeMember?.pid ?? (isSharedOpenCodeHost ? rssPid : undefined); + typeof metricsPid === 'number' && + metricsPid > 0 && + liveRuntimeMember?.pidSource !== 'agent_process_table'; + const rssPid = isSharedOpenCodeHost ? metricsPid : (liveRuntimeMember?.pid ?? metricsPid); + const displayPid = isSharedOpenCodeHost ? rssPid : liveRuntimeMember?.pid; const restartable = isOpenCodeMember - ? Boolean(liveRuntimeMember?.pid) + ? !isSharedOpenCodeHost && Boolean(liveRuntimeMember?.pid) : isSharedOpenCodeHost ? false : backendType !== 'in-process'; const launchSnapshotAlive = this.isTeamAlive(teamName) && - (launchMember?.runtimeAlive === true || - launchMember?.bootstrapConfirmed === true || - launchMember?.launchState === 'confirmed_alive'); + (strictLiveness + ? 
launchMember?.bootstrapConfirmed === true || + launchMember?.launchState === 'confirmed_alive' + : launchMember?.runtimeAlive === true || + launchMember?.bootstrapConfirmed === true || + launchMember?.launchState === 'confirmed_alive'); let rssBytes = rssPid ? rssBytesByPid.get(rssPid) : undefined; if (rssBytes == null && isSharedOpenCodeHost && typeof rssPid === 'number' && rssPid > 0) { try { @@ -6875,6 +7101,32 @@ export class TeamProvisioningService { ...(displayPid ? { pid: displayPid } : {}), ...(runtimeModel ? { runtimeModel } : {}), ...(typeof rssBytes === 'number' && rssBytes >= 0 ? { rssBytes } : {}), + ...(liveRuntimeMember?.livenessKind + ? { livenessKind: liveRuntimeMember.livenessKind } + : {}), + ...(liveRuntimeMember?.pidSource ? { pidSource: liveRuntimeMember.pidSource } : {}), + ...(liveRuntimeMember?.processCommand + ? { processCommand: liveRuntimeMember.processCommand } + : {}), + ...(liveRuntimeMember?.tmuxPaneId ? { paneId: liveRuntimeMember.tmuxPaneId } : {}), + ...(liveRuntimeMember?.panePid ? { panePid: liveRuntimeMember.panePid } : {}), + ...(liveRuntimeMember?.paneCurrentCommand + ? { paneCurrentCommand: liveRuntimeMember.paneCurrentCommand } + : {}), + ...(liveRuntimeMember?.metricsPid ? { runtimePid: liveRuntimeMember.metricsPid } : {}), + ...(liveRuntimeMember?.runtimeSessionId + ? { runtimeSessionId: liveRuntimeMember.runtimeSessionId } + : {}), + ...(liveRuntimeMember?.runtimeLastSeenAt + ? { runtimeLastSeenAt: liveRuntimeMember.runtimeLastSeenAt } + : {}), + ...(liveRuntimeMember?.runtimeDiagnostic + ? { runtimeDiagnostic: liveRuntimeMember.runtimeDiagnostic } + : {}), + ...(liveRuntimeMember?.runtimeDiagnosticSeverity + ? { runtimeDiagnosticSeverity: liveRuntimeMember.runtimeDiagnosticSeverity } + : {}), + ...(liveRuntimeMember?.diagnostics ? 
{ diagnostics: liveRuntimeMember.diagnostics } : {}), updatedAt, }; } @@ -7346,6 +7598,73 @@ export class TeamProvisioningService { ) { return; } + const refreshedFirstSpawnAcceptedAt = refreshed.firstSpawnAcceptedAt; + if (!refreshedFirstSpawnAcceptedAt) { + return; + } + if (isStrictTeamMemberLivenessMode()) { + const runtimeByMember = await this.getLiveTeamAgentRuntimeMetadata(run.teamName); + const metadata = + runtimeByMember.get(memberName) ?? + [...runtimeByMember.entries()].find(([candidateName]) => + matchesObservedMemberNameForExpected(candidateName, memberName) + )?.[1]; + const acceptedAtMs = Date.parse(refreshedFirstSpawnAcceptedAt); + const elapsedMs = Number.isFinite(acceptedAtMs) ? Date.now() - acceptedAtMs : Infinity; + const runtimeDiagnostic = metadata?.runtimeDiagnostic; + if (metadata?.livenessKind === 'runtime_process') { + this.setMemberSpawnStatus(run, memberName, 'online', undefined, 'process'); + return; + } + if (metadata?.livenessKind === 'permission_blocked') { + const next = { + ...refreshed, + livenessKind: metadata.livenessKind, + runtimeDiagnostic: runtimeDiagnostic ?? 'waiting for permission approval', + runtimeDiagnosticSeverity: metadata.runtimeDiagnosticSeverity ?? 'warning', + livenessLastCheckedAt: nowIso(), + launchState: 'runtime_pending_permission' as const, + }; + run.memberSpawnStatuses.set(memberName, next); + this.emitMemberSpawnChange(run, memberName); + return; + } + if ( + metadata?.livenessKind === 'runtime_process_candidate' && + elapsedMs < MEMBER_BOOTSTRAP_STALL_MS + ) { + const next = { + ...refreshed, + livenessKind: metadata.livenessKind, + runtimeDiagnostic: runtimeDiagnostic ?? 'runtime process candidate detected', + runtimeDiagnosticSeverity: metadata.runtimeDiagnosticSeverity ?? 
'warning', + livenessLastCheckedAt: nowIso(), + }; + run.memberSpawnStatuses.set(memberName, next); + this.emitMemberSpawnChange(run, memberName); + const stallDelayMs = Math.max( + 1_000, + Date.parse(refreshedFirstSpawnAcceptedAt) + MEMBER_BOOTSTRAP_STALL_MS - Date.now() + ); + const stallKey = `${this.getMemberLaunchGraceKey(run, memberName)}:bootstrap-stall`; + if (!this.pendingTimeouts.has(stallKey)) { + const timer = setTimeout(() => { + this.pendingTimeouts.delete(stallKey); + void this.reevaluateMemberLaunchStatus(run, memberName); + }, stallDelayMs); + timer.unref?.(); + this.pendingTimeouts.set(stallKey, timer); + } + return; + } + const strictReason = + runtimeDiagnostic ?? + (metadata?.livenessKind === 'shell_only' + ? 'Tmux pane is alive, but no teammate runtime process was found.' + : 'Teammate did not join within the launch grace window.'); + this.setMemberSpawnStatus(run, memberName, 'error', strictReason); + return; + } const restartPending = run.pendingMemberRestarts.has(memberName); if (restartPending) { run.pendingMemberRestarts.delete(memberName); @@ -9958,6 +10277,13 @@ export class TeamProvisioningService { pendingPermissionRequestIds: evidence?.pendingPermissionRequestIds?.length ? [...new Set(evidence.pendingPermissionRequestIds)] : undefined, + ...(evidence?.runtimePid ? { runtimePid: evidence.runtimePid } : {}), + ...(evidence?.sessionId ? { runtimeSessionId: evidence.sessionId } : {}), + ...(evidence?.livenessKind ? { livenessKind: evidence.livenessKind } : {}), + ...(evidence?.pidSource ? { pidSource: evidence.pidSource } : {}), + ...(evidence?.runtimeDiagnostic ? { runtimeDiagnostic: evidence.runtimeDiagnostic } : {}), + ...(evidence?.runtimeDiagnostic ? { runtimeDiagnosticSeverity: 'info' as const } : {}), + ...(evidence?.runtimeAlive ? { runtimeLastSeenAt: now } : {}), firstSpawnAcceptedAt: evidence?.agentToolAccepted ? now : undefined, lastHeartbeatAt: evidence?.bootstrapConfirmed ? 
now : undefined, lastRuntimeAliveAt: evidence?.runtimeAlive ? now : undefined, @@ -11652,6 +11978,7 @@ export class TeamProvisioningService { async getRuntimeState(teamName: string): Promise { const runId = this.getTrackedRunId(teamName); const run = runId ? (this.runs.get(runId) ?? null) : null; + const strictLiveness = isStrictTeamMemberLivenessMode(); if (!run) { const recovered = await readBootstrapRuntimeState(teamName); @@ -12024,6 +12351,7 @@ export class TeamProvisioningService { statuses: Record ): Promise> { const runtimeByMember = await this.getLiveTeamAgentRuntimeMetadata(teamName); + const strictLiveness = isStrictTeamMemberLivenessMode(); const nextStatuses = { ...statuses }; for (const [memberName, metadata] of runtimeByMember.entries()) { const resolvedStatusKey = @@ -12045,10 +12373,19 @@ export class TeamProvisioningService { const nextEntry: MemberSpawnStatusEntry = { ...current, ...(metadata.model ? { runtimeModel: metadata.model } : {}), + ...(metadata.livenessKind ? { livenessKind: metadata.livenessKind } : {}), + ...(metadata.runtimeDiagnostic ? { runtimeDiagnostic: metadata.runtimeDiagnostic } : {}), + ...(metadata.runtimeDiagnosticSeverity + ? { runtimeDiagnosticSeverity: metadata.runtimeDiagnosticSeverity } + : {}), + livenessLastCheckedAt: nowIso(), }; const failureReason = current.hardFailureReason ?? current.error; + const hasStrongEvidence = strictLiveness + ? 
isStrongRuntimeEvidence(metadata) + : metadata.alive === true; if ( - metadata.alive && + hasStrongEvidence && current.hardFailure !== true && current.launchState !== 'failed_to_start' ) { @@ -12062,7 +12399,7 @@ export class TeamProvisioningService { nextEntry.launchState = deriveMemberLaunchState(nextEntry); } if ( - metadata.alive && + hasStrongEvidence && current.launchState === 'failed_to_start' && isAutoClearableLaunchFailureReason(failureReason) ) { @@ -12332,6 +12669,7 @@ export class TeamProvisioningService { if (cached && cached.expiresAtMs > Date.now()) { return cached.metadata; } + const strictLiveness = isStrictTeamMemberLivenessMode(); const runId = this.getTrackedRunId(teamName); const run = runId ? (this.runs.get(runId) ?? null) : null; @@ -12379,10 +12717,17 @@ export class TeamProvisioningService { this.findMetaMemberModel(metaMembers, memberName); upsertMetadata(memberName, { backendType: normalizeTeamAgentRuntimeBackendType(member.backendType, false), + providerId: normalizeOptionalTeamProviderId(member.providerId), agentId: typeof member.agentId === 'string' ? member.agentId.trim() || undefined : undefined, tmuxPaneId: typeof member.tmuxPaneId === 'string' ? member.tmuxPaneId.trim() || undefined : undefined, + ...(normalizeRuntimePositiveInteger(member.runtimePid) + ? { metricsPid: normalizeRuntimePositiveInteger(member.runtimePid) } + : {}), + ...(typeof member.runtimeSessionId === 'string' && member.runtimeSessionId.trim() + ? { runtimeSessionId: member.runtimeSessionId.trim() } + : {}), ...(runtimeModel ? { model: runtimeModel } : {}), }); } @@ -12417,6 +12762,9 @@ export class TeamProvisioningService { ...(runtimeModel ? { model: runtimeModel } : {}), ...(configuredAgentId ? { agentId: configuredAgentId } : {}), ...(configuredTmuxPaneId ? { tmuxPaneId: configuredTmuxPaneId } : {}), + ...(normalizeOptionalTeamProviderId(member.providerId) + ? 
{ providerId: normalizeOptionalTeamProviderId(member.providerId) } + : {}), ...(normalizeTeamAgentRuntimeBackendType(configuredBackendType, false) ? { backendType: normalizeTeamAgentRuntimeBackendType(configuredBackendType, false), @@ -12440,6 +12788,9 @@ export class TeamProvisioningService { this.findEffectiveRunMemberModel(run, memberName); upsertMetadata(memberName, { ...(runtimeModel ? { model: runtimeModel } : {}), + ...(normalizeOptionalTeamProviderId(member.providerId) + ? { providerId: normalizeOptionalTeamProviderId(member.providerId) } + : {}), ...(typeof member.agentId === 'string' && member.agentId.trim() ? { agentId: member.agentId.trim() } : {}), @@ -12465,100 +12816,125 @@ export class TeamProvisioningService { const runtimeModel = lane.member.model?.trim() || undefined; upsertMetadata(memberName, { backendType: 'process', - alive: evidence?.runtimeAlive === true || evidence?.agentToolAccepted === true, + providerId: 'opencode', + alive: evidence?.runtimeAlive === true, + livenessKind: evidence?.livenessKind, + pidSource: evidence?.pidSource, + runtimeDiagnostic: evidence?.runtimeDiagnostic, ...(runtimeModel ? { model: runtimeModel } : {}), ...(typeof evidence?.runtimePid === 'number' && evidence.runtimePid > 0 ? { metricsPid: evidence.runtimePid } : {}), + ...(evidence?.sessionId ? { runtimeSessionId: evidence.sessionId } : {}), }); } - const shouldReadPersistedOpenCodeLaunchSnapshot = - (run?.mixedSecondaryLanes?.length ?? 0) > 0 || - configuredMembers.some( - (member) => normalizeOptionalTeamProviderId(member.providerId) === 'opencode' - ) || - metaMembers.some( - (member) => normalizeOptionalTeamProviderId(member.providerId) === 'opencode' - ); - const persistedLaunchSnapshot = shouldReadPersistedOpenCodeLaunchSnapshot - ? 
await this.launchStateStore.read(teamName).catch(() => null) - : null; + const persistedLaunchSnapshot = await this.launchStateStore.read(teamName).catch(() => null); for (const persistedMember of Object.values(persistedLaunchSnapshot?.members ?? {})) { const memberName = persistedMember.name?.trim() ?? ''; - if ( - !memberName || - this.isMemberRemovedInMeta(metaMembers, memberName) || - persistedMember.providerId !== 'opencode' || - persistedMember.laneKind !== 'secondary' || - persistedMember.laneOwnerProviderId !== 'opencode' - ) { + if (!memberName || this.isMemberRemovedInMeta(metaMembers, memberName)) { continue; } upsertMetadata(memberName, { - backendType: 'process', + backendType: + persistedMember.providerId === 'opencode' + ? 'process' + : metadataByMember.get(memberName)?.backendType, + providerId: persistedMember.providerId, alive: persistedMember.runtimeAlive === true || persistedMember.bootstrapConfirmed === true, + livenessKind: persistedMember.livenessKind, + pidSource: persistedMember.pidSource, + runtimeDiagnostic: persistedMember.runtimeDiagnostic, + runtimeDiagnosticSeverity: persistedMember.runtimeDiagnosticSeverity, + runtimeLastSeenAt: + persistedMember.runtimeLastSeenAt ?? + persistedMember.lastHeartbeatAt ?? + persistedMember.lastRuntimeAliveAt, ...(persistedMember.model?.trim() ? { model: persistedMember.model.trim() } : {}), ...(typeof persistedMember.runtimePid === 'number' && persistedMember.runtimePid > 0 ? { metricsPid: persistedMember.runtimePid } : {}), + ...(persistedMember.runtimeSessionId + ? { runtimeSessionId: persistedMember.runtimeSessionId } + : {}), }); } const paneIds = [...metadataByMember.values()] .map((metadata) => metadata.tmuxPaneId?.trim() ?? 
'') .filter((paneId) => paneId.length > 0); - let panePidById = new Map(); + let paneInfoById = new Map(); if (paneIds.length > 0) { try { - panePidById = await listTmuxPanePidsForCurrentPlatform(paneIds); + paneInfoById = await listTmuxPaneRuntimeInfoForCurrentPlatform(paneIds); } catch (error) { logger.debug( - `[${teamName}] Failed to read tmux pane pids for runtime snapshot: ${ + `[${teamName}] Failed to read tmux pane info for runtime snapshot: ${ error instanceof Error ? error.message : String(error) }` ); } } - const unresolvedAgentIds = [...metadataByMember.values()] - .map((metadata) => metadata.agentId?.trim() ?? '') - .filter((agentId) => agentId.length > 0); - const processPidByAgentId = - unresolvedAgentIds.length > 0 - ? this.findLiveProcessPidByAgentId(teamName, unresolvedAgentIds) - : new Map(); + let processRows: Awaited> = []; + let processTableAvailable = true; + try { + processRows = await listRuntimeProcessesForCurrentTmuxPlatform(); + } catch (error) { + processTableAvailable = false; + logger.debug( + `[${teamName}] Failed to read process table for runtime snapshot: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } for (const [memberName, metadata] of metadataByMember.entries()) { const paneId = metadata.tmuxPaneId?.trim() ?? ''; - const backendType = metadata.backendType; - const panePid = paneId ? panePidById.get(paneId) : undefined; - const processPid = metadata.agentId ? processPidByAgentId.get(metadata.agentId) : undefined; - const resolvedPid = - typeof panePid === 'number' && panePid > 0 - ? panePid - : typeof processPid === 'number' && processPid > 0 - ? 
processPid - : undefined; const status = this.findTrackedMemberSpawnStatus(run, memberName); - const mayInferAliveFromStatusOnly = status?.launchState !== 'failed_to_start'; - const sharedRuntimeAlive = - backendType === 'process' && - typeof metadata.metricsPid === 'number' && - metadata.metricsPid > 0; - const alive = - typeof resolvedPid === 'number' && resolvedPid > 0 - ? true - : backendType === 'tmux' - ? false - : sharedRuntimeAlive - ? true - : mayInferAliveFromStatusOnly && - Boolean(status?.runtimeAlive || status?.bootstrapConfirmed); + const launchMember = persistedLaunchSnapshot?.members[memberName]; + const resolved = resolveTeamMemberRuntimeLiveness({ + teamName, + memberName, + agentId: metadata.agentId, + backendType: metadata.backendType, + providerId: metadata.providerId ?? launchMember?.providerId, + tmuxPaneId: metadata.tmuxPaneId, + persistedRuntimePid: launchMember?.runtimePid ?? metadata.metricsPid, + persistedRuntimeSessionId: launchMember?.runtimeSessionId ?? metadata.runtimeSessionId, + trackedSpawnStatus: status, + runtimePid: metadata.metricsPid, + runtimeSessionId: metadata.runtimeSessionId, + pane: paneId ? paneInfoById.get(paneId) : undefined, + processRows, + processTableAvailable, + nowIso: nowIso(), + }); + const legacyWeakAlive = + resolved.alive || + (resolved.pidSource === 'tmux_pane' && typeof resolved.pid === 'number') || + (metadata.backendType === 'process' && + typeof metadata.metricsPid === 'number' && + metadata.metricsPid > 0); metadataByMember.set(memberName, { ...metadata, - alive, - ...(typeof resolvedPid === 'number' && resolvedPid > 0 ? { pid: resolvedPid } : {}), + alive: strictLiveness ? resolved.alive : legacyWeakAlive, + ...(typeof resolved.pid === 'number' && resolved.pid > 0 ? { pid: resolved.pid } : {}), + ...(typeof (resolved.metricsPid ?? metadata.metricsPid) === 'number' && + Number.isFinite(resolved.metricsPid ?? metadata.metricsPid) && + (resolved.metricsPid ?? metadata.metricsPid)! > 0 + ? 
{ metricsPid: resolved.metricsPid ?? metadata.metricsPid } + : {}), + livenessKind: resolved.livenessKind, + ...(resolved.pidSource ? { pidSource: resolved.pidSource } : {}), + ...(resolved.processCommand ? { processCommand: resolved.processCommand } : {}), + ...(resolved.panePid ? { panePid: resolved.panePid } : {}), + ...(resolved.paneCurrentCommand ? { paneCurrentCommand: resolved.paneCurrentCommand } : {}), + ...(resolved.runtimeSessionId ? { runtimeSessionId: resolved.runtimeSessionId } : {}), + ...(resolved.runtimeLastSeenAt ? { runtimeLastSeenAt: resolved.runtimeLastSeenAt } : {}), + runtimeDiagnostic: resolved.runtimeDiagnostic, + runtimeDiagnosticSeverity: resolved.runtimeDiagnosticSeverity, + diagnostics: resolved.diagnostics, }); } @@ -12569,83 +12945,6 @@ export class TeamProvisioningService { return metadataByMember; } - private readUnixProcessTableRows(): { - pid: number; - command: string; - }[] { - if (process.platform === 'win32') { - return []; - } - - let output = ''; - try { - output = execFileSync('ps', ['-ax', '-o', 'pid=,command='], { - encoding: 'utf8', - stdio: ['ignore', 'pipe', 'ignore'], - }); - } catch { - return []; - } - - const rows: { pid: number; command: string }[] = []; - for (const line of output.split('\n')) { - const trimmed = line.trim(); - if (!trimmed) continue; - const match = /^(\d+)\s+(.*)$/.exec(trimmed); - if (!match) continue; - const pid = Number.parseInt(match[1], 10); - const command = match[2]?.trim() ?? 
''; - if (!Number.isFinite(pid) || pid <= 0 || command.length === 0) { - continue; - } - rows.push({ - pid, - command, - }); - } - return rows; - } - - private findLiveProcessPidByAgentId( - teamName: string, - agentIds: readonly string[] - ): Map { - const normalizedAgentIds = [ - ...new Set(agentIds.map((agentId) => agentId.trim()).filter(Boolean)), - ]; - if (normalizedAgentIds.length === 0) { - return new Map(); - } - - const rows = this.readUnixProcessTableRows(); - if (rows.length === 0) { - return new Map(); - } - - const pidByAgentId = new Map(); - for (const row of rows) { - if ( - !commandContainsCliArgValue(row.command, '--team-name', teamName) || - !row.command.includes('--agent-id') - ) { - continue; - } - - for (const agentId of normalizedAgentIds) { - if (!commandContainsCliArgValue(row.command, '--agent-id', agentId)) { - continue; - } - const currentPid = pidByAgentId.get(agentId); - if (!currentPid || row.pid > currentPid) { - pidByAgentId.set(agentId, row.pid); - } - break; - } - } - - return pidByAgentId; - } - private async readProcessRssBytesByPid(pids: readonly number[]): Promise> { const uniquePids = [...new Set(pids.filter((pid) => Number.isFinite(pid) && pid > 0))]; if (uniquePids.length === 0) { @@ -12711,6 +13010,11 @@ export class TeamProvisioningService { pendingCount: number; failedCount: number; runtimeAlivePendingCount: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; } { const expectedMembers = run.expectedMembers ?? []; const memberSpawnStatuses = run.memberSpawnStatuses ?? 
new Map(); @@ -12718,6 +13022,11 @@ export class TeamProvisioningService { let pendingCount = 0; let failedCount = 0; let runtimeAlivePendingCount = 0; + let shellOnlyPendingCount = 0; + let runtimeProcessPendingCount = 0; + let runtimeCandidatePendingCount = 0; + let noRuntimePendingCount = 0; + let permissionPendingCount = 0; for (const expected of expectedMembers) { const entry = memberSpawnStatuses.get(expected) ?? createInitialMemberSpawnStatusEntry(); if (entry.launchState === 'confirmed_alive') { @@ -12732,8 +13041,30 @@ export class TeamProvisioningService { if (entry.runtimeAlive) { runtimeAlivePendingCount += 1; } + if (entry.launchState === 'runtime_pending_permission') { + permissionPendingCount += 1; + } + if (entry.livenessKind === 'shell_only') { + shellOnlyPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process') { + runtimeProcessPendingCount += 1; + } else if (entry.livenessKind === 'runtime_process_candidate') { + runtimeCandidatePendingCount += 1; + } else if (entry.livenessKind === 'not_found' || entry.livenessKind === 'stale_metadata') { + noRuntimePendingCount += 1; + } } - return { confirmedCount, pendingCount, failedCount, runtimeAlivePendingCount }; + return { + confirmedCount, + pendingCount, + failedCount, + runtimeAlivePendingCount, + shellOnlyPendingCount, + runtimeProcessPendingCount, + runtimeCandidatePendingCount, + noRuntimePendingCount, + permissionPendingCount, + }; } private buildPendingBootstrapStatusMessage( @@ -12743,6 +13074,11 @@ export class TeamProvisioningService { confirmedCount: number; pendingCount: number; runtimeAlivePendingCount: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; }, snapshot?: PersistedTeamLaunchSnapshot | null ): string { @@ -12768,6 +13104,21 @@ export class TeamProvisioningService { 0, launchSummary.pendingCount - 
launchSummary.runtimeAlivePendingCount ); + const diagnosticParts = [ + launchSummary.shellOnlyPendingCount + ? `${launchSummary.shellOnlyPendingCount} shell-only` + : '', + launchSummary.runtimeProcessPendingCount + ? `${launchSummary.runtimeProcessPendingCount} waiting for bootstrap` + : '', + launchSummary.runtimeCandidatePendingCount + ? `${launchSummary.runtimeCandidatePendingCount} process candidates` + : '', + launchSummary.noRuntimePendingCount + ? `${launchSummary.noRuntimePendingCount} no runtime found` + : '', + ].filter(Boolean); + const diagnosticSuffix = diagnosticParts.length > 0 ? ` - ${diagnosticParts.join(', ')}` : ''; if (launchSummary.confirmedCount === 0) { const allRuntimeAlive = launchSummary.runtimeAlivePendingCount > 0 && @@ -12776,10 +13127,10 @@ export class TeamProvisioningService { ? `${prefix} — teammates online` : launchSummary.runtimeAlivePendingCount > 0 ? `${prefix} — ${launchSummary.runtimeAlivePendingCount}/${expectedTeammateCount} teammate${launchSummary.runtimeAlivePendingCount === 1 ? '' : 's'} online${stillStartingCount > 0 ? `, ${stillStartingCount} still starting` : ''}` - : `${prefix} — teammates are still starting`; + : `${prefix} — teammates are still starting${diagnosticSuffix}`; } - return `${prefix} — ${launchSummary.confirmedCount}/${expectedTeammateCount} teammates made contact${launchSummary.runtimeAlivePendingCount > 0 ? `, ${launchSummary.runtimeAlivePendingCount} teammate${launchSummary.runtimeAlivePendingCount === 1 ? '' : 's'} online` : ''}${stillStartingCount > 0 ? `${launchSummary.runtimeAlivePendingCount > 0 ? ', ' : ', '}${stillStartingCount} still joining` : ''}`; + return `${prefix} — ${launchSummary.confirmedCount}/${expectedTeammateCount} teammates made contact${launchSummary.runtimeAlivePendingCount > 0 ? `, ${launchSummary.runtimeAlivePendingCount} teammate${launchSummary.runtimeAlivePendingCount === 1 ? '' : 's'} online` : ''}${stillStartingCount > 0 ? `${launchSummary.runtimeAlivePendingCount > 0 ? 
', ' : ', '}${stillStartingCount} still joining${diagnosticSuffix}` : ''}`; } private buildAggregatePendingLaunchMessage( diff --git a/src/main/services/team/TeamRuntimeLivenessResolver.ts b/src/main/services/team/TeamRuntimeLivenessResolver.ts new file mode 100644 index 00000000..410b7f73 --- /dev/null +++ b/src/main/services/team/TeamRuntimeLivenessResolver.ts @@ -0,0 +1,350 @@ +import type { RuntimeProcessTableRow, TmuxPaneRuntimeInfo } from '@features/tmux-installer/main'; +import type { + MemberSpawnStatusEntry, + TeamAgentRuntimeBackendType, + TeamAgentRuntimeDiagnosticSeverity, + TeamAgentRuntimeLivenessKind, + TeamAgentRuntimePidSource, + TeamProviderId, +} from '@shared/types'; + +export interface ResolveTeamMemberRuntimeLivenessInput { + teamName: string; + memberName: string; + agentId?: string; + backendType?: TeamAgentRuntimeBackendType; + providerId?: TeamProviderId; + tmuxPaneId?: string; + persistedRuntimePid?: number; + persistedRuntimeSessionId?: string; + trackedSpawnStatus?: MemberSpawnStatusEntry; + runtimePid?: number; + runtimeSessionId?: string; + pane?: TmuxPaneRuntimeInfo; + processRows: readonly RuntimeProcessTableRow[]; + processTableAvailable: boolean; + nowIso: string; +} + +export interface ResolvedTeamMemberRuntimeLiveness { + alive: boolean; + livenessKind: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + pid?: number; + metricsPid?: number; + panePid?: number; + paneCurrentCommand?: string; + processCommand?: string; + runtimeSessionId?: string; + runtimeLastSeenAt?: string; + runtimeDiagnostic: string; + runtimeDiagnosticSeverity: TeamAgentRuntimeDiagnosticSeverity; + diagnostics: string[]; +} + +const SHELL_COMMAND_NAMES = new Set(['sh', 'bash', 'zsh', 'fish', 'dash', 'login', 'tmux']); +const SECRET_FLAG_PATTERN = + /(--(?:api-key|token|password|secret|authorization|auth-token)(?:=|\s+))("[^"]*"|'[^']*'|\S+)/gi; + +function basenameCommand(command: string | undefined): string { + const firstToken = 
command?.trim().split(/\s+/, 1)[0] ?? ''; + const base = firstToken.split(/[\\/]/).pop() ?? firstToken; + return base.replace(/^-/, '').toLowerCase(); +} + +export function isShellLikeCommand(command: string | undefined): boolean { + return SHELL_COMMAND_NAMES.has(basenameCommand(command)); +} + +export function sanitizeProcessCommandForDiagnostics( + command: string | undefined +): string | undefined { + const trimmed = command?.trim(); + if (!trimmed) return undefined; + return trimmed.replace(SECRET_FLAG_PATTERN, '$1[redacted]').slice(0, 500); +} + +function escapeRegexLiteral(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +export function extractCliArgValues(command: string, argName: string): string[] { + const escapedArg = escapeRegexLiteral(argName); + const pattern = new RegExp( + `(?:^|\\s)${escapedArg}(?:=|\\s+)("([^"]*)"|'([^']*)'|([^\\s]+))`, + 'g' + ); + + const values: string[] = []; + for (const match of command.matchAll(pattern)) { + const value = (match[2] ?? match[3] ?? match[4] ?? '').trim(); + if (value) values.push(value); + } + return values; +} + +export function commandArgEquals( + command: string, + argName: string, + expected: string | undefined +): boolean { + const normalizedExpected = expected?.trim(); + if (!normalizedExpected) return false; + return extractCliArgValues(command, argName).some((value) => value === normalizedExpected); +} + +function collectDescendants( + rows: readonly RuntimeProcessTableRow[], + rootPid: number +): RuntimeProcessTableRow[] { + const childrenByParent = new Map(); + for (const row of rows) { + const current = childrenByParent.get(row.ppid) ?? []; + current.push(row); + childrenByParent.set(row.ppid, current); + } + + const descendants: RuntimeProcessTableRow[] = []; + const queue = [...(childrenByParent.get(rootPid) ?? 
[])]; + const seen = new Set(); + while (queue.length > 0) { + const row = queue.shift(); + if (!row || seen.has(row.pid)) continue; + seen.add(row.pid); + descendants.push(row); + queue.push(...(childrenByParent.get(row.pid) ?? [])); + } + return descendants; +} + +function isVerifiedRuntimeProcess(params: { + row: RuntimeProcessTableRow; + teamName: string; + agentId?: string; +}): boolean { + return ( + commandArgEquals(params.row.command, '--team-name', params.teamName) && + commandArgEquals(params.row.command, '--agent-id', params.agentId) + ); +} + +function hasPersistedEvidence(input: ResolveTeamMemberRuntimeLivenessInput): boolean { + return Boolean( + input.agentId?.trim() || + input.tmuxPaneId?.trim() || + input.persistedRuntimePid || + input.runtimePid || + input.persistedRuntimeSessionId?.trim() || + input.runtimeSessionId?.trim() || + input.backendType + ); +} + +function result(params: { + alive: boolean; + livenessKind: TeamAgentRuntimeLivenessKind; + runtimeDiagnostic: string; + runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; + diagnostics?: string[]; + pidSource?: TeamAgentRuntimePidSource; + pid?: number; + metricsPid?: number; + panePid?: number; + paneCurrentCommand?: string; + processCommand?: string; + runtimeSessionId?: string; + runtimeLastSeenAt?: string; +}): ResolvedTeamMemberRuntimeLiveness { + return { + alive: params.alive, + livenessKind: params.livenessKind, + runtimeDiagnostic: params.runtimeDiagnostic, + runtimeDiagnosticSeverity: params.runtimeDiagnosticSeverity ?? 'info', + diagnostics: params.diagnostics ?? [params.runtimeDiagnostic], + ...(params.pidSource ? { pidSource: params.pidSource } : {}), + ...(typeof params.pid === 'number' && params.pid > 0 ? { pid: params.pid } : {}), + ...(typeof params.metricsPid === 'number' && params.metricsPid > 0 + ? { metricsPid: params.metricsPid } + : {}), + ...(typeof params.panePid === 'number' && params.panePid > 0 + ? 
{ panePid: params.panePid } + : {}), + ...(params.paneCurrentCommand ? { paneCurrentCommand: params.paneCurrentCommand } : {}), + ...(params.processCommand ? { processCommand: params.processCommand } : {}), + ...(params.runtimeSessionId ? { runtimeSessionId: params.runtimeSessionId } : {}), + ...(params.runtimeLastSeenAt ? { runtimeLastSeenAt: params.runtimeLastSeenAt } : {}), + }; +} + +export function resolveTeamMemberRuntimeLiveness( + input: ResolveTeamMemberRuntimeLivenessInput +): ResolvedTeamMemberRuntimeLiveness { + const tracked = input.trackedSpawnStatus; + const runtimeSessionId = input.runtimeSessionId ?? input.persistedRuntimeSessionId; + const diagnostics: string[] = []; + if (!input.processTableAvailable) { + diagnostics.push('process table unavailable'); + } + + if (tracked?.bootstrapConfirmed === true || tracked?.launchState === 'confirmed_alive') { + return result({ + alive: true, + livenessKind: 'confirmed_bootstrap', + pidSource: 'runtime_bootstrap', + runtimeSessionId, + runtimeLastSeenAt: tracked.lastHeartbeatAt ?? tracked.updatedAt, + runtimeDiagnostic: 'bootstrap confirmed', + diagnostics: [...diagnostics, 'bootstrap confirmed'], + }); + } + + if ( + tracked?.launchState === 'runtime_pending_permission' || + (tracked?.pendingPermissionRequestIds?.length ?? 
0) > 0 + ) { + return result({ + alive: false, + livenessKind: 'permission_blocked', + runtimeSessionId, + runtimeDiagnostic: 'waiting for permission approval', + runtimeDiagnosticSeverity: 'warning', + diagnostics: [...diagnostics, 'permission approval pending'], + }); + } + + const verifiedProcess = input.processRows + .filter((row) => + isVerifiedRuntimeProcess({ row, teamName: input.teamName, agentId: input.agentId }) + ) + .sort((left, right) => right.pid - left.pid)[0]; + if (verifiedProcess) { + return result({ + alive: true, + livenessKind: 'runtime_process', + pidSource: 'agent_process_table', + pid: verifiedProcess.pid, + runtimeSessionId, + processCommand: sanitizeProcessCommandForDiagnostics(verifiedProcess.command), + runtimeDiagnostic: 'verified runtime process detected', + diagnostics: [...diagnostics, 'matched process table by team-name and agent-id'], + }); + } + + const runtimePid = input.runtimePid ?? input.persistedRuntimePid; + const runtimePidRow = + typeof runtimePid === 'number' && runtimePid > 0 + ? 
input.processRows.find((row) => row.pid === runtimePid) + : undefined; + if (runtimePidRow && input.providerId === 'opencode') { + return result({ + alive: true, + livenessKind: 'runtime_process', + pidSource: 'opencode_bridge', + pid: runtimePidRow.pid, + runtimeSessionId, + processCommand: sanitizeProcessCommandForDiagnostics(runtimePidRow.command), + runtimeDiagnostic: 'OpenCode runtime process detected', + diagnostics: [...diagnostics, 'matched OpenCode runtime pid in process table'], + }); + } + + const pane = input.pane; + if (pane) { + const descendants = collectDescendants(input.processRows, pane.panePid); + const verifiedDescendant = descendants + .filter((row) => + isVerifiedRuntimeProcess({ row, teamName: input.teamName, agentId: input.agentId }) + ) + .sort((left, right) => right.pid - left.pid)[0]; + if (verifiedDescendant) { + return result({ + alive: true, + livenessKind: 'runtime_process', + pidSource: 'tmux_child', + pid: verifiedDescendant.pid, + panePid: pane.panePid, + paneCurrentCommand: pane.currentCommand, + runtimeSessionId, + processCommand: sanitizeProcessCommandForDiagnostics(verifiedDescendant.command), + runtimeDiagnostic: 'verified tmux runtime child detected', + diagnostics: [...diagnostics, 'matched tmux descendant by team-name and agent-id'], + }); + } + + const candidate = descendants.find((row) => !isShellLikeCommand(row.command)); + if (candidate) { + return result({ + alive: false, + livenessKind: 'runtime_process_candidate', + pidSource: 'tmux_child', + pid: candidate.pid, + panePid: pane.panePid, + paneCurrentCommand: pane.currentCommand, + runtimeSessionId, + processCommand: sanitizeProcessCommandForDiagnostics(candidate.command), + runtimeDiagnostic: 'runtime process candidate detected', + runtimeDiagnosticSeverity: 'warning', + diagnostics: [...diagnostics, 'tmux descendant found without runtime identity match'], + }); + } + + const shellOnly = isShellLikeCommand(pane.currentCommand); + return result({ + alive: false, + 
livenessKind: shellOnly ? 'shell_only' : 'runtime_process_candidate', + pidSource: 'tmux_pane', + pid: pane.panePid, + panePid: pane.panePid, + paneCurrentCommand: pane.currentCommand, + runtimeSessionId, + runtimeDiagnostic: shellOnly + ? `tmux pane foreground command is ${pane.currentCommand ?? 'a shell'}` + : 'tmux pane is alive, but runtime identity is not verified', + runtimeDiagnosticSeverity: shellOnly ? 'warning' : 'info', + diagnostics: [ + ...diagnostics, + shellOnly + ? `tmux pane is alive, but foreground command is ${pane.currentCommand ?? 'a shell'}` + : 'tmux pane exists, but no verified runtime process was found', + ], + }); + } + + if (runtimePid && !runtimePidRow) { + return result({ + alive: false, + livenessKind: 'stale_metadata', + pidSource: 'persisted_metadata', + pid: runtimePid, + runtimeSessionId, + runtimeDiagnostic: 'persisted runtime pid is not alive', + runtimeDiagnosticSeverity: 'warning', + diagnostics: [...diagnostics, 'persisted runtime pid was not found in process table'], + }); + } + + if (hasPersistedEvidence(input)) { + return result({ + alive: false, + livenessKind: 'registered_only', + runtimeSessionId, + runtimeDiagnostic: 'registered runtime metadata without live process', + runtimeDiagnosticSeverity: 'warning', + diagnostics: [...diagnostics, 'member has persisted runtime metadata only'], + }); + } + + return result({ + alive: false, + livenessKind: 'not_found', + runtimeDiagnostic: 'runtime process not found', + runtimeDiagnosticSeverity: 'warning', + diagnostics: [...diagnostics, 'runtime process not found'], + }); +} + +export function isStrongRuntimeEvidence( + value: { livenessKind?: TeamAgentRuntimeLivenessKind } | undefined +): boolean { + return value?.livenessKind === 'confirmed_bootstrap' || value?.livenessKind === 'runtime_process'; +} diff --git a/src/main/services/team/progressPayload.ts b/src/main/services/team/progressPayload.ts index c2f4fce7..1ab0ef41 100644 --- a/src/main/services/team/progressPayload.ts 
+++ b/src/main/services/team/progressPayload.ts @@ -12,8 +12,12 @@ * diagnostics and completion-time reports. */ +import type { TeamLaunchDiagnosticItem } from '@shared/types'; + export const PROGRESS_LOG_TAIL_LINES = 200; export const PROGRESS_OUTPUT_TAIL_PARTS = 20; +export const PROGRESS_LAUNCH_DIAGNOSTICS_LIMIT = 20; +const PROGRESS_LAUNCH_DIAGNOSTIC_TEXT_LIMIT = 500; /** * Return the trailing `maxLines` of a line-buffered CLI log, joined with "\n" @@ -50,3 +54,29 @@ export function buildProgressAssistantOutput( const joined = tail.join('\n\n'); return joined.trim().length === 0 ? undefined : joined; } + +function boundDiagnosticText(value: string | undefined): string | undefined { + const trimmed = value?.replace(/\s+/g, ' ').trim(); + if (!trimmed) { + return undefined; + } + return trimmed.length > PROGRESS_LAUNCH_DIAGNOSTIC_TEXT_LIMIT + ? `${trimmed.slice(0, PROGRESS_LAUNCH_DIAGNOSTIC_TEXT_LIMIT - 3).trimEnd()}...` + : trimmed; +} + +export function boundLaunchDiagnostics( + items: readonly TeamLaunchDiagnosticItem[] | undefined, + maxItems: number = PROGRESS_LAUNCH_DIAGNOSTICS_LIMIT +): TeamLaunchDiagnosticItem[] | undefined { + if (!items || items.length === 0) { + return undefined; + } + + const bounded = items.slice(0, Math.max(1, maxItems)).map((item) => ({ + ...item, + label: boundDiagnosticText(item.label) ?? item.code, + detail: boundDiagnosticText(item.detail), + })); + return bounded.length > 0 ? 
bounded : undefined; +} diff --git a/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts b/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts index 208ba184..0b093732 100644 --- a/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts +++ b/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts @@ -546,7 +546,25 @@ function mapBridgeMemberToRuntimeEvidence( const confirmed = launchState === 'confirmed_alive'; const createdOrBlocked = launchState === 'created' || launchState === 'permission_blocked'; const failed = launchState === 'failed'; - const pendingRuntimeObserved = createdOrBlocked && runtimeMaterialized; + const hasRuntimePid = + typeof runtimePid === 'number' && Number.isFinite(runtimePid) && runtimePid > 0; + const pendingRuntimeObserved = launchState === 'created' && hasRuntimePid; + const livenessKind = confirmed + ? 'confirmed_bootstrap' + : pendingRuntimeObserved + ? 'runtime_process' + : launchState === 'permission_blocked' + ? 'permission_blocked' + : runtimeMaterialized || sessionId + ? 'runtime_process_candidate' + : 'registered_only'; + const runtimeDiagnostic = pendingRuntimeObserved + ? 'OpenCode runtime process reported by bridge' + : launchState === 'permission_blocked' + ? 'OpenCode runtime is waiting for permission approval' + : runtimeMaterialized || sessionId + ? 'OpenCode session exists without verified runtime pid' + : undefined; return { memberName, providerId: 'opencode', @@ -557,7 +575,7 @@ function mapBridgeMemberToRuntimeEvidence( : launchState === 'permission_blocked' ? 'runtime_pending_permission' : 'runtime_pending_bootstrap', - agentToolAccepted: confirmed || pendingRuntimeObserved, + agentToolAccepted: confirmed || createdOrBlocked || runtimeMaterialized, runtimeAlive: confirmed || pendingRuntimeObserved, bootstrapConfirmed: confirmed, hardFailure: failed, @@ -567,9 +585,10 @@ function mapBridgeMemberToRuntimeEvidence( ? 
[...new Set(pendingPermissionRequestIds)] : undefined, sessionId, - ...(typeof runtimePid === 'number' && Number.isFinite(runtimePid) && runtimePid > 0 - ? { runtimePid } - : {}), + ...(hasRuntimePid ? { runtimePid } : {}), + livenessKind, + ...(hasRuntimePid ? { pidSource: 'opencode_bridge' as const } : {}), + ...(runtimeDiagnostic ? { runtimeDiagnostic } : {}), diagnostics, }; } diff --git a/src/main/services/team/runtime/TeamRuntimeAdapter.ts b/src/main/services/team/runtime/TeamRuntimeAdapter.ts index 80847ebc..1fd06f8e 100644 --- a/src/main/services/team/runtime/TeamRuntimeAdapter.ts +++ b/src/main/services/team/runtime/TeamRuntimeAdapter.ts @@ -4,6 +4,8 @@ import type { PersistedTeamLaunchPhase, PersistedTeamLaunchSnapshot, TeamAgentRuntimeBackendType, + TeamAgentRuntimeLivenessKind, + TeamAgentRuntimePidSource, TeamLaunchAggregateState, } from '@shared/types'; @@ -73,6 +75,9 @@ export interface TeamRuntimeMemberLaunchEvidence { sessionId?: string; backendType?: TeamAgentRuntimeBackendType; runtimePid?: number; + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + runtimeDiagnostic?: string; diagnostics: string[]; } diff --git a/src/renderer/components/team/ProvisioningProgressBlock.tsx b/src/renderer/components/team/ProvisioningProgressBlock.tsx index 0359139b..ecd5d8db 100644 --- a/src/renderer/components/team/ProvisioningProgressBlock.tsx +++ b/src/renderer/components/team/ProvisioningProgressBlock.tsx @@ -19,6 +19,7 @@ import { DISPLAY_STEPS } from './provisioningSteps'; import { StepProgressBar } from './StepProgressBar'; import type { StepProgressBarStep } from './StepProgressBar'; +import type { TeamLaunchDiagnosticItem } from '@shared/types'; /** Pre-built step definitions for the provisioning stepper. 
*/ const PROVISIONING_STEPS: StepProgressBarStep[] = DISPLAY_STEPS.map((s) => ({ @@ -61,6 +62,8 @@ export interface ProvisioningProgressBlockProps { cliLogsTail?: string; /** Accumulated assistant text output for live preview */ assistantOutput?: string; + /** Bounded structured launch diagnostics */ + launchDiagnostics?: TeamLaunchDiagnosticItem[]; /** Visual surface chrome for the outer block */ surface?: 'raised' | 'flat'; className?: string; @@ -153,11 +156,13 @@ export const ProvisioningProgressBlock = ({ pid, cliLogsTail, assistantOutput, + launchDiagnostics, surface = 'raised', className, }: ProvisioningProgressBlockProps): React.JSX.Element => { const elapsed = useElapsedTimer(startedAt, loading); const [logsOpen, setLogsOpen] = useState(() => defaultLogsOpen ?? false); + const [diagnosticsOpen, setDiagnosticsOpen] = useState(false); const [liveOutputOpen, setLiveOutputOpen] = useState(defaultLiveOutputOpen); const outputScrollRef = useRef(null); const isError = tone === 'error'; @@ -293,6 +298,42 @@ export const ProvisioningProgressBlock = ({ errorIndex={errorStepIndex} /> + {launchDiagnostics && launchDiagnostics.length > 0 ? ( +
+ + {diagnosticsOpen ? ( +
+ {launchDiagnostics.map((item) => ( +
+
+ {item.label} +
+ {item.detail ? ( +
+ {item.detail} +
+ ) : null} +
+ ))} +
+ ) : null} +
+ ) : null}
) : null} {showLaunchBadge ? ( - + onUpdateRole(member.name, newRole) : undefined @@ -253,6 +254,7 @@ export const MemberDetailDialog = ({ ) : runtimeEntry?.pid ? (
PID {runtimeEntry.pid} + {runtimeEntry.pidSource ? ` · ${runtimeEntry.pidSource}` : ''}
) : (
diff --git a/src/renderer/components/team/members/MemberDetailHeader.tsx b/src/renderer/components/team/members/MemberDetailHeader.tsx index 992c9934..a26a2766 100644 --- a/src/renderer/components/team/members/MemberDetailHeader.tsx +++ b/src/renderer/components/team/members/MemberDetailHeader.tsx @@ -23,11 +23,13 @@ import type { MemberSpawnLivenessSource, MemberSpawnStatus, ResolvedTeamMember, + TeamAgentRuntimeEntry, } from '@shared/types'; interface MemberDetailHeaderProps { member: ResolvedTeamMember; runtimeSummary?: string; + runtimeEntry?: TeamAgentRuntimeEntry; isTeamAlive?: boolean; isTeamProvisioning?: boolean; leadActivity?: LeadActivityState; @@ -43,6 +45,7 @@ interface MemberDetailHeaderProps { export const MemberDetailHeader = ({ member, runtimeSummary, + runtimeEntry, isTeamAlive, isTeamProvisioning, leadActivity, @@ -75,6 +78,7 @@ export const MemberDetailHeader = ({ spawnLaunchState, spawnLivenessSource, spawnRuntimeAlive, + runtimeEntry, runtimeAdvisory: member.runtimeAdvisory, isLaunchSettling, isTeamAlive, @@ -91,7 +95,12 @@ export const MemberDetailHeader = ({ const badgeLabel = runtimeAdvisoryTone === 'error' && runtimeAdvisoryLabel ? runtimeAdvisoryLabel - : launchVisualState === 'runtime_pending' || launchVisualState === 'permission_pending' + : launchVisualState === 'runtime_pending' || + launchVisualState === 'permission_pending' || + launchVisualState === 'shell_only' || + launchVisualState === 'runtime_candidate' || + launchVisualState === 'registered_only' || + launchVisualState === 'stale_runtime' ? (launchStatusLabel ?? 
presenceLabel) : presenceLabel; diff --git a/src/renderer/components/team/members/MemberHoverCard.tsx b/src/renderer/components/team/members/MemberHoverCard.tsx index 8c85df45..ea61631e 100644 --- a/src/renderer/components/team/members/MemberHoverCard.tsx +++ b/src/renderer/components/team/members/MemberHoverCard.tsx @@ -68,6 +68,7 @@ export const MemberHoverCard = ({ memberSpawnSnapshot, memberSpawnStatuses, spawnEntry, + runtimeEntry, leadActivity, } = useStore( useShallow((s) => ({ @@ -89,6 +90,9 @@ export const MemberHoverCard = ({ spawnEntry: effectiveTeamName ? s.memberSpawnStatusesByTeam[effectiveTeamName]?.[name] : undefined, + runtimeEntry: effectiveTeamName + ? s.teamAgentRuntimeByTeam[effectiveTeamName]?.members[name] + : undefined, leadActivity: effectiveTeamName ? s.leadActivityByTeam[effectiveTeamName] : undefined, })) ); @@ -114,6 +118,7 @@ export const MemberHoverCard = ({ spawnLaunchState: spawnEntry?.launchState, spawnLivenessSource: spawnEntry?.livenessSource, spawnRuntimeAlive: spawnEntry?.runtimeAlive, + runtimeEntry, runtimeAdvisory: member.runtimeAdvisory, isLaunchSettling, isTeamAlive, @@ -130,7 +135,12 @@ export const MemberHoverCard = ({ const badgeLabel = runtimeAdvisoryTone === 'error' && runtimeAdvisoryLabel ? runtimeAdvisoryLabel - : launchVisualState === 'runtime_pending' || launchVisualState === 'permission_pending' + : launchVisualState === 'runtime_pending' || + launchVisualState === 'permission_pending' || + launchVisualState === 'shell_only' || + launchVisualState === 'runtime_candidate' || + launchVisualState === 'registered_only' || + launchVisualState === 'stale_runtime' ? (launchStatusLabel ?? 
presenceLabel) : presenceLabel; const currentTask: TeamTaskWithKanban | null = member.currentTaskId diff --git a/src/renderer/components/team/members/MemberList.tsx b/src/renderer/components/team/members/MemberList.tsx index 6f07b69d..5a12e674 100644 --- a/src/renderer/components/team/members/MemberList.tsx +++ b/src/renderer/components/team/members/MemberList.tsx @@ -151,6 +151,9 @@ function areMemberSpawnStatusesEquivalent( leftEntry.hardFailure !== rightEntry.hardFailure || leftEntry.hardFailureReason !== rightEntry.hardFailureReason || leftEntry.livenessSource !== rightEntry.livenessSource || + leftEntry.livenessKind !== rightEntry.livenessKind || + leftEntry.runtimeDiagnostic !== rightEntry.runtimeDiagnostic || + leftEntry.runtimeDiagnosticSeverity !== rightEntry.runtimeDiagnosticSeverity || leftEntry.runtimeModel !== rightEntry.runtimeModel || leftEntry.runtimeAlive !== rightEntry.runtimeAlive || leftEntry.bootstrapConfirmed !== rightEntry.bootstrapConfirmed || @@ -196,7 +199,13 @@ function areMemberRuntimeEntriesEquivalent( leftEntry.backendType !== rightEntry?.backendType || leftEntry.pid !== rightEntry?.pid || leftEntry.runtimeModel !== rightEntry?.runtimeModel || - leftEntry.rssBytes !== rightEntry?.rssBytes + leftEntry.rssBytes !== rightEntry?.rssBytes || + leftEntry.livenessKind !== rightEntry?.livenessKind || + leftEntry.pidSource !== rightEntry?.pidSource || + leftEntry.paneCurrentCommand !== rightEntry?.paneCurrentCommand || + leftEntry.runtimeDiagnostic !== rightEntry?.runtimeDiagnostic || + leftEntry.runtimeDiagnosticSeverity !== rightEntry?.runtimeDiagnosticSeverity || + leftEntry.runtimeLastSeenAt !== rightEntry?.runtimeLastSeenAt ) { return false; } @@ -332,6 +341,7 @@ export const MemberList = memo(function MemberList({ isRemoved ? undefined : spawnEntry, isRemoved ? undefined : runtimeEntry )} + runtimeEntry={isRemoved ? undefined : runtimeEntry} spawnStatus={isRemoved ? undefined : spawnEntry?.status} spawnError={isRemoved ? 
undefined : (spawnEntry?.error ?? spawnEntry?.hardFailureReason)} spawnLivenessSource={isRemoved ? undefined : spawnEntry?.livenessSource} diff --git a/src/renderer/components/team/provisioningSteps.ts b/src/renderer/components/team/provisioningSteps.ts index aeafdc17..1fd55d64 100644 --- a/src/renderer/components/team/provisioningSteps.ts +++ b/src/renderer/components/team/provisioningSteps.ts @@ -131,7 +131,7 @@ function summarizeLiveLaunchJoinMilestones(params: { entry.launchState === 'runtime_pending_bootstrap' || entry.launchState === 'runtime_pending_permission' ) { - if (entry.runtimeAlive === true) { + if (entry.runtimeAlive === true && entry.livenessKind !== 'shell_only') { processOnlyAliveCount += 1; } else { pendingSpawnCount += 1; @@ -199,7 +199,8 @@ export function getLaunchJoinMilestonesFromMembers({ const snapshotMilestones = { expectedTeammateCount, heartbeatConfirmedCount: snapshotSummary.confirmedCount, - processOnlyAliveCount: snapshotSummary.runtimeAlivePendingCount, + processOnlyAliveCount: + snapshotSummary.runtimeProcessPendingCount ?? 
snapshotSummary.runtimeAlivePendingCount, pendingSpawnCount: Math.max( 0, snapshotSummary.pendingCount - snapshotSummary.runtimeAlivePendingCount diff --git a/src/renderer/store/index.ts b/src/renderer/store/index.ts index ebba97a7..0b9424ff 100644 --- a/src/renderer/store/index.ts +++ b/src/renderer/store/index.ts @@ -242,6 +242,7 @@ export function initializeNotificationListeners(): () => void { let teamMessageRefreshTimers = new Map>(); let teamPresenceRefreshTimers = new Map>(); let memberSpawnRefreshTimers = new Map>(); + let teamAgentRuntimeRefreshTimers = new Map>(); let toolActivityTimers = new Map>(); let inProgressChangePresencePollInFlight = false; let teamMessageFallbackPollInFlight = false; @@ -286,6 +287,19 @@ export function initializeNotificationListeners(): () => void { }, TEAM_MEMBER_SPAWN_REFRESH_THROTTLE_MS); memberSpawnRefreshTimers.set(teamName, timer); }; + const scheduleTeamAgentRuntimeRefresh = (teamName: string | null | undefined): void => { + if (!teamName || !isTeamVisibleInAnyPane(teamName)) { + return; + } + if (teamAgentRuntimeRefreshTimers.has(teamName)) { + return; + } + const timer = setTimeout(() => { + teamAgentRuntimeRefreshTimers.delete(teamName); + void useStore.getState().fetchTeamAgentRuntime(teamName); + }, TEAM_MEMBER_SPAWN_REFRESH_THROTTLE_MS); + teamAgentRuntimeRefreshTimers.set(teamName, timer); + }; const scheduleTrackedTeamMessageRefresh = (teamName: string | null | undefined): void => { if (!teamName || !shouldRefreshTeamMessages(teamName)) { return; @@ -1194,6 +1208,7 @@ export function initializeNotificationListeners(): () => void { } seedCurrentRunIdIfMissing(); scheduleMemberSpawnStatusesRefresh(event.teamName); + scheduleTeamAgentRuntimeRefresh(event.teamName); return; } @@ -1276,6 +1291,8 @@ export function initializeNotificationListeners(): () => void { teamPresenceRefreshTimers = new Map(); for (const t of memberSpawnRefreshTimers.values()) clearTimeout(t); memberSpawnRefreshTimers = new Map(); + for (const t 
of teamAgentRuntimeRefreshTimers.values()) clearTimeout(t); + teamAgentRuntimeRefreshTimers = new Map(); for (const t of toolActivityTimers.values()) clearTimeout(t); toolActivityTimers = new Map(); teamLastRelevantActivityAt.clear(); diff --git a/src/renderer/store/slices/teamSlice.ts b/src/renderer/store/slices/teamSlice.ts index a9f9e414..96ab3dac 100644 --- a/src/renderer/store/slices/teamSlice.ts +++ b/src/renderer/store/slices/teamSlice.ts @@ -702,7 +702,12 @@ function areLaunchSummaryCountsEqual( left.confirmedCount === right.confirmedCount && left.pendingCount === right.pendingCount && left.failedCount === right.failedCount && - left.runtimeAlivePendingCount === right.runtimeAlivePendingCount + left.runtimeAlivePendingCount === right.runtimeAlivePendingCount && + left.shellOnlyPendingCount === right.shellOnlyPendingCount && + left.runtimeProcessPendingCount === right.runtimeProcessPendingCount && + left.runtimeCandidatePendingCount === right.runtimeCandidatePendingCount && + left.noRuntimePendingCount === right.noRuntimePendingCount && + left.permissionPendingCount === right.permissionPendingCount ); } @@ -739,6 +744,9 @@ function areMemberSpawnStatusEntriesEqual( left.livenessSource === right.livenessSource && left.runtimeAlive === right.runtimeAlive && left.runtimeModel === right.runtimeModel && + left.livenessKind === right.livenessKind && + left.runtimeDiagnostic === right.runtimeDiagnostic && + left.runtimeDiagnosticSeverity === right.runtimeDiagnosticSeverity && left.bootstrapConfirmed === right.bootstrapConfirmed && left.hardFailure === right.hardFailure && leftPendingPermissionIds.length === rightPendingPermissionIds.length && @@ -809,7 +817,13 @@ function areTeamAgentRuntimeEntriesEqual( left.backendType === right.backendType && left.pid === right.pid && left.runtimeModel === right.runtimeModel && - left.rssBytes === right.rssBytes + left.rssBytes === right.rssBytes && + left.livenessKind === right.livenessKind && + left.pidSource === 
right.pidSource && + left.paneCurrentCommand === right.paneCurrentCommand && + left.runtimeDiagnostic === right.runtimeDiagnostic && + left.runtimeDiagnosticSeverity === right.runtimeDiagnosticSeverity && + left.runtimeLastSeenAt === right.runtimeLastSeenAt ); } diff --git a/src/renderer/utils/memberHelpers.ts b/src/renderer/utils/memberHelpers.ts index a827da07..e7c67c96 100644 --- a/src/renderer/utils/memberHelpers.ts +++ b/src/renderer/utils/memberHelpers.ts @@ -15,6 +15,7 @@ import type { MemberSpawnStatus, MemberStatus, ResolvedTeamMember, + TeamAgentRuntimeEntry, TeamProviderId, TeamReviewState, TeamTaskStatus, @@ -531,6 +532,10 @@ export type MemberLaunchVisualState = | 'spawning' | 'permission_pending' | 'runtime_pending' + | 'shell_only' + | 'runtime_candidate' + | 'registered_only' + | 'stale_runtime' | 'settling' | 'error' | null; @@ -556,7 +561,15 @@ export function getMemberLaunchStatusLabel(visualState: MemberLaunchVisualState) case 'permission_pending': return 'awaiting permission'; case 'runtime_pending': - return 'connecting'; + return 'waiting for bootstrap'; + case 'shell_only': + return 'shell only'; + case 'runtime_candidate': + return 'process candidate'; + case 'registered_only': + return 'registered'; + case 'stale_runtime': + return 'stale runtime'; case 'settling': return 'joining team'; case 'error': @@ -573,6 +586,7 @@ export function buildMemberLaunchPresentation({ spawnLivenessSource, spawnRuntimeAlive, runtimeAdvisory, + runtimeEntry, isLaunchSettling = false, isTeamAlive, isTeamProvisioning, @@ -584,6 +598,7 @@ export function buildMemberLaunchPresentation({ spawnLivenessSource: MemberSpawnLivenessSource | undefined; spawnRuntimeAlive: boolean | undefined; runtimeAdvisory: MemberRuntimeAdvisory | undefined; + runtimeEntry?: TeamAgentRuntimeEntry; isLaunchSettling?: boolean; isTeamAlive?: boolean; isTeamProvisioning?: boolean; @@ -630,10 +645,21 @@ export function buildMemberLaunchPresentation({ launchVisualState = 'error'; } else if 
(spawnLaunchState === 'runtime_pending_permission') { launchVisualState = 'permission_pending'; + } else if (runtimeEntry?.livenessKind === 'shell_only') { + launchVisualState = 'shell_only'; + } else if (runtimeEntry?.livenessKind === 'runtime_process_candidate') { + launchVisualState = 'runtime_candidate'; + } else if (runtimeEntry?.livenessKind === 'registered_only') { + launchVisualState = 'registered_only'; + } else if ( + runtimeEntry?.livenessKind === 'stale_metadata' || + runtimeEntry?.livenessKind === 'not_found' + ) { + launchVisualState = 'stale_runtime'; } else if ( spawnLaunchState === 'runtime_pending_bootstrap' && - spawnStatus === 'online' && - spawnRuntimeAlive === true + (runtimeEntry?.livenessKind === 'runtime_process' || + (spawnStatus === 'online' && spawnRuntimeAlive === true)) ) { launchVisualState = 'runtime_pending'; } else if ( @@ -655,6 +681,15 @@ export function buildMemberLaunchPresentation({ } const launchStatusLabel = getMemberLaunchStatusLabel(launchVisualState); + const displayPresenceLabel = + launchVisualState === 'permission_pending' || + launchVisualState === 'runtime_pending' || + launchVisualState === 'shell_only' || + launchVisualState === 'runtime_candidate' || + launchVisualState === 'registered_only' || + launchVisualState === 'stale_runtime' + ? (launchStatusLabel ?? presenceLabel) + : presenceLabel; const spawnBadgeLabel = spawnStatus && spawnStatus !== 'online' ? spawnStatus === 'waiting' || spawnStatus === 'spawning' @@ -663,7 +698,7 @@ export function buildMemberLaunchPresentation({ : null; return { - presenceLabel, + presenceLabel: displayPresenceLabel, dotClass: runtimeAdvisoryTone === 'error' ? 
STATUS_DOT_COLORS.terminated : dotClass, cardClass, runtimeAdvisoryLabel, diff --git a/src/renderer/utils/teamProvisioningPresentation.ts b/src/renderer/utils/teamProvisioningPresentation.ts index 49528d1d..6d7d6e54 100644 --- a/src/renderer/utils/teamProvisioningPresentation.ts +++ b/src/renderer/utils/teamProvisioningPresentation.ts @@ -126,6 +126,27 @@ function buildAwaitingPermissionPhrase(count: number): string { : `${count} teammates awaiting permission approval`; } +function buildPendingDiagnosticPhrase( + summary: MemberSpawnStatusesSnapshot['summary'] | undefined, + fallbackJoiningPhrase: string +): string { + if (!summary) { + return fallbackJoiningPhrase; + } + const parts = [ + summary.shellOnlyPendingCount ? `${summary.shellOnlyPendingCount} shell-only` : '', + summary.runtimeProcessPendingCount + ? `${summary.runtimeProcessPendingCount} waiting for bootstrap` + : '', + summary.runtimeCandidatePendingCount + ? `${summary.runtimeCandidatePendingCount} process candidates` + : '', + summary.permissionPendingCount ? `${summary.permissionPendingCount} awaiting permission` : '', + summary.noRuntimePendingCount ? `${summary.noRuntimePendingCount} no runtime found` : '', + ].filter(Boolean); + return parts.length > 0 ? parts.join(', ') : fallbackJoiningPhrase; +} + const ACTIVE_PROVISIONING_STATES = new Set([ 'validating', 'spawning', @@ -394,7 +415,7 @@ export function buildTeamProvisioningPresentation({ permissionBlockedCount === remainingJoinCount; const pendingDetailPhrase = pendingMembersAwaitApproval ? buildAwaitingPermissionPhrase(permissionBlockedCount) - : joiningPhrase; + : buildPendingDiagnosticPhrase(memberSpawnSnapshot?.summary, joiningPhrase); const readyCompactDetail = failedSpawnCount > 0 ? (failedSpawnCompactDetail ?? @@ -471,7 +492,7 @@ export function buildTeamProvisioningPresentation({ permissionBlockedCount > 0 && permissionBlockedCount === remainingJoinCount ? 
buildAwaitingPermissionPhrase(permissionBlockedCount) - : activeJoiningPhrase; + : buildPendingDiagnosticPhrase(memberSpawnSnapshot?.summary, activeJoiningPhrase); return { progress, isActive: true, diff --git a/src/shared/types/team.ts b/src/shared/types/team.ts index 65dc5f4a..c10e0aef 100644 --- a/src/shared/types/team.ts +++ b/src/shared/types/team.ts @@ -82,6 +82,11 @@ export interface TeamSummary { pendingCount?: number; failedCount?: number; runtimeAlivePendingCount?: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; } export type TeamTaskStatus = 'pending' | 'in_progress' | 'completed' | 'deleted'; @@ -941,6 +946,12 @@ export interface PersistedTeamLaunchMemberState { hardFailureReason?: string; pendingPermissionRequestIds?: string[]; runtimePid?: number; + runtimeSessionId?: string; + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + runtimeDiagnostic?: string; + runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; + runtimeLastSeenAt?: string; firstSpawnAcceptedAt?: string; lastHeartbeatAt?: string; lastRuntimeAliveAt?: string; @@ -954,6 +965,11 @@ export interface PersistedTeamLaunchSummary { pendingCount: number; failedCount: number; runtimeAlivePendingCount: number; + shellOnlyPendingCount?: number; + runtimeProcessPendingCount?: number; + runtimeCandidatePendingCount?: number; + noRuntimePendingCount?: number; + permissionPendingCount?: number; } export interface PersistedTeamLaunchSnapshot { @@ -984,6 +1000,27 @@ export type MemberSpawnLivenessSource = 'heartbeat' | 'process'; export type TeamAgentRuntimeBackendType = 'lead' | 'tmux' | 'iterm2' | 'in-process' | 'process'; +export type TeamAgentRuntimeLivenessKind = + | 'confirmed_bootstrap' + | 'runtime_process' + | 'runtime_process_candidate' + | 'permission_blocked' + | 'shell_only' + | 'registered_only' + | 
'stale_metadata' + | 'not_found'; + +export type TeamAgentRuntimePidSource = + | 'lead_process' + | 'tmux_pane' + | 'tmux_child' + | 'agent_process_table' + | 'opencode_bridge' + | 'runtime_bootstrap' + | 'persisted_metadata'; + +export type TeamAgentRuntimeDiagnosticSeverity = 'info' | 'warning' | 'error'; + export interface TeamAgentRuntimeEntry { memberName: string; alive: boolean; @@ -996,6 +1033,19 @@ export interface TeamAgentRuntimeEntry { pid?: number; runtimeModel?: string; rssBytes?: number; + livenessKind?: TeamAgentRuntimeLivenessKind; + pidSource?: TeamAgentRuntimePidSource; + processCommand?: string; + paneId?: string; + panePid?: number; + paneCurrentCommand?: string; + runtimePid?: number; + runtimeSessionId?: string; + runtimeLeaseExpiresAt?: string; + runtimeLastSeenAt?: string; + runtimeDiagnostic?: string; + runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; + diagnostics?: string[]; updatedAt: string; } @@ -1062,6 +1112,14 @@ export interface MemberSpawnStatusEntry { lastHeartbeatAt?: string; /** Live runtime model observed from the teammate process, when available. */ runtimeModel?: string; + /** Compact runtime liveness classification for launch UI. */ + livenessKind?: TeamAgentRuntimeLivenessKind; + /** Short user-facing liveness diagnostic. */ + runtimeDiagnostic?: string; + /** Visual severity for runtimeDiagnostic. */ + runtimeDiagnosticSeverity?: TeamAgentRuntimeDiagnosticSeverity; + /** ISO timestamp of the last liveness evaluation. */ + livenessLastCheckedAt?: string; /** ISO timestamp of the last status change. */ updatedAt: string; } @@ -1176,6 +1234,28 @@ export interface TeamProvisioningProgress { assistantOutput?: string; /** True once provisioning has written a readable config.json for this team. */ configReady?: boolean; + /** Bounded structured launch diagnostics for the progress UI. 
*/ + launchDiagnostics?: TeamLaunchDiagnosticItem[]; +} + +export interface TeamLaunchDiagnosticItem { + id: string; + memberName?: string; + severity: TeamAgentRuntimeDiagnosticSeverity; + code: + | 'spawn_accepted' + | 'runtime_process_detected' + | 'runtime_process_candidate' + | 'tmux_shell_only' + | 'runtime_not_found' + | 'permission_pending' + | 'bootstrap_confirmed' + | 'bootstrap_stalled' + | 'stale_runtime_event_rejected' + | 'process_table_unavailable'; + label: string; + detail?: string; + observedAt: string; } export interface TeamRuntimeState { diff --git a/test/main/services/team/TeamLaunchStateEvaluator.test.ts b/test/main/services/team/TeamLaunchStateEvaluator.test.ts index c0a1f2bf..d6042194 100644 --- a/test/main/services/team/TeamLaunchStateEvaluator.test.ts +++ b/test/main/services/team/TeamLaunchStateEvaluator.test.ts @@ -60,26 +60,27 @@ describe('TeamLaunchStateEvaluator', () => { }); it('counts persisted members in launch summary even when expectedMembers is stale', () => { - const summary = summarizePersistedLaunchMembers( - ['alice'], - { - alice: { - launchState: 'runtime_pending_bootstrap', - runtimeAlive: false, - }, - bob: { - launchState: 'runtime_pending_permission', - runtimeAlive: true, - }, - } as any - ); + const summary = summarizePersistedLaunchMembers(['alice'], { + alice: { + launchState: 'runtime_pending_bootstrap', + runtimeAlive: false, + }, + bob: { + launchState: 'runtime_pending_permission', + runtimeAlive: true, + }, + } as any); expect(summary).toEqual({ confirmedCount: 0, pendingCount: 2, failedCount: 0, runtimeAlivePendingCount: 1, + shellOnlyPendingCount: 0, + runtimeProcessPendingCount: 0, + runtimeCandidatePendingCount: 0, + noRuntimePendingCount: 0, + permissionPendingCount: 1, }); }); - }); diff --git a/test/main/services/team/TeamMcpConfigBuilder.test.ts b/test/main/services/team/TeamMcpConfigBuilder.test.ts index fa33b110..22e70f16 100644 --- a/test/main/services/team/TeamMcpConfigBuilder.test.ts +++ 
b/test/main/services/team/TeamMcpConfigBuilder.test.ts @@ -75,7 +75,9 @@ describe('TeamMcpConfigBuilder', () => { return dir; } - function readGeneratedServer(configPath: string): { command?: string; args?: string[] } | undefined { + function readGeneratedServer( + configPath: string + ): { command?: string; args?: string[] } | undefined { const raw = fs.readFileSync(configPath, 'utf8'); const parsed = JSON.parse(raw) as { mcpServers?: Record; @@ -83,26 +85,72 @@ describe('TeamMcpConfigBuilder', () => { return parsed.mcpServers?.['agent-teams']; } - function expectNodeEntry(server: { command?: string; args?: string[] } | undefined, entry: string): void { + function expectNodeEntry( + server: { command?: string; args?: string[] } | undefined, + entry: string + ): void { expect(server?.args).toEqual([entry]); expect(server?.command).toMatch(/(^node$|[\\/]node(?:\.exe)?$)/); } - function mockPathExists(existingPaths: string[]): void { + function expectTsxEntry( + server: { command?: string; args?: string[] } | undefined, + entry: string + ): void { + expect(server?.args).toEqual([entry]); + expect(server?.command).toMatch(/[\\/]tsx(?:\.cmd)?$/); + } + + function getBuiltWorkspaceEntry(): string { + return path.join(process.cwd(), 'mcp-server', 'dist', 'index.js'); + } + + function getSourceWorkspaceEntry(): string { + return path.join(process.cwd(), 'mcp-server', 'src', 'index.ts'); + } + + function getWorkspaceTsxBin(): string { + return path.join(process.cwd(), 'mcp-server', 'node_modules', '.bin', 'tsx'); + } + + function mockPathExists(existingPaths: string[], options: { strict?: boolean } = {}): void { const originalAccess = fs.promises.access.bind(fs.promises); vi.spyOn(fs.promises, 'access').mockImplementation(async (targetPath, mode) => { const normalizedPath = - typeof targetPath === 'string' ? targetPath : Buffer.isBuffer(targetPath) ? targetPath.toString() : `${targetPath}`; + typeof targetPath === 'string' + ? targetPath + : Buffer.isBuffer(targetPath) + ? 
targetPath.toString() + : `${targetPath}`; if (existingPaths.includes(normalizedPath)) { return; } + if (options.strict) { + const error = new Error( + `ENOENT: no such file or directory, access '${normalizedPath}'` + ) as NodeJS.ErrnoException; + error.code = 'ENOENT'; + throw error; + } await originalAccess(targetPath, mode); }); } + function mockSourceWorkspaceEntryAvailable(): { + sourceEntry: string; + tsxBin: string; + builtEntry: string; + } { + const sourceEntry = getSourceWorkspaceEntry(); + const tsxBin = getWorkspaceTsxBin(); + const builtEntry = getBuiltWorkspaceEntry(); + mockPathExists([sourceEntry, tsxBin, builtEntry], { strict: true }); + return { sourceEntry, tsxBin, builtEntry }; + } + function mockBuiltWorkspaceEntryAvailable(): string { - const builtEntry = path.join(process.cwd(), 'mcp-server', 'dist', 'index.js'); - mockPathExists([builtEntry]); + const builtEntry = getBuiltWorkspaceEntry(); + mockPathExists([builtEntry], { strict: true }); return builtEntry; } @@ -172,12 +220,26 @@ describe('TeamMcpConfigBuilder', () => { createdPaths.push(configPath); const filename = path.basename(configPath); - expect(filename).toMatch( - new RegExp(`^agent-teams-mcp-${process.pid}-\\d+-[0-9a-f-]+\\.json$`) - ); + expect(filename).toMatch(new RegExp(`^agent-teams-mcp-${process.pid}-\\d+-[0-9a-f-]+\\.json$`)); }); - it('prefers the built workspace MCP entry when available', async () => { + it('prefers the source workspace MCP entry in dev mode when available', async () => { + const { sourceEntry } = mockSourceWorkspaceEntryAvailable(); + const builder = new TeamMcpConfigBuilder(); + + const configPath = await builder.writeConfigFile(); + createdPaths.push(configPath); + + const raw = fs.readFileSync(configPath, 'utf8'); + const parsed = JSON.parse(raw) as { + mcpServers?: Record; + }; + + const server = parsed.mcpServers?.['agent-teams']; + expectTsxEntry(server, sourceEntry); + }); + + it('falls back to the built workspace MCP entry when source execution 
is unavailable', async () => { const builtEntry = mockBuiltWorkspaceEntryAvailable(); const builder = new TeamMcpConfigBuilder(); @@ -232,7 +294,10 @@ describe('TeamMcpConfigBuilder', () => { createdPaths.push(configPath); const parsed = JSON.parse(fs.readFileSync(configPath, 'utf8')) as { - mcpServers: Record; + mcpServers: Record< + string, + { command?: string; args?: string[]; type?: string; url?: string } + >; }; expect(Object.keys(parsed.mcpServers)).toEqual(['agent-teams']); @@ -246,7 +311,10 @@ describe('TeamMcpConfigBuilder', () => { createdDirs.push(homeDir, projectDir); mockHomeDir = homeDir; - fs.writeFileSync(path.join(homeDir, '.claude.json'), JSON.stringify({ mcpServers: {} }, null, 2)); + fs.writeFileSync( + path.join(homeDir, '.claude.json'), + JSON.stringify({ mcpServers: {} }, null, 2) + ); fs.writeFileSync( path.join(projectDir, '.mcp.json'), JSON.stringify( @@ -273,7 +341,7 @@ describe('TeamMcpConfigBuilder', () => { }); it('generated agent-teams server ignores same-named user MCP entry', async () => { - const builtEntry = mockBuiltWorkspaceEntryAvailable(); + const { sourceEntry } = mockSourceWorkspaceEntryAvailable(); const homeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'team-mcp-home-')); createdDirs.push(homeDir); mockHomeDir = homeDir; @@ -299,7 +367,7 @@ describe('TeamMcpConfigBuilder', () => { mcpServers: Record; }; - expectNodeEntry(parsed.mcpServers['agent-teams'], builtEntry); + expectTsxEntry(parsed.mcpServers['agent-teams'], sourceEntry); }); it('ignores malformed user MCP file', async () => { @@ -494,7 +562,10 @@ describe('TeamMcpConfigBuilder', () => { const configPath = await builder.writeConfigFile(); createdPaths.push(configPath); - expectNodeEntry(readGeneratedServer(configPath), path.join(resourcesDir, 'mcp-server', 'index.js')); + expectNodeEntry( + readGeneratedServer(configPath), + path.join(resourcesDir, 'mcp-server', 'index.js') + ); }); it('packaged mode uses the winner stable copy when atomic rename loses the race', 
async () => { @@ -526,8 +597,8 @@ describe('TeamMcpConfigBuilder', () => { expectNodeEntry(readGeneratedServer(configPath), path.join(stableDir, 'index.js')); }); - it('packaged mode falls back to the built workspace MCP entry when resourcesPath bundle is missing', async () => { - const builtEntry = mockBuiltWorkspaceEntryAvailable(); + it('packaged mode falls back to the source workspace MCP entry when resourcesPath bundle is missing', async () => { + const { sourceEntry } = mockSourceWorkspaceEntryAvailable(); setPackagedMode(true, '6.0.0'); const resourcesDir = fs.mkdtempSync(path.join(os.tmpdir(), 'team-mcp-resources-')); createdDirs.push(resourcesDir); @@ -537,6 +608,6 @@ describe('TeamMcpConfigBuilder', () => { const configPath = await builder.writeConfigFile(); createdPaths.push(configPath); - expectNodeEntry(readGeneratedServer(configPath), builtEntry); + expectTsxEntry(readGeneratedServer(configPath), sourceEntry); }); }); diff --git a/test/main/services/team/TeamProvisioningService.test.ts b/test/main/services/team/TeamProvisioningService.test.ts index b826b4fa..69cb20bf 100644 --- a/test/main/services/team/TeamProvisioningService.test.ts +++ b/test/main/services/team/TeamProvisioningService.test.ts @@ -27,7 +27,9 @@ vi.mock('@main/services/team/ClaudeBinaryResolver', () => ({ vi.mock('@features/tmux-installer/main', () => ({ killTmuxPaneForCurrentPlatformSync: vi.fn(), + listRuntimeProcessesForCurrentTmuxPlatform: vi.fn(async () => []), listTmuxPanePidsForCurrentPlatform: vi.fn(async () => new Map()), + listTmuxPaneRuntimeInfoForCurrentPlatform: vi.fn(async () => new Map()), isTmuxRuntimeReadyForCurrentPlatform: vi.fn(async () => true), })); @@ -144,7 +146,9 @@ import { } from 'agent-teams-controller'; import { killTmuxPaneForCurrentPlatformSync, + listRuntimeProcessesForCurrentTmuxPlatform, listTmuxPanePidsForCurrentPlatform, + listTmuxPaneRuntimeInfoForCurrentPlatform, } from '@features/tmux-installer/main'; import pidusage from 'pidusage'; @@ -409,6 
+413,13 @@ function createClaudeLogsRun(overrides: Record = {}) { describe('TeamProvisioningService', () => { beforeEach(() => { vi.clearAllMocks(); + vi.mocked(killTmuxPaneForCurrentPlatformSync).mockReset(); + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockReset(); + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValue([]); + vi.mocked(listTmuxPanePidsForCurrentPlatform).mockReset(); + vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValue(new Map()); + vi.mocked(listTmuxPaneRuntimeInfoForCurrentPlatform).mockReset(); + vi.mocked(listTmuxPaneRuntimeInfoForCurrentPlatform).mockResolvedValue(new Map()); tempClaudeRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'claude-team-provisioning-')); tempTeamsBase = path.join(tempClaudeRoot, 'teams'); tempTasksBase = path.join(tempClaudeRoot, 'tasks'); @@ -557,7 +568,18 @@ describe('TeamProvisioningService', () => { cancelRequested: false, spawnContext: null, }); - vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValueOnce(new Map([['%1', 222]])); + vi.mocked(listTmuxPaneRuntimeInfoForCurrentPlatform).mockResolvedValueOnce( + new Map([ + [ + '%1', + { + paneId: '%1', + panePid: 222, + currentCommand: 'codex', + }, + ], + ]) + ); vi.mocked(pidusage).mockResolvedValueOnce({ '111': createPidusageStat(111, 123_000_000), @@ -650,7 +672,18 @@ describe('TeamProvisioningService', () => { cancelRequested: false, spawnContext: null, }); - vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValueOnce(new Map([['%1', 222]])); + vi.mocked(listTmuxPaneRuntimeInfoForCurrentPlatform).mockResolvedValueOnce( + new Map([ + [ + '%1', + { + paneId: '%1', + panePid: 222, + currentCommand: 'codex', + }, + ], + ]) + ); vi.mocked(pidusage) .mockRejectedValueOnce(new Error('ps: process exited')) @@ -693,14 +726,14 @@ describe('TeamProvisioningService', () => { cancelRequested: false, spawnContext: null, }); - (svc as any).readUnixProcessTableRows = vi.fn(() => [ + 
vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValueOnce([ { pid: 333, + ppid: 1, command: '/Users/belief/.bun/bin/bun cli.js --agent-id alice@nice-team --agent-name alice --team-name nice-team --model gpt-5.2', }, ]); - vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValueOnce(new Map()); vi.mocked(pidusage).mockResolvedValueOnce({ '111': createPidusageStat(111, 123_000_000), '333': createPidusageStat(333, 456_000_000), @@ -746,19 +779,20 @@ describe('TeamProvisioningService', () => { cancelRequested: false, spawnContext: null, }); - (svc as any).readUnixProcessTableRows = vi.fn(() => [ + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValueOnce([ { pid: 222, + ppid: 1, command: '/Users/belief/.bun/bin/bun cli.js --agent-id alice@nice-team --agent-name alice --team-name nice-team --model gpt-5.2', }, { pid: 333, + ppid: 1, command: '/Users/belief/.bun/bin/bun cli.js --team-name nice-team --agent-id alice@nice-team --agent-name alice --model gpt-5.2', }, ]); - vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValueOnce(new Map()); vi.mocked(pidusage).mockResolvedValueOnce({ '111': createPidusageStat(111, 123_000_000), '333': createPidusageStat(333, 456_000_000), @@ -911,14 +945,20 @@ describe('TeamProvisioningService', () => { ]), }; (svc as any).readPersistedRuntimeMembers = vi.fn(() => []); - (svc as any).findLiveProcessPidByAgentId = vi.fn( - () => - new Map([ - ['alice@signal-ops-6', 17527], - ['atlas@signal-ops-6', 17528], - ]) - ); - vi.mocked(listTmuxPanePidsForCurrentPlatform).mockResolvedValueOnce(new Map()); + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValueOnce([ + { + pid: 17527, + ppid: 1, + command: + '/Users/belief/.bun/bin/bun cli.js --agent-id alice@signal-ops-6 --agent-name alice --team-name signal-ops-6 --model gpt-5.4-mini', + }, + { + pid: 17528, + ppid: 1, + command: + '/Users/belief/.bun/bin/bun cli.js --agent-id atlas@signal-ops-6 --agent-name atlas --team-name 
signal-ops-6 --model gpt-5.3-codex', + }, + ]); const metadata = await (svc as any).getLiveTeamAgentRuntimeMetadata('signal-ops-6'); @@ -4009,6 +4049,9 @@ describe('TeamProvisioningService', () => { const restartPromise = expect(svc.restartMember('process-team', 'forge')).rejects.toThrow( `Restart for teammate "forge" is still waiting for the previous process to exit (${process.pid}).` ); + await vi.waitFor(() => { + expect(vi.mocked(killProcessByPid)).toHaveBeenCalledWith(process.pid); + }); await vi.advanceTimersByTimeAsync(1_500); await restartPromise; @@ -4056,9 +4099,14 @@ describe('TeamProvisioningService', () => { backendType: 'process', }, ]); - (svc as any).findLiveProcessPidByAgentId = vi.fn( - () => new Map([['forge@process-team', process.pid]]) - ); + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValueOnce([ + { + pid: process.pid, + ppid: 1, + command: + '/Users/belief/.bun/bin/bun cli.js --team-name process-team --agent-id forge@process-team --agent-name forge --model gpt-5.4', + }, + ]); (svc as any).liveTeamAgentRuntimeMetadataCache.set('process-team', { expiresAtMs: Date.now() + 60_000, metadata: new Map([ @@ -4078,6 +4126,9 @@ describe('TeamProvisioningService', () => { const restartPromise = expect(svc.restartMember('process-team', 'forge')).rejects.toThrow( `Restart for teammate "forge" is still waiting for the previous process to exit (${process.pid}).` ); + await vi.waitFor(() => { + expect(vi.mocked(killProcessByPid)).toHaveBeenCalledWith(process.pid); + }); await vi.advanceTimersByTimeAsync(1_500); await restartPromise; @@ -4120,15 +4171,23 @@ describe('TeamProvisioningService', () => { ]), }; (svc as any).readPersistedRuntimeMembers = vi.fn(() => []); - (svc as any).findLiveProcessPidByAgentId = vi.fn( - () => new Map([['forge@process-team', process.pid]]) - ); + vi.mocked(listRuntimeProcessesForCurrentTmuxPlatform).mockResolvedValueOnce([ + { + pid: process.pid, + ppid: 1, + command: + '/Users/belief/.bun/bin/bun cli.js 
--team-name process-team --agent-id forge@process-team --agent-name forge --model gpt-5.4', + }, + ]); (svc as any).aliveRunByTeam.set('process-team', run.runId); (svc as any).runs.set(run.runId, run); const restartPromise = expect(svc.restartMember('process-team', 'forge')).rejects.toThrow( `Restart for teammate "forge" is still waiting for the previous process to exit (${process.pid}).` ); + await vi.waitFor(() => { + expect(vi.mocked(killProcessByPid)).toHaveBeenCalledWith(process.pid); + }); await vi.advanceTimersByTimeAsync(1_500); await restartPromise;