diff --git a/docs/research/get-team-data-parallel-read-plan.md b/docs/research/get-team-data-parallel-read-plan.md new file mode 100644 index 00000000..97095764 --- /dev/null +++ b/docs/research/get-team-data-parallel-read-plan.md @@ -0,0 +1,1261 @@ +# getTeamData Parallel Read Plan + +## Goal + +Reduce `team:getData` latency by parallelizing only the safe top-level read phase of `getTeamData()` without changing: + +- `TeamData` payload shape +- message merge semantics +- dedup semantics +- warning semantics +- renderer/main truth boundaries +- launch/process reconciliation behavior + +Primary target: + +- repeated visible team detail refreshes + +Not a target for this patch: + +- new IPC endpoints +- renderer refresh changes +- new caches +- message payload slimming +- lower-pipeline refactors + +## Why This Requires Extra Care + +This touches a guarded hot zone: + +- `main process / IPC / heavy team payload assembly` + +Relevant guardrails: + +- preserve state truth and reconciliation +- avoid heavier monolithic snapshot paths +- preserve observability +- prefer the smallest invariant-preserving change + +Main failure modes to avoid: + +- one async failure aborts the whole snapshot +- warning order becomes nondeterministic +- message merge order changes +- completion-time diagnostics become misleading +- side-effecting reads run concurrently with unrelated file-heavy reads +- naive parallelism creates enough disk contention to erase the perf gain +- the patch quietly assumes tests cover behavior they currently do not cover +- step wrappers silently mask wiring bugs because the wrapped closure does too much work +- nested settlement layers make the control flow harder to audit +- `presenceIndexPromise` rejection semantics change by accident +- shared mutable fallback objects leak across steps or tests +- diagnostics state is mutated from concurrent branches and becomes harder to audit +- queued heavy work starts reader I/O before a limiter slot is actually granted +- a queued heavy step records timing metadata at queue time instead of settle time +- fallback factories become mini control-flow hooks and start masking programming bugs +- warning text generation drifts from today because it becomes dynamic instead of literal + +Relevant code: + +- [src/main/services/team/TeamDataService.ts#L501](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamDataService.ts#L501) +- [src/main/ipc/teams.ts#L540](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/ipc/teams.ts#L540) +- [src/renderer/store/index.ts#L938](/Users/belief/dev/projects/claude/claude_team_freecode/src/renderer/store/index.ts#L938) +- [src/main/services/team/TeamInboxReader.ts#L182](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamInboxReader.ts#L182) +- [agent-teams-controller/src/internal/processStore.js#L29](/Users/belief/dev/projects/claude/claude_team_freecode/agent-teams-controller/src/internal/processStore.js#L29) + +## Hard Non-Goals + +This patch must not: + +- parallelize `readProcesses()` +- parallelize `resolveMembers()` +- parallelize `getMemberAdvisories(...)` +- parallelize `enrichMemberBranches(...)` +- change the post-load message pipeline +- change warning strings +- change warning order +- change any renderer store logic +- change any `InboxMessage` or `TeamData` semantics +- move concurrency into `TeamTaskReader` or `TeamInboxReader` + +This patch must also not opportunistically fix adjacent issues such as: + +- reusing `listInboxNames()` inside `getMessages()` +- introducing a single-team worker path for tasks +- caching lead-session directory listings + +Those are separate patches. + +Additional rule: + +- keep all new concurrency orchestration local to `getTeamData()` + +Why: + +- moving concurrency into lower-level readers would widen the blast radius +- it would make it harder to reason about total filesystem pressure from this patch + +## Dependency Graph + +The patch must preserve this dependency graph exactly. + +### Strict prerequisites + +- `config` must complete before: + - `extractLeadSessionTexts(config)` + - any downstream assembly + +Additional rule: + +- if `config` is missing, `getTeamData()` must still throw before any read-phase step promises are created + +Why: + +- this preserves the current fast-fail behavior +- it avoids accidental background work for nonexistent teams + +### Safe early-start paths + +After `config`: + +- `presenceIndexPromise` +- tasks +- inboxNames +- messages +- leadTexts +- sentMessages +- metaMembers +- kanbanState + +### Assembly prerequisites + +`resolveMembers(...)` requires all of: + +- `config` +- `metaMembers` +- `inboxNames` +- `tasksWithKanban` +- final assembled `messages` + +`tasksWithKanban` requires: + +- `tasks` +- `kanbanState` +- `presenceIndexPromise` + +`presenceIndexPromise` itself must preserve current semantics: + +- it starts early +- it is awaited only after `tasksWithKanbanBase` +- if it rejects today, `getTeamData()` rejects + +Rule: + +- do not wrap, settle, catch, delay, or parallelize away that rejection behavior in this patch + +### Explicit downstream-only paths + +These must stay after assembly: + +- `getMemberAdvisories(...)` +- `enrichMemberBranches(...)` +- `readProcesses(teamName)` + +Why this section exists: + +- the patch is only safe if dependency edges remain explicit +- parallelization work tends to hide dependency mistakes behind “it still compiled” + +## Key Observations + +### 1. `processes` is not a pure read + +`readProcesses()` looks harmless, but `controller.processes.listProcesses()` can mark dead PIDs and synchronously rewrite `processes.json`. + +Relevant code: + +- [src/main/services/team/TeamDataService.ts#L874](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamDataService.ts#L874) +- [agent-teams-controller/src/internal/processStore.js#L29](/Users/belief/dev/projects/claude/claude_team_freecode/agent-teams-controller/src/internal/processStore.js#L29) + +Rule: + +- keep `processes` outside the parallel read phase + +### 2. Some top-level readers already create their own fan-out + +`TeamInboxReader.getMessages()` already performs bounded concurrent inbox reads. + +Relevant code: + +- [src/main/services/team/TeamInboxReader.ts#L14](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamInboxReader.ts#L14) +- [src/main/services/team/TeamInboxReader.ts#L182](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamInboxReader.ts#L182) + +Implication: + +- do not start every heavyweight reader at once +- control top-level heavy-step concurrency explicitly + +### 3. `listInboxNames()` is duplicated work today + +`getTeamData()` calls `listInboxNames()` directly, and `getMessages()` calls it again internally. + +Relevant code: + +- [src/main/services/team/TeamDataService.ts#L549](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamDataService.ts#L549) +- [src/main/services/team/TeamInboxReader.ts#L182](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamInboxReader.ts#L182) + +Implication: + +- phase 1 will still duplicate one inbox-directory listing +- that is acceptable for this patch +- do not widen the patch by coupling these readers + +### 4. Warning order is currently deterministic + +Current semantic order: + +1. tasks +2. inbox names +3. messages +4. lead session texts +5. sent messages +6. member metadata +7. kanban state +8. runtime advisories +9. processes + +Rule: + +- preserve this order even if reads complete out of order + +Additional rule: + +- preserve warning presence and wording exactly +- do not deduplicate warnings in this patch + +Additional note: + +- `presenceIndexPromise` is intentionally not part of this warning sequence +- phase 1 must not introduce a new warning for presence loading + +### 5. The lower pipeline is semantically sensitive + +After the read phase, `getTeamData()` performs: + +- message merge +- lead-session/live dedup +- session-id enrichment +- slash-result annotation +- kanban overlay +- member resolution + +Rule: + +- do not reorder, parallelize, or “clean up” this lower pipeline in phase 1 + +### 5a. `kanbanGc` is part of current diagnostics shape even though no GC runs here + +`getTeamData()` currently records `mark('kanbanGc')` immediately after kanban load without performing garbage collection. + +Relevant code: + +- [src/main/services/team/TeamDataService.ts#L740](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamDataService.ts#L740) + +Rule: + +- keep this mark where it is in phase 1 +- do not “clean it up” while touching perf code + +Why: + +- changing it mixes perf work with observability semantics +- it would make before/after slow-log comparison noisier + +### 6. Existing tests do not cover the riskiest new behavior + +Current coverage exercises many data semantics, but not: + +- warning order under concurrent failures +- read-phase race behavior +- proof that `processes` stayed outside the parallel phase +- sync-throw fallback behavior for step wrappers + +Rule: + +- add those tests explicitly in this patch + +Additional note: + +- an existing test already asserts that `getTeamData()` remains read-only and does not invoke kanban garbage collection +- that invariant must stay green throughout this patch, not be replaced or weakened + +## Safe Scope + +### Safe to parallelize + +After `config` loads successfully: + +- `taskReader.getTasks(teamName)` +- `inboxReader.listInboxNames(teamName)` +- `inboxReader.getMessages(teamName)` +- `extractLeadSessionTexts(config)` +- `sentMessagesStore.readMessages(teamName)` +- `membersMetaStore.getMembers(teamName)` +- `kanbanManager.getState(teamName)` + +### Explicitly keep serial + +- `configReader.getConfig(teamName)` +- `presenceIndexPromise` setup logic +- all message merge/dedup/enrichment logic +- `resolveMembers(...)` +- `getMemberAdvisories(...)` +- `enrichMemberBranches(...)` +- `readProcesses(teamName)` + +## Design Choice + +### Preferred approach: two-level parallel read phase + +After `config`, split the top read phase into: + +#### Heavy group + +Heavier file-system work: + +- `getTasks` +- `getMessages` +- `extractLeadSessionTexts` + +Run through a small limiter. + +Recommended limit: + +- `2` + +Rules: + +- use a fixed limit in phase 1 +- do not make it adaptive +- do not make it platform-specific in phase 1 + +Why: + +- `getMessages()` already fans out internally +- `getTasks()` can read many task files +- `extractLeadSessionTexts()` may scan directories and JSONL files +- fixed `2` is the most predictable low-risk choice before profiling + +#### Light group + +Lower-risk direct reads: + +- `listInboxNames` +- `readMessages(sentMessages)` +- `getMembers(metaMembers)` +- `getState(kanban)` + +Run immediately in parallel. + +Important nuance: + +- “light” here means “safe to start immediately”, not “guaranteed tiny” +- if later profiling shows one of these is expensive, that becomes a follow-up patch + +### Why not one big `Promise.all` + +- too easy to turn one step failure into whole-snapshot failure +- too easy to lose deterministic warnings +- too easy to create uncontrolled disk contention +- too easy to blur diagnostics and completion timing + +### Safer fallback if confidence drops during implementation + +If implementation reveals unexpected complexity, the fallback patch is: + +- parallelize only the light group +- keep heavy steps serial + +This fallback has lower upside but even lower risk: + +- `🎯 7 🛡️ 10 🧠 3` + +Rule: + +- prefer shipping the light-group-only version over broadening the patch unsafely + +## Step Wrapper Design + +Do not use scattered `try/catch` blocks inside concurrent branches. + +Introduce a small local helper inside `getTeamData()` or a tiny private helper in `TeamDataService`. + +Preferred contract: + +```ts +interface StepResult { + value: T; + warning?: string; + completedAt: number; +} + +function startReadStep(options: { + label: string; + createFallback: () => T; + warningText?: string; + load: () => Promise; +}): Promise> +``` + +Required semantics: + +- start the load immediately when invoked +- never reject outward for expected step failures +- convert both sync throws and async rejects into fallback + warning +- always record `completedAt` +- never mutate shared warning arrays internally +- keep fallback and warning text explicit at the callsite +- create fallback values only on failure + +Additional rule: + +- `warningText` should remain a static literal at each callsite +- do not compute warning text dynamically inside the helper + +Why: + +- this patch must preserve wording exactly +- dynamic warning construction would increase drift risk for no real benefit + +Stronger rule: + +- the wrapped `load` closure should be a thin direct call to one reader/service method +- do not put assembly logic inside the wrapper closure + +Why: + +- broad closures increase the chance that a programming bug gets silently turned into fallback + warning +- keeping closures thin confines masking behavior to the existing best-effort I/O boundary + +Important constraint: + +- keep this helper local to the hot path +- do not introduce a new shared generic abstraction in the same patch + +Why: + +- this is a surgical performance patch +- broad abstractions increase blast radius and review cost + +Additional rule: + +- the helper must not inspect or transform returned values + +Additional rule: + +- the helper must not mutate shared outer state such as `warnings` or `marks` + +Why: + +- any normalization inside the helper would increase the chance of semantic drift +- the helper should only control error capture and timing metadata + +Additional rule: + +- fallback values must be created per step callsite +- prefer `createFallback()` over passing a prebuilt object/array +- do not reuse one module-level empty array or one shared default object between steps + +Additional rule: + +- `createFallback()` must be synchronous, side-effect free, and trivial +- if `createFallback()` itself throws, treat that as a programming bug, not an expected degraded read failure + +Why: + +- arrays like `messages`, `tasks`, and `sentMessages` are later merged and sometimes mutated +- shared fallback identity would make accidental aliasing harder to spot in tests +- lazy fallback creation makes it harder to accidentally share the same object across successes and failures +- allowing fallback factories to become “smart” would widen the masking boundary beyond the current design intent + +### Settlement rule + +Because each step promise is designed to resolve with a `StepResult` even on failure: + +- prefer `Promise.all(stepPromises)` over `Promise.allSettled(stepPromises)` + +Why: + +- `Promise.allSettled` becomes redundant once the wrapper already settles failures +- a second settlement layer makes the control flow harder to reason about +- it increases the chance of accidentally swallowing a bug in the orchestration code itself + +Additional rule: + +- do not add per-step logging inside the helper in phase 1 + +Why: + +- existing observability already relies on slow-log timing and final warnings +- extra step-local logs would change noise characteristics while the patch goal is latency reduction + +## Diagnostics And Marks + +Current `mark(...)` timestamps feed the slow log in [TeamDataService.ts#L804](/Users/belief/dev/projects/claude/claude_team_freecode/src/main/services/team/TeamDataService.ts#L804). + +Do not lose that. + +### Rule + +- each step still needs a mark timestamp +- mark time should represent actual completion time +- not “when we finally consumed the result later” + +### Practical approach + +When a step settles, keep `completedAt` in the returned `StepResult`. + +After `Promise.all(...)` completes, assign that timestamp into `marks[label]` from the main control flow. + +Example: + +- `marks.tasks = tasksStep.completedAt` +- `marks.messages = messagesStep.completedAt` + +Stronger rule: + +- do not mutate `marks` from inside concurrent branches + +Why: + +- the values would still be numerically correct, but the write path becomes harder to audit +- keeping all diagnostics writes in one deterministic place lowers review risk + +Additional rule: + +- do not synthesize marks for unfinished or skipped steps + +Why: + +- fake timestamps would make slow logs harder to trust +- missing marks should remain visible as `-1` via current `msSince(...)` behavior if something truly did not complete + +### Important nuance + +This changes interpretation slightly: + +- before: marks reflected serial stage completion +- after: marks reflect “time since method start when this async read completed” + +That is acceptable, but it should be documented near the helper or marks assignment in code comments. + +## Warning Discipline + +Warnings must remain deterministic. + +Do not: + +- call `warnings.push(...)` inside async branches + +Instead: + +1. collect each step result independently +2. append warnings after the parallel phase settles +3. append them in the same semantic order as today + +Required final order: + +1. `Tasks failed to load` +2. `Inboxes failed to load` +3. `Messages failed to load` +4. `Lead session texts failed to load` +5. `Sent messages failed to load` +6. `Member metadata failed to load` +7. `Kanban state failed to load` +8. `Member runtime advisories failed to load` +9. `Processes failed to load` + +Phase 1 only changes how steps 1-7 are loaded, not their order. + +## Exact Safe Execution Order + +### Phase A - bootstrap + +1. load `config` +2. create `warnings` +3. compute `changePresenceEnabled` +4. compute `logSourceSnapshot` +5. start `presenceIndexPromise` exactly as today + +Important rule: + +- do not wrap `presenceIndexPromise` in `startReadStep` +- do not convert a `presenceIndexPromise` rejection into fallback behavior +- do not move its await earlier than today + +### Phase B - parallel read phase + +Start these reads: + +- heavy, limiter `2`: + - tasks + - messages + - leadTexts +- light, direct: + - inboxNames + - sentMessages + - metaMembers + - kanbanState + +Wait for all of them to settle. + +Preferred implementation detail: + +- create every step promise before awaiting any of them +- start light steps immediately +- start heavy steps immediately through the limiter +- only await results after all step promises exist +- use one final await point for the whole phase, not separate group awaits + +Why: + +- avoids accidental serial gaps +- makes the intended parallelism explicit +- prevents “light first, heavy later” drift from sneaking in during refactor + +Additional rule: + +- instantiate heavy-step promises in stable semantic order: + - tasks + - messages + - leadTexts + +Why: + +- this makes limiter behavior deterministic +- it simplifies reasoning in tests that verify bounded heavy-step concurrency + +Additional rule: + +- do not create step promises inside array literals passed directly into `Promise.all` + +Why: + +- it hides creation order +- it makes breakpoint/debugging worse +- it increases the chance of accidental inline logic expansion + +Additional rule: + +- queued heavy steps must not execute any reader work before a limiter slot is granted + +Why: + +- otherwise the code may look bounded while still front-loading real filesystem work +- this is an easy place to accidentally defeat the limiter with an eager closure + +Preferred composition: + +- queue a thunk that starts the wrapped step only after the limiter grants a slot +- in other words, prefer `runHeavy(() => startReadStep(...))` over a shape that starts `startReadStep(...)` first and only limits an inner sub-call + +Why: + +- this makes “not started yet” observable in tests +- it avoids subtle eagerness around fallback creation or timing capture +- it keeps the limiter boundary explicit in code review + +### Phase C - deterministic assembly + +Using the resolved values: + +1. append warnings in fixed order +2. initialize `messages` from the resolved inbox messages +3. append `leadTexts` +4. append `sentMessages` +5. run existing dedup logic unchanged +6. run existing session-id enrichment unchanged +7. run existing slash annotation unchanged +8. build `tasksWithKanbanBase` +9. await `presenceIndexPromise` +10. apply presence overlay +11. call `resolveMembers(...)` + +Important rule: + +- do not mutate the `messages` array during the parallel read phase + +Important rule: + +- preserve the current `presenceIndexPromise` await point +- it must still happen after `tasksWithKanbanBase` exists and before presence overlay is applied + +Additional rule: + +- keep local variable names close to the current implementation where practical + +Why: + +- smaller diffs reduce review risk in a fragile hot path +- easier diff-reading improves confidence that only await ordering changed + +### Phase D - unchanged lower phase + +Keep this serial in phase 1: + +1. `getMemberAdvisories(...)` +2. `enrichMemberBranches(...)` +3. `readProcesses(teamName)` + +Important rule: + +- `runtimeAdvisories` warning must still precede `processes` +- `processes` warning must still be last + +Additional rule: + +- if phase 1 reveals that `readProcesses()` is also a measurable bottleneck, do not “just parallelize it” +- handle it in a separate plan because it is a reconcile-like path + +## Implementation Notes + +### 1. Use a tiny local limiter, not a new dependency + +Do not add a library. + +Either: + +- reuse a tiny local `mapLimit` style pattern +- or add a minimal local limiter helper inside `TeamDataService` + +The helper should be boring and obvious. + +### 2. Preserve fallback defaults exactly + +Defaults today: + +- tasks: `[]` +- inboxNames: `[]` +- messages: `[]` +- leadTexts: `[]` +- sentMessages: `[]` +- metaMembers: `[]` +- kanbanState: default empty state +- processes: `[]` + +Do not change any of these defaults. + +Stronger rule: + +- instantiate array fallbacks as fresh literals at each step callsite +- instantiate the empty `kanbanState` fallback as a fresh object literal at its callsite + +Why: + +- it prevents accidental aliasing if later code mutates a returned fallback object +- it keeps tests honest when assertions depend on reference isolation + +### 3. Keep `presenceIndexPromise` untouched + +It already starts early and is gated correctly. + +Changing it would add correctness risk with little payoff. + +Stronger rule: + +- preserve both success and failure semantics exactly +- if `presenceIndexPromise` rejects today, the method should still reject after the same downstream point + +Why: + +- silently degrading change-presence data would be a correctness change, not just a perf change + +### 4. Do not mix adjacent optimizations into this patch + +Specifically do not combine: + +- inbox-name reuse between `listInboxNames()` and `getMessages()` +- task reader worker-path changes +- lead-session dir listing cache + +Why: + +- mixed patches are harder to reason about +- perf attribution gets worse +- rollback gets harder + +### 5. Prefer the lowest-risk first wrapper conversion during implementation + +If doing the implementation incrementally, the first serial wrapper conversion should be one of: + +- `membersMetaStore.getMembers(teamName)` +- `sentMessagesStore.readMessages(teamName)` + +Avoid using `getMessages()` or `extractLeadSessionTexts(config)` as the first wrapper-conversion step. + +Why: + +- they are semantically denser and more file-heavy +- starting with a lighter reader makes it easier to verify the wrapper boundary before concurrency is introduced + +### 6. Do not add committed benchmark scaffolding in this patch + +If extra timing is needed during implementation: + +- use temporary local instrumentation or existing slow-log output +- remove any temporary timing helpers before committing + +Why: + +- the patch goal is a surgical hot-path change, not a new profiling framework +- leaving ad hoc benchmark scaffolding behind would increase maintenance noise + +## Edge Cases + +### 1. `messages` fail but `leadTexts` succeed + +Expected: + +- final `messages` still contains `leadTexts` and `sentMessages` +- warning includes `Messages failed to load` + +### 2. `leadTexts` fail but inbox messages succeed + +Expected: + +- final `messages` remains valid without lead session history +- warning includes `Lead session texts failed to load` + +### 3. `kanbanState` fails + +Expected: + +- tasks still load +- empty kanban fallback is used +- warning order remains stable +- renderer still sees safe fallback state + +### 4. `metaMembers` fail + +Expected: + +- resolver still works from config + inbox names + messages +- warning includes `Member metadata failed to load` + +### 5. One heavy read is very slow + +Expected: + +- light reads are not blocked from starting +- heavy reads are bounded and do not all pile up at once + +### 6. Duplicate inbox-dir reads happen concurrently + +Expected: + +- this is acceptable in phase 1 +- no semantic change occurs +- this should not be “optimized away” inside the same patch + +### 7. `processes` marks a dead runtime and writes `processes.json` + +Expected: + +- this still happens +- it still happens outside the parallel read phase + +### 8. A step throws synchronously before returning a promise + +Expected: + +- the step wrapper converts it into fallback + warning +- `getTeamData()` still returns a partial snapshot + +### 9. A heavy step finishes before a light step + +Expected: + +- marks reflect actual completion times +- warnings remain in semantic order +- assembly order remains unchanged + +### 10. A bug occurs in orchestration code outside a step wrapper + +Expected: + +- it should still surface as a normal test/runtime failure +- it must not be hidden behind generic fallback behavior + +Why this matters: + +- the wrapper is only meant to preserve current best-effort reader semantics +- it is not meant to hide mistakes in assembly code + +### 11. `presenceIndexPromise` rejects + +Expected: + +- `getTeamData()` still rejects +- no new warning is added +- the reject point remains after `tasksWithKanbanBase` construction, as today + +Why this matters: + +- changing this would alter correctness semantics for task change presence +- that is outside the scope of a safe latency patch + +### 12. Heavy-step limiter accidentally serializes everything + +Expected: + +- light steps still start immediately +- at least two heavy steps may be in flight before the first one resolves + +Why this matters: + +- a broken limiter implementation can silently collapse back to serial behavior while tests still pass + +### 13. A heavy step is queued behind the limiter and throws synchronously when finally started + +Expected: + +- the wrapper still converts it into fallback + warning +- the limiter slot is released +- downstream heavy steps can still run + +Why this matters: + +- queued execution introduces a second place where sync throws can happen +- the plan should be explicit that the limiter cannot leak capacity on failure + +### 14. A queued heavy step has not started yet + +Expected: + +- its underlying reader has not been called +- it has not produced a warning +- it has not stamped `completedAt` + +Why this matters: + +- otherwise the limiter may only be cosmetically bounding concurrency +- timing diagnostics would become misleading if queue time and completion time are confused + +### 15. `createFallback()` is evaluated + +Expected: + +- it is evaluated only after the corresponding load fails +- it does not perform I/O or mutate shared state +- if it throws, the test should fail rather than being silently degraded + +Why this matters: + +- fallback factories are part of the masking boundary +- making them “smart” would quietly widen the behavior of the patch + +## Testing Plan + +Add focused tests in [TeamDataService.test.ts](/Users/belief/dev/projects/claude/claude_team_freecode/test/main/services/team/TeamDataService.test.ts). + +### Preferred test technique + +Prefer explicit deferred promises over fake timers for read-phase ordering tests. + +Why: + +- fake timers tend to couple tests to implementation details like `setTimeout` +- deferred promises make start-order and release-order assertions clearer +- the existing test file already uses direct promise orchestration patterns successfully + +### Required tests + +1. Parallel read phase does not abort full `getTeamData()` when one step fails. + +2. Warning order remains the same as before even when failures resolve out of order. + +3. `messages` merge order remains unchanged: + - inbox + - plus leadTexts + - plus sentMessages + - then current dedup/enrichment behavior + +4. `resolveMembers()` is still called with the same effective inputs. + +5. `readProcesses()` still runs after the top read phase and still populates `processes`. + +6. `kanban` fallback remains safe when kanban read fails. + +7. `leadTexts` success still contributes messages even if inbox read fails. + +8. slow diagnostics do not crash when marks are produced from async completion times. + +9. a step that throws synchronously still degrades to fallback + warning. + +10. `processes` remains outside the parallel read phase. + +11. orchestration still fails normally if an error is thrown after step resolution and outside the wrapper boundary. + +12. light steps start even while heavy steps are still blocked. + +13. heavy-step limiter still allows bounded overlap and does not collapse to fully serial execution. + +14. `presenceIndexPromise` rejection semantics remain unchanged. + +15. no concurrent branch writes directly into `warnings` or `marks`. + +16. the existing read-only invariant for `getTeamData()` remains intact and kanban GC is still not invoked from this path. + +17. the third heavy step does not call its underlying reader before one of the first two heavy steps settles. + +18. a queued heavy step does not stamp `completedAt` before it actually settles. + +19. missing-team fast-fail still happens before any read-phase step starts. + +20. fallback creation happens only on failure and does not itself become a hidden degraded path. + +### Nice-to-have tests + +1. prove heavy-group limiter behavior with deferred promises +2. prove warnings do not depend on completion order +3. prove light steps can finish before heavy ones without changing assembly order +4. prove heavy-step start order remains deterministic under the limiter +5. prove queued heavy-step failure releases limiter capacity for the next heavy step +6. prove fallback arrays/objects are not shared across unrelated step results +7. prove missing-team fast-fail still happens before any read-phase step starts +8. prove fallback creation is lazy and does not run on successful steps + +### Testing note + +Current tests exercise many `getTeamData()` semantics, but they do not currently prove: + +- warning order +- read-phase race behavior +- serial `processes` + +These tests must be added explicitly in this patch. + +Additional testing rule: + +- at least one new test should assert start order, not only final output + +Why: + +- output-only assertions can miss accidental serialization or accidental over-parallelization + +## Rollout Strategy + +### Phase 1 + +Implement only: + +- local step wrapper +- small local heavy-step limiter +- top-level parallel read phase +- deterministic warning collection +- explicit tests for warning order and serial `processes` +- explicit tests for bounded heavy overlap and immediate light-step start +- explicit test for unchanged `presenceIndexPromise` rejection behavior + +### Recommended implementation sequence + +1. Add the local `StepResult` shape and `startReadStep(...)` helper without changing behavior yet. +2. Move one low-risk light read to the helper pattern and keep it serial to verify the wrapper boundary stays thin. +3. Introduce the local heavy limiter with a tiny focused test before wiring all heavy steps through it. +4. Convert the full top read phase into: + - light steps started directly + - heavy steps started via the limiter +5. Keep warning collection and mark assignment outside the wrappers and outside concurrent branches. +6. Re-run the lower assembly with local variable names as close to the current code as practical. +7. Add the ordering and failure-mode tests last, after the implementation shape stabilizes. + +Why this order: + +- it keeps each transition auditable +- it makes it easier to catch accidental semantic drift before the whole method is reshaped + +Additional rule: + +- do not combine the helper-contract change and the full concurrency rewrite in one blind edit + +Why: + +- separating those transitions makes review of semantic drift much easier + +### Phase 2 only if needed later + +- optional `getLeadSessionJsonlPaths()` cache +- optional inbox-name reuse between `listInboxNames()` and `getMessages()` +- optional broader lower-phase parallelization + +## Success Criteria + +The patch is successful if all are true: + +1. `TeamData` payload shape is unchanged +2. existing tests pass +3. new warning-order and race tests pass +4. `typecheck` passes +5. targeted `getTeamData` latency improves on repeated refreshes +6. no warning-order drift +7. no changes to dedup or renderer-visible semantics + +## Areas With Lower Confidence + +### 1. Best heavy-step limiter value + +Confidence: medium. + +Why: + +- `2` is the safest educated default +- it is not backed by real profiling data yet + +Decision: + +- use `2` in phase 1 +- tune only after measurement + +### 2. Real contention impact of concurrent `getMessages()` plus `extractLeadSessionTexts()` + +Confidence: medium. + +Why: + +- both touch team filesystem data +- one already does internal fan-out + +Decision: + +- keep the heavy group bounded +- do not add more heavy steps to it in phase 1 + +### 3. Diagnostic interpretation shift + +Confidence: medium-high. + +Why: + +- marks will no longer describe a strictly serial stage pipeline +- but they remain operationally useful + +Decision: + +- preserve labels +- store actual completion times +- document the interpretation shift + +### 4. Perf upside size + +Confidence: medium-high. + +Why: + +- the current serial top read phase is clearly a bottleneck candidate +- exact gains still depend on filesystem behavior and team size + +Decision: + +- keep the patch narrow +- verify with targeted timings before any broader refactor + +### 5. Exact wrapper boundary + +Confidence: medium. + +Why: + +- too broad a wrapper masks bugs +- too narrow a wrapper creates repetitive code and inconsistent fallback handling + +Decision: + +- wrap only the direct reader/service call +- keep all merge/assembly logic outside the wrapper + +### 6. `presenceIndexPromise` semantics + +Confidence: medium. + +Why: + +- it starts early but is awaited later +- it currently rejects the whole method instead of degrading to warning +- it is easy to accidentally “improve” this while introducing wrappers + +Decision: + +- preserve current reject semantics exactly +- add a regression test explicitly for this + +### 7. Test strength for concurrency behavior + +Confidence: medium. + +Why: + +- concurrency tests often pass while failing to prove the intended scheduling shape +- the current suite has very limited ordering-specific coverage + +Decision: + +- use explicit deferred promises +- assert both start order and completion-order independence + +### 8. Limiter implementation correctness + +Confidence: medium. + +Why: + +- small limiters are easy to get almost right while still leaking capacity or starting work eagerly +- that kind of bug can preserve output correctness while eliminating the perf gain + +Decision: + +- keep the limiter tiny and local +- verify queued-failure release behavior in tests +- verify that the third heavy step does not start before one of the first two settles + +### 9. Wrapper and limiter composition + +Confidence: medium. + +Why: + +- the code can look “limited” while still eagerly starting reader work +- this is one of the easiest places to lose the intended perf behavior without breaking output tests + +Decision: + +- prefer queuing a thunk that starts `startReadStep(...)` only after slot acquisition +- add a test that asserts the third heavy reader has not been called while both slots are occupied + +### 10. Fallback-factory discipline + +Confidence: medium. + +Why: + +- `createFallback()` looks harmless, but it is part of the same masking boundary as the wrapper +- if it grows side effects or branching logic, the patch stops being a narrow perf change + +Decision: + +- keep fallback factories as trivial literals/object constructors +- keep warning text static at the callsite +- let thrown fallback-factory errors surface as programming bugs + +## Residual Risk + +Even after this patch, the architecture remains sensitive because: + +- `team:getData` still assembles a large monolithic snapshot +- `messages[]` is still a hot payload +- first-open cost still exists +- `processes` remains a reconcile-like read/write path + +This patch should therefore be treated as: + +- a safe latency reduction +- not a final architectural solution diff --git a/package.json b/package.json index 4e9f89b5..f4a5bcb2 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,8 @@ }, "main": "dist-electron/main/index.cjs", "scripts": { - "dev": "electron-vite dev", + "dev": "node ./scripts/dev-with-runtime.mjs", + "dev:ui": "electron-vite dev", "dev:kill": "node bin/kill-dev.js", "prebuild": "tsx scripts/fetch-pricing-data.ts && pnpm --filter agent-teams-controller build && pnpm --filter agent-teams-mcp build", "build": "electron-vite build", diff --git a/scripts/dev-with-runtime.mjs b/scripts/dev-with-runtime.mjs new file mode 100644 index 00000000..4c6d57df --- /dev/null +++ b/scripts/dev-with-runtime.mjs @@ -0,0 +1,84 @@ +#!/usr/bin/env node + +import fs from 'node:fs' +import path from 'node:path' +import { spawnSync } from 'node:child_process' +import { fileURLToPath } from 'node:url' + +function runOrExit(cmd, args, options = {}) { + const result = spawnSync(cmd, args, { + stdio: 'inherit', + ...options, + }) + + if (result.error) { + console.error(`Failed to run ${cmd}: ${result.error.message}`) + process.exit(1) + } + + if (result.status !== 0) { + process.exit(result.status ?? 1) + } +} + +function readPackageManagerCommand(repoRoot) { + const packageJsonPath = path.join(repoRoot, 'package.json') + const rawPackageJson = fs.readFileSync(packageJsonPath, 'utf8') + const packageJson = JSON.parse(rawPackageJson) + const rawPackageManager = packageJson.packageManager + + if (typeof rawPackageManager !== 'string' || rawPackageManager.trim().length === 0) { + return 'pnpm' + } + + const [packageManagerName] = rawPackageManager.trim().split('@', 1) + if (!packageManagerName) { + return 'pnpm' + } + + return packageManagerName +} + +const scriptDir = path.dirname(fileURLToPath(import.meta.url)) +const uiRepoRoot = path.resolve(scriptDir, '..') +// Keep the dev runtime target explicit. This workspace can contain multiple +// sibling repos with the same package name, so auto-discovery is ambiguous and +// can silently point the UI at the wrong runtime after branch switches. +const runtimeRepoRoot = process.env.CLAUDE_DEV_RUNTIME_ROOT?.trim() + +if (!runtimeRepoRoot) { + console.error( + 'CLAUDE_DEV_RUNTIME_ROOT is required for pnpm dev. ' + + 'Point it at the runtime repo root you want the UI to use in dev.', + ) + process.exit(1) +} + +const runtimePackageJsonPath = path.join(runtimeRepoRoot, 'package.json') +if (!fs.existsSync(runtimePackageJsonPath)) { + console.error( + `CLAUDE_DEV_RUNTIME_ROOT does not look like a repo root: ${runtimeRepoRoot}`, + ) + process.exit(1) +} + +const runtimePackageManager = readPackageManagerCommand(runtimeRepoRoot) + +if (process.argv.includes('--print-runtime-path')) { + process.stdout.write(`${runtimeRepoRoot}\n`) + process.exit(0) +} + +// Respect the runtime repo's own package manager. The UI repo uses pnpm, but +// the runtime may legitimately be a Bun workspace, and forcing pnpm there can +// fail before the build even starts. +runOrExit(runtimePackageManager, ['run', 'build:dev'], { cwd: runtimeRepoRoot }) + +const runtimeCliPath = path.join(runtimeRepoRoot, 'cli-dev') +runOrExit('pnpm', ['run', 'dev:ui'], { + cwd: uiRepoRoot, + env: { + ...process.env, + CLAUDE_CLI_PATH: runtimeCliPath, + }, +}) diff --git a/src/main/index.ts b/src/main/index.ts index b4a0146a..ad118cec 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -87,6 +87,10 @@ import { } from './services/team/TeamControlApiState'; import { TeamInboxReader } from './services/team/TeamInboxReader'; import { TeamMemberRuntimeAdvisoryService } from './services/team/TeamMemberRuntimeAdvisoryService'; +import { + createTeamReconcileDrainScheduler, + type TeamReconcileTrigger, +} from './services/team/TeamReconcileDrainScheduler'; import { TeamSentMessagesStore } from './services/team/TeamSentMessagesStore'; import { getAppIconPath } from './utils/appIcon'; import { getProjectsBasePath, getTeamsBasePath, getTodosBasePath } from './utils/pathDecoder'; @@ -510,6 +514,27 @@ function wireFileWatcherEvents(context: ServiceContext): void { context.fileWatcher.on('todo-change', todoChangeHandler); todoChangeCleanup = () => context.fileWatcher.off('todo-change', todoChangeHandler); + const reconcileScheduler = teamDataService + ? createTeamReconcileDrainScheduler({ + run: async (teamName: string, trigger: TeamReconcileTrigger) => { + try { + await teamDataService.reconcileTeamArtifacts(teamName, trigger); + } catch (e) { + if (trigger.source === 'task') { + logger.warn( + `[FileWatcher] task reconcile failed for ${teamName} detail=${trigger.detail}: ${String(e)}` + ); + } else { + logger.warn( + `[FileWatcher] reconcile failed for ${teamName} source=${trigger.source} detail=${trigger.detail}: ${String(e)}` + ); + } + throw e; + } + }, + }) + : null; + // Forward team-change events to renderer and HTTP SSE const teamChangeHandler = (event: unknown): void => { safeSendToRenderer(mainWindow, TEAM_CHANGE, event); @@ -525,12 +550,8 @@ function wireFileWatcherEvents(context: ServiceContext): void { // --- Inbox change events: relay to lead + native OS notifications --- if (row.type === 'inbox') { - if (teamDataService) { - void teamDataService - .reconcileTeamArtifacts(teamName) - .catch((e: unknown) => - logger.warn(`[FileWatcher] reconcile failed for ${teamName}: ${String(e)}`) - ); + if (reconcileScheduler) { + reconcileScheduler.schedule(teamName, { source: 'inbox', detail }); } // Relay inbox changes into active runtime recipients. @@ -588,11 +609,7 @@ function wireFileWatcherEvents(context: ServiceContext): void { // --- Task change events: notify lead when teammate starts a task via CLI --- if (row.type === 'task' && detail.endsWith('.json') && teamDataService) { - void teamDataService - .reconcileTeamArtifacts(teamName) - .catch((e: unknown) => - logger.warn(`[FileWatcher] task reconcile failed for ${teamName}: ${String(e)}`) - ); + reconcileScheduler?.schedule(teamName, { source: 'task', detail }); const taskId = detail.replace('.json', ''); void teamDataService @@ -625,7 +642,10 @@ function wireFileWatcherEvents(context: ServiceContext): void { } }; context.fileWatcher.on('team-change', teamChangeHandler); - teamChangeCleanup = () => context.fileWatcher.off('team-change', teamChangeHandler); + teamChangeCleanup = () => { + context.fileWatcher.off('team-change', teamChangeHandler); + reconcileScheduler?.dispose(); + }; logger.info(`FileWatcher events wired for context: ${context.id}`); } diff --git a/src/main/ipc/teams.ts b/src/main/ipc/teams.ts index 65733d3b..bfcdcede 100644 --- a/src/main/ipc/teams.ts +++ b/src/main/ipc/teams.ts @@ -69,7 +69,7 @@ import { TEAM_VALIDATE_CLI_ARGS, // eslint-disable-next-line boundaries/element-types -- IPC channel constants are shared between main and preload by design } from '@preload/constants/ipcChannels'; -import { AGENT_BLOCK_CLOSE, AGENT_BLOCK_OPEN } from '@shared/constants/agentBlocks'; +import { AGENT_BLOCK_CLOSE, AGENT_BLOCK_OPEN, wrapAgentBlock } from '@shared/constants/agentBlocks'; import { KANBAN_COLUMN_IDS } from '@shared/constants/kanban'; import { MAX_TEXT_LENGTH } from '@shared/constants/teamLimits'; import { isApiErrorMessage } from '@shared/utils/apiErrorDetector'; @@ -256,6 +256,20 @@ function buildLeadRosterContextBlock( ].join('\n'); } +function buildLeadDirectDelegateAckBlock(actionMode?: AgentActionMode): string | null { + if (actionMode !== 'delegate') return null; + + return wrapAgentBlock( + [ + 'DELEGATE MODE USER ACK CONTRACT:', + 'Before any task creation, delegation, or other tool use, begin your next assistant response with one short human-readable acknowledgement to the user.', + 'That acknowledgement must be visible plain text, not only an agent-only block.', + 'Make the acknowledgement at least 40 characters so it is preserved in the Messages panel.', + 'After that visible acknowledgement, continue with delegation/orchestration in the same turn.', + ].join('\n') + ); +} + /** * In-memory set of API error message keys already processed. * Independent of NotificationManager storage — survives notification deletion/pruning. @@ -1676,6 +1690,7 @@ async function handleSendMessage( const resolvedLeadName = leadName ?? memberName; const teammateRoster = await getDurableLeadTeammateRoster(tn, resolvedLeadName); const rosterContextBlock = buildLeadRosterContextBlock(tn, resolvedLeadName, teammateRoster); + const delegateAckBlock = buildLeadDirectDelegateAckBlock(actionMode); // Pre-generate stable messageId so both stdin and persistence use the same identity. // This allows the lead to call task_create_from_message with the exact messageId. const preGeneratedMessageId = crypto.randomUUID(); @@ -1694,6 +1709,7 @@ async function handleSendMessage( `You received a direct message from the user.`, `IMPORTANT: Your text response here is shown to the user in the Messages panel. Always include a brief human-readable reply. Do NOT respond with only an agent-only block.`, ...(rosterContextBlock ? [rosterContextBlock] : []), + ...(delegateAckBlock ? [delegateAckBlock] : []), AGENT_BLOCK_OPEN, `MessageId: ${preGeneratedMessageId}`, `When creating a task from this user message, prefer task_create_from_message with messageId="${preGeneratedMessageId}" for reliable provenance. Only use this exact messageId — never guess or fabricate one.`, diff --git a/src/main/services/runtime/ClaudeMultimodelBridgeService.ts b/src/main/services/runtime/ClaudeMultimodelBridgeService.ts index f04fe630..f4ccbd1b 100644 --- a/src/main/services/runtime/ClaudeMultimodelBridgeService.ts +++ b/src/main/services/runtime/ClaudeMultimodelBridgeService.ts @@ -10,7 +10,7 @@ import { createLogger } from '@shared/utils/logger'; import type { CliProviderId, CliProviderStatus } from '@shared/types'; import { configManager } from '../infrastructure/ConfigManager'; import { resolveGeminiRuntimeAuth } from './geminiRuntimeAuth'; -import { applyConfiguredRuntimeBackendsEnv } from './providerRuntimeEnv'; +import { applyConfiguredRuntimeBackendsEnv, applyProviderRuntimeEnv } from './providerRuntimeEnv'; const logger = createLogger('ClaudeMultimodelBridgeService'); @@ -174,21 +174,7 @@ export class ClaudeMultimodelBridgeService { } private buildProviderCliEnv(binaryPath: string, providerId: CliProviderId): NodeJS.ProcessEnv { - const env = { ...this.buildCliEnv(binaryPath) }; - delete env.CLAUDE_CODE_ENTRY_PROVIDER; - delete env.CLAUDE_CODE_USE_OPENAI; - delete env.CLAUDE_CODE_USE_BEDROCK; - delete env.CLAUDE_CODE_USE_VERTEX; - delete env.CLAUDE_CODE_USE_FOUNDRY; - delete env.CLAUDE_CODE_USE_GEMINI; - - if (providerId === 'codex') { - env.CLAUDE_CODE_ENTRY_PROVIDER = 'codex'; - } else if (providerId === 'gemini') { - env.CLAUDE_CODE_ENTRY_PROVIDER = 'gemini'; - } - - return env; + return applyProviderRuntimeEnv({ ...this.buildCliEnv(binaryPath) }, providerId); } private isUnifiedRuntimeUnsupported(error: unknown): boolean { diff --git a/src/main/services/runtime/providerRuntimeEnv.ts b/src/main/services/runtime/providerRuntimeEnv.ts index ae93cc8c..b2fa257c 100644 --- a/src/main/services/runtime/providerRuntimeEnv.ts +++ b/src/main/services/runtime/providerRuntimeEnv.ts @@ -2,7 +2,8 @@ import type { TeamProviderId } from '@shared/types'; import { ConfigManager } from '../infrastructure/ConfigManager'; -const THIRD_PARTY_PROVIDER_ENV_KEYS = [ +const PROVIDER_ROUTING_ENV_KEYS = [ + 'CLAUDE_CODE_PROVIDER_MANAGED_BY_HOST', 'CLAUDE_CODE_ENTRY_PROVIDER', 'CLAUDE_CODE_USE_OPENAI', 'CLAUDE_CODE_USE_BEDROCK', @@ -36,15 +37,17 @@ export function applyProviderRuntimeEnv( const resolvedProvider: TeamProviderId = providerId === 'codex' || providerId === 'gemini' ? providerId : 'anthropic'; - for (const key of THIRD_PARTY_PROVIDER_ENV_KEYS) { + for (const key of PROVIDER_ROUTING_ENV_KEYS) { env[key] = undefined; } - if (resolvedProvider === 'codex') { - env.CLAUDE_CODE_ENTRY_PROVIDER = 'codex'; - } else if (resolvedProvider === 'gemini') { - env.CLAUDE_CODE_ENTRY_PROVIDER = 'gemini'; - } + // Provider overrides must be positive pins. In dev and multimodel desktop + // flows the host process can already be routed to codex or gemini, and the + // child runtime reapplies settings.env after trust. Mark the env as + // host-managed and set the exact entry provider so anthropic teammates do not + // silently fall back into the host's current routing world. + env.CLAUDE_CODE_PROVIDER_MANAGED_BY_HOST = '1'; + env.CLAUDE_CODE_ENTRY_PROVIDER = resolvedProvider; return env; } diff --git a/src/main/services/team/ClaudeBinaryResolver.ts b/src/main/services/team/ClaudeBinaryResolver.ts index 10c87acb..25ac3874 100644 --- a/src/main/services/team/ClaudeBinaryResolver.ts +++ b/src/main/services/team/ClaudeBinaryResolver.ts @@ -176,21 +176,6 @@ async function resolveFromCandidateList(candidates: string[]): Promise | null = null; private processHealthTeams = new Set(); @@ -121,6 +133,7 @@ export class TeamDataService { private taskCommentNotificationInFlight = new Set(); private taskChangePresenceRepository: TaskChangePresenceRepository | null = null; private teamLogSourceTracker: TeamLogSourceTracker | null = null; + private fileWatchReconcileDiagnostics = new Map(); constructor( private readonly configReader: TeamConfigReader = new TeamConfigReader(), @@ -509,6 +522,11 @@ export class TeamDataService { const t = marks[label]; return typeof t === 'number' ? t - startedAt : -1; }; + const msBetween = (from: string, to: string): number => { + const fromTs = marks[from]; + const toTs = marks[to]; + return typeof fromTs === 'number' && typeof toTs === 'number' ? toTs - fromTs : -1; + }; const config = await this.configReader.getConfig(teamName); if (!config) { @@ -688,6 +706,7 @@ export class TeamDataService { let messages: InboxMessage[] = messagesStepResult.value; const leadTexts: InboxMessage[] = leadTextsStepResult.value; const sentMessages: InboxMessage[] = sentMessagesStepResult.value; + mark('postStart'); if (leadTexts.length > 0) { messages = [...messages, ...leadTexts]; @@ -695,6 +714,7 @@ export class TeamDataService { if (sentMessages.length > 0) { messages = [...messages, ...sentMessages]; } + mark('mergeMessages'); // Dedup: if a lead_process message text is also present in lead_session, prefer lead_session. // This avoids double-rendering when we persist lead process messages and later load the lead JSONL. @@ -717,6 +737,7 @@ export class TeamDataService { return !leadSessionFingerprints.has(fp); }); } + mark('dedupLeadTexts'); // Dedup exact message copies that can appear as both live lead_process rows and // their persisted inbox/sent-message counterpart. If the messageId is identical, @@ -779,6 +800,7 @@ export class TeamDataService { } messages = [...dedupedWithoutId, ...dedupedById.values()]; } + mark('dedupMessageIds'); // Enrich inbox messages without leadSessionId by assigning the nearest neighbor's // session ID (by timestamp). This avoids the old forward-only propagation bug. @@ -826,11 +848,13 @@ export class TeamDataService { } } } + mark('attachLeadSessionIds'); messages.sort((a, b) => Date.parse(a.timestamp) - Date.parse(b.timestamp)); this.annotateSlashCommandResponses(messages); messages.sort((a, b) => Date.parse(b.timestamp) - Date.parse(a.timestamp)); + mark('normalizeMessages'); const metaMembers: TeamConfig['members'] = metaMembersStepResult.value; const kanbanState: KanbanState = kanbanStateStepResult.value; @@ -840,8 +864,10 @@ export class TeamDataService { const tasksWithKanbanBase: TeamTaskWithKanban[] = tasks.map((task) => this.attachKanbanCompatibility(task, kanbanState.tasks[task.id]) ); + mark('attachKanban'); const presenceIndex = await presenceIndexPromise; + mark('loadPresenceIndex'); const taskChangePresenceById = this.resolveTaskChangePresenceMap( tasksWithKanbanBase, @@ -855,6 +881,7 @@ export class TeamDataService { changePresence: taskChangePresenceById[task.id] ?? 'unknown', })) : tasksWithKanbanBase; + mark('changePresence'); const members = this.memberResolver.resolveMembers( config, @@ -897,18 +924,45 @@ export class TeamDataService { const totalMs = Date.now() - startedAt; if (totalMs >= 1500) { + const counts = `counts=tasks:${tasks.length},messages:${messages.length},inboxNames:${inboxNames.length},leadTexts:${leadTexts.length},sent:${sentMessages.length},members:${members.length},processes:${processes.length}`; logger.warn( `getTeamData team=${teamName} slow total=${totalMs}ms config=${msSince('config')} tasks=${msSince('tasks')} inboxNames=${msSince( 'inboxNames' )} messages=${msSince('messages')} leadTexts=${msSince('leadTexts')} sent=${msSince( 'sentMessages' - )} membersMeta=${msSince('metaMembers')} kanban=${msSince('kanbanState')} kanbanGc=${msSince( - 'kanbanGc' - )} resolveMembers=${msSince('resolveMembers')} runtimeAdvisories=${msSince( + )} membersMeta=${msSince('metaMembers')} kanban=${msSince('kanbanState')} post=${msBetween( + 'postStart', + 'mergeMessages' + )}/dedupLead=${msBetween('mergeMessages', 'dedupLeadTexts')}/dedupIds=${msBetween( + 'dedupLeadTexts', + 'dedupMessageIds' + )}/attachLeadSession=${msBetween( + 'dedupMessageIds', + 'attachLeadSessionIds' + )}/normalizeMessages=${msBetween( + 'attachLeadSessionIds', + 'normalizeMessages' + )}/attachKanban=${msBetween( + 'normalizeMessages', + 'attachKanban' + )}/loadPresenceIndex=${msBetween( + 'attachKanban', + 'loadPresenceIndex' + )}/changePresence=${msBetween( + 'loadPresenceIndex', + 'changePresence' + )}/resolveMembers=${msBetween( + 'changePresence', + 'resolveMembers' + )}/runtimeAdvisories=${msBetween( + 'resolveMembers', 'runtimeAdvisories' - )} enrichBranches=${msSince( + )}/enrichBranches=${msBetween( + 'runtimeAdvisories', 'enrichBranches' - )} syncComments=${msSince('syncComments')} processes=${msSince('processes')}` + )}/processes=${msBetween('syncComments', 'processes')} ${counts}${ + warnings.length > 0 ? ` warnings=${warnings.join('|')}` : '' + }` ); } @@ -2231,10 +2285,79 @@ export class TeamDataService { ); } - async reconcileTeamArtifacts(teamName: string): Promise { - this.getController(teamName).maintenance.reconcileArtifacts({ - reason: 'file-watch', - }); + async reconcileTeamArtifacts( + teamName: string, + trigger?: FileWatchReconcileTrigger + ): Promise { + const now = Date.now(); + const diagnostics = this.fileWatchReconcileDiagnostics.get(teamName) ?? { + inFlight: 0, + burstCount: 0, + windowStartedAt: now, + lastPressureLogAt: 0, + }; + const triggerSource = trigger?.source ?? 'unknown'; + const triggerDetail = + typeof trigger?.detail === 'string' && trigger.detail.trim().length > 0 + ? ` detail=${trigger.detail.trim()}` + : ''; + if (now - diagnostics.windowStartedAt > 5_000) { + diagnostics.windowStartedAt = now; + diagnostics.burstCount = 0; + } + diagnostics.burstCount += 1; + diagnostics.inFlight += 1; + this.fileWatchReconcileDiagnostics.set(teamName, diagnostics); + + const concurrentAtStart = diagnostics.inFlight; + const shouldLogPressure = + concurrentAtStart > 1 || diagnostics.burstCount >= 8 || diagnostics.burstCount === 1; + if (shouldLogPressure && now - diagnostics.lastPressureLogAt >= 2_000) { + diagnostics.lastPressureLogAt = now; + logger.warn( + `[reconcileTeamArtifacts] team=${teamName} reason=file-watch source=${triggerSource}${triggerDetail} inFlight=${concurrentAtStart} burst=${diagnostics.burstCount}` + ); + } + + const startedAt = Date.now(); + try { + const rawResult = this.getController(teamName).maintenance.reconcileArtifacts({ + reason: 'file-watch', + }) as + | { + staleKanbanEntriesRemoved?: number; + staleColumnOrderRefsRemoved?: number; + linkedCommentsCreated?: number; + } + | undefined; + const result = (rawResult ?? {}) as { + staleKanbanEntriesRemoved?: number; + staleColumnOrderRefsRemoved?: number; + linkedCommentsCreated?: number; + }; + const durationMs = Date.now() - startedAt; + if ( + durationMs >= 100 || + concurrentAtStart > 1 || + diagnostics.burstCount >= 8 || + (result.linkedCommentsCreated ?? 0) > 0 || + (result.staleKanbanEntriesRemoved ?? 0) > 0 || + (result.staleColumnOrderRefsRemoved ?? 0) > 0 + ) { + logger.warn( + `[reconcileTeamArtifacts] completed team=${teamName} reason=file-watch source=${triggerSource}${triggerDetail} durationMs=${durationMs} inFlightAtStart=${concurrentAtStart} burst=${diagnostics.burstCount} linkedCommentsCreated=${result.linkedCommentsCreated ?? 0} staleKanbanEntriesRemoved=${result.staleKanbanEntriesRemoved ?? 0} staleColumnOrderRefsRemoved=${result.staleColumnOrderRefsRemoved ?? 0}` + ); + } + } finally { + const current = this.fileWatchReconcileDiagnostics.get(teamName); + if (!current) { + return; + } + current.inFlight = Math.max(0, current.inFlight - 1); + if (current.inFlight === 0 && Date.now() - current.windowStartedAt > 30_000) { + this.fileWatchReconcileDiagnostics.delete(teamName); + } + } } private getLeadProjectDirCandidates(projectPath: string): string[] { diff --git a/src/main/services/team/TeamMemberRuntimeAdvisoryService.ts b/src/main/services/team/TeamMemberRuntimeAdvisoryService.ts index e4cda49b..851d8fad 100644 --- a/src/main/services/team/TeamMemberRuntimeAdvisoryService.ts +++ b/src/main/services/team/TeamMemberRuntimeAdvisoryService.ts @@ -1,20 +1,35 @@ import * as fs from 'fs/promises'; import type { MemberRuntimeAdvisory, ResolvedTeamMember } from '@shared/types'; +import { createLogger } from '@shared/utils/logger'; import { TeamMemberLogsFinder } from './TeamMemberLogsFinder'; const LOOKBACK_MS = 10 * 60 * 1000; const CACHE_TTL_MS = 5_000; const TAIL_BYTES = 64 * 1024; +const BATCH_WARN_MS = 200; + +const logger = createLogger('Service:TeamMemberRuntimeAdvisory'); interface CachedRuntimeAdvisory { value: MemberRuntimeAdvisory | null; expiresAt: number; } +interface CachedTeamBatchAdvisories { + membersSignature: string; + value: Map; + expiresAt: number; +} + export class TeamMemberRuntimeAdvisoryService { - private readonly cache = new Map(); + private readonly memberCache = new Map(); + private readonly teamBatchCacheByTeam = new Map(); + private readonly inFlightBatchRequests = new Map< + string, + Promise> + >(); constructor(private readonly logsFinder: TeamMemberLogsFinder = new TeamMemberLogsFinder()) {} @@ -22,38 +37,162 @@ export class TeamMemberRuntimeAdvisoryService { teamName: string, members: readonly Pick[] ): Promise> { - const advisoryEntries = await Promise.all( - members - .filter((member) => !member.removedAt) - .map(async (member) => { - const advisory = await this.getMemberAdvisory(teamName, member.name); - return advisory ? ([member.name, advisory] as const) : null; - }) - ); + const activeMembers = members.filter((member) => !member.removedAt); + if (activeMembers.length === 0) { + return new Map(); + } - return new Map( - advisoryEntries.filter( - (entry): entry is readonly [string, MemberRuntimeAdvisory] => entry !== null - ) - ); + const teamKey = this.normalizeToken(teamName); + const membersSignature = this.buildMembersSignature(activeMembers); + const now = Date.now(); + const cachedBatch = this.teamBatchCacheByTeam.get(teamKey); + if ( + cachedBatch && + cachedBatch.membersSignature === membersSignature && + cachedBatch.expiresAt > now + ) { + return this.materializeBatchAdvisories(activeMembers, cachedBatch.value); + } + + const inFlightKey = `${teamKey}::${membersSignature}`; + const existingRequest = this.inFlightBatchRequests.get(inFlightKey); + if (existingRequest) { + return this.materializeBatchAdvisories(activeMembers, await existingRequest); + } + + const request = this.loadBatchAdvisories(teamName, teamKey, activeMembers, membersSignature); + this.inFlightBatchRequests.set(inFlightKey, request); + + try { + return this.materializeBatchAdvisories(activeMembers, await request); + } finally { + if (this.inFlightBatchRequests.get(inFlightKey) === request) { + this.inFlightBatchRequests.delete(inFlightKey); + } + } } async getMemberAdvisory( teamName: string, memberName: string ): Promise { - const cacheKey = `${teamName.toLowerCase()}::${memberName.toLowerCase()}`; - const cached = this.cache.get(cacheKey); + const cacheKey = this.getMemberCacheKey(teamName, memberName); + const cached = this.memberCache.get(cacheKey); if (cached && cached.expiresAt > Date.now()) { - return cached.value; + return cached.value ? this.cloneAdvisory(cached.value) : null; } const advisory = await this.findRecentMemberAdvisory(teamName, memberName); - this.cache.set(cacheKey, { + this.memberCache.set(cacheKey, { value: advisory, expiresAt: Date.now() + CACHE_TTL_MS, }); - return advisory; + return advisory ? this.cloneAdvisory(advisory) : null; + } + + private async loadBatchAdvisories( + teamName: string, + teamKey: string, + activeMembers: readonly Pick[], + membersSignature: string + ): Promise> { + const startedAt = performance.now(); + const now = Date.now(); + const result = new Map(); + const membersToFetch: string[] = []; + let memberCacheHits = 0; + let memberCacheMisses = 0; + + for (const member of activeMembers) { + const normalizedMemberName = this.normalizeToken(member.name); + const cacheKey = `${teamKey}::${normalizedMemberName}`; + const cached = this.memberCache.get(cacheKey); + if (cached && cached.expiresAt > now) { + memberCacheHits += 1; + if (cached.value) { + result.set(normalizedMemberName, this.cloneAdvisory(cached.value)); + } + continue; + } + + memberCacheMisses += 1; + membersToFetch.push(member.name); + } + + if (membersToFetch.length > 0) { + const fetched = await Promise.all( + membersToFetch.map(async (memberName) => { + const advisory = await this.findRecentMemberAdvisory(teamName, memberName); + return [memberName, advisory] as const; + }) + ); + const fetchedAt = Date.now(); + for (const [memberName, advisory] of fetched) { + const normalizedMemberName = this.normalizeToken(memberName); + this.memberCache.set(`${teamKey}::${normalizedMemberName}`, { + value: advisory, + expiresAt: fetchedAt + CACHE_TTL_MS, + }); + if (advisory) { + result.set(normalizedMemberName, this.cloneAdvisory(advisory)); + } + } + } + + this.teamBatchCacheByTeam.set(teamKey, { + membersSignature, + value: this.cloneNormalizedAdvisories(result), + expiresAt: Date.now() + CACHE_TTL_MS, + }); + + const totalMs = performance.now() - startedAt; + if (totalMs >= BATCH_WARN_MS) { + logger.warn( + `[perf] getMemberAdvisories slow team=${teamName} activeMembers=${activeMembers.length} signatureMembers=${activeMembers.length} batchCache=miss memberCacheHits=${memberCacheHits} memberCacheMisses=${memberCacheMisses} fetchedMembers=${membersToFetch.length} total=${totalMs.toFixed(1)}ms` + ); + } + + return result; + } + + private getMemberCacheKey(teamName: string, memberName: string): string { + return `${this.normalizeToken(teamName)}::${this.normalizeToken(memberName)}`; + } + + private buildMembersSignature(members: readonly Pick[]): string { + return Array.from(new Set(members.map((member) => this.normalizeToken(member.name)))) + .sort() + .join('|'); + } + + private normalizeToken(value: string): string { + return value.trim().toLowerCase(); + } + + private cloneAdvisory(advisory: MemberRuntimeAdvisory): MemberRuntimeAdvisory { + return { ...advisory }; + } + + private cloneNormalizedAdvisories( + advisories: ReadonlyMap + ): Map { + return new Map( + Array.from(advisories, ([memberName, advisory]) => [memberName, this.cloneAdvisory(advisory)]) + ); + } + + private materializeBatchAdvisories( + activeMembers: readonly Pick[], + advisories: ReadonlyMap + ): Map { + const materialized = new Map(); + for (const member of activeMembers) { + const advisory = advisories.get(this.normalizeToken(member.name)); + if (advisory) { + materialized.set(member.name, this.cloneAdvisory(advisory)); + } + } + return materialized; } private async findRecentMemberAdvisory( diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index da7051e1..1a4e6a01 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -719,6 +719,38 @@ interface PromptSizeSummary { const MEMBER_LAUNCH_GRACE_MS = 90_000; +export function shouldWarnOnUnreadableMemberAuditConfig(params: { + nowMs: number; + lastWarnAt: number; + expectedMembers: readonly string[]; + memberSpawnStatuses: ReadonlyMap< + string, + Pick | undefined + >; +}): boolean { + const { nowMs, lastWarnAt, expectedMembers, memberSpawnStatuses } = params; + if (nowMs - lastWarnAt < MEMBER_SPAWN_AUDIT_WARNING_THROTTLE_MS) { + return false; + } + return expectedMembers.some((memberName) => { + const current = memberSpawnStatuses.get(memberName); + if (!current?.agentToolAccepted || typeof current.firstSpawnAcceptedAt !== 'string') { + return false; + } + const acceptedAtMs = Date.parse(current.firstSpawnAcceptedAt); + return Number.isFinite(acceptedAtMs) && nowMs - acceptedAtMs >= MEMBER_LAUNCH_GRACE_MS; + }); +} + +export function shouldWarnOnMissingRegisteredMember(params: { + nowMs: number; + lastWarnAt: number; + graceExpired: boolean; +}): boolean { + const { nowMs, lastWarnAt, graceExpired } = params; + return graceExpired && nowMs - lastWarnAt >= MEMBER_SPAWN_AUDIT_WARNING_THROTTLE_MS; +} + function nowIso(): string { return new Date().toISOString(); } @@ -3557,7 +3589,9 @@ export class TeamProvisioningService { } } catch (error) { if ((error as NodeJS.ErrnoException).code === 'ENOENT') { - throw new Error(`Working directory does not exist: ${cwd}`); + // Allow the runtime probe to degrade a missing cwd into a warning. + // This keeps prepareForProvisioning side-effect free for future/missing paths. + return; } throw error; } @@ -6258,8 +6292,12 @@ export class TeamProvisioningService { if (!registeredNames) { const now = Date.now(); if ( - now - run.lastMemberSpawnAuditConfigReadWarningAt >= - MEMBER_SPAWN_AUDIT_WARNING_THROTTLE_MS + shouldWarnOnUnreadableMemberAuditConfig({ + nowMs: now, + lastWarnAt: run.lastMemberSpawnAuditConfigReadWarningAt, + expectedMembers: run.expectedMembers, + memberSpawnStatuses: run.memberSpawnStatuses, + }) ) { run.lastMemberSpawnAuditConfigReadWarningAt = now; logger.warn(`[${run.teamName}] auditMemberSpawnStatuses: config.json not readable`); @@ -6318,7 +6356,13 @@ export class TeamProvisioningService { const now = Date.now(); const lastWarnAt = run.lastMemberSpawnAuditMissingWarningAt.get(expected) ?? 0; - if (now - lastWarnAt >= MEMBER_SPAWN_AUDIT_WARNING_THROTTLE_MS) { + if ( + shouldWarnOnMissingRegisteredMember({ + nowMs: now, + lastWarnAt, + graceExpired, + }) + ) { run.lastMemberSpawnAuditMissingWarningAt.set(expected, now); logger.warn( `[${run.teamName}] Member "${expected}" not found in config.json members after provisioning` @@ -6413,7 +6457,8 @@ export class TeamProvisioningService { private getFailedSpawnMembers( run: ProvisioningRun ): { name: string; error?: string; updatedAt: string }[] { - return [...run.memberSpawnStatuses.entries()] + const memberSpawnStatuses = run.memberSpawnStatuses ?? new Map(); + return [...memberSpawnStatuses.entries()] .filter(([, entry]) => entry.launchState === 'failed_to_start') .map(([name, entry]) => ({ name, @@ -6429,12 +6474,14 @@ export class TeamProvisioningService { failedCount: number; runtimeAlivePendingCount: number; } { + const expectedMembers = run.expectedMembers ?? []; + const memberSpawnStatuses = run.memberSpawnStatuses ?? new Map(); let confirmedCount = 0; let pendingCount = 0; let failedCount = 0; let runtimeAlivePendingCount = 0; - for (const expected of run.expectedMembers) { - const entry = run.memberSpawnStatuses.get(expected) ?? createInitialMemberSpawnStatusEntry(); + for (const expected of expectedMembers) { + const entry = memberSpawnStatuses.get(expected) ?? createInitialMemberSpawnStatusEntry(); if (entry.launchState === 'confirmed_alive') { confirmedCount += 1; continue; @@ -9093,7 +9140,7 @@ export class TeamProvisioningService { launchSummary.pendingCount - launchSummary.runtimeAlivePendingCount ); const hasPendingBootstrap = - !hasSpawnFailures && stillStartingCount > 0 && run.expectedMembers.length > 0; + !hasSpawnFailures && stillStartingCount > 0 && (run.expectedMembers?.length ?? 0) > 0; const readyMessage = hasSpawnFailures ? `Launch completed with teammate errors — ${failedSpawnMembers .map((member) => member.name) diff --git a/src/main/services/team/TeamReconcileDrainScheduler.ts b/src/main/services/team/TeamReconcileDrainScheduler.ts new file mode 100644 index 00000000..0861ae10 --- /dev/null +++ b/src/main/services/team/TeamReconcileDrainScheduler.ts @@ -0,0 +1,93 @@ +import { yieldToEventLoop } from '@main/utils/asyncYield'; + +export interface TeamReconcileTrigger { + source: 'inbox' | 'task'; + detail: string; +} + +interface TeamReconcileDrainState { + running: boolean; + pending: boolean; + lastTrigger: TeamReconcileTrigger | null; +} + +export interface TeamReconcileDrainScheduler { + schedule(teamName: string, trigger: TeamReconcileTrigger): void; + dispose(): void; +} + +export function createTeamReconcileDrainScheduler(options: { + run: (teamName: string, trigger: TeamReconcileTrigger) => Promise; +}): TeamReconcileDrainScheduler { + const states = new Map(); + let disposed = false; + + const drainTeam = async (teamName: string): Promise => { + const state = states.get(teamName); + if (!state || state.running || disposed) { + return; + } + + state.running = true; + let failed = false; + + try { + while (!disposed && state.pending) { + state.pending = false; + const trigger = state.lastTrigger; + if (!trigger) { + break; + } + + try { + await options.run(teamName, trigger); + } catch (error) { + failed = true; + throw error; + } finally { + if (!disposed) { + await yieldToEventLoop(); + } + } + } + } finally { + state.running = false; + if (disposed || !state.pending) { + states.delete(teamName); + return; + } + + if (failed) { + void drainTeam(teamName).catch(() => undefined); + } + } + }; + + return { + schedule(teamName: string, trigger: TeamReconcileTrigger): void { + if (disposed) { + return; + } + + const state = states.get(teamName) ?? { + running: false, + pending: false, + lastTrigger: null, + }; + state.pending = true; + state.lastTrigger = trigger; + states.set(teamName, state); + + if (state.running) { + return; + } + + void drainTeam(teamName).catch(() => undefined); + }, + + dispose(): void { + disposed = true; + states.clear(); + }, + }; +} diff --git a/src/renderer/components/team/ClaudeLogsSection.tsx b/src/renderer/components/team/ClaudeLogsSection.tsx index 8c9a6d27..ac61b2a4 100644 --- a/src/renderer/components/team/ClaudeLogsSection.tsx +++ b/src/renderer/components/team/ClaudeLogsSection.tsx @@ -1,4 +1,4 @@ -import { useMemo, useState } from 'react'; +import { memo, useMemo, useState } from 'react'; import { Button } from '@renderer/components/ui/button'; import { Tooltip, TooltipContent, TooltipTrigger } from '@renderer/components/ui/tooltip'; @@ -69,12 +69,12 @@ const LogPreviewInline = ({ preview }: { preview: LastLogPreview }): React.JSX.E // Main component // ============================================================================= -export const ClaudeLogsSection = ({ +export const ClaudeLogsSection = memo(function ClaudeLogsSection({ teamName, position = 'inline', sidebarViewerMaxHeight, onOpenChange, -}: ClaudeLogsSectionProps): React.JSX.Element => { +}: ClaudeLogsSectionProps): React.JSX.Element { const ctrl = useClaudeLogsController(teamName); const [dialogOpen, setDialogOpen] = useState(false); @@ -151,4 +151,4 @@ export const ClaudeLogsSection = ({ ); -}; +}); diff --git a/src/renderer/components/team/MemberBadge.tsx b/src/renderer/components/team/MemberBadge.tsx index 3a6ea00f..6ac4efda 100644 --- a/src/renderer/components/team/MemberBadge.tsx +++ b/src/renderer/components/team/MemberBadge.tsx @@ -12,6 +12,8 @@ import { MemberHoverCard } from './members/MemberHoverCard'; interface MemberBadgeProps { name: string; color?: string; + /** Owning team context for hover-card store lookups. */ + teamName?: string; /** Avatar + badge size variant */ size?: 'xs' | 'sm' | 'md'; /** Hide the avatar icon, show only the name badge */ @@ -30,6 +32,7 @@ interface MemberBadgeProps { export const MemberBadge = ({ name, color, + teamName, size = 'sm', hideAvatar, onClick, @@ -93,7 +96,7 @@ export const MemberBadge = ({ } return ( - + {content} ); diff --git a/src/renderer/components/team/ProcessesSection.tsx b/src/renderer/components/team/ProcessesSection.tsx index ff6bd0e5..2316fafc 100644 --- a/src/renderer/components/team/ProcessesSection.tsx +++ b/src/renderer/components/team/ProcessesSection.tsx @@ -1,9 +1,12 @@ -import { useStore } from '@renderer/store'; +import { memo } from 'react'; + import { formatDistanceToNowStrict } from 'date-fns'; import { ExternalLink, Square, Terminal } from 'lucide-react'; import { MemberBadge } from './MemberBadge'; +import type { ResolvedTeamMember, TeamProcess } from '@shared/types'; + function formatShortTime(date: Date): string { const distance = formatDistanceToNowStrict(date, { addSuffix: false }); return distance @@ -23,15 +26,61 @@ function formatShortTime(date: Date): string { .replace(' year', 'y'); } -export const ProcessesSection = (): React.JSX.Element | null => { - const teamName = useStore((s) => s.selectedTeamName); - const data = useStore((s) => s.selectedTeamData); +interface ProcessesSectionProps { + teamName: string; + members: ResolvedTeamMember[]; + processes: TeamProcess[]; +} - if (!teamName || !data?.processes?.length) return null; +function areMembersEquivalent( + left: readonly ResolvedTeamMember[], + right: readonly ResolvedTeamMember[] +): boolean { + if (left === right) return true; + if (left.length !== right.length) return false; + for (let index = 0; index < left.length; index += 1) { + if (left[index].name !== right[index].name || left[index].color !== right[index].color) { + return false; + } + } + return true; +} - const memberColorMap = new Map(data.members.map((m) => [m.name, m.color])); +function areProcessesEquivalent( + left: readonly TeamProcess[], + right: readonly TeamProcess[] +): boolean { + if (left === right) return true; + if (left.length !== right.length) return false; + for (let index = 0; index < left.length; index += 1) { + const leftProcess = left[index]; + const rightProcess = right[index]; + if ( + leftProcess.id !== rightProcess.id || + leftProcess.port !== rightProcess.port || + leftProcess.url !== rightProcess.url || + leftProcess.label !== rightProcess.label || + leftProcess.pid !== rightProcess.pid || + leftProcess.registeredBy !== rightProcess.registeredBy || + leftProcess.registeredAt !== rightProcess.registeredAt || + leftProcess.stoppedAt !== rightProcess.stoppedAt + ) { + return false; + } + } + return true; +} - const sorted = [...data.processes].sort((a, b) => { +export const ProcessesSection = memo(function ProcessesSection({ + teamName, + members, + processes, +}: ProcessesSectionProps): React.JSX.Element | null { + if (!teamName || processes.length === 0) return null; + + const memberColorMap = new Map(members.map((m) => [m.name, m.color])); + + const sorted = [...processes].sort((a, b) => { const aAlive = !a.stoppedAt; const bAlive = !b.stoppedAt; if (aAlive !== bAlive) return aAlive ? -1 : 1; @@ -119,6 +168,7 @@ export const ProcessesSection = (): React.JSX.Element | null => { )} {timeStr} @@ -128,4 +178,15 @@ export const ProcessesSection = (): React.JSX.Element | null => { })} ); -}; +}, areProcessesSectionPropsEqual); + +function areProcessesSectionPropsEqual( + prev: Readonly, + next: Readonly +): boolean { + return ( + prev.teamName === next.teamName && + areMembersEquivalent(prev.members, next.members) && + areProcessesEquivalent(prev.processes, next.processes) + ); +} diff --git a/src/renderer/components/team/TaskTooltip.tsx b/src/renderer/components/team/TaskTooltip.tsx index 2c6788fc..a7773523 100644 --- a/src/renderer/components/team/TaskTooltip.tsx +++ b/src/renderer/components/team/TaskTooltip.tsx @@ -70,7 +70,12 @@ export const TaskTooltip = ({ side = 'top', }: TaskTooltipProps): React.JSX.Element => { const selectedTeamName = useStore((s) => s.selectedTeamName); - const selectedTeamData = useStore((s) => s.selectedTeamData); + const selectedTeamData = useStore((s) => { + if (teamName) { + return s.selectedTeamName === teamName ? s.selectedTeamData : null; + } + return s.selectedTeamData; + }); const globalTasks = useStore((s) => s.globalTasks); const teamByName = useStore((s) => s.teamByName); @@ -161,7 +166,11 @@ export const TaskTooltip = ({ {/* Owner */} {task.owner && members.length > 0 ? ( - + ) : task.owner ? ( {task.owner} ) : ( diff --git a/src/renderer/components/team/TeamDetailView.tsx b/src/renderer/components/team/TeamDetailView.tsx index b147ce49..15255d28 100644 --- a/src/renderer/components/team/TeamDetailView.tsx +++ b/src/renderer/components/team/TeamDetailView.tsx @@ -32,6 +32,7 @@ import { type TaskChangeRequestOptions, } from '@renderer/utils/taskChangeRequest'; import { stripAgentBlocks } from '@shared/constants/agentBlocks'; +import { createLogger } from '@shared/utils/logger'; import { isLeadAgentType, isLeadMember } from '@shared/utils/leadDetection'; import { deriveTaskDisplayId, formatTaskDisplayLabel } from '@shared/utils/taskIdentity'; import { @@ -87,6 +88,10 @@ import { TeamSidebarRail } from './sidebar/TeamSidebarRail'; import { ClaudeLogsSection } from './ClaudeLogsSection'; import { CollapsibleTeamSection } from './CollapsibleTeamSection'; import { ProcessesSection } from './ProcessesSection'; +import { + isLeadSessionMissing, + shouldSuppressMissingLeadSessionFetch, +} from './teamSessionFetchGuards'; import { TeamProvisioningBanner } from './TeamProvisioningBanner'; import { TeamSessionsSection } from './TeamSessionsSection'; @@ -117,6 +122,13 @@ interface CreateTaskDialogState { defaultChip?: InlineChip; } +const logger = createLogger('Component:TeamDetailView'); +const TEAM_DETAIL_COMMIT_WARN_MS = 32; +const TEAM_DETAIL_RENDER_BURST_WINDOW_MS = 4_000; +const TEAM_DETAIL_RENDER_BURST_WARN_COUNT = 8; +const TEAM_DETAIL_RENDER_WARN_THROTTLE_MS = 2_000; +const TEAM_PENDING_REPLY_REFRESH_DELAY_MS = 10_000; + function areResolvedMembersEqual( prev: readonly ResolvedTeamMember[], next: readonly ResolvedTeamMember[] @@ -140,7 +152,12 @@ function areResolvedMembersEqual( prevMember.effort !== nextMember.effort || prevMember.cwd !== nextMember.cwd || prevMember.gitBranch !== nextMember.gitBranch || - prevMember.removedAt !== nextMember.removedAt + prevMember.removedAt !== nextMember.removedAt || + prevMember.runtimeAdvisory?.kind !== nextMember.runtimeAdvisory?.kind || + prevMember.runtimeAdvisory?.observedAt !== nextMember.runtimeAdvisory?.observedAt || + prevMember.runtimeAdvisory?.retryUntil !== nextMember.runtimeAdvisory?.retryUntil || + prevMember.runtimeAdvisory?.retryDelayMs !== nextMember.runtimeAdvisory?.retryDelayMs || + prevMember.runtimeAdvisory?.message !== nextMember.runtimeAdvisory?.message ) { return false; } @@ -189,6 +206,13 @@ export const TeamDetailView = ({ teamName, isPaneFocused = false, }: TeamDetailViewProps): React.JSX.Element => { + const renderStartedAtRef = useRef(performance.now()); + const renderDiagnosticsRef = useRef({ + windowStartedAt: Date.now(), + count: 0, + lastWarnAt: 0, + }); + renderStartedAtRef.current = performance.now(); const { isLight } = useTheme(); const [requestChangesTaskId, setRequestChangesTaskId] = useState(null); const [selectedTask, setSelectedTask] = useState(null); @@ -212,6 +236,7 @@ export const TeamDetailView = ({ const contentRef = useRef(null); const provisioningBannerRef = useRef(null); const wasProvisioningRef = useRef(false); + const pendingReplyRefreshTimerRef = useRef(null); // Set inert on background content when editor/graph overlay is open (a11y focus trap) useEffect(() => { @@ -405,6 +430,7 @@ export const TeamDetailView = ({ const [sessions, setSessions] = useState([]); const [sessionsLoading, setSessionsLoading] = useState(false); const [sessionsError, setSessionsError] = useState(null); + const missingLeadSessionFetchKeyRef = useRef(null); const [kanbanFilter, setKanbanFilter] = useState({ sessionId: null, selectedOwners: new Set(), @@ -418,7 +444,6 @@ export const TeamDetailView = ({ error, projects, repositoryGroups, - teams, fetchSessionDetail, initTabUIState, selectTeam, @@ -444,7 +469,7 @@ export const TeamDetailView = ({ provisioningError, clearProvisioningError, isTeamProvisioning, - leadActivityByTeam, + leadActivity, leadContextUpdatedAt, memberSpawnStatuses, fetchMemberSpawnStatuses, @@ -464,12 +489,8 @@ export const TeamDetailView = ({ setSidebarLogsHeight, } = useStore( useShallow((s) => ({ - data: s.selectedTeamData, - loading: s.selectedTeamLoading, - error: s.selectedTeamError, projects: s.projects, repositoryGroups: s.repositoryGroups, - teams: s.teams, fetchSessionDetail: s.fetchSessionDetail, initTabUIState: s.initTabUIState, selectTeam: s.selectTeam, @@ -495,7 +516,10 @@ export const TeamDetailView = ({ provisioningError: teamName ? (s.provisioningErrorByTeam[teamName] ?? null) : null, clearProvisioningError: s.clearProvisioningError, isTeamProvisioning: teamName ? isTeamProvisioningActive(s, teamName) : false, - leadActivityByTeam: s.leadActivityByTeam, + data: s.selectedTeamName === teamName ? s.selectedTeamData : null, + loading: s.selectedTeamName === teamName ? s.selectedTeamLoading : false, + error: s.selectedTeamName === teamName ? s.selectedTeamError : null, + leadActivity: teamName ? s.leadActivityByTeam[teamName] : undefined, leadContextUpdatedAt: teamName ? s.leadContextByTeam[teamName]?.updatedAt : undefined, memberSpawnStatuses: teamName ? s.memberSpawnStatusesByTeam[teamName] : undefined, fetchMemberSpawnStatuses: s.fetchMemberSpawnStatuses, @@ -528,6 +552,39 @@ export const TeamDetailView = ({ const isThisTabActive = tabId ? activeTabId === tabId : false; const [isContextButtonHovered, setIsContextButtonHovered] = useState(false); + useEffect(() => { + const now = Date.now(); + const diagnostic = renderDiagnosticsRef.current; + if (now - diagnostic.windowStartedAt > TEAM_DETAIL_RENDER_BURST_WINDOW_MS) { + diagnostic.windowStartedAt = now; + diagnostic.count = 0; + } + diagnostic.count += 1; + + const commitMs = performance.now() - renderStartedAtRef.current; + const messagesCount = data?.messages.length ?? 0; + const tasksCount = data?.tasks.length ?? 0; + const membersCount = data?.members.length ?? 0; + const processesCount = data?.processes.length ?? 0; + const shouldWarnSlow = commitMs >= TEAM_DETAIL_COMMIT_WARN_MS; + const shouldWarnBurst = diagnostic.count >= TEAM_DETAIL_RENDER_BURST_WARN_COUNT; + const shouldWarnLarge = messagesCount >= 150 || tasksCount >= 80; + + if ( + (shouldWarnSlow || shouldWarnBurst || shouldWarnLarge) && + now - diagnostic.lastWarnAt >= TEAM_DETAIL_RENDER_WARN_THROTTLE_MS + ) { + diagnostic.lastWarnAt = now; + logger.warn( + `[perf] commit team=${teamName} ms=${commitMs.toFixed(1)} renders=${diagnostic.count} windowMs=${ + now - diagnostic.windowStartedAt + } activeTab=${isThisTabActive ? 'yes' : 'no'} paneFocused=${isPaneFocused ? 'yes' : 'no'} loading=${ + loading ? 'yes' : 'no' + } messages=${messagesCount} tasks=${tasksCount} members=${membersCount} processes=${processesCount} panel=${messagesPanelMode}` + ); + } + }); + // Messages panel resize const { isResizing: isMessagesPanelResizing, handleProps: messagesPanelHandleProps } = useResizablePanel({ @@ -565,7 +622,6 @@ export const TeamDetailView = ({ // Fetch initial spawn statuses when provisioning starts useEffect(() => { - const leadActivity = teamName ? leadActivityByTeam[teamName] : undefined; const shouldFetchSpawnStatuses = Boolean(teamName) && (isTeamProvisioning || @@ -578,7 +634,7 @@ export const TeamDetailView = ({ data?.isAlive, fetchMemberSpawnStatuses, isTeamProvisioning, - leadActivityByTeam, + leadActivity, memberSpawnStatuses, teamName, ]); @@ -644,12 +700,13 @@ export const TeamDetailView = ({ useEffect(() => { if (!launchDialogOpen) return; let cancelled = false; + const teamsSnapshot = useStore.getState().teams; void (async () => { try { const aliveList = await api.teams.aliveList(); if (cancelled) return; const aliveSet = new Set(aliveList); - const refs = teams + const refs = teamsSnapshot .filter((t) => aliveSet.has(t.teamName) && t.projectPath) .map((t) => ({ teamName: t.teamName, @@ -664,7 +721,7 @@ export const TeamDetailView = ({ return () => { cancelled = true; }; - }, [launchDialogOpen, teams]); + }, [launchDialogOpen]); useEffect(() => { if (kanbanFilterQuery) { @@ -673,9 +730,22 @@ export const TeamDetailView = ({ } }, [kanbanFilterQuery, clearKanbanFilter]); - const currentTeamSummary = useMemo( - () => teams.find((team) => team.teamName === teamName) ?? null, - [teams, teamName] + const currentTeamSummary = useStore( + useShallow((s) => { + const team = teamName ? s.teamByName[teamName] : undefined; + if (!team) return null; + return { + displayName: team.displayName, + projectPath: team.projectPath, + memberCount: team.memberCount, + expectedMemberCount: team.expectedMemberCount, + confirmedCount: team.confirmedCount, + runtimeAlivePendingCount: team.runtimeAlivePendingCount, + teamLaunchState: team.teamLaunchState, + partialLaunchFailure: team.partialLaunchFailure, + missingMemberCount: team.missingMembers?.length ?? 0, + }; + }) ); // Load sessions for the team's project @@ -695,6 +765,14 @@ export const TeamDetailView = ({ const leadSessionLoaded = Boolean( leadSessionId && leadSessionDetail?.session?.id === leadSessionId ); + const sessionHistoryKey = useMemo( + () => (data?.config.sessionHistory ?? []).join('|'), + [data?.config.sessionHistory] + ); + const missingLeadSessionFetchKey = useMemo( + () => `${teamName}:${projectId ?? ''}:${leadSessionId ?? ''}:${sessionHistoryKey}`, + [teamName, projectId, leadSessionId, sessionHistoryKey] + ); const leadSubagentCostUsd = useMemo(() => { const processes = leadSessionDetail?.processes; @@ -773,6 +851,10 @@ export const TeamDetailView = ({ [visibleContextTokens, lastAiGroupTotalTokens] ); + useEffect(() => { + missingLeadSessionFetchKeyRef.current = null; + }, [missingLeadSessionFetchKey]); + // Keep lead-session context fresh in the background while the team tab is active. // This keeps the button value current even when the panel is closed. // For offline teams: fetch once on mount so the percentage shows immediately. @@ -781,32 +863,76 @@ export const TeamDetailView = ({ if (!isThisTabActive) return; if (!tabId || !projectId || !leadSessionId) return; - void fetchSessionDetail(projectId, leadSessionId, tabId, { silent: true }); + const leadSessionMissing = isLeadSessionMissing({ + leadSessionId, + projectId, + sessionsLoading, + knownSessions: sessions, + }); + if (leadSessionMissing) { + missingLeadSessionFetchKeyRef.current = missingLeadSessionFetchKey; + return; + } + + const fetchLeadSessionDetail = () => { + const suppressRepeatedFetch = shouldSuppressMissingLeadSessionFetch({ + leadSessionId, + projectId, + sessionsLoading, + knownSessions: sessions, + suppressionKey: missingLeadSessionFetchKeyRef.current, + currentKey: missingLeadSessionFetchKey, + }); + if (suppressRepeatedFetch) { + return; + } + void fetchSessionDetail(projectId, leadSessionId, tabId, { silent: true }); + }; + + fetchLeadSessionDetail(); if (!data?.isAlive) return; const id = window.setInterval(() => { - void fetchSessionDetail(projectId, leadSessionId, tabId, { silent: true }); + fetchLeadSessionDetail(); }, 10_000); return () => window.clearInterval(id); - }, [isThisTabActive, tabId, projectId, leadSessionId, data?.isAlive, fetchSessionDetail]); + }, [ + isThisTabActive, + tabId, + projectId, + leadSessionId, + data?.isAlive, + fetchSessionDetail, + sessions, + sessionsLoading, + missingLeadSessionFetchKey, + ]); // Keep team message state fresh while we are explicitly waiting for a reply. - // Direct lead replies land in the lead session JSONL first; without a light - // refresh loop here, the Messages panel can keep showing stale "awaiting reply" - // state even though the lead has already answered. + // Use a delayed single-shot refresh instead of a tight polling loop so we + // don't keep rewriting the whole team snapshot every 2 seconds. useEffect(() => { + if (pendingReplyRefreshTimerRef.current != null) { + window.clearTimeout(pendingReplyRefreshTimerRef.current); + pendingReplyRefreshTimerRef.current = null; + } + if (!isThisTabActive) return; - if (!data) return; + if (!data?.isAlive) return; if (Object.keys(pendingRepliesByMember).length === 0) return; - void refreshTeamData(teamName); + pendingReplyRefreshTimerRef.current = window.setTimeout(() => { + pendingReplyRefreshTimerRef.current = null; + void refreshTeamData(teamName, { withDedup: true }); + }, TEAM_PENDING_REPLY_REFRESH_DELAY_MS); - const id = window.setInterval(() => { - void refreshTeamData(teamName); - }, 2_000); - - return () => window.clearInterval(id); + return () => { + if (pendingReplyRefreshTimerRef.current != null) { + window.clearTimeout(pendingReplyRefreshTimerRef.current); + pendingReplyRefreshTimerRef.current = null; + } + }; }, [isThisTabActive, data, pendingRepliesByMember, refreshTeamData, teamName]); useEffect(() => { @@ -933,25 +1059,25 @@ export const TeamDetailView = ({ ); const taskMap = useMemo(() => new Map((data?.tasks ?? []).map((t) => [t.id, t])), [data?.tasks]); + const taskMapRef = useRef(taskMap); + taskMapRef.current = taskMap; const memberTaskCounts = useMemo(() => buildTaskCountsByOwner(data?.tasks ?? []), [data?.tasks]); - const openCreateTaskDialog = ( - subject = '', - description = '', - owner = '', - startImmediately?: boolean - ): void => { - setCreateTaskDialog({ - open: true, - defaultSubject: subject, - defaultDescription: description, - defaultOwner: owner, - defaultStartImmediately: startImmediately, - }); - }; + const openCreateTaskDialog = useCallback( + (subject = '', description = '', owner = '', startImmediately?: boolean): void => { + setCreateTaskDialog({ + open: true, + defaultSubject: subject, + defaultDescription: description, + defaultOwner: owner, + defaultStartImmediately: startImmediately, + }); + }, + [] + ); - const closeCreateTaskDialog = (): void => { + const closeCreateTaskDialog = useCallback((): void => { setCreateTaskDialog({ open: false, defaultSubject: '', @@ -959,7 +1085,7 @@ export const TeamDetailView = ({ defaultOwner: '', defaultStartImmediately: undefined, }); - }; + }, []); const handleCreateTaskFromMessage = useCallback((subject: string, description: string) => { openCreateTaskDialog(subject, description); @@ -977,6 +1103,36 @@ export const TeamDetailView = ({ setLaunchDialogOpen(true); }, []); + const handleSelectMember = useCallback((member: ResolvedTeamMember) => { + setSelectedMember(member); + }, []); + + const handleSendMessageToMember = useCallback((member: ResolvedTeamMember) => { + setSendDialogRecipient(member.name); + setSendDialogDefaultText(undefined); + setSendDialogDefaultChip(undefined); + setReplyQuote(undefined); + setSendDialogOpen(true); + }, []); + + const handleAssignTaskToMember = useCallback( + (member: ResolvedTeamMember) => { + openCreateTaskDialog('', '', member.name); + }, + [openCreateTaskDialog] + ); + + const handleOpenTaskById = useCallback((taskId: string) => { + const task = taskMapRef.current.get(taskId); + if (task) { + setSelectedTask(task); + } + }, []); + + const handleOpenTask = useCallback((task: TeamTaskWithKanban) => { + setSelectedTask(task); + }, []); + const handleTaskIdClick = useCallback( (taskId: string) => { const task = @@ -1172,6 +1328,50 @@ export const TeamDetailView = ({ })(); }; + const sharedMessagesPanelProps = useMemo( + () => ({ + teamName, + onTogglePosition: toggleMessagesPanelMode, + members: activeMembers, + tasks: data?.tasks ?? [], + messages: data?.messages ?? [], + isTeamAlive: data?.isAlive, + leadActivity, + leadContextUpdatedAt, + timeWindow, + teamSessionIds, + currentLeadSessionId: data?.config.leadSessionId, + pendingRepliesByMember, + onPendingReplyChange: setPendingRepliesByMember, + onMemberClick: handleSelectMember, + onTaskClick: handleOpenTask, + onCreateTaskFromMessage: handleCreateTaskFromMessage, + onReplyToMessage: handleReplyToMessage, + onRestartTeam: handleRestartTeam, + onTaskIdClick: handleTaskIdClick, + }), + [ + activeMembers, + data?.config.leadSessionId, + data?.isAlive, + data?.messages, + data?.tasks, + handleCreateTaskFromMessage, + handleOpenTask, + handleReplyToMessage, + handleRestartTeam, + handleSelectMember, + handleTaskIdClick, + leadActivity, + leadContextUpdatedAt, + pendingRepliesByMember, + teamName, + teamSessionIds, + timeWindow, + toggleMessagesPanelMode, + ] + ); + if (!teamName) { return (
@@ -1197,7 +1397,6 @@ export const TeamDetailView = ({ } if (error === 'TEAM_DRAFT') { - const teamSummary = teams.find((t) => t.teamName === teamName); return ( <>
@@ -1208,10 +1407,11 @@ export const TeamDetailView = ({

Team not launched yet

- This is a draft team — {teamSummary?.displayName || teamName} has - been configured with {teamSummary?.memberCount ?? 0} member - {teamSummary?.memberCount === 1 ? '' : 's'} but hasn't been provisioned by CLI - yet. Click Launch to select a model and start the team. + This is a draft team —{' '} + {currentTeamSummary?.displayName || teamName} has been configured + with {currentTeamSummary?.memberCount ?? 0} member + {currentTeamSummary?.memberCount === 1 ? '' : 's'} but hasn't been provisioned + by CLI yet. Click Launch to select a model and start the team.