From 9a8a59757ccaa2ea3fa557fdbb873c879bfb6a5f Mon Sep 17 00:00:00 2001 From: 777genius Date: Thu, 14 May 2026 09:51:29 +0300 Subject: [PATCH] fix: harden opencode delivery e2e flows --- docs/team-management/README.md | 1 + .../adaptive-task-graphs-research-note.md | 181 ++++++ .../services/team/TeamProvisioningService.ts | 44 +- ...enCodeStateChangingBridgeCommandService.ts | 54 +- .../runtime/OpenCodeTeamRuntimeAdapter.ts | 8 +- .../components/team/TeamDetailView.tsx | 54 +- src/renderer/components/ui/dialog.tsx | 16 +- .../model-gauntlet-results.json | 578 ++---------------- .../model-gauntlet-results.md | 41 +- ...eStateChangingBridgeCommandService.test.ts | 65 +- .../team/OpenCodeTeamRuntimeAdapter.test.ts | 43 ++ 11 files changed, 503 insertions(+), 582 deletions(-) create mode 100644 docs/team-management/adaptive-task-graphs-research-note.md diff --git a/docs/team-management/README.md b/docs/team-management/README.md index 70954a97..c5938ee2 100644 --- a/docs/team-management/README.md +++ b/docs/team-management/README.md @@ -23,6 +23,7 @@ | [research-worktrees.md](./research-worktrees.md) | Git worktrees + teams, запуск Claude процессов из UI (Phase 2) | | [task-queue-derived-agenda-plan.md](./task-queue-derived-agenda-plan.md) | Подробный rollout-plan по разделению queue/inventory, derived actionOwner и phased agenda/delta sync | | [debugging-agent-teams.md](./debugging-agent-teams.md) | Runtime debugging runbook, включая `CLAUDE_TEAM_TEAMMATE_MODE=tmux` для pane-backed teammate debug | +| [adaptive-task-graphs-research-note.md](./adaptive-task-graphs-research-note.md) | Research note по LATTE/AgentConductor: dynamic task graphs, frontier scheduling, selective verify, release stragglers | ## Ключевые решения diff --git a/docs/team-management/adaptive-task-graphs-research-note.md b/docs/team-management/adaptive-task-graphs-research-note.md new file mode 100644 index 00000000..7b8c2ac7 --- /dev/null +++ 
b/docs/team-management/adaptive-task-graphs-research-note.md @@ -0,0 +1,181 @@ +# Adaptive Task Graphs For Agent Teams + +**Date:** 2026-05-14 +**Status:** Research note, not an approved implementation plan +**Scope:** Team Management, task graph scheduling, lead/member coordination, token and conflict reduction + +## Sources + +- [AgentConductor: Topology Evolution for Multi-Agent Competition-Level Code Generation](https://arxiv.org/abs/2602.17100) +- [Improving the Efficiency of Language Agent Teams with Adaptive Task Graphs](https://arxiv.org/html/2605.06320v1) + +## Why This Is Interesting + +These papers point at the same product problem we already see in Agent Teams: multi-agent performance is limited less by raw model capability and more by coordination overhead. + +The useful idea is not "replace our orchestrator with a research framework". The useful idea is to make the task board itself a more explicit coordination graph: + +- tasks are graph nodes +- `blockedBy` / `blocks` are dependency edges +- ready work is the graph frontier +- workers should receive scoped local context, not full team history +- stalled work should be released or reassigned explicitly +- risky or high-impact work should get selective verification +- coordination quality should be measured, not inferred from vibes + +This fits our existing direction because the product already has task dependencies, review workflow, stall monitoring, task logs, context tracking, and lead/member briefing surfaces. + +## Most Valuable Ideas To Preserve + +### 1. LATTE-style dynamic task graph + +LATTE is the more directly useful paper for us. 
+ +Core idea: + +- the lead owns global graph consistency +- workers can propose or claim local work +- structural updates are serialized through the lead or controller +- execution stays parallel where dependencies allow it +- the graph remains inspectable, so coordination decisions are visible in the UI + +Relevant operators to consider: + +- `Discover` - create a newly discovered task when implementation reveals missing work +- `Assign` - set an owner for a ready task +- `Claim` - allow an idle member to take an unowned ready task +- `Complete` - mark task completion +- `Release` - clear owner or return stalled work to the ready queue +- `Close` - close stale/completed tasks when tests or evidence prove completion +- `Verify` - insert a lightweight review/check task before downstream work proceeds + +🎯 Product value: 9/10 +🛡️ Reliability if implemented incrementally: 8/10 +🧠 Complexity: 6/10 +Expected change size for a first useful version: about 700-1400 LOC. + +### 2. Frontier-based scheduling + +The board should be able to derive "what is actionable now" from graph state: + +- a task is ready when all `blockedBy` tasks are completed or approved +- blocked tasks should not be started automatically +- ready unowned tasks can be offered to idle members +- ready owned tasks belong in the owner's operational queue +- lead briefing should show graph bottlenecks and unassigned frontier work + +This connects directly to `task-queue-derived-agenda-plan.md`. The key addition is to treat the queue as a graph frontier, not just a filtered task list. + +🎯 Product value: 9/10 +🛡️ Reliability: 8/10 +🧠 Complexity: 5/10 +Expected change size: about 500-1000 LOC if built on the current derived agenda work. + +### 3. 
Selective verification instead of reviewing everything + +LATTE's `Verify` is useful because it scales review cost with risk: + +- verify upstream tasks that many other tasks depend on +- verify work touching shared files or public contracts +- verify tasks whose owner reported uncertainty +- skip extra verification for small isolated changes unless policy requires it + +This maps well to our existing review UI and task comments. A future implementation could create a verification task or request review based on graph impact. + +🎯 Product value: 8/10 +🛡️ Reliability: 7/10 +🧠 Complexity: 5/10 +Expected change size: about 350-800 LOC. + +### 4. Straggler release as first-class behavior + +LATTE explicitly models stalled workers and `Release`. We already have task-stall monitoring, but the next step is to make release/reassign a structured board action, not only a message nudge. + +Useful behavior: + +- detect a task with weak or stale progress evidence +- notify or nudge the current owner first +- if still stalled, clear owner or reassign with context +- preserve evidence and avoid duplicate nudges +- never auto-start new runtime lanes as a side effect + +This must stay compatible with the existing OpenCode delivery watchdog and stall-monitor semantics. + +🎯 Product value: 8/10 +🛡️ Reliability: 7/10 +🧠 Complexity: 6/10 +Expected change size: about 600-1200 LOC. + +### 5. Coordination metrics as a product surface + +LATTE is especially useful because it externalizes coordination and measures failures: + +- idle rounds +- straggler tail latency +- inter-agent messages +- file conflicts or concurrent writes +- redundant output +- wasted tokens +- task graph growth and bottlenecks + +For Agent Teams, this could become a "team efficiency" diagnostic panel and a safer prerequisite before changing scheduling behavior. + +🎯 Product value: 8/10 +🛡️ Reliability: 9/10 +🧠 Complexity: 4/10 +Expected change size: about 350-800 LOC. 
+ +## AgentConductor Ideas Worth Keeping + +AgentConductor is less directly implementable because it depends on an RL/SFT-trained orchestrator and competition-code benchmarks. Still, one product idea is valuable: + +**Task difficulty should control graph density.** + +Possible lightweight version for Agent Teams: + +- easy task - solo or small graph, minimal messaging, no extra verification by default +- medium task - split by independent deliverables, use dependencies only where real ordering exists +- hard task - more explicit roles, denser review/checkpoints, stronger integration pass +- failed execution feedback - adapt the graph instead of repeating the same topology + +Do not adopt the paper's full GRPO/SFT training path for now. It is too heavy for the app and not necessary to get product value. + +🎯 Product value: 7/10 +🛡️ Reliability: 6/10 +🧠 Complexity: 7/10 +Expected change size for a heuristic MVP: about 600-1300 LOC. + +## Objectivity And Risk Notes + +The LATTE paper is directionally credible but should not be treated as production proof. + +Strong points: + +- the core claim matches practical distributed-systems intuition +- the paper compares against several coordination styles, not only one weak baseline +- it evaluates multiple collaborative task types +- it emphasizes metrics we can independently measure +- the mechanism is simple enough to port incrementally + +Limitations: + +- it is an arXiv preprint, not final production validation +- benchmark tasks are controlled research tasks, not our full Electron plus runtime matrix +- baseline implementations may not match best possible production implementations +- reported improvements should be validated against our own teams, logs, and providers + +Practical conclusion: + +⚠️ Treat LATTE as a strong design signal, not a dependency or spec. Implement the ideas gradually behind our existing task board, lead/member briefings, and runtime-specific guardrails. + +## Recommended Internal Path + +1. 
Add coordination metrics first. +2. Derive a graph frontier from current task state. +3. Make lead and member briefings use the frontier as the operational queue. +4. Add structured release/reassign for stalled work. +5. Add selective verification for high-risk graph nodes. +6. Only after that, consider difficulty-aware graph density hints. + +This ordering gives us evidence before automation. It also keeps the rollout compatible with existing `blockedBy`, review flow, task-stall monitor, OpenCode delivery watchdog, and context tracking. + diff --git a/src/main/services/team/TeamProvisioningService.ts b/src/main/services/team/TeamProvisioningService.ts index 34ff3d3a..09b4ed3f 100644 --- a/src/main/services/team/TeamProvisioningService.ts +++ b/src/main/services/team/TeamProvisioningService.ts @@ -278,7 +278,11 @@ import { isAgentTeamsToolUse } from './agentTeamsToolNames'; import { atomicWriteAsync } from './atomicWrite'; import { peekAutoResumeService } from './AutoResumeService'; import { ClaudeBinaryResolver } from './ClaudeBinaryResolver'; -import { getConfiguredCliCommandLabel } from './cliFlavor'; +import { + getCliFlavorUiOptions, + getConfiguredCliCommandLabel, + getConfiguredCliFlavor, +} from './cliFlavor'; import { withFileLock } from './fileLock'; import { type ClassifiedMainProcessIdle, @@ -993,6 +997,41 @@ function getPreflightTimeoutMs(providerId: TeamProviderId | undefined): number { return getProviderModelProbeTimeoutMs(providerId); } +function getProviderRuntimeFailureLabel(providerId: TeamProviderId): string { + switch (providerId) { + case 'anthropic': + return 'Claude CLI'; + case 'codex': + return 'Codex runtime'; + case 'gemini': + return 'Gemini runtime'; + case 'opencode': + return 'OpenCode runtime'; + } +} + +function getRunRuntimeFailureLabel(run: ProvisioningRun): string { + const providerIds = new Set(); + const addProvider = (providerId: TeamProviderId | undefined): void => { + if (providerId) { + providerIds.add(providerId); + } + 
}; + + addProvider(normalizeOptionalTeamProviderId(run.request.providerId)); + addProvider(inferTeamProviderIdFromModel(run.request.model)); + for (const member of run.request.members) { + addProvider(normalizeOptionalTeamProviderId(member.providerId)); + addProvider(inferTeamProviderIdFromModel(member.model)); + } + + if (providerIds.size === 1) { + return getProviderRuntimeFailureLabel([...providerIds][0]!); + } + + return getCliFlavorUiOptions(getConfiguredCliFlavor()).displayName; +} + function buildProviderCliCommandArgs(providerArgs: string[], args: string[]): string[] { return mergeJsonSettingsArgs([...providerArgs, ...args]); } @@ -32379,7 +32418,8 @@ export class TeamProvisioningService { } const errorText = buildCliExitError(code, run.stdoutBuffer, run.stderrBuffer); - const progress = updateProgress(run, 'failed', 'Claude CLI exited with an error', { + const runtimeFailureLabel = getRunRuntimeFailureLabel(run); + const progress = updateProgress(run, 'failed', `${runtimeFailureLabel} exited with an error`, { error: errorText, cliLogsTail: extractCliLogsFromRun(run), }); diff --git a/src/main/services/team/opencode/bridge/OpenCodeStateChangingBridgeCommandService.ts b/src/main/services/team/opencode/bridge/OpenCodeStateChangingBridgeCommandService.ts index 9cd0eab9..0736db84 100644 --- a/src/main/services/team/opencode/bridge/OpenCodeStateChangingBridgeCommandService.ts +++ b/src/main/services/team/opencode/bridge/OpenCodeStateChangingBridgeCommandService.ts @@ -17,8 +17,13 @@ import { import type { OpenCodeBridgeCommandLeaseStore, + OpenCodeBridgeCommandLease, OpenCodeBridgeCommandLedger, } from './OpenCodeBridgeCommandLedgerStore'; +import { OpenCodeBridgeCommandLeaseError } from './OpenCodeBridgeCommandLedgerStore'; + +const DEFAULT_COMMAND_LEASE_ACQUIRE_TIMEOUT_MS = 10_000; +const DEFAULT_COMMAND_LEASE_ACQUIRE_RETRY_DELAY_MS = 100; export interface OpenCodeBridgeCommandExecutor { execute( @@ -63,6 +68,8 @@ export interface 
OpenCodeStateChangingBridgeCommandServiceOptions { requestIdFactory?: () => string; diagnosticIdFactory?: () => string; clock?: () => Date; + leaseAcquireTimeoutMs?: number; + leaseAcquireRetryDelayMs?: number; } export class OpenCodeStateChangingBridgeCommandService { @@ -76,6 +83,8 @@ export class OpenCodeStateChangingBridgeCommandService { private readonly requestIdFactory: () => string; private readonly diagnosticIdFactory: () => string; private readonly clock: () => Date; + private readonly leaseAcquireTimeoutMs: number; + private readonly leaseAcquireRetryDelayMs: number; constructor(options: OpenCodeStateChangingBridgeCommandServiceOptions) { this.expectedClientIdentity = options.expectedClientIdentity; @@ -89,6 +98,10 @@ export class OpenCodeStateChangingBridgeCommandService { this.diagnosticIdFactory = options.diagnosticIdFactory ?? (() => `opencode-bridge-diagnostic-${randomUUID()}`); this.clock = options.clock ?? (() => new Date()); + this.leaseAcquireTimeoutMs = + options.leaseAcquireTimeoutMs ?? DEFAULT_COMMAND_LEASE_ACQUIRE_TIMEOUT_MS; + this.leaseAcquireRetryDelayMs = + options.leaseAcquireRetryDelayMs ?? 
DEFAULT_COMMAND_LEASE_ACQUIRE_RETRY_DELAY_MS; } async execute(input: { @@ -136,7 +149,7 @@ export class OpenCodeStateChangingBridgeCommandService { body: input.body, }); const commandRequestId = this.requestIdFactory(); - const lease = await this.leaseStore.acquire({ + const lease = await this.acquireLease({ teamName: input.teamName, laneId: normalizedLaneId, runId: input.runId, @@ -243,6 +256,37 @@ export class OpenCodeStateChangingBridgeCommandService { } } + private async acquireLease(input: { + teamName: string; + laneId: string | null; + runId: string | null; + command: OpenCodeBridgeCommandName; + ttlMs: number; + }): Promise { + const deadlineMs = Date.now() + Math.max(0, this.leaseAcquireTimeoutMs); + let lastError: unknown = null; + + do { + try { + return await this.leaseStore.acquire(input); + } catch (error) { + if ( + !(error instanceof OpenCodeBridgeCommandLeaseError) || + !isActiveOpenCodeBridgeCommandLeaseError(error) + ) { + throw error; + } + lastError = error; + if (Date.now() >= deadlineMs) { + throw error; + } + await sleep(Math.max(1, this.leaseAcquireRetryDelayMs)); + } + } while (Date.now() < deadlineMs); + + throw lastError instanceof Error ? lastError : new Error('OpenCode bridge lease unavailable'); + } + private async appendUnknownOutcomeDiagnostic(input: { result: OpenCodeBridgeResult; teamName: string; @@ -316,3 +360,11 @@ function requiresOpenCodeDeliveryAcceptanceContract( function stringifyError(error: unknown): string { return error instanceof Error ? 
error.message : String(error); } + +function isActiveOpenCodeBridgeCommandLeaseError(error: OpenCodeBridgeCommandLeaseError): boolean { + return error.message.startsWith('OpenCode bridge command lease already active:'); +} + +function sleep(delayMs: number): Promise { + return new Promise((resolve) => setTimeout(resolve, delayMs)); +} diff --git a/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts b/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts index 82cf20ee..87173b8c 100644 --- a/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts +++ b/src/main/services/team/runtime/OpenCodeTeamRuntimeAdapter.ts @@ -100,6 +100,12 @@ const SECRET_FLAG_PATTERN = const BEARER_TOKEN_PATTERN = /\bBearer\s+\S+/gi; const SECRET_KEY_PATTERN = /\bsk-[A-Za-z0-9_-]{16,}\b/g; +function resolveOpenCodeRuntimeSettlementMode( + input: Pick +): OpenCodeSendMessageCommandBody['settlementMode'] { + return input.messageKind === 'member_work_sync_nudge' ? 'observed' : 'acceptance'; +} + export class OpenCodeTeamRuntimeAdapter implements TeamLaunchRuntimeAdapter { readonly providerId = 'opencode' as const; private readonly lastProjectPathByTeamName = new Map(); @@ -334,7 +340,7 @@ export class OpenCodeTeamRuntimeAdapter implements TeamLaunchRuntimeAdapter { text: buildOpenCodeRuntimeMessageText(input), messageId: input.messageId, ...(input.deliveryAttemptId ? 
{ deliveryAttemptId: input.deliveryAttemptId } : {}), - settlementMode: 'acceptance', + settlementMode: resolveOpenCodeRuntimeSettlementMode(input), fileParts: input.fileParts, actionMode: input.actionMode, messageKind: input.messageKind, diff --git a/src/renderer/components/team/TeamDetailView.tsx b/src/renderer/components/team/TeamDetailView.tsx index 73c5d040..4e5fbc41 100644 --- a/src/renderer/components/team/TeamDetailView.tsx +++ b/src/renderer/components/team/TeamDetailView.tsx @@ -91,6 +91,7 @@ import { type MemberActivityFilter, type MemberDetailTab } from './members/membe import type { AddMemberEntry } from './dialogs/AddMemberDialog'; import type { TeamLaunchDialogMode } from './dialogs/LaunchTeamDialog'; +import type { TeamColorSet } from '@renderer/constants/teamColors'; import type { TeamMessagesPanelMode } from '@renderer/types/teamMessagesPanelMode'; import type { ComponentProps, CSSProperties, RefObject } from 'react'; @@ -449,12 +450,16 @@ const TeamLoadingSectionHeader = ({ type TeamContentLoadingSkeletonProps = Readonly<{ teamName: string; + headerColorSet: TeamColorSet; + isLight: boolean; contentRef: RefObject; provisioningBannerRef: RefObject; }>; const TeamContentLoadingSkeleton = ({ teamName, + headerColorSet, + isLight, contentRef, provisioningBannerRef, }: TeamContentLoadingSkeletonProps): React.JSX.Element => ( @@ -465,28 +470,32 @@ const TeamContentLoadingSkeleton = ({ role="status" aria-label="Loading team" > -
-
+
+
+
-
+
- -
- -
- - - +
-
+
-
- + +
+
+ + + +
+
@@ -592,6 +601,8 @@ type TeamLoadingSkeletonProps = Readonly<{ isActive: boolean | undefined; isFocused: boolean | undefined; messagesPanelMode: TeamMessagesPanelMode; + headerColorSet: TeamColorSet; + isLight: boolean; contentRef: RefObject; provisioningBannerRef: RefObject; }>; @@ -601,6 +612,8 @@ const TeamLoadingSkeleton = ({ isActive, isFocused, messagesPanelMode, + headerColorSet, + isLight, contentRef, provisioningBannerRef, }: TeamLoadingSkeletonProps): React.JSX.Element => ( @@ -618,6 +631,8 @@ const TeamLoadingSkeleton = ({
@@ -1635,6 +1650,8 @@ export const TeamDetailView = memo(function TeamDetailView({ pendingReviewRequest, setPendingReviewRequest, summaryKnownTeammateCount, + teamSummaryColor, + teamSummaryDisplayName, } = useStore( useShallow((s) => ({ projects: s.projects, @@ -1672,6 +1689,8 @@ export const TeamDetailView = memo(function TeamDetailView({ summaryKnownTeammateCount: teamName ? getSummaryKnownTeammateCount(s.teamByName[teamName]) : 0, + teamSummaryColor: teamName ? s.teamByName[teamName]?.color : undefined, + teamSummaryDisplayName: teamName ? s.teamByName[teamName]?.displayName : undefined, loading: s.selectedTeamName === teamName ? s.selectedTeamLoading : false, error: s.selectedTeamName === teamName ? s.selectedTeamError : null, refreshTeamData: s.refreshTeamData, @@ -1701,6 +1720,13 @@ export const TeamDetailView = memo(function TeamDetailView({ const tabId = useTabIdOptional(); const isThisTabActive = isActive; const wasInteractiveRef = useRef(false); + const loadingHeaderColorSet = useMemo( + () => + teamSummaryColor + ? 
getTeamColorSet(teamSummaryColor) + : nameColorSet(teamSummaryDisplayName || teamName), + [teamName, teamSummaryColor, teamSummaryDisplayName] + ); // Messages panel resize const { isResizing: isMessagesPanelResizing, handleProps: messagesPanelHandleProps } = @@ -2509,6 +2535,8 @@ export const TeamDetailView = memo(function TeamDetailView({ isActive={isThisTabActive} isFocused={isPaneFocused} messagesPanelMode={messagesPanelMode} + headerColorSet={loadingHeaderColorSet} + isLight={isLight} contentRef={contentRef} provisioningBannerRef={provisioningBannerRef} /> diff --git a/src/renderer/components/ui/dialog.tsx b/src/renderer/components/ui/dialog.tsx index b6fab3bc..be5af7ec 100644 --- a/src/renderer/components/ui/dialog.tsx +++ b/src/renderer/components/ui/dialog.tsx @@ -1,6 +1,5 @@ /* eslint-disable react/jsx-props-no-spreading -- Standard shadcn pattern: forward remaining props to underlying elements */ import * as React from 'react'; -import { createPortal } from 'react-dom'; import * as DialogPrimitive from '@radix-ui/react-dialog'; import { cn } from '@renderer/lib/utils'; @@ -9,20 +8,7 @@ import { X } from 'lucide-react'; const Dialog = DialogPrimitive.Root; const DialogTrigger = DialogPrimitive.Trigger; const DialogClose = DialogPrimitive.Close; - -type DialogPortalProps = React.ComponentPropsWithoutRef; - -const DialogPortal = ({ children, container }: DialogPortalProps): React.ReactPortal | null => { - const [mounted, setMounted] = React.useState(false); - - React.useLayoutEffect(() => { - setMounted(true); - }, []); - - const portalContainer = container ?? (mounted ? globalThis.document?.body : null); - return portalContainer ? 
createPortal(<>{children}, portalContainer) : null; -}; -DialogPortal.displayName = DialogPrimitive.Portal.displayName; +const DialogPortal = DialogPrimitive.Portal; const DialogOverlay = React.forwardRef< React.ComponentRef, diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json index 994d483b..02975626 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json @@ -1,9 +1,9 @@ { - "generatedAt": "2026-05-09T23:16:07.760Z", - "runsPerModel": 3, + "generatedAt": "2026-05-14T06:34:47.601Z", + "runsPerModel": 1, "qualification": { - "minimumAverageScore": 90, - "minimumSuccessfulRuns": 3, + "minimumAverageScore": 70, + "minimumSuccessfulRuns": 1, "minimumConsistencyScore": 85, "requireNoHardFailures": true }, @@ -11,93 +11,93 @@ { "model": "opencode/big-pickle", "verdict": "recommended", - "confidence": "high", + "confidence": "low", "qualified": true, "readinessScore": 100, "averageScore": 100, "consistencyScore": 100, "behavioralAverageScore": 100, "minScore": 100, - "successfulRuns": 3, - "countedRuns": 3, + "successfulRuns": 1, + "countedRuns": 1, "hardFailures": 0, "providerInfraFailures": 0, "runtimeTransportFailures": 0, "modelBehaviorFailures": 0, "harnessFailures": 0, - "p50DurationMs": 112355, - "p95DurationMs": 116891, + "p50DurationMs": 132968, + "p95DurationMs": 132968, "stagePassRates": { "launchBootstrap": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "directReply": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "peerRelayAB": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "peerRelayBC": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "concurrentReplies": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, 
"taskRefs": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "cleanTranscript": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "noDuplicateTokens": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "latencyStable": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, "taskRefPassRates": { "directReply": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "peerRelayAB": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "peerRelayBC": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "concurrentBob": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 }, "concurrentTom": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -112,8 +112,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -122,8 +122,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -132,8 +132,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -142,8 +142,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -152,8 +152,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -162,8 +162,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -172,8 +172,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } }, @@ -182,8 +182,8 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 
100 } }, @@ -192,14 +192,14 @@ "failedRuns": 0, "weightedLoss": 0, "passRate": { - "passed": 3, - "total": 3, + "passed": 1, + "total": 1, "rate": 100 } } ], "scoreStability": { - "sampleSize": 3, + "sampleSize": 1, "minScore": 100, "maxScore": 100, "spread": 0, @@ -217,16 +217,16 @@ "outcome": "passed", "failureCategory": "none", "primaryFailure": null, - "durationMs": 112344, + "durationMs": 132968, "hardFailure": false, "stageDurationsMs": { - "setup": 183, - "launchBootstrap": 19933, - "materializeTasks": 35, - "directReply": 15430, - "peerRelayAB": 25001, - "peerRelayBC": 28154, - "concurrentReplies": 15551, + "setup": 2770, + "launchBootstrap": 49092, + "materializeTasks": 85, + "directReply": 13760, + "peerRelayAB": 22730, + "peerRelayBC": 21484, + "concurrentReplies": 14023, "hygiene": 1 }, "stageFailures": {}, @@ -253,455 +253,7 @@ "latencyStable": true }, "diagnostics": [ - "runId=d9d27eb0-2798-4980-a0fa-f082a6edd705" - ] - }, - { - "runIndex": 2, - "passed": true, - "score": 100, - "countedForRecommendation": true, - "outcome": "passed", - "failureCategory": "none", - "primaryFailure": null, - "durationMs": 112355, - "hardFailure": false, - "stageDurationsMs": { - "setup": 11, - "launchBootstrap": 18682, - "materializeTasks": 36, - "directReply": 15126, - "peerRelayAB": 24835, - "peerRelayBC": 28580, - "concurrentReplies": 17164, - "hygiene": 1 - }, - "stageFailures": {}, - "taskRefChecks": { - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true - }, - "protocolViolations": { - "badMessages": 0, - "duplicateOrMissingTokens": [] - }, - "stages": { - "launchBootstrap": true, - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, - "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": true, - "latencyStable": true - }, - "diagnostics": [ - "runId=97364154-e06d-460c-94ae-65b73cb1b6f9" - ] - }, - { - "runIndex": 3, - "passed": true, - 
"score": 100, - "countedForRecommendation": true, - "outcome": "passed", - "failureCategory": "none", - "primaryFailure": null, - "durationMs": 116891, - "hardFailure": false, - "stageDurationsMs": { - "setup": 8, - "launchBootstrap": 18926, - "materializeTasks": 31, - "directReply": 17061, - "peerRelayAB": 27842, - "peerRelayBC": 27262, - "concurrentReplies": 15437, - "hygiene": 1 - }, - "stageFailures": {}, - "taskRefChecks": { - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true - }, - "protocolViolations": { - "badMessages": 0, - "duplicateOrMissingTokens": [] - }, - "stages": { - "launchBootstrap": true, - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, - "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": true, - "latencyStable": true - }, - "diagnostics": [ - "runId=7bdd4b2e-dbd6-4474-a8a0-9418df433671" - ] - } - ] - }, - { - "model": "opencode/minimax-m2.5-free", - "verdict": "strong-candidate", - "confidence": "high", - "qualified": false, - "readinessScore": 88.6, - "averageScore": 98.3, - "consistencyScore": 93.1, - "behavioralAverageScore": 98.3, - "minScore": 95, - "successfulRuns": 2, - "countedRuns": 3, - "hardFailures": 1, - "providerInfraFailures": 0, - "runtimeTransportFailures": 0, - "modelBehaviorFailures": 1, - "harnessFailures": 0, - "p50DurationMs": 108862, - "p95DurationMs": 118757, - "stagePassRates": { - "launchBootstrap": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "directReply": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "peerRelayAB": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "peerRelayBC": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "concurrentReplies": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "taskRefs": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "cleanTranscript": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "noDuplicateTokens": { - "passed": 2, 
- "total": 3, - "rate": 66.7 - }, - "latencyStable": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - "taskRefPassRates": { - "directReply": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "peerRelayAB": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "peerRelayBC": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "concurrentBob": { - "passed": 3, - "total": 3, - "rate": 100 - }, - "concurrentTom": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - "protocolViolationTotals": { - "badMessages": 0, - "duplicateOrMissingTokens": 2, - "affectedRuns": 1 - }, - "stageFailureImpact": [ - { - "stage": "noDuplicateTokens", - "failedRuns": 1, - "weightedLoss": 5, - "passRate": { - "passed": 2, - "total": 3, - "rate": 66.7 - } - }, - { - "stage": "cleanTranscript", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "concurrentReplies", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "directReply", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "latencyStable", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "launchBootstrap", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "peerRelayAB", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "peerRelayBC", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - }, - { - "stage": "taskRefs", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 3, - "total": 3, - "rate": 100 - } - } - ], - "scoreStability": { - "sampleSize": 3, - "minScore": 95, - "maxScore": 100, - "spread": 5, - "standardDeviation": 2.4, - 
"consistencyScore": 93.1 - }, - "dominantFailureCategory": "model-behavior", - "recommendationBlockers": [ - "successful runs 2 < 3", - "hard failures 1", - "model-behavior failures 1", - "highest weighted stage loss noDuplicateTokens=5", - "protocol violations in 1 runs" - ], - "runs": [ - { - "runIndex": 1, - "passed": true, - "score": 100, - "countedForRecommendation": true, - "outcome": "passed", - "failureCategory": "none", - "primaryFailure": null, - "durationMs": 91530, - "hardFailure": false, - "stageDurationsMs": { - "setup": 10, - "launchBootstrap": 18716, - "materializeTasks": 31, - "directReply": 11557, - "peerRelayAB": 16323, - "peerRelayBC": 27370, - "concurrentReplies": 9606, - "hygiene": 1 - }, - "stageFailures": {}, - "taskRefChecks": { - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true - }, - "protocolViolations": { - "badMessages": 0, - "duplicateOrMissingTokens": [] - }, - "stages": { - "launchBootstrap": true, - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, - "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": true, - "latencyStable": true - }, - "diagnostics": [ - "runId=23ae85d2-e79d-41c9-93a6-e843acea6d9e" - ] - }, - { - "runIndex": 2, - "passed": true, - "score": 100, - "countedForRecommendation": true, - "outcome": "passed", - "failureCategory": "none", - "primaryFailure": null, - "durationMs": 108862, - "hardFailure": false, - "stageDurationsMs": { - "setup": 10, - "launchBootstrap": 18359, - "materializeTasks": 35, - "directReply": 7236, - "peerRelayAB": 30664, - "peerRelayBC": 26124, - "concurrentReplies": 18477, - "hygiene": 0 - }, - "stageFailures": {}, - "taskRefChecks": { - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true - }, - "protocolViolations": { - "badMessages": 0, - "duplicateOrMissingTokens": [] - }, - "stages": { - 
"launchBootstrap": true, - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, - "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": true, - "latencyStable": true - }, - "diagnostics": [ - "runId=c3a55d8a-4028-4af7-9e1a-8ae8c87a95e5" - ] - }, - { - "runIndex": 3, - "passed": false, - "score": 95, - "countedForRecommendation": true, - "outcome": "behavioral-fail", - "failureCategory": "model-behavior", - "primaryFailure": "duplicateOrMissingTokens=GAUNTLET_JACK_USER_OK_3,GAUNTLET_TOM_USER_OK_3", - "durationMs": 118757, - "hardFailure": true, - "stageDurationsMs": { - "setup": 9, - "launchBootstrap": 19986, - "materializeTasks": 37, - "directReply": 8036, - "peerRelayAB": 37430, - "peerRelayBC": 36219, - "concurrentReplies": 8551, - "hygiene": 0 - }, - "stageFailures": {}, - "taskRefChecks": { - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true - }, - "protocolViolations": { - "badMessages": 0, - "duplicateOrMissingTokens": [ - "GAUNTLET_JACK_USER_OK_3", - "GAUNTLET_TOM_USER_OK_3" - ] - }, - "stages": { - "launchBootstrap": true, - "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, - "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": false, - "latencyStable": true - }, - "diagnostics": [ - "runId=2b0610e0-7b10-49fc-88dd-ab30b37abce9", - "duplicateOrMissingTokens=GAUNTLET_JACK_USER_OK_3,GAUNTLET_TOM_USER_OK_3" + "runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6" ] } ] diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md index d41c64b7..4f93005d 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md @@ -1,9 +1,9 @@ # OpenCode Model Gauntlet Results -Generated: 
2026-05-09T23:16:07.760Z +Generated: 2026-05-14T06:34:47.601Z -Runs per model: 3 -Recommended threshold: average >= 90, successful runs >= 3, consistency >= 85, hard failures = 0 +Runs per model: 1 +Recommended threshold: average >= 70, successful runs >= 1, consistency >= 85, hard failures = 0 Provider-infra runs are reported separately and are not counted as model behavior. They still block a Recommended verdict until rerun succeeds. @@ -13,50 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC | Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 | | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `opencode/big-pickle` | Recommended | high | 100 | 100 | 0 | 100 | 100 | 3/3 | 3/3 | cleanTranscript 3/3 (100%) | concurrentBob 3/3 (100%) | none | - | 0 | 0 | 0 | 0 | 112355ms | 116891ms | -| `opencode/minimax-m2.5-free` | Strong candidate | high | 88.6 | 93.1 | 5 | 98.3 | 98.3 | 3/3 | 2/3 | noDuplicateTokens 2/3 (66.7%) | concurrentBob 3/3 (100%) | model-behavior | successful runs 2 < 3; hard failures 1; model-behavior failures 1; highest weighted stage loss noDuplicateTokens=5; protocol violations in 1 runs | 0 | 0 | 1 | 1 | 108862ms | 118757ms | +| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 132968ms | 132968ms | ## opencode/big-pickle Readiness score: 100. -Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=3. +Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1. Recommendation blockers: -. Weighted stage impact: -. 
-Stage pass rates: launchBootstrap:3/3 (100%), directReply:3/3 (100%), peerRelayAB:3/3 (100%), peerRelayBC:3/3 (100%), concurrentReplies:3/3 (100%), taskRefs:3/3 (100%), cleanTranscript:3/3 (100%), noDuplicateTokens:3/3 (100%), latencyStable:3/3 (100%). +Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%). -TaskRef pass rates: directReply:3/3 (100%), peerRelayAB:3/3 (100%), peerRelayBC:3/3 (100%), concurrentBob:3/3 (100%), concurrentTom:3/3 (100%). +TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%). Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0. | Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | | ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | -| 1 | passed | none | 100 | yes | 112344ms | - | peerRelayBC:28154ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=d9d27eb0-2798-4980-a0fa-f082a6edd705 | -| 2 | passed | none | 100 | yes | 112355ms | - | peerRelayBC:28580ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=97364154-e06d-460c-94ae-65b73cb1b6f9 | -| 3 | passed | none | 100 | yes | 116891ms | - | peerRelayAB:27842ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=7bdd4b2e-dbd6-4474-a8a0-9418df433671 | - -## opencode/minimax-m2.5-free - -Readiness score: 88.6. - -Score stability: consistency=93.1, min=95, max=100, spread=5, stdDev=2.4, samples=3. 
- -Recommendation blockers: successful runs 2 < 3; hard failures 1; model-behavior failures 1; highest weighted stage loss noDuplicateTokens=5; protocol violations in 1 runs. - -Weighted stage impact: noDuplicateTokens:loss=5, failed=1, pass=2/3 (66.7%). - -Stage pass rates: launchBootstrap:3/3 (100%), directReply:3/3 (100%), peerRelayAB:3/3 (100%), peerRelayBC:3/3 (100%), concurrentReplies:3/3 (100%), taskRefs:3/3 (100%), cleanTranscript:3/3 (100%), noDuplicateTokens:2/3 (66.7%), latencyStable:3/3 (100%). - -TaskRef pass rates: directReply:3/3 (100%), peerRelayAB:3/3 (100%), peerRelayBC:3/3 (100%), concurrentBob:3/3 (100%), concurrentTom:3/3 (100%). - -Protocol totals: badMessages=0, duplicateOrMissingTokens=2, affectedRuns=1. - -| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | -| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | -| 1 | passed | none | 100 | yes | 91530ms | - | peerRelayBC:27370ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=23ae85d2-e79d-41c9-93a6-e843acea6d9e | -| 2 | passed | none | 100 | yes | 108862ms | - | peerRelayAB:30664ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=c3a55d8a-4028-4af7-9e1a-8ae8c87a95e5 | -| 3 | behavioral-fail | model-behavior | 95 | yes | 118757ms | noDuplicateTokens | peerRelayAB:37430ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | token=GAUNTLET_JACK_USER_OK_3+GAUNTLET_TOM_USER_OK_3 | duplicateOrMissingTokens=GAUNTLET_JACK_USER_OK_3,GAUNTLET_TOM_USER_OK_3 | +| 1 | passed | none | 100 | yes | 132968ms | - | launchBootstrap:49092ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6 | diff --git a/test/main/services/team/OpenCodeStateChangingBridgeCommandService.test.ts 
b/test/main/services/team/OpenCodeStateChangingBridgeCommandService.test.ts index 5004584f..db3d51eb 100644 --- a/test/main/services/team/OpenCodeStateChangingBridgeCommandService.test.ts +++ b/test/main/services/team/OpenCodeStateChangingBridgeCommandService.test.ts @@ -166,6 +166,50 @@ describe('OpenCodeStateChangingBridgeCommandService', () => { await expect(leaseStore.getActive('team-a')).resolves.toBeNull(); }); + it('waits briefly for an active lane lease instead of failing near-concurrent sends', async () => { + clientIdentity.bridgeProtocol.supportedCommands.push('opencode.sendMessage'); + const server = peerIdentity('agent_teams_orchestrator'); + server.bridgeProtocol.supportedCommands.push('opencode.sendMessage'); + server.bridgeProtocol.opencodeDeliveryAcceptanceContractVersion = + OPEN_CODE_DELIVERY_ACCEPTANCE_CONTRACT_VERSION; + handshakePort.nextHandshake = buildHandshakeWithAcceptedCommands( + { client: clientIdentity, server }, + ['opencode.launchTeam', 'opencode.stopTeam', 'opencode.sendMessage'] + ); + bridge.resultFactory = ({ body, command, options }) => + bridgeSuccess({ + requestId: options.requestId, + command, + data: { + runId: 'run-1', + idempotencyKey: body.preconditions.idempotencyKey, + runtimeStoreManifestHighWatermark: 10, + }, + }); + const service = createService({ + leaseAcquireTimeoutMs: 200, + leaseAcquireRetryDelayMs: 5, + }); + const activeLease = await leaseStore.acquire({ + teamName: 'team-a', + laneId: 'secondary:opencode:bob', + runId: 'run-1', + command: 'opencode.sendMessage', + ttlMs: 10_000, + }); + + const resultPromise = service.execute(buildSendInput('acceptance')); + await sleep(20); + expect(bridge.calls).toHaveLength(0); + + await leaseStore.release(activeLease.leaseId); + + await expect(resultPromise).resolves.toMatchObject({ ok: true }); + expect(bridge.calls).toHaveLength(1); + expect(bridge.calls[0].body.preconditions.commandLeaseId).toBe('lease-2'); + await 
expect(leaseStore.getActive('team-a')).resolves.toBeNull(); + }); + it('records unknown outcome after timeout and blocks retry before a duplicate bridge call', async () => { bridge.resultFactory = ({ body, command, options }) => ({ ok: false, @@ -238,7 +282,12 @@ describe('OpenCodeStateChangingBridgeCommandService', () => { await expect(leaseStore.getActive('team-a')).resolves.toBeNull(); }); - function createService(): OpenCodeStateChangingBridgeCommandService { + function createService( + overrides: { + leaseAcquireTimeoutMs?: number; + leaseAcquireRetryDelayMs?: number; + } = {} + ): OpenCodeStateChangingBridgeCommandService { return new OpenCodeStateChangingBridgeCommandService({ expectedClientIdentity: clientIdentity, handshakePort, @@ -250,6 +299,7 @@ describe('OpenCodeStateChangingBridgeCommandService', () => { requestIdFactory: () => 'cmd-1', diagnosticIdFactory: () => 'diag-1', clock: () => now, + ...overrides, }); } }); @@ -405,12 +455,12 @@ function buildHandshakeWithAcceptedCommands( class FakeBridgeExecutor implements OpenCodeBridgeCommandExecutor { calls: Array<{ command: OpenCodeBridgeCommandName; - body: { prompt: string; preconditions: { idempotencyKey: string } }; + body: { prompt: string; preconditions: { idempotencyKey: string; commandLeaseId?: string } }; options: { cwd: string; timeoutMs: number; requestId?: string }; }> = []; resultFactory: (input: { command: OpenCodeBridgeCommandName; - body: { prompt: string; preconditions: { idempotencyKey: string } }; + body: { prompt: string; preconditions: { idempotencyKey: string; commandLeaseId?: string } }; options: { cwd: string; timeoutMs: number; requestId?: string }; }) => OpenCodeBridgeResult = ({ body, options }) => bridgeSuccess({ @@ -429,7 +479,10 @@ class FakeBridgeExecutor implements OpenCodeBridgeCommandExecutor { ): Promise> { const call = { command, - body: body as { prompt: string; preconditions: { idempotencyKey: string } }, + body: body as { + prompt: string; + preconditions: { 
idempotencyKey: string; commandLeaseId?: string }; + }, options, }; this.calls.push(call); @@ -460,3 +513,7 @@ class FakeManifestReader implements RuntimeStoreManifestReader { class FakeDiagnosticsSink implements OpenCodeStateChangingBridgeDiagnosticsSink { readonly append = vi.fn(async () => {}); } + +function sleep(delayMs: number): Promise { + return new Promise((resolve) => setTimeout(resolve, delayMs)); +} diff --git a/test/main/services/team/OpenCodeTeamRuntimeAdapter.test.ts b/test/main/services/team/OpenCodeTeamRuntimeAdapter.test.ts index 308a6c8b..8cb3a504 100644 --- a/test/main/services/team/OpenCodeTeamRuntimeAdapter.test.ts +++ b/test/main/services/team/OpenCodeTeamRuntimeAdapter.test.ts @@ -545,6 +545,49 @@ describe('OpenCodeTeamRuntimeAdapter', () => { expect(sentText).toContain('never use #00000000'); }); + it('uses observed settlement for member-work-sync nudges so turn-settled can drive reconcile', async () => { + const sendOpenCodeTeamMessage = vi.fn< + NonNullable + >(async () => ({ + accepted: true, + sessionId: 'oc-session-bob', + memberName: 'bob', + runtimePid: 456, + runtimePromptMessageId: 'msg_prompt_1', + diagnostics: [], + })); + const adapter = new OpenCodeTeamRuntimeAdapter( + bridgePort(readiness({ state: 'ready', launchAllowed: true }), { + sendOpenCodeTeamMessage, + }) + ); + + await expect( + adapter.sendMessageToMember({ + runId: 'run-1', + teamName: 'team-a', + laneId: 'secondary:opencode:bob', + memberName: 'bob', + cwd: '/repo', + text: 'sync your current work state', + messageId: 'sync-1', + messageKind: 'member_work_sync_nudge', + taskRefs: [{ taskId: 'task-1', displayId: 'abcd1234', teamName: 'team-a' }], + }) + ).resolves.toMatchObject({ + ok: true, + runtimePromptMessageId: 'msg_prompt_1', + }); + + expect(sendOpenCodeTeamMessage).toHaveBeenCalledWith( + expect.objectContaining({ + messageId: 'sync-1', + messageKind: 'member_work_sync_nudge', + settlementMode: 'observed', + }) + ); + }); + it('observes direct teammate 
messages by exact accepted runtime prompt id', async () => { const observeOpenCodeTeamMessageDelivery = vi.fn< NonNullable