From 55dfc5db1422df475220e059f3129bfacf6b339b Mon Sep 17 00:00:00 2001 From: 777genius Date: Mon, 18 May 2026 17:58:55 +0300 Subject: [PATCH] fix(opencode): classify session refresh diagnostics --- .../delivery/OpenCodePromptDeliveryLedger.ts | 14 +- .../OpenCodeRuntimeDeliveryDiagnostics.ts | 39 ++- .../runtime/RuntimeDiagnosticClassifier.ts | 19 +- src/renderer/utils/memberHelpers.ts | 21 +- src/renderer/utils/memberLaunchDiagnostics.ts | 24 +- .../model-gauntlet-results.json | 238 +++++++++--------- .../model-gauntlet-results.md | 20 +- .../team/OpenCodePromptDeliveryLedger.test.ts | 8 + ...OpenCodeRuntimeDeliveryDiagnostics.test.ts | 45 ++++ .../team/RuntimeDiagnosticClassifier.test.ts | 20 ++ test/renderer/utils/memberHelpers.test.ts | 2 + .../utils/memberLaunchDiagnostics.test.ts | 21 +- 12 files changed, 302 insertions(+), 169 deletions(-) diff --git a/src/main/services/team/opencode/delivery/OpenCodePromptDeliveryLedger.ts b/src/main/services/team/opencode/delivery/OpenCodePromptDeliveryLedger.ts index 0065c58f..6c15602e 100644 --- a/src/main/services/team/opencode/delivery/OpenCodePromptDeliveryLedger.ts +++ b/src/main/services/team/opencode/delivery/OpenCodePromptDeliveryLedger.ts @@ -890,7 +890,7 @@ function isCleanOpenCodeSessionRefreshReason( if (!pattern.test(normalized)) { return false; } - const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); + const markerText = stripOpenCodeGenericApiErrorPrefix(normalized); if (hasOpenCodeSessionRefreshFailureConflict(markerText)) { return false; } @@ -924,11 +924,9 @@ function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean } function isOpenCodeSessionRefreshScheduledReason(message: string | null | undefined): boolean { - const normalized = - message - ?.trim() - .toLowerCase() - .replace(/[.:\s-]+$/, '') ?? ''; + const normalized = stripOpenCodeGenericApiErrorPrefix( + message?.trim().toLowerCase() ?? '' + ).replace(/[.:\s-]+$/, ''); return ( normalized === 'opencode prompt delivery session refresh scheduled' || normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || @@ -938,6 +936,10 @@ function isOpenCodeSessionRefreshScheduledReason(message: string | null | undefi ); } +function stripOpenCodeGenericApiErrorPrefix(message: string): string { + return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); +} + function hasOpenCodeSessionRefreshFailureConflict(value: string): boolean { return OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test( value.replace(OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN, 'state') diff --git a/src/main/services/team/opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics.ts b/src/main/services/team/opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics.ts index f03c4fb8..fb27767e 100644 --- a/src/main/services/team/opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics.ts +++ b/src/main/services/team/opencode/delivery/OpenCodeRuntimeDeliveryDiagnostics.ts @@ -51,6 +51,32 @@ function isPlainGenericOpenCodeApiError(message: string): boolean { ); } +function isOpenCodeRuntimeDeliverySessionRefreshScheduledDiagnostic(message: string): boolean { + const normalized = stripOpenCodeGenericApiErrorPrefix(message.trim().toLowerCase()).replace( + /[.:\s-]+$/, + '' + ); + return ( + normalized === 'opencode prompt delivery session refresh scheduled' || + normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || + normalized === 'opencode session refresh scheduled after resolved behavior changed' || + normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' || + normalized === 'opencode session changed; refreshing the session before retry' + ); +} + +function stripOpenCodeGenericApiErrorPrefix(message: string): string { + return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); +} + +function isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic(message: string): boolean { + return ( + isOpenCodeRuntimeDeliverySessionRefreshScheduledDiagnostic(message) || + isOpenCodeResolvedBehaviorChangedReason(message) || + isOpenCodeSessionTransportChangedReason(message) + ); +} + function isInformationalOpenCodeRuntimeDeliveryDiagnostic( message: string | null | undefined ): boolean { @@ -61,12 +87,7 @@ function isInformationalOpenCodeRuntimeDeliveryDiagnostic( 'opencode prompt_async accepted; response observation will continue through durable app-side ledger reconciliation.' || normalized === 'opencode session status busy' || normalized === 'opencode_delivery_response_pending' || - normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || - normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' || - Boolean( - isOpenCodeResolvedBehaviorChangedReason(normalized) || - isOpenCodeSessionTransportChangedReason(normalized) - ) + Boolean(normalized && isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic(normalized)) ); } @@ -85,11 +106,7 @@ function getOpenCodeRuntimeDeliveryStateFallback( const diagnostics = record.diagnostics.map((diagnostic) => diagnostic.trim().toLowerCase()); const diagnosticText = diagnostics.join('\n'); const hasCleanSessionRefreshDiagnostic = diagnostics.some( - (diagnostic) => - diagnostic === 'opencode_prompt_delivery_session_refresh_scheduled' || - diagnostic === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' || - isOpenCodeResolvedBehaviorChangedReason(diagnostic) || - isOpenCodeSessionTransportChangedReason(diagnostic) + isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic ); if (state === 'empty_assistant_turn' || normalizedReason === 'empty_assistant_turn') { return 'OpenCode returned an empty assistant turn.'; diff --git a/src/main/services/team/runtime/RuntimeDiagnosticClassifier.ts b/src/main/services/team/runtime/RuntimeDiagnosticClassifier.ts index d5a26fd5..012e26af 100644 --- a/src/main/services/team/runtime/RuntimeDiagnosticClassifier.ts +++ b/src/main/services/team/runtime/RuntimeDiagnosticClassifier.ts @@ -41,19 +41,20 @@ const OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN = function isCleanOpenCodeSessionRefreshDiagnostic(message: string): boolean { const normalized = message.trim().toLowerCase(); + const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized); + const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, ''); if ( - normalized === 'opencode session changed; refreshing the session before retry' || - normalized === 'opencode session changed; refreshing the session before retry.' || - normalized === 'opencode session refresh scheduled after resolved behavior changed' || - normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || - normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' + refreshMarkerText === 'opencode session changed; refreshing the session before retry' || + refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' || + refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' || + refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ) { return true; } - if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) { + if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) { return false; } - const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); + const markerText = refreshText; if (hasOpenCodeSessionRefreshFailureConflict(markerText)) { return false; } @@ -65,6 +66,10 @@ function isCleanOpenCodeSessionRefreshDiagnostic(message: string): boolean { return isBenignOpenCodeSessionRefreshRemainder(rawRemainder); } +function stripOpenCodeGenericApiErrorPrefix(message: string): string { + return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); +} + function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean { if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) { return false; diff --git a/src/renderer/utils/memberHelpers.ts b/src/renderer/utils/memberHelpers.ts index d549fd38..19ccac27 100644 --- a/src/renderer/utils/memberHelpers.ts +++ b/src/renderer/utils/memberHelpers.ts @@ -400,20 +400,21 @@ const OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN = function isRecoverableOpenCodeSessionRefreshMessage(message: string | undefined): boolean { const normalized = message?.trim().toLowerCase() ?? ''; + const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized); + const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, ''); if ( - normalized === 'session_stale' || - normalized === 'opencode session changed; refreshing the session before retry' || - normalized === 'opencode session changed; refreshing the session before retry.' || - normalized === 'opencode session refresh scheduled after resolved behavior changed' || - normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || - normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' + refreshMarkerText === 'session_stale' || + refreshMarkerText === 'opencode session changed; refreshing the session before retry' || + refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' || + refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' || + refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ) { return true; } - if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) { + if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) { return false; } - const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); + const markerText = refreshText; if (hasOpenCodeSessionRefreshFailureConflict(markerText)) { return false; } @@ -429,6 +430,10 @@ function isRecoverableOpenCodeSessionRefreshMessage(message: string | undefined) return staleLogProjectionContext && isBenignOpenCodeSessionRefreshRemainder(rawRemainder); } +function stripOpenCodeGenericApiErrorPrefix(message: string): string { + return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); +} + function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean { if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) { return false; diff --git a/src/renderer/utils/memberLaunchDiagnostics.ts b/src/renderer/utils/memberLaunchDiagnostics.ts index b938d77e..6e67b501 100644 --- a/src/renderer/utils/memberLaunchDiagnostics.ts +++ b/src/renderer/utils/memberLaunchDiagnostics.ts @@ -166,21 +166,22 @@ function isRuntimeDiagnosticCardError(params: { function isRecoverableOpenCodeSessionRefreshText(value: string | undefined): boolean { const normalized = value?.trim().toLowerCase() ?? ''; + const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized); + const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, ''); if ( - normalized === 'session_stale' || - normalized === 'opencode session refresh' || - normalized === 'opencode session changed; refreshing the session before retry' || - normalized === 'opencode session changed; refreshing the session before retry.' || - normalized === 'opencode session refresh scheduled after resolved behavior changed' || - normalized === 'opencode_prompt_delivery_session_refresh_scheduled' || - normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' + refreshMarkerText === 'session_stale' || + refreshMarkerText === 'opencode session refresh' || + refreshMarkerText === 'opencode session changed; refreshing the session before retry' || + refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' || + refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' || + refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ) { return true; } - if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) { + if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) { return false; } - const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); + const markerText = refreshText; if (hasOpenCodeSessionRefreshFailureConflict(markerText)) { return false; } @@ -196,6 +197,10 @@ function isRecoverableOpenCodeSessionRefreshText(value: string | undefined): boo return staleLogProjectionContext && isBenignOpenCodeSessionRefreshRemainder(rawRemainder); } +function stripOpenCodeGenericApiErrorPrefix(message: string): string { + return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, ''); +} + function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean { if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) { return false; @@ -774,6 +779,7 @@ export function hasMemberLaunchDiagnosticsError(payload: MemberLaunchDiagnostics return false; } return Boolean( + payload.memberCardError || payload.spawnStatus === 'error' || payload.launchState === 'failed_to_start' || payload.runtimeDiagnosticSeverity === 'error' diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json index 02975626..2804f982 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json @@ -1,8 +1,8 @@ { - "generatedAt": "2026-05-14T06:34:47.601Z", + "generatedAt": "2026-05-18T13:16:59.867Z", "runsPerModel": 1, "qualification": { - "minimumAverageScore": 70, + "minimumAverageScore": 80, "minimumSuccessfulRuns": 1, "minimumConsistencyScore": 85, "requireNoHardFailures": true @@ -10,23 +10,23 @@ "models": [ { "model": "opencode/big-pickle", - "verdict": "recommended", + "verdict": "tested-only", "confidence": "low", - "qualified": true, - "readinessScore": 100, - "averageScore": 100, + "qualified": false, + "readinessScore": 54, + "averageScore": 35, "consistencyScore": 100, - "behavioralAverageScore": 100, - "minScore": 100, - "successfulRuns": 1, + "behavioralAverageScore": 35, + "minScore": 35, + "successfulRuns": 0, "countedRuns": 1, - "hardFailures": 0, + "hardFailures": 1, "providerInfraFailures": 0, "runtimeTransportFailures": 0, - "modelBehaviorFailures": 0, + "modelBehaviorFailures": 1, "harnessFailures": 0, - "p50DurationMs": 132968, - "p95DurationMs": 132968, + "p50DurationMs": 133048, + "p95DurationMs": 133048, "stagePassRates": { "launchBootstrap": { "passed": 1, @@ -39,19 +39,19 @@ "rate": 100 }, "peerRelayAB": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 }, "peerRelayBC": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 }, "concurrentReplies": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 }, "taskRefs": { "passed": 1, @@ -59,19 +59,19 @@ "rate": 100 }, "cleanTranscript": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 }, "noDuplicateTokens": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 }, "latencyStable": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 } }, "taskRefPassRates": { @@ -81,24 +81,24 @@ "rate": 100 }, "peerRelayAB": { - "passed": 1, - "total": 1, - "rate": 100 + "passed": 0, + "total": 0, + "rate": null }, "peerRelayBC": { - "passed": 1, - "total": 1, - "rate": 100 + "passed": 0, + "total": 0, + "rate": null }, "concurrentBob": { - "passed": 1, - "total": 1, - "rate": 100 + "passed": 0, + "total": 0, + "rate": null }, "concurrentTom": { - "passed": 1, - "total": 1, - "rate": 100 + "passed": 0, + "total": 0, + "rate": null } }, "protocolViolationTotals": { @@ -108,23 +108,63 @@ }, "stageFailureImpact": [ { - "stage": "cleanTranscript", - "failedRuns": 0, - "weightedLoss": 0, + "stage": "concurrentReplies", + "failedRuns": 1, + "weightedLoss": 15, "passRate": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 } }, { - "stage": "concurrentReplies", - "failedRuns": 0, - "weightedLoss": 0, + "stage": "peerRelayAB", + "failedRuns": 1, + "weightedLoss": 15, "passRate": { - "passed": 1, + "passed": 0, "total": 1, - "rate": 100 + "rate": 0 + } + }, + { + "stage": "peerRelayBC", + "failedRuns": 1, + "weightedLoss": 15, + "passRate": { + "passed": 0, + "total": 1, + "rate": 0 + } + }, + { + "stage": "cleanTranscript", + "failedRuns": 1, + "weightedLoss": 10, + "passRate": { + "passed": 0, + "total": 1, + "rate": 0 + } + }, + { + "stage": "latencyStable", + "failedRuns": 1, + "weightedLoss": 5, + "passRate": { + "passed": 0, + "total": 1, + "rate": 0 + } + }, + { + "stage": "noDuplicateTokens", + "failedRuns": 1, + "weightedLoss": 5, + "passRate": { + "passed": 0, + "total": 1, + "rate": 0 } }, { @@ -137,16 +177,6 @@ "rate": 100 } }, - { - "stage": "latencyStable", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 1, - "total": 1, - "rate": 100 - } - }, { "stage": "launchBootstrap", "failedRuns": 0, @@ -157,36 +187,6 @@ "rate": 100 } }, - { - "stage": "noDuplicateTokens", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 1, - "total": 1, - "rate": 100 - } - }, - { - "stage": "peerRelayAB", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 1, - "total": 1, - "rate": 100 - } - }, - { - "stage": "peerRelayBC", - "failedRuns": 0, - "weightedLoss": 0, - "passRate": { - "passed": 1, - "total": 1, - "rate": 100 - } - }, { "stage": "taskRefs", "failedRuns": 0, @@ -200,42 +200,45 @@ ], "scoreStability": { "sampleSize": 1, - "minScore": 100, - "maxScore": 100, + "minScore": 35, + "maxScore": 35, "spread": 0, "standardDeviation": 0, "consistencyScore": 100 }, - "dominantFailureCategory": "none", - "recommendationBlockers": [], + "dominantFailureCategory": "model-behavior", + "recommendationBlockers": [ + "overall average 35 < 80", + "behavioral average 35 < 80", + "successful runs 0 < 1", + "hard failures 1", + "model-behavior failures 1", + "highest weighted stage loss concurrentReplies=15" + ], "runs": [ { "runIndex": 1, - "passed": true, - "score": 100, + "passed": false, + "score": 35, "countedForRecommendation": true, - "outcome": "passed", - "failureCategory": "none", - "primaryFailure": null, - "durationMs": 132968, - "hardFailure": false, + "outcome": "behavioral-fail", + "failureCategory": "model-behavior", + "primaryFailure": "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending", + "durationMs": 133048, + "hardFailure": true, "stageDurationsMs": { - "setup": 2770, - "launchBootstrap": 49092, - "materializeTasks": 85, - "directReply": 13760, - "peerRelayAB": 22730, - "peerRelayBC": 21484, - "concurrentReplies": 14023, - "hygiene": 1 + "setup": 371, + "launchBootstrap": 23735, + "materializeTasks": 36, + "directReply": 17327 }, "stageFailures": {}, "taskRefChecks": { "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentBob": true, - "concurrentTom": true + "peerRelayAB": null, + "peerRelayBC": null, + "concurrentBob": null, + "concurrentTom": null }, "protocolViolations": { "badMessages": 0, @@ -244,16 +247,17 @@ "stages": { "launchBootstrap": true, "directReply": true, - "peerRelayAB": true, - "peerRelayBC": true, - "concurrentReplies": true, + "peerRelayAB": false, + "peerRelayBC": false, + "concurrentReplies": false, "taskRefs": true, - "cleanTranscript": true, - "noDuplicateTokens": true, - "latencyStable": true + "cleanTranscript": false, + "noDuplicateTokens": false, + "latencyStable": false }, "diagnostics": [ - "runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6" + "runId=9fb17ac5-fc66-4b01-831e-90a04b1e2304", + "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending" ] } ] diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md index 4f93005d..93862987 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md @@ -1,9 +1,9 @@ # OpenCode Model Gauntlet Results -Generated: 2026-05-14T06:34:47.601Z +Generated: 2026-05-18T13:16:59.867Z Runs per model: 1 -Recommended threshold: average >= 70, successful runs >= 1, consistency >= 85, hard failures = 0 +Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0 Provider-infra runs are reported separately and are not counted as model behavior. They still block a Recommended verdict until rerun succeeds. @@ -13,25 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC | Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 | | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 132968ms | 132968ms | +| `opencode/big-pickle` | Tested only | low | 54 | 100 | 0 | 35 | 35 | 1/1 | 0/1 | cleanTranscript 0/1 (0%) | directReply 1/1 (100%) | model-behavior | overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15 | 0 | 0 | 1 | 0 | 133048ms | 133048ms | ## opencode/big-pickle -Readiness score: 100. +Readiness score: 54. -Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1. +Score stability: consistency=100, min=35, max=35, spread=0, stdDev=0, samples=1. -Recommendation blockers: -. +Recommendation blockers: overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15. -Weighted stage impact: -. +Weighted stage impact: concurrentReplies:loss=15, failed=1, pass=0/1 (0%); peerRelayAB:loss=15, failed=1, pass=0/1 (0%); peerRelayBC:loss=15, failed=1, pass=0/1 (0%); cleanTranscript:loss=10, failed=1, pass=0/1 (0%); latencyStable:loss=5, failed=1, pass=0/1 (0%). -Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%). +Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:0/1 (0%), peerRelayBC:0/1 (0%), concurrentReplies:0/1 (0%), taskRefs:1/1 (100%), cleanTranscript:0/1 (0%), noDuplicateTokens:0/1 (0%), latencyStable:0/1 (0%). -TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%). +TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:n/a, peerRelayBC:n/a, concurrentBob:n/a, concurrentTom:n/a. Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0. | Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | | ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | -| 1 | passed | none | 100 | yes | 132968ms | - | launchBootstrap:49092ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6 | +| 1 | behavioral-fail | model-behavior | 35 | yes | 133048ms | peerRelayAB, peerRelayBC, concurrentReplies, cleanTranscript, noDuplicateTokens, latencyStable | launchBootstrap:23735ms | directReply:ok | - | Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending | diff --git a/test/main/services/team/OpenCodePromptDeliveryLedger.test.ts b/test/main/services/team/OpenCodePromptDeliveryLedger.test.ts index c5e4f08d..7efb904c 100644 --- a/test/main/services/team/OpenCodePromptDeliveryLedger.test.ts +++ b/test/main/services/team/OpenCodePromptDeliveryLedger.test.ts @@ -591,7 +591,9 @@ describe('OpenCodePromptDeliveryLedger', () => { ).toBe(true); for (const reason of [ 'opencode_prompt_delivery_session_refresh_scheduled', + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled', 'opencode_session_refresh_scheduled_after_resolved_behavior_changed', + 'OpenCode API error: opencode_session_refresh_scheduled_after_resolved_behavior_changed', 'OpenCode session refresh scheduled after resolved behavior changed', 'OpenCode session changed; refreshing the session before retry.', ]) { @@ -602,6 +604,12 @@ describe('OpenCodePromptDeliveryLedger', () => { }) ).toBe(true); } + expect( + isOpenCodeSessionRefreshResponseState({ + responseState: 'pending', + reason: 'OpenCode API erroropencode_prompt_delivery_session_refresh_scheduled', + }) + ).toBe(false); expect( isOpenCodeSessionRefreshResponseState({ diagnostics: [ diff --git a/test/main/services/team/OpenCodeRuntimeDeliveryDiagnostics.test.ts b/test/main/services/team/OpenCodeRuntimeDeliveryDiagnostics.test.ts index 59ded236..6b1962e9 100644 --- a/test/main/services/team/OpenCodeRuntimeDeliveryDiagnostics.test.ts +++ b/test/main/services/team/OpenCodeRuntimeDeliveryDiagnostics.test.ts @@ -131,6 +131,38 @@ describe('OpenCodeRuntimeDeliveryDiagnostics', () => { ); }); + it('treats generic-prefixed legacy prompt-delivery refresh scheduled diagnostics as session refresh', () => { + const record = { + diagnostics: [ + 'OpenCode API error', + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled', + ], + lastReason: 'OpenCode API error', + responseState: 'not_observed', + status: 'retry_scheduled', + } as Parameters[0]; + + expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe( + 'OpenCode session changed; refreshing the session before retry.' + ); + }); + + it('treats generic-prefixed resolved-behavior refresh scheduled diagnostics as session refresh', () => { + const record = { + diagnostics: [ + 'OpenCode API error', + 'OpenCode API error. opencode_session_refresh_scheduled_after_resolved_behavior_changed', + ], + lastReason: 'OpenCode API error', + responseState: 'not_observed', + status: 'retry_scheduled', + } as Parameters[0]; + + expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe( + 'OpenCode session changed; refreshing the session before retry.' + ); + }); + it('treats colon-terminated generic OpenCode API errors plus clean refresh evidence as session refresh', () => { const record = { diagnostics: ['OpenCode API error:', 'resolved_behavior_changed:old->new'], @@ -144,6 +176,19 @@ describe('OpenCodeRuntimeDeliveryDiagnostics', () => { ); }); + it('does not treat generic-prefixed refresh scheduled diagnostics with failure details as session refresh', () => { + const reason = + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled permission denied'; + const record = { + diagnostics: ['OpenCode API error', reason], + lastReason: 'OpenCode API error', + responseState: 'reconcile_failed', + status: 'failed_retryable', + } as Parameters[0]; + + expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe(reason); + }); + it('keeps real failure diagnostics above generic OpenCode API error plus refresh evidence', () => { const record = { diagnostics: ['OpenCode API error', 'resolved_behavior_changed:old->new', 'permission denied'], diff --git a/test/main/services/team/RuntimeDiagnosticClassifier.test.ts b/test/main/services/team/RuntimeDiagnosticClassifier.test.ts index 382d3776..09e5e2c8 100644 --- a/test/main/services/team/RuntimeDiagnosticClassifier.test.ts +++ b/test/main/services/team/RuntimeDiagnosticClassifier.test.ts @@ -208,6 +208,26 @@ describe('RuntimeDiagnosticClassifier', () => { generic: true, actionRequired: false, }); + expect( + classifyRuntimeDiagnostic( + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled' + ) + ).toMatchObject({ + reasonCode: 'backend_error', + normalizedMessage: 'OpenCode session changed; refreshing the session before retry.', + generic: true, + actionRequired: false, + }); + expect( + classifyRuntimeDiagnostic( + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.' + ) + ).toMatchObject({ + reasonCode: 'backend_error', + normalizedMessage: 'OpenCode session changed; refreshing the session before retry.', + generic: true, + actionRequired: false, + }); }); it('does not classify refresh markers with unknown extra text as clean refresh', () => { diff --git a/test/renderer/utils/memberHelpers.test.ts b/test/renderer/utils/memberHelpers.test.ts index ec8dc27c..cec9fa57 100644 --- a/test/renderer/utils/memberHelpers.test.ts +++ b/test/renderer/utils/memberHelpers.test.ts @@ -981,6 +981,8 @@ describe('memberHelpers spawn-aware presence', () => { 'OpenCode session is stale (resolved_behavior_changed:old->new); reading historical messages for log projection only', 'opencode_app_mcp_transport_changed:old->new', 'opencode_prompt_delivery_session_refresh_scheduled', + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled', + 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.', 'OpenCode session refresh scheduled after resolved behavior changed', 'opencode_session_refresh_scheduled_after_resolved_behavior_changed', ])('renders recoverable OpenCode session refresh advisory %s as a warning', (message) => { diff --git a/test/renderer/utils/memberLaunchDiagnostics.test.ts b/test/renderer/utils/memberLaunchDiagnostics.test.ts index f10c73ce..fa4fb4db 100644 --- a/test/renderer/utils/memberLaunchDiagnostics.test.ts +++ b/test/renderer/utils/memberLaunchDiagnostics.test.ts @@ -278,7 +278,7 @@ describe('member launch diagnostics', () => { kind: 'api_error', observedAt: '2026-05-18T08:31:46.075Z', reasonCode: 'backend_error', - message: 'opencode_prompt_delivery_session_refresh_scheduled', + message: 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.', }, }); @@ -312,6 +312,25 @@ describe('member launch diagnostics', () => { expect(hasMemberLaunchDiagnosticsError(payload)).toBe(false); }); + it('treats member card errors from runtime advisory as diagnostics errors', () => { + const payload = buildMemberLaunchDiagnosticsPayload({ + memberName: 'tom', + member: { name: 'tom', providerId: 'opencode' }, + runtimeAdvisoryLabel: 'OpenCode API error', + runtimeAdvisoryTitle: 'OpenCode API error', + runtimeAdvisory: { + kind: 'api_error', + observedAt: '2026-05-18T08:31:46.075Z', + reasonCode: 'backend_error', + message: 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled permission denied', + }, + }); + + expect(payload.memberCardError).toBe('OpenCode API error'); + expect(hasMemberLaunchDiagnosticsError(payload)).toBe(true); + expect(getMemberLaunchDiagnosticsErrorMessage(payload)).toBe('OpenCode API error'); + }); + it('does not treat OpenCode response-state names inside refresh markers as card errors', () => { const payload = buildMemberLaunchDiagnosticsPayload({ memberName: 'tom',