diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json index 2804f982..1a73aaae 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json @@ -1,5 +1,5 @@ { - "generatedAt": "2026-05-18T13:16:59.867Z", + "generatedAt": "2026-05-20T15:19:19.600Z", "runsPerModel": 1, "qualification": { "minimumAverageScore": 80, @@ -10,23 +10,23 @@ "models": [ { "model": "opencode/big-pickle", - "verdict": "tested-only", + "verdict": "recommended", "confidence": "low", - "qualified": false, - "readinessScore": 54, - "averageScore": 35, + "qualified": true, + "readinessScore": 100, + "averageScore": 100, "consistencyScore": 100, - "behavioralAverageScore": 35, - "minScore": 35, - "successfulRuns": 0, + "behavioralAverageScore": 100, + "minScore": 100, + "successfulRuns": 1, "countedRuns": 1, - "hardFailures": 1, + "hardFailures": 0, "providerInfraFailures": 0, "runtimeTransportFailures": 0, - "modelBehaviorFailures": 1, + "modelBehaviorFailures": 0, "harnessFailures": 0, - "p50DurationMs": 133048, - "p95DurationMs": 133048, + "p50DurationMs": 201546, + "p95DurationMs": 201546, "stagePassRates": { "launchBootstrap": { "passed": 1, @@ -39,19 +39,19 @@ "rate": 100 }, "peerRelayAB": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 }, "peerRelayBC": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 }, "concurrentReplies": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 }, "taskRefs": { "passed": 1, @@ -59,19 +59,19 @@ "rate": 100 }, "cleanTranscript": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 }, "noDuplicateTokens": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 }, "latencyStable": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 } }, "taskRefPassRates": { @@ -81,24 +81,24 @@ "rate": 100 }, "peerRelayAB": { - "passed": 0, - "total": 0, - "rate": null + "passed": 1, + "total": 1, + "rate": 100 }, "peerRelayBC": { - "passed": 0, - "total": 0, - "rate": null + "passed": 1, + "total": 1, + "rate": 100 }, "concurrentBob": { - "passed": 0, - "total": 0, - "rate": null + "passed": 1, + "total": 1, + "rate": 100 }, "concurrentTom": { - "passed": 0, - "total": 0, - "rate": null + "passed": 1, + "total": 1, + "rate": 100 } }, "protocolViolationTotals": { @@ -107,64 +107,24 @@ "affectedRuns": 0 }, "stageFailureImpact": [ - { - "stage": "concurrentReplies", - "failedRuns": 1, - "weightedLoss": 15, - "passRate": { - "passed": 0, - "total": 1, - "rate": 0 - } - }, - { - "stage": "peerRelayAB", - "failedRuns": 1, - "weightedLoss": 15, - "passRate": { - "passed": 0, - "total": 1, - "rate": 0 - } - }, - { - "stage": "peerRelayBC", - "failedRuns": 1, - "weightedLoss": 15, - "passRate": { - "passed": 0, - "total": 1, - "rate": 0 - } - }, { "stage": "cleanTranscript", - "failedRuns": 1, - "weightedLoss": 10, + "failedRuns": 0, + "weightedLoss": 0, "passRate": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 + "rate": 100 } }, { - "stage": "latencyStable", - "failedRuns": 1, - "weightedLoss": 5, + "stage": "concurrentReplies", + "failedRuns": 0, + "weightedLoss": 0, "passRate": { - "passed": 0, + "passed": 1, "total": 1, - "rate": 0 - } - }, - { - "stage": "noDuplicateTokens", - "failedRuns": 1, - "weightedLoss": 5, - "passRate": { - "passed": 0, - "total": 1, - "rate": 0 + "rate": 100 } }, { @@ -177,6 +137,16 @@ "rate": 100 } }, + { + "stage": "latencyStable", + "failedRuns": 0, + "weightedLoss": 0, + "passRate": { + "passed": 1, + "total": 1, + "rate": 100 + } + }, { "stage": "launchBootstrap", "failedRuns": 0, @@ -187,6 +157,36 @@ "rate": 100 } }, + { + "stage": "noDuplicateTokens", + "failedRuns": 0, + "weightedLoss": 0, + "passRate": { + "passed": 1, + "total": 1, + "rate": 100 + } + }, + { + "stage": "peerRelayAB", + "failedRuns": 0, + "weightedLoss": 0, + "passRate": { + "passed": 1, + "total": 1, + "rate": 100 + } + }, + { + "stage": "peerRelayBC", + "failedRuns": 0, + "weightedLoss": 0, + "passRate": { + "passed": 1, + "total": 1, + "rate": 100 + } + }, { "stage": "taskRefs", "failedRuns": 0, @@ -200,45 +200,42 @@ ], "scoreStability": { "sampleSize": 1, - "minScore": 35, - "maxScore": 35, + "minScore": 100, + "maxScore": 100, "spread": 0, "standardDeviation": 0, "consistencyScore": 100 }, - "dominantFailureCategory": "model-behavior", - "recommendationBlockers": [ - "overall average 35 < 80", - "behavioral average 35 < 80", - "successful runs 0 < 1", - "hard failures 1", - "model-behavior failures 1", - "highest weighted stage loss concurrentReplies=15" - ], + "dominantFailureCategory": "none", + "recommendationBlockers": [], "runs": [ { "runIndex": 1, - "passed": false, - "score": 35, + "passed": true, + "score": 100, "countedForRecommendation": true, - "outcome": "behavioral-fail", - "failureCategory": "model-behavior", - "primaryFailure": "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending", - "durationMs": 133048, - "hardFailure": true, + "outcome": "passed", + "failureCategory": "none", + "primaryFailure": null, + "durationMs": 201546, + "hardFailure": false, "stageDurationsMs": { - "setup": 371, - "launchBootstrap": 23735, - "materializeTasks": 36, - "directReply": 17327 + "setup": 171, + "launchBootstrap": 41905, + "materializeTasks": 39, + "directReply": 29109, + "peerRelayAB": 45148, + "peerRelayBC": 39967, + "concurrentReplies": 28807, + "hygiene": 1 }, "stageFailures": {}, "taskRefChecks": { "directReply": true, - "peerRelayAB": null, - "peerRelayBC": null, - "concurrentBob": null, - "concurrentTom": null + "peerRelayAB": true, + "peerRelayBC": true, + "concurrentBob": true, + "concurrentTom": true }, "protocolViolations": { "badMessages": 0, @@ -247,17 +244,16 @@ "stages": { "launchBootstrap": true, "directReply": true, - "peerRelayAB": false, - "peerRelayBC": false, - "concurrentReplies": false, + "peerRelayAB": true, + "peerRelayBC": true, + "concurrentReplies": true, "taskRefs": true, - "cleanTranscript": false, - "noDuplicateTokens": false, - "latencyStable": false + "cleanTranscript": true, + "noDuplicateTokens": true, + "latencyStable": true }, "diagnostics": [ - "runId=9fb17ac5-fc66-4b01-831e-90a04b1e2304", - "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending" + "runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3" ] } ] diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md index 93862987..b43a451c 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md @@ -1,6 +1,6 @@ # OpenCode Model Gauntlet Results -Generated: 2026-05-18T13:16:59.867Z +Generated: 2026-05-20T15:19:19.600Z Runs per model: 1 Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0 @@ -13,25 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC | Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 | | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `opencode/big-pickle` | Tested only | low | 54 | 100 | 0 | 35 | 35 | 1/1 | 0/1 | cleanTranscript 0/1 (0%) | directReply 1/1 (100%) | model-behavior | overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15 | 0 | 0 | 1 | 0 | 133048ms | 133048ms | +| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201546ms | 201546ms | ## opencode/big-pickle -Readiness score: 54. +Readiness score: 100. -Score stability: consistency=100, min=35, max=35, spread=0, stdDev=0, samples=1. +Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1. -Recommendation blockers: overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15. +Recommendation blockers: -. -Weighted stage impact: concurrentReplies:loss=15, failed=1, pass=0/1 (0%); peerRelayAB:loss=15, failed=1, pass=0/1 (0%); peerRelayBC:loss=15, failed=1, pass=0/1 (0%); cleanTranscript:loss=10, failed=1, pass=0/1 (0%); latencyStable:loss=5, failed=1, pass=0/1 (0%). +Weighted stage impact: -. -Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:0/1 (0%), peerRelayBC:0/1 (0%), concurrentReplies:0/1 (0%), taskRefs:1/1 (100%), cleanTranscript:0/1 (0%), noDuplicateTokens:0/1 (0%), latencyStable:0/1 (0%). +Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%). -TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:n/a, peerRelayBC:n/a, concurrentBob:n/a, concurrentTom:n/a. +TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%). Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0. | Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | | ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | -| 1 | behavioral-fail | model-behavior | 35 | yes | 133048ms | peerRelayAB, peerRelayBC, concurrentReplies, cleanTranscript, noDuplicateTokens, latencyStable | launchBootstrap:23735ms | directReply:ok | - | Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending | +| 1 | passed | none | 100 | yes | 201546ms | - | peerRelayAB:45148ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3 | diff --git a/test/main/services/team/OpenCodeAcceptFastDelivery.live-e2e.test.ts b/test/main/services/team/OpenCodeAcceptFastDelivery.live-e2e.test.ts index ce320b64..2d234a28 100644 --- a/test/main/services/team/OpenCodeAcceptFastDelivery.live-e2e.test.ts +++ b/test/main/services/team/OpenCodeAcceptFastDelivery.live-e2e.test.ts @@ -78,7 +78,7 @@ liveDescribe('OpenCode accept-fast delivery live e2e', () => { } else { await fs.rm(tempDir, { recursive: true, force: true }); } - }); + }, 90_000); it( 'returns after prompt acceptance and later projects exact-session task logs',