diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json index 6e72f050..bb815c19 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json @@ -1,5 +1,5 @@ { - "generatedAt": "2026-05-20T15:44:19.975Z", + "generatedAt": "2026-05-27T08:11:47.513Z", "runsPerModel": 1, "qualification": { "minimumAverageScore": 80, @@ -25,8 +25,8 @@ "runtimeTransportFailures": 0, "modelBehaviorFailures": 0, "harnessFailures": 0, - "p50DurationMs": 201184, - "p95DurationMs": 201184, + "p50DurationMs": 129420, + "p95DurationMs": 129420, "stagePassRates": { "launchBootstrap": { "passed": 1, @@ -217,16 +217,16 @@ "outcome": "passed", "failureCategory": "none", "primaryFailure": null, - "durationMs": 201184, + "durationMs": 129420, "hardFailure": false, "stageDurationsMs": { - "setup": 322, - "launchBootstrap": 44102, - "materializeTasks": 40, - "directReply": 20838, - "peerRelayAB": 41022, - "peerRelayBC": 47832, - "concurrentReplies": 29138, + "setup": 168, + "launchBootstrap": 31364, + "materializeTasks": 29, + "directReply": 15080, + "peerRelayAB": 31900, + "peerRelayBC": 29178, + "concurrentReplies": 20867, "hygiene": 1 }, "stageFailures": {}, @@ -253,7 +253,7 @@ "latencyStable": true }, "diagnostics": [ - "runId=85e7ecb6-0767-4606-90d2-c926937b22f5" + "runId=37f103a7-cae5-4d48-b578-56cbabb466d9" ] } ] diff --git a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md index 0c4f989e..51f7194a 100644 --- a/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md +++ b/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.md @@ -1,6 +1,6 @@ # OpenCode Model Gauntlet Results -Generated: 2026-05-20T15:44:19.975Z +Generated: 2026-05-27T08:11:47.513Z Runs per model: 1 Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0 @@ -13,7 +13,7 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC | Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 | | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201184ms | 201184ms | +| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 129420ms | 129420ms | ## opencode/big-pickle @@ -33,5 +33,5 @@ Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0. | Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | | ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | -| 1 | passed | none | 100 | yes | 201184ms | - | peerRelayBC:47832ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=85e7ecb6-0767-4606-90d2-c926937b22f5 | +| 1 | passed | none | 100 | yes | 129420ms | - | peerRelayAB:31900ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=37f103a7-cae5-4d48-b578-56cbabb466d9 | diff --git a/test-results/opencode-semantic-model-matrix/report-1779869494489.json b/test-results/opencode-semantic-model-matrix/report-1779869494489.json new file mode 100644 index 00000000..1e1f1670 --- /dev/null +++ b/test-results/opencode-semantic-model-matrix/report-1779869494489.json @@ -0,0 +1,24 @@ +{ + "generatedAt": "2026-05-27T08:11:34.489Z", + "models": [ + { + "model": "opencode/big-pickle", + "passed": true, + "score": 100, + "durationMs": 86233, + "stages": { + "launchBootstrap": true, + "directReply": true, + "peerRelay": true, + "taskRefs": true, + "longPrompt": true, + "latencyStable": true + }, + "diagnostics": [ + "runId=5a90cf2a-d00e-4e26-a514-5efecf1914af", + "directDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"d4065728-b244-4e53-a8cf-d33d7de62a6f\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}", + "peerDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"37187280-1220-44da-a5d3-a3fdf812cc3a\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}" + ] + } + ] +}