test(opencode): update live semantic model results

This commit is contained in:
777genius 2026-05-27 12:14:56 +03:00
parent ebcc0e717f
commit 5046d80fdf
3 changed files with 39 additions and 15 deletions

View file

@ -1,5 +1,5 @@
{ {
"generatedAt": "2026-05-20T15:44:19.975Z", "generatedAt": "2026-05-27T08:11:47.513Z",
"runsPerModel": 1, "runsPerModel": 1,
"qualification": { "qualification": {
"minimumAverageScore": 80, "minimumAverageScore": 80,
@ -25,8 +25,8 @@
"runtimeTransportFailures": 0, "runtimeTransportFailures": 0,
"modelBehaviorFailures": 0, "modelBehaviorFailures": 0,
"harnessFailures": 0, "harnessFailures": 0,
"p50DurationMs": 201184, "p50DurationMs": 129420,
"p95DurationMs": 201184, "p95DurationMs": 129420,
"stagePassRates": { "stagePassRates": {
"launchBootstrap": { "launchBootstrap": {
"passed": 1, "passed": 1,
@ -217,16 +217,16 @@
"outcome": "passed", "outcome": "passed",
"failureCategory": "none", "failureCategory": "none",
"primaryFailure": null, "primaryFailure": null,
"durationMs": 201184, "durationMs": 129420,
"hardFailure": false, "hardFailure": false,
"stageDurationsMs": { "stageDurationsMs": {
"setup": 322, "setup": 168,
"launchBootstrap": 44102, "launchBootstrap": 31364,
"materializeTasks": 40, "materializeTasks": 29,
"directReply": 20838, "directReply": 15080,
"peerRelayAB": 41022, "peerRelayAB": 31900,
"peerRelayBC": 47832, "peerRelayBC": 29178,
"concurrentReplies": 29138, "concurrentReplies": 20867,
"hygiene": 1 "hygiene": 1
}, },
"stageFailures": {}, "stageFailures": {},
@ -253,7 +253,7 @@
"latencyStable": true "latencyStable": true
}, },
"diagnostics": [ "diagnostics": [
"runId=85e7ecb6-0767-4606-90d2-c926937b22f5" "runId=37f103a7-cae5-4d48-b578-56cbabb466d9"
] ]
} }
] ]

View file

@ -1,6 +1,6 @@
# OpenCode Model Gauntlet Results # OpenCode Model Gauntlet Results
Generated: 2026-05-20T15:44:19.975Z Generated: 2026-05-27T08:11:47.513Z
Runs per model: 1 Runs per model: 1
Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0 Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0
@ -13,7 +13,7 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC
| Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 | | Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 |
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201184ms | 201184ms | | `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 129420ms | 129420ms |
## opencode/big-pickle ## opencode/big-pickle
@ -33,5 +33,5 @@ Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0.
| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics | | Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics |
| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- | | ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- |
| 1 | passed | none | 100 | yes | 201184ms | - | peerRelayBC:47832ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=85e7ecb6-0767-4606-90d2-c926937b22f5 | | 1 | passed | none | 100 | yes | 129420ms | - | peerRelayAB:31900ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=37f103a7-cae5-4d48-b578-56cbabb466d9 |

View file

@ -0,0 +1,24 @@
{
"generatedAt": "2026-05-27T08:11:34.489Z",
"models": [
{
"model": "opencode/big-pickle",
"passed": true,
"score": 100,
"durationMs": 86233,
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelay": true,
"taskRefs": true,
"longPrompt": true,
"latencyStable": true
},
"diagnostics": [
"runId=5a90cf2a-d00e-4e26-a514-5efecf1914af",
"directDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"d4065728-b244-4e53-a8cf-d33d7de62a6f\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}",
"peerDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"37187280-1220-44da-a5d3-a3fdf812cc3a\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}"
]
}
]
}