test(opencode): update live semantic model results
This commit is contained in:
parent
ebcc0e717f
commit
5046d80fdf
3 changed files with 39 additions and 15 deletions
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generatedAt": "2026-05-20T15:44:19.975Z",
|
||||
"generatedAt": "2026-05-27T08:11:47.513Z",
|
||||
"runsPerModel": 1,
|
||||
"qualification": {
|
||||
"minimumAverageScore": 80,
|
||||
|
|
@ -25,8 +25,8 @@
|
|||
"runtimeTransportFailures": 0,
|
||||
"modelBehaviorFailures": 0,
|
||||
"harnessFailures": 0,
|
||||
"p50DurationMs": 201184,
|
||||
"p95DurationMs": 201184,
|
||||
"p50DurationMs": 129420,
|
||||
"p95DurationMs": 129420,
|
||||
"stagePassRates": {
|
||||
"launchBootstrap": {
|
||||
"passed": 1,
|
||||
|
|
@ -217,16 +217,16 @@
|
|||
"outcome": "passed",
|
||||
"failureCategory": "none",
|
||||
"primaryFailure": null,
|
||||
"durationMs": 201184,
|
||||
"durationMs": 129420,
|
||||
"hardFailure": false,
|
||||
"stageDurationsMs": {
|
||||
"setup": 322,
|
||||
"launchBootstrap": 44102,
|
||||
"materializeTasks": 40,
|
||||
"directReply": 20838,
|
||||
"peerRelayAB": 41022,
|
||||
"peerRelayBC": 47832,
|
||||
"concurrentReplies": 29138,
|
||||
"setup": 168,
|
||||
"launchBootstrap": 31364,
|
||||
"materializeTasks": 29,
|
||||
"directReply": 15080,
|
||||
"peerRelayAB": 31900,
|
||||
"peerRelayBC": 29178,
|
||||
"concurrentReplies": 20867,
|
||||
"hygiene": 1
|
||||
},
|
||||
"stageFailures": {},
|
||||
|
|
@ -253,7 +253,7 @@
|
|||
"latencyStable": true
|
||||
},
|
||||
"diagnostics": [
|
||||
"runId=85e7ecb6-0767-4606-90d2-c926937b22f5"
|
||||
"runId=37f103a7-cae5-4d48-b578-56cbabb466d9"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# OpenCode Model Gauntlet Results
|
||||
|
||||
Generated: 2026-05-20T15:44:19.975Z
|
||||
Generated: 2026-05-27T08:11:47.513Z
|
||||
|
||||
Runs per model: 1
|
||||
Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0
|
||||
|
|
@ -13,7 +13,7 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC
|
|||
|
||||
| Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 |
|
||||
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201184ms | 201184ms |
|
||||
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 129420ms | 129420ms |
|
||||
|
||||
## opencode/big-pickle
|
||||
|
||||
|
|
@ -33,5 +33,5 @@ Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0.
|
|||
|
||||
| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics |
|
||||
| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- |
|
||||
| 1 | passed | none | 100 | yes | 201184ms | - | peerRelayBC:47832ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=85e7ecb6-0767-4606-90d2-c926937b22f5 |
|
||||
| 1 | passed | none | 100 | yes | 129420ms | - | peerRelayAB:31900ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=37f103a7-cae5-4d48-b578-56cbabb466d9 |
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"generatedAt": "2026-05-27T08:11:34.489Z",
|
||||
"models": [
|
||||
{
|
||||
"model": "opencode/big-pickle",
|
||||
"passed": true,
|
||||
"score": 100,
|
||||
"durationMs": 86233,
|
||||
"stages": {
|
||||
"launchBootstrap": true,
|
||||
"directReply": true,
|
||||
"peerRelay": true,
|
||||
"taskRefs": true,
|
||||
"longPrompt": true,
|
||||
"latencyStable": true
|
||||
},
|
||||
"diagnostics": [
|
||||
"runId=5a90cf2a-d00e-4e26-a514-5efecf1914af",
|
||||
"directDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"d4065728-b244-4e53-a8cf-d33d7de62a6f\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}",
|
||||
"peerDelivery={\"delivered\":true,\"accepted\":true,\"responsePending\":false,\"responseState\":\"responded_visible_message\",\"ledgerStatus\":\"responded\",\"visibleReplyMessageId\":\"37187280-1220-44da-a5d3-a3fdf812cc3a\",\"visibleReplyCorrelation\":\"relayOfMessageId\",\"diagnostics\":[\"OpenCode app MCP is connected for message delivery.\",\"OpenCode prompt_async accepted after a turn-settled guard; response observation remains delegated to durable app-side ledger reconciliation.\"]}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
Reference in a new issue