test(opencode): harden deep live launch smoke
This commit is contained in:
parent
a88e9f74b7
commit
187a2697f7
3 changed files with 125 additions and 129 deletions
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generatedAt": "2026-05-18T13:16:59.867Z",
|
||||
"generatedAt": "2026-05-20T15:19:19.600Z",
|
||||
"runsPerModel": 1,
|
||||
"qualification": {
|
||||
"minimumAverageScore": 80,
|
||||
|
|
@ -10,23 +10,23 @@
|
|||
"models": [
|
||||
{
|
||||
"model": "opencode/big-pickle",
|
||||
"verdict": "tested-only",
|
||||
"verdict": "recommended",
|
||||
"confidence": "low",
|
||||
"qualified": false,
|
||||
"readinessScore": 54,
|
||||
"averageScore": 35,
|
||||
"qualified": true,
|
||||
"readinessScore": 100,
|
||||
"averageScore": 100,
|
||||
"consistencyScore": 100,
|
||||
"behavioralAverageScore": 35,
|
||||
"minScore": 35,
|
||||
"successfulRuns": 0,
|
||||
"behavioralAverageScore": 100,
|
||||
"minScore": 100,
|
||||
"successfulRuns": 1,
|
||||
"countedRuns": 1,
|
||||
"hardFailures": 1,
|
||||
"hardFailures": 0,
|
||||
"providerInfraFailures": 0,
|
||||
"runtimeTransportFailures": 0,
|
||||
"modelBehaviorFailures": 1,
|
||||
"modelBehaviorFailures": 0,
|
||||
"harnessFailures": 0,
|
||||
"p50DurationMs": 133048,
|
||||
"p95DurationMs": 133048,
|
||||
"p50DurationMs": 201546,
|
||||
"p95DurationMs": 201546,
|
||||
"stagePassRates": {
|
||||
"launchBootstrap": {
|
||||
"passed": 1,
|
||||
|
|
@ -39,19 +39,19 @@
|
|||
"rate": 100
|
||||
},
|
||||
"peerRelayAB": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
},
|
||||
"peerRelayBC": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
},
|
||||
"concurrentReplies": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
},
|
||||
"taskRefs": {
|
||||
"passed": 1,
|
||||
|
|
@ -59,19 +59,19 @@
|
|||
"rate": 100
|
||||
},
|
||||
"cleanTranscript": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
},
|
||||
"noDuplicateTokens": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
},
|
||||
"latencyStable": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
"taskRefPassRates": {
|
||||
|
|
@ -81,24 +81,24 @@
|
|||
"rate": 100
|
||||
},
|
||||
"peerRelayAB": {
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
},
|
||||
"peerRelayBC": {
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
},
|
||||
"concurrentBob": {
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
},
|
||||
"concurrentTom": {
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
"protocolViolationTotals": {
|
||||
|
|
@ -107,64 +107,24 @@
|
|||
"affectedRuns": 0
|
||||
},
|
||||
"stageFailureImpact": [
|
||||
{
|
||||
"stage": "concurrentReplies",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayAB",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayBC",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "cleanTranscript",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 10,
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "latencyStable",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 5,
|
||||
"stage": "concurrentReplies",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "noDuplicateTokens",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 5,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
|
|
@ -177,6 +137,16 @@
|
|||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "latencyStable",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "launchBootstrap",
|
||||
"failedRuns": 0,
|
||||
|
|
@ -187,6 +157,36 @@
|
|||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "noDuplicateTokens",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayAB",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayBC",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "taskRefs",
|
||||
"failedRuns": 0,
|
||||
|
|
@ -200,45 +200,42 @@
|
|||
],
|
||||
"scoreStability": {
|
||||
"sampleSize": 1,
|
||||
"minScore": 35,
|
||||
"maxScore": 35,
|
||||
"minScore": 100,
|
||||
"maxScore": 100,
|
||||
"spread": 0,
|
||||
"standardDeviation": 0,
|
||||
"consistencyScore": 100
|
||||
},
|
||||
"dominantFailureCategory": "model-behavior",
|
||||
"recommendationBlockers": [
|
||||
"overall average 35 < 80",
|
||||
"behavioral average 35 < 80",
|
||||
"successful runs 0 < 1",
|
||||
"hard failures 1",
|
||||
"model-behavior failures 1",
|
||||
"highest weighted stage loss concurrentReplies=15"
|
||||
],
|
||||
"dominantFailureCategory": "none",
|
||||
"recommendationBlockers": [],
|
||||
"runs": [
|
||||
{
|
||||
"runIndex": 1,
|
||||
"passed": false,
|
||||
"score": 35,
|
||||
"passed": true,
|
||||
"score": 100,
|
||||
"countedForRecommendation": true,
|
||||
"outcome": "behavioral-fail",
|
||||
"failureCategory": "model-behavior",
|
||||
"primaryFailure": "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending",
|
||||
"durationMs": 133048,
|
||||
"hardFailure": true,
|
||||
"outcome": "passed",
|
||||
"failureCategory": "none",
|
||||
"primaryFailure": null,
|
||||
"durationMs": 201546,
|
||||
"hardFailure": false,
|
||||
"stageDurationsMs": {
|
||||
"setup": 371,
|
||||
"launchBootstrap": 23735,
|
||||
"materializeTasks": 36,
|
||||
"directReply": 17327
|
||||
"setup": 171,
|
||||
"launchBootstrap": 41905,
|
||||
"materializeTasks": 39,
|
||||
"directReply": 29109,
|
||||
"peerRelayAB": 45148,
|
||||
"peerRelayBC": 39967,
|
||||
"concurrentReplies": 28807,
|
||||
"hygiene": 1
|
||||
},
|
||||
"stageFailures": {},
|
||||
"taskRefChecks": {
|
||||
"directReply": true,
|
||||
"peerRelayAB": null,
|
||||
"peerRelayBC": null,
|
||||
"concurrentBob": null,
|
||||
"concurrentTom": null
|
||||
"peerRelayAB": true,
|
||||
"peerRelayBC": true,
|
||||
"concurrentBob": true,
|
||||
"concurrentTom": true
|
||||
},
|
||||
"protocolViolations": {
|
||||
"badMessages": 0,
|
||||
|
|
@ -247,17 +244,16 @@
|
|||
"stages": {
|
||||
"launchBootstrap": true,
|
||||
"directReply": true,
|
||||
"peerRelayAB": false,
|
||||
"peerRelayBC": false,
|
||||
"concurrentReplies": false,
|
||||
"peerRelayAB": true,
|
||||
"peerRelayBC": true,
|
||||
"concurrentReplies": true,
|
||||
"taskRefs": true,
|
||||
"cleanTranscript": false,
|
||||
"noDuplicateTokens": false,
|
||||
"latencyStable": false
|
||||
"cleanTranscript": true,
|
||||
"noDuplicateTokens": true,
|
||||
"latencyStable": true
|
||||
},
|
||||
"diagnostics": [
|
||||
"runId=9fb17ac5-fc66-4b01-831e-90a04b1e2304",
|
||||
"Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending"
|
||||
"runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
# OpenCode Model Gauntlet Results
|
||||
|
||||
Generated: 2026-05-18T13:16:59.867Z
|
||||
Generated: 2026-05-20T15:19:19.600Z
|
||||
|
||||
Runs per model: 1
|
||||
Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0
|
||||
|
|
@ -13,25 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC
|
|||
|
||||
| Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 |
|
||||
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `opencode/big-pickle` | Tested only | low | 54 | 100 | 0 | 35 | 35 | 1/1 | 0/1 | cleanTranscript 0/1 (0%) | directReply 1/1 (100%) | model-behavior | overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15 | 0 | 0 | 1 | 0 | 133048ms | 133048ms |
|
||||
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201546ms | 201546ms |
|
||||
|
||||
## opencode/big-pickle
|
||||
|
||||
Readiness score: 54.
|
||||
Readiness score: 100.
|
||||
|
||||
Score stability: consistency=100, min=35, max=35, spread=0, stdDev=0, samples=1.
|
||||
Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1.
|
||||
|
||||
Recommendation blockers: overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15.
|
||||
Recommendation blockers: -.
|
||||
|
||||
Weighted stage impact: concurrentReplies:loss=15, failed=1, pass=0/1 (0%); peerRelayAB:loss=15, failed=1, pass=0/1 (0%); peerRelayBC:loss=15, failed=1, pass=0/1 (0%); cleanTranscript:loss=10, failed=1, pass=0/1 (0%); latencyStable:loss=5, failed=1, pass=0/1 (0%).
|
||||
Weighted stage impact: -.
|
||||
|
||||
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:0/1 (0%), peerRelayBC:0/1 (0%), concurrentReplies:0/1 (0%), taskRefs:1/1 (100%), cleanTranscript:0/1 (0%), noDuplicateTokens:0/1 (0%), latencyStable:0/1 (0%).
|
||||
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%).
|
||||
|
||||
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:n/a, peerRelayBC:n/a, concurrentBob:n/a, concurrentTom:n/a.
|
||||
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%).
|
||||
|
||||
Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0.
|
||||
|
||||
| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics |
|
||||
| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- |
|
||||
| 1 | behavioral-fail | model-behavior | 35 | yes | 133048ms | peerRelayAB, peerRelayBC, concurrentReplies, cleanTranscript, noDuplicateTokens, latencyStable | launchBootstrap:23735ms | directReply:ok | - | Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending |
|
||||
| 1 | passed | none | 100 | yes | 201546ms | - | peerRelayAB:45148ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3 |
|
||||
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ liveDescribe('OpenCode accept-fast delivery live e2e', () => {
|
|||
} else {
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
}, 90_000);
|
||||
|
||||
it(
|
||||
'returns after prompt acceptance and later projects exact-session task logs',
|
||||
|
|
|
|||
Loading…
Reference in a new issue