test(opencode): harden deep live launch smoke

This commit is contained in:
777genius 2026-05-20 18:24:13 +03:00
parent a88e9f74b7
commit 187a2697f7
3 changed files with 125 additions and 129 deletions

View file

@ -1,5 +1,5 @@
{
"generatedAt": "2026-05-18T13:16:59.867Z",
"generatedAt": "2026-05-20T15:19:19.600Z",
"runsPerModel": 1,
"qualification": {
"minimumAverageScore": 80,
@ -10,23 +10,23 @@
"models": [
{
"model": "opencode/big-pickle",
"verdict": "tested-only",
"verdict": "recommended",
"confidence": "low",
"qualified": false,
"readinessScore": 54,
"averageScore": 35,
"qualified": true,
"readinessScore": 100,
"averageScore": 100,
"consistencyScore": 100,
"behavioralAverageScore": 35,
"minScore": 35,
"successfulRuns": 0,
"behavioralAverageScore": 100,
"minScore": 100,
"successfulRuns": 1,
"countedRuns": 1,
"hardFailures": 1,
"hardFailures": 0,
"providerInfraFailures": 0,
"runtimeTransportFailures": 0,
"modelBehaviorFailures": 1,
"modelBehaviorFailures": 0,
"harnessFailures": 0,
"p50DurationMs": 133048,
"p95DurationMs": 133048,
"p50DurationMs": 201546,
"p95DurationMs": 201546,
"stagePassRates": {
"launchBootstrap": {
"passed": 1,
@ -39,19 +39,19 @@
"rate": 100
},
"peerRelayAB": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
},
"peerRelayBC": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
},
"concurrentReplies": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
},
"taskRefs": {
"passed": 1,
@ -59,19 +59,19 @@
"rate": 100
},
"cleanTranscript": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
},
"noDuplicateTokens": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
},
"latencyStable": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
}
},
"taskRefPassRates": {
@ -81,24 +81,24 @@
"rate": 100
},
"peerRelayAB": {
"passed": 0,
"total": 0,
"rate": null
"passed": 1,
"total": 1,
"rate": 100
},
"peerRelayBC": {
"passed": 0,
"total": 0,
"rate": null
"passed": 1,
"total": 1,
"rate": 100
},
"concurrentBob": {
"passed": 0,
"total": 0,
"rate": null
"passed": 1,
"total": 1,
"rate": 100
},
"concurrentTom": {
"passed": 0,
"total": 0,
"rate": null
"passed": 1,
"total": 1,
"rate": 100
}
},
"protocolViolationTotals": {
@ -107,64 +107,24 @@
"affectedRuns": 0
},
"stageFailureImpact": [
{
"stage": "concurrentReplies",
"failedRuns": 1,
"weightedLoss": 15,
"passRate": {
"passed": 0,
"total": 1,
"rate": 0
}
},
{
"stage": "peerRelayAB",
"failedRuns": 1,
"weightedLoss": 15,
"passRate": {
"passed": 0,
"total": 1,
"rate": 0
}
},
{
"stage": "peerRelayBC",
"failedRuns": 1,
"weightedLoss": 15,
"passRate": {
"passed": 0,
"total": 1,
"rate": 0
}
},
{
"stage": "cleanTranscript",
"failedRuns": 1,
"weightedLoss": 10,
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
"rate": 100
}
},
{
"stage": "latencyStable",
"failedRuns": 1,
"weightedLoss": 5,
"stage": "concurrentReplies",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 0,
"passed": 1,
"total": 1,
"rate": 0
}
},
{
"stage": "noDuplicateTokens",
"failedRuns": 1,
"weightedLoss": 5,
"passRate": {
"passed": 0,
"total": 1,
"rate": 0
"rate": 100
}
},
{
@ -177,6 +137,16 @@
"rate": 100
}
},
{
"stage": "latencyStable",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "launchBootstrap",
"failedRuns": 0,
@ -187,6 +157,36 @@
"rate": 100
}
},
{
"stage": "noDuplicateTokens",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "peerRelayAB",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "peerRelayBC",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "taskRefs",
"failedRuns": 0,
@ -200,45 +200,42 @@
],
"scoreStability": {
"sampleSize": 1,
"minScore": 35,
"maxScore": 35,
"minScore": 100,
"maxScore": 100,
"spread": 0,
"standardDeviation": 0,
"consistencyScore": 100
},
"dominantFailureCategory": "model-behavior",
"recommendationBlockers": [
"overall average 35 < 80",
"behavioral average 35 < 80",
"successful runs 0 < 1",
"hard failures 1",
"model-behavior failures 1",
"highest weighted stage loss concurrentReplies=15"
],
"dominantFailureCategory": "none",
"recommendationBlockers": [],
"runs": [
{
"runIndex": 1,
"passed": false,
"score": 35,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "behavioral-fail",
"failureCategory": "model-behavior",
"primaryFailure": "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending",
"durationMs": 133048,
"hardFailure": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 201546,
"hardFailure": false,
"stageDurationsMs": {
"setup": 371,
"launchBootstrap": 23735,
"materializeTasks": 36,
"directReply": 17327
"setup": 171,
"launchBootstrap": 41905,
"materializeTasks": 39,
"directReply": 29109,
"peerRelayAB": 45148,
"peerRelayBC": 39967,
"concurrentReplies": 28807,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": null,
"peerRelayBC": null,
"concurrentBob": null,
"concurrentTom": null
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
@ -247,17 +244,16 @@
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": false,
"peerRelayBC": false,
"concurrentReplies": false,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": false,
"noDuplicateTokens": false,
"latencyStable": false
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=9fb17ac5-fc66-4b01-831e-90a04b1e2304",
"Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending"
"runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3"
]
}
]

View file

@ -1,6 +1,6 @@
# OpenCode Model Gauntlet Results
Generated: 2026-05-18T13:16:59.867Z
Generated: 2026-05-20T15:19:19.600Z
Runs per model: 1
Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0
@ -13,25 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC
| Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 |
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
| `opencode/big-pickle` | Tested only | low | 54 | 100 | 0 | 35 | 35 | 1/1 | 0/1 | cleanTranscript 0/1 (0%) | directReply 1/1 (100%) | model-behavior | overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15 | 0 | 0 | 1 | 0 | 133048ms | 133048ms |
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 201546ms | 201546ms |
## opencode/big-pickle
Readiness score: 54.
Readiness score: 100.
Score stability: consistency=100, min=35, max=35, spread=0, stdDev=0, samples=1.
Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1.
Recommendation blockers: overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15.
Recommendation blockers: -.
Weighted stage impact: concurrentReplies:loss=15, failed=1, pass=0/1 (0%); peerRelayAB:loss=15, failed=1, pass=0/1 (0%); peerRelayBC:loss=15, failed=1, pass=0/1 (0%); cleanTranscript:loss=10, failed=1, pass=0/1 (0%); latencyStable:loss=5, failed=1, pass=0/1 (0%).
Weighted stage impact: -.
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:0/1 (0%), peerRelayBC:0/1 (0%), concurrentReplies:0/1 (0%), taskRefs:1/1 (100%), cleanTranscript:0/1 (0%), noDuplicateTokens:0/1 (0%), latencyStable:0/1 (0%).
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%).
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:n/a, peerRelayBC:n/a, concurrentBob:n/a, concurrentTom:n/a.
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%).
Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0.
| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics |
| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- |
| 1 | behavioral-fail | model-behavior | 35 | yes | 133048ms | peerRelayAB, peerRelayBC, concurrentReplies, cleanTranscript, noDuplicateTokens, latencyStable | launchBootstrap:23735ms | directReply:ok | - | Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending |
| 1 | passed | none | 100 | yes | 201546ms | - | peerRelayAB:45148ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=1d6a50c3-c5cc-4c1e-91a0-d0e34a2229a3 |

View file

@ -78,7 +78,7 @@ liveDescribe('OpenCode accept-fast delivery live e2e', () => {
} else {
await fs.rm(tempDir, { recursive: true, force: true });
}
});
}, 90_000);
it(
'returns after prompt acceptance and later projects exact-session task logs',