agent-ecosystem/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json

262 lines
6.1 KiB
JSON

{
"generatedAt": "2026-05-27T08:11:47.513Z",
"runsPerModel": 1,
"qualification": {
"minimumAverageScore": 80,
"minimumSuccessfulRuns": 1,
"minimumConsistencyScore": 85,
"requireNoHardFailures": true
},
"models": [
{
"model": "opencode/big-pickle",
"verdict": "recommended",
"confidence": "low",
"qualified": true,
"readinessScore": 100,
"averageScore": 100,
"consistencyScore": 100,
"behavioralAverageScore": 100,
"minScore": 100,
"successfulRuns": 1,
"countedRuns": 1,
"hardFailures": 0,
"providerInfraFailures": 0,
"runtimeTransportFailures": 0,
"modelBehaviorFailures": 0,
"harnessFailures": 0,
"p50DurationMs": 129420,
"p95DurationMs": 129420,
"stagePassRates": {
"launchBootstrap": {
"passed": 1,
"total": 1,
"rate": 100
},
"directReply": {
"passed": 1,
"total": 1,
"rate": 100
},
"peerRelayAB": {
"passed": 1,
"total": 1,
"rate": 100
},
"peerRelayBC": {
"passed": 1,
"total": 1,
"rate": 100
},
"concurrentReplies": {
"passed": 1,
"total": 1,
"rate": 100
},
"taskRefs": {
"passed": 1,
"total": 1,
"rate": 100
},
"cleanTranscript": {
"passed": 1,
"total": 1,
"rate": 100
},
"noDuplicateTokens": {
"passed": 1,
"total": 1,
"rate": 100
},
"latencyStable": {
"passed": 1,
"total": 1,
"rate": 100
}
},
"taskRefPassRates": {
"directReply": {
"passed": 1,
"total": 1,
"rate": 100
},
"peerRelayAB": {
"passed": 1,
"total": 1,
"rate": 100
},
"peerRelayBC": {
"passed": 1,
"total": 1,
"rate": 100
},
"concurrentBob": {
"passed": 1,
"total": 1,
"rate": 100
},
"concurrentTom": {
"passed": 1,
"total": 1,
"rate": 100
}
},
"protocolViolationTotals": {
"badMessages": 0,
"duplicateOrMissingTokens": 0,
"affectedRuns": 0
},
"stageFailureImpact": [
{
"stage": "cleanTranscript",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "concurrentReplies",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "directReply",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "latencyStable",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "launchBootstrap",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "noDuplicateTokens",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "peerRelayAB",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "peerRelayBC",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
},
{
"stage": "taskRefs",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 1,
"total": 1,
"rate": 100
}
}
],
"scoreStability": {
"sampleSize": 1,
"minScore": 100,
"maxScore": 100,
"spread": 0,
"standardDeviation": 0,
"consistencyScore": 100
},
"dominantFailureCategory": "none",
"recommendationBlockers": [],
"runs": [
{
"runIndex": 1,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 129420,
"hardFailure": false,
"stageDurationsMs": {
"setup": 168,
"launchBootstrap": 31364,
"materializeTasks": 29,
"directReply": 15080,
"peerRelayAB": 31900,
"peerRelayBC": 29178,
"concurrentReplies": 20867,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=37f103a7-cae5-4d48-b578-56cbabb466d9"
]
}
]
}
]
}