agent-ecosystem/test-results/opencode-semantic-model-gauntlet/model-gauntlet-results.json

710 lines
18 KiB
JSON

{
"generatedAt": "2026-05-09T23:16:07.760Z",
"runsPerModel": 3,
"qualification": {
"minimumAverageScore": 90,
"minimumSuccessfulRuns": 3,
"minimumConsistencyScore": 85,
"requireNoHardFailures": true
},
"models": [
{
"model": "opencode/big-pickle",
"verdict": "recommended",
"confidence": "high",
"qualified": true,
"readinessScore": 100,
"averageScore": 100,
"consistencyScore": 100,
"behavioralAverageScore": 100,
"minScore": 100,
"successfulRuns": 3,
"countedRuns": 3,
"hardFailures": 0,
"providerInfraFailures": 0,
"runtimeTransportFailures": 0,
"modelBehaviorFailures": 0,
"harnessFailures": 0,
"p50DurationMs": 112355,
"p95DurationMs": 116891,
"stagePassRates": {
"launchBootstrap": {
"passed": 3,
"total": 3,
"rate": 100
},
"directReply": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayAB": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayBC": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentReplies": {
"passed": 3,
"total": 3,
"rate": 100
},
"taskRefs": {
"passed": 3,
"total": 3,
"rate": 100
},
"cleanTranscript": {
"passed": 3,
"total": 3,
"rate": 100
},
"noDuplicateTokens": {
"passed": 3,
"total": 3,
"rate": 100
},
"latencyStable": {
"passed": 3,
"total": 3,
"rate": 100
}
},
"taskRefPassRates": {
"directReply": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayAB": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayBC": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentBob": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentTom": {
"passed": 3,
"total": 3,
"rate": 100
}
},
"protocolViolationTotals": {
"badMessages": 0,
"duplicateOrMissingTokens": 0,
"affectedRuns": 0
},
"stageFailureImpact": [
{
"stage": "cleanTranscript",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "concurrentReplies",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "directReply",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "latencyStable",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "launchBootstrap",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "noDuplicateTokens",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "peerRelayAB",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "peerRelayBC",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "taskRefs",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
}
],
"scoreStability": {
"sampleSize": 3,
"minScore": 100,
"maxScore": 100,
"spread": 0,
"standardDeviation": 0,
"consistencyScore": 100
},
"dominantFailureCategory": "none",
"recommendationBlockers": [],
"runs": [
{
"runIndex": 1,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 112344,
"hardFailure": false,
"stageDurationsMs": {
"setup": 183,
"launchBootstrap": 19933,
"materializeTasks": 35,
"directReply": 15430,
"peerRelayAB": 25001,
"peerRelayBC": 28154,
"concurrentReplies": 15551,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=d9d27eb0-2798-4980-a0fa-f082a6edd705"
]
},
{
"runIndex": 2,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 112355,
"hardFailure": false,
"stageDurationsMs": {
"setup": 11,
"launchBootstrap": 18682,
"materializeTasks": 36,
"directReply": 15126,
"peerRelayAB": 24835,
"peerRelayBC": 28580,
"concurrentReplies": 17164,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=97364154-e06d-460c-94ae-65b73cb1b6f9"
]
},
{
"runIndex": 3,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 116891,
"hardFailure": false,
"stageDurationsMs": {
"setup": 8,
"launchBootstrap": 18926,
"materializeTasks": 31,
"directReply": 17061,
"peerRelayAB": 27842,
"peerRelayBC": 27262,
"concurrentReplies": 15437,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=7bdd4b2e-dbd6-4474-a8a0-9418df433671"
]
}
]
},
{
"model": "opencode/minimax-m2.5-free",
"verdict": "strong-candidate",
"confidence": "high",
"qualified": false,
"readinessScore": 88.6,
"averageScore": 98.3,
"consistencyScore": 93.1,
"behavioralAverageScore": 98.3,
"minScore": 95,
"successfulRuns": 2,
"countedRuns": 3,
"hardFailures": 1,
"providerInfraFailures": 0,
"runtimeTransportFailures": 0,
"modelBehaviorFailures": 1,
"harnessFailures": 0,
"p50DurationMs": 108862,
"p95DurationMs": 118757,
"stagePassRates": {
"launchBootstrap": {
"passed": 3,
"total": 3,
"rate": 100
},
"directReply": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayAB": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayBC": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentReplies": {
"passed": 3,
"total": 3,
"rate": 100
},
"taskRefs": {
"passed": 3,
"total": 3,
"rate": 100
},
"cleanTranscript": {
"passed": 3,
"total": 3,
"rate": 100
},
"noDuplicateTokens": {
"passed": 2,
"total": 3,
"rate": 66.7
},
"latencyStable": {
"passed": 3,
"total": 3,
"rate": 100
}
},
"taskRefPassRates": {
"directReply": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayAB": {
"passed": 3,
"total": 3,
"rate": 100
},
"peerRelayBC": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentBob": {
"passed": 3,
"total": 3,
"rate": 100
},
"concurrentTom": {
"passed": 3,
"total": 3,
"rate": 100
}
},
"protocolViolationTotals": {
"badMessages": 0,
"duplicateOrMissingTokens": 2,
"affectedRuns": 1
},
"stageFailureImpact": [
{
"stage": "noDuplicateTokens",
"failedRuns": 1,
"weightedLoss": 5,
"passRate": {
"passed": 2,
"total": 3,
"rate": 66.7
}
},
{
"stage": "cleanTranscript",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "concurrentReplies",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "directReply",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "latencyStable",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "launchBootstrap",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "peerRelayAB",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "peerRelayBC",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
},
{
"stage": "taskRefs",
"failedRuns": 0,
"weightedLoss": 0,
"passRate": {
"passed": 3,
"total": 3,
"rate": 100
}
}
],
"scoreStability": {
"sampleSize": 3,
"minScore": 95,
"maxScore": 100,
"spread": 5,
"standardDeviation": 2.4,
"consistencyScore": 93.1
},
"dominantFailureCategory": "model-behavior",
"recommendationBlockers": [
"successful runs 2 < 3",
"hard failures 1",
"model-behavior failures 1",
"highest weighted stage loss noDuplicateTokens=5",
"protocol violations in 1 runs"
],
"runs": [
{
"runIndex": 1,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 91530,
"hardFailure": false,
"stageDurationsMs": {
"setup": 10,
"launchBootstrap": 18716,
"materializeTasks": 31,
"directReply": 11557,
"peerRelayAB": 16323,
"peerRelayBC": 27370,
"concurrentReplies": 9606,
"hygiene": 1
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=23ae85d2-e79d-41c9-93a6-e843acea6d9e"
]
},
{
"runIndex": 2,
"passed": true,
"score": 100,
"countedForRecommendation": true,
"outcome": "passed",
"failureCategory": "none",
"primaryFailure": null,
"durationMs": 108862,
"hardFailure": false,
"stageDurationsMs": {
"setup": 10,
"launchBootstrap": 18359,
"materializeTasks": 35,
"directReply": 7236,
"peerRelayAB": 30664,
"peerRelayBC": 26124,
"concurrentReplies": 18477,
"hygiene": 0
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": []
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": true,
"latencyStable": true
},
"diagnostics": [
"runId=c3a55d8a-4028-4af7-9e1a-8ae8c87a95e5"
]
},
{
"runIndex": 3,
"passed": false,
"score": 95,
"countedForRecommendation": true,
"outcome": "behavioral-fail",
"failureCategory": "model-behavior",
"primaryFailure": "duplicateOrMissingTokens=GAUNTLET_JACK_USER_OK_3,GAUNTLET_TOM_USER_OK_3",
"durationMs": 118757,
"hardFailure": true,
"stageDurationsMs": {
"setup": 9,
"launchBootstrap": 19986,
"materializeTasks": 37,
"directReply": 8036,
"peerRelayAB": 37430,
"peerRelayBC": 36219,
"concurrentReplies": 8551,
"hygiene": 0
},
"stageFailures": {},
"taskRefChecks": {
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentBob": true,
"concurrentTom": true
},
"protocolViolations": {
"badMessages": 0,
"duplicateOrMissingTokens": [
"GAUNTLET_JACK_USER_OK_3",
"GAUNTLET_TOM_USER_OK_3"
]
},
"stages": {
"launchBootstrap": true,
"directReply": true,
"peerRelayAB": true,
"peerRelayBC": true,
"concurrentReplies": true,
"taskRefs": true,
"cleanTranscript": true,
"noDuplicateTokens": false,
"latencyStable": true
},
"diagnostics": [
"runId=2b0610e0-7b10-49fc-88dd-ab30b37abce9",
"duplicateOrMissingTokens=GAUNTLET_JACK_USER_OK_3,GAUNTLET_TOM_USER_OK_3"
]
}
]
}
]
}