fix(opencode): classify session refresh diagnostics
This commit is contained in:
parent
20a8e69c4c
commit
55dfc5db14
12 changed files with 302 additions and 169 deletions
|
|
@ -890,7 +890,7 @@ function isCleanOpenCodeSessionRefreshReason(
|
|||
if (!pattern.test(normalized)) {
|
||||
return false;
|
||||
}
|
||||
const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
|
||||
const markerText = stripOpenCodeGenericApiErrorPrefix(normalized);
|
||||
if (hasOpenCodeSessionRefreshFailureConflict(markerText)) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -924,11 +924,9 @@ function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean
|
|||
}
|
||||
|
||||
function isOpenCodeSessionRefreshScheduledReason(message: string | null | undefined): boolean {
|
||||
const normalized =
|
||||
message
|
||||
?.trim()
|
||||
.toLowerCase()
|
||||
.replace(/[.:\s-]+$/, '') ?? '';
|
||||
const normalized = stripOpenCodeGenericApiErrorPrefix(
|
||||
message?.trim().toLowerCase() ?? ''
|
||||
).replace(/[.:\s-]+$/, '');
|
||||
return (
|
||||
normalized === 'opencode prompt delivery session refresh scheduled' ||
|
||||
normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
|
|
@ -938,6 +936,10 @@ function isOpenCodeSessionRefreshScheduledReason(message: string | null | undefi
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Removes a leading generic "opencode api error" label from a message.
 *
 * The label is stripped only when it is followed by at least one separator
 * character (`.`, `:`, `-` or whitespace) or when it makes up the whole
 * message. Text that merely begins with the same letters, such as
 * "opencode api errorfoo", is returned unchanged.
 *
 * Matching is case-insensitive and anchored at the very start of the string,
 * so leading whitespace is not skipped; pass an already-trimmed message.
 *
 * @param message - Diagnostic or reason text to clean up.
 * @returns The message without the generic prefix and its trailing separators.
 */
function stripOpenCodeGenericApiErrorPrefix(message: string): string {
  return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
}
|
||||
|
||||
function hasOpenCodeSessionRefreshFailureConflict(value: string): boolean {
|
||||
return OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(
|
||||
value.replace(OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN, 'state')
|
||||
|
|
|
|||
|
|
@ -51,6 +51,32 @@ function isPlainGenericOpenCodeApiError(message: string): boolean {
|
|||
);
|
||||
}
|
||||
|
||||
/**
 * Reports whether a diagnostic is exactly one of the known "session refresh
 * scheduled" markers.
 *
 * The message is trimmed and lower-cased, a leading generic
 * "opencode api error" prefix is removed, and trailing `.`, `:`, `-` and
 * whitespace characters are dropped before comparison. Any other extra text
 * around the marker makes the comparison fail, because only exact matches
 * are accepted.
 *
 * @param message - Raw diagnostic text.
 * @returns `true` when the normalized text equals one of the five markers.
 */
function isOpenCodeRuntimeDeliverySessionRefreshScheduledDiagnostic(message: string): boolean {
  const normalized = stripOpenCodeGenericApiErrorPrefix(message.trim().toLowerCase()).replace(
    /[.:\s-]+$/,
    ''
  );
  return (
    normalized === 'opencode prompt delivery session refresh scheduled' ||
    normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
    normalized === 'opencode session refresh scheduled after resolved behavior changed' ||
    normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ||
    normalized === 'opencode session changed; refreshing the session before retry'
  );
}
|
||||
|
||||
/**
 * Drops a generic "opencode api error" label from the start of a message.
 *
 * The label must be followed by one or more separators (`.`, `:`, `-`,
 * whitespace) or be the entire message; otherwise the input is returned
 * untouched. The match is case-insensitive and anchored at index 0, so the
 * caller should trim the message beforehand.
 *
 * @param message - Text that may begin with the generic API error label.
 * @returns The text with the label and its trailing separators removed.
 */
function stripOpenCodeGenericApiErrorPrefix(message: string): string {
  return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
}
|
||||
|
||||
/**
 * Reports whether a diagnostic counts as clean session-refresh evidence.
 *
 * A diagnostic qualifies when any one of three checks accepts it: the
 * "refresh scheduled" marker check, the resolved-behavior-changed check, or
 * the session-transport-changed check. The latter two predicates are defined
 * outside this excerpt, so their exact matching rules are not described here.
 *
 * @param message - Diagnostic text to classify.
 * @returns `true` when at least one of the three predicates returns `true`.
 */
function isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic(message: string): boolean {
  return (
    isOpenCodeRuntimeDeliverySessionRefreshScheduledDiagnostic(message) ||
    isOpenCodeResolvedBehaviorChangedReason(message) ||
    isOpenCodeSessionTransportChangedReason(message)
  );
}
|
||||
|
||||
function isInformationalOpenCodeRuntimeDeliveryDiagnostic(
|
||||
message: string | null | undefined
|
||||
): boolean {
|
||||
|
|
@ -61,12 +87,7 @@ function isInformationalOpenCodeRuntimeDeliveryDiagnostic(
|
|||
'opencode prompt_async accepted; response observation will continue through durable app-side ledger reconciliation.' ||
|
||||
normalized === 'opencode session status busy' ||
|
||||
normalized === 'opencode_delivery_response_pending' ||
|
||||
normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ||
|
||||
Boolean(
|
||||
isOpenCodeResolvedBehaviorChangedReason(normalized) ||
|
||||
isOpenCodeSessionTransportChangedReason(normalized)
|
||||
)
|
||||
Boolean(normalized && isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic(normalized))
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -85,11 +106,7 @@ function getOpenCodeRuntimeDeliveryStateFallback(
|
|||
const diagnostics = record.diagnostics.map((diagnostic) => diagnostic.trim().toLowerCase());
|
||||
const diagnosticText = diagnostics.join('\n');
|
||||
const hasCleanSessionRefreshDiagnostic = diagnostics.some(
|
||||
(diagnostic) =>
|
||||
diagnostic === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
diagnostic === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed' ||
|
||||
isOpenCodeResolvedBehaviorChangedReason(diagnostic) ||
|
||||
isOpenCodeSessionTransportChangedReason(diagnostic)
|
||||
isOpenCodeRuntimeDeliveryCleanSessionRefreshDiagnostic
|
||||
);
|
||||
if (state === 'empty_assistant_turn' || normalizedReason === 'empty_assistant_turn') {
|
||||
return 'OpenCode returned an empty assistant turn.';
|
||||
|
|
|
|||
|
|
@ -41,19 +41,20 @@ const OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN =
|
|||
|
||||
function isCleanOpenCodeSessionRefreshDiagnostic(message: string): boolean {
|
||||
const normalized = message.trim().toLowerCase();
|
||||
const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized);
|
||||
const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, '');
|
||||
if (
|
||||
normalized === 'opencode session changed; refreshing the session before retry' ||
|
||||
normalized === 'opencode session changed; refreshing the session before retry.' ||
|
||||
normalized === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
refreshMarkerText === 'opencode session changed; refreshing the session before retry' ||
|
||||
refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) {
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) {
|
||||
return false;
|
||||
}
|
||||
const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
|
||||
const markerText = refreshText;
|
||||
if (hasOpenCodeSessionRefreshFailureConflict(markerText)) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -65,6 +66,10 @@ function isCleanOpenCodeSessionRefreshDiagnostic(message: string): boolean {
|
|||
return isBenignOpenCodeSessionRefreshRemainder(rawRemainder);
|
||||
}
|
||||
|
||||
/**
 * Strips the generic "opencode api error" label from the front of a message.
 *
 * Only a label followed by separator characters (`.`, `:`, `-` or
 * whitespace), or a message consisting of the label alone, is removed. The
 * comparison ignores case and starts at the first character, so untrimmed
 * input with leading spaces is returned as-is.
 *
 * @param message - Message that may carry the generic API error label.
 * @returns The remaining text after the label and separators are removed.
 */
function stripOpenCodeGenericApiErrorPrefix(message: string): string {
  return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
}
|
||||
|
||||
function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean {
|
||||
if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) {
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -400,20 +400,21 @@ const OPENCODE_SESSION_REFRESH_SAFE_MARKER_STATE_PATTERN =
|
|||
|
||||
function isRecoverableOpenCodeSessionRefreshMessage(message: string | undefined): boolean {
|
||||
const normalized = message?.trim().toLowerCase() ?? '';
|
||||
const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized);
|
||||
const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, '');
|
||||
if (
|
||||
normalized === 'session_stale' ||
|
||||
normalized === 'opencode session changed; refreshing the session before retry' ||
|
||||
normalized === 'opencode session changed; refreshing the session before retry.' ||
|
||||
normalized === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
refreshMarkerText === 'session_stale' ||
|
||||
refreshMarkerText === 'opencode session changed; refreshing the session before retry' ||
|
||||
refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) {
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) {
|
||||
return false;
|
||||
}
|
||||
const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
|
||||
const markerText = refreshText;
|
||||
if (hasOpenCodeSessionRefreshFailureConflict(markerText)) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -429,6 +430,10 @@ function isRecoverableOpenCodeSessionRefreshMessage(message: string | undefined)
|
|||
return staleLogProjectionContext && isBenignOpenCodeSessionRefreshRemainder(rawRemainder);
|
||||
}
|
||||
|
||||
/**
 * Returns the message with any leading generic "opencode api error" label
 * removed.
 *
 * The label is recognized case-insensitively at the start of the string and
 * must be followed by one or more of `.`, `:`, `-` or whitespace, or be the
 * end of the string. Messages without such a prefix come back unchanged.
 *
 * @param message - Already-trimmed message text.
 * @returns The message minus the label and the separators that follow it.
 */
function stripOpenCodeGenericApiErrorPrefix(message: string): string {
  return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
}
|
||||
|
||||
function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean {
|
||||
if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) {
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -166,21 +166,22 @@ function isRuntimeDiagnosticCardError(params: {
|
|||
|
||||
function isRecoverableOpenCodeSessionRefreshText(value: string | undefined): boolean {
|
||||
const normalized = value?.trim().toLowerCase() ?? '';
|
||||
const refreshText = stripOpenCodeGenericApiErrorPrefix(normalized);
|
||||
const refreshMarkerText = refreshText.replace(/[.:\s-]+$/, '');
|
||||
if (
|
||||
normalized === 'session_stale' ||
|
||||
normalized === 'opencode session refresh' ||
|
||||
normalized === 'opencode session changed; refreshing the session before retry' ||
|
||||
normalized === 'opencode session changed; refreshing the session before retry.' ||
|
||||
normalized === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
normalized === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
normalized === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
refreshMarkerText === 'session_stale' ||
|
||||
refreshMarkerText === 'opencode session refresh' ||
|
||||
refreshMarkerText === 'opencode session changed; refreshing the session before retry' ||
|
||||
refreshMarkerText === 'opencode session refresh scheduled after resolved behavior changed' ||
|
||||
refreshMarkerText === 'opencode_prompt_delivery_session_refresh_scheduled' ||
|
||||
refreshMarkerText === 'opencode_session_refresh_scheduled_after_resolved_behavior_changed'
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(normalized)) {
|
||||
if (!OPENCODE_SESSION_REFRESH_REASON_PATTERN.test(refreshText)) {
|
||||
return false;
|
||||
}
|
||||
const markerText = normalized.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
|
||||
const markerText = refreshText;
|
||||
if (hasOpenCodeSessionRefreshFailureConflict(markerText)) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -196,6 +197,10 @@ function isRecoverableOpenCodeSessionRefreshText(value: string | undefined): boo
|
|||
return staleLogProjectionContext && isBenignOpenCodeSessionRefreshRemainder(rawRemainder);
|
||||
}
|
||||
|
||||
/**
 * Removes the generic "opencode api error" label when it opens a message.
 *
 * A match requires the label at the very start (case-insensitive), followed
 * either by a run of separators (`.`, `:`, `-`, whitespace) or by the end of
 * the string. Anything else, including a label glued directly to further
 * text, leaves the message unchanged.
 *
 * @param message - Text to strip; expected to be trimmed by the caller.
 * @returns The text following the label, or the original text if no label matched.
 */
function stripOpenCodeGenericApiErrorPrefix(message: string): string {
  return message.replace(/^opencode api error(?:[.:\s-]+|$)/i, '');
}
|
||||
|
||||
function isBenignOpenCodeSessionRefreshRemainder(rawRemainder: string): boolean {
|
||||
if (OPENCODE_SESSION_REFRESH_FAILURE_PATTERN.test(rawRemainder)) {
|
||||
return false;
|
||||
|
|
@ -774,6 +779,7 @@ export function hasMemberLaunchDiagnosticsError(payload: MemberLaunchDiagnostics
|
|||
return false;
|
||||
}
|
||||
return Boolean(
|
||||
payload.memberCardError ||
|
||||
payload.spawnStatus === 'error' ||
|
||||
payload.launchState === 'failed_to_start' ||
|
||||
payload.runtimeDiagnosticSeverity === 'error'
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
"generatedAt": "2026-05-14T06:34:47.601Z",
|
||||
"generatedAt": "2026-05-18T13:16:59.867Z",
|
||||
"runsPerModel": 1,
|
||||
"qualification": {
|
||||
"minimumAverageScore": 70,
|
||||
"minimumAverageScore": 80,
|
||||
"minimumSuccessfulRuns": 1,
|
||||
"minimumConsistencyScore": 85,
|
||||
"requireNoHardFailures": true
|
||||
|
|
@ -10,23 +10,23 @@
|
|||
"models": [
|
||||
{
|
||||
"model": "opencode/big-pickle",
|
||||
"verdict": "recommended",
|
||||
"verdict": "tested-only",
|
||||
"confidence": "low",
|
||||
"qualified": true,
|
||||
"readinessScore": 100,
|
||||
"averageScore": 100,
|
||||
"qualified": false,
|
||||
"readinessScore": 54,
|
||||
"averageScore": 35,
|
||||
"consistencyScore": 100,
|
||||
"behavioralAverageScore": 100,
|
||||
"minScore": 100,
|
||||
"successfulRuns": 1,
|
||||
"behavioralAverageScore": 35,
|
||||
"minScore": 35,
|
||||
"successfulRuns": 0,
|
||||
"countedRuns": 1,
|
||||
"hardFailures": 0,
|
||||
"hardFailures": 1,
|
||||
"providerInfraFailures": 0,
|
||||
"runtimeTransportFailures": 0,
|
||||
"modelBehaviorFailures": 0,
|
||||
"modelBehaviorFailures": 1,
|
||||
"harnessFailures": 0,
|
||||
"p50DurationMs": 132968,
|
||||
"p95DurationMs": 132968,
|
||||
"p50DurationMs": 133048,
|
||||
"p95DurationMs": 133048,
|
||||
"stagePassRates": {
|
||||
"launchBootstrap": {
|
||||
"passed": 1,
|
||||
|
|
@ -39,19 +39,19 @@
|
|||
"rate": 100
|
||||
},
|
||||
"peerRelayAB": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
},
|
||||
"peerRelayBC": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
},
|
||||
"concurrentReplies": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
},
|
||||
"taskRefs": {
|
||||
"passed": 1,
|
||||
|
|
@ -59,19 +59,19 @@
|
|||
"rate": 100
|
||||
},
|
||||
"cleanTranscript": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
},
|
||||
"noDuplicateTokens": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
},
|
||||
"latencyStable": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
"taskRefPassRates": {
|
||||
|
|
@ -81,24 +81,24 @@
|
|||
"rate": 100
|
||||
},
|
||||
"peerRelayAB": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
},
|
||||
"peerRelayBC": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
},
|
||||
"concurrentBob": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
},
|
||||
"concurrentTom": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"passed": 0,
|
||||
"total": 0,
|
||||
"rate": null
|
||||
}
|
||||
},
|
||||
"protocolViolationTotals": {
|
||||
|
|
@ -108,23 +108,63 @@
|
|||
},
|
||||
"stageFailureImpact": [
|
||||
{
|
||||
"stage": "cleanTranscript",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"stage": "concurrentReplies",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "concurrentReplies",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"stage": "peerRelayAB",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayBC",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 15,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "cleanTranscript",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 10,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "latencyStable",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 5,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "noDuplicateTokens",
|
||||
"failedRuns": 1,
|
||||
"weightedLoss": 5,
|
||||
"passRate": {
|
||||
"passed": 0,
|
||||
"total": 1,
|
||||
"rate": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
|
|
@ -137,16 +177,6 @@
|
|||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "latencyStable",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "launchBootstrap",
|
||||
"failedRuns": 0,
|
||||
|
|
@ -157,36 +187,6 @@
|
|||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "noDuplicateTokens",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayAB",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "peerRelayBC",
|
||||
"failedRuns": 0,
|
||||
"weightedLoss": 0,
|
||||
"passRate": {
|
||||
"passed": 1,
|
||||
"total": 1,
|
||||
"rate": 100
|
||||
}
|
||||
},
|
||||
{
|
||||
"stage": "taskRefs",
|
||||
"failedRuns": 0,
|
||||
|
|
@ -200,42 +200,45 @@
|
|||
],
|
||||
"scoreStability": {
|
||||
"sampleSize": 1,
|
||||
"minScore": 100,
|
||||
"maxScore": 100,
|
||||
"minScore": 35,
|
||||
"maxScore": 35,
|
||||
"spread": 0,
|
||||
"standardDeviation": 0,
|
||||
"consistencyScore": 100
|
||||
},
|
||||
"dominantFailureCategory": "none",
|
||||
"recommendationBlockers": [],
|
||||
"dominantFailureCategory": "model-behavior",
|
||||
"recommendationBlockers": [
|
||||
"overall average 35 < 80",
|
||||
"behavioral average 35 < 80",
|
||||
"successful runs 0 < 1",
|
||||
"hard failures 1",
|
||||
"model-behavior failures 1",
|
||||
"highest weighted stage loss concurrentReplies=15"
|
||||
],
|
||||
"runs": [
|
||||
{
|
||||
"runIndex": 1,
|
||||
"passed": true,
|
||||
"score": 100,
|
||||
"passed": false,
|
||||
"score": 35,
|
||||
"countedForRecommendation": true,
|
||||
"outcome": "passed",
|
||||
"failureCategory": "none",
|
||||
"primaryFailure": null,
|
||||
"durationMs": 132968,
|
||||
"hardFailure": false,
|
||||
"outcome": "behavioral-fail",
|
||||
"failureCategory": "model-behavior",
|
||||
"primaryFailure": "Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending",
|
||||
"durationMs": 133048,
|
||||
"hardFailure": true,
|
||||
"stageDurationsMs": {
|
||||
"setup": 2770,
|
||||
"launchBootstrap": 49092,
|
||||
"materializeTasks": 85,
|
||||
"directReply": 13760,
|
||||
"peerRelayAB": 22730,
|
||||
"peerRelayBC": 21484,
|
||||
"concurrentReplies": 14023,
|
||||
"hygiene": 1
|
||||
"setup": 371,
|
||||
"launchBootstrap": 23735,
|
||||
"materializeTasks": 36,
|
||||
"directReply": 17327
|
||||
},
|
||||
"stageFailures": {},
|
||||
"taskRefChecks": {
|
||||
"directReply": true,
|
||||
"peerRelayAB": true,
|
||||
"peerRelayBC": true,
|
||||
"concurrentBob": true,
|
||||
"concurrentTom": true
|
||||
"peerRelayAB": null,
|
||||
"peerRelayBC": null,
|
||||
"concurrentBob": null,
|
||||
"concurrentTom": null
|
||||
},
|
||||
"protocolViolations": {
|
||||
"badMessages": 0,
|
||||
|
|
@ -244,16 +247,17 @@
|
|||
"stages": {
|
||||
"launchBootstrap": true,
|
||||
"directReply": true,
|
||||
"peerRelayAB": true,
|
||||
"peerRelayBC": true,
|
||||
"concurrentReplies": true,
|
||||
"peerRelayAB": false,
|
||||
"peerRelayBC": false,
|
||||
"concurrentReplies": false,
|
||||
"taskRefs": true,
|
||||
"cleanTranscript": true,
|
||||
"noDuplicateTokens": true,
|
||||
"latencyStable": true
|
||||
"cleanTranscript": false,
|
||||
"noDuplicateTokens": false,
|
||||
"latencyStable": false
|
||||
},
|
||||
"diagnostics": [
|
||||
"runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6"
|
||||
"runId=9fb17ac5-fc66-4b01-831e-90a04b1e2304",
|
||||
"Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending"
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
# OpenCode Model Gauntlet Results
|
||||
|
||||
Generated: 2026-05-14T06:34:47.601Z
|
||||
Generated: 2026-05-18T13:16:59.867Z
|
||||
|
||||
Runs per model: 1
|
||||
Recommended threshold: average >= 70, successful runs >= 1, consistency >= 85, hard failures = 0
|
||||
Recommended threshold: average >= 80, successful runs >= 1, consistency >= 85, hard failures = 0
|
||||
|
||||
Provider-infra runs are reported separately and are not counted as model behavior. They still block a Recommended verdict until rerun succeeds.
|
||||
|
||||
|
|
@ -13,25 +13,25 @@ Scoring weights: launchBootstrap=15, directReply=10, peerRelayAB=15, peerRelayBC
|
|||
|
||||
| Model | Verdict | Confidence | Readiness | Consistency | Score Spread | Behavior Avg | Overall Avg | Counted | Pass Runs | Weakest Stage | Weakest TaskRef | Dominant Failure | Blockers | Provider Infra | Runtime Transport | Model Fails | Protocol Runs | p50 | p95 |
|
||||
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
||||
| `opencode/big-pickle` | Recommended | low | 100 | 100 | 0 | 100 | 100 | 1/1 | 1/1 | cleanTranscript 1/1 (100%) | concurrentBob 1/1 (100%) | none | - | 0 | 0 | 0 | 0 | 132968ms | 132968ms |
|
||||
| `opencode/big-pickle` | Tested only | low | 54 | 100 | 0 | 35 | 35 | 1/1 | 0/1 | cleanTranscript 0/1 (0%) | directReply 1/1 (100%) | model-behavior | overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15 | 0 | 0 | 1 | 0 | 133048ms | 133048ms |
|
||||
|
||||
## opencode/big-pickle
|
||||
|
||||
Readiness score: 100.
|
||||
Readiness score: 54.
|
||||
|
||||
Score stability: consistency=100, min=100, max=100, spread=0, stdDev=0, samples=1.
|
||||
Score stability: consistency=100, min=35, max=35, spread=0, stdDev=0, samples=1.
|
||||
|
||||
Recommendation blockers: -.
|
||||
Recommendation blockers: overall average 35 < 80; behavioral average 35 < 80; successful runs 0 < 1; hard failures 1; model-behavior failures 1; highest weighted stage loss concurrentReplies=15.
|
||||
|
||||
Weighted stage impact: -.
|
||||
Weighted stage impact: concurrentReplies:loss=15, failed=1, pass=0/1 (0%); peerRelayAB:loss=15, failed=1, pass=0/1 (0%); peerRelayBC:loss=15, failed=1, pass=0/1 (0%); cleanTranscript:loss=10, failed=1, pass=0/1 (0%); latencyStable:loss=5, failed=1, pass=0/1 (0%).
|
||||
|
||||
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentReplies:1/1 (100%), taskRefs:1/1 (100%), cleanTranscript:1/1 (100%), noDuplicateTokens:1/1 (100%), latencyStable:1/1 (100%).
|
||||
Stage pass rates: launchBootstrap:1/1 (100%), directReply:1/1 (100%), peerRelayAB:0/1 (0%), peerRelayBC:0/1 (0%), concurrentReplies:0/1 (0%), taskRefs:1/1 (100%), cleanTranscript:0/1 (0%), noDuplicateTokens:0/1 (0%), latencyStable:0/1 (0%).
|
||||
|
||||
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:1/1 (100%), peerRelayBC:1/1 (100%), concurrentBob:1/1 (100%), concurrentTom:1/1 (100%).
|
||||
TaskRef pass rates: directReply:1/1 (100%), peerRelayAB:n/a, peerRelayBC:n/a, concurrentBob:n/a, concurrentTom:n/a.
|
||||
|
||||
Protocol totals: badMessages=0, duplicateOrMissingTokens=0, affectedRuns=0.
|
||||
|
||||
| Run | Outcome | Category | Score | Counted | Duration | Failed Stages | Slowest Stage | TaskRefs | Protocol | Diagnostics |
|
||||
| ---: | --- | --- | ---: | --- | ---: | --- | --- | --- | --- | --- |
|
||||
| 1 | passed | none | 100 | yes | 132968ms | - | launchBootstrap:49092ms | directReply:ok, peerRelayAB:ok, peerRelayBC:ok, concurrentBob:ok, concurrentTom:ok | - | runId=5f3d0b1b-17eb-44d6-8b61-644e6f8673c6 |
|
||||
| 1 | behavioral-fail | model-behavior | 35 | yes | 133048ms | peerRelayAB, peerRelayBC, concurrentReplies, cleanTranscript, noDuplicateTokens, latencyStable | launchBootstrap:23735ms | directReply:ok | - | Timed out waiting for OpenCode member bob to become idle. Last durableState: reply_pending |
|
||||
|
||||
|
|
|
|||
|
|
@ -591,7 +591,9 @@ describe('OpenCodePromptDeliveryLedger', () => {
|
|||
).toBe(true);
|
||||
for (const reason of [
|
||||
'opencode_prompt_delivery_session_refresh_scheduled',
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled',
|
||||
'opencode_session_refresh_scheduled_after_resolved_behavior_changed',
|
||||
'OpenCode API error: opencode_session_refresh_scheduled_after_resolved_behavior_changed',
|
||||
'OpenCode session refresh scheduled after resolved behavior changed',
|
||||
'OpenCode session changed; refreshing the session before retry.',
|
||||
]) {
|
||||
|
|
@ -602,6 +604,12 @@ describe('OpenCodePromptDeliveryLedger', () => {
|
|||
})
|
||||
).toBe(true);
|
||||
}
|
||||
expect(
|
||||
isOpenCodeSessionRefreshResponseState({
|
||||
responseState: 'pending',
|
||||
reason: 'OpenCode API erroropencode_prompt_delivery_session_refresh_scheduled',
|
||||
})
|
||||
).toBe(false);
|
||||
expect(
|
||||
isOpenCodeSessionRefreshResponseState({
|
||||
diagnostics: [
|
||||
|
|
|
|||
|
|
@ -131,6 +131,38 @@ describe('OpenCodeRuntimeDeliveryDiagnostics', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('treats generic-prefixed legacy prompt-delivery refresh scheduled diagnostics as session refresh', () => {
|
||||
const record = {
|
||||
diagnostics: [
|
||||
'OpenCode API error',
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled',
|
||||
],
|
||||
lastReason: 'OpenCode API error',
|
||||
responseState: 'not_observed',
|
||||
status: 'retry_scheduled',
|
||||
} as Parameters<typeof selectOpenCodeRuntimeDeliveryReason>[0];
|
||||
|
||||
expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe(
|
||||
'OpenCode session changed; refreshing the session before retry.'
|
||||
);
|
||||
});
|
||||
|
||||
it('treats generic-prefixed resolved-behavior refresh scheduled diagnostics as session refresh', () => {
|
||||
const record = {
|
||||
diagnostics: [
|
||||
'OpenCode API error',
|
||||
'OpenCode API error. opencode_session_refresh_scheduled_after_resolved_behavior_changed',
|
||||
],
|
||||
lastReason: 'OpenCode API error',
|
||||
responseState: 'not_observed',
|
||||
status: 'retry_scheduled',
|
||||
} as Parameters<typeof selectOpenCodeRuntimeDeliveryReason>[0];
|
||||
|
||||
expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe(
|
||||
'OpenCode session changed; refreshing the session before retry.'
|
||||
);
|
||||
});
|
||||
|
||||
it('treats colon-terminated generic OpenCode API errors plus clean refresh evidence as session refresh', () => {
|
||||
const record = {
|
||||
diagnostics: ['OpenCode API error:', 'resolved_behavior_changed:old->new'],
|
||||
|
|
@ -144,6 +176,19 @@ describe('OpenCodeRuntimeDeliveryDiagnostics', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('does not treat generic-prefixed refresh scheduled diagnostics with failure details as session refresh', () => {
|
||||
const reason =
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled permission denied';
|
||||
const record = {
|
||||
diagnostics: ['OpenCode API error', reason],
|
||||
lastReason: 'OpenCode API error',
|
||||
responseState: 'reconcile_failed',
|
||||
status: 'failed_retryable',
|
||||
} as Parameters<typeof selectOpenCodeRuntimeDeliveryReason>[0];
|
||||
|
||||
expect(selectOpenCodeRuntimeDeliveryReason(record)).toBe(reason);
|
||||
});
|
||||
|
||||
it('keeps real failure diagnostics above generic OpenCode API error plus refresh evidence', () => {
|
||||
const record = {
|
||||
diagnostics: ['OpenCode API error', 'resolved_behavior_changed:old->new', 'permission denied'],
|
||||
|
|
|
|||
|
|
@ -208,6 +208,26 @@ describe('RuntimeDiagnosticClassifier', () => {
|
|||
generic: true,
|
||||
actionRequired: false,
|
||||
});
|
||||
expect(
|
||||
classifyRuntimeDiagnostic(
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled'
|
||||
)
|
||||
).toMatchObject({
|
||||
reasonCode: 'backend_error',
|
||||
normalizedMessage: 'OpenCode session changed; refreshing the session before retry.',
|
||||
generic: true,
|
||||
actionRequired: false,
|
||||
});
|
||||
expect(
|
||||
classifyRuntimeDiagnostic(
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.'
|
||||
)
|
||||
).toMatchObject({
|
||||
reasonCode: 'backend_error',
|
||||
normalizedMessage: 'OpenCode session changed; refreshing the session before retry.',
|
||||
generic: true,
|
||||
actionRequired: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('does not classify refresh markers with unknown extra text as clean refresh', () => {
|
||||
|
|
|
|||
|
|
@ -981,6 +981,8 @@ describe('memberHelpers spawn-aware presence', () => {
|
|||
'OpenCode session is stale (resolved_behavior_changed:old->new); reading historical messages for log projection only',
|
||||
'opencode_app_mcp_transport_changed:old->new',
|
||||
'opencode_prompt_delivery_session_refresh_scheduled',
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled',
|
||||
'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.',
|
||||
'OpenCode session refresh scheduled after resolved behavior changed',
|
||||
'opencode_session_refresh_scheduled_after_resolved_behavior_changed',
|
||||
])('renders recoverable OpenCode session refresh advisory %s as a warning', (message) => {
|
||||
|
|
|
|||
|
|
@ -278,7 +278,7 @@ describe('member launch diagnostics', () => {
|
|||
kind: 'api_error',
|
||||
observedAt: '2026-05-18T08:31:46.075Z',
|
||||
reasonCode: 'backend_error',
|
||||
message: 'opencode_prompt_delivery_session_refresh_scheduled',
|
||||
message: 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled.',
|
||||
},
|
||||
});
|
||||
|
||||
|
|
@ -312,6 +312,25 @@ describe('member launch diagnostics', () => {
|
|||
expect(hasMemberLaunchDiagnosticsError(payload)).toBe(false);
|
||||
});
|
||||
|
||||
it('treats member card errors from runtime advisory as diagnostics errors', () => {
|
||||
const payload = buildMemberLaunchDiagnosticsPayload({
|
||||
memberName: 'tom',
|
||||
member: { name: 'tom', providerId: 'opencode' },
|
||||
runtimeAdvisoryLabel: 'OpenCode API error',
|
||||
runtimeAdvisoryTitle: 'OpenCode API error',
|
||||
runtimeAdvisory: {
|
||||
kind: 'api_error',
|
||||
observedAt: '2026-05-18T08:31:46.075Z',
|
||||
reasonCode: 'backend_error',
|
||||
message: 'OpenCode API error. opencode_prompt_delivery_session_refresh_scheduled permission denied',
|
||||
},
|
||||
});
|
||||
|
||||
expect(payload.memberCardError).toBe('OpenCode API error');
|
||||
expect(hasMemberLaunchDiagnosticsError(payload)).toBe(true);
|
||||
expect(getMemberLaunchDiagnosticsErrorMessage(payload)).toBe('OpenCode API error');
|
||||
});
|
||||
|
||||
it('does not treat OpenCode response-state names inside refresh markers as card errors', () => {
|
||||
const payload = buildMemberLaunchDiagnosticsPayload({
|
||||
memberName: 'tom',
|
||||
|
|
|
|||
Loading…
Reference in a new issue