444 lines
14 KiB
TypeScript
444 lines
14 KiB
TypeScript
import { promises as fs } from 'fs';
|
|
import * as os from 'os';
|
|
import * as path from 'path';
|
|
|
|
import { describe, expect, it } from 'vitest';
|
|
|
|
import { setClaudeBasePathOverride } from '../../../../src/main/utils/pathDecoder';
|
|
import {
|
|
buildOpenCodeScenarioTeamRequest,
|
|
loadOpenCodeSemanticScenario,
|
|
materializeOpenCodeScenarioProject,
|
|
materializeOpenCodeScenarioTasks,
|
|
parseOpenCodeE2EModelList,
|
|
taskRefForScenario,
|
|
type OpenCodeSemanticScenario,
|
|
} from './openCodeSemanticScenarioHarness';
|
|
import {
|
|
createOpenCodeLiveHarness,
|
|
getRuntimeTranscript,
|
|
waitForOpenCodeMemberIdle,
|
|
type InboxMessage,
|
|
waitForMemberInboxMessage,
|
|
waitForOpenCodeLanesStopped,
|
|
waitForOpenCodePeerRelay,
|
|
waitForUserInboxReply,
|
|
} from './openCodeLiveTestHarness';
|
|
|
|
import type { TaskRef, TeamProvisioningProgress } from '../../../../src/shared/types';
|
|
|
|
/**
 * Suite registrar for this live end-to-end test.
 *
 * Resolves to vitest's `describe` only when both `OPENCODE_E2E` and
 * `OPENCODE_E2E_SEMANTIC_MODEL_MATRIX` are exactly `'1'`; in every other case the
 * suite is registered through `describe.skip`.
 */
const liveDescribe =
  process.env.OPENCODE_E2E === '1' && process.env.OPENCODE_E2E_SEMANTIC_MODEL_MATRIX === '1'
    ? describe
    : describe.skip;
|
|
|
|
/** JSON document written by `writeModelMatrixReport` after the matrix has run. */
interface ModelMatrixReport {
  /** ISO-8601 timestamp (produced with `Date#toISOString`) of when the report was assembled. */
  generatedAt: string;
  /** One entry per model, in the order the models were executed. */
  models: ModelResult[];
}
|
|
|
|
/** Outcome of running the semantic scenario against a single model. */
interface ModelResult {
  /** Model identifier exactly as it was passed to `runModelScenario`. */
  model: string;
  /** `true` only when the scenario completed without throwing and `score` is 100. */
  passed: boolean;
  /** Weighted sum of the reached stages as computed by `scoreModel` (0-100). */
  score: number;
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
  /** Per-stage flags; every flag starts `false` and is flipped as the run progresses. */
  stages: {
    /** Launch reported ready and every scenario member was alive in the runtime snapshot. */
    launchBootstrap: boolean;
    /** The directly addressed member produced a reply that satisfied the reply contract. */
    directReply: boolean;
    /** The peer message was relayed and the recipient replied to the user. */
    peerRelay: boolean;
    /** The direct reply carried the expected task reference. */
    taskRefs: boolean;
    /** The joined team prompt is longer than 1,500 characters. */
    longPrompt: boolean;
    /** Launch plus snapshot verification finished in under 240 seconds. */
    latencyStable: boolean;
  };
  /** Free-form notes collected during the run (run id, delivery summaries, error messages). */
  diagnostics: string[];
}
|
|
|
|
// Live suite: runs the semantic scenario once per configured model, writes a JSON
// report, and fails if any model did not pass.
liveDescribe('OpenCode semantic model matrix live e2e', () => {
  it(
    'launches realistic OpenCode teams and scores model behavior sequentially',
    async () => {
      const scenario = await loadOpenCodeSemanticScenario();
      const models = parseOpenCodeE2EModelList();
      const results: ModelResult[] = [];

      // Deliberately sequential: each run installs and later clears the base-path
      // override via setClaudeBasePathOverride (presumably process-wide — confirm),
      // so runs must not overlap.
      for (const model of models) {
        results.push(await runModelScenario({ scenario, model }));
      }

      // Persist the report before asserting so it exists even when the test fails.
      await writeModelMatrixReport({
        generatedAt: new Date().toISOString(),
        models: results,
      });

      // The full result set is used as the assertion message to make failures self-describing.
      const failures = results.filter((result) => !result.passed);
      expect(failures, JSON.stringify(results, null, 2)).toEqual([]);
    },
    // Timeout budget: 7 minutes per model, never less than 7 minutes in total.
    Math.max(420_000, parseOpenCodeE2EModelList().length * 420_000)
  );
});
|
|
|
|
/**
 * Runs the full semantic scenario against one model and scores the outcome.
 *
 * The run creates an isolated temp directory, launches a team through the live
 * harness, verifies that every member is alive, performs one direct delivery and one
 * peer delivery, and records which stages were reached. Failures inside the run are
 * caught and reported through the returned result instead of being thrown; the team,
 * the harness, the base-path override and (unless kept) the temp directory are always
 * cleaned up.
 *
 * Set `OPENCODE_E2E_KEEP_FAILED=1` to keep the temp directory of a failed run; its path
 * is then added to the diagnostics.
 *
 * @param input.scenario Scenario definition driving team layout, prompts and expectations.
 * @param input.model Model identifier to launch the team with.
 * @returns The scored result for this model; `passed` is true only for a score of 100.
 */
async function runModelScenario(input: {
  scenario: OpenCodeSemanticScenario;
  model: string;
}): Promise<ModelResult> {
  const startedAt = Date.now();
  const stages: ModelResult['stages'] = {
    launchBootstrap: false,
    directReply: false,
    peerRelay: false,
    taskRefs: false,
    longPrompt: false,
    latencyStable: false,
  };
  const diagnostics: string[] = [];
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'opencode-semantic-model-matrix-'));
  const tempClaudeRoot = path.join(tempDir, '.claude');
  const projectPath = path.join(tempDir, 'project');
  const teamName = `${input.scenario.teamNamePrefix}-${sanitizeModelForTeamName(input.model)}-${Date.now()}`;
  let harness: Awaited<ReturnType<typeof createOpenCodeLiveHarness>> | null = null;
  let keepTempDir = false;

  try {
    // --- Environment setup -------------------------------------------------------
    await fs.mkdir(tempClaudeRoot, { recursive: true });
    await fs.mkdir(projectPath, { recursive: true });
    setClaudeBasePathOverride(tempClaudeRoot);
    await materializeOpenCodeScenarioProject(input.scenario, projectPath);
    harness = await createOpenCodeLiveHarness({
      tempDir,
      selectedModel: input.model,
    });

    // --- Team launch -------------------------------------------------------------
    const progressEvents: TeamProvisioningProgress[] = [];
    const createStartedAt = Date.now();
    const { runId } = await harness.svc.createTeam(
      buildOpenCodeScenarioTeamRequest({
        scenario: input.scenario,
        teamName,
        projectPath,
        model: harness.selectedModel,
      }),
      (progress) => progressEvents.push(progress)
    );
    diagnostics.push(`runId=${runId}`);
    await materializeOpenCodeScenarioTasks({ scenario: input.scenario, teamName, projectPath });

    // The launch counts as ready only if a progress event said so explicitly.
    const progressDump = formatProgressDump(progressEvents);
    if (!progressEvents.some((progress) => progress.message.includes('OpenCode team launch is ready'))) {
      throw new Error(`OpenCode launch did not reach ready state.\n${progressDump}`);
    }
    const runtimeSnapshot = await harness.svc.getTeamAgentRuntimeSnapshot(teamName);
    for (const member of input.scenario.members) {
      const snapshot = runtimeSnapshot.members[member.name];
      if (!snapshot?.alive) {
        throw new Error(
          `OpenCode member ${member.name} is not alive. Snapshot: ${JSON.stringify(
            runtimeSnapshot,
            null,
            2
          )}`
        );
      }
      // A differing runtime model is recorded as a diagnostic, not treated as a failure.
      if (snapshot.runtimeModel !== harness.selectedModel) {
        diagnostics.push(
          `${member.name} runtime model ${snapshot.runtimeModel ?? 'unknown'} differs from ${harness.selectedModel}`
        );
      }
    }
    stages.launchBootstrap = true;
    stages.longPrompt = input.scenario.teamPromptLines.join('\n').length > 1_500;
    // Measured from just before createTeam until the snapshot checks have finished.
    stages.latencyStable = Date.now() - createStartedAt < 240_000;

    // --- Direct delivery: message one member, expect a reply to the user ------------
    const directTaskRef = taskRefForScenario(
      input.scenario,
      input.scenario.directDelivery.taskIndex,
      teamName
    );
    const directDelivery = await harness.svc.deliverOpenCodeMemberMessage(teamName, {
      memberName: input.scenario.directDelivery.memberName,
      messageId: `ui-direct-${Date.now()}`,
      replyRecipient: input.scenario.directDelivery.replyRecipient,
      actionMode: input.scenario.directDelivery.actionMode,
      taskRefs: [directTaskRef],
      source: 'manual',
      text: input.scenario.directDelivery.textLines.join('\n'),
    });
    diagnostics.push(`directDelivery=${formatDeliveryDiagnostic(directDelivery)}`);
    if (!directDelivery.delivered) {
      throw new Error(`Direct OpenCode delivery failed: ${JSON.stringify(directDelivery, null, 2)}`);
    }
    const directReply = await waitForReplyWithTranscript({
      bridgeClient: harness.bridgeClient,
      teamName,
      memberName: input.scenario.directDelivery.memberName,
      projectPath,
      expectedToken: input.scenario.directDelivery.expectedReplyToken,
      timeoutMs: 180_000,
    });
    assertVisibleReplyContract(directReply, {
      expectedFrom: input.scenario.directDelivery.memberName,
      expectedTo: 'user',
      expectedTaskRef: directTaskRef,
    });
    stages.directReply = true;
    stages.taskRefs = hasTaskRef(directReply, directTaskRef);
    // Wait for the member to go idle before starting the peer stage.
    await waitForOpenCodeMemberIdle({
      bridgeClient: harness.bridgeClient,
      teamName,
      memberName: input.scenario.directDelivery.memberName,
      projectPath,
      timeoutMs: 90_000,
    });

    // --- Peer delivery: sender messages a teammate, who then replies to the user ----
    const peerTaskRef = taskRefForScenario(
      input.scenario,
      input.scenario.peerDelivery.taskIndex,
      teamName
    );
    const peerDelivery = await harness.svc.deliverOpenCodeMemberMessage(teamName, {
      memberName: input.scenario.peerDelivery.senderName,
      messageId: `ui-peer-${Date.now()}`,
      replyRecipient: input.scenario.peerDelivery.replyRecipient,
      actionMode: input.scenario.peerDelivery.actionMode,
      taskRefs: [peerTaskRef],
      source: 'manual',
      text: input.scenario.peerDelivery.textLines.join('\n'),
    });
    diagnostics.push(`peerDelivery=${formatDeliveryDiagnostic(peerDelivery)}`);
    if (!peerDelivery.delivered) {
      throw new Error(`Peer OpenCode delivery failed: ${JSON.stringify(peerDelivery, null, 2)}`);
    }
    // A delivery that was rejected or queued behind another message fails the stage.
    if (peerDelivery.accepted === false || peerDelivery.queuedBehindMessageId) {
      throw new Error(
        `Peer OpenCode delivery was not accepted immediately: ${JSON.stringify(
          peerDelivery,
          null,
          2
        )}`
      );
    }

    let peerMessage: Awaited<ReturnType<typeof waitForMemberInboxMessage>>;
    try {
      peerMessage = await waitForMemberInboxMessage(
        teamName,
        input.scenario.peerDelivery.recipientName,
        input.scenario.peerDelivery.senderName,
        input.scenario.peerDelivery.peerToken,
        180_000
      );
    } catch (error) {
      const waitMessage = error instanceof Error ? error.message : String(error);
      // Fetching the transcript is best-effort: if it fails too, keep the original
      // wait error and note why the transcript is missing instead of replacing it.
      let transcriptDump: string;
      try {
        const transcript = await getRuntimeTranscript({
          bridgeClient: harness.bridgeClient,
          teamName,
          memberName: input.scenario.peerDelivery.senderName,
          projectPath,
        });
        transcriptDump = JSON.stringify(transcript, null, 2);
      } catch (transcriptError) {
        transcriptDump = `<unavailable: ${
          transcriptError instanceof Error ? transcriptError.message : String(transcriptError)
        }>`;
      }
      throw new Error(`${waitMessage}\nSender transcript: ${transcriptDump}`);
    }
    assertVisibleReplyContract(peerMessage, {
      expectedFrom: input.scenario.peerDelivery.senderName,
      expectedTo: input.scenario.peerDelivery.recipientName,
      expectedTaskRef: peerTaskRef,
    });
    await waitForOpenCodePeerRelay(
      harness.svc,
      teamName,
      input.scenario.peerDelivery.recipientName,
      peerMessage.messageId,
      180_000
    );
    const peerReply = await waitForReplyWithTranscript({
      bridgeClient: harness.bridgeClient,
      teamName,
      memberName: input.scenario.peerDelivery.recipientName,
      projectPath,
      expectedToken: input.scenario.peerDelivery.expectedReplyToken,
      timeoutMs: 180_000,
    });
    // No task reference is required on the recipient's reply to the user.
    assertVisibleReplyContract(peerReply, {
      expectedFrom: input.scenario.peerDelivery.recipientName,
      expectedTo: 'user',
    });
    stages.peerRelay = true;

    const score = scoreModel(stages);
    return {
      model: input.model,
      passed: score === 100,
      score,
      durationMs: Date.now() - startedAt,
      stages,
      diagnostics,
    };
  } catch (error) {
    // Any failure becomes a non-passing result scored on the stages reached so far.
    if (process.env.OPENCODE_E2E_KEEP_FAILED === '1') {
      keepTempDir = true;
      diagnostics.push(`tempDir=${tempDir}`);
    }
    diagnostics.push(error instanceof Error ? error.message : String(error));
    return {
      model: input.model,
      passed: false,
      score: scoreModel(stages),
      durationMs: Date.now() - startedAt,
      stages,
      diagnostics,
    };
  } finally {
    // Best-effort teardown: cleanup errors are intentionally ignored so they cannot
    // replace the result computed above.
    if (harness) {
      await harness.svc.stopTeam(teamName).catch(() => undefined);
      await harness.dispose().catch(() => undefined);
      await waitForOpenCodeLanesStopped(teamName).catch(() => undefined);
    }
    setClaudeBasePathOverride(null);
    if (!keepTempDir) {
      await fs.rm(tempDir, { recursive: true, force: true }).catch(() => undefined);
    }
  }
}
|
|
|
|
/**
 * Waits for a member's reply in the user inbox and, on failure, enriches the error
 * with that member's runtime transcript.
 *
 * @param input.bridgeClient Client handed through to `getRuntimeTranscript`.
 * @param input.teamName Team whose user inbox is watched.
 * @param input.memberName Member expected to reply; also the transcript that is dumped.
 * @param input.projectPath Project path handed through to `getRuntimeTranscript`.
 * @param input.expectedToken Token passed to `waitForUserInboxReply` to identify the reply.
 * @param input.timeoutMs Maximum wait passed to `waitForUserInboxReply`.
 * @returns The inbox message returned by `waitForUserInboxReply`.
 * @throws Error carrying the original failure message followed by the transcript dump,
 *   or by an "unavailable" note when the transcript itself could not be fetched.
 */
async function waitForReplyWithTranscript(input: {
  bridgeClient: Parameters<typeof getRuntimeTranscript>[0]['bridgeClient'];
  teamName: string;
  memberName: string;
  projectPath: string;
  expectedToken: string;
  timeoutMs: number;
}): Promise<InboxMessage> {
  try {
    return await waitForUserInboxReply(
      input.teamName,
      input.memberName,
      input.expectedToken,
      input.timeoutMs
    );
  } catch (error) {
    const waitMessage = error instanceof Error ? error.message : String(error);
    // Fetching the transcript is best-effort: if it fails too, keep the original wait
    // error and note why the transcript is missing instead of replacing it.
    let transcriptDump: string;
    try {
      const transcript = await getRuntimeTranscript({
        bridgeClient: input.bridgeClient,
        teamName: input.teamName,
        memberName: input.memberName,
        projectPath: input.projectPath,
      });
      transcriptDump = JSON.stringify(transcript, null, 2);
    } catch (transcriptError) {
      transcriptDump = `<unavailable: ${
        transcriptError instanceof Error ? transcriptError.message : String(transcriptError)
      }>`;
    }
    throw new Error(`${waitMessage}\nTranscript: ${transcriptDump}`);
  }
}
|
|
|
|
/**
 * Asserts that an inbox message is an acceptable user-visible reply.
 *
 * Checks the sender and recipient, rejects text containing `SendMessage`,
 * `runtime_deliver_message` or `#00000000` (presumably leaked tool names and a
 * placeholder task id — confirm), rejects a reply whose trimmed text is exactly the
 * Russian word for "Understood", and optionally requires a matching task reference.
 *
 * @param message Inbox message under test.
 * @param input.expectedFrom Required value of `message.from`.
 * @param input.expectedTo Required value of `message.to`.
 * @param input.expectedTaskRef When given, the message must carry this task reference.
 */
function assertVisibleReplyContract(
  message: InboxMessage,
  input: {
    expectedFrom: string;
    expectedTo: string;
    expectedTaskRef?: TaskRef;
  }
): void {
  expect(message).toMatchObject({
    from: input.expectedFrom,
    to: input.expectedTo,
  });

  // A missing text is treated as empty so the content checks below still run.
  const text = message.text ?? '';
  expect(text).not.toContain('SendMessage');
  expect(text).not.toContain('runtime_deliver_message');
  expect(text).not.toContain('#00000000');
  // The escaped literal spells the Cyrillic word "Ponyal" ("Understood").
  expect(text.trim()).not.toBe('\u041f\u043e\u043d\u044f\u043b');

  if (input.expectedTaskRef) {
    expect(hasTaskRef(message, input.expectedTaskRef)).toBe(true);
  }
}
|
|
|
|
/**
 * Reports whether a message carries a given task reference.
 *
 * A reference matches when `teamName`, `taskId` and `displayId` are all strictly equal.
 *
 * @param message Message whose optional `taskRefs` list is searched.
 * @param expected Task reference to look for.
 * @returns `true` if a matching reference exists; `false` otherwise, including when
 *   the message has no `taskRefs` at all.
 */
function hasTaskRef(message: InboxMessage, expected: TaskRef): boolean {
  return Boolean(
    message.taskRefs?.some(
      (taskRef) =>
        taskRef.teamName === expected.teamName &&
        taskRef.taskId === expected.taskId &&
        taskRef.displayId === expected.displayId
    )
  );
}
|
|
|
|
/**
 * Converts the reached stages into a 0-100 score.
 *
 * Weights: launchBootstrap 25, directReply 25, peerRelay 20, taskRefs 15,
 * longPrompt 10, latencyStable 5.
 *
 * @param stages Stage flags collected during a model run.
 * @returns Sum of the weights of all stages that are `true`.
 */
function scoreModel(stages: ModelResult['stages']): number {
  return (
    (stages.launchBootstrap ? 25 : 0) +
    (stages.directReply ? 25 : 0) +
    (stages.peerRelay ? 20 : 0) +
    (stages.taskRefs ? 15 : 0) +
    (stages.longPrompt ? 10 : 0) +
    (stages.latencyStable ? 5 : 0)
  );
}
|
|
|
|
/**
 * Serializes the diagnostic-relevant fields of a delivery result to compact JSON.
 *
 * Only the listed fields are copied; anything else on the object is dropped. When
 * `diagnostics` is an array it is capped at its first five entries; any other value is
 * passed through unchanged. Fields that are `undefined` are omitted by `JSON.stringify`.
 *
 * @param delivery Delivery result (or any object exposing a subset of these fields).
 * @returns Single-line JSON string.
 */
function formatDeliveryDiagnostic(delivery: {
  delivered?: unknown;
  accepted?: unknown;
  responsePending?: unknown;
  responseState?: unknown;
  ledgerStatus?: unknown;
  queuedBehindMessageId?: unknown;
  reason?: unknown;
  visibleReplyMessageId?: unknown;
  visibleReplyCorrelation?: unknown;
  diagnostics?: unknown;
}): string {
  return JSON.stringify({
    delivered: delivery.delivered,
    accepted: delivery.accepted,
    responsePending: delivery.responsePending,
    responseState: delivery.responseState,
    ledgerStatus: delivery.ledgerStatus,
    queuedBehindMessageId: delivery.queuedBehindMessageId,
    reason: delivery.reason,
    visibleReplyMessageId: delivery.visibleReplyMessageId,
    visibleReplyCorrelation: delivery.visibleReplyCorrelation,
    diagnostics: Array.isArray(delivery.diagnostics)
      ? delivery.diagnostics.slice(0, 5)
      : delivery.diagnostics,
  });
}
|
|
|
|
/**
 * Writes the matrix report as pretty-printed JSON to a timestamped file.
 *
 * The target directory is `OPENCODE_E2E_REPORT_DIR` (trimmed and resolved to an
 * absolute path) when that variable is non-blank, otherwise
 * `<cwd>/test-results/opencode-semantic-model-matrix`. The directory is created if
 * needed and the file is named `report-<epoch-ms>.json`.
 *
 * @param report Report to persist.
 */
async function writeModelMatrixReport(report: ModelMatrixReport): Promise<void> {
  // Read and trim the variable once; a blank value falls back to the default directory.
  const configuredDir = process.env.OPENCODE_E2E_REPORT_DIR?.trim();
  const outputDir = configuredDir
    ? path.resolve(configuredDir)
    : path.join(process.cwd(), 'test-results', 'opencode-semantic-model-matrix');
  await fs.mkdir(outputDir, { recursive: true });
  await fs.writeFile(
    path.join(outputDir, `report-${Date.now()}.json`),
    `${JSON.stringify(report, null, 2)}\n`,
    'utf8'
  );
}
|
|
|
|
/**
 * Renders provisioning progress events as a multi-line, human-readable dump.
 *
 * Each event becomes one line made of its `state`, `message`, `messageSeverity`,
 * `error` and `cliLogsTail` values joined with `' | '`; falsy values are left out.
 *
 * @param progressEvents Events in the order they were received.
 * @returns Newline-separated dump; an empty string for an empty list.
 */
function formatProgressDump(progressEvents: TeamProvisioningProgress[]): string {
  return progressEvents
    .map((progress) =>
      [
        progress.state,
        progress.message,
        progress.messageSeverity,
        progress.error,
        progress.cliLogsTail,
      ]
        .filter(Boolean)
        .join(' | ')
    )
    .join('\n');
}
|
|
|
|
/**
 * Turns a model identifier into a slug that is safe to embed in a team name.
 *
 * The identifier is lower-cased, every run of characters outside `a-z0-9` is
 * collapsed into a single `-`, leading and trailing dashes are removed, and the
 * result is limited to 48 characters.
 *
 * @param model Raw model identifier, e.g. `provider/model-name`.
 * @returns The slug; never starts or ends with `-`. May be empty when the input
 *   contains no alphanumeric characters.
 */
function sanitizeModelForTeamName(model: string): string {
  const truncated = model
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 48);
  // The 48-character cut can land right after a separator; trim again so the slug
  // does not end in '-'.
  return truncated.replace(/-+$/, '');
}
|