@EricGustin, you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
>
> Fixes provider selection UX/bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
1389 lines
48 KiB
Python
1389 lines
48 KiB
Python
"""Tests for capture mode formatters."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import TYPE_CHECKING
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
from arcade_cli.formatters import (
|
|
CAPTURE_FORMATTERS,
|
|
CaptureHtmlFormatter,
|
|
CaptureJsonFormatter,
|
|
CaptureMarkdownFormatter,
|
|
CaptureTextFormatter,
|
|
get_capture_formatter,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from arcade_evals import CaptureResult
|
|
|
|
|
|
def _create_mock_capture_result(
    suite_name: str = "TestSuite",
    model: str = "gpt-4o",
    provider: str = "openai",
    cases: list[dict] | None = None,
) -> CaptureResult:
    """Create a mock CaptureResult for testing.

    Args:
        suite_name: Value assigned to the mock's ``suite_name`` attribute.
        model: Value assigned to the mock's ``model`` attribute.
        provider: Value assigned to the mock's ``provider`` attribute.
        cases: Case specifications. Each dict must provide ``case_name`` and
            ``user_message`` and may provide ``tool_calls``,
            ``system_message``, ``additional_messages``, ``track_name`` and
            ``runs`` (a list of ``{"tool_calls": [...]}`` dicts). When
            ``None``, a single default weather case is used.

    Returns:
        A ``MagicMock`` carrying ``suite_name``, ``model``, ``provider``,
        ``captured_cases`` and a working ``to_dict(include_context=False)``.
    """
    if cases is None:
        cases = [
            {
                "case_name": "test_case_1",
                "user_message": "What's the weather?",
                "tool_calls": [
                    {"name": "GetWeather", "args": {"city": "NYC", "units": "celsius"}},
                ],
                "system_message": "You are helpful",
                "additional_messages": [{"role": "user", "content": "Hi"}],
            }
        ]

    def _build_tool_calls(specs: list[dict]) -> list:
        """Turn ``{"name": ..., "args": ...}`` dicts into tool-call mocks."""
        mocks = []
        for tc_data in specs:
            tc = MagicMock()
            # Assign after construction: MagicMock(name=...) would name the
            # mock itself instead of setting a ``name`` attribute.
            tc.name = tc_data["name"]
            tc.args = tc_data.get("args", {})
            mocks.append(tc)
        return mocks

    # Create mock capture result
    capture = MagicMock()
    capture.suite_name = suite_name
    capture.model = model
    capture.provider = provider

    # Create mock captured cases
    captured_cases = []
    for case_data in cases:
        case = MagicMock()
        case.case_name = case_data["case_name"]
        case.user_message = case_data["user_message"]
        case.system_message = case_data.get("system_message")
        case.additional_messages = case_data.get("additional_messages", [])
        # Explicitly set track_name to None unless specified (avoids MagicMock)
        case.track_name = case_data.get("track_name")

        # Create mock runs if provided
        runs = []
        for run_data in case_data.get("runs", []):
            run = MagicMock()
            run.tool_calls = _build_tool_calls(run_data.get("tool_calls", []))
            runs.append(run)
        case.runs = runs

        # Create mock tool calls
        case.tool_calls = _build_tool_calls(case_data.get("tool_calls", []))

        captured_cases.append(case)

    capture.captured_cases = captured_cases

    # Mock to_dict method
    def to_dict(include_context: bool = False) -> dict:
        result = {
            "suite_name": capture.suite_name,
            "model": capture.model,
            "provider": capture.provider,
            "captured_cases": [],
        }
        for case in captured_cases:
            case_dict = {
                "case_name": case.case_name,
                "user_message": case.user_message,
                "tool_calls": [{"name": tc.name, "args": tc.args} for tc in case.tool_calls],
            }
            # "runs" is only emitted for cases that actually have runs.
            if case.runs:
                case_dict["runs"] = [
                    {"tool_calls": [{"name": tc.name, "args": tc.args} for tc in run.tool_calls]}
                    for run in case.runs
                ]
            if include_context:
                case_dict["system_message"] = case.system_message
                case_dict["additional_messages"] = case.additional_messages
            result["captured_cases"].append(case_dict)
        return result

    capture.to_dict = to_dict

    return capture
|
|
|
|
|
|
class TestGetCaptureFormatter:
    """Tests for get_capture_formatter function."""

    def test_get_json_formatter(self) -> None:
        """Test getting JSON formatter."""
        formatter = get_capture_formatter("json")
        assert isinstance(formatter, CaptureJsonFormatter)

    def test_get_txt_formatter(self) -> None:
        """Test getting text formatter."""
        formatter = get_capture_formatter("txt")
        assert isinstance(formatter, CaptureTextFormatter)

    def test_get_md_formatter(self) -> None:
        """Test getting markdown formatter."""
        formatter = get_capture_formatter("md")
        assert isinstance(formatter, CaptureMarkdownFormatter)

    def test_get_html_formatter(self) -> None:
        """Test getting HTML formatter."""
        formatter = get_capture_formatter("html")
        assert isinstance(formatter, CaptureHtmlFormatter)

    def test_case_insensitive(self) -> None:
        """Test that format names are case insensitive."""
        assert isinstance(get_capture_formatter("JSON"), CaptureJsonFormatter)
        assert isinstance(get_capture_formatter("TXT"), CaptureTextFormatter)
        assert isinstance(get_capture_formatter("MD"), CaptureMarkdownFormatter)
        assert isinstance(get_capture_formatter("HTML"), CaptureHtmlFormatter)

    def test_unsupported_format_raises(self) -> None:
        """Test that unsupported formats raise ValueError."""
        with pytest.raises(ValueError, match="Unsupported capture format 'xlsx'"):
            get_capture_formatter("xlsx")

    def test_close_match_suggestion(self) -> None:
        """Test that close matches are suggested."""
        with pytest.raises(ValueError, match="Did you mean 'json'"):
            get_capture_formatter("jsn")
|
|
|
|
|
|
class TestCaptureJsonFormatter:
    """Tests for CaptureJsonFormatter."""

    def test_file_extension(self) -> None:
        """Test file extension is json."""
        formatter = CaptureJsonFormatter()
        assert formatter.file_extension == "json"

    def test_format_basic(self) -> None:
        """Test basic JSON formatting."""
        formatter = CaptureJsonFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])
        parsed = json.loads(output)

        assert "captures" in parsed
        assert len(parsed["captures"]) == 1
        assert parsed["captures"][0]["suite_name"] == "TestSuite"
        assert parsed["captures"][0]["model"] == "gpt-4o"

    def test_format_includes_tool_calls(self) -> None:
        """Test that tool calls are included."""
        formatter = CaptureJsonFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])
        parsed = json.loads(output)

        case = parsed["captures"][0]["captured_cases"][0]
        assert len(case["tool_calls"]) == 1
        assert case["tool_calls"][0]["name"] == "GetWeather"
        assert case["tool_calls"][0]["args"]["city"] == "NYC"

    def test_format_includes_runs(self) -> None:
        """Test that runs are included when present."""
        formatter = CaptureJsonFormatter()
        capture = _create_mock_capture_result(
            cases=[
                {
                    "case_name": "multi_run_case",
                    "user_message": "Hello",
                    "tool_calls": [],
                    "runs": [
                        {"tool_calls": [{"name": "A", "args": {"x": 1}}]},
                        {"tool_calls": [{"name": "B", "args": {"x": 2}}]},
                    ],
                }
            ]
        )

        output = formatter.format([capture])
        parsed = json.loads(output)
        runs = parsed["captures"][0]["captured_cases"][0]["runs"]
        assert len(runs) == 2
        assert runs[0]["tool_calls"][0]["name"] == "A"

    def test_format_with_context(self) -> None:
        """Test formatting with context included."""
        formatter = CaptureJsonFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture], include_context=True)
        parsed = json.loads(output)

        case = parsed["captures"][0]["captured_cases"][0]
        assert "system_message" in case
        assert case["system_message"] == "You are helpful"

    def test_format_without_context(self) -> None:
        """Test formatting without context (default)."""
        formatter = CaptureJsonFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture], include_context=False)
        parsed = json.loads(output)

        case = parsed["captures"][0]["captured_cases"][0]
        assert "system_message" not in case
|
|
|
|
|
|
class TestCaptureTextFormatter:
    """Tests for CaptureTextFormatter."""

    def test_file_extension(self) -> None:
        """Test file extension is txt."""
        formatter = CaptureTextFormatter()
        assert formatter.file_extension == "txt"

    def test_format_contains_suite_info(self) -> None:
        """Test that suite info is in output."""
        formatter = CaptureTextFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "Suite: TestSuite" in output
        assert "Model: gpt-4o" in output
        assert "Provider: openai" in output

    def test_format_contains_case_info(self) -> None:
        """Test that case info is in output."""
        formatter = CaptureTextFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "Case: test_case_1" in output
        assert "User Message: What's the weather?" in output

    def test_format_contains_tool_calls(self) -> None:
        """Test that tool calls are in output."""
        formatter = CaptureTextFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "GetWeather" in output
        assert "city: NYC" in output

    def test_format_contains_summary(self) -> None:
        """Test that summary is in output."""
        formatter = CaptureTextFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "Summary: 1 tool calls across 1 cases" in output

    def test_format_with_context(self) -> None:
        """Test formatting with context."""
        formatter = CaptureTextFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture], include_context=True)

        assert "System Message: You are helpful" in output
|
|
|
|
|
|
class TestCaptureMarkdownFormatter:
    """Tests for CaptureMarkdownFormatter."""

    def test_file_extension(self) -> None:
        """Test file extension is md."""
        formatter = CaptureMarkdownFormatter()
        assert formatter.file_extension == "md"

    def test_format_has_heading(self) -> None:
        """Test that markdown has main heading."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "# Capture Results" in output

    def test_format_has_suite_heading(self) -> None:
        """Test that suite has heading."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "## TestSuite" in output

    def test_format_has_case_heading(self) -> None:
        """Test that case has heading."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "### Case: test_case_1" in output

    def test_format_has_code_blocks(self) -> None:
        """Test that tool args are in code blocks."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "```json" in output
        assert '"city": "NYC"' in output
        assert "```" in output

    def test_format_has_summary(self) -> None:
        """Test that summary is present."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "## Summary" in output
        assert "**Total Cases:** 1" in output
        assert "**Total Tool Calls:** 1" in output

    def test_format_includes_runs(self) -> None:
        """Should include per-run tool calls when runs are present."""
        formatter = CaptureMarkdownFormatter()
        capture = _create_mock_capture_result(
            cases=[
                {
                    "case_name": "multi_run_case",
                    "user_message": "Hello",
                    "tool_calls": [],
                    "runs": [
                        {"tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}]},
                        {"tool_calls": [{"name": "GetWeather", "args": {"city": "SF"}}]},
                    ],
                }
            ]
        )

        output = formatter.format([capture])
        assert "Run 1" in output
        assert "Run 2" in output
        assert "`GetWeather`" in output
|
|
|
|
|
|
class TestCaptureHtmlFormatter:
    """Tests for CaptureHtmlFormatter."""

    def test_file_extension(self) -> None:
        """Test file extension is html."""
        formatter = CaptureHtmlFormatter()
        assert formatter.file_extension == "html"

    def test_format_is_valid_html(self) -> None:
        """Test that output is valid HTML structure."""
        formatter = CaptureHtmlFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "<!DOCTYPE html>" in output
        assert "<html" in output
        assert "</html>" in output
        assert "<head>" in output
        assert "</head>" in output
        assert "<body>" in output
        assert "</body>" in output

    def test_format_contains_styles(self) -> None:
        """Test that CSS styles are included."""
        formatter = CaptureHtmlFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "<style>" in output
        assert "</style>" in output

    def test_format_contains_suite_info(self) -> None:
        """Test that suite info is in output."""
        formatter = CaptureHtmlFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "TestSuite" in output
        assert "gpt-4o" in output

    def test_format_contains_tool_calls(self) -> None:
        """Test that tool calls are in output."""
        formatter = CaptureHtmlFormatter()
        capture = _create_mock_capture_result()

        output = formatter.format([capture])

        assert "GetWeather" in output
        # Args should be HTML-escaped; accept either the entity-encoded or
        # the raw quoted key so the test does not pin the quoting strategy.
        assert "&quot;city&quot;" in output or '"city"' in output

    def test_format_escapes_html(self) -> None:
        """Test that HTML special characters are escaped."""
        formatter = CaptureHtmlFormatter()
        capture = _create_mock_capture_result(
            cases=[
                {
                    "case_name": "Test <script>",
                    "user_message": "Hello & Goodbye",
                    "tool_calls": [],
                }
            ]
        )

        output = formatter.format([capture])

        # Angle brackets should be escaped: look for the entity form, since
        # the raw "<script>" text would mean escaping did NOT happen.
        assert "&lt;script&gt;" in output
        # Ampersand should be escaped
        assert "&amp;" in output
|
|
|
|
|
|
class TestCaptureFormattersRegistry:
    """Tests for the CAPTURE_FORMATTERS registry."""

    def test_all_formats_registered(self) -> None:
        """Test that all expected formats are registered."""
        assert "json" in CAPTURE_FORMATTERS
        assert "txt" in CAPTURE_FORMATTERS
        assert "md" in CAPTURE_FORMATTERS
        assert "html" in CAPTURE_FORMATTERS

    def test_registry_returns_correct_types(self) -> None:
        """Test that registry maps to correct formatter types."""
        assert CAPTURE_FORMATTERS["json"] == CaptureJsonFormatter
        assert CAPTURE_FORMATTERS["txt"] == CaptureTextFormatter
        assert CAPTURE_FORMATTERS["md"] == CaptureMarkdownFormatter
        assert CAPTURE_FORMATTERS["html"] == CaptureHtmlFormatter
|
|
|
|
|
|
class TestCaptureFormatterEdgeCases:
    """Tests for edge cases in capture formatting."""

    def test_empty_captures_list(self) -> None:
        """Test formatting with empty captures list."""
        for formatter in [
            CaptureJsonFormatter(),
            CaptureTextFormatter(),
            CaptureMarkdownFormatter(),
            CaptureHtmlFormatter(),
        ]:
            output = formatter.format([])
            assert output  # Should produce some output

    def test_case_with_no_tool_calls(self) -> None:
        """Test formatting a case with no tool calls."""
        capture = _create_mock_capture_result(
            cases=[
                {
                    "case_name": "empty_case",
                    "user_message": "Hello",
                    "tool_calls": [],
                }
            ]
        )

        for formatter in [
            CaptureJsonFormatter(),
            CaptureTextFormatter(),
            CaptureMarkdownFormatter(),
            CaptureHtmlFormatter(),
        ]:
            output = formatter.format([capture])
            assert output  # Should produce some output

    def test_multiple_captures(self) -> None:
        """Test formatting multiple capture results."""
        capture1 = _create_mock_capture_result(suite_name="Suite1", model="gpt-4o")
        capture2 = _create_mock_capture_result(suite_name="Suite2", model="claude-3")

        for formatter in [
            CaptureJsonFormatter(),
            CaptureTextFormatter(),
            CaptureMarkdownFormatter(),
            CaptureHtmlFormatter(),
        ]:
            output = formatter.format([capture1, capture2])
            assert "Suite1" in output
            assert "Suite2" in output
|
|
|
|
|
|
class TestMultiModelCaptureFormatting:
    """Tests for multi-model capture formatting."""

    def test_markdown_multi_model_detection(self) -> None:
        """Test that markdown formatter detects multi-model and groups by case."""
        # Same suite, same case, different models
        capture1 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4o",
            cases=[
                {
                    "case_name": "shared_case",
                    "user_message": "What's the weather?",
                    "tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}],
                }
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4-turbo",
            cases=[
                {
                    "case_name": "shared_case",
                    "user_message": "What's the weather?",
                    "tool_calls": [{"name": "GetWeather", "args": {"city": "New York"}}],
                }
            ],
        )

        formatter = CaptureMarkdownFormatter()
        output = formatter.format([capture1, capture2])

        # Should detect multi-model and show comparison
        assert "Multi-Model" in output
        assert "gpt-4o" in output
        assert "gpt-4-turbo" in output
        assert "shared_case" in output
        # Should show models comparison table
        assert "| Model |" in output

    def test_markdown_single_model_format(self) -> None:
        """Test that single-model captures use the simple format."""
        capture = _create_mock_capture_result(suite_name="Suite", model="gpt-4o")

        formatter = CaptureMarkdownFormatter()
        output = formatter.format([capture])

        # Should NOT have multi-model header
        assert "Multi-Model" not in output
        # Should have regular header
        assert "# Capture Results" in output

    def test_multi_model_tool_calls_grouped(self) -> None:
        """Test that tool calls are grouped by case in multi-model output."""
        capture1 = _create_mock_capture_result(
            suite_name="Suite",
            model="model-a",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Do something",
                    "tool_calls": [{"name": "ToolA", "args": {"x": 1}}],
                }
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="Suite",
            model="model-b",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Do something",
                    "tool_calls": [{"name": "ToolA", "args": {"x": 2}}],
                }
            ],
        )

        formatter = CaptureMarkdownFormatter()
        output = formatter.format([capture1, capture2])

        # Both models should appear for the same case
        assert "model-a" in output
        assert "model-b" in output
        # Tool details should be in collapsible sections
        assert "<details>" in output
|
|
|
|
|
|
class TestMultiModelHelpers:
    """Tests for multi-model helper functions in base.py."""

    def test_is_multi_model_capture_true(self) -> None:
        """Test detection of multiple models in captures."""
        from arcade_cli.formatters.base import is_multi_model_capture

        capture1 = _create_mock_capture_result(model="gpt-4o")
        capture2 = _create_mock_capture_result(model="gpt-4-turbo")

        assert is_multi_model_capture([capture1, capture2]) is True

    def test_is_multi_model_capture_false(self) -> None:
        """Test single model detection."""
        from arcade_cli.formatters.base import is_multi_model_capture

        capture1 = _create_mock_capture_result(model="gpt-4o")
        capture2 = _create_mock_capture_result(model="gpt-4o")

        assert is_multi_model_capture([capture1, capture2]) is False

    def test_group_captures_by_case(self) -> None:
        """Test grouping captures by case for comparison."""
        from arcade_cli.formatters.base import group_captures_by_case

        capture1 = _create_mock_capture_result(
            suite_name="Suite",
            model="model-a",
            cases=[
                {"case_name": "case1", "user_message": "msg1", "tool_calls": []},
                {"case_name": "case2", "user_message": "msg2", "tool_calls": []},
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="Suite",
            model="model-b",
            cases=[
                {"case_name": "case1", "user_message": "msg1", "tool_calls": []},
            ],
        )

        grouped, model_order = group_captures_by_case([capture1, capture2])

        # Check structure
        assert "Suite" in grouped
        assert "case1" in grouped["Suite"]
        assert "case2" in grouped["Suite"]

        # Check model order
        assert model_order == ["model-a", "model-b"]

        # Check case1 has both models
        assert "model-a" in grouped["Suite"]["case1"]["models"]
        assert "model-b" in grouped["Suite"]["case1"]["models"]

        # Check case2 only has model-a
        assert "model-a" in grouped["Suite"]["case2"]["models"]
        assert "model-b" not in grouped["Suite"]["case2"]["models"]
|
|
|
|
|
|
class TestMultiModelTextCaptureFormatter:
    """Tests for multi-model text capture formatting."""

    def test_text_multi_model_output(self) -> None:
        """Should produce multi-model text output."""
        capture1 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4o",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool1", "args": {}}],
                }
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4-turbo",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool2", "args": {}}],
                }
            ],
        )

        formatter = CaptureTextFormatter()
        output = formatter.format([capture1, capture2])

        # Should have multi-model header
        assert "MULTI-MODEL CAPTURE RESULTS" in output

        # Should list both models
        assert "gpt-4o" in output
        assert "gpt-4-turbo" in output

        # Should show case name
        assert "case1" in output

    def test_text_single_model_regular_format(self) -> None:
        """Should use regular format for single model."""
        capture = _create_mock_capture_result(model="gpt-4o")

        formatter = CaptureTextFormatter()
        output = formatter.format([capture])

        # Should NOT have multi-model header
        assert "MULTI-MODEL CAPTURE RESULTS" not in output
|
|
|
|
|
|
class TestMultiModelHtmlCaptureFormatter:
    """Tests for multi-model HTML capture formatting."""

    def test_html_multi_model_output(self) -> None:
        """Should produce multi-model HTML output."""
        capture1 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4o",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool1", "args": {}}],
                }
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4-turbo",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool2", "args": {}}],
                }
            ],
        )

        formatter = CaptureHtmlFormatter()
        output = formatter.format([capture1, capture2])

        # Should have multi-model title
        assert "Multi-Model Capture Results" in output

        # Should list models
        assert "gpt-4o" in output
        assert "gpt-4-turbo" in output

        # Should have model panels
        assert "model-panel" in output

    def test_html_single_model_regular_format(self) -> None:
        """Should use regular format for single model."""
        capture = _create_mock_capture_result(model="gpt-4o")

        formatter = CaptureHtmlFormatter()
        output = formatter.format([capture])

        # Should NOT have multi-model title
        assert "Multi-Model Capture Results" not in output
|
|
|
|
|
|
class TestMultiModelJsonCaptureFormatter:
    """Tests for multi-model JSON capture formatting."""

    def test_json_multi_model_output(self) -> None:
        """Should produce structured multi-model JSON."""
        capture1 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4o",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool1", "args": {}}],
                }
            ],
        )
        capture2 = _create_mock_capture_result(
            suite_name="TestSuite",
            model="gpt-4-turbo",
            cases=[
                {
                    "case_name": "case1",
                    "user_message": "Hi",
                    "tool_calls": [{"name": "Tool2", "args": {}}],
                }
            ],
        )

        formatter = CaptureJsonFormatter()
        output = formatter.format([capture1, capture2])

        data = json.loads(output)

        # Should have multi-model type
        assert data["type"] == "multi_model_capture"

        # Should have models list
        assert "models" in data
        assert "gpt-4o" in data["models"]
        assert "gpt-4-turbo" in data["models"]

        # Should have grouped_by_case structure
        assert "grouped_by_case" in data
        assert "TestSuite" in data["grouped_by_case"]
        assert "case1" in data["grouped_by_case"]["TestSuite"]

    def test_json_single_model_regular_format(self) -> None:
        """Should use regular format for single model."""
        capture = _create_mock_capture_result(model="gpt-4o")

        formatter = CaptureJsonFormatter()
        output = formatter.format([capture])

        data = json.loads(output)

        # Should have capture type
        assert data["type"] == "capture"
        # Should not have grouped_by_case
        assert "grouped_by_case" not in data
|
|
|
|
|
|
# =============================================================================
# CAPTURE WITH TRACKS TESTS
# =============================================================================
|
|
|
|
|
|
def _create_mock_capture_with_tracks(
    suite_name: str = "ComparativeSuite",
    model: str = "gpt-4o",
    provider: str = "openai",
) -> CaptureResult:
    """Create a mock CaptureResult with track information for testing.

    The mock holds three fixed cases: ``weather_case`` on ``track_a``,
    ``weather_case`` on ``track_b`` and a ``regular_case`` with no track.
    Every case has ``runs`` set to an empty list.

    Args:
        suite_name: Value assigned to the mock's ``suite_name`` attribute.
        model: Value assigned to the mock's ``model`` attribute.
        provider: Value assigned to the mock's ``provider`` attribute.

    Returns:
        A ``MagicMock`` with ``captured_cases`` and a working
        ``to_dict(include_context=False)`` that emits ``track_name`` only
        for cases whose track name is truthy.
    """
    cases = [
        {
            "case_name": "weather_case",
            "user_message": "What's the weather in NYC?",
            "tool_calls": [
                {"name": "get_weather_v1", "args": {"city": "NYC"}},
            ],
            "track_name": "track_a",
            "system_message": "You are a weather assistant",
            "additional_messages": [],
        },
        {
            "case_name": "weather_case",
            "user_message": "What's the weather in NYC?",
            "tool_calls": [
                {"name": "fetch_weather", "args": {"location": "NYC"}},
            ],
            "track_name": "track_b",
            "system_message": "You are a weather assistant",
            "additional_messages": [],
        },
        {
            "case_name": "regular_case",
            "user_message": "Hello world",
            "tool_calls": [
                {"name": "greet", "args": {}},
            ],
            "track_name": None,  # Regular case without track
            "system_message": None,
            "additional_messages": [],
        },
    ]

    capture = MagicMock()
    capture.suite_name = suite_name
    capture.model = model
    capture.provider = provider

    captured_cases = []
    for case_data in cases:
        mock_case = MagicMock()
        mock_case.case_name = case_data["case_name"]
        mock_case.user_message = case_data["user_message"]
        mock_case.system_message = case_data["system_message"]
        mock_case.additional_messages = case_data["additional_messages"]
        mock_case.track_name = case_data["track_name"]

        mock_tool_calls = []
        for tc in case_data["tool_calls"]:
            mock_tc = MagicMock()
            # Assign after construction so ``name`` is a plain attribute.
            mock_tc.name = tc["name"]
            mock_tc.args = tc["args"]
            mock_tool_calls.append(mock_tc)
        mock_case.tool_calls = mock_tool_calls
        mock_case.runs = []  # Explicitly set runs to empty for single-run captures

        captured_cases.append(mock_case)

    capture.captured_cases = captured_cases

    def to_dict(include_context: bool = False) -> dict:
        result = {
            "suite_name": capture.suite_name,
            "model": capture.model,
            "provider": capture.provider,
            "captured_cases": [],
        }
        for case in capture.captured_cases:
            case_dict = {
                "case_name": case.case_name,
                "user_message": case.user_message,
                "tool_calls": [{"name": tc.name, "args": tc.args} for tc in case.tool_calls],
            }
            # Untracked cases omit the key entirely rather than emitting None.
            if case.track_name:
                case_dict["track_name"] = case.track_name
            if include_context:
                case_dict["system_message"] = case.system_message
                case_dict["additional_messages"] = case.additional_messages
            result["captured_cases"].append(case_dict)
        return result

    capture.to_dict = to_dict
    return capture
|
|
|
|
|
|
class TestCaptureWithTracks:
|
|
"""Tests for capture mode with track support."""
|
|
|
|
def test_captured_case_has_track_name_field(self) -> None:
|
|
"""CapturedCase should have track_name field."""
|
|
from arcade_evals.capture import CapturedCase
|
|
|
|
# Create a captured case with track
|
|
case = CapturedCase(
|
|
case_name="test_case",
|
|
user_message="test",
|
|
tool_calls=[],
|
|
track_name="my_track",
|
|
)
|
|
assert case.track_name == "my_track"
|
|
|
|
# Create a captured case without track
|
|
case_no_track = CapturedCase(
|
|
case_name="test_case",
|
|
user_message="test",
|
|
tool_calls=[],
|
|
)
|
|
assert case_no_track.track_name is None
|
|
|
|
def test_captured_case_to_dict_includes_track_name(self) -> None:
|
|
"""CapturedCase.to_dict should include track_name when set."""
|
|
from arcade_evals.capture import CapturedCase
|
|
|
|
case = CapturedCase(
|
|
case_name="test_case",
|
|
user_message="test",
|
|
tool_calls=[],
|
|
track_name="my_track",
|
|
)
|
|
|
|
result = case.to_dict()
|
|
assert "track_name" in result
|
|
assert result["track_name"] == "my_track"
|
|
|
|
def test_captured_case_to_dict_excludes_track_name_when_none(self) -> None:
|
|
"""CapturedCase.to_dict should not include track_name when None."""
|
|
from arcade_evals.capture import CapturedCase
|
|
|
|
case = CapturedCase(
|
|
case_name="test_case",
|
|
user_message="test",
|
|
tool_calls=[],
|
|
track_name=None,
|
|
)
|
|
|
|
result = case.to_dict()
|
|
assert "track_name" not in result
|
|
|
|
def test_json_formatter_shows_track_name(self) -> None:
|
|
"""JSON formatter should include track_name in output."""
|
|
capture = _create_mock_capture_with_tracks()
|
|
formatter = CaptureJsonFormatter()
|
|
|
|
output = formatter.format([capture])
|
|
data = json.loads(output)
|
|
|
|
# Find case with track
|
|
cases = data["captures"][0]["captured_cases"]
|
|
track_case = next(c for c in cases if c.get("track_name") == "track_a")
|
|
assert track_case["track_name"] == "track_a"
|
|
|
|
# Find case without track
|
|
regular_case = next(c for c in cases if c.get("track_name") is None)
|
|
assert "track_name" not in regular_case
|
|
|
|
def test_text_formatter_shows_track_info(self) -> None:
    """Text output mentions the track, by name or via a 'Track:' label."""
    captured = _create_mock_capture_with_tracks()
    rendered = CaptureTextFormatter().format([captured])

    # Either marker is accepted as evidence that track info was rendered.
    assert any(marker in rendered for marker in ("track_a", "Track:"))
|
|
|
|
def test_html_formatter_shows_track_info(self) -> None:
    """HTML output mentions the track, by name or via the word 'Track'."""
    captured = _create_mock_capture_with_tracks()
    rendered = CaptureHtmlFormatter().format([captured])

    # Either marker is accepted as evidence that track info was rendered.
    assert any(marker in rendered for marker in ("track_a", "Track"))
|
|
|
|
def test_markdown_formatter_shows_track_info(self) -> None:
    """Markdown output mentions the track name, bracketed or bare."""
    captured = _create_mock_capture_with_tracks()
    rendered = CaptureMarkdownFormatter().format([captured])

    # NOTE: the bracketed form contains the bare form, so this effectively
    # checks that "track_a" appears anywhere in the output.
    assert any(marker in rendered for marker in ("[track_a]", "track_a"))
|
|
|
|
|
|
# =====================================================================
|
|
# Capture formatter multi-run tests
|
|
# =====================================================================
|
|
|
|
|
|
def _create_mock_capture_with_runs(
    num_runs: int = 3,
) -> CaptureResult:
    """Build a mock CaptureResult holding one case with ``num_runs`` recorded runs.

    Each run carries a single ``GetWeather`` call whose ``seed`` argument is the
    1-based run index rendered as a string. The case's top-level ``tool_calls``
    entry has no ``seed`` argument.
    """
    recorded_runs = []
    for run_index in range(1, num_runs + 1):
        run_call = {"name": "GetWeather", "args": {"city": "NYC", "seed": str(run_index)}}
        recorded_runs.append({"tool_calls": [run_call]})

    multi_run_case = {
        "case_name": "multi_run_case",
        "user_message": "What's the weather in NYC?",
        "tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}],
        "system_message": "You are a weather assistant",
        "additional_messages": [],
        "runs": recorded_runs,
    }

    return _create_mock_capture_result(
        suite_name="MultiRunCaptureSuite",
        cases=[multi_run_case],
    )
|
|
|
|
|
|
def _create_mock_capture_no_runs() -> CaptureResult:
    """Build a mock CaptureResult whose only case has no tool calls and no runs."""
    empty_case = {
        "case_name": "empty_case",
        "user_message": "Do nothing",
        "tool_calls": [],
        "system_message": None,
        "additional_messages": [],
    }
    return _create_mock_capture_result(suite_name="EmptyCaptureSuite", cases=[empty_case])
|
|
|
|
|
|
class TestCaptureMultiRunText:
    """Multi-run capture rendering checks for the text formatter."""

    def test_text_shows_run_headers(self) -> None:
        """Each of the three runs gets its own 'Run N:' header."""
        captured = _create_mock_capture_with_runs(num_runs=3)
        rendered = CaptureTextFormatter().format([captured])

        for header in ("Run 1:", "Run 2:", "Run 3:"):
            assert header in rendered

    def test_text_shows_tool_calls_per_run(self) -> None:
        """The tool name used by the runs appears in the text output."""
        captured = _create_mock_capture_with_runs(num_runs=2)
        rendered = CaptureTextFormatter().format([captured])

        assert "GetWeather" in rendered

    def test_text_no_runs_shows_top_level_calls(self) -> None:
        """Without a runs list, the top-level tool_calls are rendered instead."""
        captured = _create_mock_capture_result()  # default fixture: no runs
        rendered = CaptureTextFormatter().format([captured])

        assert "GetWeather" in rendered

    def test_text_empty_case_no_tool_calls(self) -> None:
        """A case with no tool calls yields a 'no tool calls' notice (any casing)."""
        captured = _create_mock_capture_no_runs()
        rendered = CaptureTextFormatter().format([captured])

        assert "no tool calls" in rendered.lower()
|
|
|
|
|
|
class TestCaptureMultiRunMarkdown:
    """Multi-run capture rendering checks for the markdown formatter."""

    def test_markdown_shows_run_headers(self) -> None:
        """Each of the three runs is labelled 'Run N' in the markdown output."""
        captured = _create_mock_capture_with_runs(num_runs=3)
        rendered = CaptureMarkdownFormatter().format([captured])

        for label in ("Run 1", "Run 2", "Run 3"):
            assert label in rendered

    def test_markdown_shows_tool_call_json(self) -> None:
        """Tool calls are rendered inside a fenced JSON block with the tool name."""
        captured = _create_mock_capture_with_runs(num_runs=2)
        rendered = CaptureMarkdownFormatter().format([captured])

        for fragment in ("```json", "GetWeather"):
            assert fragment in rendered

    def test_markdown_empty_runs_shows_no_calls(self) -> None:
        """A case with no tool calls yields a 'No tool calls' notice."""
        captured = _create_mock_capture_no_runs()
        rendered = CaptureMarkdownFormatter().format([captured])

        assert "No tool calls" in rendered
|
|
|
|
|
|
class TestCaptureMultiRunHTML:
    """Multi-run capture rendering checks for the HTML formatter."""

    def test_html_shows_capture_run_details(self) -> None:
        """The HTML contains the 'capture-run' marker and a label for every run."""
        captured = _create_mock_capture_with_runs(num_runs=3)
        rendered = CaptureHtmlFormatter().format([captured])

        for fragment in ("capture-run", "Run 1", "Run 2", "Run 3"):
            assert fragment in rendered

    def test_html_tool_calls_escaped(self) -> None:
        """The tool name survives HTML rendering of a single-run capture.

        NOTE(review): only the presence of the tool name is asserted; no
        escaped entity is checked here.
        """
        captured = _create_mock_capture_with_runs(num_runs=1)
        rendered = CaptureHtmlFormatter().format([captured])

        assert "GetWeather" in rendered

    def test_html_empty_case_no_calls(self) -> None:
        """A case with no tool calls yields a notice or a 'no-calls' marker."""
        captured = _create_mock_capture_no_runs()
        rendered = CaptureHtmlFormatter().format([captured])

        assert any(marker in rendered for marker in ("No tool calls", "no-calls"))
|
|
|
|
|
|
class TestCaptureMultiRunJSON:
    """Multi-run capture rendering checks for the JSON formatter."""

    def test_json_includes_runs_array(self) -> None:
        """A three-run case serializes a 'runs' list with three entries."""
        captured = _create_mock_capture_with_runs(num_runs=3)
        rendered = CaptureJsonFormatter().format([captured])
        capture_entries = json.loads(rendered)["captures"]

        assert len(capture_entries) == 1
        first_case = capture_entries[0]["captured_cases"][0]
        assert "runs" in first_case
        assert len(first_case["runs"]) == 3

    def test_json_no_runs_for_single_run(self) -> None:
        """A case captured without runs has no 'runs' key in the JSON."""
        captured = _create_mock_capture_result()  # default fixture: no runs
        rendered = CaptureJsonFormatter().format([captured])
        first_case = json.loads(rendered)["captures"][0]["captured_cases"][0]

        assert "runs" not in first_case

    def test_json_run_tool_calls_structure(self) -> None:
        """A serialized run exposes 'tool_calls', the first one named GetWeather."""
        captured = _create_mock_capture_with_runs(num_runs=2)
        rendered = CaptureJsonFormatter().format([captured])
        first_run = json.loads(rendered)["captures"][0]["captured_cases"][0]["runs"][0]

        assert "tool_calls" in first_run
        assert first_run["tool_calls"][0]["name"] == "GetWeather"
|
|
|
|
|
|
# =====================================================================
|
|
# Coverage gap tests — CaptureTextFormatter
|
|
# =====================================================================
|
|
|
|
|
|
class TestCaptureTextFormatterCoverageGaps:
    """Exercises CaptureTextFormatter paths that previously had no tests."""

    def test_format_value_truncation(self) -> None:
        """Short values pass through; a 100-char value is cut to 60 chars ending in '...'."""
        text_formatter = CaptureTextFormatter()

        assert text_formatter._format_value("hello") == "hello"

        shortened = text_formatter._format_value("x" * 100)
        assert len(shortened) == 60
        assert shortened.endswith("...")

    def test_format_value_exactly_60(self) -> None:
        """A value of exactly 60 chars sits on the boundary and is returned unchanged."""
        boundary_value = "a" * 60

        assert CaptureTextFormatter()._format_value(boundary_value) == boundary_value

    def test_conversation_text_format(self) -> None:
        """A user/assistant/tool exchange renders role tags, tool name and content."""
        tool_request = {
            "role": "assistant",
            "content": None,
            "tool_calls": [{"function": {"name": "get_data", "arguments": '{"id": 1}'}}],
        }
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
            tool_request,
            {"role": "tool", "name": "get_data", "content": '{"result": "ok"}'},
        ]

        rendered = "\n".join(CaptureTextFormatter()._format_conversation_text(conversation))

        for fragment in ("[USER]", "[ASSISTANT]", "[TOOL]", "get_data", "Hello"):
            assert fragment in rendered

    def test_conversation_text_invalid_json_content(self) -> None:
        """Tool content that is not JSON is still shown verbatim."""
        conversation = [
            {"role": "tool", "name": "raw", "content": "plain text output"},
        ]

        rendered = "\n".join(CaptureTextFormatter()._format_conversation_text(conversation))

        assert "plain text output" in rendered

    def test_conversation_text_invalid_json_args(self) -> None:
        """Tool-call arguments that are not JSON are still shown with the tool name."""
        malformed_call = {"function": {"name": "broken", "arguments": "not json"}}
        conversation = [
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [malformed_call],
            },
        ]

        rendered = "\n".join(CaptureTextFormatter()._format_conversation_text(conversation))

        for fragment in ("broken", "not json"):
            assert fragment in rendered

    def test_conversation_text_separator_between_messages(self) -> None:
        """A dashed separator shows up when two messages are rendered.

        Only the separator's presence is asserted; its position is not checked.
        """
        conversation = [
            {"role": "user", "content": "First"},
            {"role": "assistant", "content": "Second"},
        ]

        rendered = "\n".join(CaptureTextFormatter()._format_conversation_text(conversation))

        assert "----" in rendered

    def test_multi_model_with_tracks_and_context(self) -> None:
        """Two tracked captures with context render a multi-model report."""
        gpt4o_capture = _create_mock_capture_with_tracks(model="gpt-4o")
        turbo_capture = _create_mock_capture_with_tracks(model="gpt-4-turbo")

        rendered = CaptureTextFormatter().format(
            [gpt4o_capture, turbo_capture], include_context=True
        )

        for fragment in ("MULTI-MODEL CAPTURE RESULTS", "gpt-4o", "gpt-4-turbo"):
            assert fragment in rendered
        # Track sections may be announced by a header or just by the track name.
        assert any(marker in rendered for marker in ("TRACK:", "track_a"))

    def test_multi_model_no_data_model(self) -> None:
        """Two models sharing a case render together when one made no tool calls."""
        # model-a called one tool for case1; model-b called none for the same case.
        case_with_call = {
            "case_name": "case1",
            "user_message": "Hi",
            "tool_calls": [{"name": "T1", "args": {}}],
        }
        case_without_call = {"case_name": "case1", "user_message": "Hi", "tool_calls": []}

        model_a_capture = _create_mock_capture_result(
            suite_name="Suite",
            model="model-a",
            cases=[case_with_call],
        )
        model_b_capture = _create_mock_capture_result(
            suite_name="Suite",
            model="model-b",
            cases=[case_without_call],
        )

        rendered = CaptureTextFormatter().format([model_a_capture, model_b_capture])

        for fragment in ("model-a", "model-b", "MULTI-MODEL CAPTURE RESULTS"):
            assert fragment in rendered
|
|
|
|
|
|
# =====================================================================
|
|
# Coverage gap tests — CaptureMarkdownFormatter
|
|
# =====================================================================
|
|
|
|
|
|
class TestCaptureMarkdownFormatterCoverageGaps:
|
|
"""Tests for CaptureMarkdownFormatter methods that lacked coverage."""
|
|
|
|
def test_multi_model_with_tracks_and_context(self) -> None:
    """Two tracked captures with context render a multi-model markdown report."""
    gpt4o_capture = _create_mock_capture_with_tracks(model="gpt-4o")
    turbo_capture = _create_mock_capture_with_tracks(model="gpt-4-turbo")

    rendered = CaptureMarkdownFormatter().format(
        [gpt4o_capture, turbo_capture], include_context=True
    )

    # The report title and both model names must be present.
    for fragment in ("Multi-Model Capture Results", "gpt-4o", "gpt-4-turbo"):
        assert fragment in rendered
|
|
|
|
def test_conversation_md_standalone(self) -> None:
    """A user/assistant/tool exchange renders a user marker and the tool name."""
    tool_request = {
        "role": "assistant",
        "content": None,
        "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "x"}'}}],
    }
    conversation = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi!"},
        tool_request,
        {"role": "tool", "name": "search", "content": '{"r": 1}'},
    ]

    rendered = "\n".join(CaptureMarkdownFormatter()._format_conversation_md(conversation))

    # The user turn may be marked with an emoji or with the word "User".
    assert any(marker in rendered for marker in ("👤", "User"))
    assert "search" in rendered
|
|
|
|
def test_conversation_md_invalid_json(self) -> None:
|
|
"""Should handle invalid JSON in tool call args."""
|
|
formatter = CaptureMarkdownFormatter()
|
|
messages = [
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
|
|
},
|
|
]
|
|
lines = formatter._format_conversation_md(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "broken" in text
|