@EricGustin you can use this cli command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
>
> Fixes provider selection UX/bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
3776 lines
140 KiB
Python
3776 lines
140 KiB
Python
"""Tests for evaluation result formatters."""
|
|
|
|
import json
|
|
|
|
import pytest
|
|
from arcade_cli.formatters import (
|
|
FORMATTERS,
|
|
EvalResultFormatter,
|
|
HtmlFormatter,
|
|
JsonFormatter,
|
|
MarkdownFormatter,
|
|
TextFormatter,
|
|
get_formatter,
|
|
)
|
|
|
|
|
|
class MockEvaluation:
|
|
"""Mock EvaluationResult for testing."""
|
|
|
|
def __init__(
|
|
self,
|
|
passed: bool = True,
|
|
warning: bool = False,
|
|
score: float = 1.0,
|
|
failure_reason: str | None = None,
|
|
results: list[dict] | None = None,
|
|
):
|
|
self.passed = passed
|
|
self.warning = warning
|
|
self.score = score
|
|
self.failure_reason = failure_reason
|
|
self.results = results or []
|
|
|
|
|
|
def make_mock_results(
|
|
model: str = "gpt-4o",
|
|
cases: list[dict] | None = None,
|
|
suite_name: str = "test_eval_suite",
|
|
) -> list[list[dict]]:
|
|
"""Create mock evaluation results structure."""
|
|
if cases is None:
|
|
cases = [
|
|
{
|
|
"name": "test_case_1",
|
|
"input": "Test input 1",
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
},
|
|
{
|
|
"name": "test_case_2",
|
|
"input": "Test input 2",
|
|
"evaluation": MockEvaluation(passed=False, score=0.5),
|
|
},
|
|
]
|
|
|
|
return [[{"model": model, "suite_name": suite_name, "rubric": "Test Rubric", "cases": cases}]]
|
|
|
|
|
|
class TestGetFormatter:
|
|
"""Tests for get_formatter function."""
|
|
|
|
def test_get_text_formatter(self) -> None:
|
|
"""Should return TextFormatter for 'txt'."""
|
|
formatter = get_formatter("txt")
|
|
assert isinstance(formatter, TextFormatter)
|
|
|
|
def test_get_markdown_formatter(self) -> None:
|
|
"""Should return MarkdownFormatter for 'md'."""
|
|
formatter = get_formatter("md")
|
|
assert isinstance(formatter, MarkdownFormatter)
|
|
|
|
def test_case_insensitive(self) -> None:
|
|
"""Should be case-insensitive."""
|
|
assert isinstance(get_formatter("TXT"), TextFormatter)
|
|
assert isinstance(get_formatter("MD"), MarkdownFormatter)
|
|
|
|
def test_invalid_format_raises_error(self) -> None:
|
|
"""Should raise ValueError for unknown format."""
|
|
with pytest.raises(ValueError, match="Unsupported format"):
|
|
get_formatter("invalid")
|
|
|
|
def test_fuzzy_matching_suggests_close_match(self) -> None:
|
|
"""Should suggest 'txt' when 'txtt' is provided."""
|
|
with pytest.raises(ValueError) as excinfo:
|
|
get_formatter("txtt")
|
|
assert "Did you mean 'txt'?" in str(excinfo.value)
|
|
|
|
def test_fuzzy_matching_suggests_html_for_htm(self) -> None:
|
|
"""Should suggest 'html' when 'htm' is provided."""
|
|
with pytest.raises(ValueError) as excinfo:
|
|
get_formatter("htm")
|
|
assert "Did you mean 'html'?" in str(excinfo.value)
|
|
|
|
def test_no_suggestion_for_completely_different_format(self) -> None:
|
|
"""Should not suggest anything for completely different format names."""
|
|
with pytest.raises(ValueError) as excinfo:
|
|
get_formatter("xyz123")
|
|
assert "Did you mean" not in str(excinfo.value)
|
|
assert "Supported formats:" in str(excinfo.value)
|
|
|
|
|
|
class TestFormattersRegistry:
|
|
"""Tests for FORMATTERS registry."""
|
|
|
|
def test_registry_has_expected_formats(self) -> None:
|
|
"""Registry should contain txt and md formats."""
|
|
assert "txt" in FORMATTERS
|
|
assert "md" in FORMATTERS
|
|
|
|
def test_registry_values_are_formatter_classes(self) -> None:
|
|
"""All registry values should be EvalResultFormatter subclasses."""
|
|
for name, formatter_cls in FORMATTERS.items():
|
|
assert issubclass(formatter_cls, EvalResultFormatter), f"{name} is not a formatter"
|
|
|
|
|
|
class TestTextFormatter:
|
|
"""Tests for TextFormatter."""
|
|
|
|
def test_file_extension(self) -> None:
|
|
"""File extension should be 'txt'."""
|
|
formatter = TextFormatter()
|
|
assert formatter.file_extension == "txt"
|
|
|
|
def test_format_basic_results(self) -> None:
|
|
"""Should format basic results correctly."""
|
|
formatter = TextFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "Model: gpt-4o" in output
|
|
assert "PASSED test_case_1" in output
|
|
assert "FAILED test_case_2" in output
|
|
assert "Score: 100.00%" in output
|
|
assert "Score: 50.00%" in output
|
|
assert "Summary" in output
|
|
assert "Total: 2" in output
|
|
assert "Passed: 1" in output
|
|
assert "Failed: 1" in output
|
|
|
|
def test_format_with_warnings(self) -> None:
|
|
"""Should show warnings correctly."""
|
|
cases = [
|
|
{
|
|
"name": "warned_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=False, warning=True, score=0.7),
|
|
}
|
|
]
|
|
formatter = TextFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases))
|
|
|
|
assert "WARNED warned_case" in output
|
|
assert "Warnings: 1" in output
|
|
|
|
def test_format_with_details(self) -> None:
|
|
"""Should include detailed output when show_details=True."""
|
|
cases = [
|
|
{
|
|
"name": "detailed_case",
|
|
"input": "Detailed test input",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "param1",
|
|
"match": True,
|
|
"score": 0.5,
|
|
"weight": 0.5,
|
|
"expected": "expected_val",
|
|
"actual": "actual_val",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
}
|
|
]
|
|
formatter = TextFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "User Input: Detailed test input" in output
|
|
assert "Details:" in output
|
|
assert "param1:" in output
|
|
assert "Expected: expected_val" in output
|
|
assert "Actual: actual_val" in output
|
|
|
|
def test_format_failed_only_with_original_counts(self) -> None:
|
|
"""Should show original counts with failed_only mode."""
|
|
formatter = TextFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(
|
|
results,
|
|
failed_only=True,
|
|
original_counts=(10, 8, 2, 0),
|
|
)
|
|
|
|
assert "Showing only 2 failed evaluation(s)" in output
|
|
assert "Total: 10" in output
|
|
assert "Passed: 8" in output
|
|
assert "Failed: 2" in output
|
|
|
|
|
|
class TestMarkdownFormatter:
|
|
"""Tests for MarkdownFormatter."""
|
|
|
|
def test_file_extension(self) -> None:
|
|
"""File extension should be 'md'."""
|
|
formatter = MarkdownFormatter()
|
|
assert formatter.file_extension == "md"
|
|
|
|
def test_format_has_markdown_structure(self) -> None:
|
|
"""Should produce valid markdown structure."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
# Check headers
|
|
assert "# Evaluation Results" in output
|
|
assert "## Summary" in output
|
|
assert "## Results by Model" in output
|
|
assert "### 🤖 gpt-4o" in output
|
|
|
|
# Check table markers
|
|
assert "|" in output
|
|
assert "---" in output
|
|
|
|
def test_format_summary_table(self) -> None:
|
|
"""Should include summary table with stats."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "| Metric | Count |" in output
|
|
assert "| **Total** | 2 |" in output
|
|
assert "| ✅ Passed | 1 |" in output
|
|
assert "| ❌ Failed | 1 |" in output
|
|
|
|
def test_format_results_table(self) -> None:
|
|
"""Should include results table per model."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "| Status | Case | Score |" in output
|
|
assert "| ✅ | test_case_1 | 100.0% |" in output
|
|
assert "| ❌ | test_case_2 | 50.0% |" in output
|
|
|
|
def test_format_with_warnings_emoji(self) -> None:
|
|
"""Should use warning emoji for warned cases."""
|
|
cases = [
|
|
{
|
|
"name": "warned_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=False, warning=True, score=0.7),
|
|
}
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases))
|
|
|
|
assert "⚠️" in output
|
|
|
|
def test_format_with_details_collapsible(self) -> None:
|
|
"""Should use collapsible details section."""
|
|
cases = [
|
|
{
|
|
"name": "detailed_case",
|
|
"input": "Test input",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "param1",
|
|
"match": True,
|
|
"score": 0.5,
|
|
"weight": 0.5,
|
|
"expected": "exp",
|
|
"actual": "act",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
}
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "<details>" in output
|
|
assert "<summary>" in output
|
|
assert "</details>" in output
|
|
assert "#### detailed_case" in output
|
|
|
|
def test_markdown_run_details_include_per_run_tables(self) -> None:
|
|
"""Should include per-run detail tables when available."""
|
|
cases = [
|
|
{
|
|
"name": "multi_run_case",
|
|
"input": "Test input",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "param1",
|
|
"match": True,
|
|
"score": 0.5,
|
|
"weight": 0.5,
|
|
"expected": "exp",
|
|
"actual": "act",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
"run_stats": {
|
|
"num_runs": 2,
|
|
"scores": [0.9, 0.7],
|
|
"mean_score": 0.8,
|
|
"std_deviation": 0.1,
|
|
"runs": [
|
|
{
|
|
"score": 0.9,
|
|
"passed": True,
|
|
"warning": False,
|
|
"details": [
|
|
{
|
|
"field": "param1",
|
|
"match": True,
|
|
"score": 0.5,
|
|
"weight": 0.5,
|
|
"expected": "exp",
|
|
"actual": "act",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
},
|
|
{
|
|
"score": 0.7,
|
|
"passed": False,
|
|
"warning": False,
|
|
"details": [
|
|
{
|
|
"field": "param1",
|
|
"match": False,
|
|
"score": 0.0,
|
|
"weight": 0.5,
|
|
"expected": "exp",
|
|
"actual": "wrong",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
},
|
|
],
|
|
},
|
|
}
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "**Run Details:**" in output
|
|
assert "<summary>Run 1 details</summary>" in output
|
|
assert "| Field | Match | Score | Expected | Actual |" in output
|
|
assert "| param1 | ✅ | 0.50/0.50 | `exp` | `act` |" in output
|
|
|
|
def test_format_pass_rate(self) -> None:
|
|
"""Should include pass rate percentage."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "**Pass Rate:**" in output
|
|
assert "50.0%" in output
|
|
|
|
def test_format_escapes_pipe_in_case_names(self) -> None:
|
|
"""Should escape pipe characters in case names for tables."""
|
|
cases = [
|
|
{
|
|
"name": "case|with|pipes",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
}
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases))
|
|
|
|
# Should escape pipes
|
|
assert "case\\|with\\|pipes" in output
|
|
|
|
def test_format_failed_only_shows_note(self) -> None:
|
|
"""Should show note when failed_only mode."""
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(
|
|
make_mock_results(),
|
|
failed_only=True,
|
|
original_counts=(10, 8, 2, 0),
|
|
)
|
|
|
|
assert "> ⚠️ **Note:**" in output
|
|
assert "failed evaluation(s)" in output
|
|
|
|
def test_format_includes_timestamp(self) -> None:
|
|
"""Should include generation timestamp."""
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results())
|
|
|
|
assert "**Generated:**" in output
|
|
assert "UTC" in output
|
|
|
|
|
|
class TestFormatterFailureReason:
|
|
"""Tests for handling failure reasons in formatters."""
|
|
|
|
def test_text_formatter_shows_failure_reason(self) -> None:
|
|
"""TextFormatter should show failure reason."""
|
|
cases = [
|
|
{
|
|
"name": "failed_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=0.0,
|
|
failure_reason="Tool not called",
|
|
),
|
|
}
|
|
]
|
|
formatter = TextFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "Failure Reason: Tool not called" in output
|
|
|
|
def test_markdown_formatter_shows_failure_reason(self) -> None:
|
|
"""MarkdownFormatter should show failure reason."""
|
|
cases = [
|
|
{
|
|
"name": "failed_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=0.0,
|
|
failure_reason="Tool not called",
|
|
),
|
|
}
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "**Failure Reason:** Tool not called" in output
|
|
|
|
|
|
class TestFormatterMultipleModels:
|
|
"""Tests for handling multiple models."""
|
|
|
|
def test_text_formatter_multiple_models(self) -> None:
|
|
"""Should show all models in multi-model output."""
|
|
results = [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"rubric": "Rubric 1",
|
|
"cases": [
|
|
{
|
|
"name": "case1",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=True),
|
|
}
|
|
],
|
|
},
|
|
{
|
|
"model": "claude-3-opus",
|
|
"rubric": "Rubric 2",
|
|
"cases": [
|
|
{
|
|
"name": "case2",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=False),
|
|
}
|
|
],
|
|
},
|
|
]
|
|
]
|
|
formatter = TextFormatter()
|
|
output = formatter.format(results)
|
|
|
|
# Multi-model format shows models in summary table
|
|
assert "MULTI-MODEL EVALUATION RESULTS" in output
|
|
assert "gpt-4o" in output
|
|
assert "claude-3-opus" in output
|
|
|
|
def test_markdown_formatter_groups_by_model(self) -> None:
|
|
"""Should group results by model in markdown."""
|
|
results = [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"rubric": "Rubric",
|
|
"cases": [
|
|
{"name": "c1", "input": "T", "evaluation": MockEvaluation(passed=True)}
|
|
],
|
|
},
|
|
{
|
|
"model": "gpt-4o", # Same model
|
|
"rubric": "Rubric",
|
|
"cases": [
|
|
{"name": "c2", "input": "T", "evaluation": MockEvaluation(passed=True)}
|
|
],
|
|
},
|
|
]
|
|
]
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(results)
|
|
|
|
# Should only have one header for gpt-4o
|
|
assert output.count("### 🤖 gpt-4o") == 1
|
|
# But both cases under it
|
|
assert "c1" in output
|
|
assert "c2" in output
|
|
|
|
|
|
class TestHtmlFormatter:
|
|
"""Tests for HtmlFormatter with color support."""
|
|
|
|
def test_file_extension(self) -> None:
|
|
"""File extension should be 'html'."""
|
|
formatter = HtmlFormatter()
|
|
assert formatter.file_extension == "html"
|
|
|
|
def test_format_produces_valid_html_structure(self) -> None:
|
|
"""Should produce valid HTML structure."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "<!DOCTYPE html>" in output
|
|
assert "<html" in output
|
|
assert "</html>" in output
|
|
assert "<head>" in output
|
|
assert "<body>" in output
|
|
assert "<style>" in output
|
|
|
|
def test_format_includes_css_colors(self) -> None:
|
|
"""Should include CSS color definitions."""
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results())
|
|
|
|
# Check for color variables
|
|
assert "--green:" in output
|
|
assert "--red:" in output
|
|
assert "--yellow:" in output
|
|
assert "--blue:" in output
|
|
assert "--purple:" in output
|
|
|
|
def test_format_basic_results_with_status_classes(self) -> None:
|
|
"""Should include status classes for styling in summary table."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results()
|
|
# Without details, should show summary table
|
|
output = formatter.format(results, show_details=False)
|
|
|
|
assert 'class="passed"' in output
|
|
assert 'class="failed"' in output
|
|
assert 'class="results-table"' in output
|
|
|
|
def test_format_shows_suite_name(self) -> None:
|
|
"""Should display suite name in the output."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results(suite_name="my_custom_suite")
|
|
output = formatter.format(results)
|
|
|
|
# Should show suite section with suite name
|
|
assert 'class="suite-section"' in output
|
|
assert 'class="suite-header"' in output
|
|
assert "my_custom_suite" in output
|
|
|
|
def test_format_hides_summary_table_when_details_shown(self) -> None:
|
|
"""Should hide summary table when show_details=True to avoid duplication."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
# Summary table should NOT be present
|
|
assert 'class="results-table"' not in output
|
|
# But expandable details should be
|
|
assert 'class="case-expandable' in output
|
|
|
|
def test_format_with_warnings_has_warned_class(self) -> None:
|
|
"""Should include warned class for warnings."""
|
|
cases = [
|
|
{
|
|
"name": "warned_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(passed=False, warning=True, score=0.7),
|
|
}
|
|
]
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases))
|
|
|
|
assert 'class="warned"' in output
|
|
assert "⚠️ WARNED" in output
|
|
|
|
def test_format_escapes_html_special_chars(self) -> None:
|
|
"""Should escape HTML special characters."""
|
|
cases = [
|
|
{
|
|
"name": "<script>alert('xss')</script>",
|
|
"input": "Test <b>bold</b>",
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
}
|
|
]
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases))
|
|
|
|
# Should escape < and > in case name
|
|
assert "<script>" in output
|
|
# Raw script tags should NOT be present (XSS prevention)
|
|
assert "<script>alert" not in output
|
|
|
|
def test_format_includes_stats_grid(self) -> None:
|
|
"""Should include stats grid with counts."""
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results())
|
|
|
|
assert 'class="stats-grid"' in output
|
|
assert 'class="stat-card total"' in output
|
|
assert 'class="stat-card passed"' in output
|
|
|
|
def test_format_with_details_includes_collapsible(self) -> None:
|
|
"""Should include details/summary for collapsible sections."""
|
|
cases = [
|
|
{
|
|
"name": "detailed_case",
|
|
"input": "Test input",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "param1",
|
|
"match": True,
|
|
"score": 0.5,
|
|
"weight": 0.5,
|
|
"expected": "exp",
|
|
"actual": "act",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
}
|
|
]
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
# Each case should be individually expandable
|
|
assert '<details class="case-expandable' in output
|
|
assert '<summary class="case-summary">' in output
|
|
assert "</details>" in output
|
|
assert "detailed_case" in output
|
|
|
|
def test_format_each_case_is_individually_expandable(self) -> None:
|
|
"""Each case result should be in its own collapsible element."""
|
|
cases = [
|
|
{"name": "case_1", "input": "T1", "evaluation": MockEvaluation(passed=True)},
|
|
{"name": "case_2", "input": "T2", "evaluation": MockEvaluation(passed=False)},
|
|
{
|
|
"name": "case_3",
|
|
"input": "T3",
|
|
"evaluation": MockEvaluation(passed=False, warning=True),
|
|
},
|
|
]
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
# Should have 3 separate expandable case elements
|
|
assert output.count('<details class="case-expandable') == 3
|
|
assert output.count("</details>") >= 3
|
|
|
|
def test_format_shows_expand_hint_in_details_mode(self) -> None:
|
|
"""Should show hint about clicking to expand when details mode is on."""
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(), show_details=True)
|
|
|
|
assert "expand-hint" in output
|
|
assert "Click on any case below to expand details" in output
|
|
|
|
def test_format_no_expand_hint_without_details(self) -> None:
|
|
"""Should not show expand hint when details mode is off."""
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(), show_details=False)
|
|
|
|
# The hint text should not be in the output (even though CSS class may be defined)
|
|
assert "Click on any case below to expand details" not in output
|
|
|
|
def test_format_failure_reason_styled(self) -> None:
|
|
"""Should style failure reasons prominently."""
|
|
cases = [
|
|
{
|
|
"name": "failed_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=0.0,
|
|
failure_reason="Tool not called",
|
|
),
|
|
}
|
|
]
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results(cases=cases), show_details=True)
|
|
|
|
assert "failure-reason" in output
|
|
assert "Tool not called" in output
|
|
|
|
def test_format_responsive_css(self) -> None:
|
|
"""Should include responsive CSS media query."""
|
|
formatter = HtmlFormatter()
|
|
output = formatter.format(make_mock_results())
|
|
|
|
assert "@media" in output
|
|
|
|
def test_get_formatter_returns_html(self) -> None:
|
|
"""get_formatter should return HtmlFormatter for 'html'."""
|
|
formatter = get_formatter("html")
|
|
assert isinstance(formatter, HtmlFormatter)
|
|
|
|
def test_registry_includes_html(self) -> None:
|
|
"""FORMATTERS registry should include html."""
|
|
assert "html" in FORMATTERS
|
|
assert FORMATTERS["html"] is HtmlFormatter
|
|
|
|
|
|
class TestSharedUtilities:
|
|
"""Tests for shared utility functions in base.py."""
|
|
|
|
def test_truncate_field_value_within_limit(self) -> None:
|
|
"""Should not truncate values within limit."""
|
|
from arcade_cli.formatters.base import truncate_field_value
|
|
|
|
short_value = "short string"
|
|
result = truncate_field_value(short_value)
|
|
assert result == short_value
|
|
|
|
def test_truncate_field_value_exceeds_limit(self) -> None:
|
|
"""Should truncate values exceeding limit."""
|
|
from arcade_cli.formatters.base import truncate_field_value
|
|
|
|
long_value = "x" * 100
|
|
result = truncate_field_value(long_value, max_length=60)
|
|
assert len(result) == 60
|
|
assert result.endswith("...")
|
|
|
|
def test_truncate_field_value_custom_limit(self) -> None:
|
|
"""Should respect custom max_length."""
|
|
from arcade_cli.formatters.base import truncate_field_value
|
|
|
|
value = "hello world"
|
|
result = truncate_field_value(value, max_length=8)
|
|
assert len(result) == 8
|
|
assert result == "hello..."
|
|
|
|
def test_truncate_field_value_at_boundary(self) -> None:
|
|
"""Should handle exactly at boundary."""
|
|
from arcade_cli.formatters.base import truncate_field_value
|
|
|
|
value = "x" * 60
|
|
result = truncate_field_value(value, max_length=60)
|
|
assert result == value # Exactly at limit, no truncation
|
|
|
|
def test_group_results_by_model_basic(self) -> None:
|
|
"""Should group results by model and suite."""
|
|
from arcade_cli.formatters.base import group_results_by_model
|
|
|
|
results = make_mock_results(model="gpt-4o", suite_name="test_suite")
|
|
model_groups, passed, failed, warned, total = group_results_by_model(results)
|
|
|
|
assert "gpt-4o" in model_groups
|
|
assert "test_suite" in model_groups["gpt-4o"]
|
|
assert total == 2 # Two cases from make_mock_results
|
|
|
|
def test_group_results_by_model_multiple_models(self) -> None:
|
|
"""Should correctly separate multiple models."""
|
|
from arcade_cli.formatters.base import group_results_by_model
|
|
|
|
results = [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "suite_a",
|
|
"cases": [{"name": "c1", "evaluation": MockEvaluation(passed=True)}],
|
|
},
|
|
{
|
|
"model": "claude-3",
|
|
"suite_name": "suite_b",
|
|
"cases": [{"name": "c2", "evaluation": MockEvaluation(passed=False)}],
|
|
},
|
|
]
|
|
]
|
|
model_groups, passed, failed, warned, total = group_results_by_model(results)
|
|
|
|
assert len(model_groups) == 2
|
|
assert "gpt-4o" in model_groups
|
|
assert "claude-3" in model_groups
|
|
assert passed == 1
|
|
assert failed == 1
|
|
assert total == 2
|
|
|
|
def test_group_results_by_model_suite_name_fallback(self) -> None:
|
|
"""Should fall back to 'Unnamed Suite' when suite_name is missing."""
|
|
from arcade_cli.formatters.base import group_results_by_model
|
|
|
|
results = [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
# No suite_name, no rubric
|
|
"cases": [{"name": "c1", "evaluation": MockEvaluation(passed=True)}],
|
|
}
|
|
]
|
|
]
|
|
model_groups, _, _, _, _ = group_results_by_model(results)
|
|
|
|
assert "Unnamed Suite" in model_groups["gpt-4o"]
|
|
|
|
def test_group_results_counts_warnings(self) -> None:
|
|
"""Should correctly count warnings."""
|
|
from arcade_cli.formatters.base import group_results_by_model
|
|
|
|
results = [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "suite",
|
|
"cases": [
|
|
{"name": "c1", "evaluation": MockEvaluation(passed=True, warning=False)},
|
|
{"name": "c2", "evaluation": MockEvaluation(passed=False, warning=True)},
|
|
{"name": "c3", "evaluation": MockEvaluation(passed=False, warning=False)},
|
|
],
|
|
}
|
|
]
|
|
]
|
|
_, passed, failed, warned, total = group_results_by_model(results)
|
|
|
|
assert passed == 1
|
|
assert warned == 1
|
|
assert failed == 1
|
|
assert total == 3
|
|
|
|
|
|
# =============================================================================
|
|
# COMPARATIVE EVALUATION TESTS
|
|
# =============================================================================
|
|
|
|
|
|
def make_comparative_results(
|
|
model: str = "gpt-4o",
|
|
suite_name: str = "Test Suite",
|
|
track_cases: dict[str, list[dict]] | None = None,
|
|
) -> list[list[dict]]:
|
|
"""Create mock comparative evaluation results structure.
|
|
|
|
Args:
|
|
model: The model name.
|
|
suite_name: Base suite name (will have track suffix added).
|
|
track_cases: Dict of track_name -> list of cases. If None, uses default.
|
|
|
|
Returns:
|
|
Results in the format produced by _convert_comparative_to_cli_format.
|
|
"""
|
|
if track_cases is None:
|
|
# Default: two tracks, same case
|
|
track_cases = {
|
|
"track_a": [
|
|
{
|
|
"name": "create_issue",
|
|
"input": "Create a bug issue",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "title",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "Bug Issue",
|
|
"actual": "Bug Issue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "description",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "A bug",
|
|
"actual": "A bug",
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
}
|
|
],
|
|
"track_b": [
|
|
{
|
|
"name": "create_issue",
|
|
"input": "Create a bug issue",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "title",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "Bug Issue",
|
|
"actual": "Bug Issue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "description",
|
|
"match": False,
|
|
"score": 0.8,
|
|
"weight": 1.0,
|
|
"expected": "A bug",
|
|
"actual": "A bug report",
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
}
|
|
],
|
|
}
|
|
|
|
# Build results like _convert_comparative_to_cli_format produces
|
|
results: list[list[dict]] = []
|
|
for track_name, cases in track_cases.items():
|
|
results.append([
|
|
{
|
|
"model": model,
|
|
"suite_name": f"{suite_name} [{track_name}]",
|
|
"track_name": track_name,
|
|
"rubric": None,
|
|
"cases": cases,
|
|
}
|
|
])
|
|
|
|
return results
|
|
|
|
|
|
# =============================================================================
|
|
# REALISTIC MCP SERVER DEFINITIONS FOR COMPARATIVE TESTS
|
|
# =============================================================================
|
|
|
|
# Simulates 3 MCP servers that provide Linear integration with different implementations:
|
|
# 1. linear_official - Official Linear MCP server (uses Linear's native tool names)
|
|
# 2. linear_arcade - Arcade's Linear toolkit (uses Arcade naming conventions)
|
|
# 3. linear_community - Community/custom implementation (alternative naming)
|
|
|
|
MCP_SERVER_LINEAR_OFFICIAL: dict = {
|
|
"name": "linear_official",
|
|
"description": "Official Linear MCP Server",
|
|
"tools": [
|
|
{
|
|
"name": "linear_create_issue",
|
|
"description": "Create a new issue in Linear",
|
|
"parameters": {
|
|
"title": {"type": "string", "required": True},
|
|
"description": {"type": "string", "required": False},
|
|
"team_id": {"type": "string", "required": True},
|
|
"priority": {"type": "integer", "required": False, "enum": [0, 1, 2, 3, 4]},
|
|
"assignee_id": {"type": "string", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "linear_update_issue",
|
|
"description": "Update an existing issue",
|
|
"parameters": {
|
|
"issue_id": {"type": "string", "required": True},
|
|
"title": {"type": "string", "required": False},
|
|
"state_id": {"type": "string", "required": False},
|
|
"priority": {"type": "integer", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "linear_list_issues",
|
|
"description": "List issues with filters",
|
|
"parameters": {
|
|
"team_id": {"type": "string", "required": False},
|
|
"state": {"type": "string", "required": False},
|
|
"assignee_id": {"type": "string", "required": False},
|
|
"limit": {"type": "integer", "required": False, "default": 50},
|
|
},
|
|
},
|
|
],
|
|
}
|
|
|
|
MCP_SERVER_LINEAR_ARCADE: dict = {
|
|
"name": "linear_arcade",
|
|
"description": "Arcade Linear Toolkit",
|
|
"tools": [
|
|
{
|
|
"name": "Linear_CreateIssue",
|
|
"description": "Create a new issue in Linear workspace",
|
|
"parameters": {
|
|
"title": {"type": "string", "required": True},
|
|
"description": {"type": "string", "required": False},
|
|
"team_key": {"type": "string", "required": True},
|
|
"priority_level": {
|
|
"type": "string",
|
|
"required": False,
|
|
"enum": ["urgent", "high", "medium", "low", "none"],
|
|
},
|
|
"assignee_email": {"type": "string", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "Linear_UpdateIssue",
|
|
"description": "Update an existing Linear issue",
|
|
"parameters": {
|
|
"issue_identifier": {"type": "string", "required": True},
|
|
"title": {"type": "string", "required": False},
|
|
"state_name": {"type": "string", "required": False},
|
|
"priority_level": {"type": "string", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "Linear_ListIssues",
|
|
"description": "Search and list issues",
|
|
"parameters": {
|
|
"team_key": {"type": "string", "required": False},
|
|
"status": {"type": "string", "required": False},
|
|
"assignee_email": {"type": "string", "required": False},
|
|
"max_results": {"type": "integer", "required": False, "default": 25},
|
|
},
|
|
},
|
|
],
|
|
}
|
|
|
|
MCP_SERVER_LINEAR_COMMUNITY: dict = {
|
|
"name": "linear_community",
|
|
"description": "Community Linear Integration",
|
|
"tools": [
|
|
{
|
|
"name": "createLinearIssue",
|
|
"description": "Creates an issue in Linear",
|
|
"parameters": {
|
|
"name": {"type": "string", "required": True}, # Different param name!
|
|
"body": {"type": "string", "required": False}, # Different param name!
|
|
"team": {"type": "string", "required": True},
|
|
"urgency": {
|
|
"type": "string",
|
|
"required": False,
|
|
"enum": ["critical", "high", "normal", "low"],
|
|
},
|
|
"owner": {"type": "string", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "updateLinearIssue",
|
|
"description": "Updates a Linear issue",
|
|
"parameters": {
|
|
"id": {"type": "string", "required": True},
|
|
"name": {"type": "string", "required": False},
|
|
"status": {"type": "string", "required": False},
|
|
"urgency": {"type": "string", "required": False},
|
|
},
|
|
},
|
|
{
|
|
"name": "searchLinearIssues",
|
|
"description": "Search for issues",
|
|
"parameters": {
|
|
"team": {"type": "string", "required": False},
|
|
"status": {"type": "string", "required": False},
|
|
"owner": {"type": "string", "required": False},
|
|
"count": {"type": "integer", "required": False, "default": 20},
|
|
},
|
|
},
|
|
],
|
|
}
|
|
|
|
|
|
def make_mcp_server_comparative_results(
|
|
model: str = "gpt-4o",
|
|
suite_name: str = "Linear MCP Comparison",
|
|
) -> list[list[dict]]:
|
|
"""Create comparative results simulating 3 different MCP servers for Linear.
|
|
|
|
This represents a realistic use case: comparing how well an LLM works with
|
|
different MCP server implementations that serve the same purpose.
|
|
"""
|
|
# Each server has different tool names and parameter conventions
|
|
track_cases = {
|
|
# Official Linear MCP - uses integer priority, state_id
|
|
"linear_official": [
|
|
{
|
|
"name": "create_bug_issue",
|
|
"input": "Create a high priority bug: 'API timeout errors'",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "linear_create_issue",
|
|
"actual": "linear_create_issue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "title",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "API timeout errors",
|
|
"actual": "API timeout errors",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "priority",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 0.8,
|
|
"expected": 1, # integer priority
|
|
"actual": 1,
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
{
|
|
"name": "update_issue_status",
|
|
"input": "Mark issue LIN-123 as done",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "linear_update_issue",
|
|
"actual": "linear_update_issue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "issue_id",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "LIN-123",
|
|
"actual": "LIN-123",
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
],
|
|
# Arcade Linear - uses string priority_level, state_name
|
|
"linear_arcade": [
|
|
{
|
|
"name": "create_bug_issue",
|
|
"input": "Create a high priority bug: 'API timeout errors'",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.95,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "Linear_CreateIssue",
|
|
"actual": "Linear_CreateIssue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "title",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "API timeout errors",
|
|
"actual": "API timeout errors",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "priority_level",
|
|
"match": True,
|
|
"score": 0.8,
|
|
"weight": 0.8,
|
|
"expected": "high", # string priority
|
|
"actual": "high",
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
{
|
|
"name": "update_issue_status",
|
|
"input": "Mark issue LIN-123 as done",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "Linear_UpdateIssue",
|
|
"actual": "Linear_UpdateIssue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "issue_identifier",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "LIN-123",
|
|
"actual": "LIN-123",
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
],
|
|
# Community Linear - uses name/body instead of title/description
|
|
"linear_community": [
|
|
{
|
|
"name": "create_bug_issue",
|
|
"input": "Create a high priority bug: 'API timeout errors'",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=0.7,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "createLinearIssue",
|
|
"actual": "createLinearIssue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "name", # Different field name!
|
|
"match": False,
|
|
"score": 0.5,
|
|
"weight": 1.0,
|
|
"expected": "API timeout errors",
|
|
"actual": "Bug: API timeout", # LLM confused by param name
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "urgency",
|
|
"match": False,
|
|
"score": 0.0,
|
|
"weight": 0.8,
|
|
"expected": "high",
|
|
"actual": "critical", # Wrong mapping
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
{
|
|
"name": "update_issue_status",
|
|
"input": "Mark issue LIN-123 as done",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=0.9,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "updateLinearIssue",
|
|
"actual": "updateLinearIssue",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "id",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "LIN-123",
|
|
"actual": "LIN-123",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "status",
|
|
"match": False,
|
|
"score": 0.8,
|
|
"weight": 0.5,
|
|
"expected": "done",
|
|
"actual": "completed", # Slight mismatch
|
|
"is_criticized": True,
|
|
},
|
|
],
|
|
),
|
|
},
|
|
],
|
|
}
|
|
|
|
# Build results
|
|
results: list[list[dict]] = []
|
|
for track_name, cases in track_cases.items():
|
|
results.append([
|
|
{
|
|
"model": model,
|
|
"suite_name": f"{suite_name} [{track_name}]",
|
|
"track_name": track_name,
|
|
"rubric": None,
|
|
"cases": cases,
|
|
}
|
|
])
|
|
|
|
return results
|
|
|
|
|
|
class TestComparativeHelpers:
|
|
"""Tests for comparative evaluation helper functions."""
|
|
|
|
def test_is_comparative_result_true(self) -> None:
|
|
"""Should return True for results with track_name."""
|
|
from arcade_cli.formatters.base import is_comparative_result
|
|
|
|
results = make_comparative_results()
|
|
assert is_comparative_result(results) is True
|
|
|
|
def test_is_comparative_result_false(self) -> None:
|
|
"""Should return False for regular results without track_name."""
|
|
from arcade_cli.formatters.base import is_comparative_result
|
|
|
|
results = make_mock_results()
|
|
assert is_comparative_result(results) is False
|
|
|
|
def test_extract_base_suite_name(self) -> None:
|
|
"""Should extract base suite name by removing track suffix."""
|
|
from arcade_cli.formatters.base import _extract_base_suite_name
|
|
|
|
assert _extract_base_suite_name("My Suite [track_a]", "track_a") == "My Suite"
|
|
assert _extract_base_suite_name("Suite [foo]", "foo") == "Suite"
|
|
# Should not change if suffix doesn't match
|
|
assert _extract_base_suite_name("Suite [other]", "track_a") == "Suite [other]"
|
|
|
|
def test_group_comparative_by_case(self) -> None:
|
|
"""Should group comparative results by model, suite, and case."""
|
|
from arcade_cli.formatters.base import group_comparative_by_case
|
|
|
|
results = make_comparative_results()
|
|
groups, passed, failed, warned, total, suite_track_order = group_comparative_by_case(
|
|
results
|
|
)
|
|
|
|
# Check structure
|
|
assert "gpt-4o" in groups
|
|
assert "Test Suite" in groups["gpt-4o"]
|
|
assert "create_issue" in groups["gpt-4o"]["Test Suite"]
|
|
|
|
# Check case data
|
|
case_data = groups["gpt-4o"]["Test Suite"]["create_issue"]
|
|
assert "tracks" in case_data
|
|
assert "track_a" in case_data["tracks"]
|
|
assert "track_b" in case_data["tracks"]
|
|
assert case_data["input"] == "Create a bug issue"
|
|
|
|
# Check stats (2 cases total: 1 per track)
|
|
assert total == 2
|
|
assert passed == 2
|
|
assert failed == 0
|
|
|
|
# Check track order per suite
|
|
assert "Test Suite" in suite_track_order
|
|
assert suite_track_order["Test Suite"] == ["track_a", "track_b"]
|
|
|
|
def test_compute_track_differences(self) -> None:
|
|
"""Should compute which fields differ from baseline."""
|
|
from arcade_cli.formatters.base import compute_track_differences, group_comparative_by_case
|
|
|
|
results = make_comparative_results()
|
|
groups, _, _, _, _, suite_track_order = group_comparative_by_case(results)
|
|
|
|
case_data = groups["gpt-4o"]["Test Suite"]["create_issue"]
|
|
track_order = suite_track_order["Test Suite"]
|
|
differences = compute_track_differences(case_data, track_order)
|
|
|
|
# track_a is baseline, so no differences for it
|
|
assert "track_a" not in differences
|
|
|
|
# track_b should have 'description' as different
|
|
assert "track_b" in differences
|
|
assert "description" in differences["track_b"]
|
|
assert "title" not in differences["track_b"] # title matched
|
|
|
|
|
|
class TestComparativeMarkdownFormatter:
|
|
"""Tests for comparative markdown formatting."""
|
|
|
|
def test_comparative_format_header(self) -> None:
|
|
"""Should have comparative header."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "# Comparative Evaluation Results" in output
|
|
assert "Tracks compared:" in output
|
|
assert "`track_a`" in output
|
|
assert "`track_b`" in output
|
|
|
|
def test_comparative_format_suite_tracks(self) -> None:
|
|
"""Should show suite-specific tracks."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# Suite should show its tracks
|
|
assert "**Tracks:** `track_a`, `track_b`" in output
|
|
|
|
def test_comparative_format_case_comparison_table(self) -> None:
|
|
"""Should show case comparison table with all tracks."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "##### Case: create_issue" in output
|
|
assert "| Track | Status | Score | Differences |" in output
|
|
assert "`track_a`" in output
|
|
assert "`track_b`" in output
|
|
assert "*(baseline)*" in output
|
|
|
|
def test_comparative_format_shows_differences(self) -> None:
|
|
"""Should show which fields differ between tracks."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# track_b should show description as different
|
|
assert "`description`" in output
|
|
|
|
def test_comparative_format_with_details(self) -> None:
|
|
"""Should include collapsible details per track."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
assert "<details>" in output
|
|
assert "<summary>" in output
|
|
assert "track_a" in output
|
|
assert "track_b" in output
|
|
|
|
|
|
class TestComparativeTextFormatter:
|
|
"""Tests for comparative text formatting."""
|
|
|
|
def test_comparative_format_header(self) -> None:
|
|
"""Should have comparative header."""
|
|
formatter = TextFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "COMPARATIVE EVALUATION RESULTS" in output
|
|
assert "track_a vs track_b" in output
|
|
|
|
def test_comparative_format_ascii_table(self) -> None:
|
|
"""Should show ASCII comparison table."""
|
|
formatter = TextFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "COMPARISON" in output
|
|
assert "Track" in output
|
|
assert "Status" in output
|
|
assert "Score" in output
|
|
assert "track_a" in output
|
|
assert "track_b" in output
|
|
|
|
def test_comparative_format_with_details(self) -> None:
|
|
"""Should include per-track details."""
|
|
formatter = TextFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
assert "[track_a] Details:" in output
|
|
assert "[track_b] Details:" in output
|
|
|
|
|
|
class TestMcpServerComparison:
|
|
"""Tests for realistic MCP server comparison scenarios."""
|
|
|
|
def test_mcp_server_definitions_exist(self) -> None:
|
|
"""Should have 3 MCP server definitions with tools."""
|
|
assert MCP_SERVER_LINEAR_OFFICIAL["name"] == "linear_official"
|
|
assert MCP_SERVER_LINEAR_ARCADE["name"] == "linear_arcade"
|
|
assert MCP_SERVER_LINEAR_COMMUNITY["name"] == "linear_community"
|
|
|
|
# Each should have at least 3 tools
|
|
assert len(MCP_SERVER_LINEAR_OFFICIAL["tools"]) >= 3
|
|
assert len(MCP_SERVER_LINEAR_ARCADE["tools"]) >= 3
|
|
assert len(MCP_SERVER_LINEAR_COMMUNITY["tools"]) >= 3
|
|
|
|
def test_mcp_servers_have_different_tool_names(self) -> None:
|
|
"""Different MCP servers should have different tool naming conventions."""
|
|
official_tools = [t["name"] for t in MCP_SERVER_LINEAR_OFFICIAL["tools"]]
|
|
arcade_tools = [t["name"] for t in MCP_SERVER_LINEAR_ARCADE["tools"]]
|
|
community_tools = [t["name"] for t in MCP_SERVER_LINEAR_COMMUNITY["tools"]]
|
|
|
|
# Verify different naming conventions
|
|
assert "linear_create_issue" in official_tools # snake_case with prefix
|
|
assert "Linear_CreateIssue" in arcade_tools # PascalCase with prefix
|
|
assert "createLinearIssue" in community_tools # camelCase
|
|
|
|
def test_mcp_servers_have_different_param_names(self) -> None:
|
|
"""Different MCP servers may use different parameter names for same concept."""
|
|
# Get create issue tool from each server
|
|
official_create = next(
|
|
t for t in MCP_SERVER_LINEAR_OFFICIAL["tools"] if "create" in t["name"].lower()
|
|
)
|
|
arcade_create = next(
|
|
t for t in MCP_SERVER_LINEAR_ARCADE["tools"] if "create" in t["name"].lower()
|
|
)
|
|
community_create = next(
|
|
t for t in MCP_SERVER_LINEAR_COMMUNITY["tools"] if "create" in t["name"].lower()
|
|
)
|
|
|
|
# Official uses "title", Arcade uses "title", Community uses "name"
|
|
assert "title" in official_create["parameters"]
|
|
assert "title" in arcade_create["parameters"]
|
|
assert "name" in community_create["parameters"] # Different!
|
|
|
|
# Different priority representations
|
|
assert "priority" in official_create["parameters"] # integer
|
|
assert "priority_level" in arcade_create["parameters"] # string enum
|
|
assert "urgency" in community_create["parameters"] # different name entirely
|
|
|
|
def test_mcp_server_comparative_results_structure(self) -> None:
|
|
"""Should create proper comparative results from 3 MCP servers."""
|
|
from arcade_cli.formatters.base import group_comparative_by_case
|
|
|
|
results = make_mcp_server_comparative_results()
|
|
groups, passed, failed, warned, total, suite_track_order = group_comparative_by_case(
|
|
results
|
|
)
|
|
|
|
# Should have 3 tracks (one per MCP server)
|
|
assert "Linear MCP Comparison" in suite_track_order
|
|
tracks = suite_track_order["Linear MCP Comparison"]
|
|
assert len(tracks) == 3
|
|
assert "linear_official" in tracks
|
|
assert "linear_arcade" in tracks
|
|
assert "linear_community" in tracks
|
|
|
|
# Should have 6 total cases (2 cases x 3 servers)
|
|
assert total == 6
|
|
|
|
# Community server has failures (score-based, not all passed)
|
|
assert passed == 5
|
|
assert failed == 1
|
|
|
|
def test_mcp_server_markdown_shows_all_servers(self) -> None:
|
|
"""Markdown output should show all 3 MCP servers being compared."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mcp_server_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should show all three servers
|
|
assert "`linear_official`" in output
|
|
assert "`linear_arcade`" in output
|
|
assert "`linear_community`" in output
|
|
|
|
# Should show cases
|
|
assert "create_bug_issue" in output
|
|
assert "update_issue_status" in output
|
|
|
|
# Should show the failed community server
|
|
assert "❌" in output # Failed status for community
|
|
|
|
def test_mcp_server_comparison_shows_differences(self) -> None:
|
|
"""Should highlight differences between MCP server implementations."""
|
|
from arcade_cli.formatters.base import compute_track_differences, group_comparative_by_case
|
|
|
|
results = make_mcp_server_comparative_results()
|
|
groups, _, _, _, _, suite_track_order = group_comparative_by_case(results)
|
|
|
|
suite = "Linear MCP Comparison"
|
|
case_data = groups["gpt-4o"][suite]["create_bug_issue"]
|
|
track_order = suite_track_order[suite]
|
|
|
|
differences = compute_track_differences(case_data, track_order)
|
|
|
|
# linear_official is baseline (first), so not in differences
|
|
assert "linear_official" not in differences
|
|
|
|
# linear_arcade uses priority_level instead of priority
|
|
# (different field name = different from baseline)
|
|
assert "linear_arcade" in differences
|
|
|
|
# linear_community has name/urgency mismatches
|
|
assert "linear_community" in differences
|
|
assert "name" in differences["linear_community"]
|
|
|
|
def test_mcp_server_text_format_shows_servers(self) -> None:
|
|
"""Text output should properly display all MCP servers."""
|
|
formatter = TextFormatter()
|
|
results = make_mcp_server_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "linear_official" in output
|
|
assert "linear_arcade" in output
|
|
assert "linear_community" in output
|
|
assert "PASSED" in output
|
|
assert "FAILED" in output
|
|
|
|
def test_mcp_server_html_format_shows_servers(self) -> None:
|
|
"""HTML output should display all MCP servers with proper styling."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mcp_server_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "linear_official" in output
|
|
assert "linear_arcade" in output
|
|
assert "linear_community" in output
|
|
assert "track-badge" in output
|
|
assert "diff-field" in output # Should show differences
|
|
|
|
|
|
class TestComparativeHtmlFormatter:
|
|
"""Tests for comparative HTML formatting."""
|
|
|
|
def test_comparative_format_structure(self) -> None:
|
|
"""Should have comparative HTML structure."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "Comparative Evaluation Results" in output
|
|
assert "comparative-badge" in output
|
|
assert "COMPARATIVE" in output
|
|
|
|
def test_comparative_format_track_badges(self) -> None:
|
|
"""Should show track badges."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "track-badge" in output
|
|
assert "track_a" in output
|
|
assert "track_b" in output
|
|
|
|
def test_comparative_format_comparison_table(self) -> None:
|
|
"""Should have comparison table."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "comparison-table" in output
|
|
assert "baseline" in output
|
|
|
|
def test_comparative_format_diff_fields(self) -> None:
|
|
"""Should highlight different fields."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
assert "diff-field" in output
|
|
assert "description" in output
|
|
|
|
def test_comparative_format_with_details_has_tabs(self) -> None:
|
|
"""Should include tabbed details per track."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
assert "track-tabs" in output
|
|
assert "track-tab" in output
|
|
assert "track-panel" in output
|
|
|
|
def test_comparative_format_has_tab_script(self) -> None:
|
|
"""Should include JavaScript for tab switching."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
assert "<script>" in output
|
|
assert "track-tab" in output
|
|
|
|
|
|
# =============================================================================
|
|
# JSON FORMATTER TESTS
|
|
# =============================================================================
|
|
|
|
|
|
class TestJsonFormatter:
|
|
"""Tests for JsonFormatter."""
|
|
|
|
def test_file_extension(self) -> None:
|
|
"""Should return 'json' as file extension."""
|
|
formatter = JsonFormatter()
|
|
assert formatter.file_extension == "json"
|
|
|
|
def test_format_produces_valid_json(self) -> None:
|
|
"""Should produce valid JSON output."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should be valid JSON
|
|
parsed = json.loads(output)
|
|
assert isinstance(parsed, dict)
|
|
|
|
def test_format_includes_summary(self) -> None:
|
|
"""Should include summary statistics."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
assert "summary" in parsed
|
|
assert "total_cases" in parsed["summary"]
|
|
assert "passed" in parsed["summary"]
|
|
assert "failed" in parsed["summary"]
|
|
assert "pass_rate" in parsed["summary"]
|
|
|
|
def test_format_includes_model_results(self) -> None:
|
|
"""Should include model results structure."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results(model="gpt-4o")
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
assert "models" in parsed
|
|
assert "gpt-4o" in parsed["models"]
|
|
|
|
def test_format_includes_suite_name(self) -> None:
|
|
"""Should include suite name."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results(suite_name="MyTestSuite")
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
assert "MyTestSuite" in parsed["models"]["gpt-4o"]["suites"]
|
|
|
|
def test_format_includes_case_data(self) -> None:
|
|
"""Should include case data."""
|
|
formatter = JsonFormatter()
|
|
cases = [
|
|
{
|
|
"name": "my_test_case",
|
|
"input": "Test user input",
|
|
"evaluation": MockEvaluation(passed=True, score=0.95),
|
|
}
|
|
]
|
|
results = make_mock_results(cases=cases)
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
suite_data = parsed["models"]["gpt-4o"]["suites"]["test_eval_suite"]
|
|
assert "cases" in suite_data
|
|
|
|
def test_format_with_details_includes_critic_results(self) -> None:
|
|
"""Should include critic details when show_details=True."""
|
|
formatter = JsonFormatter()
|
|
cases = [
|
|
{
|
|
"name": "detailed_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "tool_selection",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "create_issue",
|
|
"actual": "create_issue",
|
|
}
|
|
],
|
|
),
|
|
}
|
|
]
|
|
results = make_mock_results(cases=cases)
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
parsed = json.loads(output)
|
|
# Find the case data
|
|
suite_data = parsed["models"]["gpt-4o"]["suites"]["test_eval_suite"]
|
|
# In comparative mode, cases are in a dict
|
|
if isinstance(suite_data.get("cases"), dict):
|
|
case_data = suite_data["cases"]["detailed_case"]["tracks"]["default"]
|
|
else:
|
|
case_data = suite_data["cases"][0]
|
|
|
|
assert "details" in case_data
|
|
assert case_data["details"][0]["field"] == "tool_selection"
|
|
|
|
def test_format_without_details_excludes_critic_results(self) -> None:
|
|
"""Should not include critic details when show_details=False."""
|
|
formatter = JsonFormatter()
|
|
cases = [
|
|
{
|
|
"name": "no_details_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "test",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "x",
|
|
"actual": "x",
|
|
}
|
|
],
|
|
),
|
|
}
|
|
]
|
|
results = make_mock_results(cases=cases)
|
|
output = formatter.format(results, show_details=False)
|
|
|
|
parsed = json.loads(output)
|
|
suite_data = parsed["models"]["gpt-4o"]["suites"]["test_eval_suite"]
|
|
if isinstance(suite_data.get("cases"), dict):
|
|
case_data = suite_data["cases"]["no_details_case"]["tracks"]["default"]
|
|
else:
|
|
case_data = suite_data["cases"][0]
|
|
|
|
assert "details" not in case_data
|
|
|
|
def test_format_includes_failure_reason(self) -> None:
|
|
"""Should include failure reason when present."""
|
|
formatter = JsonFormatter()
|
|
cases = [
|
|
{
|
|
"name": "failed_case",
|
|
"input": "Test",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=0.0,
|
|
failure_reason="Tool selection mismatch",
|
|
),
|
|
}
|
|
]
|
|
results = make_mock_results(cases=cases)
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
suite_data = parsed["models"]["gpt-4o"]["suites"]["test_eval_suite"]
|
|
if isinstance(suite_data.get("cases"), dict):
|
|
case_data = suite_data["cases"]["failed_case"]["tracks"]["default"]
|
|
else:
|
|
case_data = suite_data["cases"][0]
|
|
|
|
assert case_data["status"] == "failed"
|
|
assert case_data["failure_reason"] == "Tool selection mismatch"
|
|
|
|
def test_get_formatter_returns_json(self) -> None:
|
|
"""Should return JsonFormatter for 'json' format."""
|
|
formatter = get_formatter("json")
|
|
assert isinstance(formatter, JsonFormatter)
|
|
|
|
def test_registry_includes_json(self) -> None:
|
|
"""Should have 'json' in FORMATTERS registry."""
|
|
assert "json" in FORMATTERS
|
|
assert FORMATTERS["json"] == JsonFormatter
|
|
|
|
|
|
class TestComparativeJsonFormatter:
|
|
"""Tests for JSON formatter with comparative evaluation results."""
|
|
|
|
def test_comparative_format_includes_tracks(self) -> None:
|
|
"""Should include track information for comparative results."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
assert parsed["type"] == "comparative_evaluation"
|
|
assert "tracks" in parsed
|
|
|
|
def test_comparative_format_groups_by_case(self) -> None:
|
|
"""Should group results by case name across tracks."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
# Find the suite
|
|
suite_data = next(iter(parsed["models"].values()))["suites"]
|
|
suite = next(iter(suite_data.values()))
|
|
|
|
# Cases should be grouped
|
|
assert "cases" in suite
|
|
assert isinstance(suite["cases"], dict)
|
|
|
|
def test_comparative_format_includes_per_track_results(self) -> None:
|
|
"""Should include results for each track within a case."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
parsed = json.loads(output)
|
|
suite_data = next(iter(parsed["models"].values()))["suites"]
|
|
suite = next(iter(suite_data.values()))
|
|
case = next(iter(suite["cases"].values()))
|
|
|
|
assert "tracks" in case
|
|
# Should have track1 and track2
|
|
assert len(case["tracks"]) == 2
|
|
|
|
def test_comparative_format_with_details(self) -> None:
|
|
"""Should include detailed results per track when show_details=True."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
parsed = json.loads(output)
|
|
suite_data = next(iter(parsed["models"].values()))["suites"]
|
|
suite = next(iter(suite_data.values()))
|
|
case = next(iter(suite["cases"].values()))
|
|
|
|
# Each track should have details
|
|
for _track_name, track_data in case["tracks"].items():
|
|
assert "details" in track_data
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-MODEL EVALUATION FORMATTING TESTS
|
|
# =============================================================================
|
|
|
|
|
|
def make_multi_model_results() -> list[list[dict]]:
|
|
"""Create mock multi-model evaluation results."""
|
|
return [
|
|
# Model 1 results
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "TestSuite",
|
|
"rubric": "Test Rubric",
|
|
"cases": [
|
|
{
|
|
"name": "case_1",
|
|
"input": "Test input 1",
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
},
|
|
{
|
|
"name": "case_2",
|
|
"input": "Test input 2",
|
|
"evaluation": MockEvaluation(passed=False, score=0.5),
|
|
},
|
|
],
|
|
}
|
|
],
|
|
# Model 2 results
|
|
[
|
|
{
|
|
"model": "gpt-4-turbo",
|
|
"suite_name": "TestSuite",
|
|
"rubric": "Test Rubric",
|
|
"cases": [
|
|
{
|
|
"name": "case_1",
|
|
"input": "Test input 1",
|
|
"evaluation": MockEvaluation(passed=True, score=0.95),
|
|
},
|
|
{
|
|
"name": "case_2",
|
|
"input": "Test input 2",
|
|
"evaluation": MockEvaluation(passed=True, score=0.85),
|
|
},
|
|
],
|
|
}
|
|
],
|
|
]
|
|
|
|
|
|
class TestMultiModelEvalHelpers:
|
|
"""Tests for multi-model eval helper functions."""
|
|
|
|
def test_is_multi_model_eval_true(self) -> None:
|
|
"""Should detect multiple models."""
|
|
from arcade_cli.formatters.base import is_multi_model_eval
|
|
|
|
results = make_multi_model_results()
|
|
assert is_multi_model_eval(results) is True
|
|
|
|
def test_is_multi_model_eval_false(self) -> None:
|
|
"""Should detect single model."""
|
|
from arcade_cli.formatters.base import is_multi_model_eval
|
|
|
|
results = make_mock_results(model="gpt-4o")
|
|
assert is_multi_model_eval(results) is False
|
|
|
|
def test_group_eval_for_comparison(self) -> None:
|
|
"""Should group results by suite, case, and model."""
|
|
from arcade_cli.formatters.base import group_eval_for_comparison
|
|
|
|
results = make_multi_model_results()
|
|
comparison_data, model_order, per_model_stats = group_eval_for_comparison(results)
|
|
|
|
# Check model order
|
|
assert model_order == ["gpt-4o", "gpt-4-turbo"]
|
|
|
|
# Check per-model stats
|
|
assert per_model_stats["gpt-4o"]["passed"] == 1
|
|
assert per_model_stats["gpt-4o"]["failed"] == 1
|
|
assert per_model_stats["gpt-4-turbo"]["passed"] == 2
|
|
assert per_model_stats["gpt-4-turbo"]["failed"] == 0
|
|
|
|
# Check comparison data structure
|
|
assert "TestSuite" in comparison_data
|
|
assert "case_1" in comparison_data["TestSuite"]
|
|
assert "case_2" in comparison_data["TestSuite"]
|
|
|
|
# Check both models present for each case
|
|
assert "gpt-4o" in comparison_data["TestSuite"]["case_1"]
|
|
assert "gpt-4-turbo" in comparison_data["TestSuite"]["case_1"]
|
|
|
|
def test_find_best_model(self) -> None:
|
|
"""Should find model with highest score."""
|
|
from arcade_cli.formatters.base import find_best_model
|
|
|
|
case_models = {
|
|
"model-a": {"evaluation": MockEvaluation(score=0.8)},
|
|
"model-b": {"evaluation": MockEvaluation(score=0.95)},
|
|
"model-c": {"evaluation": MockEvaluation(score=0.7)},
|
|
}
|
|
|
|
best, score = find_best_model(case_models)
|
|
assert best == "model-b"
|
|
assert score == 0.95
|
|
|
|
def test_find_best_model_tie(self) -> None:
|
|
"""Should return 'Tie' when multiple models have same score."""
|
|
from arcade_cli.formatters.base import find_best_model
|
|
|
|
case_models = {
|
|
"model-a": {"evaluation": MockEvaluation(score=0.9)},
|
|
"model-b": {"evaluation": MockEvaluation(score=0.9)},
|
|
}
|
|
|
|
best, score = find_best_model(case_models)
|
|
assert best == "Tie"
|
|
assert score == 0.9
|
|
|
|
|
|
class TestMultiModelMarkdownFormatter:
|
|
"""Tests for multi-model markdown formatting."""
|
|
|
|
def test_multi_model_comparison_table(self) -> None:
|
|
"""Should show comparison table when multiple models."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should have multi-model header
|
|
assert "Multi-Model Evaluation Results" in output
|
|
|
|
# Should list models
|
|
assert "gpt-4o" in output
|
|
assert "gpt-4-turbo" in output
|
|
|
|
# Should have per-model summary table
|
|
assert "| Model |" in output
|
|
assert "Pass Rate" in output
|
|
|
|
# Should have cross-model comparison section
|
|
assert "Cross-Model Comparison" in output
|
|
|
|
# Should show best model
|
|
assert "Best Overall" in output
|
|
|
|
def test_multi_model_shows_best_per_case(self) -> None:
|
|
"""Should show best model per case in comparison table."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
# The comparison table should have a Best column
|
|
assert "| Best |" in output
|
|
|
|
def test_single_model_uses_regular_format(self) -> None:
|
|
"""Should use regular format for single model."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results(model="gpt-4o")
|
|
output = formatter.format(results)
|
|
|
|
# Should NOT have multi-model header
|
|
assert "Multi-Model Evaluation Results" not in output
|
|
# Should have regular header
|
|
assert "# Evaluation Results" in output
|
|
|
|
|
|
class TestMultiModelTextFormatter:
|
|
"""Tests for multi-model text formatting."""
|
|
|
|
def test_multi_model_text_output(self) -> None:
|
|
"""Should show comparison table when multiple models."""
|
|
formatter = TextFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should have multi-model header
|
|
assert "MULTI-MODEL EVALUATION RESULTS" in output
|
|
|
|
# Should list models
|
|
assert "gpt-4o" in output
|
|
assert "gpt-4-turbo" in output
|
|
|
|
# Should have per-model summary section
|
|
assert "PER-MODEL SUMMARY" in output
|
|
assert "Pass Rate" in output
|
|
|
|
# Should have cross-model comparison section
|
|
assert "CROSS-MODEL COMPARISON" in output
|
|
|
|
# Should show best model
|
|
assert "Best Overall" in output
|
|
|
|
def test_single_model_uses_regular_text_format(self) -> None:
|
|
"""Should use regular format for single model."""
|
|
formatter = TextFormatter()
|
|
results = make_mock_results(model="gpt-4o")
|
|
output = formatter.format(results)
|
|
|
|
# Should NOT have multi-model header
|
|
assert "MULTI-MODEL EVALUATION RESULTS" not in output
|
|
# Should have case results
|
|
assert "test_case_1" in output
|
|
|
|
|
|
class TestMultiModelHtmlFormatter:
|
|
"""Tests for multi-model HTML formatting."""
|
|
|
|
def test_multi_model_html_output(self) -> None:
|
|
"""Should show comparison table when multiple models."""
|
|
formatter = HtmlFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should have multi-model title
|
|
assert "Multi-Model Evaluation Results" in output
|
|
|
|
# Should have per-model summary
|
|
assert "Per-Model Summary" in output
|
|
|
|
# Should list models in table
|
|
assert "gpt-4o" in output
|
|
assert "gpt-4-turbo" in output
|
|
|
|
# Should have cross-model comparison
|
|
assert "Cross-Model Comparison" in output
|
|
|
|
# Should show best overall
|
|
assert "Best Overall" in output
|
|
|
|
def test_multi_model_html_has_styles(self) -> None:
|
|
"""Should include multi-model specific styles."""
|
|
formatter = HtmlFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should have comparison table styles
|
|
assert ".comparison-table" in output
|
|
assert ".best-model" in output
|
|
|
|
def test_single_model_uses_regular_html_format(self) -> None:
|
|
"""Should use regular format for single model."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results(model="gpt-4o")
|
|
output = formatter.format(results)
|
|
|
|
# Should NOT have multi-model title
|
|
assert "Multi-Model Evaluation Results" not in output
|
|
# Should have regular title
|
|
assert "Evaluation Results" in output
|
|
|
|
|
|
class TestMultiModelJsonFormatter:
|
|
"""Tests for multi-model JSON formatting."""
|
|
|
|
def test_multi_model_json_output(self) -> None:
|
|
"""Should produce structured multi-model JSON."""
|
|
formatter = JsonFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
data = json.loads(output)
|
|
|
|
# Should have multi-model type
|
|
assert data["type"] == "multi_model_evaluation"
|
|
|
|
# Should have models list
|
|
assert "models" in data
|
|
assert "gpt-4o" in data["models"]
|
|
assert "gpt-4-turbo" in data["models"]
|
|
|
|
# Should have per-model stats
|
|
assert "per_model_stats" in data
|
|
assert "gpt-4o" in data["per_model_stats"]
|
|
assert "gpt-4-turbo" in data["per_model_stats"]
|
|
|
|
# Should have comparison structure
|
|
assert "comparison" in data
|
|
assert "TestSuite" in data["comparison"]
|
|
|
|
def test_multi_model_json_has_best_model_per_case(self) -> None:
|
|
"""Should include best model per case."""
|
|
formatter = JsonFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results)
|
|
|
|
data = json.loads(output)
|
|
|
|
# Each case in comparison should have best_model
|
|
for _suite_name, cases in data["comparison"].items():
|
|
for _case_name, case_data in cases.items():
|
|
assert "best_model" in case_data
|
|
assert "best_score" in case_data
|
|
|
|
def test_single_model_uses_regular_json_format(self) -> None:
|
|
"""Should use regular format for single model."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results(model="gpt-4o")
|
|
output = formatter.format(results)
|
|
|
|
data = json.loads(output)
|
|
|
|
# Should NOT have multi-model type
|
|
assert data["type"] == "evaluation"
|
|
# Should not have comparison structure
|
|
assert "comparison" not in data
|
|
|
|
|
|
# =============================================================================
|
|
# MULTI-MODEL COMPARATIVE TESTS (CASE-FIRST GROUPING)
|
|
# =============================================================================
|
|
|
|
|
|
def make_multi_model_comparative_results() -> list[list[dict]]:
|
|
"""Create mock results for multi-model comparative evaluation."""
|
|
results: list[list[dict]] = []
|
|
|
|
for model in ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"]:
|
|
for track in ["track_a", "track_b"]:
|
|
results.append([
|
|
{
|
|
"model": model,
|
|
"suite_name": f"TestSuite [{track}]",
|
|
"track_name": track,
|
|
"is_comparative": True,
|
|
"cases": [
|
|
{
|
|
"name": "case_1",
|
|
"input": "Test input for case 1",
|
|
"evaluation": MockEvaluation(passed=True, score=0.9),
|
|
},
|
|
{
|
|
"name": "case_2",
|
|
"input": "Test input for case 2",
|
|
"evaluation": MockEvaluation(passed=False, score=0.3),
|
|
},
|
|
],
|
|
}
|
|
])
|
|
|
|
return results
|
|
|
|
|
|
class TestMultiModelComparativeCaseFirstGrouping:
|
|
"""Tests for multi-model comparative evaluation with case-first grouping."""
|
|
|
|
def test_is_multi_model_comparative_detection(self) -> None:
|
|
"""Should correctly detect multi-model comparative evaluation."""
|
|
from arcade_cli.formatters.base import is_multi_model_comparative
|
|
|
|
# Multi-model comparative
|
|
results = make_multi_model_comparative_results()
|
|
assert is_multi_model_comparative(results) is True
|
|
|
|
# Single-model comparative
|
|
single_model_results = [r for r in results if r[0].get("model") == "gpt-4o"]
|
|
assert is_multi_model_comparative(single_model_results) is False
|
|
|
|
def test_group_comparative_by_case_first(self) -> None:
|
|
"""Should group results by case first, then model."""
|
|
from arcade_cli.formatters.base import group_comparative_by_case_first
|
|
|
|
results = make_multi_model_comparative_results()
|
|
case_groups, model_order, _, passed, failed, _, total = group_comparative_by_case_first(
|
|
results
|
|
)
|
|
|
|
# Check model order
|
|
assert len(model_order) == 3
|
|
assert "gpt-4o" in model_order
|
|
assert "gpt-4o-mini" in model_order
|
|
assert "gpt-4-turbo" in model_order
|
|
|
|
# Check case grouping structure: suite -> case -> model
|
|
assert "TestSuite" in case_groups
|
|
assert "case_1" in case_groups["TestSuite"]
|
|
assert "case_2" in case_groups["TestSuite"]
|
|
|
|
# Each case should have results for all models
|
|
for case_name in ["case_1", "case_2"]:
|
|
for model in model_order:
|
|
assert model in case_groups["TestSuite"][case_name]
|
|
|
|
def test_markdown_case_first_grouping(self) -> None:
|
|
"""Markdown formatter should use case-first grouping for multi-model comparative."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_multi_model_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should use case-first format
|
|
assert "Multi-Model" in output
|
|
assert "Results by Case" in output
|
|
|
|
# Cases should appear before models in the hierarchy
|
|
case_1_pos = output.find("Case: case_1")
|
|
case_2_pos = output.find("Case: case_2")
|
|
|
|
# After each case header, models should be listed
|
|
assert case_1_pos > 0
|
|
assert case_2_pos > 0
|
|
|
|
# Check that each case section contains all models
|
|
assert output.count("gpt-4o") > 0
|
|
assert output.count("gpt-4o-mini") > 0
|
|
assert output.count("gpt-4-turbo") > 0
|
|
|
|
def test_text_case_first_grouping(self) -> None:
|
|
"""Text formatter should use case-first grouping for multi-model comparative."""
|
|
formatter = TextFormatter()
|
|
results = make_multi_model_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should use case-first format
|
|
assert "MULTI-MODEL" in output
|
|
assert "CASE:" in output
|
|
|
|
# Check that models appear grouped under cases
|
|
assert "[gpt-4o]" in output
|
|
assert "[gpt-4o-mini]" in output
|
|
assert "[gpt-4-turbo]" in output
|
|
|
|
def test_html_case_first_grouping(self) -> None:
|
|
"""HTML formatter should use case-first grouping for multi-model comparative."""
|
|
formatter = HtmlFormatter()
|
|
results = make_multi_model_comparative_results()
|
|
output = formatter.format(results)
|
|
|
|
# Should use case-first format
|
|
assert "Multi-Model" in output
|
|
assert "Results by Case" in output
|
|
assert "case-group" in output # CSS class for case grouping
|
|
assert "model-panel" in output # CSS class for model panels
|
|
|
|
# Check structure
|
|
assert "case_1" in output
|
|
assert "case_2" in output
|
|
|
|
def test_json_case_first_grouping(self) -> None:
|
|
"""JSON formatter should use case-first structure for multi-model comparative."""
|
|
formatter = JsonFormatter()
|
|
results = make_multi_model_comparative_results()
|
|
output = formatter.format(results)
|
|
data = json.loads(output)
|
|
|
|
# Should have multi-model comparative type
|
|
assert data["type"] == "multi_model_comparative_evaluation"
|
|
|
|
# Should have model order
|
|
assert "models" in data
|
|
assert len(data["models"]) == 3
|
|
|
|
# Should have case-first structure
|
|
assert "grouped_by_case" in data
|
|
assert "TestSuite" in data["grouped_by_case"]
|
|
|
|
# Check structure: suite -> cases -> case -> models
|
|
suite_data = data["grouped_by_case"]["TestSuite"]
|
|
assert "case_1" in suite_data["cases"]
|
|
assert "case_2" in suite_data["cases"]
|
|
|
|
# Each case should have all models
|
|
case_1 = suite_data["cases"]["case_1"]
|
|
assert "models" in case_1
|
|
for model in data["models"]:
|
|
assert model in case_1["models"]
|
|
|
|
def test_single_model_comparative_uses_model_first(self) -> None:
|
|
"""Single-model comparative should still use model-first grouping."""
|
|
from arcade_cli.formatters.base import is_multi_model_comparative
|
|
|
|
# Create single-model comparative
|
|
results = [
|
|
[r[0]] for r in make_multi_model_comparative_results() if r[0].get("model") == "gpt-4o"
|
|
]
|
|
|
|
# Should not be detected as multi-model
|
|
assert is_multi_model_comparative(results) is False
|
|
|
|
# Markdown should not have case-first format
|
|
md_formatter = MarkdownFormatter()
|
|
md_output = md_formatter.format(results)
|
|
assert "Results by Model" in md_output
|
|
assert "Results by Case" not in md_output
|
|
|
|
|
|
# =============================================================================
|
|
# CONTEXT FUNCTIONALITY TESTS
|
|
# =============================================================================
|
|
|
|
|
|
def make_results_with_context(
|
|
model: str = "gpt-4o",
|
|
suite_name: str = "test_suite",
|
|
) -> list[list[dict]]:
|
|
"""Create mock results with system_message and additional_messages."""
|
|
return [
|
|
[
|
|
{
|
|
"model": model,
|
|
"suite_name": suite_name,
|
|
"rubric": "Test Rubric",
|
|
"cases": [
|
|
{
|
|
"name": "case_with_context",
|
|
"input": "What is the weather?",
|
|
"system_message": "You are a helpful weather assistant.",
|
|
"additional_messages": [
|
|
{"role": "user", "content": "Hello!"},
|
|
{"role": "assistant", "content": "Hi! How can I help?"},
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [
|
|
{
|
|
"function": {
|
|
"name": "get_weather",
|
|
"arguments": '{"city": "NYC"}',
|
|
}
|
|
}
|
|
],
|
|
},
|
|
{
|
|
"role": "tool",
|
|
"name": "get_weather",
|
|
"content": '{"temp": 72, "conditions": "sunny"}',
|
|
},
|
|
],
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
},
|
|
{
|
|
"name": "case_without_context",
|
|
"input": "Simple test",
|
|
"evaluation": MockEvaluation(passed=True, score=0.9),
|
|
},
|
|
],
|
|
}
|
|
]
|
|
]
|
|
|
|
|
|
def make_comparative_results_with_context() -> list[list[dict]]:
|
|
"""Create mock comparative results with context."""
|
|
return [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "weather_suite [track_a]",
|
|
"track_name": "track_a",
|
|
"rubric": "Test",
|
|
"cases": [
|
|
{
|
|
"name": "weather_test",
|
|
"input": "Get weather for NYC",
|
|
"system_message": "You are a weather bot.",
|
|
"additional_messages": [
|
|
{"role": "user", "content": "I need weather info"},
|
|
{
|
|
"role": "tool",
|
|
"name": "weather_api",
|
|
"content": '{"temp": 72, "city": "NYC"}',
|
|
},
|
|
],
|
|
"evaluation": MockEvaluation(passed=True, score=1.0),
|
|
}
|
|
],
|
|
},
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "weather_suite [track_b]",
|
|
"track_name": "track_b",
|
|
"rubric": "Test",
|
|
"cases": [
|
|
{
|
|
"name": "weather_test",
|
|
"input": "Get weather for NYC",
|
|
"system_message": "You are a weather bot.",
|
|
"additional_messages": [
|
|
{"role": "user", "content": "I need weather info"},
|
|
],
|
|
"evaluation": MockEvaluation(passed=False, score=0.7),
|
|
}
|
|
],
|
|
},
|
|
]
|
|
]
|
|
|
|
|
|
class TestIncludeContextJson:
|
|
"""Tests for JSON formatter include_context functionality."""
|
|
|
|
def test_json_include_context_true_shows_context(self) -> None:
|
|
"""JSON output should include context fields when include_context=True."""
|
|
formatter = JsonFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, include_context=True)
|
|
data = json.loads(output)
|
|
|
|
cases = data["models"]["gpt-4o"]["suites"]["test_suite"]["cases"]
|
|
case_with_ctx = next(c for c in cases if c["name"] == "case_with_context")
|
|
|
|
assert "system_message" in case_with_ctx
|
|
assert case_with_ctx["system_message"] == "You are a helpful weather assistant."
|
|
assert "additional_messages" in case_with_ctx
|
|
assert len(case_with_ctx["additional_messages"]) == 4
|
|
|
|
def test_json_include_context_false_excludes_context(self) -> None:
|
|
"""JSON output should NOT include context fields when include_context=False."""
|
|
formatter = JsonFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, include_context=False)
|
|
data = json.loads(output)
|
|
|
|
cases = data["models"]["gpt-4o"]["suites"]["test_suite"]["cases"]
|
|
case_with_ctx = next(c for c in cases if c["name"] == "case_with_context")
|
|
|
|
assert "system_message" not in case_with_ctx
|
|
assert "additional_messages" not in case_with_ctx
|
|
|
|
def test_json_comparative_with_context(self) -> None:
|
|
"""JSON comparative output should include context when requested."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results_with_context()
|
|
|
|
output = formatter.format(results, include_context=True)
|
|
data = json.loads(output)
|
|
|
|
# Comparative results have different structure
|
|
assert "models" in data
|
|
cases = data["models"]["gpt-4o"]["suites"]["weather_suite"]["cases"]
|
|
|
|
assert "weather_test" in cases
|
|
case_data = cases["weather_test"]
|
|
assert "system_message" in case_data
|
|
assert case_data["system_message"] == "You are a weather bot."
|
|
|
|
|
|
class TestIncludeContextHtml:
|
|
"""Tests for HTML formatter include_context functionality."""
|
|
|
|
def test_html_include_context_shows_context_section(self) -> None:
|
|
"""HTML output should include context section when include_context=True."""
|
|
formatter = HtmlFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "context-section" in output or "📋 Context" in output
|
|
assert "You are a helpful weather assistant" in output
|
|
|
|
def test_html_include_context_false_no_context(self) -> None:
|
|
"""HTML output should NOT include context when include_context=False."""
|
|
formatter = HtmlFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=False)
|
|
|
|
# System message should not appear
|
|
assert "You are a helpful weather assistant" not in output
|
|
|
|
def test_html_formats_tool_response_json(self) -> None:
|
|
"""HTML should pretty-print JSON in tool responses."""
|
|
formatter = HtmlFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
# Tool response JSON should be formatted (indented)
|
|
# The raw JSON would be on one line, formatted has newlines
|
|
assert "tool-response" in output or '"temp"' in output
|
|
|
|
|
|
class TestIncludeContextMarkdown:
|
|
"""Tests for Markdown formatter include_context functionality."""
|
|
|
|
def test_markdown_include_context_shows_context(self) -> None:
|
|
"""Markdown output should include context when include_context=True."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "Context" in output or "System Message" in output
|
|
assert "You are a helpful weather assistant" in output
|
|
|
|
def test_markdown_include_context_false_no_context(self) -> None:
|
|
"""Markdown should NOT include context when include_context=False."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=False)
|
|
|
|
assert "You are a helpful weather assistant" not in output
|
|
|
|
|
|
class TestIncludeContextText:
|
|
"""Tests for Text formatter include_context functionality."""
|
|
|
|
def test_text_include_context_shows_context(self) -> None:
|
|
"""Text output should include context when include_context=True."""
|
|
formatter = TextFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "Context:" in output or "CONTEXT" in output
|
|
assert "You are a helpful weather assistant" in output
|
|
|
|
def test_text_include_context_false_no_context(self) -> None:
|
|
"""Text should NOT include context when include_context=False."""
|
|
formatter = TextFormatter()
|
|
results = make_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=False)
|
|
|
|
assert "You are a helpful weather assistant" not in output
|
|
|
|
|
|
class TestGroupingPreservesContext:
|
|
"""Tests that grouping functions preserve context data."""
|
|
|
|
def test_group_comparative_by_case_preserves_context(self) -> None:
|
|
"""group_comparative_by_case should preserve system_message and additional_messages."""
|
|
from arcade_cli.formatters.base import group_comparative_by_case
|
|
|
|
results = make_comparative_results_with_context()
|
|
groups, *_ = group_comparative_by_case(results)
|
|
|
|
# Navigate to the case data
|
|
case_data = groups["gpt-4o"]["weather_suite"]["weather_test"]
|
|
|
|
assert "system_message" in case_data
|
|
assert case_data["system_message"] == "You are a weather bot."
|
|
assert "additional_messages" in case_data
|
|
assert len(case_data["additional_messages"]) == 2
|
|
|
|
def test_group_comparative_by_case_first_preserves_context(self) -> None:
|
|
"""group_comparative_by_case_first should preserve context."""
|
|
from arcade_cli.formatters.base import group_comparative_by_case_first
|
|
|
|
# Create multi-model comparative results
|
|
results = make_comparative_results_with_context()
|
|
# Add another model
|
|
results[0].append({
|
|
"model": "gpt-4o-mini",
|
|
"suite_name": "weather_suite [track_a]",
|
|
"track_name": "track_a",
|
|
"rubric": "Test",
|
|
"cases": [
|
|
{
|
|
"name": "weather_test",
|
|
"input": "Get weather for NYC",
|
|
"system_message": "You are a weather bot.",
|
|
"additional_messages": [{"role": "user", "content": "Test"}],
|
|
"evaluation": MockEvaluation(passed=True, score=0.95),
|
|
}
|
|
],
|
|
})
|
|
|
|
groups, model_order, *_ = group_comparative_by_case_first(results)
|
|
|
|
# Navigate to the case data for first model
|
|
model_data = groups["weather_suite"]["weather_test"]["gpt-4o"]
|
|
|
|
assert "system_message" in model_data
|
|
assert model_data["system_message"] == "You are a weather bot."
|
|
assert "additional_messages" in model_data
|
|
|
|
|
|
class TestConversationFormatting:
|
|
"""Tests for conversation/tool response formatting."""
|
|
|
|
def test_html_conversation_formats_tool_calls(self) -> None:
|
|
"""HTML _format_conversation should format tool calls with arguments."""
|
|
formatter = HtmlFormatter()
|
|
|
|
messages = [
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [
|
|
{
|
|
"function": {
|
|
"name": "search",
|
|
"arguments": '{"query": "test", "limit": 10}',
|
|
}
|
|
}
|
|
],
|
|
}
|
|
]
|
|
|
|
html = formatter._format_conversation(messages)
|
|
|
|
assert "search" in html
|
|
assert "query" in html
|
|
assert "tool-call" in html.lower() or "tool_call" in html.lower()
|
|
|
|
def test_html_conversation_formats_tool_response_json(self) -> None:
|
|
"""HTML should format tool response JSON nicely."""
|
|
formatter = HtmlFormatter()
|
|
|
|
messages = [
|
|
{
|
|
"role": "tool",
|
|
"name": "get_data",
|
|
"content": '{"results": [1, 2, 3], "count": 3}',
|
|
}
|
|
]
|
|
|
|
html = formatter._format_conversation(messages)
|
|
|
|
# Should have tool-response class for styling
|
|
assert "tool-response" in html
|
|
# Should show the tool name
|
|
assert "get_data" in html
|
|
|
|
def test_html_conversation_handles_invalid_json(self) -> None:
|
|
"""HTML should gracefully handle non-JSON tool responses."""
|
|
formatter = HtmlFormatter()
|
|
|
|
messages = [
|
|
{
|
|
"role": "tool",
|
|
"name": "raw_tool",
|
|
"content": "This is not JSON, just plain text response",
|
|
}
|
|
]
|
|
|
|
html = formatter._format_conversation(messages)
|
|
|
|
# Should still render the content
|
|
assert "plain text response" in html
|
|
# Should show as regular content, not JSON
|
|
assert "msg-content" in html
|
|
|
|
|
|
class TestComparativeWithContext:
|
|
"""Tests for comparative formatters with context enabled."""
|
|
|
|
def test_html_comparative_with_context(self) -> None:
|
|
"""HTML comparative format should include context."""
|
|
formatter = HtmlFormatter()
|
|
results = make_comparative_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "You are a weather bot" in output
|
|
|
|
def test_markdown_comparative_with_context(self) -> None:
|
|
"""Markdown comparative format should include context."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "You are a weather bot" in output
|
|
|
|
def test_text_comparative_with_context(self) -> None:
|
|
"""Text comparative format should include context."""
|
|
formatter = TextFormatter()
|
|
results = make_comparative_results_with_context()
|
|
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "You are a weather bot" in output
|
|
|
|
|
|
class TestHtmlSafeId:
|
|
"""Tests for HTML formatter's _make_safe_id method."""
|
|
|
|
def test_make_safe_id_basic(self) -> None:
|
|
"""Test basic ID generation."""
|
|
formatter = HtmlFormatter()
|
|
case_id = formatter._make_safe_id("My Suite", "Test Case", "gpt-4o")
|
|
|
|
# Should be a valid HTML ID (alphanumeric + hyphens/underscores)
|
|
assert isinstance(case_id, str)
|
|
assert len(case_id) > 0
|
|
# Should not contain problematic characters
|
|
assert '"' not in case_id
|
|
assert "'" not in case_id
|
|
assert " " not in case_id
|
|
|
|
def test_make_safe_id_with_special_chars(self) -> None:
|
|
"""Test ID generation with special characters that could break HTML."""
|
|
formatter = HtmlFormatter()
|
|
|
|
# Test with double quotes
|
|
case_id = formatter._make_safe_id('Suite "quoted"', 'Case "test"', 'model "name"')
|
|
assert '"' not in case_id
|
|
|
|
# Test with single quotes
|
|
case_id2 = formatter._make_safe_id("Suite's name", "Case's test", "model's")
|
|
assert "'" not in case_id2
|
|
|
|
# Test with brackets
|
|
case_id3 = formatter._make_safe_id("Suite [track]", "Case [test]", "model")
|
|
assert "[" not in case_id3
|
|
assert "]" not in case_id3
|
|
|
|
def test_make_safe_id_stable(self) -> None:
|
|
"""Test that same inputs produce same ID (for caching/consistency)."""
|
|
formatter = HtmlFormatter()
|
|
|
|
id1 = formatter._make_safe_id("Suite", "Case", "Model")
|
|
id2 = formatter._make_safe_id("Suite", "Case", "Model")
|
|
|
|
assert id1 == id2
|
|
|
|
def test_make_safe_id_unique(self) -> None:
|
|
"""Test that different inputs produce different IDs."""
|
|
formatter = HtmlFormatter()
|
|
|
|
id1 = formatter._make_safe_id("Suite1", "Case", "Model")
|
|
id2 = formatter._make_safe_id("Suite2", "Case", "Model")
|
|
id3 = formatter._make_safe_id("Suite1", "Case2", "Model")
|
|
id4 = formatter._make_safe_id("Suite1", "Case", "Model2")
|
|
|
|
# All should be different
|
|
ids = {id1, id2, id3, id4}
|
|
assert len(ids) == 4
|
|
|
|
def test_json_comparative_preserves_tool_response(self) -> None:
|
|
"""JSON comparative should preserve tool response content."""
|
|
formatter = JsonFormatter()
|
|
results = make_comparative_results_with_context()
|
|
|
|
output = formatter.format(results, include_context=True)
|
|
data = json.loads(output)
|
|
|
|
cases = data["models"]["gpt-4o"]["suites"]["weather_suite"]["cases"]
|
|
case_data = cases["weather_test"]
|
|
|
|
assert "additional_messages" in case_data
|
|
tool_msg = next(m for m in case_data["additional_messages"] if m.get("role") == "tool")
|
|
assert "temp" in tool_msg["content"]
|
|
|
|
|
|
# =====================================================================
|
|
# Multi-run stats formatter tests
|
|
# =====================================================================
|
|
|
|
|
|
def _make_multi_run_case(
|
|
name: str = "multi_run_case",
|
|
score: float = 0.75,
|
|
passed: bool = True,
|
|
num_runs: int = 3,
|
|
) -> dict:
|
|
"""Create a case dict with multi-run stats populated."""
|
|
return {
|
|
"name": name,
|
|
"input": "multi run test input",
|
|
"evaluation": MockEvaluation(
|
|
passed=passed,
|
|
score=score,
|
|
results=[
|
|
{
|
|
"field": "arg_a",
|
|
"match": True,
|
|
"score": score,
|
|
"weight": 1.0,
|
|
"expected": "foo",
|
|
"actual": "foo",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
"run_stats": {
|
|
"num_runs": num_runs,
|
|
"scores": [0.8, 0.7, 0.75],
|
|
"mean_score": 0.75,
|
|
"std_deviation": 0.041,
|
|
"passed": [True, True, True],
|
|
"warned": [False, False, False],
|
|
"seed_policy": "constant",
|
|
"run_seeds": [42, 42, 42],
|
|
"pass_rule": "last",
|
|
"runs": [
|
|
{
|
|
"score": 0.8,
|
|
"passed": True,
|
|
"warning": False,
|
|
"failure_reason": None,
|
|
"details": [],
|
|
},
|
|
{
|
|
"score": 0.7,
|
|
"passed": True,
|
|
"warning": False,
|
|
"failure_reason": None,
|
|
"details": [],
|
|
},
|
|
{
|
|
"score": 0.75,
|
|
"passed": True,
|
|
"warning": False,
|
|
"failure_reason": None,
|
|
"details": [],
|
|
},
|
|
],
|
|
},
|
|
"critic_stats": {
|
|
"arg_a": {
|
|
"run_scores": [0.8, 0.7, 0.75],
|
|
"mean_score": 0.75,
|
|
"std_deviation": 0.041,
|
|
"run_scores_normalized": [0.8, 0.7, 0.75],
|
|
"mean_score_normalized": 0.75,
|
|
"std_deviation_normalized": 0.041,
|
|
"weight": 1.0,
|
|
},
|
|
},
|
|
"expected_tool_calls": [],
|
|
"predicted_tool_calls": [],
|
|
}
|
|
|
|
|
|
def _make_multi_run_results() -> list[list[dict]]:
|
|
"""Create evaluation results with multi-run stats for a single model."""
|
|
return [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "MultiRunSuite",
|
|
"rubric": MockEvaluation(),
|
|
"cases": [_make_multi_run_case()],
|
|
}
|
|
]
|
|
]
|
|
|
|
|
|
class TestMarkdownMultiRunStats:
|
|
"""Tests for multi-run stats in the Markdown formatter."""
|
|
|
|
def test_run_stats_summary_in_output(self) -> None:
|
|
"""Markdown should include Run Stats summary for multi-run cases."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "**Run Stats:**" in output
|
|
assert "Runs: 3" in output
|
|
assert "Mean Score:" in output
|
|
assert "Std Deviation:" in output
|
|
assert "Seed Policy: constant" in output
|
|
assert "Pass Rule: last" in output
|
|
|
|
def test_runs_column_in_summary_table(self) -> None:
|
|
"""Summary table should include a Runs column for multi-run cases."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=False)
|
|
assert "| Runs |" in output
|
|
assert "±" in output # Score should show ± std dev
|
|
|
|
def test_no_run_stats_for_single_run(self) -> None:
|
|
"""Single-run cases should not show Run Stats or Runs column."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_mock_results() # standard single-run mock
|
|
output = formatter.format(results, show_details=True)
|
|
assert "**Run Stats:**" not in output
|
|
assert "| Runs |" not in output
|
|
|
|
def test_critic_stats_in_output(self) -> None:
|
|
"""Markdown should include Critic Stats table for multi-run cases."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Critic Stats" in output
|
|
assert "arg_a" in output
|
|
|
|
def test_format_run_stats_summary_returns_empty_for_single_run(self) -> None:
|
|
"""_format_run_stats_summary returns [] when num_runs < 2."""
|
|
formatter = MarkdownFormatter()
|
|
assert formatter._format_run_stats_summary(None) == []
|
|
assert formatter._format_run_stats_summary({}) == []
|
|
assert formatter._format_run_stats_summary({"num_runs": 1}) == []
|
|
|
|
|
|
class TestTextMultiRunStats:
|
|
"""Tests for multi-run stats in the Text formatter."""
|
|
|
|
def test_stats_suffix_in_score_line(self) -> None:
|
|
"""Text score line should include (n=X, sd=Y%) for multi-run."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "(n=3" in output
|
|
assert "sd=" in output
|
|
|
|
def test_no_stats_suffix_for_single_run(self) -> None:
|
|
"""Single-run text output should not include multi-run stats suffix."""
|
|
formatter = TextFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "(n=" not in output
|
|
assert "sd=" not in output
|
|
|
|
def test_run_stats_block_in_details(self) -> None:
|
|
"""Text details should include Run Stats block for multi-run."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Run Stats:" in output
|
|
assert "Seed Policy: constant" in output
|
|
|
|
|
|
class TestHtmlMultiRunStats:
|
|
"""Tests for multi-run stats in the HTML formatter."""
|
|
|
|
def test_run_stats_card_in_output(self) -> None:
|
|
"""HTML should include run-stats-card for multi-run cases."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "run-stats-card" in output
|
|
assert "mean score" in output
|
|
assert "std dev" in output
|
|
|
|
def test_run_tabs_in_output(self) -> None:
|
|
"""HTML should include run-tab for each run."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Run 1" in output
|
|
assert "Run 2" in output
|
|
assert "Run 3" in output
|
|
|
|
def test_no_run_stats_card_for_single_run(self) -> None:
|
|
"""Single-run HTML should not include an active run-stats-card div."""
|
|
formatter = HtmlFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results, show_details=True)
|
|
# The CSS class definition exists in the template, but no div should use it
|
|
assert '<div class="run-stats-card' not in output
|
|
|
|
|
|
class TestJsonMultiRunStats:
|
|
"""Tests for multi-run stats in the JSON formatter."""
|
|
|
|
def test_run_stats_in_json_output(self) -> None:
|
|
"""JSON output should include run_stats for multi-run cases."""
|
|
formatter = JsonFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
|
|
# cases is a list in single-model format
|
|
case_data = suites["cases"][0]
|
|
assert "run_stats" in case_data
|
|
assert case_data["run_stats"]["num_runs"] == 3
|
|
assert "critic_stats" in case_data
|
|
|
|
def test_no_run_stats_in_json_for_single_run(self) -> None:
|
|
"""JSON output should not include run_stats for single-run cases."""
|
|
formatter = JsonFormatter()
|
|
results = make_mock_results()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
suites = data["models"]["gpt-4o"]["suites"]["test_eval_suite"]
|
|
case_data = suites["cases"][0]
|
|
assert "run_stats" not in case_data
|
|
|
|
|
|
# =====================================================================
|
|
# Extended multi-run formatter coverage tests
|
|
# =====================================================================
|
|
|
|
|
|
def _make_multi_run_case_failed(
|
|
name: str = "failed_multi_run",
|
|
score: float = 0.25,
|
|
num_runs: int = 3,
|
|
) -> dict:
|
|
"""Create a failing multi-run case with failure_reason in run details."""
|
|
return {
|
|
"name": name,
|
|
"input": "multi run fail input",
|
|
"evaluation": MockEvaluation(
|
|
passed=False,
|
|
score=score,
|
|
failure_reason="All runs failed completely",
|
|
results=[
|
|
{
|
|
"field": "arg_a",
|
|
"match": False,
|
|
"score": 0.25,
|
|
"weight": 1.0,
|
|
"expected": "bar",
|
|
"actual": "baz",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
),
|
|
"run_stats": {
|
|
"num_runs": num_runs,
|
|
"scores": [0.3, 0.2, 0.25],
|
|
"mean_score": 0.25,
|
|
"std_deviation": 0.041,
|
|
"passed": [False, False, False],
|
|
"warned": [False, False, False],
|
|
"seed_policy": "random",
|
|
"run_seeds": [100, 200, 300],
|
|
"pass_rule": "majority",
|
|
"runs": [
|
|
{
|
|
"score": 0.3,
|
|
"passed": False,
|
|
"warning": False,
|
|
"failure_reason": "Tool selection mismatch",
|
|
"details": [
|
|
{
|
|
"field": "arg_a",
|
|
"match": False,
|
|
"score": 0.3,
|
|
"weight": 1.0,
|
|
"expected": "bar",
|
|
"actual": "baz",
|
|
"is_criticized": True,
|
|
}
|
|
],
|
|
},
|
|
{
|
|
"score": 0.2,
|
|
"passed": False,
|
|
"warning": False,
|
|
"failure_reason": "Tool selection mismatch",
|
|
"details": [],
|
|
},
|
|
{
|
|
"score": 0.25,
|
|
"passed": False,
|
|
"warning": False,
|
|
"failure_reason": "Tool selection mismatch",
|
|
"details": [],
|
|
},
|
|
],
|
|
},
|
|
"critic_stats": {
|
|
"arg_a": {
|
|
"run_scores": [0.3, 0.2, 0.25],
|
|
"mean_score": 0.25,
|
|
"std_deviation": 0.041,
|
|
"run_scores_normalized": [0.3, 0.2, 0.25],
|
|
"mean_score_normalized": 0.25,
|
|
"std_deviation_normalized": 0.041,
|
|
"weight": 1.0,
|
|
},
|
|
},
|
|
"expected_tool_calls": [],
|
|
"predicted_tool_calls": [],
|
|
}
|
|
|
|
|
|
def _make_multi_run_results_failed() -> list[list[dict]]:
|
|
"""Create evaluation results with failed multi-run stats for a single model."""
|
|
return [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "FailingSuite",
|
|
"rubric": MockEvaluation(),
|
|
"cases": [_make_multi_run_case_failed()],
|
|
}
|
|
]
|
|
]
|
|
|
|
|
|
def _make_multi_model_multi_run_results() -> list[list[dict]]:
|
|
"""Create evaluation results with multi-run stats for TWO models."""
|
|
return [
|
|
[
|
|
{
|
|
"model": "gpt-4o",
|
|
"suite_name": "MultiRunSuite",
|
|
"rubric": MockEvaluation(),
|
|
"cases": [_make_multi_run_case()],
|
|
},
|
|
{
|
|
"model": "claude-3.5-sonnet",
|
|
"suite_name": "MultiRunSuite",
|
|
"rubric": MockEvaluation(),
|
|
"cases": [
|
|
_make_multi_run_case(
|
|
name="multi_run_case",
|
|
score=0.6,
|
|
passed=False,
|
|
num_runs=3,
|
|
)
|
|
],
|
|
},
|
|
]
|
|
]
|
|
|
|
|
|
class TestTextMultiRunCoverage:
|
|
"""Extended text formatter tests for multi-run coverage."""
|
|
|
|
def test_run_stats_details_with_failure_reason(self) -> None:
|
|
"""Run details should include failure_reason when present."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Tool selection mismatch" in output
|
|
|
|
def test_run_stats_displays_seeds(self) -> None:
|
|
"""Text should display run seeds."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Run Seeds: 100, 200, 300" in output
|
|
assert "Seed Policy: random" in output
|
|
|
|
def test_run_stats_displays_pass_rule(self) -> None:
|
|
"""Text should display the pass rule."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Pass Rule: majority" in output
|
|
|
|
def test_critic_stats_block(self) -> None:
|
|
"""Text should display critic stats block with ± notation."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Critic Stats:" in output
|
|
assert "arg_a:" in output
|
|
assert "±" in output
|
|
|
|
def test_multi_model_with_run_stats(self) -> None:
|
|
"""Multi-model text should include run stats in detail view."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_model_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
# Multi-model detail view calls _format_run_stats for each case_result
|
|
assert "Run Stats:" in output
|
|
assert "Runs: 3" in output
|
|
assert "gpt-4o" in output
|
|
assert "claude-3.5-sonnet" in output
|
|
|
|
def test_run_results_per_run_status(self) -> None:
|
|
"""Text should show each run with status and score."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Run 1:" in output
|
|
assert "Run 2:" in output
|
|
assert "Run 3:" in output
|
|
|
|
|
|
class TestMarkdownMultiRunCoverage:
|
|
"""Extended markdown formatter tests for multi-run coverage."""
|
|
|
|
def test_run_details_with_failure_reason(self) -> None:
|
|
"""Markdown run details should include failure_reason."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Tool selection mismatch" in output
|
|
|
|
def test_run_details_with_critic_details(self) -> None:
|
|
"""Markdown should include per-run critic details when present."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
# The first run has details with field "arg_a"
|
|
assert "Run Details:" in output
|
|
|
|
def test_run_seeds_displayed(self) -> None:
|
|
"""Markdown should display random seeds."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Run Seeds: 100, 200, 300" in output
|
|
assert "Seed Policy: random" in output
|
|
|
|
def test_critic_stats_table(self) -> None:
|
|
"""Markdown should display critic stats table with all columns."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Critic Stats" in output
|
|
assert "Weight" in output
|
|
assert "Mean" in output
|
|
|
|
def test_multi_model_with_run_stats(self) -> None:
|
|
"""Multi-model markdown should include run stats."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_model_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
# Multi-model detail view calls _format_run_stats_summary for each case
|
|
assert "**Run Stats:**" in output
|
|
assert "Runs: 3" in output
|
|
assert "gpt-4o" in output
|
|
assert "claude-3.5-sonnet" in output
|
|
|
|
def test_no_duplicate_eval_details_for_multi_run(self) -> None:
|
|
"""When run details are present, should not also show eval details."""
|
|
formatter = MarkdownFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
# "Run Details:" section should exist
|
|
assert "Run Details:" in output
|
|
# The field-level critic table should appear inside run details,
|
|
# not duplicated as a standalone section
|
|
|
|
|
|
class TestHtmlMultiRunCoverage:
|
|
"""Extended HTML formatter tests for multi-run coverage."""
|
|
|
|
def test_run_stats_card_fields(self) -> None:
|
|
"""HTML run stats card should include mean score, std dev, pass rule."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "mean score" in output
|
|
assert "std dev" in output
|
|
assert "Pass Rule" in output
|
|
assert "Seed Policy" in output
|
|
|
|
def test_run_tabs_with_failure_reason(self) -> None:
|
|
"""HTML run tabs should include failure reason per run."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "Failure Reason" in output
|
|
assert "Tool selection mismatch" in output
|
|
|
|
def test_run_tabs_status_classes(self) -> None:
|
|
"""HTML run tabs should have status classes (passed/failed)."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "run-tab failed" in output or "run-tab failed" in output
|
|
|
|
def test_critic_stats_html_table(self) -> None:
|
|
"""HTML should include critic stats table."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "critic-stats" in output
|
|
assert "Critic Stats" in output
|
|
|
|
def test_score_pills_in_run_stats(self) -> None:
|
|
"""HTML run stats card should show score pills for each run."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "score-pill" in output
|
|
assert "R1:" in output
|
|
assert "R2:" in output
|
|
assert "R3:" in output
|
|
|
|
def test_multi_model_with_run_stats(self) -> None:
|
|
"""Multi-model HTML should include run stats."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_model_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "run-stats-card" in output
|
|
assert "gpt-4o" in output
|
|
assert "claude-3.5-sonnet" in output
|
|
|
|
def test_random_seeds_in_card(self) -> None:
|
|
"""HTML run stats card should display random seeds."""
|
|
formatter = HtmlFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
assert "100" in output
|
|
assert "200" in output
|
|
assert "300" in output
|
|
|
|
|
|
class TestJsonMultiRunCoverage:
|
|
"""Extended JSON formatter tests for multi-run coverage."""
|
|
|
|
def test_run_stats_fields(self) -> None:
|
|
"""JSON run_stats should include all expected fields."""
|
|
formatter = JsonFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
|
|
rs = suites["cases"][0]["run_stats"]
|
|
assert rs["num_runs"] == 3
|
|
assert "scores" in rs
|
|
assert "mean_score" in rs
|
|
assert "std_deviation" in rs
|
|
assert "seed_policy" in rs
|
|
assert "run_seeds" in rs
|
|
assert "pass_rule" in rs
|
|
assert "runs" in rs
|
|
|
|
def test_critic_stats_fields(self) -> None:
|
|
"""JSON critic_stats should include all expected fields."""
|
|
formatter = JsonFormatter()
|
|
results = _make_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
|
|
cs = suites["cases"][0]["critic_stats"]
|
|
assert "arg_a" in cs
|
|
assert "run_scores" in cs["arg_a"]
|
|
assert "mean_score" in cs["arg_a"]
|
|
assert "weight" in cs["arg_a"]
|
|
|
|
def test_multi_model_json_with_run_stats(self) -> None:
|
|
"""Multi-model JSON should include run_stats for each model."""
|
|
formatter = JsonFormatter()
|
|
results = _make_multi_model_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
# Multi-model JSON uses comparison structure: {suite: {case: {results_by_model: ...}}}
|
|
comparison = data["comparison"]
|
|
case_data = comparison["MultiRunSuite"]["multi_run_case"]
|
|
for model_name in ["gpt-4o", "claude-3.5-sonnet"]:
|
|
model_result = case_data["results_by_model"][model_name]
|
|
assert "run_stats" in model_result, f"Missing run_stats for {model_name}"
|
|
|
|
def test_per_run_details_in_json(self) -> None:
|
|
"""JSON per-run details should include failure_reason."""
|
|
formatter = JsonFormatter()
|
|
results = _make_multi_run_results_failed()
|
|
output = formatter.format(results, show_details=True)
|
|
data = json.loads(output)
|
|
suites = data["models"]["gpt-4o"]["suites"]["FailingSuite"]
|
|
runs = suites["cases"][0]["run_stats"]["runs"]
|
|
assert len(runs) == 3
|
|
assert runs[0]["failure_reason"] == "Tool selection mismatch"
|
|
|
|
|
|
# =====================================================================
|
|
# Coverage gap tests — TextFormatter
|
|
# =====================================================================
|
|
|
|
|
|
class TestTextFormatterCoverageGaps:
|
|
"""Tests for TextFormatter methods that lacked coverage."""
|
|
|
|
def test_format_evaluation_uncriticized_field(self) -> None:
|
|
"""_format_evaluation should show 'Un-criticized' for is_criticized=False."""
|
|
formatter = TextFormatter()
|
|
evaluation = MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "optional_param",
|
|
"match": False,
|
|
"score": 0.0,
|
|
"weight": 0.0,
|
|
"expected": "abc",
|
|
"actual": "xyz",
|
|
"is_criticized": False,
|
|
},
|
|
],
|
|
)
|
|
output = formatter._format_evaluation(evaluation)
|
|
assert "Un-criticized" in output
|
|
assert "optional_param" in output
|
|
assert "Expected: abc" in output
|
|
assert "Actual: xyz" in output
|
|
|
|
def test_format_evaluation_mixed_criticized_and_uncriticized(self) -> None:
|
|
"""_format_evaluation should handle a mix of criticized and uncriticized fields."""
|
|
formatter = TextFormatter()
|
|
evaluation = MockEvaluation(
|
|
passed=True,
|
|
score=0.5,
|
|
results=[
|
|
{
|
|
"field": "required_field",
|
|
"match": True,
|
|
"score": 1.0,
|
|
"weight": 1.0,
|
|
"expected": "foo",
|
|
"actual": "foo",
|
|
"is_criticized": True,
|
|
},
|
|
{
|
|
"field": "info_field",
|
|
"match": False,
|
|
"score": 0.0,
|
|
"weight": 0.0,
|
|
"expected": "bar",
|
|
"actual": "baz",
|
|
"is_criticized": False,
|
|
},
|
|
],
|
|
)
|
|
output = formatter._format_evaluation(evaluation)
|
|
assert "Match" in output
|
|
assert "Un-criticized" in output
|
|
assert "required_field" in output
|
|
assert "info_field" in output
|
|
|
|
def test_format_conversation_text_standalone(self) -> None:
|
|
"""_format_conversation_text should format conversation messages correctly."""
|
|
formatter = TextFormatter()
|
|
messages = [
|
|
{"role": "user", "content": "Hello"},
|
|
{"role": "assistant", "content": "Hi there!"},
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{"function": {"name": "search", "arguments": '{"q": "test"}'}}],
|
|
},
|
|
{
|
|
"role": "tool",
|
|
"name": "search",
|
|
"content": '{"results": [1, 2]}',
|
|
},
|
|
]
|
|
lines = formatter._format_conversation_text(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "[USER]" in text
|
|
assert "[ASSISTANT]" in text
|
|
assert "[TOOL: search]" in text
|
|
assert "Hello" in text
|
|
assert "Hi there!" in text
|
|
assert "search" in text
|
|
assert "results" in text
|
|
|
|
def test_format_conversation_text_invalid_json_tool_content(self) -> None:
|
|
"""_format_conversation_text should handle non-JSON tool content gracefully."""
|
|
formatter = TextFormatter()
|
|
messages = [
|
|
{"role": "tool", "name": "raw", "content": "not valid json"},
|
|
]
|
|
lines = formatter._format_conversation_text(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "not valid json" in text
|
|
|
|
def test_format_conversation_text_invalid_json_tool_call_args(self) -> None:
|
|
"""_format_conversation_text should handle non-JSON tool call args gracefully."""
|
|
formatter = TextFormatter()
|
|
messages = [
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
|
|
},
|
|
]
|
|
lines = formatter._format_conversation_text(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "broken" in text
|
|
assert "not json" in text
|
|
|
|
def test_multi_model_failed_only_with_original_counts(self) -> None:
|
|
"""TextFormatter multi-model output should show original counts with --only-failed."""
|
|
formatter = TextFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results, failed_only=True, original_counts=(20, 15, 5, 0))
|
|
|
|
assert "Note: Showing only failed evaluations" in output
|
|
assert "Total: 20" in output
|
|
assert "Passed: 15" in output
|
|
assert "Failed: 5" in output
|
|
|
|
def test_multi_model_details_with_run_and_critic_stats(self) -> None:
|
|
"""TextFormatter multi-model detail view should show run/critic stats per model."""
|
|
formatter = TextFormatter()
|
|
results = _make_multi_model_multi_run_results()
|
|
output = formatter.format(results, show_details=True)
|
|
|
|
# Should show per-model detail sections with run stats
|
|
assert "Run Stats:" in output
|
|
assert "Critic Stats:" in output
|
|
assert "gpt-4o" in output
|
|
assert "claude-3.5-sonnet" in output
|
|
|
|
def test_comparative_single_model_with_context_conversation(self) -> None:
|
|
"""Comparative single-model should render context with conversation formatting."""
|
|
formatter = TextFormatter()
|
|
results = make_comparative_results_with_context()
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
# Context section should be present
|
|
assert "You are a weather bot" in output
|
|
# Tool responses should be JSON-formatted
|
|
assert "temp" in output
|
|
assert "I need weather info" in output
|
|
|
|
def test_comparative_case_first_with_context_conversation(self) -> None:
|
|
"""Comparative case-first (multi-model) should render context with conversation."""
|
|
# Create multi-model comparative results with context
|
|
results = make_comparative_results_with_context()
|
|
# Add another model to trigger case-first grouping
|
|
results[0].append({
|
|
"model": "gpt-4o-mini",
|
|
"suite_name": "weather_suite [track_a]",
|
|
"track_name": "track_a",
|
|
"rubric": "Test",
|
|
"cases": [
|
|
{
|
|
"name": "weather_test",
|
|
"input": "Get weather for NYC",
|
|
"system_message": "You are a weather bot.",
|
|
"additional_messages": [
|
|
{"role": "user", "content": "I need weather info"},
|
|
],
|
|
"evaluation": MockEvaluation(passed=True, score=0.95),
|
|
}
|
|
],
|
|
})
|
|
|
|
formatter = TextFormatter()
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "MULTI-MODEL" in output
|
|
assert "You are a weather bot" in output
|
|
assert "I need weather info" in output
|
|
|
|
|
|
# =====================================================================
|
|
# Coverage gap tests — MarkdownFormatter
|
|
# =====================================================================
|
|
|
|
|
|
class TestMarkdownFormatterCoverageGaps:
|
|
"""Tests for MarkdownFormatter methods that lacked coverage."""
|
|
|
|
def test_format_conversation_md_standalone(self) -> None:
|
|
"""_format_conversation_md should format messages as markdown blockquotes."""
|
|
formatter = MarkdownFormatter()
|
|
messages = [
|
|
{"role": "user", "content": "Hello"},
|
|
{"role": "assistant", "content": "Hi!"},
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{"function": {"name": "search", "arguments": '{"q": "test"}'}}],
|
|
},
|
|
{
|
|
"role": "tool",
|
|
"name": "search",
|
|
"content": '{"results": [1]}',
|
|
},
|
|
]
|
|
lines = formatter._format_conversation_md(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "👤" in text
|
|
assert "🤖" in text
|
|
assert "🔧" in text
|
|
assert "Hello" in text
|
|
assert "search" in text
|
|
|
|
def test_format_conversation_md_invalid_json(self) -> None:
|
|
"""_format_conversation_md should handle non-JSON tool content gracefully."""
|
|
formatter = MarkdownFormatter()
|
|
messages = [
|
|
{"role": "tool", "name": "raw", "content": "plain text"},
|
|
]
|
|
lines = formatter._format_conversation_md(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "plain text" in text
|
|
|
|
def test_format_conversation_md_invalid_json_args(self) -> None:
|
|
"""_format_conversation_md should handle non-JSON tool call args gracefully."""
|
|
formatter = MarkdownFormatter()
|
|
messages = [
|
|
{
|
|
"role": "assistant",
|
|
"content": None,
|
|
"tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
|
|
},
|
|
]
|
|
lines = formatter._format_conversation_md(messages)
|
|
text = "\n".join(lines)
|
|
|
|
assert "broken" in text
|
|
|
|
def test_comparative_single_model_with_context_uses_conversation(self) -> None:
|
|
"""Comparative single-model should render context including tool/assistant messages."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_comparative_results_with_context()
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
# Context should be rendered
|
|
assert "You are a weather bot" in output
|
|
assert "I need weather info" in output
|
|
# Tool response should be JSON-formatted
|
|
assert "temp" in output
|
|
|
|
def test_comparative_case_first_with_context_uses_conversation(self) -> None:
|
|
"""Comparative case-first should render context with conversation messages."""
|
|
results = make_comparative_results_with_context()
|
|
results[0].append({
|
|
"model": "gpt-4o-mini",
|
|
"suite_name": "weather_suite [track_a]",
|
|
"track_name": "track_a",
|
|
"rubric": "Test",
|
|
"cases": [
|
|
{
|
|
"name": "weather_test",
|
|
"input": "Get weather for NYC",
|
|
"system_message": "You are a weather bot.",
|
|
"additional_messages": [
|
|
{"role": "user", "content": "I need weather info"},
|
|
],
|
|
"evaluation": MockEvaluation(passed=True, score=0.95),
|
|
}
|
|
],
|
|
})
|
|
|
|
formatter = MarkdownFormatter()
|
|
output = formatter.format(results, show_details=True, include_context=True)
|
|
|
|
assert "Multi-Model" in output
|
|
assert "You are a weather bot" in output
|
|
assert "I need weather info" in output
|
|
|
|
def test_multi_model_failed_only_with_original_counts(self) -> None:
|
|
"""MarkdownFormatter multi-model should show original counts with --only-failed."""
|
|
formatter = MarkdownFormatter()
|
|
results = make_multi_model_results()
|
|
output = formatter.format(results, failed_only=True, original_counts=(20, 15, 5, 0))
|
|
|
|
assert "Showing only failed evaluations" in output
|
|
assert "20" in output
|
|
assert "15" in output
|
|
|
|
def test_evaluation_details_uncriticized_field(self) -> None:
|
|
"""_format_evaluation_details should show dash for uncriticized fields."""
|
|
formatter = MarkdownFormatter()
|
|
evaluation = MockEvaluation(
|
|
passed=True,
|
|
score=1.0,
|
|
results=[
|
|
{
|
|
"field": "optional",
|
|
"match": False,
|
|
"score": 0.0,
|
|
"weight": 0.0,
|
|
"expected": "abc",
|
|
"actual": "xyz",
|
|
"is_criticized": False,
|
|
},
|
|
],
|
|
)
|
|
output = formatter._format_evaluation_details(evaluation)
|
|
assert "—" in output # un-criticized uses em-dash
|
|
assert "optional" in output
|