@EricGustin, you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
>
> Fixes a provider-selection UX bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
295 lines
12 KiB
Python
295 lines
12 KiB
Python
"""Additional edge case tests for formatters to ensure robustness."""
|
|
|
|
from arcade_cli.formatters import (
|
|
HtmlFormatter,
|
|
JsonFormatter,
|
|
MarkdownFormatter,
|
|
TextFormatter,
|
|
)
|
|
|
|
|
|
class MockEvaluation:
|
|
"""Mock EvaluationResult for testing."""
|
|
|
|
def __init__(
|
|
self,
|
|
passed: bool = True,
|
|
warning: bool = False,
|
|
score: float = 1.0,
|
|
failure_reason: str | None = None,
|
|
results: list[dict] | None = None,
|
|
):
|
|
self.passed = passed
|
|
self.warning = warning
|
|
self.score = score
|
|
self.failure_reason = failure_reason
|
|
self.results = results or []
|
|
|
|
|
|
def make_empty_results() -> list[list[dict]]:
    """Create empty evaluation results.

    Returns:
        A newly built nested list containing a single suite dict whose
        ``"cases"`` list is empty. Each call builds fresh objects, so callers
        may mutate the result without affecting other callers.
    """
    return [[{"model": "gpt-4o", "suite_name": "empty_suite", "rubric": "Test", "cases": []}]]
|
|
|
|
|
|
class TestFormatterEdgeCases:
    """Test edge cases that might not be covered elsewhere.

    Most tests run the same fixture through every formatter class
    (Text, Markdown, HTML, JSON) and assert on the rendered string.
    """

    def test_empty_results_all_formatters(self) -> None:
        """All formatters should handle empty results gracefully."""
        results = make_empty_results()

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            output = formatter.format(results)
            # Should produce some output
            assert output, f"{formatter_class.__name__} produced no output"
            # The haystack is lower-cased, so the needle must be lower-case
            # too; a capitalised "Total: 0" could never match.
            assert (
                "0" in output
                or "total: 0" in output.lower()
                or '"total_cases": 0' in output
            ), f"{formatter_class.__name__} did not report a zero count"

    def test_failed_only_with_zero_original_total(self) -> None:
        """Should handle original_counts with zero total without crashing."""
        results = make_empty_results()
        # Edge case: original_counts with 0 total (shouldn't happen in practice
        # but should be safe)
        original_counts = (0, 0, 0, 0)

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            # Should not raise ZeroDivisionError
            output = formatter.format(results, failed_only=True, original_counts=original_counts)
            # Should produce some output
            assert output, f"{formatter_class.__name__} produced no output"

    def test_failed_only_with_empty_results_but_nonzero_original(self) -> None:
        """Should handle case where filtered results are empty but original had cases."""
        results = make_empty_results()
        # All cases were filtered out, but there were originally 5 cases (all passed)
        original_counts = (5, 5, 0, 0)

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            output = formatter.format(results, failed_only=True, original_counts=original_counts)
            assert output, f"{formatter_class.__name__} produced no output"
            # Should show original counts
            assert "5" in output, f"{formatter_class.__name__} did not show the original counts"

    def test_all_formatters_handle_none_original_counts(self) -> None:
        """All formatters should handle None original_counts gracefully."""
        results = [[{
            "model": "gpt-4o",
            "suite_name": "test",
            "rubric": "Test",
            "cases": [{
                "name": "test_case",
                "input": "test",
                "evaluation": MockEvaluation(passed=False, score=0.0),
            }],
        }]]

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            # Should not crash with None original_counts
            output = formatter.format(results, failed_only=True, original_counts=None)
            assert output, f"{formatter_class.__name__} produced no output"

    def test_comparative_with_missing_track_data(self) -> None:
        """Comparative formatters should handle missing track gracefully."""
        # Create comparative result where one track is missing data
        results = [[
            {
                "model": "gpt-4o",
                "suite_name": "Test Suite [track_a]",
                "track_name": "track_a",
                "rubric": None,
                "cases": [{
                    "name": "test_case",
                    "input": "test",
                    "evaluation": MockEvaluation(passed=True, score=1.0),
                }],
            },
            {
                "model": "gpt-4o",
                "suite_name": "Test Suite [track_b]",
                "track_name": "track_b",
                "rubric": None,
                "cases": [],  # Empty cases for this track
            },
        ]]

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            output = formatter.format(results)
            assert output, f"{formatter_class.__name__} produced no output"
            # Should mention both tracks
            assert "track_a" in output, f"{formatter_class.__name__} omitted track_a"
            assert "track_b" in output, f"{formatter_class.__name__} omitted track_b"

    def test_html_formatter_escapes_all_special_chars(self) -> None:
        """HTML formatter must escape all special characters to prevent XSS."""
        results = [[{
            "model": "gpt-4o<script>alert('xss')</script>",
            "suite_name": "Suite & Test",
            "rubric": "Test",
            "cases": [{
                "name": "<img src=x onerror=alert(1)>",
                "input": "test' OR '1'='1",
                "evaluation": MockEvaluation(
                    passed=False,
                    score=0.0,
                    failure_reason="Error: <script>malicious</script>",
                ),
            }],
        }]]

        formatter = HtmlFormatter()
        output = formatter.format(results)

        # The template includes a legitimate <script> tag for run-tabs JS,
        # but user-provided content must be properly escaped.
        # Verify that injected XSS payloads are escaped (not rendered raw)
        assert "<script>alert" not in output  # User payload must be escaped
        assert "<script>malicious" not in output  # Failure reason must be escaped
        # <img must be escaped to &lt;img so it doesn't render as an HTML element
        assert "<img src=x" not in output
        # Should contain escaped versions of user-provided content. These must
        # be the entity forms: a raw "<" or "&" is always present in the page
        # template, so checking for the raw character would prove nothing.
        assert "&lt;script&gt;" in output or "&lt;" in output
        assert "&lt;img" in output  # The img tag should be escaped
        assert "&amp;" in output  # & should be escaped

    def test_json_formatter_produces_valid_json_for_all_cases(self) -> None:
        """JSON formatter must always produce valid JSON."""
        import json

        test_cases = [
            make_empty_results(),
            [[{
                "model": "test",
                "suite_name": "test",
                "rubric": None,
                "cases": [{
                    "name": "test",
                    "input": "test with \"quotes\" and \n newlines",
                    "evaluation": MockEvaluation(passed=True),
                }],
            }]],
        ]

        formatter = JsonFormatter()
        for results in test_cases:
            output = formatter.format(results)
            # Should be valid JSON (this will raise if invalid)
            parsed = json.loads(output)
            assert isinstance(parsed, dict)
            assert "summary" in parsed

    def test_formatters_with_suite_name_none(self) -> None:
        """Formatters should handle None suite_name gracefully."""
        results = [[{
            "model": "gpt-4o",
            "suite_name": None,  # Explicitly None
            "rubric": "Test",
            "cases": [{
                "name": "test_case",
                "input": "test",
                "evaluation": MockEvaluation(passed=True),
            }],
        }]]

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            output = formatter.format(results)
            assert output, f"{formatter_class.__name__} produced no output"
            # Should use fallback name
            assert (
                "Unnamed Suite" in output or "unnamed" in output.lower()
            ), f"{formatter_class.__name__} did not use a fallback suite name"

    def test_pass_rate_calculation_edge_cases(self) -> None:
        """Test pass rate calculation in various edge cases."""
        # Case 1: All passed
        results_all_passed = [[{
            "model": "gpt-4o",
            "suite_name": "test",
            "rubric": "Test",
            "cases": [
                {"name": f"case_{i}", "input": "test", "evaluation": MockEvaluation(passed=True)}
                for i in range(5)
            ],
        }]]

        # Case 2: All failed
        results_all_failed = [[{
            "model": "gpt-4o",
            "suite_name": "test",
            "rubric": "Test",
            "cases": [
                {
                    "name": f"case_{i}",
                    "input": "test",
                    "evaluation": MockEvaluation(passed=False, score=0.0),
                }
                for i in range(5)
            ],
        }]]

        formatter = JsonFormatter()

        # All passed should show 100% pass rate
        output_passed = formatter.format(results_all_passed)
        assert "100" in output_passed or "100.0" in output_passed

        # All failed should show 0% pass rate
        output_failed = formatter.format(results_all_failed)
        assert '"pass_rate": 0' in output_failed or '"pass_rate": 0.0' in output_failed

    def test_comparative_with_none_evaluation(self) -> None:
        """Formatters should render a lone comparative track without crashing.

        NOTE(review): despite the test name, the fixture below supplies a
        fully populated evaluation (one result entry), not ``None``, and it
        contains only ``track_a``. If a genuinely missing evaluation is meant
        to be covered, the fixture needs to be extended; confirm the intent.
        """
        results = [[
            {
                "model": "gpt-4o",
                "suite_name": "Test Suite [track_a]",
                "track_name": "track_a",
                "rubric": None,
                "cases": [{
                    "name": "test_case",
                    "input": "test",
                    "evaluation": MockEvaluation(passed=True, score=1.0, results=[
                        {
                            "field": "test",
                            "match": True,
                            "score": 1.0,
                            "weight": 1.0,
                            "expected": "test",
                            "actual": "test",
                        }
                    ]),
                }],
            },
            # No second track is supplied: only track_a is present in the input.
        ]]

        # All formatters should handle the single-track input without crashing
        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            output = formatter.format(results)
            # Should produce output
            assert output, f"{formatter_class.__name__} produced no output"
            # Should show the track that exists
            assert (
                "track_a" in output or "Track" in output or "test_case" in output
            ), f"{formatter_class.__name__} did not show the existing track"

    def test_comparative_with_no_results_in_evaluation(self) -> None:
        """Comparative formatters should handle an evaluation with an empty results list."""
        results = [[
            {
                "model": "gpt-4o",
                "suite_name": "Test Suite [track_a]",
                "track_name": "track_a",
                "rubric": None,
                "cases": [{
                    "name": "test_case",
                    "input": "test",
                    # Empty results
                    "evaluation": MockEvaluation(passed=True, score=1.0, results=[]),
                }],
            },
        ]]

        for formatter_class in [TextFormatter, MarkdownFormatter, HtmlFormatter, JsonFormatter]:
            formatter = formatter_class()
            # Should not crash with empty results
            output = formatter.format(results, show_details=True)
            assert output, f"{formatter_class.__name__} produced no output"
|