arcade-mcp/libs/tests/cli/test_main_evals.py
jottakka 7472b18106
Fixing bug with multiple providers + stats for multiple runs (#752)
@EricGustin you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
    -p openai:gpt-4o,gpt-4o-mini \
    -p anthropic:claude-sonnet-4-20250514 \
    -k openai:$OPENAI_API_KEY \
    -k anthropic:$ANTHROPIC_API_KEY \
    -d \
    --num-runs 3 \
    --seed random \
    --multi-run-pass-rule majority \
    --max-concurrent 6 \
    -o mcp_building_evals_results/results

```

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
> 
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
> 
> Fixes provider selection UX/bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
> 
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2026-02-09 14:25:28 -03:00

422 lines
15 KiB
Python

import re
from unittest.mock import Mock
import pytest
from arcade_cli.main import cli
from arcade_cli.utils import filter_failed_evaluations
from arcade_evals.eval import EvaluationResult
from typer.testing import CliRunner
# Mark all tests in this module as requiring evals dependencies
pytestmark = pytest.mark.evals
# Shared CliRunner instance used by the CLI tests in this module to invoke `cli`.
runner = CliRunner()
# Matches ANSI escape sequences of the form ESC [ <digits/semicolons> m.
_ANSI_ESCAPE_RE = re.compile(r"\x1b\[[0-9;]*m")


def _strip_ansi(text: str) -> str:
    """Return ``text`` with every ``ESC[...m`` escape sequence removed."""
    cleaned = re.sub(_ANSI_ESCAPE_RE, "", text)
    return cleaned
def create_mock_evaluation_result(passed: bool, warning: bool, score: float) -> Mock:
    """Build a ``Mock`` spec'd on ``EvaluationResult`` with the given outcome fields.

    Args:
        passed: Value assigned to the mock's ``passed`` attribute.
        warning: Value assigned to the mock's ``warning`` attribute.
        score: Value assigned to the mock's ``score`` attribute.

    Returns:
        The configured mock; ``failure_reason`` is ``None`` and ``results`` is
        an empty list.
    """
    mock_result = Mock(spec=EvaluationResult)
    mock_result.configure_mock(
        passed=passed,
        warning=warning,
        score=score,
        failure_reason=None,
        results=[],
    )
    return mock_result
def test_filter_failed_evaluations_mixed_results() -> None:
    """Filtering a mix of passed, warned and failed cases keeps only the failures."""
    # (name, passed, warning, score) for every case of the single suite/model.
    case_specs = [
        ("Passed Case", True, False, 0.95),
        ("Warning Case", False, True, 0.85),
        ("Failed Case 1", False, False, 0.3),
        ("Failed Case 2", False, False, 0.2),
    ]
    cases = [
        {
            "name": case_name,
            "input": "Test input",
            "evaluation": create_mock_evaluation_result(
                passed=did_pass, warning=has_warning, score=case_score
            ),
        }
        for case_name, did_pass, has_warning, case_score in case_specs
    ]
    all_evaluations = [[{"model": "gpt-4o", "rubric": "Test Rubric", "cases": cases}]]

    filtered, counts = filter_failed_evaluations(all_evaluations)

    # Counts computed before filtering: 4 total, 1 passed, 2 failed, 1 warned.
    assert counts == (4, 1, 2, 1)

    # One suite with one model remains, holding just the two failed cases in order.
    assert len(filtered) == 1
    assert len(filtered[0]) == 1
    remaining_cases = filtered[0][0]["cases"]
    assert len(remaining_cases) == 2
    assert remaining_cases[0]["name"] == "Failed Case 1"
    assert remaining_cases[1]["name"] == "Failed Case 2"
def test_filter_failed_evaluations_all_passed() -> None:
    """When every case passed, the filtered result is empty."""
    passing_cases = [
        {
            "name": case_name,
            "input": "Test input",
            "evaluation": create_mock_evaluation_result(
                passed=True, warning=False, score=case_score
            ),
        }
        for case_name, case_score in (("Passed Case 1", 0.95), ("Passed Case 2", 0.98))
    ]
    model_entry = {"model": "gpt-4o", "rubric": "Test Rubric", "cases": passing_cases}
    all_evaluations = [[model_entry]]

    filtered, counts = filter_failed_evaluations(all_evaluations)

    # Counts computed before filtering: 2 total, 2 passed, nothing failed or warned.
    assert counts == (2, 2, 0, 0)
    # No failed cases, so nothing is left after filtering.
    assert len(filtered) == 0
def test_filter_failed_evaluations_multiple_suites() -> None:
    """Filtering across two eval suites keeps the failed case of each suite."""

    def make_case(name: str, *, passed: bool, score: float) -> dict:
        # Every case in this test is warning-free and shares the same input.
        return {
            "name": name,
            "input": "Test input",
            "evaluation": create_mock_evaluation_result(
                passed=passed, warning=False, score=score
            ),
        }

    first_suite = [
        {
            "model": "gpt-4o",
            "rubric": "Test Rubric 1",
            "cases": [
                make_case("Passed Case", passed=True, score=0.95),
                make_case("Failed Case", passed=False, score=0.3),
            ],
        }
    ]
    second_suite = [
        {
            "model": "gpt-4o",
            "rubric": "Test Rubric 2",
            "cases": [make_case("Failed Case 2", passed=False, score=0.2)],
        }
    ]

    filtered, counts = filter_failed_evaluations([first_suite, second_suite])

    # Counts computed before filtering: 3 total, 1 passed, 2 failed, 0 warned.
    assert counts == (3, 1, 2, 0)
    # Both suites survive, each reduced to its single failed case.
    assert len(filtered) == 2
    assert len(filtered[0][0]["cases"]) == 1
    assert len(filtered[1][0]["cases"]) == 1
def test_filter_failed_evaluations_multiple_models() -> None:
    """Two models in one suite each keep only their own failed case."""

    def make_case(name: str, *, passed: bool, score: float) -> dict:
        # Every case in this test is warning-free and shares the same input.
        return {
            "name": name,
            "input": "Test input",
            "evaluation": create_mock_evaluation_result(
                passed=passed, warning=False, score=score
            ),
        }

    gpt4o_entry = {
        "model": "gpt-4o",
        "rubric": "Test Rubric",
        "cases": [make_case("Failed Case", passed=False, score=0.3)],
    }
    gpt35_entry = {
        "model": "gpt-3.5-turbo",
        "rubric": "Test Rubric",
        "cases": [
            make_case("Passed Case", passed=True, score=0.95),
            make_case("Failed Case 2", passed=False, score=0.2),
        ],
    }

    filtered, counts = filter_failed_evaluations([[gpt4o_entry, gpt35_entry]])

    # Counts computed before filtering: 3 total, 1 passed, 2 failed, 0 warned.
    assert counts == (3, 1, 2, 0)
    # The single suite is kept with both models, since each has a failure.
    assert len(filtered) == 1
    suite = filtered[0]
    assert len(suite) == 2  # Both models have failed cases
    assert len(suite[0]["cases"]) == 1  # First model has 1 failed
    assert len(suite[1]["cases"]) == 1  # Second model has 1 failed
def test_filter_failed_evaluations_model_with_no_failed() -> None:
    """A model whose cases all passed is dropped from the filtered suite."""
    clean_model = {
        "model": "gpt-4o",
        "rubric": "Test Rubric",
        "cases": [
            {
                "name": "Passed Case",
                "input": "Test input",
                "evaluation": create_mock_evaluation_result(
                    passed=True, warning=False, score=0.95
                ),
            },
        ],
    }
    failing_model = {
        "model": "gpt-3.5-turbo",
        "rubric": "Test Rubric",
        "cases": [
            {
                "name": "Failed Case",
                "input": "Test input",
                "evaluation": create_mock_evaluation_result(
                    passed=False, warning=False, score=0.3
                ),
            },
        ],
    }
    all_evaluations = [[clean_model, failing_model]]

    filtered, counts = filter_failed_evaluations(all_evaluations)

    # Counts computed before filtering: 2 total, 1 passed, 1 failed, 0 warned.
    assert counts == (2, 1, 1, 0)
    # Only the model that has a failed case remains in the suite.
    assert len(filtered) == 1
    assert len(filtered[0]) == 1  # Only one model with failed cases
    surviving_model = filtered[0][0]
    assert surviving_model["model"] == "gpt-3.5-turbo"
    assert len(surviving_model["cases"]) == 1
# --- CLI Capture Mode Flag Tests ---
def test_evals_help_shows_capture_flag() -> None:
    """`evals --help` lists --capture and mentions capture mode."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    help_text = _strip_ansi(invocation.output)
    assert "--capture" in help_text
    # The description is matched case-insensitively.
    assert "capture mode" in help_text.lower()
def test_evals_help_shows_include_context_flag() -> None:
    """`evals --help` lists the --include-context flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--include-context" in _strip_ansi(invocation.output)
def test_evals_help_shows_file_flag() -> None:
    """Test that help advertises --output, the replacement for the deprecated --file flag.

    The old --file flag is hidden, so only the presence of its successor is checked.
    """
    result = runner.invoke(cli, ["evals", "--help"])
    assert result.exit_code == 0
    output = _strip_ansi(result.output)
    # Old flag is hidden, new --output should show.
    # Check the long name only: a bare `"-o" in output` test is vacuous because
    # "-o" is also a substring of "--output" and "--only-failed".
    assert "--output" in output
def test_evals_help_shows_format_flag() -> None:
    """`evals --help` lists --output, which replaces the deprecated --format flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    help_text = _strip_ansi(invocation.output)
    # Formats are now selected through the newer --output flag.
    assert "--output" in help_text
# --- New CLI Flags Tests (addressing Eric's review) ---
def test_evals_help_shows_output_flag() -> None:
    """Test that the --output flag is documented in help."""
    result = runner.invoke(cli, ["evals", "--help"])
    assert result.exit_code == 0
    output = _strip_ansi(result.output)
    # Require the long name. The previous `or "-o" in output` alternative made
    # the check vacuous: "-o" is a substring of "--output" itself and of
    # "--only-failed", so it passed even if --output were removed.
    assert "--output" in output
def test_evals_help_shows_api_key_flag() -> None:
    """`evals --help` lists the --api-key flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--api-key" in _strip_ansi(invocation.output)
def test_evals_help_shows_only_failed_flag() -> None:
    """`evals --help` lists the --only-failed flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--only-failed" in _strip_ansi(invocation.output)
def test_evals_help_shows_host_flag() -> None:
    """`evals --help` lists the --host flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--host" in _strip_ansi(invocation.output)
def test_evals_help_shows_port_flag() -> None:
    """`evals --help` lists the --port flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--port" in _strip_ansi(invocation.output)
def test_evals_help_shows_use_provider_flag() -> None:
    """Test that --use-provider is documented in help and described as repeatable."""
    result = runner.invoke(cli, ["evals", "--help"])
    assert result.exit_code == 0
    output = _strip_ansi(result.output)
    # Require the long name. The previous `or "-p" in output` alternative was
    # always satisfied because "-p" is a substring of "--port".
    assert "--use-provider" in output
    # The help text must say the option may be given more than once.
    lowered = output.lower()
    assert "repeatable" in lowered or "can be repeated" in lowered
def test_evals_help_shows_num_runs_flag() -> None:
    """Test that --num-runs flag is documented in help."""
    result = runner.invoke(cli, ["evals", "--help"])
    assert result.exit_code == 0
    output = _strip_ansi(result.output)
    # Require the long name. The previous `or "-n" in output` alternative added
    # nothing: "-n" is a substring of "--num-runs" itself, so the assertion
    # reduced to a check that any "-n..." text appears in the help.
    assert "--num-runs" in output
def test_evals_help_shows_seed_flag() -> None:
    """`evals --help` lists the --seed flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--seed" in _strip_ansi(invocation.output)
def test_evals_help_shows_multi_run_pass_rule_flag() -> None:
    """`evals --help` lists the --multi-run-pass-rule flag."""
    invocation = runner.invoke(cli, ["evals", "--help"])
    assert invocation.exit_code == 0
    assert "--multi-run-pass-rule" in _strip_ansi(invocation.output)
# --- CLI Validation Tests for Multi-Run Flags ---
def test_evals_rejects_num_runs_zero() -> None:
    """Passing --num-runs 0 should surface the '>= 1' CLI error message."""
    cli_args = ["evals", "--num-runs", "0", "."]
    invocation = runner.invoke(cli, cli_args)
    assert "--num-runs must be >= 1" in _strip_ansi(invocation.output)
def test_evals_rejects_num_runs_negative() -> None:
    """Passing a negative --num-runs should surface the '>= 1' CLI error message."""
    cli_args = ["evals", "--num-runs", "-1", "."]
    invocation = runner.invoke(cli, cli_args)
    assert "--num-runs must be >= 1" in _strip_ansi(invocation.output)
def test_evals_rejects_invalid_seed() -> None:
    """A non-numeric --seed string should produce an 'invalid ... seed' CLI error."""
    invocation = runner.invoke(cli, ["evals", "--seed", "foobar", "."])
    lowered = _strip_ansi(invocation.output).lower()
    # Both words must appear somewhere in the case-folded output.
    assert "invalid" in lowered
    assert "seed" in lowered
def test_evals_rejects_negative_seed() -> None:
    """A negative integer --seed should produce a CLI error mentioning the seed."""
    invocation = runner.invoke(cli, ["evals", "--seed", "-5", "."])
    lowered = _strip_ansi(invocation.output).lower()
    assert "seed" in lowered
    # Either phrasing of the constraint is accepted.
    assert any(phrase in lowered for phrase in ("non-negative", "must be"))
def test_evals_rejects_invalid_pass_rule() -> None:
    """An unknown --multi-run-pass-rule value should produce an 'invalid' CLI error."""
    invocation = runner.invoke(cli, ["evals", "--multi-run-pass-rule", "bogus", "."])
    lowered = _strip_ansi(invocation.output).lower()
    assert "invalid" in lowered
    # Underscores are normalised to hyphens so either spelling of the name matches.
    assert "pass-rule" in lowered.replace("_", "-")