@EricGustin you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
>
> Fixes provider selection UX/bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
439 lines
16 KiB
Python
439 lines
16 KiB
Python
import pytest
|
|
from arcade_evals._evalsuite._types import (
|
|
DEFAULT_EVAL_SEED,
|
|
PASS_RULE_LAST,
|
|
PASS_RULE_MAJORITY,
|
|
PASS_RULE_MEAN,
|
|
_resolve_seed_spec,
|
|
)
|
|
from arcade_evals.capture import CapturedRun, CapturedToolCall
|
|
from arcade_evals.eval import (
|
|
EvalRubric,
|
|
EvaluationResult,
|
|
_aggregate_critic_stats,
|
|
_compute_mean_std,
|
|
_resolve_pass_rule,
|
|
)
|
|
|
|
|
|
# ========================================================================
|
|
# _compute_mean_std tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestComputeMeanStd:
    """Unit tests for the ``_compute_mean_std`` helper."""

    def test_empty_list(self) -> None:
        """An empty sample yields 0.0 for both the mean and the deviation."""
        mean_value, deviation = _compute_mean_std([])
        assert mean_value == 0.0
        assert deviation == 0.0

    def test_single_value(self) -> None:
        """A lone sample is its own mean and has zero deviation."""
        samples = [0.75]
        mean_value, deviation = _compute_mean_std(samples)
        assert mean_value == pytest.approx(0.75)
        assert deviation == 0.0

    def test_multiple_values(self) -> None:
        """Identical samples produce that value as the mean and no deviation."""
        samples = [0.5, 0.5]
        mean_value, deviation = _compute_mean_std(samples)
        assert mean_value == pytest.approx(0.5)
        assert deviation == pytest.approx(0.0)

    def test_varying_values(self) -> None:
        """Differing samples produce a strictly positive deviation."""
        samples = [0.0, 1.0]
        mean_value, deviation = _compute_mean_std(samples)
        assert mean_value == pytest.approx(0.5)
        assert deviation > 0.0
|
|
|
|
|
|
# ========================================================================
|
|
# _resolve_seed_spec tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestResolveSeedSpec:
    """Unit tests for ``_resolve_seed_spec``, which returns a ``(mode, value)`` pair."""

    def test_constant_string(self) -> None:
        """The literal "constant" selects constant mode with the default seed."""
        mode, value = _resolve_seed_spec("constant")
        assert mode == "constant"
        assert value == DEFAULT_EVAL_SEED

    def test_random_string(self) -> None:
        """The literal "random" selects random mode and carries no seed value."""
        mode, value = _resolve_seed_spec("random")
        assert mode == "random"
        assert value is None

    def test_integer(self) -> None:
        """An integer spec is treated as a custom seed and returned unchanged."""
        mode, value = _resolve_seed_spec(123)
        assert mode == "custom"
        assert value == 123

    def test_numeric_string(self) -> None:
        """A numeric string is parsed into an integer custom seed."""
        mode, value = _resolve_seed_spec("456")
        assert mode == "custom"
        assert value == 456

    def test_none_defaults_to_constant(self) -> None:
        """``None`` falls back to constant mode with the default seed."""
        mode, value = _resolve_seed_spec(None)
        assert mode == "constant"
        assert value == DEFAULT_EVAL_SEED

    def test_invalid_string_raises(self) -> None:
        """A string that is neither a keyword nor numeric raises ``ValueError``."""
        with pytest.raises(ValueError, match="Invalid seed"):
            _resolve_seed_spec("not-a-seed")

    def test_case_insensitive(self) -> None:
        """Keyword specs are matched regardless of letter case."""
        mode, value = _resolve_seed_spec("CONSTANT")
        assert mode == "constant"
        # Previously `value` was bound but never checked; the upper-case keyword
        # must resolve to the same seed as the lower-case one.
        assert value == DEFAULT_EVAL_SEED

        mode2, value2 = _resolve_seed_spec("RANDOM")
        assert mode2 == "random"
        assert value2 is None
|
|
|
|
|
|
# ========================================================================
|
|
# _resolve_pass_rule tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestResolvePassRule:
    """Unit tests for ``_resolve_pass_rule`` across the last/mean/majority rules."""

    def test_last_rule_returns_last_eval(self) -> None:
        """Under the "last" rule a passing final run yields pass without warning."""
        rubric = EvalRubric()
        first_run = EvaluationResult(score=0.3, passed=False)
        final_run = EvaluationResult(score=0.9, passed=True)
        outcome = _resolve_pass_rule(
            [first_run, final_run],
            mean_score=0.6,
            pass_rule=PASS_RULE_LAST,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is True
        assert did_warn is False

    def test_mean_rule_passes_when_mean_above_threshold(self) -> None:
        """A mean of 0.7 against a 0.6 fail threshold passes without warning."""
        rubric = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)
        evaluations = [EvaluationResult(score=0.5), EvaluationResult(score=0.9)]
        outcome = _resolve_pass_rule(
            evaluations,
            mean_score=0.7,
            pass_rule=PASS_RULE_MEAN,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is True
        assert did_warn is False

    def test_mean_rule_warning(self) -> None:
        """A mean of 0.5 with thresholds 0.6/0.4 does not pass but warns."""
        rubric = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)
        evaluations = [EvaluationResult(score=0.2), EvaluationResult(score=0.8)]
        outcome = _resolve_pass_rule(
            evaluations,
            mean_score=0.5,
            pass_rule=PASS_RULE_MEAN,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is False
        assert did_warn is True

    def test_mean_rule_fails_below_warn(self) -> None:
        """A mean of 0.15 with thresholds 0.6/0.4 neither passes nor warns."""
        rubric = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)
        evaluations = [EvaluationResult(score=0.1), EvaluationResult(score=0.2)]
        outcome = _resolve_pass_rule(
            evaluations,
            mean_score=0.15,
            pass_rule=PASS_RULE_MEAN,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is False
        assert did_warn is False

    def test_majority_rule_passes(self) -> None:
        """Two passing runs out of three pass under the majority rule."""
        rubric = EvalRubric()
        evaluations = [
            EvaluationResult(score=0.9, passed=True),
            EvaluationResult(score=0.9, passed=True),
            EvaluationResult(score=0.1, passed=False),
        ]
        outcome = _resolve_pass_rule(
            evaluations,
            mean_score=0.63,
            pass_rule=PASS_RULE_MAJORITY,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is True
        assert did_warn is False

    def test_majority_rule_warning(self) -> None:
        """One pass, one warning and one failure resolve to a warning, not a pass."""
        rubric = EvalRubric()
        evaluations = [
            EvaluationResult(score=0.8, passed=True),
            EvaluationResult(score=0.5, warning=True),
            EvaluationResult(score=0.1, passed=False),
        ]
        outcome = _resolve_pass_rule(
            evaluations,
            mean_score=0.46,
            pass_rule=PASS_RULE_MAJORITY,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is False
        assert did_warn is True

    def test_empty_evaluations_returns_false(self) -> None:
        """With no evaluations the result is neither a pass nor a warning."""
        rubric = EvalRubric()
        outcome = _resolve_pass_rule(
            [],
            mean_score=0.0,
            pass_rule=PASS_RULE_LAST,
            rubric=rubric,
        )
        did_pass, did_warn = outcome
        assert did_pass is False
        assert did_warn is False

    def test_invalid_rule_raises(self) -> None:
        """An unrecognised pass-rule name raises ``ValueError``."""
        rubric = EvalRubric()
        with pytest.raises(ValueError, match="Invalid multi-run pass rule"):
            _resolve_pass_rule(
                [EvaluationResult(score=0.5)],
                mean_score=0.5,
                pass_rule="invalid",
                rubric=rubric,
            )
|
|
|
|
|
|
# ========================================================================
|
|
# _aggregate_critic_stats tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestAggregateCriticStats:
    """Unit tests for ``_aggregate_critic_stats`` on per-run field scores."""

    def test_basic_aggregation(self) -> None:
        """Scores are collected per field, with 0.0 filled in for runs lacking it."""
        first_run = {"arg_a": {"score": 0.5, "weight": 0.5}}
        second_run = {
            "arg_a": {"score": 0.0, "weight": 0.5},
            "arg_b": {"score": 0.25, "weight": 0.5},
        }
        aggregated = _aggregate_critic_stats([first_run, second_run])

        arg_a = aggregated["arg_a"]
        assert arg_a["run_scores"] == [0.5, 0.0]
        assert arg_a["run_scores_normalized"] == [1.0, 0.0]
        assert arg_a["weight"] == pytest.approx(0.5)

        arg_b = aggregated["arg_b"]
        assert arg_b["run_scores"] == [0.0, 0.25]
        assert arg_b["run_scores_normalized"] == [0.0, 0.5]
        assert arg_b["weight"] == pytest.approx(0.5)

    def test_empty_input(self) -> None:
        """No runs aggregate to an empty mapping."""
        aggregated = _aggregate_critic_stats([])
        assert aggregated == {}

    def test_single_run(self) -> None:
        """A single run reports its own score as the mean with zero deviation."""
        only_run = {"field_x": {"score": 0.8, "weight": 1.0}}
        aggregated = _aggregate_critic_stats([only_run])

        field_x = aggregated["field_x"]
        assert field_x["run_scores"] == [0.8]
        assert field_x["mean_score"] == pytest.approx(0.8)
        assert field_x["std_deviation"] == pytest.approx(0.0)
        assert field_x["weight"] == pytest.approx(1.0)
|
|
|
|
|
|
# ========================================================================
|
|
# CapturedRun tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestCapturedRun:
    """Unit tests for ``CapturedRun.to_dict`` serialization."""

    def test_to_dict_empty(self) -> None:
        """A run constructed with no arguments serializes to an empty call list."""
        serialized = CapturedRun().to_dict()
        assert serialized == {"tool_calls": []}

    def test_to_dict_with_calls(self) -> None:
        """Each captured call is serialized, in order, as a name/args mapping."""
        weather_call = CapturedToolCall(name="GetWeather", args={"city": "NYC"})
        time_call = CapturedToolCall(name="GetTime", args={"tz": "UTC"})
        captured = CapturedRun(tool_calls=[weather_call, time_call])

        serialized = captured.to_dict()
        calls = serialized["tool_calls"]
        assert len(calls) == 2
        assert calls[0] == {"name": "GetWeather", "args": {"city": "NYC"}}
        assert calls[1] == {"name": "GetTime", "args": {"tz": "UTC"}}
|
|
|
|
|
|
# ========================================================================
|
|
# CapturedCase.to_dict tests
|
|
# ========================================================================
|
|
from arcade_evals.capture import CapturedCase
|
|
|
|
|
|
class TestCapturedCaseToDict:
    """Unit tests for ``CapturedCase.to_dict`` and its optional keys."""

    def test_single_run_no_runs_key(self) -> None:
        """An empty ``runs`` list must not produce a 'runs' key in the output."""
        greet_call = CapturedToolCall(name="Greet", args={})
        captured = CapturedCase(
            case_name="test",
            user_message="Hello",
            tool_calls=[greet_call],
            runs=[],
        )
        payload = captured.to_dict()
        assert "runs" not in payload
        assert payload["tool_calls"] == [{"name": "Greet", "args": {}}]

    def test_multi_run_includes_runs(self) -> None:
        """A non-empty ``runs`` list is serialized under the 'runs' key."""
        first_run = CapturedRun(
            tool_calls=[CapturedToolCall(name="Greet", args={"seed": "1"})]
        )
        second_run = CapturedRun(
            tool_calls=[CapturedToolCall(name="Greet", args={"seed": "2"})]
        )
        captured = CapturedCase(
            case_name="test",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Greet", args={})],
            runs=[first_run, second_run],
        )
        payload = captured.to_dict()
        assert "runs" in payload
        assert len(payload["runs"]) == 2
        assert payload["runs"][0]["tool_calls"][0]["args"]["seed"] == "1"

    def test_to_dict_with_context(self) -> None:
        """``include_context=True`` exposes the system message in the output."""
        captured = CapturedCase(
            case_name="test",
            user_message="Hello",
            tool_calls=[],
            system_message="You are helpful",
            additional_messages=[],
        )
        payload = captured.to_dict(include_context=True)
        assert "system_message" in payload
        assert payload["system_message"] == "You are helpful"

    def test_to_dict_with_track_name(self) -> None:
        """A configured track name is carried into the serialized output."""
        captured = CapturedCase(
            case_name="test",
            user_message="Hello",
            tool_calls=[],
            track_name="track_a",
        )
        payload = captured.to_dict()
        assert payload["track_name"] == "track_a"

    def test_to_dict_no_track_name_omits_key(self) -> None:
        """When no track name is supplied the 'track_name' key is left out."""
        captured = CapturedCase(
            case_name="test",
            user_message="Hello",
            tool_calls=[],
        )
        payload = captured.to_dict()
        assert "track_name" not in payload
|
|
|
|
|
|
# ========================================================================
|
|
# _aggregate_critic_stats extended tests
|
|
# ========================================================================
|
|
|
|
|
|
class TestAggregateCriticStatsExtended:
    """Edge-case tests for ``_aggregate_critic_stats``."""

    def test_zero_weight_field(self) -> None:
        """A zero-weight field still aggregates; its normalized scores are 0.0."""
        per_run = [
            {"field_a": {"score": 0.5, "weight": 0.0}},
            {"field_a": {"score": 0.7, "weight": 0.0}},
        ]
        aggregated = _aggregate_critic_stats(per_run)
        field_a = aggregated["field_a"]
        assert field_a["weight"] == pytest.approx(0.0)
        # With a zero weight every normalized score is reported as 0.0.
        assert field_a["run_scores_normalized"] == [0.0, 0.0]

    def test_mixed_presence_across_runs(self) -> None:
        """A field absent from a run contributes 0.0 for that run."""
        only_a = {"field_a": {"score": 0.8, "weight": 1.0}}
        only_b = {"field_b": {"score": 0.5, "weight": 0.5}}
        aggregated = _aggregate_critic_stats([only_a, only_b])
        # field_a: scored 0.8 in the first run, missing (0.0) in the second.
        assert aggregated["field_a"]["run_scores"] == [0.8, 0.0]
        # field_b: missing (0.0) in the first run, scored 0.5 in the second.
        assert aggregated["field_b"]["run_scores"] == [0.0, 0.5]

    def test_consistency_of_mean_and_std(self) -> None:
        """Reported mean and deviation match ``statistics.mean`` / ``pstdev``."""
        from statistics import mean, pstdev

        scores = [0.2, 0.6, 0.4]
        per_run = [
            {"f": {"score": 0.2, "weight": 0.5}},
            {"f": {"score": 0.6, "weight": 0.5}},
            {"f": {"score": 0.4, "weight": 0.5}},
        ]
        aggregated = _aggregate_critic_stats(per_run)
        field_stats = aggregated["f"]
        assert field_stats["mean_score"] == pytest.approx(mean(scores))
        assert field_stats["std_deviation"] == pytest.approx(pstdev(scores))
|
|
|
|
|
|
# ========================================================================
|
|
# PASS_RULE_LAST failure_reason defensive guard test
|
|
# ========================================================================
|
|
|
|
|
|
class TestPassRuleLastFailureReasonGuard:
    """The PASS_RULE_LAST branch should not surface failure_reason when passed.

    These tests do not call ``_run_case_with_stats`` (that would need a mocked
    LLM). They call ``_resolve_pass_rule`` and then re-apply the guard
    expression locally, so they check the pattern rather than the production
    call site.
    """

    def test_last_passed_no_failure_reason(self) -> None:
        """When PASS_RULE_LAST and the last run passed, failure_reason should be None."""
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.3, passed=False, failure_reason="bad"),
            EvaluationResult(score=0.9, passed=True, failure_reason=None),
        ]
        # Only the pass flag matters here; the warning flag is discarded.
        passed, _ = _resolve_pass_rule(evals, 0.6, PASS_RULE_LAST, rubric)
        assert passed is True

        # Guard under test (per the original note it mirrors logic in eval.py;
        # line numbers are omitted because they drift):
        #   failure_reason = run_evaluations[-1].failure_reason if not passed else None
        aggregate_failure_reason = evals[-1].failure_reason if not passed else None
        assert aggregate_failure_reason is None

    def test_last_failed_surfaces_failure_reason(self) -> None:
        """When PASS_RULE_LAST and the last run failed, failure_reason is surfaced."""
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.9, passed=True),
            EvaluationResult(score=0.3, passed=False, failure_reason="tool mismatch"),
        ]
        # Only the pass flag matters here; the warning flag is discarded.
        passed, _ = _resolve_pass_rule(evals, 0.6, PASS_RULE_LAST, rubric)
        assert passed is False

        aggregate_failure_reason = evals[-1].failure_reason if not passed else None
        assert aggregate_failure_reason == "tool mismatch"
|
|
|
|
|
|
# ========================================================================
|
|
# _resolve_pass_rule with MAJORITY edge cases
|
|
# ========================================================================
|
|
|
|
|
|
class TestResolvePassRuleMajorityEdgeCases:
    """Edge cases for ``_resolve_pass_rule`` under ``PASS_RULE_MAJORITY``."""

    def test_majority_all_warned(self) -> None:
        """When all runs have warnings, majority should return warning."""
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.5, passed=False, warning=True),
            EvaluationResult(score=0.5, passed=False, warning=True),
            EvaluationResult(score=0.5, passed=False, warning=True),
        ]
        passed, warning = _resolve_pass_rule(evals, 0.5, PASS_RULE_MAJORITY, rubric)
        assert passed is False
        assert warning is True

    def test_majority_all_failed(self) -> None:
        """When all runs fail, majority should return fail."""
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.1, passed=False),
            EvaluationResult(score=0.2, passed=False),
            EvaluationResult(score=0.15, passed=False),
        ]
        passed, warning = _resolve_pass_rule(evals, 0.15, PASS_RULE_MAJORITY, rubric)
        assert passed is False
        assert warning is False

    def test_majority_tie_does_not_pass(self) -> None:
        """With a 50/50 even split, there is no majority, so it fails."""
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.9, passed=True),
            EvaluationResult(score=0.1, passed=False),
        ]
        # Presumably the required majority is 2 // 2 + 1 = 2, and only one run
        # passed. The warning flag is not asserted in this test, so discard it.
        passed, _ = _resolve_pass_rule(evals, 0.5, PASS_RULE_MAJORITY, rubric)
        assert passed is False

    def test_majority_tie_fails(self) -> None:
        """With more failures than passes, should fail.

        NOTE(review): despite the method name this is not a tie: one run
        passes and two fail. The name is kept so existing test IDs stay stable.
        """
        rubric = EvalRubric()
        evals = [
            EvaluationResult(score=0.9, passed=True),
            EvaluationResult(score=0.1, passed=False),
            EvaluationResult(score=0.1, passed=False),
        ]
        # The warning flag is not asserted in this test, so discard it.
        passed, _ = _resolve_pass_rule(evals, 0.36, PASS_RULE_MAJORITY, rubric)
        assert passed is False
|