@EricGustin, you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
> CLI inputs and output schema (`run_stats`/`critic_stats` and capture
> `runs`), so regressions could affect evaluation results and report
> compatibility even though the changes are additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
> `--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
> validation and plumbing through the CLI runner into eval/capture suite
> execution.
>
> Fixes a provider-selection UX bug by making `--use-provider/-p`
> **repeatable** (instead of a space-delimited string), updates
> docs/examples accordingly, and extends capture mode to optionally record
> **per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
> **propagate and display** per-case `run_stats` and `critic_stats`,
> including new HTML UI for run tabs/cards and comparative tables showing
> mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
> Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
> 2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
> on new commits. Configure
> [here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
# Package-level re-exports: every name imported below is listed in ``__all__``
# so it is reachable directly from this package.
from ._evalsuite._providers import ProviderName
from ._evalsuite._tool_registry import MCPToolDefinition
from .capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult
from .critic import BinaryCritic, DatetimeCritic, NoneCritic, NumericCritic, SimilarityCritic
from .eval import (
    AnyExpectedToolCall,
    EvalRubric,
    EvalSuite,
    ExpectedMCPToolCall,
    ExpectedToolCall,
    NamedExpectedToolCall,
    tool_eval,
)
from .loaders import (
    clear_tools_cache,
    load_arcade_mcp_gateway_async,
    load_from_stdio_async,
    load_mcp_remote_async,
    load_stdio_arcade_async,
)
from .weights import FuzzyWeight, Weight, validate_and_normalize_critic_weights

# Explicit public API. Entries are kept in ASCII order (capitalised class
# names first, then lowercase function names); add new exports in place.
__all__ = [
    "AnyExpectedToolCall",
    "BinaryCritic",
    "CaptureResult",
    "CapturedCase",
    "CapturedRun",
    "CapturedToolCall",
    "DatetimeCritic",
    "EvalRubric",
    "EvalSuite",
    "ExpectedMCPToolCall",
    "ExpectedToolCall",
    "FuzzyWeight",
    "MCPToolDefinition",
    "NamedExpectedToolCall",
    "NoneCritic",
    "NumericCritic",
    "ProviderName",
    "SimilarityCritic",
    "Weight",
    "clear_tools_cache",
    "load_arcade_mcp_gateway_async",
    "load_from_stdio_async",
    "load_mcp_remote_async",
    "load_stdio_arcade_async",
    "tool_eval",
    "validate_and_normalize_critic_weights",
]
|