diff --git a/examples/evals/README.md b/examples/evals/README.md
index cf1371f6..118c12a7 100644
--- a/examples/evals/README.md
+++ b/examples/evals/README.md
@@ -40,7 +40,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \
# Compare multiple models
arcade evals examples/evals/eval_stdio_mcp_server.py \
- -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+ -p openai:gpt-4o \
+ -p anthropic:claude-sonnet-4-5-20250929 \
-k openai:YOUR_OPENAI_KEY \
-k anthropic:YOUR_ANTHROPIC_KEY
@@ -205,7 +206,8 @@ export ARCADE_API_KEY=your_key
export ARCADE_USER_ID=your_user_id
arcade evals examples/evals/eval_comprehensive_comparison.py \
- -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+ -p openai:gpt-4o \
+ -p anthropic:claude-sonnet-4-5-20250929 \
-k openai:YOUR_KEY \
-k anthropic:YOUR_KEY \
-o comparison.html -d
@@ -213,19 +215,22 @@ arcade evals examples/evals/eval_comprehensive_comparison.py \
## 🎯 CLI Reference
-### New v2.0.0 Flags
+### Flags
-| Flag | Short | Description | Example |
-| --------------------- | ------- | -------------------------------------------------- | ------------------------------------------------- |
-| `--use-provider` | `-p` | Provider(s) and models (space-separated) | `-p "openai:gpt-4o anthropic:claude-sonnet"` |
-| `--api-key` | `-k` | API key in`provider:key` format (repeatable) | `-k openai:sk-... -k anthropic:sk-ant-...` |
-| `--output` | `-o` | Output file (auto-detects format from extension) | `-o results.html` or `-o results` (all formats) |
-| `--only-failed` | `-f` | Show only failed evaluations | `--only-failed` |
-| `--include-context` | | Include system messages and conversation history | `--include-context` |
-| `--details` | `-d` | Show detailed output | `-d` |
-| `--max-concurrent` | | Max concurrent evaluations | `--max-concurrent 5` |
-| `--capture` | | Capture mode (record tool calls without scoring) | `--capture` |
+| Flag | Short | Description | Example |
+| ----------------------- | ----- | ----------------------------------------------------- | --------------------------------------------------- |
+| `--use-provider` | `-p` | Provider, optionally with comma-separated models (repeatable) | `-p openai:gpt-4o -p anthropic:claude-sonnet` |
+| `--api-key` | `-k` | API key in `provider:key` format (repeatable) | `-k openai:sk-... -k anthropic:sk-ant-...` |
+| `--output` | `-o` | Output file (auto-detects format from extension) | `-o results.html` or `-o results` (all formats) |
+| `--only-failed` | `-f` | Show only failed evaluations | `--only-failed` |
+| `--include-context` | | Include system messages and conversation history | `--include-context` |
+| `--details` | `-d` | Show detailed output | `-d` |
+| `--max-concurrent` | | Max concurrent evaluations | `--max-concurrent 5` |
+| `--capture` | | Capture mode (record tool calls without scoring) | `--capture` |
+| `--num-runs` | `-n` | Number of runs per case (default: 1) | `-n 5` |
+| `--seed` | | Seed policy: `constant`, `random`, or an integer | `--seed random` or `--seed 42` |
+| `--multi-run-pass-rule` | | Aggregation rule: `last` (default), `mean`, `majority` | `--multi-run-pass-rule majority` |
### Provider & Model Selection
@@ -238,14 +243,15 @@ arcade evals eval_file.py -p openai -k openai:YOUR_KEY
**Single provider with specific models:**
```bash
-arcade evals eval_file.py -p "openai:gpt-4o,gpt-4o-mini" -k openai:YOUR_KEY
+arcade evals eval_file.py -p openai:gpt-4o,gpt-4o-mini -k openai:YOUR_KEY
```
-**Multiple providers (space-separated):**
+**Multiple providers (use separate `-p` flags):**
```bash
arcade evals eval_file.py \
- -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+ -p openai:gpt-4o \
+ -p anthropic:claude-sonnet-4-5-20250929 \
-k openai:YOUR_KEY \
-k anthropic:YOUR_KEY
```
@@ -279,7 +285,7 @@ arcade evals eval_file.py \
```bash
arcade evals examples/evals/eval_arcade_gateway.py \
- -p "openai:gpt-4o,gpt-4o-mini,gpt-3.5-turbo" \
+ -p openai:gpt-4o,gpt-4o-mini \
-k openai:YOUR_KEY \
-o comparison.html -d
```
@@ -288,7 +294,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \
```bash
arcade evals examples/evals/eval_stdio_mcp_server.py \
- -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+ -p openai:gpt-4o \
+ -p anthropic:claude-sonnet-4-5-20250929 \
-k openai:YOUR_OPENAI_KEY \
-k anthropic:YOUR_ANTHROPIC_KEY \
-o battle.html -d
@@ -307,7 +314,8 @@ arcade evals examples/evals/eval_http_mcp_server.py \
```bash
# Compare performance across multiple tool sources
arcade evals examples/evals/eval_comprehensive_comparison.py \
- -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+ -p openai:gpt-4o \
+ -p anthropic:claude-sonnet-4-5-20250929 \
-k openai:YOUR_KEY \
-k anthropic:YOUR_KEY \
-o comparison.html -d
@@ -332,6 +340,46 @@ arcade evals examples/evals/eval_stdio_mcp_server.py \
-o full_results.html -d
```
+### Pattern 7: Multi-Run Evaluation
+
+Run each case multiple times to measure consistency and reduce variance:
+
+```bash
+# Run each case 5 times with random seeds, pass if majority of runs pass
+arcade evals examples/evals/eval_arcade_gateway.py \
+ --api-key openai:YOUR_KEY \
+ --num-runs 5 \
+ --seed random \
+ --multi-run-pass-rule majority \
+ -o stability.html -d
+```
+
+The output will include per-case statistics: mean score, standard deviation,
+individual run results, and per-critic field breakdowns.
+
+**Seed policies:**
+- `constant` (default) — Uses a fixed seed (42) for reproducible results
+- `random` — Uses a different random seed per run for variance testing
+- An integer (e.g., `--seed 123`) — Uses the given seed for all runs
+
+**Pass rules:**
+- `last` (default) — Uses the last run's pass/fail result
+- `mean` — Passes if the mean score meets the rubric threshold
+- `majority` — Passes if more than half of the runs pass
+
+### Pattern 8: Multi-Run Capture Mode
+
+Capture mode also supports multiple runs:
+
+```bash
+arcade evals examples/evals/eval_arcade_gateway.py \
+ --capture \
+ --num-runs 3 \
+ --seed random \
+ --api-key openai:YOUR_KEY \
+ -o captured.json
+```
+
## 🐛 Troubleshooting
### Error: "No module named 'openai'"
diff --git a/libs/arcade-cli/arcade_cli/evals_runner.py b/libs/arcade-cli/arcade_cli/evals_runner.py
index 4c16d5ec..ed327325 100644
--- a/libs/arcade-cli/arcade_cli/evals_runner.py
+++ b/libs/arcade-cli/arcade_cli/evals_runner.py
@@ -159,6 +159,9 @@ async def _run_eval_task(
suite_func: Callable[..., Any],
model_spec: ModelSpec,
max_concurrent: int,
+ num_runs: int,
+ seed: str | int,
+ multi_run_pass_rule: str,
include_context: bool = False,
) -> EvalTaskResult:
"""
@@ -175,6 +178,9 @@ async def _run_eval_task(
max_concurrency=max_concurrent,
provider=model_spec.provider.value,
include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
)
return EvalTaskResult.from_success(
suite_name, model_spec.model, model_spec.provider.value, result
@@ -198,6 +204,8 @@ async def _run_capture_task(
model_spec: ModelSpec,
max_concurrent: int,
include_context: bool,
+ num_runs: int,
+ seed: str | int,
) -> CaptureTaskResult:
"""
Run a single capture task with error handling.
@@ -214,6 +222,8 @@ async def _run_capture_task(
provider=model_spec.provider.value,
capture_mode=True,
include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
return CaptureTaskResult.from_success(
suite_name, model_spec.model, model_spec.provider.value, result
@@ -246,6 +256,9 @@ async def run_evaluations(
output_format: str,
failed_only: bool,
console: Console,
+ num_runs: int,
+ seed: str | int,
+ multi_run_pass_rule: str,
include_context: bool = False,
) -> None:
"""
@@ -262,6 +275,9 @@ async def run_evaluations(
output_format: Format for file output ('txt', 'md').
failed_only: Whether to show only failed evaluations.
console: Rich console for output.
+ num_runs: Number of runs per case.
+ seed: Seed policy ("constant", "random", or an integer seed).
+ multi_run_pass_rule: How to determine pass/warn for multi-run cases.
include_context: Whether to include system_message and additional_messages.
"""
tasks = []
@@ -280,6 +296,9 @@ async def run_evaluations(
model_spec=model_spec,
max_concurrent=max_concurrent,
include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
)
)
tasks.append(task)
@@ -370,6 +389,8 @@ async def run_capture(
output_file: str | None,
output_format: str,
console: Console,
+ num_runs: int,
+ seed: str | int,
) -> None:
"""
Run evaluation suites in capture mode and output results.
@@ -385,6 +406,8 @@ async def run_capture(
output_file: Optional file path to write results.
output_format: Output format ('json', 'txt', 'md', 'html').
console: Rich console for output.
+ num_runs: Number of runs per case.
+ seed: Seed policy ("constant", "random", or an integer seed).
"""
tasks = []
@@ -402,6 +425,8 @@ async def run_capture(
model_spec=model_spec,
max_concurrent=max_concurrent,
include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
)
tasks.append(task)
diff --git a/libs/arcade-cli/arcade_cli/formatters/base.py b/libs/arcade-cli/arcade_cli/formatters/base.py
index 3b1d6166..ebe6a2de 100644
--- a/libs/arcade-cli/arcade_cli/formatters/base.py
+++ b/libs/arcade-cli/arcade_cli/formatters/base.py
@@ -214,11 +214,21 @@ def group_comparative_by_case(
}
# Store this track's result for this case
- comparative_groups[model][base_suite][case_name]["tracks"][track_name] = {
+ track_result: dict[str, Any] = {
"evaluation": evaluation,
"name": case_name,
"input": case.get("input", ""),
}
+ run_stats = case.get("run_stats")
+ if run_stats:
+ track_result["run_stats"] = run_stats
+ critic_stats = case.get("critic_stats")
+ if critic_stats:
+ track_result["critic_stats"] = critic_stats
+
+ comparative_groups[model][base_suite][case_name]["tracks"][track_name] = (
+ track_result
+ )
return (
comparative_groups,
@@ -414,11 +424,19 @@ def group_comparative_by_case_first(
}
# Store this track's result
- case_groups[base_suite][case_name][model]["tracks"][track_name] = {
+ track_result: dict[str, Any] = {
"evaluation": evaluation,
"name": case_name,
"input": case.get("input", ""),
}
+ run_stats = case.get("run_stats")
+ if run_stats:
+ track_result["run_stats"] = run_stats
+ critic_stats = case.get("critic_stats")
+ if critic_stats:
+ track_result["critic_stats"] = critic_stats
+
+ case_groups[base_suite][case_name][model]["tracks"][track_name] = track_result
return (
case_groups,
@@ -539,11 +557,17 @@ def group_eval_for_comparison(
comparison_data[suite_name][case_name] = {}
# Store this model's result for this case
- comparison_data[suite_name][case_name][model] = {
+ case_entry: dict[str, Any] = {
"evaluation": evaluation,
"input": case.get("input", ""),
"name": case_name,
}
+ # Propagate multi-run stats if present
+ if case.get("run_stats"):
+ case_entry["run_stats"] = case["run_stats"]
+ if case.get("critic_stats"):
+ case_entry["critic_stats"] = case["critic_stats"]
+ comparison_data[suite_name][case_name][model] = case_entry
# Calculate pass rates
for _model, stats in per_model_stats.items():
diff --git a/libs/arcade-cli/arcade_cli/formatters/html.py b/libs/arcade-cli/arcade_cli/formatters/html.py
index ab900aa2..c267e70b 100644
--- a/libs/arcade-cli/arcade_cli/formatters/html.py
+++ b/libs/arcade-cli/arcade_cli/formatters/html.py
@@ -163,10 +163,18 @@ class HtmlFormatter(EvalResultFormatter):
# Show summary table only when NOT showing details (avoid duplication)
if not show_details:
- html_parts.append('
')
- html_parts.append(
- "Status Case Score "
+ has_run_stats = any(
+ case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases
)
+ html_parts.append('')
+ if has_run_stats:
+ html_parts.append(
+ "Status Case Score Runs "
+ )
+ else:
+ html_parts.append(
+ "Status Case Score "
+ )
html_parts.append("")
for case in cases:
@@ -183,11 +191,20 @@ class HtmlFormatter(EvalResultFormatter):
score_pct = evaluation.score * 100
case_name = self._escape_html(case["name"])
+ run_stats = case.get("run_stats") or {}
+ score_display = f"{score_pct:.1f}%"
+ runs_display = ""
+ if run_stats.get("num_runs", 1) > 1:
+ std_pct = run_stats.get("std_deviation", 0.0) * 100
+ score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%"
+ runs_display = str(run_stats.get("num_runs", 1))
html_parts.append(f'')
html_parts.append(f'{status_text} ')
html_parts.append(f"{case_name} ")
- html_parts.append(f'{score_pct:.1f}% ')
+ html_parts.append(f'{score_display} ')
+ if has_run_stats:
+ html_parts.append(f"{runs_display or '-'} ")
html_parts.append(" ")
html_parts.append("
")
@@ -254,7 +271,15 @@ class HtmlFormatter(EvalResultFormatter):
html_parts.append("")
# Evaluation details
- html_parts.append(self._format_evaluation_details(evaluation))
+ run_id = self._make_safe_id(suite_name, case["name"], model)
+ html_parts.append(
+ self._format_evaluation_details(
+ evaluation,
+ case.get("run_stats"),
+ case.get("critic_stats"),
+ run_id=run_id,
+ )
+ )
html_parts.append("")
html_parts.append("")
@@ -267,18 +292,50 @@ class HtmlFormatter(EvalResultFormatter):
return "\n".join(html_parts)
- def _format_evaluation_details(self, evaluation: Any) -> str:
+ def _format_evaluation_details(
+ self,
+ evaluation: Any,
+ run_stats: dict[str, Any] | None = None,
+ critic_stats: dict[str, Any] | None = None,
+ run_id: str | None = None,
+ ) -> str:
"""Format evaluation details as HTML table."""
- if evaluation.failure_reason:
- return f'❌ Failure Reason: {self._escape_html(evaluation.failure_reason)}
'
+ parts: list[str] = []
+ run_stats_html = self._format_run_stats_html(run_stats, evaluation)
+ if run_stats_html:
+ parts.append(run_stats_html)
+
+ run_tabs_html = self._format_run_tabs_html(run_stats, run_id)
+ if run_tabs_html:
+ parts.append(run_tabs_html)
+
+ critic_stats_html = self._format_critic_stats_html(critic_stats)
+ if critic_stats_html:
+ parts.append(critic_stats_html)
+
+ if evaluation.failure_reason:
+ parts.append(
+ f'❌ Failure Reason: '
+ f"{self._escape_html(evaluation.failure_reason)}
"
+ )
+ return "\n".join(parts)
+
+ # Only show field details table when there are NO run tabs
+ # (run tabs already show per-run field details, and Critic Stats shows the aggregation)
+ if not run_tabs_html:
+ details_table = self._format_critic_results_table(evaluation.results)
+ parts.append(details_table)
+ return "\n".join(parts)
+
+ def _format_critic_results_table(self, results: list[dict[str, Any]]) -> str:
lines = ['']
lines.append(
"Field Match Score Expected Actual "
)
lines.append("")
- for critic_result in evaluation.results:
+ for critic_result in results:
is_criticized = critic_result.get("is_criticized", True)
field = self._escape_html(critic_result["field"])
score = critic_result["score"]
@@ -314,6 +371,177 @@ class HtmlFormatter(EvalResultFormatter):
lines.append("
")
return "\n".join(lines)
+ def _format_run_stats_html(self, run_stats: dict[str, Any] | None, evaluation: Any) -> str:
+ # Build the HTML summary block for a case that was run multiple times.
+ # Returns "" when run_stats is missing/empty or reports fewer than 2 runs,
+ # so single-run cases get no summary block.
+ # NOTE(review): the markup inside the string literals below is not visible in
+ # this view; presumably the status, run-count, seed-policy, pass-rule and seeds
+ # values computed here are interpolated into the template - confirm.
+ if not run_stats or run_stats.get("num_runs", 1) < 2:
+ return ""
+ # Overall status is taken from the evaluation: passed first, then warning,
+ # otherwise failed.
+ if evaluation.passed:
+ status_label = "PASSED"
+ status_icon = "✅"
+ status_class = "passed"
+ elif evaluation.warning:
+ status_label = "WARNED"
+ status_icon = "⚠️"
+ status_class = "warned"
+ else:
+ status_label = "FAILED"
+ status_icon = "❌"
+ status_class = "failed"
+ # Scores are scaled by 100 for display as percentages.
+ mean_pct = run_stats.get("mean_score", 0.0) * 100
+ std_pct = run_stats.get("std_deviation", 0.0) * 100
+ num_runs = run_stats.get("num_runs", 0)
+ scores = run_stats.get("scores", [])
+ seed_policy = run_stats.get("seed_policy", "")
+ # "or []" also covers an explicit None stored under "run_seeds".
+ run_seeds = run_stats.get("run_seeds") or []
+ pass_rule = run_stats.get("pass_rule", "")
+
+ # Build score pills for each run (labelled R1, R2, ...).
+ # Pill class by raw score: >= 0.8 high, >= 0.6 mid, otherwise low.
+ score_pills = []
+ for i, score in enumerate(scores, 1):
+ score_pct = score * 100
+ if score >= 0.8:
+ pill_class = "score-pill high"
+ elif score >= 0.6:
+ pill_class = "score-pill mid"
+ else:
+ pill_class = "score-pill low"
+ score_pills.append(f'R{i}: {score_pct:.0f}% ')
+ scores_html = " ".join(score_pills) if score_pills else ""
+
+ # Build seeds display; left empty unless at least one run seed is not None.
+ seeds_html = ""
+ if run_seeds and any(seed is not None for seed in run_seeds):
+ seeds_display = ", ".join(str(seed) for seed in run_seeds)
+ seeds_html = f'🎲 Seeds {seeds_display}
'
+
+ html = f"""
+
+
+
+
+ {mean_pct:.1f}%
+ mean score
+
+
+ ± {std_pct:.1f}%
+ std dev
+
+
+
+
{scores_html}
+
+
+
"""
+ return html
+
+ def _format_critic_stats_html(self, critic_stats: dict[str, Any] | None) -> str:
+ # Render the per-field "Critic Stats" table as HTML, one row per entry in
+ # critic_stats (field name -> stats dict). Returns "" when critic_stats is
+ # None or empty.
+ if not critic_stats:
+ return ""
+ lines = ['📊 Critic Stats ']
+ lines.append('
')
+ lines.append(
+ "Field Weight Mean (norm %) "
+ "Std (norm %) Mean (weighted %) "
+ "Std (weighted %) "
+ )
+ lines.append("")
+ for field, stats in critic_stats.items():
+ # Missing stats default to 0.0; all means/deviations are scaled by 100
+ # for percentage display, the weight is shown as-is.
+ weight = stats.get("weight", 0.0)
+ mean_norm = stats.get("mean_score_normalized", 0.0) * 100
+ std_norm = stats.get("std_deviation_normalized", 0.0) * 100
+ mean_weighted = stats.get("mean_score", 0.0) * 100
+ std_weighted = stats.get("std_deviation", 0.0) * 100
+ # Row class from the normalized mean (in percent):
+ # < 60 -> score-low, 60 to < 80 -> score-mid, >= 80 -> score-high.
+ if mean_norm < 60:
+ score_class = "score-low"
+ elif mean_norm < 80:
+ score_class = "score-mid"
+ else:
+ score_class = "score-high"
+ # The field name is HTML-escaped before being emitted.
+ lines.append(
+ f''
+ f"{self._escape_html(field)} "
+ f"{weight:.2f} "
+ f'{mean_norm:.2f}% '
+ f"{std_norm:.2f}% "
+ f"{mean_weighted:.2f}% "
+ f"{std_weighted:.2f}% "
+ " "
+ )
+ lines.append("
")
+ return "\n".join(lines)
+
+ def _format_run_tabs_html(self, run_stats: dict[str, Any] | None, run_id: str | None) -> str:
+ # Build the per-run tab strip and matching panels for a multi-run case.
+ # Returns "" when run_stats is missing/empty, reports fewer than 2 runs,
+ # has no "runs" entries, or when run_id is None.
+ # NOTE(review): the markup in the string literals is not visible in this view;
+ # presumably run_id is used to build unique element ids per case - confirm.
+ if not run_stats or run_stats.get("num_runs", 1) < 2:
+ return ""
+ runs = run_stats.get("runs", [])
+ if not runs or run_id is None:
+ return ""
+
+ tabs = ['', '
']
+ # One tab per run, numbered from 1; only the first run is marked "active".
+ for idx, run in enumerate(runs, start=1):
+ active = "active" if idx == 1 else ""
+ # Status precedence per run: passed, then warning, otherwise failed.
+ if run.get("passed"):
+ status_class = "passed"
+ elif run.get("warning"):
+ status_class = "warned"
+ else:
+ status_class = "failed"
+ tabs.append(
+ f'Run {idx} '
+ )
+ tabs.append("
")
+
+ panels = ['
']
+ # One panel per run, in the same order and with the same status precedence
+ # as the tabs above.
+ for idx, run in enumerate(runs, start=1):
+ active = "active" if idx == 1 else ""
+ if run.get("passed"):
+ status = "✅ PASSED"
+ status_class = "passed"
+ elif run.get("warning"):
+ status = "⚠️ WARNED"
+ status_class = "warned"
+ else:
+ status = "❌ FAILED"
+ status_class = "failed"
+ # Run score is scaled by 100 for percentage display.
+ score_pct = run.get("score", 0.0) * 100
+ details = run.get("details", [])
+ panels.append(
+ f'
'
+ )
+ panels.append(f"
Run {idx}: {status} — {score_pct:.2f}%
")
+ # The failure reason, when present, is stringified and HTML-escaped.
+ failure_reason = run.get("failure_reason")
+ if failure_reason:
+ panels.append(
+ f'
❌ Failure Reason: '
+ f"{self._escape_html(str(failure_reason))}
"
+ )
+ # Per-run critic details reuse the shared critic results table renderer.
+ if details:
+ panels.append(self._format_critic_results_table(details))
+ panels.append("
")
+ panels.append("
")
+
+ # Tabs come first, then panels, joined one fragment per line.
+ return "\n".join(tabs + panels)
+
def _escape_html(self, text: str) -> str:
"""Escape HTML special characters."""
return (
@@ -534,6 +762,7 @@ class HtmlFormatter(EvalResultFormatter):
for model in model_order:
if model in case_models:
evaluation = case_models[model]["evaluation"]
+ run_stats = case_models[model].get("run_stats")
score = evaluation.score * 100
if evaluation.passed:
cell_class = "passed"
@@ -544,7 +773,15 @@ class HtmlFormatter(EvalResultFormatter):
else:
cell_class = "failed"
icon = "✗"
- html_parts.append(f'{icon} {score:.0f}% ')
+ if run_stats and run_stats.get("num_runs", 1) > 1:
+ std_pct = run_stats.get("std_deviation", 0.0) * 100
+ runs = run_stats.get("num_runs", 1)
+ html_parts.append(
+ f'{icon} '
+ f"{score:.0f}% ± {std_pct:.0f}%n={runs} "
+ )
+ else:
+ html_parts.append(f'{icon} {score:.0f}% ')
else:
html_parts.append('- ')
@@ -582,7 +819,15 @@ class HtmlFormatter(EvalResultFormatter):
html_parts.append(
f"{self._escape_html(model)} : Score {evaluation.score * 100:.1f}%"
)
- html_parts.append(self._format_evaluation_details(evaluation))
+ run_id = self._make_safe_id(suite_name, case_name, model)
+ html_parts.append(
+ self._format_evaluation_details(
+ evaluation,
+ case_result.get("run_stats"),
+ case_result.get("critic_stats"),
+ run_id=run_id,
+ )
+ )
html_parts.append("")
html_parts.append("")
@@ -609,9 +854,9 @@ class HtmlFormatter(EvalResultFormatter):
.multi-model-summary .pass-rate { font-weight: bold; }
.multi-model-summary .best-model { background-color: rgba(76, 175, 80, 0.1); }
.best-overall { margin-top: 15px; padding: 10px; background: #1e1e1e; border-radius: 4px; }
- .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; }
- .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; }
- .comparison-table th { background-color: #252525; }
+ .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; }
+ .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; }
+ .comparison-table th { background: linear-gradient(90deg, rgba(137, 180, 250, 0.25), rgba(203, 166, 247, 0.25)); }
.comparison-table .case-name { text-align: left; font-weight: bold; }
.comparison-table .passed { background-color: rgba(76, 175, 80, 0.2); color: #4caf50; }
.comparison-table .failed { background-color: rgba(244, 67, 54, 0.2); color: #f44336; }
@@ -1201,7 +1446,15 @@ class HtmlFormatter(EvalResultFormatter):
f'{self._escape_html(track_name)} '
)
lines.append("")
- lines.append(self._format_evaluation_details(evaluation))
+ run_id = self._make_safe_id(suite_name, case_name, f"{track_name}")
+ lines.append(
+ self._format_evaluation_details(
+ evaluation,
+ track_result.get("run_stats"),
+ track_result.get("critic_stats"),
+ run_id=run_id,
+ )
+ )
lines.append("") # track-panel
lines.append("") # track-panels-container
@@ -1256,6 +1509,9 @@ document.querySelectorAll('.track-tab').forEach(tab => {
--blue: #89b4fa;
--purple: #cba6f7;
--cyan: #94e2d5;
+ --accent: #89b4fa;
+ --accent-2: #cba6f7;
+ --shadow: rgba(0, 0, 0, 0.35);
}
* {
@@ -1282,6 +1538,23 @@ document.querySelectorAll('.track-tab').forEach(tab => {
padding-bottom: 10px;
}
+ /* Critic stats score coloring: red <60%, yellow 60-80%, green >80% */
+ .critic-stats-table tr.score-low td.score-value {
+ color: var(--red);
+ font-weight: bold;
+ }
+ .critic-stats-table tr.score-mid td.score-value {
+ color: var(--yellow);
+ font-weight: bold;
+ }
+ .critic-stats-table tr.score-high td.score-value {
+ color: var(--green);
+ font-weight: bold;
+ }
+ .critic-stats-table tr.score-low { background: rgba(243, 139, 168, 0.08); }
+ .critic-stats-table tr.score-mid { background: rgba(249, 226, 175, 0.08); }
+ .critic-stats-table tr.score-high { background: rgba(166, 227, 161, 0.08); }
+
h2 {
color: var(--blue);
margin-top: 30px;
@@ -1480,6 +1753,234 @@ document.querySelectorAll('.track-tab').forEach(tab => {
font-size: 0.9em;
}
+ .critic-stats {
+ margin: 10px 0;
+ padding: 10px;
+ background: #202020;
+ border-radius: 6px;
+ }
+
+ /* Run Stats Card - Modern Design */
+ .run-stats-card {
+ margin: 15px 0;
+ border-radius: 12px;
+ background: linear-gradient(145deg, #252535, #1a1a2a);
+ border: 1px solid var(--border-color);
+ overflow: hidden;
+ }
+ .run-stats-card.passed { border-left: 4px solid var(--green); }
+ .run-stats-card.warned { border-left: 4px solid var(--yellow); }
+ .run-stats-card.failed { border-left: 4px solid var(--red); }
+
+ .run-stats-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 12px 16px;
+ background: rgba(0, 0, 0, 0.2);
+ border-bottom: 1px solid var(--border-color);
+ }
+
+ .run-status-badge {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ padding: 6px 14px;
+ border-radius: 20px;
+ font-weight: bold;
+ font-size: 0.9em;
+ }
+ .run-status-badge.passed { background: rgba(166, 227, 161, 0.2); color: var(--green); }
+ .run-status-badge.warned { background: rgba(249, 226, 175, 0.2); color: var(--yellow); }
+ .run-status-badge.failed { background: rgba(243, 139, 168, 0.2); color: var(--red); }
+
+ .run-count {
+ color: var(--text-color);
+ font-size: 0.9em;
+ opacity: 0.8;
+ }
+
+ .run-stats-body {
+ padding: 16px;
+ }
+
+ .score-display {
+ display: flex;
+ align-items: flex-end;
+ gap: 20px;
+ margin-bottom: 12px;
+ }
+
+ .score-main {
+ display: flex;
+ flex-direction: column;
+ }
+ .score-main .score-value {
+ font-size: 2.2em;
+ font-weight: bold;
+ color: var(--blue);
+ line-height: 1;
+ }
+ .score-main .score-label {
+ font-size: 0.75em;
+ color: #888;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+ }
+
+ .score-deviation {
+ display: flex;
+ flex-direction: column;
+ }
+ .score-deviation .deviation-value {
+ font-size: 1.3em;
+ font-weight: 600;
+ color: var(--purple);
+ }
+ .score-deviation .deviation-label {
+ font-size: 0.7em;
+ color: #888;
+ text-transform: uppercase;
+ }
+
+ .score-bar-container {
+ height: 8px;
+ background: rgba(255, 255, 255, 0.1);
+ border-radius: 4px;
+ overflow: hidden;
+ margin-bottom: 14px;
+ }
+ .score-bar {
+ height: 100%;
+ border-radius: 4px;
+ transition: width 0.3s ease;
+ }
+ .score-bar.passed { background: linear-gradient(90deg, var(--green), #7ecf7e); }
+ .score-bar.warned { background: linear-gradient(90deg, var(--yellow), #f5d67a); }
+ .score-bar.failed { background: linear-gradient(90deg, var(--red), #e87a94); }
+
+ .run-scores {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 8px;
+ }
+
+ .score-pill {
+ padding: 4px 10px;
+ border-radius: 12px;
+ font-size: 0.8em;
+ font-weight: 600;
+ }
+ .score-pill.high { background: rgba(166, 227, 161, 0.2); color: var(--green); }
+ .score-pill.mid { background: rgba(249, 226, 175, 0.2); color: var(--yellow); }
+ .score-pill.low { background: rgba(243, 139, 168, 0.2); color: var(--red); }
+
+ .run-stats-footer {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 16px;
+ padding: 12px 16px;
+ background: rgba(0, 0, 0, 0.15);
+ border-top: 1px solid var(--border-color);
+ }
+
+ .run-meta-item {
+ display: flex;
+ flex-direction: column;
+ gap: 2px;
+ }
+ .meta-label {
+ font-size: 0.7em;
+ color: #888;
+ text-transform: uppercase;
+ }
+ .meta-value {
+ font-size: 0.85em;
+ color: var(--text-color);
+ }
+ .meta-value.mono {
+ font-family: 'Consolas', 'Monaco', monospace;
+ font-size: 0.75em;
+ color: var(--cyan);
+ }
+
+ .run-tabs {
+ margin: 12px 0;
+ border: 1px solid var(--border-color);
+ border-radius: 6px;
+ background: #1f1f2b;
+ }
+
+ .run-tab-list {
+ display: flex;
+ gap: 6px;
+ padding: 8px;
+ border-bottom: 1px solid var(--border-color);
+ flex-wrap: wrap;
+ }
+
+ .run-tab {
+ background: #2a2a3a;
+ color: var(--text-color);
+ border: 1px solid var(--border-color);
+ border-radius: 4px;
+ padding: 6px 10px;
+ cursor: pointer;
+ }
+
+ .run-tab.active {
+ background: var(--blue);
+ color: #111;
+ border-color: var(--blue);
+ }
+
+ .run-tab.passed {
+ border-color: var(--green);
+ }
+
+ .run-tab.warned {
+ border-color: var(--yellow);
+ }
+
+ .run-tab.failed {
+ border-color: var(--red);
+ }
+
+ .run-panels {
+ padding: 10px;
+ }
+
+ .run-panel {
+ display: none;
+ }
+
+ .run-panel.active {
+ display: block;
+ }
+
+ .run-panel.passed {
+ border-left: 3px solid var(--green);
+ padding-left: 10px;
+ }
+
+ .run-panel.warned {
+ border-left: 3px solid var(--yellow);
+ padding-left: 10px;
+ }
+
+ .run-panel.failed {
+ border-left: 3px solid var(--red);
+ padding-left: 10px;
+ }
+
+ .run-status.passed { color: var(--green); }
+ .run-status.warned { color: var(--yellow); }
+ .run-status.failed { color: var(--red); }
+
+ .aggregate-details {
+ margin-top: 10px;
+ }
+
.field-name {
color: var(--purple);
font-weight: 600;
@@ -1954,6 +2455,39 @@ document.querySelectorAll('.track-tab').forEach(tab => {
margin: 8px 0;
}
+
"""
@@ -1994,19 +2528,48 @@ class CaptureHtmlFormatter(CaptureFormatter):
for case in capture.captured_cases:
total_cases += 1
tool_calls_html = []
+ runs = getattr(case, "runs", None)
- for tc in case.tool_calls:
- total_calls += 1
- args_html = ""
- if tc.args:
- args_json = json.dumps(tc.args, indent=2)
- args_html = f'{self._escape_html(args_json)} '
- tool_calls_html.append(
- f''
- f'{self._escape_html(tc.name)} '
- f"{args_html}"
- f"
"
- )
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ run_calls_html = []
+ for tc in run.tool_calls:
+ total_calls += 1
+ args_html = ""
+ if tc.args:
+ args_json = json.dumps(tc.args, indent=2)
+ args_html = (
+ f'{self._escape_html(args_json)} '
+ )
+ run_calls_html.append(
+ f''
+ f'{self._escape_html(tc.name)} '
+ f"{args_html}"
+ f"
"
+ )
+ if not run_calls_html:
+ run_calls_html.append(
+ 'No tool calls captured
'
+ )
+ tool_calls_html.append(
+ f''
+ f'Run {run_index} '
+ f'{"".join(run_calls_html)}'
+ f" "
+ )
+ else:
+ for tc in case.tool_calls:
+ total_calls += 1
+ args_html = ""
+ if tc.args:
+ args_json = json.dumps(tc.args, indent=2)
+ args_html = f'{self._escape_html(args_json)} '
+ tool_calls_html.append(
+ f''
+ f'{self._escape_html(tc.name)} '
+ f"{args_html}"
+ f"
"
+ )
if not tool_calls_html:
tool_calls_html.append('No tool calls captured
')
@@ -2498,7 +3061,31 @@ class CaptureHtmlFormatter(CaptureFormatter):
f'{self._escape_html(model)}
'
)
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ html_parts.append(
+ f''
+ f"Run {run_index} "
+ )
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ total_calls += 1
+ args_html = ""
+ if tc.args:
+ args_json = json.dumps(tc.args, indent=2)
+ args_html = f'{self._escape_html(args_json)} '
+ html_parts.append(
+ f''
+ f'{self._escape_html(tc.name)} '
+ f"{args_html}
"
+ )
+ else:
+ html_parts.append(
+ 'No tool calls
'
+ )
+ html_parts.append(" ")
+ elif captured_case.tool_calls:
for tc in captured_case.tool_calls:
total_calls += 1
args_html = ""
@@ -2539,7 +3126,29 @@ class CaptureHtmlFormatter(CaptureFormatter):
f'{self._escape_html(model)}
'
)
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ html_parts.append(
+ f''
+ f"Run {run_index} "
+ )
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ total_calls += 1
+ args_html = ""
+ if tc.args:
+ args_json = json.dumps(tc.args, indent=2)
+ args_html = f'{self._escape_html(args_json)} '
+ html_parts.append(
+ f''
+ f'{self._escape_html(tc.name)} '
+ f"{args_html}
"
+ )
+ else:
+ html_parts.append('No tool calls
')
+ html_parts.append(" ")
+ elif captured_case.tool_calls:
for tc in captured_case.tool_calls:
total_calls += 1
args_html = ""
@@ -2704,6 +3313,19 @@ document.querySelectorAll('.track-tab').forEach(tab => {{
text-transform: uppercase;
margin-bottom: 0.5rem;
}}
+ .capture-run {{
+ margin-bottom: 0.75rem;
+ background: var(--bg-primary);
+ border: 1px solid var(--border);
+ border-radius: 6px;
+ padding: 0.5rem 0.75rem;
+ }}
+ .capture-run summary {{
+ cursor: pointer;
+ font-weight: 600;
+ color: var(--accent);
+ margin-bottom: 0.5rem;
+ }}
.tool-call {{
background: var(--bg-primary);
border-left: 3px solid var(--accent);
diff --git a/libs/arcade-cli/arcade_cli/formatters/json.py b/libs/arcade-cli/arcade_cli/formatters/json.py
index 361974b2..e5ebcb0b 100644
--- a/libs/arcade-cli/arcade_cli/formatters/json.py
+++ b/libs/arcade-cli/arcade_cli/formatters/json.py
@@ -249,6 +249,13 @@ class JsonFormatter(EvalResultFormatter):
if evaluation.failure_reason:
track_data["failure_reason"] = evaluation.failure_reason
+ run_stats = track_result.get("run_stats")
+ if run_stats:
+ track_data["run_stats"] = run_stats
+ critic_stats = track_result.get("critic_stats")
+ if critic_stats:
+ track_data["critic_stats"] = critic_stats
+
if show_details and evaluation.results:
track_data["details"] = self._serialize_critic_results(
evaluation.results
@@ -375,6 +382,13 @@ class JsonFormatter(EvalResultFormatter):
if evaluation.failure_reason:
track_data["failure_reason"] = evaluation.failure_reason
+ run_stats = track_result.get("run_stats")
+ if run_stats:
+ track_data["run_stats"] = run_stats
+ critic_stats = track_result.get("critic_stats")
+ if critic_stats:
+ track_data["critic_stats"] = critic_stats
+
if show_details and evaluation.results:
track_data["details"] = self._serialize_critic_results(
evaluation.results
@@ -496,6 +510,13 @@ class JsonFormatter(EvalResultFormatter):
if evaluation.failure_reason:
model_data["failure_reason"] = evaluation.failure_reason
+ run_stats = case_result.get("run_stats")
+ if run_stats:
+ model_data["run_stats"] = run_stats
+ critic_stats = case_result.get("critic_stats")
+ if critic_stats:
+ model_data["critic_stats"] = critic_stats
+
if show_details and evaluation.results:
model_data["details"] = self._serialize_critic_results(evaluation.results)
@@ -537,6 +558,13 @@ class JsonFormatter(EvalResultFormatter):
if evaluation.failure_reason:
case_data["failure_reason"] = evaluation.failure_reason
+ run_stats = case.get("run_stats")
+ if run_stats:
+ case_data["run_stats"] = run_stats
+ critic_stats = case.get("critic_stats")
+ if critic_stats:
+ case_data["critic_stats"] = critic_stats
+
if show_details and evaluation.results:
case_data["details"] = self._serialize_critic_results(evaluation.results)
@@ -657,12 +685,24 @@ class CaptureJsonFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- track_output["models"][model] = {
+ model_output: dict[str, Any] = {
"tool_calls": [
{"name": tc.name, "args": tc.args}
for tc in captured_case.tool_calls
],
}
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ model_output["runs"] = [
+ {
+ "tool_calls": [
+ {"name": tc.name, "args": tc.args}
+ for tc in run.tool_calls
+ ]
+ }
+ for run in runs
+ ]
+ track_output["models"][model] = model_output
case_output["tracks"][track_display] = track_output
else:
@@ -678,12 +718,23 @@ class CaptureJsonFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- case_output["models"][model] = {
+ model_output = {
"tool_calls": [
{"name": tc.name, "args": tc.args}
for tc in captured_case.tool_calls
],
}
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ model_output["runs"] = [
+ {
+ "tool_calls": [
+ {"name": tc.name, "args": tc.args} for tc in run.tool_calls
+ ]
+ }
+ for run in runs
+ ]
+ case_output["models"][model] = model_output
output["grouped_by_case"][suite_name][case_name] = case_output
diff --git a/libs/arcade-cli/arcade_cli/formatters/markdown.py b/libs/arcade-cli/arcade_cli/formatters/markdown.py
index ea5b3486..f6fef9c8 100644
--- a/libs/arcade-cli/arcade_cli/formatters/markdown.py
+++ b/libs/arcade-cli/arcade_cli/formatters/markdown.py
@@ -89,37 +89,16 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append("## Summary")
lines.append("")
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
- lines.append("")
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {orig_total} |")
- lines.append(f"| ✅ Passed | {orig_passed} |")
- if orig_warned > 0:
- lines.append(f"| ⚠️ Warnings | {orig_warned} |")
- lines.append(f"| ❌ Failed | {orig_failed} |")
- else:
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {total_cases} |")
- lines.append(f"| ✅ Passed | {total_passed} |")
- if total_warned > 0:
- lines.append(f"| ⚠️ Warnings | {total_warned} |")
- if total_failed > 0:
- lines.append(f"| ❌ Failed | {total_failed} |")
-
- # Pass rate
- if total_cases > 0:
- if failed_only and original_counts and original_counts[0] > 0:
- pass_rate = (original_counts[1] / original_counts[0]) * 100
- else:
- pass_rate = (total_passed / total_cases) * 100
- lines.append("")
- lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
- lines.append("")
+ lines.extend(
+ self._format_summary_table_md(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
# Results by model
lines.append("## Results by Model")
@@ -134,8 +113,15 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append("")
# Results table
- lines.append("| Status | Case | Score |")
- lines.append("|--------|------|-------|")
+ has_run_stats = any(
+ case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases
+ )
+ if has_run_stats:
+ lines.append("| Status | Case | Score | Runs |")
+ lines.append("|--------|------|-------|------|")
+ else:
+ lines.append("| Status | Case | Score |")
+ lines.append("|--------|------|-------|")
for case in cases:
evaluation = case["evaluation"]
@@ -148,7 +134,15 @@ class MarkdownFormatter(EvalResultFormatter):
score_pct = evaluation.score * 100
case_name = case["name"].replace("|", "\\|")
- lines.append(f"| {status} | {case_name} | {score_pct:.1f}% |")
+ run_stats = case.get("run_stats") or {}
+ score_display = f"{score_pct:.1f}%"
+ if run_stats.get("num_runs", 1) > 1:
+ std_pct = run_stats.get("std_deviation", 0.0) * 100
+ score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%"
+ runs_value = run_stats.get("num_runs", 1)
+ lines.append(f"| {status} | {case_name} | {score_display} | {runs_value} |")
+ else:
+ lines.append(f"| {status} | {case_name} | {score_display} |")
lines.append("")
@@ -175,6 +169,16 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append(f"**Input:** `{case['input']}`")
lines.append("")
+ run_stats = case.get("run_stats")
+ lines.extend(self._format_run_stats_summary(run_stats))
+
+ run_detail_lines = self._format_run_details_md(run_stats)
+ lines.extend(run_detail_lines)
+
+ critic_stats = case.get("critic_stats")
+ if critic_stats:
+ lines.extend(self._format_critic_stats_summary(critic_stats))
+
# Context section (if include_context is True)
if include_context:
system_msg = case.get("system_message")
@@ -194,8 +198,10 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append("")
lines.append("")
- # Evaluation details
- lines.append(self._format_evaluation_details(evaluation))
+ # Only show the critic results table when there are no per-run
+ # details (run details already include per-run field tables)
+ if not run_detail_lines:
+ lines.append(self._format_evaluation_details(evaluation))
lines.append("")
lines.append("---")
lines.append("")
@@ -212,31 +218,120 @@ class MarkdownFormatter(EvalResultFormatter):
if evaluation.failure_reason:
lines.append(f"**Failure Reason:** {evaluation.failure_reason}")
else:
- lines.append("| Field | Match | Score | Expected | Actual |")
- lines.append("|-------|-------|-------|----------|--------|")
-
- for critic_result in evaluation.results:
- is_criticized = critic_result.get("is_criticized", True)
- field = critic_result["field"]
- score = critic_result["score"]
- weight = critic_result["weight"]
- expected = str(critic_result["expected"]).replace("|", "\\|")
- actual = str(critic_result["actual"]).replace("|", "\\|")
-
- # Truncate long values for table readability
- expected = truncate_field_value(expected, MD_MAX_FIELD_LENGTH)
- actual = truncate_field_value(actual, MD_MAX_FIELD_LENGTH)
-
- if is_criticized:
- match_icon = "✅" if critic_result["match"] else "❌"
- lines.append(
- f"| {field} | {match_icon} | {score:.2f}/{weight:.2f} | `{expected}` | `{actual}` |"
- )
- else:
- lines.append(f"| {field} | — | - | `{expected}` | `{actual}` |")
+ lines.extend(self._format_critic_results_table_md(evaluation.results))
return "\n".join(lines)
+    def _format_critic_results_table_md(self, results: list[dict[str, Any]]) -> list[str]:
+        """Render critic results as the rows of a Markdown table.
+
+        Args:
+            results: Critic result dicts. Each one is read for ``field``,
+                ``score``, ``weight``, ``expected`` and ``actual``; ``match``
+                is read only when ``is_criticized`` (default True) is truthy.
+
+        Returns:
+            The header row, the separator row, and one row per critic result.
+        """
+        table_rows: list[str] = [
+            "| Field | Match | Score | Expected | Actual |",
+            "|-------|-------|-------|----------|--------|",
+        ]
+
+        for entry in results:
+            criticized = entry.get("is_criticized", True)
+            name = entry["field"]
+            earned = entry["score"]
+            max_points = entry["weight"]
+
+            # Escape pipes so a value cannot split a table cell, then
+            # truncate long values for table readability.
+            escaped_expected = str(entry["expected"]).replace("|", "\\|")
+            escaped_actual = str(entry["actual"]).replace("|", "\\|")
+            shown_expected = truncate_field_value(escaped_expected, MD_MAX_FIELD_LENGTH)
+            shown_actual = truncate_field_value(escaped_actual, MD_MAX_FIELD_LENGTH)
+
+            if not criticized:
+                # Fields that were not criticized get placeholders instead
+                # of a match icon and a score.
+                table_rows.append(f"| {name} | — | - | `{shown_expected}` | `{shown_actual}` |")
+                continue
+
+            icon = "✅" if entry["match"] else "❌"
+            table_rows.append(
+                f"| {name} | {icon} | {earned:.2f}/{max_points:.2f} | "
+                f"`{shown_expected}` | `{shown_actual}` |"
+            )
+
+        return table_rows
+
+    def _format_critic_stats_summary(self, critic_stats: dict[str, Any]) -> list[str]:
+        """Render aggregated per-critic statistics as a Markdown table.
+
+        Args:
+            critic_stats: Mapping of field name to a stats dict. The keys read
+                are ``weight``, ``mean_score_normalized``,
+                ``std_deviation_normalized``, ``mean_score`` and
+                ``std_deviation``; a missing key counts as 0.0.
+
+        Returns:
+            A title line, the table header and separator, one row per field,
+            and a trailing empty line.
+        """
+
+        def as_percent(entry: dict[str, Any], key: str) -> float:
+            # Every score-like stat is scaled by 100 for percentage display.
+            return entry.get(key, 0.0) * 100
+
+        output: list[str] = [
+            "**Critic Stats (normalized & weighted):** ",
+            "| Field | Weight | Mean (norm %) | Std (norm %) | Mean (weighted %) | Std (weighted %) |",
+            "|-------|--------|---------------|--------------|-------------------|------------------|",
+        ]
+        for name, entry in critic_stats.items():
+            field_weight = entry.get("weight", 0.0)
+            norm_mean = as_percent(entry, "mean_score_normalized")
+            norm_std = as_percent(entry, "std_deviation_normalized")
+            weighted_mean = as_percent(entry, "mean_score")
+            weighted_std = as_percent(entry, "std_deviation")
+            cells = [
+                f"{name}",
+                f"{field_weight:.2f}",
+                f"{norm_mean:.2f}%",
+                f"{norm_std:.2f}%",
+                f"{weighted_mean:.2f}%",
+                f"{weighted_std:.2f}%",
+            ]
+            output.append("| " + " | ".join(cells) + " |")
+        output.append("")
+        return output
+
+    def _format_run_stats_summary(self, run_stats: dict[str, Any] | None) -> list[str]:
+        """Format the run statistics summary as a Markdown bullet list.
+
+        Args:
+            run_stats: Aggregated multi-run statistics, or None. The keys read
+                are ``num_runs``, ``mean_score``, ``std_deviation``,
+                ``scores``, ``seed_policy``, ``run_seeds`` and ``pass_rule``.
+
+        Returns:
+            The bullet list followed by an empty line, or an empty list when
+            there are no stats or fewer than two runs.
+        """
+        if not run_stats:
+            return []
+        if run_stats.get("num_runs", 1) < 2:
+            return []
+
+        mean_percent = run_stats.get("mean_score", 0.0) * 100
+        std_percent = run_stats.get("std_deviation", 0.0) * 100
+        per_run_scores = run_stats.get("scores", [])
+        joined_scores = ", ".join(f"{value * 100:.2f}%" for value in per_run_scores)
+
+        bullets: list[str] = [
+            "**Run Stats:** ",
+            f"- Runs: {run_stats.get('num_runs', len(per_run_scores))} ",
+            f"- Mean Score: {mean_percent:.2f}% ",
+            f"- Std Deviation: {std_percent:.2f}% ",
+        ]
+        if joined_scores:
+            bullets.append(f"- Scores: {joined_scores} ")
+
+        policy = run_stats.get("seed_policy")
+        if policy:
+            bullets.append(f"- Seed Policy: {policy} ")
+
+        # Seeds are listed only when at least one run actually had a seed.
+        seeds = run_stats.get("run_seeds")
+        if seeds and not all(seed is None for seed in seeds):
+            bullets.append("- Run Seeds: " + ", ".join(map(str, seeds)) + " ")
+
+        rule = run_stats.get("pass_rule")
+        if rule:
+            bullets.append(f"- Pass Rule: {rule} ")
+
+        bullets.append("")
+        return bullets
+
+    def _format_run_details_md(self, run_stats: dict[str, Any] | None) -> list[str]:
+        """Format per-run outcomes as a Markdown list with collapsible critic tables.
+
+        Each run becomes one bullet showing its status and score (plus the
+        failure reason, if any). When a run carries ``details``, the critic
+        results table for that run follows inside a ``<details>`` element.
+
+        Args:
+            run_stats: Aggregated multi-run statistics, or None. The keys read
+                are ``num_runs`` and ``runs``; each run is read for
+                ``passed``, ``warning``, ``score``, ``failure_reason`` and
+                ``details``.
+
+        Returns:
+            The formatted lines, ending with a single empty line, or an empty
+            list when there are no stats, fewer than two runs, or no per-run
+            entries.
+        """
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return []
+        runs = run_stats.get("runs", [])
+        if not runs:
+            return []
+        lines: list[str] = []
+        lines.append("**Run Details:** ")
+        for idx, run in enumerate(runs, start=1):
+            if run.get("passed"):
+                status = "✅ PASSED"
+            elif run.get("warning"):
+                status = "⚠️ WARNED"
+            else:
+                status = "❌ FAILED"
+            score_pct = run.get("score", 0.0) * 100
+            line = f"- Run {idx}: {status} — {score_pct:.2f}%"
+            failure_reason = run.get("failure_reason")
+            if failure_reason:
+                line += f" ({failure_reason})"
+            lines.append(line)
+            details = run.get("details", [])
+            if details:
+                # Blank lines around the table are required for Markdown
+                # inside an HTML element to be parsed as Markdown.
+                lines.append("")
+                lines.append("<details>")
+                lines.append(f"<summary>Run {idx} details</summary>")
+                lines.append("")
+                lines.extend(self._format_critic_results_table_md(details))
+                lines.append("")
+                lines.append("</details>")
+                # A raw HTML block only ends at a blank line; without one the
+                # next "- Run N" bullet would be swallowed as HTML text.
+                lines.append("")
+        # Terminate the section with exactly one blank line so following
+        # content does not merge into the last list item.
+        if lines[-1] != "":
+            lines.append("")
+        return lines
+
# =========================================================================
# MULTI-MODEL EVALUATION FORMATTING
# =========================================================================
@@ -371,7 +466,19 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append(f"**{model}:** Score {evaluation.score * 100:.1f}%")
lines.append("")
- lines.append(self._format_evaluation_details(evaluation))
+ run_stats = case_result.get("run_stats")
+ lines.extend(self._format_run_stats_summary(run_stats))
+
+ run_detail_lines = self._format_run_details_md(run_stats)
+ lines.extend(run_detail_lines)
+
+ critic_stats = case_result.get("critic_stats")
+ if critic_stats:
+ lines.extend(self._format_critic_stats_summary(critic_stats))
+ # Only show the critic results table when there are no per-run
+ # details (run details already include per-run field tables)
+ if not run_detail_lines:
+ lines.append(self._format_evaluation_details(evaluation))
lines.append("")
lines.append("---")
@@ -471,37 +578,16 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append(f"**Tracks compared:** {', '.join(f'`{t}`' for t in all_tracks)}")
lines.append("")
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
- lines.append("")
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {orig_total} |")
- lines.append(f"| ✅ Passed | {orig_passed} |")
- if orig_warned > 0:
- lines.append(f"| ⚠️ Warnings | {orig_warned} |")
- lines.append(f"| ❌ Failed | {orig_failed} |")
- else:
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {total_cases} |")
- lines.append(f"| ✅ Passed | {total_passed} |")
- if total_warned > 0:
- lines.append(f"| ⚠️ Warnings | {total_warned} |")
- if total_failed > 0:
- lines.append(f"| ❌ Failed | {total_failed} |")
-
- # Pass rate
- if total_cases > 0:
- if failed_only and original_counts and original_counts[0] > 0:
- pass_rate = (original_counts[1] / original_counts[0]) * 100
- else:
- pass_rate = (total_passed / total_cases) * 100
- lines.append("")
- lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
- lines.append("")
+ lines.extend(
+ self._format_summary_table_md(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
# Results by model
lines.append("## Results by Model")
@@ -522,77 +608,13 @@ class MarkdownFormatter(EvalResultFormatter):
# List all cases with summary comparison
for case_name, case_data in cases.items():
- # Context section (if include_context is True)
if include_context:
- system_msg = case_data.get("system_message")
- addl_msgs = case_data.get("additional_messages")
- if system_msg or addl_msgs:
- lines.append("")
- lines.append("📋 Context ")
- lines.append("")
- if system_msg:
- lines.append(f"**System Message:** {system_msg}")
- lines.append("")
- if addl_msgs:
- lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**")
- lines.append("")
- for msg in addl_msgs:
- role = msg.get("role", "unknown")
- content = msg.get("content", "")
- name = msg.get("name", "")
- role_icons = {
- "user": "👤",
- "assistant": "🤖",
- "tool": "🔧",
- "system": "⚙️",
- }
- icon = role_icons.get(role, "💬")
- label = (
- f"{icon} **{role.title()}**"
- if not name
- else f"{icon} **{role.title()}** (`{name}`)"
- )
- lines.append(f"> {label}")
- if content:
- if role == "tool":
- try:
- import json
-
- parsed = json.loads(content)
- formatted = json.dumps(parsed, indent=2)
- lines.append("> ```json")
- for json_line in formatted.split("\n"):
- lines.append(f"> {json_line}")
- lines.append("> ```")
- except (json.JSONDecodeError, TypeError):
- lines.append(f"> {content}")
- else:
- lines.append(f"> {content}")
- tool_calls = msg.get("tool_calls", [])
- if tool_calls:
- for tc in tool_calls:
- func = tc.get("function", {})
- tc_name = func.get("name", "unknown")
- tc_args = func.get("arguments", "{}")
- lines.append(f"> 🔧 **{tc_name}**")
- try:
- import json
-
- args_dict = (
- json.loads(tc_args)
- if isinstance(tc_args, str)
- else tc_args
- )
- formatted = json.dumps(args_dict, indent=2)
- lines.append("> ```json")
- for arg_line in formatted.split("\n"):
- lines.append(f"> {arg_line}")
- lines.append("> ```")
- except (json.JSONDecodeError, TypeError):
- lines.append(f"> `{tc_args}`")
- lines.append(">")
- lines.append(" ")
- lines.append("")
+ lines.extend(
+ self._format_context_section_md(
+ case_data.get("system_message"),
+ case_data.get("additional_messages"),
+ )
+ )
lines.extend(
self._format_comparative_case(
@@ -647,37 +669,16 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append("## Summary")
lines.append("")
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
- lines.append("")
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {orig_total} |")
- lines.append(f"| ✅ Passed | {orig_passed} |")
- if orig_warned > 0:
- lines.append(f"| ⚠️ Warnings | {orig_warned} |")
- lines.append(f"| ❌ Failed | {orig_failed} |")
- else:
- lines.append("| Metric | Count |")
- lines.append("|--------|-------|")
- lines.append(f"| **Total** | {total_cases} |")
- lines.append(f"| ✅ Passed | {total_passed} |")
- if total_warned > 0:
- lines.append(f"| ⚠️ Warnings | {total_warned} |")
- if total_failed > 0:
- lines.append(f"| ❌ Failed | {total_failed} |")
-
- # Pass rate
- if total_cases > 0:
- if failed_only and original_counts and original_counts[0] > 0:
- pass_rate = (original_counts[1] / original_counts[0]) * 100
- else:
- pass_rate = (total_passed / total_cases) * 100
- lines.append("")
- lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
- lines.append("")
+ lines.extend(
+ self._format_summary_table_md(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
# Results grouped by case
lines.append("## Results by Case")
@@ -705,77 +706,12 @@ class MarkdownFormatter(EvalResultFormatter):
# Context section (if include_context is True)
if include_context:
- system_msg = first_model_data.get("system_message")
- addl_msgs = first_model_data.get("additional_messages")
- if system_msg or addl_msgs:
- lines.append("")
- lines.append("📋 Context ")
- lines.append("")
- if system_msg:
- lines.append(f"**System Message:** {system_msg}")
- lines.append("")
- if addl_msgs:
- lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**")
- lines.append("")
- for msg in addl_msgs:
- role = msg.get("role", "unknown")
- content = msg.get("content", "")
- name = msg.get("name", "")
- role_icons = {
- "user": "👤",
- "assistant": "🤖",
- "tool": "🔧",
- "system": "⚙️",
- }
- icon = role_icons.get(role, "💬")
- label = (
- f"{icon} **{role.title()}**"
- if not name
- else f"{icon} **{role.title()}** (`{name}`)"
- )
- lines.append(f"> {label}")
- if content:
- # For tool responses, format as JSON code block
- if role == "tool":
- try:
- import json
-
- parsed = json.loads(content)
- formatted = json.dumps(parsed, indent=2)
- lines.append("> ```json")
- for json_line in formatted.split("\n"):
- lines.append(f"> {json_line}")
- lines.append("> ```")
- except (json.JSONDecodeError, TypeError):
- lines.append(f"> {content}")
- else:
- lines.append(f"> {content}")
- # Handle tool calls
- tool_calls = msg.get("tool_calls", [])
- if tool_calls:
- for tc in tool_calls:
- func = tc.get("function", {})
- tc_name = func.get("name", "unknown")
- tc_args = func.get("arguments", "{}")
- lines.append(f"> 🔧 **{tc_name}**")
- try:
- import json
-
- args_dict = (
- json.loads(tc_args)
- if isinstance(tc_args, str)
- else tc_args
- )
- formatted = json.dumps(args_dict, indent=2)
- lines.append("> ```json")
- for arg_line in formatted.split("\n"):
- lines.append(f"> {arg_line}")
- lines.append("> ```")
- except (json.JSONDecodeError, TypeError):
- lines.append(f"> `{tc_args}`")
- lines.append(">")
- lines.append(" ")
- lines.append("")
+ lines.extend(
+ self._format_context_section_md(
+ first_model_data.get("system_message"),
+ first_model_data.get("additional_messages"),
+ )
+ )
# Show each model's results for this case
for model in model_order:
@@ -876,7 +812,20 @@ class MarkdownFormatter(EvalResultFormatter):
lines.append("")
lines.append(f"📋 {track_name} — Detailed Results ")
lines.append("")
- lines.append(self._format_evaluation_details(evaluation))
+ run_stats = track_result.get("run_stats")
+ lines.extend(self._format_run_stats_summary(run_stats))
+
+ run_detail_lines = self._format_run_details_md(run_stats)
+ lines.extend(run_detail_lines)
+
+ critic_stats = track_result.get("critic_stats")
+ if critic_stats:
+ lines.extend(self._format_critic_stats_summary(critic_stats))
+
+ # Only show the critic results table when there are no per-run
+ # details (run details already include per-run field tables)
+ if not run_detail_lines:
+ lines.append(self._format_evaluation_details(evaluation))
lines.append("")
lines.append(" ")
lines.append("")
@@ -886,6 +835,81 @@ class MarkdownFormatter(EvalResultFormatter):
return lines
+    def _format_summary_table_md(
+        self,
+        total_cases: int,
+        total_passed: int,
+        total_failed: int,
+        total_warned: int,
+        failed_only: bool,
+        original_counts: tuple[int, int, int, int] | None,
+    ) -> list[str]:
+        """Build the Markdown summary table followed by the pass-rate line.
+
+        Args:
+            total_cases: Number of cases being displayed.
+            total_passed: Number of displayed cases that passed.
+            total_failed: Number of displayed cases that failed.
+            total_warned: Number of displayed cases that warned.
+            failed_only: Whether the output is restricted to failed cases.
+            original_counts: Counts unpacked as ``(total, passed, failed,
+                warned)``; used instead of the ``total_*`` values when
+                ``failed_only`` is set.
+
+        Returns:
+            The table lines, an optional pass-rate line, and a trailing empty
+            line.
+        """
+        summary: list[str] = []
+
+        if failed_only and original_counts:
+            from_original = True
+            shown_total, shown_passed, shown_failed, shown_warned = original_counts
+            summary += [
+                f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)",
+                "",
+            ]
+        else:
+            from_original = False
+            shown_total = total_cases
+            shown_passed = total_passed
+            shown_failed = total_failed
+            shown_warned = total_warned
+
+        summary += [
+            "| Metric | Count |",
+            "|--------|-------|",
+            f"| **Total** | {shown_total} |",
+            f"| ✅ Passed | {shown_passed} |",
+        ]
+        if shown_warned > 0:
+            summary.append(f"| ⚠️ Warnings | {shown_warned} |")
+        # The failed row is unconditional in failed-only mode and otherwise
+        # appears only for a non-zero count.
+        if from_original or shown_failed > 0:
+            summary.append(f"| ❌ Failed | {shown_failed} |")
+
+        # Pass rate: gated on the displayed case count, but computed from the
+        # original counts when those are in use and non-zero.
+        if total_cases > 0:
+            if from_original and shown_total > 0:
+                rate = (shown_passed / shown_total) * 100
+            else:
+                rate = (total_passed / total_cases) * 100
+            summary += ["", f"**Pass Rate:** {rate:.1f}%"]
+
+        summary.append("")
+        return summary
+
+    def _format_context_section_md(
+        self,
+        system_msg: str | None,
+        additional_messages: list[dict] | None,
+    ) -> list[str]:
+        """Build a collapsible context section for comparative display.
+
+        The section is wrapped in a ``<details>`` element with a
+        ``<summary>`` title so Markdown viewers render it collapsed.
+
+        Args:
+            system_msg: The system message, if any.
+            additional_messages: Conversation messages, if any.
+
+        Returns:
+            List of formatted markdown lines (empty if no context data).
+        """
+        if not system_msg and not additional_messages:
+            return []
+        lines: list[str] = []
+        lines.append("<details>")
+        lines.append("<summary>📋 Context</summary>")
+        # Blank line after the summary so the content below is parsed as
+        # Markdown rather than as part of the raw HTML block.
+        lines.append("")
+        if system_msg:
+            lines.append(f"**System Message:** {system_msg}")
+            lines.append("")
+        if additional_messages:
+            lines.append(f"**💬 Conversation ({len(additional_messages)} messages):**")
+            lines.append("")
+            lines.extend(self._format_conversation_md(additional_messages))
+        lines.append("</details>")
+        lines.append("")
+        return lines
+
def _format_conversation_md(self, messages: list[dict]) -> list[str]:
"""Format conversation messages as Markdown for context display."""
lines: list[str] = []
@@ -1003,7 +1027,25 @@ class CaptureMarkdownFormatter(CaptureFormatter):
lines.append("#### Tool Calls")
lines.append("")
- if case.tool_calls:
+ runs = getattr(case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f"**Run {run_index}**")
+ lines.append("")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ total_calls += 1
+ lines.append(f"**`{tc.name}`**")
+ if tc.args:
+ lines.append("")
+ lines.append("```json")
+ lines.append(json.dumps(tc.args, indent=2))
+ lines.append("```")
+ lines.append("")
+ else:
+ lines.append("*No tool calls captured*")
+ lines.append("")
+ elif case.tool_calls:
for tc in case.tool_calls:
total_calls += 1
lines.append(f"**`{tc.name}`**")
@@ -1104,7 +1146,11 @@ class CaptureMarkdownFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ tool_names = f"{len(runs)} run(s)"
+ total_calls += sum(len(run.tool_calls) for run in runs)
+ elif captured_case.tool_calls:
tool_names = ", ".join(
f"`{tc.name}`" for tc in captured_case.tool_calls
)
@@ -1121,21 +1167,39 @@ class CaptureMarkdownFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- if not captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if not runs and not captured_case.tool_calls:
continue
lines.append("")
lines.append(f"🤖 {model} - Details ")
lines.append("")
- for tc in captured_case.tool_calls:
- lines.append(f"**`{tc.name}`**")
- if tc.args:
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f"**Run {run_index}**")
+ lines.append("")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ lines.append(f"**`{tc.name}`**")
+ if tc.args:
+ lines.append("")
+ lines.append("```json")
+ lines.append(json.dumps(tc.args, indent=2))
+ lines.append("```")
+ lines.append("")
+ else:
+ lines.append("*No tool calls captured*")
+ lines.append("")
+ else:
+ for tc in captured_case.tool_calls:
+ lines.append(f"**`{tc.name}`**")
+ if tc.args:
+ lines.append("")
+ lines.append("```json")
+ lines.append(json.dumps(tc.args, indent=2))
+ lines.append("```")
lines.append("")
- lines.append("```json")
- lines.append(json.dumps(tc.args, indent=2))
- lines.append("```")
- lines.append("")
lines.append(" ")
lines.append("")
@@ -1160,7 +1224,11 @@ class CaptureMarkdownFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ tool_names = f"{len(runs)} run(s)"
+ total_calls += sum(len(run.tool_calls) for run in runs)
+ elif captured_case.tool_calls:
tool_names = ", ".join(
f"`{tc.name}`" for tc in captured_case.tool_calls
)
@@ -1177,21 +1245,39 @@ class CaptureMarkdownFormatter(CaptureFormatter):
continue
captured_case = models_dict[model]
- if not captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if not runs and not captured_case.tool_calls:
continue
lines.append("")
lines.append(f"🤖 {model} - Tool Call Details ")
lines.append("")
- for tc in captured_case.tool_calls:
- lines.append(f"**`{tc.name}`**")
- if tc.args:
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f"**Run {run_index}**")
+ lines.append("")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ lines.append(f"**`{tc.name}`**")
+ if tc.args:
+ lines.append("")
+ lines.append("```json")
+ lines.append(json.dumps(tc.args, indent=2))
+ lines.append("```")
+ lines.append("")
+ else:
+ lines.append("*No tool calls captured*")
+ lines.append("")
+ else:
+ for tc in captured_case.tool_calls:
+ lines.append(f"**`{tc.name}`**")
+ if tc.args:
+ lines.append("")
+ lines.append("```json")
+ lines.append(json.dumps(tc.args, indent=2))
+ lines.append("```")
lines.append("")
- lines.append("```json")
- lines.append(json.dumps(tc.args, indent=2))
- lines.append("```")
- lines.append("")
lines.append(" ")
lines.append("")
diff --git a/libs/arcade-cli/arcade_cli/formatters/text.py b/libs/arcade-cli/arcade_cli/formatters/text.py
index 88bf3bc0..e8c5bcbc 100644
--- a/libs/arcade-cli/arcade_cli/formatters/text.py
+++ b/libs/arcade-cli/arcade_cli/formatters/text.py
@@ -91,7 +91,14 @@ class TextFormatter(EvalResultFormatter):
status = "FAILED"
score_percentage = evaluation.score * 100
- lines.append(f" {status} {case['name']} -- Score: {score_percentage:.2f}%")
+ run_stats = case.get("run_stats") or {}
+ stats_suffix = ""
+ if run_stats.get("num_runs", 1) > 1:
+ std_pct = run_stats.get("std_deviation", 0.0) * 100
+ stats_suffix = f" (n={run_stats['num_runs']}, sd={std_pct:.2f}%)"
+ lines.append(
+ f" {status} {case['name']} -- Score: {score_percentage:.2f}%{stats_suffix}"
+ )
if show_details:
lines.append(f" User Input: {case['input']}")
@@ -112,6 +119,10 @@ class TextFormatter(EvalResultFormatter):
lines.append("")
lines.append(" Details:")
+ for stat_line in self._format_run_stats(case):
+ lines.append(f" {stat_line}")
+ for stat_line in self._format_critic_stats(case):
+ lines.append(f" {stat_line}")
for detail_line in self._format_evaluation(evaluation).split("\n"):
lines.append(f" {detail_line}")
lines.append(" " + "-" * 52)
@@ -121,23 +132,16 @@ class TextFormatter(EvalResultFormatter):
lines.append("")
# Summary
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
- summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
- if orig_warned > 0:
- summary += f" -- Warnings: {orig_warned}"
- if orig_failed > 0:
- summary += f" -- Failed: {orig_failed}"
- else:
- summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
- if total_warned > 0:
- summary += f" -- Warnings: {total_warned}"
- if total_failed > 0:
- summary += f" -- Failed: {total_failed}"
-
- lines.append(summary)
- lines.append("")
+ lines.extend(
+ self._format_summary_lines(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
return "\n".join(lines)
@@ -169,6 +173,70 @@ class TextFormatter(EvalResultFormatter):
)
return "\n".join(result_lines)
+    def _format_run_stats(self, case: dict[str, Any]) -> list[str]:
+        """Format multi-run statistics for a case as plain-text lines.
+
+        Args:
+            case: Case result dict; its ``run_stats`` entry is read for
+                ``num_runs``, ``mean_score``, ``std_deviation``, ``scores``,
+                ``seed_policy``, ``run_seeds``, ``pass_rule`` and ``runs``.
+
+        Returns:
+            The formatted lines followed by an empty line, or an empty list
+            when there are no run stats or fewer than two runs.
+        """
+        stats = case.get("run_stats")
+        if not stats:
+            return []
+        if stats.get("num_runs", 1) < 2:
+            return []
+
+        per_run_scores = stats.get("scores", [])
+        joined_scores = ", ".join(f"{value * 100:.2f}%" for value in per_run_scores)
+        mean_percent = stats.get("mean_score", 0.0) * 100
+        std_percent = stats.get("std_deviation", 0.0) * 100
+
+        output = ["Run Stats:"]
+        output.append(f" Runs: {stats.get('num_runs', len(per_run_scores))}")
+        output.append(f" Mean Score: {mean_percent:.2f}%")
+        output.append(f" Std Deviation: {std_percent:.2f}%")
+        if joined_scores:
+            output.append(f" Scores: {joined_scores}")
+
+        policy = stats.get("seed_policy")
+        seeds = stats.get("run_seeds")
+        if policy:
+            output.append(f" Seed Policy: {policy}")
+        # Seeds are listed only when at least one run actually had a seed.
+        if seeds and not all(seed is None for seed in seeds):
+            output.append(" Run Seeds: " + ", ".join(map(str, seeds)))
+
+        rule = stats.get("pass_rule")
+        if rule:
+            output.append(f" Pass Rule: {rule}")
+
+        per_run = stats.get("runs", [])
+        if per_run:
+            output.append(" Run Results:")
+            for number, entry in enumerate(per_run, start=1):
+                # A pass takes precedence over a warning; anything else is a failure.
+                if entry.get("passed"):
+                    outcome = "PASSED"
+                else:
+                    outcome = "WARNED" if entry.get("warning") else "FAILED"
+                entry_percent = entry.get("score", 0.0) * 100
+                text = f" Run {number}: {outcome} ({entry_percent:.2f}%)"
+                reason = entry.get("failure_reason")
+                if reason:
+                    text = f"{text} -- {reason}"
+                output.append(text)
+
+        output.append("")
+        return output
+
+    def _format_critic_stats(self, case: dict[str, Any]) -> list[str]:
+        """Format per-critic aggregate statistics for a case as plain-text lines.
+
+        Args:
+            case: Case result dict; its ``critic_stats`` entry maps a field
+                name to a stats dict read for ``weight``,
+                ``mean_score_normalized``, ``std_deviation_normalized``,
+                ``mean_score`` and ``std_deviation`` (missing keys count as
+                0.0).
+
+        Returns:
+            A heading, one line per field, and a trailing empty line, or an
+            empty list when the case has no critic stats.
+        """
+        per_field = case.get("critic_stats")
+        if not per_field:
+            return []
+
+        def as_percent(entry: dict[str, Any], key: str) -> float:
+            # Every score-like stat is scaled by 100 for percentage display.
+            return entry.get(key, 0.0) * 100
+
+        output = ["Critic Stats:"]
+        for name, entry in per_field.items():
+            field_weight = entry.get("weight", 0.0)
+            norm_mean = as_percent(entry, "mean_score_normalized")
+            norm_std = as_percent(entry, "std_deviation_normalized")
+            weighted_mean = as_percent(entry, "mean_score")
+            weighted_std = as_percent(entry, "std_deviation")
+            normalized_part = f"norm {norm_mean:.2f}% ± {norm_std:.2f}%"
+            weighted_part = (
+                f"weighted {weighted_mean:.2f}% ± {weighted_std:.2f}% (w={field_weight:.2f})"
+            )
+            output.append(f" {name}: {normalized_part} | {weighted_part}")
+        output.append("")
+        return output
+
# =========================================================================
# MULTI-MODEL EVALUATION FORMATTING
# =========================================================================
@@ -312,6 +380,11 @@ class TextFormatter(EvalResultFormatter):
lines.append(f" [{model}] Score: {evaluation.score * 100:.1f}%")
+ for stat_line in self._format_run_stats(case_result):
+ lines.append(f" {stat_line}")
+ for stat_line in self._format_critic_stats(case_result):
+ lines.append(f" {stat_line}")
+
# Show evaluation details indented
eval_details = self._format_evaluation(evaluation)
for line in eval_details.split("\n"):
@@ -420,60 +493,13 @@ class TextFormatter(EvalResultFormatter):
lines.append(" " + "-" * 72)
for case_name, case_data in cases.items():
- # Context section (if include_context is True)
if include_context:
- system_msg = case_data.get("system_message")
- addl_msgs = case_data.get("additional_messages")
- if system_msg or addl_msgs:
- lines.append(" " + "-" * 40)
- lines.append(" 📋 CONTEXT")
- lines.append(" " + "-" * 40)
- if system_msg:
- lines.append(f" System Message: {system_msg}")
- if addl_msgs:
- lines.append(f" 💬 Conversation ({len(addl_msgs)} messages):")
- for msg in addl_msgs:
- role = msg.get("role", "unknown").upper()
- content = msg.get("content", "")
- name = msg.get("name", "")
- role_label = f"[{role}]" if not name else f"[{role}: {name}]"
- lines.append(f" {role_label}")
- if content:
- # For tool responses, try to format JSON
- if role.lower() == "tool":
- try:
- import json
-
- parsed = json.loads(content)
- formatted = json.dumps(parsed, indent=2)
- for json_line in formatted.split("\n"):
- lines.append(f" {json_line}")
- except (json.JSONDecodeError, TypeError):
- lines.append(f" {content}")
- else:
- lines.append(f" {content}")
- # Handle tool calls
- tool_calls = msg.get("tool_calls", [])
- if tool_calls:
- for tc in tool_calls:
- func = tc.get("function", {})
- tc_name = func.get("name", "unknown")
- tc_args = func.get("arguments", "{}")
- lines.append(f" 🔧 {tc_name}")
- try:
- import json
-
- args_dict = (
- json.loads(tc_args)
- if isinstance(tc_args, str)
- else tc_args
- )
- formatted = json.dumps(args_dict, indent=2)
- for arg_line in formatted.split("\n"):
- lines.append(f" {arg_line}")
- except (json.JSONDecodeError, TypeError):
- lines.append(f" {tc_args}")
- lines.append(" " + "-" * 40)
+ lines.extend(
+ self._format_context_block(
+ case_data.get("system_message"),
+ case_data.get("additional_messages"),
+ )
+ )
lines.extend(
self._format_comparative_case_text(
@@ -484,23 +510,16 @@ class TextFormatter(EvalResultFormatter):
lines.append("")
# Summary
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
- summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
- if orig_warned > 0:
- summary += f" -- Warnings: {orig_warned}"
- if orig_failed > 0:
- summary += f" -- Failed: {orig_failed}"
- else:
- summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
- if total_warned > 0:
- summary += f" -- Warnings: {total_warned}"
- if total_failed > 0:
- summary += f" -- Failed: {total_failed}"
-
- lines.append(summary)
- lines.append("")
+ lines.extend(
+ self._format_summary_lines(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
return "\n".join(lines)
@@ -563,61 +582,14 @@ class TextFormatter(EvalResultFormatter):
if case_input:
lines.append(f" Input: {case_input}")
- # Context section (if include_context is True)
if include_context:
- system_msg = first_model_data.get("system_message")
- addl_msgs = first_model_data.get("additional_messages")
- if system_msg or addl_msgs:
+ context_lines = self._format_context_block(
+ first_model_data.get("system_message"),
+ first_model_data.get("additional_messages"),
+ )
+ if context_lines:
lines.append("")
- lines.append(" " + "-" * 40)
- lines.append(" 📋 CONTEXT")
- lines.append(" " + "-" * 40)
- if system_msg:
- lines.append(f" System Message: {system_msg}")
- if addl_msgs:
- lines.append(f" 💬 Conversation ({len(addl_msgs)} messages):")
- for msg in addl_msgs:
- role = msg.get("role", "unknown").upper()
- content = msg.get("content", "")
- name = msg.get("name", "")
- role_label = f"[{role}]" if not name else f"[{role}: {name}]"
- lines.append(f" {role_label}")
- if content:
- # For tool responses, try to format JSON
- if role.lower() == "tool":
- try:
- import json
-
- parsed = json.loads(content)
- formatted = json.dumps(parsed, indent=2)
- for json_line in formatted.split("\n"):
- lines.append(f" {json_line}")
- except (json.JSONDecodeError, TypeError):
- lines.append(f" {content}")
- else:
- lines.append(f" {content}")
- # Handle tool calls in assistant messages
- tool_calls = msg.get("tool_calls", [])
- if tool_calls:
- for tc in tool_calls:
- func = tc.get("function", {})
- tc_name = func.get("name", "unknown")
- tc_args = func.get("arguments", "{}")
- lines.append(f" 🔧 {tc_name}")
- try:
- import json
-
- args_dict = (
- json.loads(tc_args)
- if isinstance(tc_args, str)
- else tc_args
- )
- formatted = json.dumps(args_dict, indent=2)
- for arg_line in formatted.split("\n"):
- lines.append(f" {arg_line}")
- except (json.JSONDecodeError, TypeError):
- lines.append(f" {tc_args}")
- lines.append(" " + "-" * 40)
+ lines.extend(context_lines)
lines.append("")
@@ -643,23 +615,16 @@ class TextFormatter(EvalResultFormatter):
# Summary
lines.append("=" * 78)
- if failed_only and original_counts:
- orig_total, orig_passed, orig_failed, orig_warned = original_counts
- lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
- summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
- if orig_warned > 0:
- summary += f" -- Warnings: {orig_warned}"
- if orig_failed > 0:
- summary += f" -- Failed: {orig_failed}"
- else:
- summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
- if total_warned > 0:
- summary += f" -- Warnings: {total_warned}"
- if total_failed > 0:
- summary += f" -- Failed: {total_failed}"
-
- lines.append(summary)
- lines.append("")
+ lines.extend(
+ self._format_summary_lines(
+ total_cases,
+ total_passed,
+ total_failed,
+ total_warned,
+ failed_only,
+ original_counts,
+ )
+ )
return "\n".join(lines)
@@ -753,12 +718,76 @@ class TextFormatter(EvalResultFormatter):
continue
lines.append(f" [{track_name}] Details:")
+ for stat_line in self._format_run_stats(track_result):
+ lines.append(f" {stat_line}")
+ for stat_line in self._format_critic_stats(track_result):
+ lines.append(f" {stat_line}")
for detail_line in self._format_evaluation(evaluation).split("\n"):
lines.append(f" {detail_line}")
lines.append("")
return lines
+ def _format_summary_lines(
+ self,
+ total_cases: int,
+ total_passed: int,
+ total_failed: int,
+ total_warned: int,
+ failed_only: bool,
+ original_counts: tuple[int, int, int, int] | None,
+ ) -> list[str]:
+ """Build the summary lines used by regular and comparative formatters."""
+ lines: list[str] = []
+ if failed_only and original_counts:
+ orig_total, orig_passed, orig_failed, orig_warned = original_counts
+ lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
+ summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
+ if orig_warned > 0:
+ summary += f" -- Warnings: {orig_warned}"
+ if orig_failed > 0:
+ summary += f" -- Failed: {orig_failed}"
+ else:
+ summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
+ if total_warned > 0:
+ summary += f" -- Warnings: {total_warned}"
+ if total_failed > 0:
+ summary += f" -- Failed: {total_failed}"
+ lines.append(summary)
+ lines.append("")
+ return lines
+
+ def _format_context_block(
+ self,
+ system_msg: str | None,
+ additional_messages: list[dict] | None,
+ indent: str = " ",
+ ) -> list[str]:
+ """Build the context section lines for comparative display.
+
+ Args:
+ system_msg: The system message, if any.
+ additional_messages: Conversation messages, if any.
+ indent: Base indentation prefix for each line.
+
+ Returns:
+ List of formatted lines (empty if no context data).
+ """
+ if not system_msg and not additional_messages:
+ return []
+ lines: list[str] = []
+ lines.append(indent + "-" * 40)
+ lines.append(indent + "📋 CONTEXT")
+ lines.append(indent + "-" * 40)
+ if system_msg:
+ lines.append(f"{indent}System Message: {system_msg}")
+ if additional_messages:
+ lines.append(f"{indent}💬 Conversation ({len(additional_messages)} messages):")
+ for conv_line in self._format_conversation_text(additional_messages):
+ lines.append(f"{indent}{conv_line}")
+ lines.append(indent + "-" * 40)
+ return lines
+
def _format_conversation_text(self, messages: list[dict]) -> list[str]:
"""Format conversation messages as plain text for context display."""
lines: list[str] = []
@@ -858,7 +887,22 @@ class CaptureTextFormatter(CaptureFormatter):
lines.append("")
lines.append(" Tool Calls:")
- if case.tool_calls:
+ runs = getattr(case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f" Run {run_index}:")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ total_calls += 1
+ lines.append(f" - {tc.name}")
+ if tc.args:
+ for key, value in tc.args.items():
+ lines.append(
+ f" {key}: {self._format_value(value)}"
+ )
+ else:
+ lines.append(" (no tool calls)")
+ elif case.tool_calls:
for tc in case.tool_calls:
total_calls += 1
lines.append(f" - {tc.name}")
@@ -949,7 +993,21 @@ class CaptureTextFormatter(CaptureFormatter):
captured_case = models_dict[model]
lines.append(f" │ [{model}]")
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f" │ Run {run_index}:")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ lines.append(f" │ - {tc.name}")
+ if tc.args:
+ for key, value in tc.args.items():
+ lines.append(
+ f" │ {key}: {self._format_value(value)}"
+ )
+ else:
+ lines.append(" │ (no tool calls)")
+ elif captured_case.tool_calls:
for tc in captured_case.tool_calls:
lines.append(f" │ - {tc.name}")
if tc.args:
@@ -980,7 +1038,21 @@ class CaptureTextFormatter(CaptureFormatter):
captured_case = models_dict[model]
lines.append(f" [{model}]")
- if captured_case.tool_calls:
+ runs = getattr(captured_case, "runs", None)
+ if runs:
+ for run_index, run in enumerate(runs, start=1):
+ lines.append(f" Run {run_index}:")
+ if run.tool_calls:
+ for tc in run.tool_calls:
+ lines.append(f" - {tc.name}")
+ if tc.args:
+ for key, value in tc.args.items():
+ lines.append(
+ f" {key}: {self._format_value(value)}"
+ )
+ else:
+ lines.append(" (no tool calls)")
+ elif captured_case.tool_calls:
for tc in captured_case.tool_calls:
lines.append(f" - {tc.name}")
if tc.args:
diff --git a/libs/arcade-cli/arcade_cli/main.py b/libs/arcade-cli/arcade_cli/main.py
index 78b52b3a..d39413c7 100644
--- a/libs/arcade-cli/arcade_cli/main.py
+++ b/libs/arcade-cli/arcade_cli/main.py
@@ -405,13 +405,29 @@ def evals(
"-c",
help="Maximum number of concurrent evaluations (default: 1)",
),
- use_provider: Optional[str] = typer.Option(
+ num_runs: int = typer.Option(
+ 1,
+ "--num-runs",
+ "-n",
+ help="Number of runs per case (default: 1).",
+ ),
+ seed: str = typer.Option(
+ "constant",
+ "--seed",
+ help="Seed policy for OpenAI runs (ignored for Anthropic): "
+ "'constant' (default), 'random', or an integer.",
+ ),
+ multi_run_pass_rule: str = typer.Option(
+ "last",
+ "--multi-run-pass-rule",
+ help="Pass/fail aggregation for multi-run cases: 'last' (default), 'mean', or 'majority'.",
+ ),
+ use_provider: Optional[list[str]] = typer.Option(
None,
"--use-provider",
"-p",
help="Provider(s) and models to use. Format: 'provider' or 'provider:model1,model2'. "
- "Multiple providers: separate with spaces. "
- "Examples: 'openai' or 'openai:gpt-4o anthropic:claude-sonnet-4-5-20250929'",
+ "Can be repeated. Examples: --use-provider openai or --use-provider openai:gpt-4o --use-provider anthropic:claude-sonnet-4-5-20250929",
),
api_key: Optional[list[str]] = typer.Option(
None,
@@ -476,6 +492,39 @@ def evals(
pip_install_command=r"pip install arcade-tdk",
)
+ # --- Validate multi-run parameters upfront (before any API calls) ---
+ if num_runs < 1:
+ handle_cli_error("--num-runs must be >= 1", should_exit=True)
+ return
+
+ seed_value: str | int
+ seed_lower = seed.strip().lower()
+ if seed_lower in {"constant", "random"}:
+ seed_value = seed_lower
+ else:
+ try:
+ seed_value = int(seed)
+ except ValueError:
+ handle_cli_error(
+ "Invalid --seed value. Use 'constant', 'random', or an integer.", should_exit=True
+ )
+ return
+ if seed_value < 0:
+ handle_cli_error("--seed must be a non-negative integer.", should_exit=True)
+ return
+
+ pass_rule = multi_run_pass_rule.strip().lower()
+ # Lazy import: arcade_evals requires optional deps (openai) that aren't
+ # available when the CLI is installed without the [evals] extra.
+ from arcade_evals._evalsuite._types import _VALID_PASS_RULES
+
+ if pass_rule not in _VALID_PASS_RULES:
+ handle_cli_error(
+ f"Invalid --multi-run-pass-rule. Valid values: {', '.join(sorted(_VALID_PASS_RULES))}.",
+ should_exit=True,
+ )
+ return
+
# --- Build model specs from flags ---
model_specs: list[ModelSpec] = []
@@ -483,11 +532,10 @@ def evals(
api_keys = resolve_provider_api_keys(api_keys_specs=api_key)
if use_provider:
- # Parse provider specs - supports space-separated values
- # e.g., "openai:gpt-4o anthropic:claude"
- provider_specs = use_provider.split()
+ # Parse provider specs - supports multiple --use-provider flags
+ # e.g., --use-provider openai:gpt-4o --use-provider anthropic:claude
try:
- provider_configs = [parse_provider_spec(spec) for spec in provider_specs]
+ provider_configs = [parse_provider_spec(spec) for spec in use_provider]
except ValueError as e:
handle_cli_error(str(e), should_exit=True)
return # For type checker
@@ -594,6 +642,8 @@ def evals(
output_file=final_output_file,
output_format=",".join(final_output_formats) if final_output_formats else "txt",
console=console,
+ num_runs=num_runs,
+ seed=seed_value,
)
)
else:
@@ -608,6 +658,9 @@ def evals(
failed_only=only_failed,
include_context=include_context,
console=console,
+ num_runs=num_runs,
+ seed=seed_value,
+ multi_run_pass_rule=pass_rule,
)
)
except Exception as e:
diff --git a/libs/arcade-evals/README.md b/libs/arcade-evals/README.md
index 97ec57c4..79b89b21 100644
--- a/libs/arcade-evals/README.md
+++ b/libs/arcade-evals/README.md
@@ -9,7 +9,10 @@ Arcade Evals provides comprehensive evaluation capabilities for Arcade tools:
- **Evaluation Framework**: Cases, suites, and rubrics for systematic testing
- **Critics**: Different types of comparisons (binary, numeric, similarity, datetime)
- **Tool Evaluation**: Decorators and utilities for evaluating tool performance
-- **Result Analysis**: Comprehensive evaluation results and reporting
+- **Multi-Run Statistics**: Run each case multiple times with configurable seed policies and pass rules to measure consistency
+- **Comparative Evaluation**: Compare tool performance across multiple sources/tracks side-by-side
+- **Capture Mode**: Record model tool calls without scoring for debugging and baseline generation
+- **Result Analysis**: Comprehensive evaluation results and reporting in multiple formats (text, markdown, HTML, JSON)
## Installation
@@ -81,6 +84,31 @@ rubric = EvalRubric(
suite = EvalSuite(cases=[case1], rubric=rubric)
```
+### Multi-Run Evaluation
+
+Run each case multiple times to measure consistency:
+
+```python
+# Run via the CLI
+# arcade evals eval_file.py --num-runs 5 --seed random --multi-run-pass-rule majority
+
+# Or programmatically
+result = await suite.run(
+ client,
+ model="gpt-4o",
+ num_runs=5, # Run each case 5 times
+ seed="random", # Different seed per run
+ multi_run_pass_rule="majority", # Pass if >50% of runs pass
+)
+```
+
+Multi-run results include per-case statistics:
+- **Mean score** and **standard deviation** across runs
+- **Per-run pass/fail** with individual scores
+- **Per-critic field** score breakdowns across runs
+- Configurable **pass rules**: `last` (default), `mean`, or `majority`
+- Configurable **seed policies**: `constant` (fixed seed 42), `random`, or a specific integer; seeds apply to OpenAI runs only and are ignored for Anthropic
+
## License
MIT License - see LICENSE file for details.
diff --git a/libs/arcade-evals/arcade_evals/__init__.py b/libs/arcade-evals/arcade_evals/__init__.py
index 83d1c092..2531b19a 100644
--- a/libs/arcade-evals/arcade_evals/__init__.py
+++ b/libs/arcade-evals/arcade_evals/__init__.py
@@ -1,6 +1,6 @@
from ._evalsuite._providers import ProviderName
from ._evalsuite._tool_registry import MCPToolDefinition
-from .capture import CapturedCase, CapturedToolCall, CaptureResult
+from .capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult
from .critic import BinaryCritic, DatetimeCritic, NoneCritic, NumericCritic, SimilarityCritic
from .eval import (
AnyExpectedToolCall,
@@ -25,6 +25,7 @@ __all__ = [
"BinaryCritic",
"CaptureResult",
"CapturedCase",
+ "CapturedRun",
"CapturedToolCall",
"DatetimeCritic",
"EvalRubric",
@@ -41,8 +42,8 @@ __all__ = [
"Weight",
"clear_tools_cache",
"load_arcade_mcp_gateway_async",
- "load_mcp_remote_async",
"load_from_stdio_async",
+ "load_mcp_remote_async",
"load_stdio_arcade_async",
"tool_eval",
"validate_and_normalize_critic_weights",
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
index 711f9e8e..69b2079b 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
@@ -7,9 +7,11 @@ keeping it separate from the main evaluation logic in eval.py.
from __future__ import annotations
import asyncio
+import random
from typing import TYPE_CHECKING, Any
-from arcade_evals.capture import CapturedCase, CapturedToolCall, CaptureResult
+from arcade_evals._evalsuite._types import _resolve_seed_spec
+from arcade_evals.capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult
if TYPE_CHECKING:
from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder
@@ -39,6 +41,7 @@ class _EvalSuiteCaptureMixin:
model: str,
case: EvalCase,
registry: EvalSuiteToolRegistry | None = None,
+ seed: int | None = None,
) -> list[tuple[str, dict[str, Any]]]:
raise NotImplementedError # Implemented in EvalSuite
@@ -67,6 +70,8 @@ class _EvalSuiteCaptureMixin:
model: str,
provider: ProviderName = "openai",
include_context: bool = False,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
) -> CaptureResult:
"""
Run the evaluation suite in capture mode - records tool calls without scoring.
@@ -86,10 +91,15 @@ class _EvalSuiteCaptureMixin:
provider: The provider name ("openai" or "anthropic").
include_context: Whether to include system_message and additional_messages
in the output.
+ num_runs: Number of runs per case.
+ seed: Seed policy ("constant", "random", or an integer seed); OpenAI only.
Returns:
A CaptureResult containing all captured tool calls.
"""
+ if num_runs < 1:
+ raise ValueError("num_runs must be >= 1")
+
all_captured: list[CapturedCase] = []
semaphore = asyncio.Semaphore(self.max_concurrent)
@@ -106,34 +116,54 @@ class _EvalSuiteCaptureMixin:
"No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog."
)
- # Get tool calls based on provider
- if provider == "anthropic":
- predicted_args = await self._run_anthropic(
- client, model, case, registry=use_registry
- )
+ seed_policy, seed_value = _resolve_seed_spec(seed)
+ if provider == "openai":
+ if seed_policy == "random":
+ run_seeds: list[int | None] = [
+ random.randint(0, 2**31 - 1) # noqa: S311
+ for _ in range(num_runs)
+ ]
+ else:
+ run_seeds = [seed_value for _ in range(num_runs)]
else:
- predicted_args = await self._run_openai(
- client, model, case, registry=use_registry
+ run_seeds = [None for _ in range(num_runs)]
+
+ runs: list[CapturedRun] = []
+ for run_index in range(num_runs):
+ run_seed = run_seeds[run_index]
+ # Get tool calls based on provider
+ if provider == "anthropic":
+ predicted_args = await self._run_anthropic(
+ client, model, case, registry=use_registry
+ )
+ else:
+ predicted_args = await self._run_openai(
+ client, model, case, registry=use_registry, seed=run_seed
+ )
+
+ # Process tool calls (resolve names, fill defaults)
+ filled_actual_tool_calls = self._process_tool_calls(
+ predicted_args, registry=use_registry
)
- # Process tool calls (resolve names, fill defaults)
- filled_actual_tool_calls = self._process_tool_calls(
- predicted_args, registry=use_registry
- )
+ # Convert to CapturedToolCall objects
+ tool_calls = [
+ CapturedToolCall(name=name, args=args)
+ for name, args in filled_actual_tool_calls
+ ]
- # Convert to CapturedToolCall objects
- tool_calls = [
- CapturedToolCall(name=name, args=args)
- for name, args in filled_actual_tool_calls
- ]
+ runs.append(CapturedRun(tool_calls=tool_calls))
+
+ primary_tool_calls = runs[0].tool_calls if runs else []
return CapturedCase(
case_name=case.name,
user_message=case.user_message,
- tool_calls=tool_calls,
+ tool_calls=primary_tool_calls,
system_message=case.system_message if include_context else None,
additional_messages=case.additional_messages if include_context else None,
track_name=track,
+ runs=runs if len(runs) > 1 else [],
)
# Capture regular cases (using default registry)
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
index fc2027e1..66f3f71e 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
@@ -6,6 +6,7 @@ multiple tool tracks with track-specific expected results and critics.
from __future__ import annotations
+from collections.abc import Sequence
from typing import TYPE_CHECKING, Any
from arcade_evals._evalsuite._types import (
@@ -45,7 +46,7 @@ class ComparativeCaseBuilder:
name: str,
user_message: str,
system_message: str = "",
- additional_messages: list[dict[str, str]] | None = None,
+ additional_messages: list[dict[str, Any]] | None = None,
rubric: EvalRubric | None = None,
) -> None:
"""Initialize the builder.
@@ -70,7 +71,7 @@ class ComparativeCaseBuilder:
def for_track(
self,
track_name: str,
- expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall],
+ expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall],
critics: list[Critic] | None = None,
) -> ComparativeCaseBuilder:
"""Add track-specific configuration.
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
index a0e69251..bf8c7a16 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
@@ -7,17 +7,25 @@ allowing the same cases to be run against multiple tool tracks.
from __future__ import annotations
import asyncio
+import logging
import time
from typing import TYPE_CHECKING, Any
from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder
-from arcade_evals._evalsuite._types import ComparativeCase, EvalRubric
+from arcade_evals._evalsuite._types import (
+ _VALID_PASS_RULES,
+ PASS_RULE_LAST,
+ ComparativeCase,
+ EvalRubric,
+)
if TYPE_CHECKING:
from arcade_evals._evalsuite._providers import ProviderName
from arcade_evals._evalsuite._tool_registry import EvalSuiteToolRegistry
from arcade_evals._evalsuite._tracks import TrackManager
+logger = logging.getLogger(__name__)
+
class _EvalSuiteComparativeMixin:
"""Mixin providing comparative evaluation execution methods."""
@@ -36,12 +44,26 @@ class _EvalSuiteComparativeMixin:
_run_openai: Any # Method from EvalSuite
_run_anthropic: Any # Method from EvalSuite
+ async def _run_case_with_stats(
+ self,
+ case: Any,
+ client: Any,
+ model: str,
+ provider: ProviderName,
+ *,
+ num_runs: int,
+ seed: str | int | None,
+ pass_rule: str,
+ registry: EvalSuiteToolRegistry | None = None,
+ ) -> dict[str, Any]:
+ raise NotImplementedError # Implemented in EvalSuite
+
def add_comparative_case(
self,
name: str,
user_message: str,
system_message: str | None = None,
- additional_messages: list[dict[str, str]] | None = None,
+ additional_messages: list[dict[str, Any]] | None = None,
rubric: EvalRubric | None = None,
) -> ComparativeCaseBuilder:
"""Create a comparative case that runs against multiple tool tracks.
@@ -90,6 +112,9 @@ class _EvalSuiteComparativeMixin:
client: Any,
model: str,
provider: ProviderName = "openai",
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
+ multi_run_pass_rule: str = PASS_RULE_LAST,
) -> dict[str, dict[str, Any]]:
"""Run comparative cases across all configured tracks.
@@ -97,6 +122,9 @@ class _EvalSuiteComparativeMixin:
client: The LLM client instance.
model: The model to evaluate.
provider: The provider name.
+ num_runs: Number of runs per case.
+ seed: Seed policy ("constant", "random", or an integer seed).
+ multi_run_pass_rule: How to determine pass/warn for multi-run cases.
Returns:
Dictionary mapping track names to their results.
@@ -116,6 +144,15 @@ class _EvalSuiteComparativeMixin:
"No comparative cases defined. Use add_comparative_case() to add cases."
)
+ # Validate upfront before making any API calls
+ if num_runs < 1:
+ raise ValueError("num_runs must be >= 1")
+ if multi_run_pass_rule not in _VALID_PASS_RULES:
+ raise ValueError(
+ f"Invalid multi-run pass rule '{multi_run_pass_rule}'. "
+ f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+ )
+
# Build and validate all cases upfront
comparative_cases: list[ComparativeCase] = []
all_required_tracks: set[str] = set()
@@ -183,27 +220,21 @@ class _EvalSuiteComparativeMixin:
) -> dict[str, Any]:
async with semaphore:
start = time.time()
- print(f" [TASK START] {_case.name} @ {_t_name}", flush=True)
- if provider == "anthropic":
- predicted_args = await self._run_anthropic(
- client, model, _case, registry=_reg
- )
- else:
- predicted_args = await self._run_openai(
- client, model, _case, registry=_reg
- )
+ logger.debug("[TASK START] %s @ %s", _case.name, _t_name)
+ case_result = await self._run_case_with_stats(
+ _case,
+ client,
+ model,
+ provider,
+ num_runs=num_runs,
+ seed=seed,
+ pass_rule=multi_run_pass_rule,
+ registry=_reg,
+ )
elapsed = time.time() - start
- print(
- f" [TASK DONE] {_case.name} @ {_t_name} ({elapsed:.1f}s)",
- flush=True,
- )
+ logger.debug("[TASK DONE] %s @ %s (%.1fs)", _case.name, _t_name, elapsed)
- filled_actual_tool_calls = self._process_tool_calls(
- predicted_args, registry=_reg
- )
- evaluation = _case.evaluate(filled_actual_tool_calls)
-
- return {
+ result = {
"name": _case.name,
"track": _t_name,
"input": _case.user_message,
@@ -215,10 +246,15 @@ class _EvalSuiteComparativeMixin:
],
"predicted_tool_calls": [
{"name": name, "args": args}
- for name, args in filled_actual_tool_calls
+ for name, args in case_result["predicted_tool_calls"]
],
- "evaluation": evaluation,
+ "evaluation": case_result["evaluation"],
}
+ if num_runs > 1:
+ result["run_stats"] = case_result["run_stats"]
+ if case_result["critic_stats"]:
+ result["critic_stats"] = case_result["critic_stats"]
+ return result
task = run_track_case(eval_case, registry, track_name)
tasks.append((track_name, task))
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
index a43063d5..1aa29303 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
@@ -6,9 +6,44 @@ eval.py and the _evalsuite submodules, avoiding circular imports.
from __future__ import annotations
+from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Callable
+DEFAULT_EVAL_SEED = 42
+
+# Pass-rule constants (shared across eval.py & _comparative_execution.py)
+PASS_RULE_LAST = "last" # noqa: S105
+PASS_RULE_MEAN = "mean" # noqa: S105
+PASS_RULE_MAJORITY = "majority" # noqa: S105
+_VALID_PASS_RULES: frozenset[str] = frozenset({PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY})
+
+
+def _resolve_seed_spec(seed: str | int | None) -> tuple[str, int | None]:
+ """Resolve a seed specification into a (policy, value) pair.
+
+ Args:
+ seed: 'constant', 'random', an integer, a numeric string, or None.
+
+ Returns:
+ A tuple of (policy_name, seed_value). policy_name is one of
+ 'constant', 'random', or 'custom'.
+ """
+ if seed is None:
+ return "constant", DEFAULT_EVAL_SEED
+ if isinstance(seed, int):
+ return "custom", seed
+ seed_value = seed.strip().lower()
+ if seed_value == "constant":
+ return "constant", DEFAULT_EVAL_SEED
+ if seed_value == "random":
+ return "random", None
+ try:
+ return "custom", int(seed_value)
+ except ValueError as exc:
+ raise ValueError("Invalid seed. Use 'constant', 'random', or an integer value.") from exc
+
+
if TYPE_CHECKING:
from arcade_evals.critic import Critic
@@ -117,7 +152,7 @@ class TrackConfig:
critics: Critics to evaluate tool arguments for this track.
"""
- expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall]
+ expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall]
critics: list[Critic] = field(default_factory=list)
@@ -140,14 +175,14 @@ class ComparativeCase:
name: str
user_message: str
system_message: str = ""
- additional_messages: list[dict[str, str]] = field(default_factory=list)
+ additional_messages: list[dict[str, Any]] = field(default_factory=list)
rubric: EvalRubric | None = None
track_configs: dict[str, TrackConfig] = field(default_factory=dict)
def add_track_config(
self,
track_name: str,
- expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall],
+ expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall],
critics: list[Critic] | None = None,
) -> None:
"""Add configuration for a track.
diff --git a/libs/arcade-evals/arcade_evals/capture.py b/libs/arcade-evals/arcade_evals/capture.py
index d5ad4aeb..19b98033 100644
--- a/libs/arcade-evals/arcade_evals/capture.py
+++ b/libs/arcade-evals/arcade_evals/capture.py
@@ -38,6 +38,22 @@ class CapturedToolCall:
return {"name": self.name, "args": self.args}
+@dataclass
+class CapturedRun:
+ """
+ A single capture run for a case, containing tool calls.
+
+ Attributes:
+ tool_calls: List of tool calls made by the model in this run.
+ """
+
+ tool_calls: list[CapturedToolCall] = field(default_factory=list)
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary for JSON serialization."""
+ return {"tool_calls": [tc.to_dict() for tc in self.tool_calls]}
+
+
@dataclass
class CapturedCase:
"""
@@ -50,6 +66,7 @@ class CapturedCase:
system_message: The system message (included if include_context is True).
additional_messages: Additional messages (included if include_context is True).
track_name: The track name for comparative captures (None for regular cases).
+ runs: Optional list of runs (populated when num_runs > 1).
"""
case_name: str
@@ -58,6 +75,7 @@ class CapturedCase:
system_message: str | None = None
additional_messages: list[dict[str, Any]] | None = None
track_name: str | None = None
+ runs: list[CapturedRun] = field(default_factory=list)
@staticmethod
def _try_parse_json(value: str) -> Any:
@@ -109,6 +127,8 @@ class CapturedCase:
"user_message": self.user_message,
"tool_calls": [tc.to_dict() for tc in self.tool_calls],
}
+ if self.runs:
+ result["runs"] = [run.to_dict() for run in self.runs]
if self.track_name:
result["track_name"] = self.track_name
if include_context:
@@ -159,17 +179,32 @@ class CaptureResult:
async def _capture_with_openai(
- suite: EvalSuite, api_key: str, model: str, include_context: bool = False
+ suite: EvalSuite,
+ api_key: str,
+ model: str,
+ include_context: bool = False,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
) -> CaptureResult:
"""Run capture mode with OpenAI client."""
async with AsyncOpenAI(api_key=api_key) as client:
return await suite.capture(
- client, model, provider="openai", include_context=include_context
+ client,
+ model,
+ provider="openai",
+ include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
async def _capture_with_anthropic(
- suite: EvalSuite, api_key: str, model: str, include_context: bool = False
+ suite: EvalSuite,
+ api_key: str,
+ model: str,
+ include_context: bool = False,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
) -> CaptureResult:
"""Run capture mode with Anthropic client."""
try:
@@ -182,5 +217,10 @@ async def _capture_with_anthropic(
async with AsyncAnthropic(api_key=api_key) as client:
return await suite.capture(
- client, model, provider="anthropic", include_context=include_context
+ client,
+ model,
+ provider="anthropic",
+ include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
diff --git a/libs/arcade-evals/arcade_evals/eval.py b/libs/arcade-evals/arcade_evals/eval.py
index 27d926a6..a8dc9e09 100644
--- a/libs/arcade-evals/arcade_evals/eval.py
+++ b/libs/arcade-evals/arcade_evals/eval.py
@@ -3,7 +3,10 @@ import functools
import inspect
import json
import logging
+import random
+from collections.abc import Sequence
from dataclasses import dataclass, field
+from statistics import mean, pstdev
from typing import TYPE_CHECKING, Any, Callable
import numpy as np
@@ -24,11 +27,16 @@ from arcade_evals._evalsuite._tracks import TrackManager
# Import shared types from _types module (breaks circular dependencies)
from arcade_evals._evalsuite._types import (
+ _VALID_PASS_RULES,
+ PASS_RULE_LAST,
+ PASS_RULE_MAJORITY,
+ PASS_RULE_MEAN,
AnyExpectedToolCall,
EvalRubric,
ExpectedMCPToolCall,
ExpectedToolCall,
NamedExpectedToolCall,
+ _resolve_seed_spec,
)
from arcade_evals.critic import NoneCritic
from arcade_evals.weights import validate_and_normalize_critic_weights
@@ -140,6 +148,88 @@ class EvaluationResult:
self.score = total_score / total_weight if total_weight > 0 else 0.0
+# PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY, and _VALID_PASS_RULES
+# are imported from _types (see top-level imports) to keep a single source of truth.
+
+
+def _compute_mean_std(values: list[float]) -> tuple[float, float]:
+ if not values:
+ return 0.0, 0.0
+ avg = mean(values)
+ if len(values) < 2:
+ return avg, 0.0
+ return avg, pstdev(values)
+
+
+def _resolve_pass_rule(
+ run_evaluations: list[EvaluationResult],
+ mean_score: float,
+ pass_rule: str,
+ rubric: EvalRubric,
+) -> tuple[bool, bool]:
+ if pass_rule not in _VALID_PASS_RULES:
+ raise ValueError(
+ f"Invalid multi-run pass rule '{pass_rule}'. "
+ f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+ )
+ if not run_evaluations:
+ return False, False
+ if pass_rule == PASS_RULE_MEAN:
+ passed = mean_score >= rubric.fail_threshold
+ warning = not passed and mean_score >= rubric.warn_threshold
+ return passed, warning
+ if pass_rule == PASS_RULE_MAJORITY:
+ majority = len(run_evaluations) // 2 + 1
+ passed_count = sum(1 for ev in run_evaluations if ev.passed)
+ warned_count = sum(1 for ev in run_evaluations if ev.warning)
+ if passed_count >= majority:
+ return True, False
+ if (passed_count + warned_count) >= majority:
+ return False, True
+ return False, False
+ last_eval = run_evaluations[-1]
+ return last_eval.passed, last_eval.warning
+
+
+def _aggregate_critic_stats(
+ run_field_scores: list[dict[str, dict[str, float]]],
+) -> dict[str, dict[str, Any]]:
+ if not run_field_scores:
+ return {}
+ all_fields: set[str] = set()
+ for field_scores in run_field_scores:
+ all_fields.update(field_scores.keys())
+
+ critic_stats: dict[str, dict[str, Any]] = {}
+ for critic_field in sorted(all_fields):
+ weighted_scores = [
+ run_scores.get(critic_field, {}).get("score", 0.0) for run_scores in run_field_scores
+ ]
+ weights = [
+ run_scores.get(critic_field, {}).get("weight", 0.0) for run_scores in run_field_scores
+ ]
+ normalized_scores = [
+ (score / weight) if weight > 0 else 0.0
+ for score, weight in zip(weighted_scores, weights)
+ ]
+ avg, std_dev = _compute_mean_std(weighted_scores)
+ avg_norm, std_norm = _compute_mean_std(normalized_scores)
+ non_zero_weights = [w for w in weights if w > 0]
+ # Use mean of non-zero weights as the representative weight.
+ # Weights are typically constant across runs, but mean handles edge cases.
+ representative_weight = mean(non_zero_weights) if non_zero_weights else 0.0
+ critic_stats[critic_field] = {
+ "run_scores": weighted_scores,
+ "mean_score": avg,
+ "std_deviation": std_dev,
+ "run_scores_normalized": normalized_scores,
+ "mean_score_normalized": avg_norm,
+ "std_deviation_normalized": std_norm,
+ "weight": representative_weight,
+ }
+ return critic_stats
+
+
# Import capture mode helpers (defined in capture.py to keep this file focused)
from arcade_evals.capture import ( # noqa: E402
_capture_with_anthropic,
@@ -167,7 +257,7 @@ class EvalCase:
user_message: str
expected_tool_calls: list[NamedExpectedToolCall]
critics: list["Critic"] | None = None
- additional_messages: list[dict[str, str]] = field(default_factory=list)
+ additional_messages: list[dict[str, Any]] = field(default_factory=list)
rubric: EvalRubric = field(default_factory=EvalRubric)
def __post_init__(self) -> None:
@@ -520,7 +610,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
expected_tool_calls: list[NamedExpectedToolCall],
rubric: EvalRubric,
critics: list["Critic"],
- additional_messages: list[dict[str, str]],
+ additional_messages: list[dict[str, Any]],
) -> "EvalCase":
"""Factory method to create EvalCase instances.
@@ -540,11 +630,12 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
self,
name: str,
user_message: str,
- expected_tool_calls: list[AnyExpectedToolCall] | list[tuple[Callable, dict[str, Any]]],
+ expected_tool_calls: Sequence[AnyExpectedToolCall]
+ | Sequence[tuple[Callable, dict[str, Any]]],
critics: list["Critic"] | None = None,
system_message: str | None = None,
rubric: EvalRubric | None = None,
- additional_messages: list[dict[str, str]] | None = None,
+ additional_messages: list[dict[str, Any]] | None = None,
) -> None:
"""
Add a new evaluation case to the suite.
@@ -660,7 +751,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
| None = None,
rubric: EvalRubric | None = None,
critics: list["Critic"] | None = None,
- additional_messages: list[dict[str, str]] | None = None,
+ additional_messages: list[dict[str, Any]] | None = None,
) -> None:
"""
Extend the last added case with new information.
@@ -745,11 +836,148 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
processed_calls.append((resolved_name, args_with_defaults))
return processed_calls
+ def _compute_run_field_scores(
+ self, evaluation: EvaluationResult
+ ) -> dict[str, dict[str, float]]:
+ field_scores: dict[str, list[float]] = {}
+ field_weights: dict[str, list[float]] = {}
+ for result in evaluation.results:
+ field = result["field"]
+ if field == "tool_selection":
+ continue
+ field_scores.setdefault(field, []).append(result["score"])
+ field_weights.setdefault(field, []).append(result["weight"])
+
+ run_scores: dict[str, dict[str, float]] = {}
+ for field, scores in field_scores.items():
+ weights = field_weights.get(field, [])
+ run_scores[field] = {
+ "score": mean(scores) if scores else 0.0,
+ "weight": mean(weights) if weights else 0.0,
+ }
+ return run_scores
+
+ async def _run_case_with_stats(
+ self,
+ case: "EvalCase",
+ client: Any,
+ model: str,
+ provider: ProviderName,
+ *,
+ num_runs: int,
+ seed: str | int | None,
+ pass_rule: str,
+ registry: EvalSuiteToolRegistry | None = None,
+ ) -> dict[str, Any]:
+ if num_runs < 1:
+ raise ValueError("num_runs must be >= 1")
+
+ seed_policy, seed_value = _resolve_seed_spec(seed)
+ seed_policy_display = seed_policy
+ if provider == "openai":
+ if seed_policy == "random":
+ run_seeds: list[int | None] = [
+ random.randint(0, 2**31 - 1) # noqa: S311
+ for _ in range(num_runs)
+ ]
+ else:
+ run_seeds = [seed_value for _ in range(num_runs)]
+ else:
+ seed_policy_display = f"{seed_policy} (ignored)"
+ run_seeds = [None for _ in range(num_runs)]
+
+ run_evaluations: list[EvaluationResult] = []
+ run_scores: list[float] = []
+ run_passed: list[bool] = []
+ run_warned: list[bool] = []
+ run_field_scores: list[dict[str, dict[str, float]]] = []
+ last_processed_calls: list[tuple[str, dict[str, Any]]] = []
+ run_details: list[dict[str, Any]] = []
+
+ for run_index in range(num_runs):
+ run_seed = run_seeds[run_index]
+ if provider == "anthropic":
+ predicted_args = await self._run_anthropic(client, model, case, registry=registry)
+ else:
+ predicted_args = await self._run_openai(
+ client, model, case, registry=registry, seed=run_seed
+ )
+
+ processed_calls = self._process_tool_calls(predicted_args, registry=registry)
+ evaluation = case.evaluate(processed_calls)
+
+ run_evaluations.append(evaluation)
+ run_scores.append(evaluation.score)
+ run_passed.append(evaluation.passed)
+ run_warned.append(evaluation.warning)
+ run_field_scores.append(self._compute_run_field_scores(evaluation))
+ last_processed_calls = processed_calls
+ run_details.append({
+ "score": evaluation.score,
+ "passed": evaluation.passed,
+ "warning": evaluation.warning,
+ "failure_reason": evaluation.failure_reason,
+ "details": evaluation.results,
+ })
+
+ mean_score, std_dev = _compute_mean_std(run_scores)
+ passed, warning = _resolve_pass_rule(run_evaluations, mean_score, pass_rule, case.rubric)
+
+ # Determine aggregate failure_reason:
+ # - PASS_RULE_LAST: use the last run's failure reason
+ # - Other rules: if ALL runs failed with the same reason, surface it
+ if not run_evaluations:
+ aggregate_failure_reason = None
+ elif pass_rule == PASS_RULE_LAST:
+ # Only surface failure_reason when the aggregate didn't pass
+ aggregate_failure_reason = run_evaluations[-1].failure_reason if not passed else None
+ elif not passed and not warning:
+ # For non-last rules, surface the failure reason if all runs share the same one
+ failure_reasons = [ev.failure_reason for ev in run_evaluations if ev.failure_reason]
+ unique_reasons = set(failure_reasons)
+ if len(unique_reasons) == 1 and len(failure_reasons) == len(run_evaluations):
+ aggregate_failure_reason = failure_reasons[0]
+ else:
+ aggregate_failure_reason = None
+ else:
+ aggregate_failure_reason = None
+
+ aggregate = EvaluationResult(
+ score=mean_score,
+ passed=passed,
+ warning=warning,
+ results=run_evaluations[-1].results if run_evaluations else [],
+ failure_reason=aggregate_failure_reason,
+ )
+
+ run_stats = {
+ "num_runs": num_runs,
+ "scores": run_scores,
+ "mean_score": mean_score,
+ "std_deviation": std_dev,
+ "passed": run_passed,
+ "warned": run_warned,
+ "seed_policy": seed_policy_display,
+ "run_seeds": run_seeds,
+ "pass_rule": pass_rule,
+ "runs": run_details,
+ }
+
+ return {
+ "evaluation": aggregate,
+ "predicted_tool_calls": last_processed_calls,
+ "run_stats": run_stats,
+ "critic_stats": _aggregate_critic_stats(run_field_scores),
+ }
+
async def run(
self,
client: Any, # AsyncOpenAI | AsyncAnthropic - use Any to avoid import dependency
model: str,
provider: ProviderName = "openai",
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
+ multi_run_pass_rule: str = PASS_RULE_LAST,
) -> dict[str, Any]:
"""
Run the evaluation suite.
@@ -758,10 +986,22 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
client: The LLM client instance (AsyncOpenAI or AsyncAnthropic).
model: The model to evaluate.
provider: The provider name ("openai" or "anthropic").
+ num_runs: Number of runs per case.
+ seed: Seed policy ("constant", "random", or an integer seed); only sent to OpenAI.
+ multi_run_pass_rule: How to determine pass/warn for multi-run cases.
Returns:
A dictionary containing the evaluation results.
"""
+ # Validate upfront before making any API calls
+ if num_runs < 1:
+ raise ValueError("num_runs must be >= 1")
+ if multi_run_pass_rule not in _VALID_PASS_RULES:
+ raise ValueError(
+ f"Invalid multi-run pass rule '{multi_run_pass_rule}'. "
+ f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+ )
+
results: dict[str, Any] = {
"model": model,
"suite_name": self.name,
@@ -779,17 +1019,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
"No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog."
)
- # Get tool calls based on provider
- if provider == "anthropic":
- predicted_args = await self._run_anthropic(client, model, case)
- else:
- predicted_args = await self._run_openai(client, model, case)
-
- # Process tool calls (resolve names, fill defaults)
- filled_actual_tool_calls = self._process_tool_calls(predicted_args)
-
- # Evaluate the case
- evaluation = case.evaluate(filled_actual_tool_calls)
+ case_result = await self._run_case_with_stats(
+ case,
+ client,
+ model,
+ provider,
+ num_runs=num_runs,
+ seed=seed,
+ pass_rule=multi_run_pass_rule,
+ )
# Prepare the result
result = {
@@ -801,10 +1039,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
{"name": tc.name, "args": tc.args} for tc in case.expected_tool_calls
],
"predicted_tool_calls": [
- {"name": name, "args": args} for name, args in filled_actual_tool_calls
+ {"name": name, "args": args}
+ for name, args in case_result["predicted_tool_calls"]
],
- "evaluation": evaluation,
+ "evaluation": case_result["evaluation"],
}
+ if num_runs > 1:
+ result["run_stats"] = case_result["run_stats"]
+ if case_result["critic_stats"]:
+ result["critic_stats"] = case_result["critic_stats"]
return result
tasks = [sem_task(case) for case in self.cases]
@@ -819,6 +1062,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
model: str,
case: "EvalCase",
registry: EvalSuiteToolRegistry | None = None,
+ seed: int | None = None,
) -> list[tuple[str, dict[str, Any]]]:
"""Run evaluation using OpenAI client.
@@ -843,15 +1087,18 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
tools = effective_registry.list_tools_for_model(tool_format="openai")
# Get the model response
- response = await client.chat.completions.create( # type: ignore[arg-type]
- model=model,
- messages=messages,
- tool_choice="auto",
- tools=tools,
- user="eval_user",
- seed=42,
- stream=False,
- )
+ request_params: dict[str, Any] = {
+ "model": model,
+ "messages": messages,
+ "tool_choice": "auto",
+ "tools": tools,
+ "user": "eval_user",
+ "stream": False,
+ }
+ if seed is not None:
+ request_params["seed"] = seed
+
+ response = await client.chat.completions.create(**request_params)
return get_tool_args(response, normalize_names=False)
@@ -985,6 +1232,9 @@ def tool_eval() -> Callable[[Callable], Callable]:
provider: ProviderName = "openai",
capture_mode: bool = False,
include_context: bool = False,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
+ multi_run_pass_rule: str = PASS_RULE_LAST,
) -> list[Any]:
"""
Run evaluation or capture mode.
@@ -1015,19 +1265,43 @@ def tool_eval() -> Callable[[Callable], Callable]:
# Run in capture mode
if provider == "anthropic":
capture_result = await _capture_with_anthropic(
- suite, provider_api_key, model, include_context
+ suite,
+ provider_api_key,
+ model,
+ include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
else:
capture_result = await _capture_with_openai(
- suite, provider_api_key, model, include_context
+ suite,
+ provider_api_key,
+ model,
+ include_context=include_context,
+ num_runs=num_runs,
+ seed=seed,
)
return [capture_result]
else:
# Run in evaluation mode
if provider == "anthropic":
- eval_result = await _run_with_anthropic(suite, provider_api_key, model)
+ eval_result = await _run_with_anthropic(
+ suite,
+ provider_api_key,
+ model,
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
else:
- eval_result = await _run_with_openai(suite, provider_api_key, model)
+ eval_result = await _run_with_openai(
+ suite,
+ provider_api_key,
+ model,
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
# For comparative evaluations, eval_result is already a list of track results
# For regular evaluations, it's a single dict that needs wrapping
@@ -1042,7 +1316,13 @@ def tool_eval() -> Callable[[Callable], Callable]:
async def _run_with_openai(
- suite: "EvalSuite", api_key: str, model: str
+ suite: "EvalSuite",
+ api_key: str,
+ model: str,
+ *,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
+ multi_run_pass_rule: str = PASS_RULE_LAST,
) -> dict[str, Any] | list[dict[str, Any]]:
"""Run evaluation suite with OpenAI client.
@@ -1054,16 +1334,36 @@ async def _run_with_openai(
# Check if this suite has comparative cases
if suite._comparative_case_builders:
# Run comparative evaluation - returns dict[track_name, result]
- track_results = await suite.run_comparative(client, model, provider="openai")
+ track_results = await suite.run_comparative(
+ client,
+ model,
+ provider="openai",
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
# Convert to list of results for consistent handling
return list(track_results.values())
else:
# Run regular evaluation
- return await suite.run(client, model, provider="openai")
+ return await suite.run(
+ client,
+ model,
+ provider="openai",
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
async def _run_with_anthropic(
- suite: "EvalSuite", api_key: str, model: str
+ suite: "EvalSuite",
+ api_key: str,
+ model: str,
+ *,
+ num_runs: int = 1,
+ seed: str | int | None = "constant",
+ multi_run_pass_rule: str = PASS_RULE_LAST,
) -> dict[str, Any] | list[dict[str, Any]]:
"""Run evaluation suite with Anthropic client.
@@ -1083,9 +1383,23 @@ async def _run_with_anthropic(
# Check if this suite has comparative cases
if suite._comparative_case_builders:
# Run comparative evaluation - returns dict[track_name, result]
- track_results = await suite.run_comparative(client, model, provider="anthropic")
+ track_results = await suite.run_comparative(
+ client,
+ model,
+ provider="anthropic",
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
# Convert to list of results for consistent handling
return list(track_results.values())
else:
# Run regular evaluation
- return await suite.run(client, model, provider="anthropic")
+ return await suite.run(
+ client,
+ model,
+ provider="anthropic",
+ num_runs=num_runs,
+ seed=seed,
+ multi_run_pass_rule=multi_run_pass_rule,
+ )
diff --git a/libs/tests/cli/test_capture_formatters.py b/libs/tests/cli/test_capture_formatters.py
index 832ab8e5..fc93b5f4 100644
--- a/libs/tests/cli/test_capture_formatters.py
+++ b/libs/tests/cli/test_capture_formatters.py
@@ -57,6 +57,20 @@ def _create_mock_capture_result(
# Explicitly set track_name to None unless specified (avoids MagicMock)
case.track_name = case_data.get("track_name")
+ # Create mock runs if provided
+ runs = []
+ for run_data in case_data.get("runs", []):
+ run = MagicMock()
+ run_tool_calls = []
+ for tc_data in run_data.get("tool_calls", []):
+ tc = MagicMock()
+ tc.name = tc_data["name"]
+ tc.args = tc_data.get("args", {})
+ run_tool_calls.append(tc)
+ run.tool_calls = run_tool_calls
+ runs.append(run)
+ case.runs = runs
+
# Create mock tool calls
tool_calls = []
for tc_data in case_data.get("tool_calls", []):
@@ -84,6 +98,11 @@ def _create_mock_capture_result(
"user_message": case.user_message,
"tool_calls": [{"name": tc.name, "args": tc.args} for tc in case.tool_calls],
}
+ if case.runs:
+ case_dict["runs"] = [
+ {"tool_calls": [{"name": tc.name, "args": tc.args} for tc in run.tool_calls]}
+ for run in case.runs
+ ]
if include_context:
case_dict["system_message"] = case.system_message
case_dict["additional_messages"] = case.additional_messages
@@ -170,6 +189,29 @@ class TestCaptureJsonFormatter:
assert case["tool_calls"][0]["name"] == "GetWeather"
assert case["tool_calls"][0]["args"]["city"] == "NYC"
+ def test_format_includes_runs(self) -> None:
+ """Test that runs are included when present."""
+ formatter = CaptureJsonFormatter()
+ capture = _create_mock_capture_result(
+ cases=[
+ {
+ "case_name": "multi_run_case",
+ "user_message": "Hello",
+ "tool_calls": [],
+ "runs": [
+ {"tool_calls": [{"name": "A", "args": {"x": 1}}]},
+ {"tool_calls": [{"name": "B", "args": {"x": 2}}]},
+ ],
+ }
+ ]
+ )
+
+ output = formatter.format([capture])
+ parsed = json.loads(output)
+ runs = parsed["captures"][0]["captured_cases"][0]["runs"]
+ assert len(runs) == 2
+ assert runs[0]["tool_calls"][0]["name"] == "A"
+
def test_format_with_context(self) -> None:
"""Test formatting with context included."""
formatter = CaptureJsonFormatter()
@@ -309,6 +351,28 @@ class TestCaptureMarkdownFormatter:
assert "**Total Cases:** 1" in output
assert "**Total Tool Calls:** 1" in output
+ def test_format_includes_runs(self) -> None:
+ """Should include per-run tool calls when runs are present."""
+ formatter = CaptureMarkdownFormatter()
+ capture = _create_mock_capture_result(
+ cases=[
+ {
+ "case_name": "multi_run_case",
+ "user_message": "Hello",
+ "tool_calls": [],
+ "runs": [
+ {"tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}]},
+ {"tool_calls": [{"name": "GetWeather", "args": {"city": "SF"}}]},
+ ],
+ }
+ ]
+ )
+
+ output = formatter.format([capture])
+ assert "Run 1" in output
+ assert "Run 2" in output
+ assert "`GetWeather`" in output
+
class TestCaptureHtmlFormatter:
"""Tests for CaptureHtmlFormatter."""
@@ -607,14 +671,26 @@ class TestMultiModelTextCaptureFormatter:
def test_text_multi_model_output(self) -> None:
"""Should produce multi-model text output."""
capture1 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4o", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4o",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool1", "args": {}}],
+ }
+ ],
)
capture2 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4-turbo", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4-turbo",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool2", "args": {}}],
+ }
+ ],
)
formatter = CaptureTextFormatter()
@@ -647,14 +723,26 @@ class TestMultiModelHtmlCaptureFormatter:
def test_html_multi_model_output(self) -> None:
"""Should produce multi-model HTML output."""
capture1 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4o", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4o",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool1", "args": {}}],
+ }
+ ],
)
capture2 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4-turbo", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4-turbo",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool2", "args": {}}],
+ }
+ ],
)
formatter = CaptureHtmlFormatter()
@@ -687,14 +775,26 @@ class TestMultiModelJsonCaptureFormatter:
def test_json_multi_model_output(self) -> None:
"""Should produce structured multi-model JSON."""
capture1 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4o", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4o",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool1", "args": {}}],
+ }
+ ],
)
capture2 = _create_mock_capture_result(
- suite_name="TestSuite", model="gpt-4-turbo", cases=[
- {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
- ]
+ suite_name="TestSuite",
+ model="gpt-4-turbo",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "Tool2", "args": {}}],
+ }
+ ],
)
formatter = CaptureJsonFormatter()
@@ -795,6 +895,7 @@ def _create_mock_capture_with_tracks(
mock_tc.args = tc["args"]
mock_tool_calls.append(mock_tc)
mock_case.tool_calls = mock_tool_calls
+ mock_case.runs = [] # Explicitly set runs to empty for single-run captures
captured_cases.append(mock_case)
@@ -924,3 +1025,365 @@ class TestCaptureWithTracks:
# Should include track info in markdown
assert "[track_a]" in output or "track_a" in output
+
+
+# =====================================================================
+# Capture formatter multi-run tests
+# =====================================================================
+
+
+def _create_mock_capture_with_runs(
+ num_runs: int = 3,
+) -> CaptureResult:
+ """Create a mock CaptureResult with multiple runs per case."""
+ cases = [
+ {
+ "case_name": "multi_run_case",
+ "user_message": "What's the weather in NYC?",
+ "tool_calls": [
+ {"name": "GetWeather", "args": {"city": "NYC"}},
+ ],
+ "system_message": "You are a weather assistant",
+ "additional_messages": [],
+ "runs": [
+ {
+ "tool_calls": [
+ {"name": "GetWeather", "args": {"city": "NYC", "seed": str(i)}},
+ ]
+ }
+ for i in range(1, num_runs + 1)
+ ],
+ }
+ ]
+
+ return _create_mock_capture_result(
+ suite_name="MultiRunCaptureSuite",
+ cases=cases,
+ )
+
+
+def _create_mock_capture_no_runs() -> CaptureResult:
+ """Create a mock CaptureResult with a case that has no tool calls and no runs."""
+ cases = [
+ {
+ "case_name": "empty_case",
+ "user_message": "Do nothing",
+ "tool_calls": [],
+ "system_message": None,
+ "additional_messages": [],
+ }
+ ]
+ return _create_mock_capture_result(
+ suite_name="EmptyCaptureSuite",
+ cases=cases,
+ )
+
+
+class TestCaptureMultiRunText:
+ """Tests for multi-run capture in the text formatter."""
+
+ def test_text_shows_run_headers(self) -> None:
+ """Text capture output should show 'Run 1', 'Run 2', etc."""
+ capture = _create_mock_capture_with_runs(num_runs=3)
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture])
+ assert "Run 1:" in output
+ assert "Run 2:" in output
+ assert "Run 3:" in output
+
+ def test_text_shows_tool_calls_per_run(self) -> None:
+ """Each run should display its tool calls."""
+ capture = _create_mock_capture_with_runs(num_runs=2)
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture])
+ assert "GetWeather" in output
+
+ def test_text_no_runs_shows_top_level_calls(self) -> None:
+ """When runs is empty, should fall through to top-level tool_calls."""
+ capture = _create_mock_capture_result() # default: no runs
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture])
+ assert "GetWeather" in output
+
+ def test_text_empty_case_no_tool_calls(self) -> None:
+ """Case with no tool calls should show appropriate message."""
+ capture = _create_mock_capture_no_runs()
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture])
+ assert "no tool calls" in output.lower()
+
+
+class TestCaptureMultiRunMarkdown:
+ """Tests for multi-run capture in the markdown formatter."""
+
+ def test_markdown_shows_run_headers(self) -> None:
+ """Markdown capture should show run headers."""
+ capture = _create_mock_capture_with_runs(num_runs=3)
+ formatter = CaptureMarkdownFormatter()
+ output = formatter.format([capture])
+ assert "Run 1" in output
+ assert "Run 2" in output
+ assert "Run 3" in output
+
+ def test_markdown_shows_tool_call_json(self) -> None:
+ """Markdown capture should show tool call args as JSON."""
+ capture = _create_mock_capture_with_runs(num_runs=2)
+ formatter = CaptureMarkdownFormatter()
+ output = formatter.format([capture])
+ assert "```json" in output
+ assert "GetWeather" in output
+
+ def test_markdown_empty_runs_shows_no_calls(self) -> None:
+ """Markdown capture with no tool calls shows appropriate message."""
+ capture = _create_mock_capture_no_runs()
+ formatter = CaptureMarkdownFormatter()
+ output = formatter.format([capture])
+ assert "No tool calls" in output
+
+
+class TestCaptureMultiRunHTML:
+ """Tests for multi-run capture in the HTML formatter."""
+
+ def test_html_shows_capture_run_details(self) -> None:
+ """HTML capture should show capture-run details elements."""
+ capture = _create_mock_capture_with_runs(num_runs=3)
+ formatter = CaptureHtmlFormatter()
+ output = formatter.format([capture])
+ assert "capture-run" in output
+ assert "Run 1" in output
+ assert "Run 2" in output
+ assert "Run 3" in output
+
+ def test_html_tool_calls_escaped(self) -> None:
+ """HTML capture should escape tool call content."""
+ capture = _create_mock_capture_with_runs(num_runs=1)
+ formatter = CaptureHtmlFormatter()
+ output = formatter.format([capture])
+ assert "GetWeather" in output
+
+ def test_html_empty_case_no_calls(self) -> None:
+ """HTML capture with no tool calls shows appropriate message."""
+ capture = _create_mock_capture_no_runs()
+ formatter = CaptureHtmlFormatter()
+ output = formatter.format([capture])
+ assert "No tool calls" in output or "no-calls" in output
+
+
+class TestCaptureMultiRunJSON:
+ """Tests for multi-run capture in the JSON formatter."""
+
+ def test_json_includes_runs_array(self) -> None:
+ """JSON capture should include runs array for multi-run cases."""
+ capture = _create_mock_capture_with_runs(num_runs=3)
+ formatter = CaptureJsonFormatter()
+ output = formatter.format([capture])
+ data = json.loads(output)
+ captures = data["captures"]
+ assert len(captures) == 1
+ case = captures[0]["captured_cases"][0]
+ assert "runs" in case
+ assert len(case["runs"]) == 3
+
+ def test_json_no_runs_for_single_run(self) -> None:
+ """JSON capture should not include runs for single-run cases."""
+ capture = _create_mock_capture_result() # default: no runs
+ formatter = CaptureJsonFormatter()
+ output = formatter.format([capture])
+ data = json.loads(output)
+ case = data["captures"][0]["captured_cases"][0]
+ assert "runs" not in case
+
+ def test_json_run_tool_calls_structure(self) -> None:
+ """Each run in JSON should have tool_calls with name and args."""
+ capture = _create_mock_capture_with_runs(num_runs=2)
+ formatter = CaptureJsonFormatter()
+ output = formatter.format([capture])
+ data = json.loads(output)
+ run = data["captures"][0]["captured_cases"][0]["runs"][0]
+ assert "tool_calls" in run
+ assert run["tool_calls"][0]["name"] == "GetWeather"
+
+
+# =====================================================================
+# Coverage gap tests — CaptureTextFormatter
+# =====================================================================
+
+
+class TestCaptureTextFormatterCoverageGaps:
+ """Tests for CaptureTextFormatter methods that lacked coverage."""
+
+ def test_format_value_truncation(self) -> None:
+ """_format_value should truncate values longer than 60 chars."""
+ formatter = CaptureTextFormatter()
+ short = formatter._format_value("hello")
+ assert short == "hello"
+
+ long_val = "x" * 100
+ truncated = formatter._format_value(long_val)
+ assert len(truncated) == 60
+ assert truncated.endswith("...")
+
+ def test_format_value_exactly_60(self) -> None:
+ """_format_value should NOT truncate values of exactly 60 chars."""
+ formatter = CaptureTextFormatter()
+ exact = "a" * 60
+ result = formatter._format_value(exact)
+ assert result == exact
+
+ def test_conversation_text_format(self) -> None:
+ """CaptureTextFormatter._format_conversation_text should format messages."""
+ formatter = CaptureTextFormatter()
+ messages = [
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": "Hi!"},
+ {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [{"function": {"name": "get_data", "arguments": '{"id": 1}'}}],
+ },
+ {"role": "tool", "name": "get_data", "content": '{"result": "ok"}'},
+ ]
+ lines = formatter._format_conversation_text(messages)
+ text = "\n".join(lines)
+
+ assert "[USER]" in text
+ assert "[ASSISTANT]" in text
+ assert "[TOOL]" in text
+ assert "get_data" in text
+ assert "Hello" in text
+
+ def test_conversation_text_invalid_json_content(self) -> None:
+ """Should gracefully handle non-JSON tool content."""
+ formatter = CaptureTextFormatter()
+ messages = [
+ {"role": "tool", "name": "raw", "content": "plain text output"},
+ ]
+ lines = formatter._format_conversation_text(messages)
+ text = "\n".join(lines)
+
+ assert "plain text output" in text
+
+ def test_conversation_text_invalid_json_args(self) -> None:
+ """Should gracefully handle non-JSON tool call arguments."""
+ formatter = CaptureTextFormatter()
+ messages = [
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+ },
+ ]
+ lines = formatter._format_conversation_text(messages)
+ text = "\n".join(lines)
+
+ assert "broken" in text
+ assert "not json" in text
+
+ def test_conversation_text_separator_between_messages(self) -> None:
+ """Should add separator between messages (not before first)."""
+ formatter = CaptureTextFormatter()
+ messages = [
+ {"role": "user", "content": "First"},
+ {"role": "assistant", "content": "Second"},
+ ]
+ lines = formatter._format_conversation_text(messages)
+ text = "\n".join(lines)
+
+ # Separator should appear between messages
+ assert "----" in text
+
+ def test_multi_model_with_tracks_and_context(self) -> None:
+ """Multi-model capture with tracks should render correctly with context."""
+ capture1 = _create_mock_capture_with_tracks(model="gpt-4o")
+ capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo")
+
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture1, capture2], include_context=True)
+
+ assert "MULTI-MODEL CAPTURE RESULTS" in output
+ assert "gpt-4o" in output
+ assert "gpt-4-turbo" in output
+ # Should show track sections
+ assert "TRACK:" in output or "track_a" in output
+
+ def test_multi_model_no_data_model(self) -> None:
+ """Multi-model capture should handle a model with no data for a case."""
+ # Model A has case1, model B has case1 with different tools
+ capture1 = _create_mock_capture_result(
+ suite_name="Suite",
+ model="model-a",
+ cases=[
+ {
+ "case_name": "case1",
+ "user_message": "Hi",
+ "tool_calls": [{"name": "T1", "args": {}}],
+ }
+ ],
+ )
+ capture2 = _create_mock_capture_result(
+ suite_name="Suite",
+ model="model-b",
+ cases=[{"case_name": "case1", "user_message": "Hi", "tool_calls": []}],
+ )
+
+ formatter = CaptureTextFormatter()
+ output = formatter.format([capture1, capture2])
+
+ assert "model-a" in output
+ assert "model-b" in output
+ assert "MULTI-MODEL CAPTURE RESULTS" in output
+
+
+# =====================================================================
+# Coverage gap tests — CaptureMarkdownFormatter
+# =====================================================================
+
+
+class TestCaptureMarkdownFormatterCoverageGaps:
+ """Tests for CaptureMarkdownFormatter methods that lacked coverage."""
+
+ def test_multi_model_with_tracks_and_context(self) -> None:
+ """Multi-model markdown capture with tracks should render correctly."""
+ capture1 = _create_mock_capture_with_tracks(model="gpt-4o")
+ capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo")
+
+ formatter = CaptureMarkdownFormatter()
+ output = formatter.format([capture1, capture2], include_context=True)
+
+ assert "Multi-Model Capture Results" in output
+ assert "gpt-4o" in output
+ assert "gpt-4-turbo" in output
+
+ def test_conversation_md_standalone(self) -> None:
+ """CaptureMarkdownFormatter._format_conversation_md should format messages."""
+ formatter = CaptureMarkdownFormatter()
+ messages = [
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": "Hi!"},
+ {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "x"}'}}],
+ },
+ {"role": "tool", "name": "search", "content": '{"r": 1}'},
+ ]
+ lines = formatter._format_conversation_md(messages)
+ text = "\n".join(lines)
+
+ assert "👤" in text or "User" in text
+ assert "search" in text
+
+ def test_conversation_md_invalid_json(self) -> None:
+ """Should handle invalid JSON in tool call args."""
+ formatter = CaptureMarkdownFormatter()
+ messages = [
+ {
+ "role": "assistant",
+ "content": None,
+ "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+ },
+ ]
+ lines = formatter._format_conversation_md(messages)
+ text = "\n".join(lines)
+
+ assert "broken" in text
diff --git a/libs/tests/cli/test_evals_runner.py b/libs/tests/cli/test_evals_runner.py
index af470a26..a1ef380e 100644
--- a/libs/tests/cli/test_evals_runner.py
+++ b/libs/tests/cli/test_evals_runner.py
@@ -1,5 +1,6 @@
"""Tests for evals_runner error handling."""
+from typing import Any, cast
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@@ -14,6 +15,9 @@ from arcade_cli.evals_runner import (
run_evaluations,
)
from arcade_cli.utils import ModelSpec, Provider
+from arcade_evals import CaptureResult
+
+RUN_RULE_LAST = "last"
class TestEvalTaskResult:
@@ -68,7 +72,13 @@ class TestCaptureTaskResult:
def test_from_success(self) -> None:
"""Test creating a successful capture result."""
- mock_captures = [MagicMock(), MagicMock()]
+ mock_captures = cast(
+ list[CaptureResult],
+ [
+ MagicMock(spec=CaptureResult),
+ MagicMock(spec=CaptureResult),
+ ],
+ )
result = CaptureTaskResult.from_success("test_suite", "gpt-4o", "openai", mock_captures)
assert result.success is True
assert result.suite_name == "test_suite"
@@ -107,6 +117,9 @@ class TestRunEvalTask:
suite_func=mock_suite,
model_spec=model_spec,
max_concurrent=1,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
assert result.success is True
@@ -126,9 +139,13 @@ class TestRunEvalTask:
suite_func=mock_suite,
model_spec=model_spec,
max_concurrent=1,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
assert result.success is False
+ assert result.error is not None
assert "API error" in result.error
assert result.error_type == "ValueError"
assert result.result is None
@@ -145,6 +162,9 @@ class TestRunEvalTask:
model_spec=model_spec,
max_concurrent=5,
include_context=False,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
mock_suite.assert_called_once_with(
@@ -153,6 +173,9 @@ class TestRunEvalTask:
max_concurrency=5,
provider="anthropic",
include_context=False,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
@@ -172,6 +195,8 @@ class TestRunCaptureTask:
model_spec=model_spec,
max_concurrent=1,
include_context=True,
+ num_runs=1,
+ seed="constant",
)
assert result.success is True
@@ -189,9 +214,12 @@ class TestRunCaptureTask:
model_spec=model_spec,
max_concurrent=1,
include_context=False,
+ num_runs=1,
+ seed="constant",
)
assert result.success is False
+ assert result.error is not None
assert "Network failed" in result.error
assert result.error_type == "ConnectionError"
@@ -207,6 +235,8 @@ class TestRunCaptureTask:
model_spec=model_spec,
max_concurrent=2,
include_context=True,
+ num_runs=1,
+ seed="constant",
)
mock_suite.assert_called_once_with(
@@ -216,6 +246,8 @@ class TestRunCaptureTask:
provider="openai",
capture_mode=True,
include_context=True,
+ num_runs=1,
+ seed="constant",
)
@@ -253,6 +285,9 @@ class TestRunEvaluationsErrorHandling:
output_format="txt",
failed_only=False,
console=console,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
# Verify both were attempted
@@ -277,6 +312,9 @@ class TestRunEvaluationsErrorHandling:
output_format="txt",
failed_only=False,
console=console,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
# Should print "No evaluations completed successfully" (with emoji)
@@ -302,6 +340,9 @@ class TestRunEvaluationsErrorHandling:
output_format="txt",
failed_only=False,
console=console,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
# Check that failure count is printed
@@ -327,6 +368,9 @@ class TestRunEvaluationsErrorHandling:
output_format="txt",
failed_only=False,
console=console,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
# Check that no failure warning is printed
@@ -338,7 +382,7 @@ class TestRunEvaluationsErrorHandling:
"""Test partial failure with multiple models."""
# Suite that fails on one model but succeeds on another
- async def conditional_suite(**kwargs):
+ async def conditional_suite(**kwargs: Any) -> MagicMock:
if kwargs["model"] == "bad-model":
raise RuntimeError("Model not supported")
return MagicMock()
@@ -371,6 +415,9 @@ class TestRunEvaluationsErrorHandling:
output_format="txt",
failed_only=False,
console=console,
+ num_runs=1,
+ seed="constant",
+ multi_run_pass_rule=RUN_RULE_LAST,
)
# Should have been called twice
@@ -397,6 +444,8 @@ class TestRunCaptureErrorHandling:
output_file=None,
output_format="json",
console=console,
+ num_runs=1,
+ seed="constant",
)
# Error message includes emoji
@@ -436,6 +485,8 @@ class TestRunCaptureErrorHandling:
output_file=None,
output_format="json",
console=console,
+ num_runs=1,
+ seed="constant",
)
# Both should have been attempted
@@ -463,6 +514,8 @@ class TestRunCaptureErrorHandling:
output_file=None,
output_format="json",
console=console,
+ num_runs=1,
+ seed="constant",
)
# Check error details are printed
diff --git a/libs/tests/cli/test_formatter_edge_cases.py b/libs/tests/cli/test_formatter_edge_cases.py
index 50ca1e1a..9838f68e 100644
--- a/libs/tests/cli/test_formatter_edge_cases.py
+++ b/libs/tests/cli/test_formatter_edge_cases.py
@@ -140,11 +140,16 @@ class TestFormatterEdgeCases:
formatter = HtmlFormatter()
output = formatter.format(results)
- # Should NOT contain raw script tags or other unescaped HTML
- assert "