diff --git a/examples/evals/README.md b/examples/evals/README.md index cf1371f6..118c12a7 100644 --- a/examples/evals/README.md +++ b/examples/evals/README.md @@ -40,7 +40,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \ # Compare multiple models arcade evals examples/evals/eval_stdio_mcp_server.py \ - -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \ + -p openai:gpt-4o \ + -p anthropic:claude-sonnet-4-5-20250929 \ -k openai:YOUR_OPENAI_KEY \ -k anthropic:YOUR_ANTHROPIC_KEY @@ -205,7 +206,8 @@ export ARCADE_API_KEY=your_key export ARCADE_USER_ID=your_user_id arcade evals examples/evals/eval_comprehensive_comparison.py \ - -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \ + -p openai:gpt-4o \ + -p anthropic:claude-sonnet-4-5-20250929 \ -k openai:YOUR_KEY \ -k anthropic:YOUR_KEY \ -o comparison.html -d @@ -213,19 +215,22 @@ arcade evals examples/evals/eval_comprehensive_comparison.py \ ## 🎯 CLI Reference -### New v2.0.0 Flags +### Flags -| Flag | Short | Description | Example | -| --------------------- | ------- | -------------------------------------------------- | ------------------------------------------------- | -| `--use-provider` | `-p` | Provider(s) and models (space-separated) | `-p "openai:gpt-4o anthropic:claude-sonnet"` | -| `--api-key` | `-k` | API key in`provider:key` format (repeatable) | `-k openai:sk-... -k anthropic:sk-ant-...` | -| `--output` | `-o` | Output file (auto-detects format from extension) | `-o results.html` or `-o results` (all formats) | -| `--only-failed` | `-f` | Show only failed evaluations | `--only-failed` | -| `--include-context` | | Include system messages and conversation history | `--include-context` | -| `--details` | `-d` | Show detailed output | `-d` | -| `--max-concurrent` | | Max concurrent evaluations | `--max-concurrent 5` | -| `--capture` | | Capture mode (record tool calls without scoring) | `--capture` | +| Flag | Short | Description | Example | +| ----------------------- | ----- | ----------------------------------------------------- | --------------------------------------------------- | +| `--use-provider` | `-p` | Provider and models (repeatable) | `-p openai:gpt-4o -p anthropic:claude-sonnet` | +| `--api-key` | `-k` | API key in `provider:key` format (repeatable) | `-k openai:sk-... -k anthropic:sk-ant-...` | +| `--output` | `-o` | Output file (auto-detects format from extension) | `-o results.html` or `-o results` (all formats) | +| `--only-failed` | `-f` | Show only failed evaluations | `--only-failed` | +| `--include-context` | | Include system messages and conversation history | `--include-context` | +| `--details` | `-d` | Show detailed output | `-d` | +| `--max-concurrent` | | Max concurrent evaluations | `--max-concurrent 5` | +| `--capture` | | Capture mode (record tool calls without scoring) | `--capture` | +| `--num-runs` | `-n` | Number of runs per case (default: 1) | `-n 5` | +| `--seed` | | Seed policy: `constant`, `random`, or an integer | `--seed random` or `--seed 42` | +| `--multi-run-pass-rule` | | Aggregation rule: `last` (default), `mean`, `majority`| `--multi-run-pass-rule majority` | ### Provider & Model Selection @@ -238,14 +243,15 @@ arcade evals eval_file.py -p openai -k openai:YOUR_KEY **Single provider with specific models:** ```bash -arcade evals eval_file.py -p "openai:gpt-4o,gpt-4o-mini" -k openai:YOUR_KEY +arcade evals eval_file.py -p openai:gpt-4o,gpt-4o-mini -k openai:YOUR_KEY ``` -**Multiple providers (space-separated):** +**Multiple providers (use separate `-p` flags):** ```bash arcade evals eval_file.py \ - -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \ + -p openai:gpt-4o \ + -p anthropic:claude-sonnet-4-5-20250929 \ -k openai:YOUR_KEY \ -k anthropic:YOUR_KEY ``` @@ -279,7 +285,7 @@ arcade evals eval_file.py \ ```bash arcade evals examples/evals/eval_arcade_gateway.py \ - -p "openai:gpt-4o,gpt-4o-mini,gpt-3.5-turbo" \ + -p openai:gpt-4o,gpt-4o-mini \ -k openai:YOUR_KEY \ -o comparison.html -d ``` @@ -288,7 +294,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \ ```bash arcade evals examples/evals/eval_stdio_mcp_server.py \ - -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \ + -p openai:gpt-4o \ + -p anthropic:claude-sonnet-4-5-20250929 \ -k openai:YOUR_OPENAI_KEY \ -k anthropic:YOUR_ANTHROPIC_KEY \ -o battle.html -d @@ -307,7 +314,8 @@ arcade evals examples/evals/eval_http_mcp_server.py \ ```bash # Compare performance across multiple tool sources arcade evals examples/evals/eval_comprehensive_comparison.py \ - -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \ + -p openai:gpt-4o \ + -p anthropic:claude-sonnet-4-5-20250929 \ -k openai:YOUR_KEY \ -k anthropic:YOUR_KEY \ -o comparison.html -d @@ -332,6 +340,46 @@ arcade evals examples/evals/eval_stdio_mcp_server.py \ -o full_results.html -d ``` +### Pattern 7: Multi-Run Evaluation + +Run each case multiple times to measure consistency and reduce variance: + +```bash +# Run each case 5 times with random seeds, pass if majority of runs pass +arcade evals examples/evals/eval_arcade_gateway.py \ + --api-key openai:YOUR_KEY \ + --num-runs 5 \ + --seed random \ + --multi-run-pass-rule majority \ + -o stability.html -d +``` + +The output will include per-case statistics: mean score, standard deviation, +individual run results, and per-critic field breakdowns. + +**Seed policies:** +- `constant` (default) — Uses a fixed seed (42) for reproducible results +- `random` — Uses a different random seed per run for variance testing +- An integer (e.g., `--seed 123`) — Uses the given seed for all runs + +**Pass rules:** +- `last` (default) — Uses the last run's pass/fail result +- `mean` — Passes if mean score meets the rubric threshold +- `majority` — Passes if more than half of the runs pass + +### Pattern 8: Multi-Run Capture Mode + +Capture mode also supports multiple runs: + +```bash +arcade evals examples/evals/eval_arcade_gateway.py \ + --capture \ + --num-runs 3 \ + --seed random \ + --api-key openai:YOUR_KEY \ + -o captured.json +``` + ## 🐛 Troubleshooting ### Error: "No module named 'openai'" diff --git a/libs/arcade-cli/arcade_cli/evals_runner.py b/libs/arcade-cli/arcade_cli/evals_runner.py index 4c16d5ec..ed327325 100644 --- a/libs/arcade-cli/arcade_cli/evals_runner.py +++ b/libs/arcade-cli/arcade_cli/evals_runner.py @@ -159,6 +159,9 @@ async def _run_eval_task( suite_func: Callable[..., Any], model_spec: ModelSpec, max_concurrent: int, + num_runs: int, + seed: str | int, + multi_run_pass_rule: str, include_context: bool = False, ) -> EvalTaskResult: """ @@ -175,6 +178,9 @@ async def _run_eval_task( max_concurrency=max_concurrent, provider=model_spec.provider.value, include_context=include_context, + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, ) return EvalTaskResult.from_success( suite_name, model_spec.model, model_spec.provider.value, result @@ -198,6 +204,8 @@ async def _run_capture_task( model_spec: ModelSpec, max_concurrent: int, include_context: bool, + num_runs: int, + seed: str | int, ) -> CaptureTaskResult: """ Run a single capture task with error handling. @@ -214,6 +222,8 @@ async def _run_capture_task( provider=model_spec.provider.value, capture_mode=True, include_context=include_context, + num_runs=num_runs, + seed=seed, ) return CaptureTaskResult.from_success( suite_name, model_spec.model, model_spec.provider.value, result @@ -246,6 +256,9 @@ async def run_evaluations( output_format: str, failed_only: bool, console: Console, + num_runs: int, + seed: str | int, + multi_run_pass_rule: str, include_context: bool = False, ) -> None: """ @@ -262,6 +275,9 @@ async def run_evaluations( output_format: Format for file output ('txt', 'md'). failed_only: Whether to show only failed evaluations. console: Rich console for output. + num_runs: Number of runs per case. + seed: Seed policy ("constant", "random", or an integer seed). + multi_run_pass_rule: How to determine pass/warn for multi-run cases. include_context: Whether to include system_message and additional_messages. """ tasks = [] @@ -280,6 +296,9 @@ async def run_evaluations( model_spec=model_spec, max_concurrent=max_concurrent, include_context=include_context, + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, ) ) tasks.append(task) @@ -370,6 +389,8 @@ async def run_capture( output_file: str | None, output_format: str, console: Console, + num_runs: int, + seed: str | int, ) -> None: """ Run evaluation suites in capture mode and output results. @@ -385,6 +406,8 @@ async def run_capture( output_file: Optional file path to write results. output_format: Output format ('json', 'txt', 'md', 'html'). console: Rich console for output. + num_runs: Number of runs per case. + seed: Seed policy ("constant", "random", or an integer seed). """ tasks = [] @@ -402,6 +425,8 @@ async def run_capture( model_spec=model_spec, max_concurrent=max_concurrent, include_context=include_context, + num_runs=num_runs, + seed=seed, ) ) tasks.append(task) diff --git a/libs/arcade-cli/arcade_cli/formatters/base.py b/libs/arcade-cli/arcade_cli/formatters/base.py index 3b1d6166..ebe6a2de 100644 --- a/libs/arcade-cli/arcade_cli/formatters/base.py +++ b/libs/arcade-cli/arcade_cli/formatters/base.py @@ -214,11 +214,21 @@ def group_comparative_by_case( } # Store this track's result for this case - comparative_groups[model][base_suite][case_name]["tracks"][track_name] = { + track_result: dict[str, Any] = { "evaluation": evaluation, "name": case_name, "input": case.get("input", ""), } + run_stats = case.get("run_stats") + if run_stats: + track_result["run_stats"] = run_stats + critic_stats = case.get("critic_stats") + if critic_stats: + track_result["critic_stats"] = critic_stats + + comparative_groups[model][base_suite][case_name]["tracks"][track_name] = ( + track_result + ) return ( comparative_groups, @@ -414,11 +424,19 @@ def group_comparative_by_case_first( } # Store this track's result - case_groups[base_suite][case_name][model]["tracks"][track_name] = { + track_result: dict[str, Any] = { "evaluation": evaluation, "name": case_name, "input": case.get("input", ""), } + run_stats = case.get("run_stats") + if run_stats: + track_result["run_stats"] = run_stats + critic_stats = case.get("critic_stats") + if critic_stats: + track_result["critic_stats"] = critic_stats + + case_groups[base_suite][case_name][model]["tracks"][track_name] = track_result return ( case_groups, @@ -539,11 +557,17 @@ def group_eval_for_comparison( comparison_data[suite_name][case_name] = {} # Store this model's result for this case - comparison_data[suite_name][case_name][model] = { + case_entry: dict[str, Any] = { "evaluation": evaluation, "input": case.get("input", ""), "name": case_name, } + # Propagate multi-run stats if present + if case.get("run_stats"): + case_entry["run_stats"] = case["run_stats"] + if case.get("critic_stats"): + case_entry["critic_stats"] = case["critic_stats"] + comparison_data[suite_name][case_name][model] = case_entry # Calculate pass rates for _model, stats in per_model_stats.items(): diff --git a/libs/arcade-cli/arcade_cli/formatters/html.py b/libs/arcade-cli/arcade_cli/formatters/html.py index ab900aa2..c267e70b 100644 --- a/libs/arcade-cli/arcade_cli/formatters/html.py +++ b/libs/arcade-cli/arcade_cli/formatters/html.py @@ -163,10 +163,18 @@ class HtmlFormatter(EvalResultFormatter): # Show summary table only when NOT showing details (avoid duplication) if not show_details: - html_parts.append('') - html_parts.append( - "" + has_run_stats = any( + case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases ) + html_parts.append('
StatusCaseScore
') + if has_run_stats: + html_parts.append( + "" + ) + else: + html_parts.append( + "" + ) html_parts.append("") for case in cases: @@ -183,11 +191,20 @@ class HtmlFormatter(EvalResultFormatter): score_pct = evaluation.score * 100 case_name = self._escape_html(case["name"]) + run_stats = case.get("run_stats") or {} + score_display = f"{score_pct:.1f}%" + runs_display = "" + if run_stats.get("num_runs", 1) > 1: + std_pct = run_stats.get("std_deviation", 0.0) * 100 + score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%" + runs_display = str(run_stats.get("num_runs", 1)) html_parts.append(f'') html_parts.append(f'') html_parts.append(f"") - html_parts.append(f'') + html_parts.append(f'') + if has_run_stats: + html_parts.append(f"") html_parts.append("") html_parts.append("
StatusCaseScoreRuns
StatusCaseScore
{status_text}{case_name}{score_pct:.1f}%{score_display}{runs_display or '-'}
") @@ -254,7 +271,15 @@ class HtmlFormatter(EvalResultFormatter): html_parts.append("") # Evaluation details - html_parts.append(self._format_evaluation_details(evaluation)) + run_id = self._make_safe_id(suite_name, case["name"], model) + html_parts.append( + self._format_evaluation_details( + evaluation, + case.get("run_stats"), + case.get("critic_stats"), + run_id=run_id, + ) + ) html_parts.append("") html_parts.append("") @@ -267,18 +292,50 @@ class HtmlFormatter(EvalResultFormatter): return "\n".join(html_parts) - def _format_evaluation_details(self, evaluation: Any) -> str: + def _format_evaluation_details( + self, + evaluation: Any, + run_stats: dict[str, Any] | None = None, + critic_stats: dict[str, Any] | None = None, + run_id: str | None = None, + ) -> str: """Format evaluation details as HTML table.""" - if evaluation.failure_reason: - return f'
Failure Reason: {self._escape_html(evaluation.failure_reason)}
' + parts: list[str] = [] + run_stats_html = self._format_run_stats_html(run_stats, evaluation) + if run_stats_html: + parts.append(run_stats_html) + + run_tabs_html = self._format_run_tabs_html(run_stats, run_id) + if run_tabs_html: + parts.append(run_tabs_html) + + critic_stats_html = self._format_critic_stats_html(critic_stats) + if critic_stats_html: + parts.append(critic_stats_html) + + if evaluation.failure_reason: + parts.append( + f'
Failure Reason: ' + f"{self._escape_html(evaluation.failure_reason)}
" + ) + return "\n".join(parts) + + # Only show field details table when there are NO run tabs + # (run tabs already show per-run field details, and Critic Stats shows the aggregation) + if not run_tabs_html: + details_table = self._format_critic_results_table(evaluation.results) + parts.append(details_table) + return "\n".join(parts) + + def _format_critic_results_table(self, results: list[dict[str, Any]]) -> str: lines = [''] lines.append( "" ) lines.append("") - for critic_result in evaluation.results: + for critic_result in results: is_criticized = critic_result.get("is_criticized", True) field = self._escape_html(critic_result["field"]) score = critic_result["score"] @@ -314,6 +371,177 @@ class HtmlFormatter(EvalResultFormatter): lines.append("
FieldMatchScoreExpectedActual
") return "\n".join(lines) + def _format_run_stats_html(self, run_stats: dict[str, Any] | None, evaluation: Any) -> str: + if not run_stats or run_stats.get("num_runs", 1) < 2: + return "" + if evaluation.passed: + status_label = "PASSED" + status_icon = "✅" + status_class = "passed" + elif evaluation.warning: + status_label = "WARNED" + status_icon = "⚠️" + status_class = "warned" + else: + status_label = "FAILED" + status_icon = "❌" + status_class = "failed" + mean_pct = run_stats.get("mean_score", 0.0) * 100 + std_pct = run_stats.get("std_deviation", 0.0) * 100 + num_runs = run_stats.get("num_runs", 0) + scores = run_stats.get("scores", []) + seed_policy = run_stats.get("seed_policy", "") + run_seeds = run_stats.get("run_seeds") or [] + pass_rule = run_stats.get("pass_rule", "") + + # Build score pills for each run + score_pills = [] + for i, score in enumerate(scores, 1): + score_pct = score * 100 + if score >= 0.8: + pill_class = "score-pill high" + elif score >= 0.6: + pill_class = "score-pill mid" + else: + pill_class = "score-pill low" + score_pills.append(f'R{i}: {score_pct:.0f}%') + scores_html = " ".join(score_pills) if score_pills else "" + + # Build seeds display + seeds_html = "" + if run_seeds and any(seed is not None for seed in run_seeds): + seeds_display = ", ".join(str(seed) for seed in run_seeds) + seeds_html = f'
🎲 Seeds{seeds_display}
' + + html = f"""
+
+
+ {status_icon} + {status_label} +
+
{num_runs} runs
+
+
+
+
+ {mean_pct:.1f}% + mean score +
+
+ ± {std_pct:.1f}% + std dev +
+
+
+
+
+
{scores_html}
+
+ +
""" + return html + + def _format_critic_stats_html(self, critic_stats: dict[str, Any] | None) -> str: + if not critic_stats: + return "" + lines = ['

📊 Critic Stats

'] + lines.append('') + lines.append( + "" + "" + "" + ) + lines.append("") + for field, stats in critic_stats.items(): + weight = stats.get("weight", 0.0) + mean_norm = stats.get("mean_score_normalized", 0.0) * 100 + std_norm = stats.get("std_deviation_normalized", 0.0) * 100 + mean_weighted = stats.get("mean_score", 0.0) * 100 + std_weighted = stats.get("std_deviation", 0.0) * 100 + # Color coding based on normalized mean: <60 red, 60-80 yellow, >80 green + if mean_norm < 60: + score_class = "score-low" + elif mean_norm < 80: + score_class = "score-mid" + else: + score_class = "score-high" + lines.append( + f'' + f"" + f"" + f'' + f"" + f"" + f"" + "" + ) + lines.append("
FieldWeightMean (norm %)Std (norm %)Mean (weighted %)Std (weighted %)
{self._escape_html(field)}{weight:.2f}{mean_norm:.2f}%{std_norm:.2f}%{mean_weighted:.2f}%{std_weighted:.2f}%
") + return "\n".join(lines) + + def _format_run_tabs_html(self, run_stats: dict[str, Any] | None, run_id: str | None) -> str: + if not run_stats or run_stats.get("num_runs", 1) < 2: + return "" + runs = run_stats.get("runs", []) + if not runs or run_id is None: + return "" + + tabs = ['
', '
'] + for idx, run in enumerate(runs, start=1): + active = "active" if idx == 1 else "" + if run.get("passed"): + status_class = "passed" + elif run.get("warning"): + status_class = "warned" + else: + status_class = "failed" + tabs.append( + f'' + ) + tabs.append("
") + + panels = ['
'] + for idx, run in enumerate(runs, start=1): + active = "active" if idx == 1 else "" + if run.get("passed"): + status = "✅ PASSED" + status_class = "passed" + elif run.get("warning"): + status = "⚠️ WARNED" + status_class = "warned" + else: + status = "❌ FAILED" + status_class = "failed" + score_pct = run.get("score", 0.0) * 100 + details = run.get("details", []) + panels.append( + f'
' + ) + panels.append(f"

Run {idx}: {status} — {score_pct:.2f}%

") + failure_reason = run.get("failure_reason") + if failure_reason: + panels.append( + f'
Failure Reason: ' + f"{self._escape_html(str(failure_reason))}
" + ) + if details: + panels.append(self._format_critic_results_table(details)) + panels.append("
") + panels.append("
") + + return "\n".join(tabs + panels) + def _escape_html(self, text: str) -> str: """Escape HTML special characters.""" return ( @@ -534,6 +762,7 @@ class HtmlFormatter(EvalResultFormatter): for model in model_order: if model in case_models: evaluation = case_models[model]["evaluation"] + run_stats = case_models[model].get("run_stats") score = evaluation.score * 100 if evaluation.passed: cell_class = "passed" @@ -544,7 +773,15 @@ class HtmlFormatter(EvalResultFormatter): else: cell_class = "failed" icon = "✗" - html_parts.append(f'{icon} {score:.0f}%') + if run_stats and run_stats.get("num_runs", 1) > 1: + std_pct = run_stats.get("std_deviation", 0.0) * 100 + runs = run_stats.get("num_runs", 1) + html_parts.append( + f'{icon} ' + f"{score:.0f}% ± {std_pct:.0f}%
n={runs}" + ) + else: + html_parts.append(f'{icon} {score:.0f}%') else: html_parts.append('-') @@ -582,7 +819,15 @@ class HtmlFormatter(EvalResultFormatter): html_parts.append( f"{self._escape_html(model)}: Score {evaluation.score * 100:.1f}%" ) - html_parts.append(self._format_evaluation_details(evaluation)) + run_id = self._make_safe_id(suite_name, case_name, model) + html_parts.append( + self._format_evaluation_details( + evaluation, + case_result.get("run_stats"), + case_result.get("critic_stats"), + run_id=run_id, + ) + ) html_parts.append("") html_parts.append("") @@ -609,9 +854,9 @@ class HtmlFormatter(EvalResultFormatter): .multi-model-summary .pass-rate { font-weight: bold; } .multi-model-summary .best-model { background-color: rgba(76, 175, 80, 0.1); } .best-overall { margin-top: 15px; padding: 10px; background: #1e1e1e; border-radius: 4px; } - .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; } - .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; } - .comparison-table th { background-color: #252525; } + .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; } + .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; } + .comparison-table th { background: linear-gradient(90deg, rgba(137, 180, 250, 0.25), rgba(203, 166, 247, 0.25)); } .comparison-table .case-name { text-align: left; font-weight: bold; } .comparison-table .passed { background-color: rgba(76, 175, 80, 0.2); color: #4caf50; } .comparison-table .failed { background-color: rgba(244, 67, 54, 0.2); color: #f44336; } @@ -1201,7 +1446,15 @@ class HtmlFormatter(EvalResultFormatter): f'{self._escape_html(track_name)}' ) lines.append("") - lines.append(self._format_evaluation_details(evaluation)) + run_id = self._make_safe_id(suite_name, case_name, f"{track_name}") + lines.append( + self._format_evaluation_details( + evaluation, + track_result.get("run_stats"), + track_result.get("critic_stats"), + run_id=run_id, + ) + ) lines.append("") # track-panel lines.append("") # track-panels-container @@ -1256,6 +1509,9 @@ document.querySelectorAll('.track-tab').forEach(tab => { --blue: #89b4fa; --purple: #cba6f7; --cyan: #94e2d5; + --accent: #89b4fa; + --accent-2: #cba6f7; + --shadow: rgba(0, 0, 0, 0.35); } * { @@ -1282,6 +1538,23 @@ document.querySelectorAll('.track-tab').forEach(tab => { padding-bottom: 10px; } + /* Critic stats score coloring: red <60%, yellow 60-80%, green >80% */ + .critic-stats-table tr.score-low td.score-value { + color: var(--red); + font-weight: bold; + } + .critic-stats-table tr.score-mid td.score-value { + color: var(--yellow); + font-weight: bold; + } + .critic-stats-table tr.score-high td.score-value { + color: var(--green); + font-weight: bold; + } + .critic-stats-table tr.score-low { background: rgba(243, 139, 168, 0.08); } + .critic-stats-table tr.score-mid { background: rgba(249, 226, 175, 0.08); } + .critic-stats-table tr.score-high { background: rgba(166, 227, 161, 0.08); } + h2 { color: var(--blue); margin-top: 30px; @@ -1480,6 +1753,234 @@ document.querySelectorAll('.track-tab').forEach(tab => { font-size: 0.9em; } + .critic-stats { + margin: 10px 0; + padding: 10px; + background: #202020; + border-radius: 6px; + } + + /* Run Stats Card - Modern Design */ + .run-stats-card { + margin: 15px 0; + border-radius: 12px; + background: linear-gradient(145deg, #252535, #1a1a2a); + border: 1px solid var(--border-color); + overflow: hidden; + } + .run-stats-card.passed { border-left: 4px solid var(--green); } + .run-stats-card.warned { border-left: 4px solid var(--yellow); } + .run-stats-card.failed { border-left: 4px solid var(--red); } + + .run-stats-header { + display: flex; + justify-content: space-between; + align-items: center; + padding: 12px 16px; + background: rgba(0, 0, 0, 0.2); + border-bottom: 1px solid var(--border-color); + } + + .run-status-badge { + display: flex; + align-items: center; + gap: 8px; + padding: 6px 14px; + border-radius: 20px; + font-weight: bold; + font-size: 0.9em; + } + .run-status-badge.passed { background: rgba(166, 227, 161, 0.2); color: var(--green); } + .run-status-badge.warned { background: rgba(249, 226, 175, 0.2); color: var(--yellow); } + .run-status-badge.failed { background: rgba(243, 139, 168, 0.2); color: var(--red); } + + .run-count { + color: var(--text-color); + font-size: 0.9em; + opacity: 0.8; + } + + .run-stats-body { + padding: 16px; + } + + .score-display { + display: flex; + align-items: flex-end; + gap: 20px; + margin-bottom: 12px; + } + + .score-main { + display: flex; + flex-direction: column; + } + .score-main .score-value { + font-size: 2.2em; + font-weight: bold; + color: var(--blue); + line-height: 1; + } + .score-main .score-label { + font-size: 0.75em; + color: #888; + text-transform: uppercase; + letter-spacing: 0.05em; + } + + .score-deviation { + display: flex; + flex-direction: column; + } + .score-deviation .deviation-value { + font-size: 1.3em; + font-weight: 600; + color: var(--purple); + } + .score-deviation .deviation-label { + font-size: 0.7em; + color: #888; + text-transform: uppercase; + } + + .score-bar-container { + height: 8px; + background: rgba(255, 255, 255, 0.1); + border-radius: 4px; + overflow: hidden; + margin-bottom: 14px; + } + .score-bar { + height: 100%; + border-radius: 4px; + transition: width 0.3s ease; + } + .score-bar.passed { background: linear-gradient(90deg, var(--green), #7ecf7e); } + .score-bar.warned { background: linear-gradient(90deg, var(--yellow), #f5d67a); } + .score-bar.failed { background: linear-gradient(90deg, var(--red), #e87a94); } + + .run-scores { + display: flex; + flex-wrap: wrap; + gap: 8px; + } + + .score-pill { + padding: 4px 10px; + border-radius: 12px; + font-size: 0.8em; + font-weight: 600; + } + .score-pill.high { background: rgba(166, 227, 161, 0.2); color: var(--green); } + .score-pill.mid { background: rgba(249, 226, 175, 0.2); color: var(--yellow); } + .score-pill.low { background: rgba(243, 139, 168, 0.2); color: var(--red); } + + .run-stats-footer { + display: flex; + flex-wrap: wrap; + gap: 16px; + padding: 12px 16px; + background: rgba(0, 0, 0, 0.15); + border-top: 1px solid var(--border-color); + } + + .run-meta-item { + display: flex; + flex-direction: column; + gap: 2px; + } + .meta-label { + font-size: 0.7em; + color: #888; + text-transform: uppercase; + } + .meta-value { + font-size: 0.85em; + color: var(--text-color); + } + .meta-value.mono { + font-family: 'Consolas', 'Monaco', monospace; + font-size: 0.75em; + color: var(--cyan); + } + + .run-tabs { + margin: 12px 0; + border: 1px solid var(--border-color); + border-radius: 6px; + background: #1f1f2b; + } + + .run-tab-list { + display: flex; + gap: 6px; + padding: 8px; + border-bottom: 1px solid var(--border-color); + flex-wrap: wrap; + } + + .run-tab { + background: #2a2a3a; + color: var(--text-color); + border: 1px solid var(--border-color); + border-radius: 4px; + padding: 6px 10px; + cursor: pointer; + } + + .run-tab.active { + background: var(--blue); + color: #111; + border-color: var(--blue); + } + + .run-tab.passed { + border-color: var(--green); + } + + .run-tab.warned { + border-color: var(--yellow); + } + + .run-tab.failed { + border-color: var(--red); + } + + .run-panels { + padding: 10px; + } + + .run-panel { + display: none; + } + + .run-panel.active { + display: block; + } + + .run-panel.passed { + border-left: 3px solid var(--green); + padding-left: 10px; + } + + .run-panel.warned { + border-left: 3px solid var(--yellow); + padding-left: 10px; + } + + .run-panel.failed { + border-left: 3px solid var(--red); + padding-left: 10px; + } + + .run-status.passed { color: var(--green); } + .run-status.warned { color: var(--yellow); } + .run-status.failed { color: var(--red); } + + .aggregate-details { + margin-top: 10px; + } + .field-name { color: var(--purple); font-weight: 600; @@ -1954,6 +2455,39 @@ document.querySelectorAll('.track-tab').forEach(tab => { margin: 8px 0; } + """ @@ -1994,19 +2528,48 @@ class CaptureHtmlFormatter(CaptureFormatter): for case in capture.captured_cases: total_cases += 1 tool_calls_html = [] + runs = getattr(case, "runs", None) - for tc in case.tool_calls: - total_calls += 1 - args_html = "" - if tc.args: - args_json = json.dumps(tc.args, indent=2) - args_html = f'
{self._escape_html(args_json)}
' - tool_calls_html.append( - f'
' - f'{self._escape_html(tc.name)}' - f"{args_html}" - f"
" - ) + if runs: + for run_index, run in enumerate(runs, start=1): + run_calls_html = [] + for tc in run.tool_calls: + total_calls += 1 + args_html = "" + if tc.args: + args_json = json.dumps(tc.args, indent=2) + args_html = ( + f'
{self._escape_html(args_json)}
' + ) + run_calls_html.append( + f'
' + f'{self._escape_html(tc.name)}' + f"{args_html}" + f"
" + ) + if not run_calls_html: + run_calls_html.append( + '
No tool calls captured
' + ) + tool_calls_html.append( + f'
' + f'Run {run_index}' + f'{"".join(run_calls_html)}' + f"
" + ) + else: + for tc in case.tool_calls: + total_calls += 1 + args_html = "" + if tc.args: + args_json = json.dumps(tc.args, indent=2) + args_html = f'
{self._escape_html(args_json)}
' + tool_calls_html.append( + f'
' + f'{self._escape_html(tc.name)}' + f"{args_html}" + f"
" + ) if not tool_calls_html: tool_calls_html.append('
No tool calls captured
') @@ -2498,7 +3061,31 @@ class CaptureHtmlFormatter(CaptureFormatter): f'
{self._escape_html(model)}
' ) - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + html_parts.append( + f'
' + f"Run {run_index}" + ) + if run.tool_calls: + for tc in run.tool_calls: + total_calls += 1 + args_html = "" + if tc.args: + args_json = json.dumps(tc.args, indent=2) + args_html = f'
{self._escape_html(args_json)}
' + html_parts.append( + f'
' + f'{self._escape_html(tc.name)}' + f"{args_html}
" + ) + else: + html_parts.append( + '
No tool calls
' + ) + html_parts.append("
") + elif captured_case.tool_calls: for tc in captured_case.tool_calls: total_calls += 1 args_html = "" @@ -2539,7 +3126,29 @@ class CaptureHtmlFormatter(CaptureFormatter): f'
{self._escape_html(model)}
' ) - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + html_parts.append( + f'
' + f"Run {run_index}" + ) + if run.tool_calls: + for tc in run.tool_calls: + total_calls += 1 + args_html = "" + if tc.args: + args_json = json.dumps(tc.args, indent=2) + args_html = f'
{self._escape_html(args_json)}
' + html_parts.append( + f'
' + f'{self._escape_html(tc.name)}' + f"{args_html}
" + ) + else: + html_parts.append('
No tool calls
') + html_parts.append("
") + elif captured_case.tool_calls: for tc in captured_case.tool_calls: total_calls += 1 args_html = "" @@ -2704,6 +3313,19 @@ document.querySelectorAll('.track-tab').forEach(tab => {{ text-transform: uppercase; margin-bottom: 0.5rem; }} + .capture-run {{ + margin-bottom: 0.75rem; + background: var(--bg-primary); + border: 1px solid var(--border); + border-radius: 6px; + padding: 0.5rem 0.75rem; + }} + .capture-run summary {{ + cursor: pointer; + font-weight: 600; + color: var(--accent); + margin-bottom: 0.5rem; + }} .tool-call {{ background: var(--bg-primary); border-left: 3px solid var(--accent); diff --git a/libs/arcade-cli/arcade_cli/formatters/json.py b/libs/arcade-cli/arcade_cli/formatters/json.py index 361974b2..e5ebcb0b 100644 --- a/libs/arcade-cli/arcade_cli/formatters/json.py +++ b/libs/arcade-cli/arcade_cli/formatters/json.py @@ -249,6 +249,13 @@ class JsonFormatter(EvalResultFormatter): if evaluation.failure_reason: track_data["failure_reason"] = evaluation.failure_reason + run_stats = track_result.get("run_stats") + if run_stats: + track_data["run_stats"] = run_stats + critic_stats = track_result.get("critic_stats") + if critic_stats: + track_data["critic_stats"] = critic_stats + if show_details and evaluation.results: track_data["details"] = self._serialize_critic_results( evaluation.results @@ -375,6 +382,13 @@ class JsonFormatter(EvalResultFormatter): if evaluation.failure_reason: track_data["failure_reason"] = evaluation.failure_reason + run_stats = track_result.get("run_stats") + if run_stats: + track_data["run_stats"] = run_stats + critic_stats = track_result.get("critic_stats") + if critic_stats: + track_data["critic_stats"] = critic_stats + if show_details and evaluation.results: track_data["details"] = self._serialize_critic_results( evaluation.results @@ -496,6 +510,13 @@ class JsonFormatter(EvalResultFormatter): if evaluation.failure_reason: model_data["failure_reason"] = evaluation.failure_reason + run_stats = case_result.get("run_stats") + if run_stats: + model_data["run_stats"] = run_stats + critic_stats = case_result.get("critic_stats") + if critic_stats: + model_data["critic_stats"] = critic_stats + if show_details and evaluation.results: model_data["details"] = self._serialize_critic_results(evaluation.results) @@ -537,6 +558,13 @@ class JsonFormatter(EvalResultFormatter): if evaluation.failure_reason: case_data["failure_reason"] = evaluation.failure_reason + run_stats = case.get("run_stats") + if run_stats: + case_data["run_stats"] = run_stats + critic_stats = case.get("critic_stats") + if critic_stats: + case_data["critic_stats"] = critic_stats + if show_details and evaluation.results: case_data["details"] = self._serialize_critic_results(evaluation.results) @@ -657,12 +685,24 @@ class CaptureJsonFormatter(CaptureFormatter): continue captured_case = models_dict[model] - track_output["models"][model] = { + model_output: dict[str, Any] = { "tool_calls": [ {"name": tc.name, "args": tc.args} for tc in captured_case.tool_calls ], } + runs = getattr(captured_case, "runs", None) + if runs: + model_output["runs"] = [ + { + "tool_calls": [ + {"name": tc.name, "args": tc.args} + for tc in run.tool_calls + ] + } + for run in runs + ] + track_output["models"][model] = model_output case_output["tracks"][track_display] = track_output else: @@ -678,12 +718,23 @@ class CaptureJsonFormatter(CaptureFormatter): continue captured_case = models_dict[model] - case_output["models"][model] = { + model_output = { "tool_calls": [ {"name": tc.name, "args": tc.args} for tc in captured_case.tool_calls ], } + runs = getattr(captured_case, "runs", None) + if runs: + model_output["runs"] = [ + { + "tool_calls": [ + {"name": tc.name, "args": tc.args} for tc in run.tool_calls + ] + } + for run in runs + ] + case_output["models"][model] = model_output output["grouped_by_case"][suite_name][case_name] = case_output diff --git a/libs/arcade-cli/arcade_cli/formatters/markdown.py b/libs/arcade-cli/arcade_cli/formatters/markdown.py index ea5b3486..f6fef9c8 100644 --- a/libs/arcade-cli/arcade_cli/formatters/markdown.py +++ b/libs/arcade-cli/arcade_cli/formatters/markdown.py @@ -89,37 +89,16 @@ class MarkdownFormatter(EvalResultFormatter): lines.append("## Summary") lines.append("") - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)") - lines.append("") - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {orig_total} |") - lines.append(f"| ✅ Passed | {orig_passed} |") - if orig_warned > 0: - lines.append(f"| ⚠️ Warnings | {orig_warned} |") - lines.append(f"| ❌ Failed | {orig_failed} |") - else: - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {total_cases} |") - lines.append(f"| ✅ Passed | {total_passed} |") - if total_warned > 0: - lines.append(f"| ⚠️ Warnings | {total_warned} |") - if total_failed > 0: - lines.append(f"| ❌ Failed | {total_failed} |") - - # Pass rate - if total_cases > 0: - if failed_only and original_counts and original_counts[0] > 0: - pass_rate = (original_counts[1] / original_counts[0]) * 100 - else: - pass_rate = (total_passed / total_cases) * 100 - lines.append("") - lines.append(f"**Pass Rate:** {pass_rate:.1f}%") - - lines.append("") + lines.extend( + self._format_summary_table_md( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) # Results by model lines.append("## Results by Model") @@ -134,8 +113,15 @@ class MarkdownFormatter(EvalResultFormatter): lines.append("") # Results table - lines.append("| Status | Case | Score |") - lines.append("|--------|------|-------|") + has_run_stats = any( + case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases + ) + if has_run_stats: + lines.append("| Status | Case | Score | Runs |") + lines.append("|--------|------|-------|------|") + else: + lines.append("| Status | Case | Score |") + lines.append("|--------|------|-------|") for case in cases: evaluation = case["evaluation"] @@ -148,7 +134,15 @@ class MarkdownFormatter(EvalResultFormatter): score_pct = evaluation.score * 100 case_name = case["name"].replace("|", "\\|") - lines.append(f"| {status} | {case_name} | {score_pct:.1f}% |") + run_stats = case.get("run_stats") or {} + score_display = f"{score_pct:.1f}%" + if run_stats.get("num_runs", 1) > 1: + std_pct = run_stats.get("std_deviation", 0.0) * 100 + score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%" + runs_value = run_stats.get("num_runs", 1) + lines.append(f"| {status} | {case_name} | {score_display} | {runs_value} |") + else: + lines.append(f"| {status} | {case_name} | {score_display} |") lines.append("") @@ -175,6 +169,16 @@ class MarkdownFormatter(EvalResultFormatter): lines.append(f"**Input:** `{case['input']}`") lines.append("") + run_stats = case.get("run_stats") + lines.extend(self._format_run_stats_summary(run_stats)) + + run_detail_lines = self._format_run_details_md(run_stats) + lines.extend(run_detail_lines) + + critic_stats = case.get("critic_stats") + if critic_stats: + lines.extend(self._format_critic_stats_summary(critic_stats)) + # Context section (if include_context is True) if include_context: system_msg = case.get("system_message") @@ -194,8 +198,10 @@ class MarkdownFormatter(EvalResultFormatter): lines.append("") lines.append("") - # Evaluation details - lines.append(self._format_evaluation_details(evaluation)) + # Only show the critic results table when there are no per-run + # details (run details already include per-run field tables) + if not run_detail_lines: + lines.append(self._format_evaluation_details(evaluation)) lines.append("") lines.append("---") lines.append("") @@ -212,31 +218,120 @@ class MarkdownFormatter(EvalResultFormatter): if evaluation.failure_reason: lines.append(f"**Failure Reason:** {evaluation.failure_reason}") else: - lines.append("| Field | Match | Score | Expected | Actual |") - lines.append("|-------|-------|-------|----------|--------|") - - for critic_result in evaluation.results: - is_criticized = critic_result.get("is_criticized", True) - field = critic_result["field"] - score = critic_result["score"] - weight = critic_result["weight"] - expected = str(critic_result["expected"]).replace("|", "\\|") - actual = str(critic_result["actual"]).replace("|", "\\|") - - # Truncate long values for table readability - expected = truncate_field_value(expected, MD_MAX_FIELD_LENGTH) - actual = truncate_field_value(actual, MD_MAX_FIELD_LENGTH) - - if is_criticized: - match_icon = "✅" if critic_result["match"] else "❌" - lines.append( - f"| {field} | {match_icon} | {score:.2f}/{weight:.2f} | `{expected}` | `{actual}` |" - ) - else: - lines.append(f"| {field} | — | - | `{expected}` | `{actual}` |") + lines.extend(self._format_critic_results_table_md(evaluation.results)) return "\n".join(lines) + def _format_critic_results_table_md(self, results: list[dict[str, Any]]) -> list[str]: + lines: list[str] = [] + lines.append("| Field | Match | Score | Expected | Actual |") + lines.append("|-------|-------|-------|----------|--------|") + + for critic_result in results: + is_criticized = critic_result.get("is_criticized", True) + field = critic_result["field"] + score = critic_result["score"] + weight = critic_result["weight"] + expected = str(critic_result["expected"]).replace("|", "\\|") + actual = str(critic_result["actual"]).replace("|", "\\|") + + # Truncate long values for table readability + expected = truncate_field_value(expected, MD_MAX_FIELD_LENGTH) + actual = truncate_field_value(actual, MD_MAX_FIELD_LENGTH) + + if is_criticized: + match_icon = "✅" if critic_result["match"] else "❌" + lines.append( + f"| {field} | {match_icon} | {score:.2f}/{weight:.2f} | `{expected}` | `{actual}` |" + ) + else: + lines.append(f"| {field} | — | - | `{expected}` | `{actual}` |") + + return lines + + def _format_critic_stats_summary(self, critic_stats: dict[str, Any]) -> list[str]: + lines: list[str] = [] + lines.append("**Critic Stats (normalized & weighted):** ") + lines.append( + "| Field | Weight | Mean (norm %) | Std (norm %) | Mean (weighted %) | Std (weighted %) |" + ) + lines.append( + "|-------|--------|---------------|--------------|-------------------|------------------|" + ) + for field, stats in critic_stats.items(): + weight = stats.get("weight", 0.0) + mean_norm = stats.get("mean_score_normalized", 0.0) * 100 + std_norm = stats.get("std_deviation_normalized", 0.0) * 100 + mean_weighted = stats.get("mean_score", 0.0) * 100 + std_weighted = stats.get("std_deviation", 0.0) * 100 + lines.append( + f"| {field} | {weight:.2f} | {mean_norm:.2f}% | {std_norm:.2f}% | " + f"{mean_weighted:.2f}% | {std_weighted:.2f}% |" + ) + lines.append("") + return lines + + def _format_run_stats_summary(self, run_stats: dict[str, Any] | None) -> list[str]: + """Format the run statistics summary as a Markdown bullet list.""" + if not run_stats or run_stats.get("num_runs", 1) < 2: + return [] + lines: list[str] = [] + mean_pct = run_stats.get("mean_score", 0.0) * 100 + std_pct = run_stats.get("std_deviation", 0.0) * 100 + scores = run_stats.get("scores", []) + scores_display = ", ".join(f"{score * 100:.2f}%" for score in scores) + lines.append("**Run Stats:** ") + lines.append(f"- Runs: {run_stats.get('num_runs', len(scores))} ") + lines.append(f"- Mean Score: {mean_pct:.2f}% ") + lines.append(f"- Std Deviation: {std_pct:.2f}% ") + if scores_display: + lines.append(f"- Scores: {scores_display} ") + seed_policy = run_stats.get("seed_policy") + if seed_policy: + lines.append(f"- Seed Policy: {seed_policy} ") + run_seeds = run_stats.get("run_seeds") + if run_seeds and any(seed is not None for seed in run_seeds): + seeds_display = ", ".join(str(seed) for seed in run_seeds) + lines.append(f"- Run Seeds: {seeds_display} ") + pass_rule = run_stats.get("pass_rule") + if pass_rule: + lines.append(f"- Pass Rule: {pass_rule} ") + lines.append("") + return lines + + def _format_run_details_md(self, run_stats: dict[str, Any] | None) -> list[str]: + if not run_stats or run_stats.get("num_runs", 1) < 2: + return [] + runs = run_stats.get("runs", []) + if not runs: + return [] + lines: list[str] = [] + lines.append("**Run Details:** ") + for idx, run in enumerate(runs, start=1): + if run.get("passed"): + status = "✅ PASSED" + elif run.get("warning"): + status = "⚠️ WARNED" + else: + status = "❌ FAILED" + score_pct = run.get("score", 0.0) * 100 + line = f"- Run {idx}: {status} — {score_pct:.2f}%" + failure_reason = run.get("failure_reason") + if failure_reason: + line += f" ({failure_reason})" + lines.append(line) + details = run.get("details", []) + if details: + lines.append("") + lines.append("
") + lines.append(f"Run {idx} details") + lines.append("") + lines.extend(self._format_critic_results_table_md(details)) + lines.append("") + lines.append("
") + lines.append("") + return lines + # ========================================================================= # MULTI-MODEL EVALUATION FORMATTING # ========================================================================= @@ -371,7 +466,19 @@ class MarkdownFormatter(EvalResultFormatter): lines.append(f"**{model}:** Score {evaluation.score * 100:.1f}%") lines.append("") - lines.append(self._format_evaluation_details(evaluation)) + run_stats = case_result.get("run_stats") + lines.extend(self._format_run_stats_summary(run_stats)) + + run_detail_lines = self._format_run_details_md(run_stats) + lines.extend(run_detail_lines) + + critic_stats = case_result.get("critic_stats") + if critic_stats: + lines.extend(self._format_critic_stats_summary(critic_stats)) + # Only show the critic results table when there are no per-run + # details (run details already include per-run field tables) + if not run_detail_lines: + lines.append(self._format_evaluation_details(evaluation)) lines.append("") lines.append("---") @@ -471,37 +578,16 @@ class MarkdownFormatter(EvalResultFormatter): lines.append(f"**Tracks compared:** {', '.join(f'`{t}`' for t in all_tracks)}") lines.append("") - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)") - lines.append("") - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {orig_total} |") - lines.append(f"| ✅ Passed | {orig_passed} |") - if orig_warned > 0: - lines.append(f"| ⚠️ Warnings | {orig_warned} |") - lines.append(f"| ❌ Failed | {orig_failed} |") - else: - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {total_cases} |") - lines.append(f"| ✅ Passed | {total_passed} |") - if total_warned > 0: - lines.append(f"| ⚠️ Warnings | {total_warned} |") - if total_failed > 0: - lines.append(f"| ❌ Failed | {total_failed} |") - - # Pass rate - if total_cases > 0: - if failed_only and original_counts and original_counts[0] > 0: - pass_rate = (original_counts[1] / original_counts[0]) * 100 - else: - pass_rate = (total_passed / total_cases) * 100 - lines.append("") - lines.append(f"**Pass Rate:** {pass_rate:.1f}%") - - lines.append("") + lines.extend( + self._format_summary_table_md( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) # Results by model lines.append("## Results by Model") @@ -522,77 +608,13 @@ class MarkdownFormatter(EvalResultFormatter): # List all cases with summary comparison for case_name, case_data in cases.items(): - # Context section (if include_context is True) if include_context: - system_msg = case_data.get("system_message") - addl_msgs = case_data.get("additional_messages") - if system_msg or addl_msgs: - lines.append("
") - lines.append("📋 Context") - lines.append("") - if system_msg: - lines.append(f"**System Message:** {system_msg}") - lines.append("") - if addl_msgs: - lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**") - lines.append("") - for msg in addl_msgs: - role = msg.get("role", "unknown") - content = msg.get("content", "") - name = msg.get("name", "") - role_icons = { - "user": "👤", - "assistant": "🤖", - "tool": "🔧", - "system": "⚙️", - } - icon = role_icons.get(role, "💬") - label = ( - f"{icon} **{role.title()}**" - if not name - else f"{icon} **{role.title()}** (`{name}`)" - ) - lines.append(f"> {label}") - if content: - if role == "tool": - try: - import json - - parsed = json.loads(content) - formatted = json.dumps(parsed, indent=2) - lines.append("> ```json") - for json_line in formatted.split("\n"): - lines.append(f"> {json_line}") - lines.append("> ```") - except (json.JSONDecodeError, TypeError): - lines.append(f"> {content}") - else: - lines.append(f"> {content}") - tool_calls = msg.get("tool_calls", []) - if tool_calls: - for tc in tool_calls: - func = tc.get("function", {}) - tc_name = func.get("name", "unknown") - tc_args = func.get("arguments", "{}") - lines.append(f"> 🔧 **{tc_name}**") - try: - import json - - args_dict = ( - json.loads(tc_args) - if isinstance(tc_args, str) - else tc_args - ) - formatted = json.dumps(args_dict, indent=2) - lines.append("> ```json") - for arg_line in formatted.split("\n"): - lines.append(f"> {arg_line}") - lines.append("> ```") - except (json.JSONDecodeError, TypeError): - lines.append(f"> `{tc_args}`") - lines.append(">") - lines.append("
") - lines.append("") + lines.extend( + self._format_context_section_md( + case_data.get("system_message"), + case_data.get("additional_messages"), + ) + ) lines.extend( self._format_comparative_case( @@ -647,37 +669,16 @@ class MarkdownFormatter(EvalResultFormatter): lines.append("## Summary") lines.append("") - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)") - lines.append("") - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {orig_total} |") - lines.append(f"| ✅ Passed | {orig_passed} |") - if orig_warned > 0: - lines.append(f"| ⚠️ Warnings | {orig_warned} |") - lines.append(f"| ❌ Failed | {orig_failed} |") - else: - lines.append("| Metric | Count |") - lines.append("|--------|-------|") - lines.append(f"| **Total** | {total_cases} |") - lines.append(f"| ✅ Passed | {total_passed} |") - if total_warned > 0: - lines.append(f"| ⚠️ Warnings | {total_warned} |") - if total_failed > 0: - lines.append(f"| ❌ Failed | {total_failed} |") - - # Pass rate - if total_cases > 0: - if failed_only and original_counts and original_counts[0] > 0: - pass_rate = (original_counts[1] / original_counts[0]) * 100 - else: - pass_rate = (total_passed / total_cases) * 100 - lines.append("") - lines.append(f"**Pass Rate:** {pass_rate:.1f}%") - - lines.append("") + lines.extend( + self._format_summary_table_md( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) # Results grouped by case lines.append("## Results by Case") @@ -705,77 +706,12 @@ class MarkdownFormatter(EvalResultFormatter): # Context section (if include_context is True) if include_context: - system_msg = first_model_data.get("system_message") - addl_msgs = first_model_data.get("additional_messages") - if system_msg or addl_msgs: - lines.append("
") - lines.append("📋 Context") - lines.append("") - if system_msg: - lines.append(f"**System Message:** {system_msg}") - lines.append("") - if addl_msgs: - lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**") - lines.append("") - for msg in addl_msgs: - role = msg.get("role", "unknown") - content = msg.get("content", "") - name = msg.get("name", "") - role_icons = { - "user": "👤", - "assistant": "🤖", - "tool": "🔧", - "system": "⚙️", - } - icon = role_icons.get(role, "💬") - label = ( - f"{icon} **{role.title()}**" - if not name - else f"{icon} **{role.title()}** (`{name}`)" - ) - lines.append(f"> {label}") - if content: - # For tool responses, format as JSON code block - if role == "tool": - try: - import json - - parsed = json.loads(content) - formatted = json.dumps(parsed, indent=2) - lines.append("> ```json") - for json_line in formatted.split("\n"): - lines.append(f"> {json_line}") - lines.append("> ```") - except (json.JSONDecodeError, TypeError): - lines.append(f"> {content}") - else: - lines.append(f"> {content}") - # Handle tool calls - tool_calls = msg.get("tool_calls", []) - if tool_calls: - for tc in tool_calls: - func = tc.get("function", {}) - tc_name = func.get("name", "unknown") - tc_args = func.get("arguments", "{}") - lines.append(f"> 🔧 **{tc_name}**") - try: - import json - - args_dict = ( - json.loads(tc_args) - if isinstance(tc_args, str) - else tc_args - ) - formatted = json.dumps(args_dict, indent=2) - lines.append("> ```json") - for arg_line in formatted.split("\n"): - lines.append(f"> {arg_line}") - lines.append("> ```") - except (json.JSONDecodeError, TypeError): - lines.append(f"> `{tc_args}`") - lines.append(">") - lines.append("
") - lines.append("") + lines.extend( + self._format_context_section_md( + first_model_data.get("system_message"), + first_model_data.get("additional_messages"), + ) + ) # Show each model's results for this case for model in model_order: @@ -876,7 +812,20 @@ class MarkdownFormatter(EvalResultFormatter): lines.append("
") lines.append(f"📋 {track_name} — Detailed Results") lines.append("") - lines.append(self._format_evaluation_details(evaluation)) + run_stats = track_result.get("run_stats") + lines.extend(self._format_run_stats_summary(run_stats)) + + run_detail_lines = self._format_run_details_md(run_stats) + lines.extend(run_detail_lines) + + critic_stats = track_result.get("critic_stats") + if critic_stats: + lines.extend(self._format_critic_stats_summary(critic_stats)) + + # Only show the critic results table when there are no per-run + # details (run details already include per-run field tables) + if not run_detail_lines: + lines.append(self._format_evaluation_details(evaluation)) lines.append("") lines.append("
") lines.append("") @@ -886,6 +835,81 @@ class MarkdownFormatter(EvalResultFormatter): return lines + def _format_summary_table_md( + self, + total_cases: int, + total_passed: int, + total_failed: int, + total_warned: int, + failed_only: bool, + original_counts: tuple[int, int, int, int] | None, + ) -> list[str]: + """Build the summary table and pass rate used by regular and comparative formatters.""" + lines: list[str] = [] + if failed_only and original_counts: + orig_total, orig_passed, orig_failed, orig_warned = original_counts + lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)") + lines.append("") + lines.append("| Metric | Count |") + lines.append("|--------|-------|") + lines.append(f"| **Total** | {orig_total} |") + lines.append(f"| ✅ Passed | {orig_passed} |") + if orig_warned > 0: + lines.append(f"| ⚠️ Warnings | {orig_warned} |") + lines.append(f"| ❌ Failed | {orig_failed} |") + else: + lines.append("| Metric | Count |") + lines.append("|--------|-------|") + lines.append(f"| **Total** | {total_cases} |") + lines.append(f"| ✅ Passed | {total_passed} |") + if total_warned > 0: + lines.append(f"| ⚠️ Warnings | {total_warned} |") + if total_failed > 0: + lines.append(f"| ❌ Failed | {total_failed} |") + + # Pass rate + if total_cases > 0: + if failed_only and original_counts and original_counts[0] > 0: + pass_rate = (original_counts[1] / original_counts[0]) * 100 + else: + pass_rate = (total_passed / total_cases) * 100 + lines.append("") + lines.append(f"**Pass Rate:** {pass_rate:.1f}%") + + lines.append("") + return lines + + def _format_context_section_md( + self, + system_msg: str | None, + additional_messages: list[dict] | None, + ) -> list[str]: + """Build a collapsible context section for comparative display. + + Args: + system_msg: The system message, if any. + additional_messages: Conversation messages, if any. + + Returns: + List of formatted markdown lines (empty if no context data). + """ + if not system_msg and not additional_messages: + return [] + lines: list[str] = [] + lines.append("
") + lines.append("📋 Context") + lines.append("") + if system_msg: + lines.append(f"**System Message:** {system_msg}") + lines.append("") + if additional_messages: + lines.append(f"**💬 Conversation ({len(additional_messages)} messages):**") + lines.append("") + lines.extend(self._format_conversation_md(additional_messages)) + lines.append("
") + lines.append("") + return lines + def _format_conversation_md(self, messages: list[dict]) -> list[str]: """Format conversation messages as Markdown for context display.""" lines: list[str] = [] @@ -1003,7 +1027,25 @@ class CaptureMarkdownFormatter(CaptureFormatter): lines.append("#### Tool Calls") lines.append("") - if case.tool_calls: + runs = getattr(case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f"**Run {run_index}**") + lines.append("") + if run.tool_calls: + for tc in run.tool_calls: + total_calls += 1 + lines.append(f"**`{tc.name}`**") + if tc.args: + lines.append("") + lines.append("```json") + lines.append(json.dumps(tc.args, indent=2)) + lines.append("```") + lines.append("") + else: + lines.append("*No tool calls captured*") + lines.append("") + elif case.tool_calls: for tc in case.tool_calls: total_calls += 1 lines.append(f"**`{tc.name}`**") @@ -1104,7 +1146,11 @@ class CaptureMarkdownFormatter(CaptureFormatter): continue captured_case = models_dict[model] - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + tool_names = f"{len(runs)} run(s)" + total_calls += sum(len(run.tool_calls) for run in runs) + elif captured_case.tool_calls: tool_names = ", ".join( f"`{tc.name}`" for tc in captured_case.tool_calls ) @@ -1121,21 +1167,39 @@ class CaptureMarkdownFormatter(CaptureFormatter): continue captured_case = models_dict[model] - if not captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if not runs and not captured_case.tool_calls: continue lines.append("
") lines.append(f"🤖 {model} - Details") lines.append("") - for tc in captured_case.tool_calls: - lines.append(f"**`{tc.name}`**") - if tc.args: + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f"**Run {run_index}**") + lines.append("") + if run.tool_calls: + for tc in run.tool_calls: + lines.append(f"**`{tc.name}`**") + if tc.args: + lines.append("") + lines.append("```json") + lines.append(json.dumps(tc.args, indent=2)) + lines.append("```") + lines.append("") + else: + lines.append("*No tool calls captured*") + lines.append("") + else: + for tc in captured_case.tool_calls: + lines.append(f"**`{tc.name}`**") + if tc.args: + lines.append("") + lines.append("```json") + lines.append(json.dumps(tc.args, indent=2)) + lines.append("```") lines.append("") - lines.append("```json") - lines.append(json.dumps(tc.args, indent=2)) - lines.append("```") - lines.append("") lines.append("
") lines.append("") @@ -1160,7 +1224,11 @@ class CaptureMarkdownFormatter(CaptureFormatter): continue captured_case = models_dict[model] - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + tool_names = f"{len(runs)} run(s)" + total_calls += sum(len(run.tool_calls) for run in runs) + elif captured_case.tool_calls: tool_names = ", ".join( f"`{tc.name}`" for tc in captured_case.tool_calls ) @@ -1177,21 +1245,39 @@ class CaptureMarkdownFormatter(CaptureFormatter): continue captured_case = models_dict[model] - if not captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if not runs and not captured_case.tool_calls: continue lines.append("
") lines.append(f"🤖 {model} - Tool Call Details") lines.append("") - for tc in captured_case.tool_calls: - lines.append(f"**`{tc.name}`**") - if tc.args: + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f"**Run {run_index}**") + lines.append("") + if run.tool_calls: + for tc in run.tool_calls: + lines.append(f"**`{tc.name}`**") + if tc.args: + lines.append("") + lines.append("```json") + lines.append(json.dumps(tc.args, indent=2)) + lines.append("```") + lines.append("") + else: + lines.append("*No tool calls captured*") + lines.append("") + else: + for tc in captured_case.tool_calls: + lines.append(f"**`{tc.name}`**") + if tc.args: + lines.append("") + lines.append("```json") + lines.append(json.dumps(tc.args, indent=2)) + lines.append("```") lines.append("") - lines.append("```json") - lines.append(json.dumps(tc.args, indent=2)) - lines.append("```") - lines.append("") lines.append("
") lines.append("") diff --git a/libs/arcade-cli/arcade_cli/formatters/text.py b/libs/arcade-cli/arcade_cli/formatters/text.py index 88bf3bc0..e8c5bcbc 100644 --- a/libs/arcade-cli/arcade_cli/formatters/text.py +++ b/libs/arcade-cli/arcade_cli/formatters/text.py @@ -91,7 +91,14 @@ class TextFormatter(EvalResultFormatter): status = "FAILED" score_percentage = evaluation.score * 100 - lines.append(f" {status} {case['name']} -- Score: {score_percentage:.2f}%") + run_stats = case.get("run_stats") or {} + stats_suffix = "" + if run_stats.get("num_runs", 1) > 1: + std_pct = run_stats.get("std_deviation", 0.0) * 100 + stats_suffix = f" (n={run_stats['num_runs']}, sd={std_pct:.2f}%)" + lines.append( + f" {status} {case['name']} -- Score: {score_percentage:.2f}%{stats_suffix}" + ) if show_details: lines.append(f" User Input: {case['input']}") @@ -112,6 +119,10 @@ class TextFormatter(EvalResultFormatter): lines.append("") lines.append(" Details:") + for stat_line in self._format_run_stats(case): + lines.append(f" {stat_line}") + for stat_line in self._format_critic_stats(case): + lines.append(f" {stat_line}") for detail_line in self._format_evaluation(evaluation).split("\n"): lines.append(f" {detail_line}") lines.append(" " + "-" * 52) @@ -121,23 +132,16 @@ class TextFormatter(EvalResultFormatter): lines.append("") # Summary - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)") - summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}" - if orig_warned > 0: - summary += f" -- Warnings: {orig_warned}" - if orig_failed > 0: - summary += f" -- Failed: {orig_failed}" - else: - summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}" - if total_warned > 0: - summary += f" -- Warnings: {total_warned}" - if total_failed > 0: - summary += f" -- Failed: {total_failed}" - - lines.append(summary) - lines.append("") + lines.extend( + self._format_summary_lines( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) return "\n".join(lines) @@ -169,6 +173,70 @@ class TextFormatter(EvalResultFormatter): ) return "\n".join(result_lines) + def _format_run_stats(self, case: dict[str, Any]) -> list[str]: + run_stats = case.get("run_stats") + if not run_stats or run_stats.get("num_runs", 1) < 2: + return [] + scores = run_stats.get("scores", []) + scores_display = ", ".join(f"{score * 100:.2f}%" for score in scores) + mean_pct = run_stats.get("mean_score", 0.0) * 100 + std_pct = run_stats.get("std_deviation", 0.0) * 100 + lines = [ + "Run Stats:", + f" Runs: {run_stats.get('num_runs', len(scores))}", + f" Mean Score: {mean_pct:.2f}%", + f" Std Deviation: {std_pct:.2f}%", + ] + if scores_display: + lines.append(f" Scores: {scores_display}") + seed_policy = run_stats.get("seed_policy") + run_seeds = run_stats.get("run_seeds") + if seed_policy: + lines.append(f" Seed Policy: {seed_policy}") + if run_seeds and any(seed is not None for seed in run_seeds): + seeds_display = ", ".join(str(seed) for seed in run_seeds) + lines.append(f" Run Seeds: {seeds_display}") + pass_rule = run_stats.get("pass_rule") + if pass_rule: + lines.append(f" Pass Rule: {pass_rule}") + + runs = run_stats.get("runs", []) + if runs: + lines.append(" Run Results:") + for idx, run in enumerate(runs, start=1): + if run.get("passed"): + status = "PASSED" + elif run.get("warning"): + status = "WARNED" + else: + status = "FAILED" + score_pct = run.get("score", 0.0) * 100 + run_line = f" Run {idx}: {status} ({score_pct:.2f}%)" + failure_reason = run.get("failure_reason") + if failure_reason: + run_line += f" -- {failure_reason}" + lines.append(run_line) + lines.append("") + return lines + + def _format_critic_stats(self, case: dict[str, Any]) -> list[str]: + critic_stats = case.get("critic_stats") + if not critic_stats: + return [] + lines = ["Critic Stats:"] + for field, stats in critic_stats.items(): + weight = stats.get("weight", 0.0) + mean_norm = stats.get("mean_score_normalized", 0.0) * 100 + std_norm = stats.get("std_deviation_normalized", 0.0) * 100 + mean_weighted = stats.get("mean_score", 0.0) * 100 + std_weighted = stats.get("std_deviation", 0.0) * 100 + lines.append( + f" {field}: norm {mean_norm:.2f}% ± {std_norm:.2f}% | " + f"weighted {mean_weighted:.2f}% ± {std_weighted:.2f}% (w={weight:.2f})" + ) + lines.append("") + return lines + # ========================================================================= # MULTI-MODEL EVALUATION FORMATTING # ========================================================================= @@ -312,6 +380,11 @@ class TextFormatter(EvalResultFormatter): lines.append(f" [{model}] Score: {evaluation.score * 100:.1f}%") + for stat_line in self._format_run_stats(case_result): + lines.append(f" {stat_line}") + for stat_line in self._format_critic_stats(case_result): + lines.append(f" {stat_line}") + # Show evaluation details indented eval_details = self._format_evaluation(evaluation) for line in eval_details.split("\n"): @@ -420,60 +493,13 @@ class TextFormatter(EvalResultFormatter): lines.append(" " + "-" * 72) for case_name, case_data in cases.items(): - # Context section (if include_context is True) if include_context: - system_msg = case_data.get("system_message") - addl_msgs = case_data.get("additional_messages") - if system_msg or addl_msgs: - lines.append(" " + "-" * 40) - lines.append(" 📋 CONTEXT") - lines.append(" " + "-" * 40) - if system_msg: - lines.append(f" System Message: {system_msg}") - if addl_msgs: - lines.append(f" 💬 Conversation ({len(addl_msgs)} messages):") - for msg in addl_msgs: - role = msg.get("role", "unknown").upper() - content = msg.get("content", "") - name = msg.get("name", "") - role_label = f"[{role}]" if not name else f"[{role}: {name}]" - lines.append(f" {role_label}") - if content: - # For tool responses, try to format JSON - if role.lower() == "tool": - try: - import json - - parsed = json.loads(content) - formatted = json.dumps(parsed, indent=2) - for json_line in formatted.split("\n"): - lines.append(f" {json_line}") - except (json.JSONDecodeError, TypeError): - lines.append(f" {content}") - else: - lines.append(f" {content}") - # Handle tool calls - tool_calls = msg.get("tool_calls", []) - if tool_calls: - for tc in tool_calls: - func = tc.get("function", {}) - tc_name = func.get("name", "unknown") - tc_args = func.get("arguments", "{}") - lines.append(f" 🔧 {tc_name}") - try: - import json - - args_dict = ( - json.loads(tc_args) - if isinstance(tc_args, str) - else tc_args - ) - formatted = json.dumps(args_dict, indent=2) - for arg_line in formatted.split("\n"): - lines.append(f" {arg_line}") - except (json.JSONDecodeError, TypeError): - lines.append(f" {tc_args}") - lines.append(" " + "-" * 40) + lines.extend( + self._format_context_block( + case_data.get("system_message"), + case_data.get("additional_messages"), + ) + ) lines.extend( self._format_comparative_case_text( @@ -484,23 +510,16 @@ class TextFormatter(EvalResultFormatter): lines.append("") # Summary - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)") - summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}" - if orig_warned > 0: - summary += f" -- Warnings: {orig_warned}" - if orig_failed > 0: - summary += f" -- Failed: {orig_failed}" - else: - summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}" - if total_warned > 0: - summary += f" -- Warnings: {total_warned}" - if total_failed > 0: - summary += f" -- Failed: {total_failed}" - - lines.append(summary) - lines.append("") + lines.extend( + self._format_summary_lines( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) return "\n".join(lines) @@ -563,61 +582,14 @@ class TextFormatter(EvalResultFormatter): if case_input: lines.append(f" Input: {case_input}") - # Context section (if include_context is True) if include_context: - system_msg = first_model_data.get("system_message") - addl_msgs = first_model_data.get("additional_messages") - if system_msg or addl_msgs: + context_lines = self._format_context_block( + first_model_data.get("system_message"), + first_model_data.get("additional_messages"), + ) + if context_lines: lines.append("") - lines.append(" " + "-" * 40) - lines.append(" 📋 CONTEXT") - lines.append(" " + "-" * 40) - if system_msg: - lines.append(f" System Message: {system_msg}") - if addl_msgs: - lines.append(f" 💬 Conversation ({len(addl_msgs)} messages):") - for msg in addl_msgs: - role = msg.get("role", "unknown").upper() - content = msg.get("content", "") - name = msg.get("name", "") - role_label = f"[{role}]" if not name else f"[{role}: {name}]" - lines.append(f" {role_label}") - if content: - # For tool responses, try to format JSON - if role.lower() == "tool": - try: - import json - - parsed = json.loads(content) - formatted = json.dumps(parsed, indent=2) - for json_line in formatted.split("\n"): - lines.append(f" {json_line}") - except (json.JSONDecodeError, TypeError): - lines.append(f" {content}") - else: - lines.append(f" {content}") - # Handle tool calls in assistant messages - tool_calls = msg.get("tool_calls", []) - if tool_calls: - for tc in tool_calls: - func = tc.get("function", {}) - tc_name = func.get("name", "unknown") - tc_args = func.get("arguments", "{}") - lines.append(f" 🔧 {tc_name}") - try: - import json - - args_dict = ( - json.loads(tc_args) - if isinstance(tc_args, str) - else tc_args - ) - formatted = json.dumps(args_dict, indent=2) - for arg_line in formatted.split("\n"): - lines.append(f" {arg_line}") - except (json.JSONDecodeError, TypeError): - lines.append(f" {tc_args}") - lines.append(" " + "-" * 40) + lines.extend(context_lines) lines.append("") @@ -643,23 +615,16 @@ class TextFormatter(EvalResultFormatter): # Summary lines.append("=" * 78) - if failed_only and original_counts: - orig_total, orig_passed, orig_failed, orig_warned = original_counts - lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)") - summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}" - if orig_warned > 0: - summary += f" -- Warnings: {orig_warned}" - if orig_failed > 0: - summary += f" -- Failed: {orig_failed}" - else: - summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}" - if total_warned > 0: - summary += f" -- Warnings: {total_warned}" - if total_failed > 0: - summary += f" -- Failed: {total_failed}" - - lines.append(summary) - lines.append("") + lines.extend( + self._format_summary_lines( + total_cases, + total_passed, + total_failed, + total_warned, + failed_only, + original_counts, + ) + ) return "\n".join(lines) @@ -753,12 +718,76 @@ class TextFormatter(EvalResultFormatter): continue lines.append(f" [{track_name}] Details:") + for stat_line in self._format_run_stats(track_result): + lines.append(f" {stat_line}") + for stat_line in self._format_critic_stats(track_result): + lines.append(f" {stat_line}") for detail_line in self._format_evaluation(evaluation).split("\n"): lines.append(f" {detail_line}") lines.append("") return lines + def _format_summary_lines( + self, + total_cases: int, + total_passed: int, + total_failed: int, + total_warned: int, + failed_only: bool, + original_counts: tuple[int, int, int, int] | None, + ) -> list[str]: + """Build the summary lines used by regular and comparative formatters.""" + lines: list[str] = [] + if failed_only and original_counts: + orig_total, orig_passed, orig_failed, orig_warned = original_counts + lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)") + summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}" + if orig_warned > 0: + summary += f" -- Warnings: {orig_warned}" + if orig_failed > 0: + summary += f" -- Failed: {orig_failed}" + else: + summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}" + if total_warned > 0: + summary += f" -- Warnings: {total_warned}" + if total_failed > 0: + summary += f" -- Failed: {total_failed}" + lines.append(summary) + lines.append("") + return lines + + def _format_context_block( + self, + system_msg: str | None, + additional_messages: list[dict] | None, + indent: str = " ", + ) -> list[str]: + """Build the context section lines for comparative display. + + Args: + system_msg: The system message, if any. + additional_messages: Conversation messages, if any. + indent: Base indentation prefix for each line. + + Returns: + List of formatted lines (empty if no context data). + """ + if not system_msg and not additional_messages: + return [] + lines: list[str] = [] + lines.append(indent + "-" * 40) + lines.append(indent + "📋 CONTEXT") + lines.append(indent + "-" * 40) + if system_msg: + lines.append(f"{indent}System Message: {system_msg}") + if additional_messages: + lines.append(f"{indent}💬 Conversation ({len(additional_messages)} messages):") + for conv_line in self._format_conversation_text(additional_messages): + lines.append(f"{indent}{conv_line}") + lines.append(indent + "-" * 40) + return lines + def _format_conversation_text(self, messages: list[dict]) -> list[str]: """Format conversation messages as plain text for context display.""" lines: list[str] = [] @@ -858,7 +887,22 @@ class CaptureTextFormatter(CaptureFormatter): lines.append("") lines.append(" Tool Calls:") - if case.tool_calls: + runs = getattr(case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f" Run {run_index}:") + if run.tool_calls: + for tc in run.tool_calls: + total_calls += 1 + lines.append(f" - {tc.name}") + if tc.args: + for key, value in tc.args.items(): + lines.append( + f" {key}: {self._format_value(value)}" + ) + else: + lines.append(" (no tool calls)") + elif case.tool_calls: for tc in case.tool_calls: total_calls += 1 lines.append(f" - {tc.name}") @@ -949,7 +993,21 @@ class CaptureTextFormatter(CaptureFormatter): captured_case = models_dict[model] lines.append(f" │ [{model}]") - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f" │ Run {run_index}:") + if run.tool_calls: + for tc in run.tool_calls: + lines.append(f" │ - {tc.name}") + if tc.args: + for key, value in tc.args.items(): + lines.append( + f" │ {key}: {self._format_value(value)}" + ) + else: + lines.append(" │ (no tool calls)") + elif captured_case.tool_calls: for tc in captured_case.tool_calls: lines.append(f" │ - {tc.name}") if tc.args: @@ -980,7 +1038,21 @@ class CaptureTextFormatter(CaptureFormatter): captured_case = models_dict[model] lines.append(f" [{model}]") - if captured_case.tool_calls: + runs = getattr(captured_case, "runs", None) + if runs: + for run_index, run in enumerate(runs, start=1): + lines.append(f" Run {run_index}:") + if run.tool_calls: + for tc in run.tool_calls: + lines.append(f" - {tc.name}") + if tc.args: + for key, value in tc.args.items(): + lines.append( + f" {key}: {self._format_value(value)}" + ) + else: + lines.append(" (no tool calls)") + elif captured_case.tool_calls: for tc in captured_case.tool_calls: lines.append(f" - {tc.name}") if tc.args: diff --git a/libs/arcade-cli/arcade_cli/main.py b/libs/arcade-cli/arcade_cli/main.py index 78b52b3a..d39413c7 100644 --- a/libs/arcade-cli/arcade_cli/main.py +++ b/libs/arcade-cli/arcade_cli/main.py @@ -405,13 +405,29 @@ def evals( "-c", help="Maximum number of concurrent evaluations (default: 1)", ), - use_provider: Optional[str] = typer.Option( + num_runs: int = typer.Option( + 1, + "--num-runs", + "-n", + help="Number of runs per case (default: 1).", + ), + seed: str = typer.Option( + "constant", + "--seed", + help="Seed policy for OpenAI runs (ignored for Anthropic): " + "'constant' (default), 'random', or an integer.", + ), + multi_run_pass_rule: str = typer.Option( + "last", + "--multi-run-pass-rule", + help="Pass/fail aggregation for multi-run cases: 'last' (default), 'mean', or 'majority'.", + ), + use_provider: Optional[list[str]] = typer.Option( None, "--use-provider", "-p", help="Provider(s) and models to use. Format: 'provider' or 'provider:model1,model2'. " - "Multiple providers: separate with spaces. " - "Examples: 'openai' or 'openai:gpt-4o anthropic:claude-sonnet-4-5-20250929'", + "Can be repeated. Examples: --use-provider openai or --use-provider openai:gpt-4o --use-provider anthropic:claude-sonnet-4-5-20250929", ), api_key: Optional[list[str]] = typer.Option( None, @@ -476,6 +492,39 @@ def evals( pip_install_command=r"pip install arcade-tdk", ) + # --- Validate multi-run parameters upfront (before any API calls) --- + if num_runs < 1: + handle_cli_error("--num-runs must be >= 1", should_exit=True) + return + + seed_value: str | int + seed_lower = seed.strip().lower() + if seed_lower in {"constant", "random"}: + seed_value = seed_lower + else: + try: + seed_value = int(seed) + except ValueError: + handle_cli_error( + "Invalid --seed value. Use 'constant', 'random', or an integer.", should_exit=True + ) + return + if seed_value < 0: + handle_cli_error("--seed must be a non-negative integer.", should_exit=True) + return + + pass_rule = multi_run_pass_rule.strip().lower() + # Lazy import: arcade_evals requires optional deps (openai) that aren't + # available when the CLI is installed without the [evals] extra. + from arcade_evals._evalsuite._types import _VALID_PASS_RULES + + if pass_rule not in _VALID_PASS_RULES: + handle_cli_error( + f"Invalid --multi-run-pass-rule. Valid values: {', '.join(sorted(_VALID_PASS_RULES))}.", + should_exit=True, + ) + return + # --- Build model specs from flags --- model_specs: list[ModelSpec] = [] @@ -483,11 +532,10 @@ def evals( api_keys = resolve_provider_api_keys(api_keys_specs=api_key) if use_provider: - # Parse provider specs - supports space-separated values - # e.g., "openai:gpt-4o anthropic:claude" - provider_specs = use_provider.split() + # Parse provider specs - supports multiple --use-provider flags + # e.g., --use-provider openai:gpt-4o --use-provider anthropic:claude try: - provider_configs = [parse_provider_spec(spec) for spec in provider_specs] + provider_configs = [parse_provider_spec(spec) for spec in use_provider] except ValueError as e: handle_cli_error(str(e), should_exit=True) return # For type checker @@ -594,6 +642,8 @@ def evals( output_file=final_output_file, output_format=",".join(final_output_formats) if final_output_formats else "txt", console=console, + num_runs=num_runs, + seed=seed_value, ) ) else: @@ -608,6 +658,9 @@ def evals( failed_only=only_failed, include_context=include_context, console=console, + num_runs=num_runs, + seed=seed_value, + multi_run_pass_rule=pass_rule, ) ) except Exception as e: diff --git a/libs/arcade-evals/README.md b/libs/arcade-evals/README.md index 97ec57c4..79b89b21 100644 --- a/libs/arcade-evals/README.md +++ b/libs/arcade-evals/README.md @@ -9,7 +9,10 @@ Arcade Evals provides comprehensive evaluation capabilities for Arcade tools: - **Evaluation Framework**: Cases, suites, and rubrics for systematic testing - **Critics**: Different types of comparisons (binary, numeric, similarity, datetime) - **Tool Evaluation**: Decorators and utilities for evaluating tool performance -- **Result Analysis**: Comprehensive evaluation results and reporting +- **Multi-Run Statistics**: Run each case multiple times with configurable seed policies and pass rules to measure consistency +- **Comparative Evaluation**: Compare tool performance across multiple sources/tracks side-by-side +- **Capture Mode**: Record model tool calls without scoring for debugging and baseline generation +- **Result Analysis**: Comprehensive evaluation results and reporting in multiple formats (text, markdown, HTML, JSON) ## Installation @@ -81,6 +84,31 @@ rubric = EvalRubric( suite = EvalSuite(cases=[case1], rubric=rubric) ``` +### Multi-Run Evaluation + +Run each case multiple times to measure consistency: + +```python +# Run via the CLI +# arcade evals eval_file.py --num-runs 5 --seed random --multi-run-pass-rule majority + +# Or programmatically +result = await suite.run( + client, + model="gpt-4o", + num_runs=5, # Run each case 5 times + seed="random", # Different seed per run + multi_run_pass_rule="majority", # Pass if >50% of runs pass +) +``` + +Multi-run results include per-case statistics: +- **Mean score** and **standard deviation** across runs +- **Per-run pass/fail** with individual scores +- **Per-critic field** score breakdowns across runs +- Configurable **pass rules**: `last` (default), `mean`, or `majority` +- Configurable **seed policies**: `constant` (fixed seed 42), `random`, or a specific integer + ## License MIT License - see LICENSE file for details. diff --git a/libs/arcade-evals/arcade_evals/__init__.py b/libs/arcade-evals/arcade_evals/__init__.py index 83d1c092..2531b19a 100644 --- a/libs/arcade-evals/arcade_evals/__init__.py +++ b/libs/arcade-evals/arcade_evals/__init__.py @@ -1,6 +1,6 @@ from ._evalsuite._providers import ProviderName from ._evalsuite._tool_registry import MCPToolDefinition -from .capture import CapturedCase, CapturedToolCall, CaptureResult +from .capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult from .critic import BinaryCritic, DatetimeCritic, NoneCritic, NumericCritic, SimilarityCritic from .eval import ( AnyExpectedToolCall, @@ -25,6 +25,7 @@ __all__ = [ "BinaryCritic", "CaptureResult", "CapturedCase", + "CapturedRun", "CapturedToolCall", "DatetimeCritic", "EvalRubric", @@ -41,8 +42,8 @@ __all__ = [ "Weight", "clear_tools_cache", "load_arcade_mcp_gateway_async", - "load_mcp_remote_async", "load_from_stdio_async", + "load_mcp_remote_async", "load_stdio_arcade_async", "tool_eval", "validate_and_normalize_critic_weights", diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py index 711f9e8e..69b2079b 100644 --- a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py +++ b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py @@ -7,9 +7,11 @@ keeping it separate from the main evaluation logic in eval.py. from __future__ import annotations import asyncio +import random from typing import TYPE_CHECKING, Any -from arcade_evals.capture import CapturedCase, CapturedToolCall, CaptureResult +from arcade_evals._evalsuite._types import _resolve_seed_spec +from arcade_evals.capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult if TYPE_CHECKING: from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder @@ -39,6 +41,7 @@ class _EvalSuiteCaptureMixin: model: str, case: EvalCase, registry: EvalSuiteToolRegistry | None = None, + seed: int | None = None, ) -> list[tuple[str, dict[str, Any]]]: raise NotImplementedError # Implemented in EvalSuite @@ -67,6 +70,8 @@ class _EvalSuiteCaptureMixin: model: str, provider: ProviderName = "openai", include_context: bool = False, + num_runs: int = 1, + seed: str | int | None = "constant", ) -> CaptureResult: """ Run the evaluation suite in capture mode - records tool calls without scoring. @@ -86,10 +91,15 @@ class _EvalSuiteCaptureMixin: provider: The provider name ("openai" or "anthropic"). include_context: Whether to include system_message and additional_messages in the output. + num_runs: Number of runs per case. + seed: Seed policy ("constant", "random", or an integer seed). Returns: A CaptureResult containing all captured tool calls. """ + if num_runs < 1: + raise ValueError("num_runs must be >= 1") + all_captured: list[CapturedCase] = [] semaphore = asyncio.Semaphore(self.max_concurrent) @@ -106,34 +116,54 @@ class _EvalSuiteCaptureMixin: "No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog." ) - # Get tool calls based on provider - if provider == "anthropic": - predicted_args = await self._run_anthropic( - client, model, case, registry=use_registry - ) + seed_policy, seed_value = _resolve_seed_spec(seed) + if provider == "openai": + if seed_policy == "random": + run_seeds: list[int | None] = [ + random.randint(0, 2**31 - 1) # noqa: S311 + for _ in range(num_runs) + ] + else: + run_seeds = [seed_value for _ in range(num_runs)] else: - predicted_args = await self._run_openai( - client, model, case, registry=use_registry + run_seeds = [None for _ in range(num_runs)] + + runs: list[CapturedRun] = [] + for run_index in range(num_runs): + run_seed = run_seeds[run_index] + # Get tool calls based on provider + if provider == "anthropic": + predicted_args = await self._run_anthropic( + client, model, case, registry=use_registry + ) + else: + predicted_args = await self._run_openai( + client, model, case, registry=use_registry, seed=run_seed + ) + + # Process tool calls (resolve names, fill defaults) + filled_actual_tool_calls = self._process_tool_calls( + predicted_args, registry=use_registry ) - # Process tool calls (resolve names, fill defaults) - filled_actual_tool_calls = self._process_tool_calls( - predicted_args, registry=use_registry - ) + # Convert to CapturedToolCall objects + tool_calls = [ + CapturedToolCall(name=name, args=args) + for name, args in filled_actual_tool_calls + ] - # Convert to CapturedToolCall objects - tool_calls = [ - CapturedToolCall(name=name, args=args) - for name, args in filled_actual_tool_calls - ] + runs.append(CapturedRun(tool_calls=tool_calls)) + + primary_tool_calls = runs[0].tool_calls if runs else [] return CapturedCase( case_name=case.name, user_message=case.user_message, - tool_calls=tool_calls, + tool_calls=primary_tool_calls, system_message=case.system_message if include_context else None, additional_messages=case.additional_messages if include_context else None, track_name=track, + runs=runs if len(runs) > 1 else [], ) # Capture regular cases (using default registry) diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py index fc2027e1..66f3f71e 100644 --- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py +++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py @@ -6,6 +6,7 @@ multiple tool tracks with track-specific expected results and critics. from __future__ import annotations +from collections.abc import Sequence from typing import TYPE_CHECKING, Any from arcade_evals._evalsuite._types import ( @@ -45,7 +46,7 @@ class ComparativeCaseBuilder: name: str, user_message: str, system_message: str = "", - additional_messages: list[dict[str, str]] | None = None, + additional_messages: list[dict[str, Any]] | None = None, rubric: EvalRubric | None = None, ) -> None: """Initialize the builder. @@ -70,7 +71,7 @@ class ComparativeCaseBuilder: def for_track( self, track_name: str, - expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall], + expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall], critics: list[Critic] | None = None, ) -> ComparativeCaseBuilder: """Add track-specific configuration. diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py index a0e69251..bf8c7a16 100644 --- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py +++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py @@ -7,17 +7,25 @@ allowing the same cases to be run against multiple tool tracks. from __future__ import annotations import asyncio +import logging import time from typing import TYPE_CHECKING, Any from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder -from arcade_evals._evalsuite._types import ComparativeCase, EvalRubric +from arcade_evals._evalsuite._types import ( + _VALID_PASS_RULES, + PASS_RULE_LAST, + ComparativeCase, + EvalRubric, +) if TYPE_CHECKING: from arcade_evals._evalsuite._providers import ProviderName from arcade_evals._evalsuite._tool_registry import EvalSuiteToolRegistry from arcade_evals._evalsuite._tracks import TrackManager +logger = logging.getLogger(__name__) + class _EvalSuiteComparativeMixin: """Mixin providing comparative evaluation execution methods.""" @@ -36,12 +44,26 @@ class _EvalSuiteComparativeMixin: _run_openai: Any # Method from EvalSuite _run_anthropic: Any # Method from EvalSuite + async def _run_case_with_stats( + self, + case: Any, + client: Any, + model: str, + provider: ProviderName, + *, + num_runs: int, + seed: str | int | None, + pass_rule: str, + registry: EvalSuiteToolRegistry | None = None, + ) -> dict[str, Any]: + raise NotImplementedError # Implemented in EvalSuite + def add_comparative_case( self, name: str, user_message: str, system_message: str | None = None, - additional_messages: list[dict[str, str]] | None = None, + additional_messages: list[dict[str, Any]] | None = None, rubric: EvalRubric | None = None, ) -> ComparativeCaseBuilder: """Create a comparative case that runs against multiple tool tracks. @@ -90,6 +112,9 @@ class _EvalSuiteComparativeMixin: client: Any, model: str, provider: ProviderName = "openai", + num_runs: int = 1, + seed: str | int | None = "constant", + multi_run_pass_rule: str = PASS_RULE_LAST, ) -> dict[str, dict[str, Any]]: """Run comparative cases across all configured tracks. @@ -97,6 +122,9 @@ class _EvalSuiteComparativeMixin: client: The LLM client instance. model: The model to evaluate. provider: The provider name. + num_runs: Number of runs per case. + seed: Seed policy ("constant", "random", or an integer seed). + multi_run_pass_rule: How to determine pass/warn for multi-run cases. Returns: Dictionary mapping track names to their results. @@ -116,6 +144,15 @@ class _EvalSuiteComparativeMixin: "No comparative cases defined. Use add_comparative_case() to add cases." ) + # Validate upfront before making any API calls + if num_runs < 1: + raise ValueError("num_runs must be >= 1") + if multi_run_pass_rule not in _VALID_PASS_RULES: + raise ValueError( + f"Invalid multi-run pass rule '{multi_run_pass_rule}'. " + f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}" + ) + # Build and validate all cases upfront comparative_cases: list[ComparativeCase] = [] all_required_tracks: set[str] = set() @@ -183,27 +220,21 @@ class _EvalSuiteComparativeMixin: ) -> dict[str, Any]: async with semaphore: start = time.time() - print(f" [TASK START] {_case.name} @ {_t_name}", flush=True) - if provider == "anthropic": - predicted_args = await self._run_anthropic( - client, model, _case, registry=_reg - ) - else: - predicted_args = await self._run_openai( - client, model, _case, registry=_reg - ) + logger.debug("[TASK START] %s @ %s", _case.name, _t_name) + case_result = await self._run_case_with_stats( + _case, + client, + model, + provider, + num_runs=num_runs, + seed=seed, + pass_rule=multi_run_pass_rule, + registry=_reg, + ) elapsed = time.time() - start - print( - f" [TASK DONE] {_case.name} @ {_t_name} ({elapsed:.1f}s)", - flush=True, - ) + logger.debug("[TASK DONE] %s @ %s (%.1fs)", _case.name, _t_name, elapsed) - filled_actual_tool_calls = self._process_tool_calls( - predicted_args, registry=_reg - ) - evaluation = _case.evaluate(filled_actual_tool_calls) - - return { + result = { "name": _case.name, "track": _t_name, "input": _case.user_message, @@ -215,10 +246,15 @@ class _EvalSuiteComparativeMixin: ], "predicted_tool_calls": [ {"name": name, "args": args} - for name, args in filled_actual_tool_calls + for name, args in case_result["predicted_tool_calls"] ], - "evaluation": evaluation, + "evaluation": case_result["evaluation"], } + if num_runs > 1: + result["run_stats"] = case_result["run_stats"] + if case_result["critic_stats"]: + result["critic_stats"] = case_result["critic_stats"] + return result task = run_track_case(eval_case, registry, track_name) tasks.append((track_name, task)) diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py index a43063d5..1aa29303 100644 --- a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py +++ b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py @@ -6,9 +6,44 @@ eval.py and the _evalsuite submodules, avoiding circular imports. from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Callable +DEFAULT_EVAL_SEED = 42 + +# Pass-rule constants (shared across eval.py & _comparative_execution.py) +PASS_RULE_LAST = "last" # noqa: S105 +PASS_RULE_MEAN = "mean" # noqa: S105 +PASS_RULE_MAJORITY = "majority" # noqa: S105 +_VALID_PASS_RULES: frozenset[str] = frozenset({PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY}) + + +def _resolve_seed_spec(seed: str | int | None) -> tuple[str, int | None]: + """Resolve a seed specification into a (policy, value) pair. + + Args: + seed: 'constant', 'random', an integer, a numeric string, or None. + + Returns: + A tuple of (policy_name, seed_value). policy_name is one of + 'constant', 'random', or 'custom'. + """ + if seed is None: + return "constant", DEFAULT_EVAL_SEED + if isinstance(seed, int): + return "custom", seed + seed_value = seed.strip().lower() + if seed_value == "constant": + return "constant", DEFAULT_EVAL_SEED + if seed_value == "random": + return "random", None + try: + return "custom", int(seed_value) + except ValueError as exc: + raise ValueError("Invalid seed. Use 'constant', 'random', or an integer value.") from exc + + if TYPE_CHECKING: from arcade_evals.critic import Critic @@ -117,7 +152,7 @@ class TrackConfig: critics: Critics to evaluate tool arguments for this track. """ - expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall] + expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall] critics: list[Critic] = field(default_factory=list) @@ -140,14 +175,14 @@ class ComparativeCase: name: str user_message: str system_message: str = "" - additional_messages: list[dict[str, str]] = field(default_factory=list) + additional_messages: list[dict[str, Any]] = field(default_factory=list) rubric: EvalRubric | None = None track_configs: dict[str, TrackConfig] = field(default_factory=dict) def add_track_config( self, track_name: str, - expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall], + expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall], critics: list[Critic] | None = None, ) -> None: """Add configuration for a track. diff --git a/libs/arcade-evals/arcade_evals/capture.py b/libs/arcade-evals/arcade_evals/capture.py index d5ad4aeb..19b98033 100644 --- a/libs/arcade-evals/arcade_evals/capture.py +++ b/libs/arcade-evals/arcade_evals/capture.py @@ -38,6 +38,22 @@ class CapturedToolCall: return {"name": self.name, "args": self.args} +@dataclass +class CapturedRun: + """ + A single capture run for a case, containing tool calls. + + Attributes: + tool_calls: List of tool calls made by the model in this run. + """ + + tool_calls: list[CapturedToolCall] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return {"tool_calls": [tc.to_dict() for tc in self.tool_calls]} + + @dataclass class CapturedCase: """ @@ -50,6 +66,7 @@ class CapturedCase: system_message: The system message (included if include_context is True). additional_messages: Additional messages (included if include_context is True). track_name: The track name for comparative captures (None for regular cases). + runs: Optional list of runs (populated when num_runs > 1). """ case_name: str @@ -58,6 +75,7 @@ class CapturedCase: system_message: str | None = None additional_messages: list[dict[str, Any]] | None = None track_name: str | None = None + runs: list[CapturedRun] = field(default_factory=list) @staticmethod def _try_parse_json(value: str) -> Any: @@ -109,6 +127,8 @@ class CapturedCase: "user_message": self.user_message, "tool_calls": [tc.to_dict() for tc in self.tool_calls], } + if self.runs: + result["runs"] = [run.to_dict() for run in self.runs] if self.track_name: result["track_name"] = self.track_name if include_context: @@ -159,17 +179,32 @@ class CaptureResult: async def _capture_with_openai( - suite: EvalSuite, api_key: str, model: str, include_context: bool = False + suite: EvalSuite, + api_key: str, + model: str, + include_context: bool = False, + num_runs: int = 1, + seed: str | int | None = "constant", ) -> CaptureResult: """Run capture mode with OpenAI client.""" async with AsyncOpenAI(api_key=api_key) as client: return await suite.capture( - client, model, provider="openai", include_context=include_context + client, + model, + provider="openai", + include_context=include_context, + num_runs=num_runs, + seed=seed, ) async def _capture_with_anthropic( - suite: EvalSuite, api_key: str, model: str, include_context: bool = False + suite: EvalSuite, + api_key: str, + model: str, + include_context: bool = False, + num_runs: int = 1, + seed: str | int | None = "constant", ) -> CaptureResult: """Run capture mode with Anthropic client.""" try: @@ -182,5 +217,10 @@ async def _capture_with_anthropic( async with AsyncAnthropic(api_key=api_key) as client: return await suite.capture( - client, model, provider="anthropic", include_context=include_context + client, + model, + provider="anthropic", + include_context=include_context, + num_runs=num_runs, + seed=seed, ) diff --git a/libs/arcade-evals/arcade_evals/eval.py b/libs/arcade-evals/arcade_evals/eval.py index 27d926a6..a8dc9e09 100644 --- a/libs/arcade-evals/arcade_evals/eval.py +++ b/libs/arcade-evals/arcade_evals/eval.py @@ -3,7 +3,10 @@ import functools import inspect import json import logging +import random +from collections.abc import Sequence from dataclasses import dataclass, field +from statistics import mean, pstdev from typing import TYPE_CHECKING, Any, Callable import numpy as np @@ -24,11 +27,16 @@ from arcade_evals._evalsuite._tracks import TrackManager # Import shared types from _types module (breaks circular dependencies) from arcade_evals._evalsuite._types import ( + _VALID_PASS_RULES, + PASS_RULE_LAST, + PASS_RULE_MAJORITY, + PASS_RULE_MEAN, AnyExpectedToolCall, EvalRubric, ExpectedMCPToolCall, ExpectedToolCall, NamedExpectedToolCall, + _resolve_seed_spec, ) from arcade_evals.critic import NoneCritic from arcade_evals.weights import validate_and_normalize_critic_weights @@ -140,6 +148,88 @@ class EvaluationResult: self.score = total_score / total_weight if total_weight > 0 else 0.0 +# PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY, and _VALID_PASS_RULES +# are imported from _types (see top-level imports) to keep a single source of truth. + + +def _compute_mean_std(values: list[float]) -> tuple[float, float]: + if not values: + return 0.0, 0.0 + avg = mean(values) + if len(values) < 2: + return avg, 0.0 + return avg, pstdev(values) + + +def _resolve_pass_rule( + run_evaluations: list[EvaluationResult], + mean_score: float, + pass_rule: str, + rubric: EvalRubric, +) -> tuple[bool, bool]: + if pass_rule not in _VALID_PASS_RULES: + raise ValueError( + f"Invalid multi-run pass rule '{pass_rule}'. " + f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}" + ) + if not run_evaluations: + return False, False + if pass_rule == PASS_RULE_MEAN: + passed = mean_score >= rubric.fail_threshold + warning = not passed and mean_score >= rubric.warn_threshold + return passed, warning + if pass_rule == PASS_RULE_MAJORITY: + majority = len(run_evaluations) // 2 + 1 + passed_count = sum(1 for ev in run_evaluations if ev.passed) + warned_count = sum(1 for ev in run_evaluations if ev.warning) + if passed_count >= majority: + return True, False + if (passed_count + warned_count) >= majority: + return False, True + return False, False + last_eval = run_evaluations[-1] + return last_eval.passed, last_eval.warning + + +def _aggregate_critic_stats( + run_field_scores: list[dict[str, dict[str, float]]], +) -> dict[str, dict[str, Any]]: + if not run_field_scores: + return {} + all_fields: set[str] = set() + for field_scores in run_field_scores: + all_fields.update(field_scores.keys()) + + critic_stats: dict[str, dict[str, Any]] = {} + for critic_field in sorted(all_fields): + weighted_scores = [ + run_scores.get(critic_field, {}).get("score", 0.0) for run_scores in run_field_scores + ] + weights = [ + run_scores.get(critic_field, {}).get("weight", 0.0) for run_scores in run_field_scores + ] + normalized_scores = [ + (score / weight) if weight > 0 else 0.0 + for score, weight in zip(weighted_scores, weights) + ] + avg, std_dev = _compute_mean_std(weighted_scores) + avg_norm, std_norm = _compute_mean_std(normalized_scores) + non_zero_weights = [w for w in weights if w > 0] + # Use mean of non-zero weights as the representative weight. + # Weights are typically constant across runs, but mean handles edge cases. + representative_weight = mean(non_zero_weights) if non_zero_weights else 0.0 + critic_stats[critic_field] = { + "run_scores": weighted_scores, + "mean_score": avg, + "std_deviation": std_dev, + "run_scores_normalized": normalized_scores, + "mean_score_normalized": avg_norm, + "std_deviation_normalized": std_norm, + "weight": representative_weight, + } + return critic_stats + + # Import capture mode helpers (defined in capture.py to keep this file focused) from arcade_evals.capture import ( # noqa: E402 _capture_with_anthropic, @@ -167,7 +257,7 @@ class EvalCase: user_message: str expected_tool_calls: list[NamedExpectedToolCall] critics: list["Critic"] | None = None - additional_messages: list[dict[str, str]] = field(default_factory=list) + additional_messages: list[dict[str, Any]] = field(default_factory=list) rubric: EvalRubric = field(default_factory=EvalRubric) def __post_init__(self) -> None: @@ -520,7 +610,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo expected_tool_calls: list[NamedExpectedToolCall], rubric: EvalRubric, critics: list["Critic"], - additional_messages: list[dict[str, str]], + additional_messages: list[dict[str, Any]], ) -> "EvalCase": """Factory method to create EvalCase instances. @@ -540,11 +630,12 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo self, name: str, user_message: str, - expected_tool_calls: list[AnyExpectedToolCall] | list[tuple[Callable, dict[str, Any]]], + expected_tool_calls: Sequence[AnyExpectedToolCall] + | Sequence[tuple[Callable, dict[str, Any]]], critics: list["Critic"] | None = None, system_message: str | None = None, rubric: EvalRubric | None = None, - additional_messages: list[dict[str, str]] | None = None, + additional_messages: list[dict[str, Any]] | None = None, ) -> None: """ Add a new evaluation case to the suite. @@ -660,7 +751,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo | None = None, rubric: EvalRubric | None = None, critics: list["Critic"] | None = None, - additional_messages: list[dict[str, str]] | None = None, + additional_messages: list[dict[str, Any]] | None = None, ) -> None: """ Extend the last added case with new information. @@ -745,11 +836,148 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo processed_calls.append((resolved_name, args_with_defaults)) return processed_calls + def _compute_run_field_scores( + self, evaluation: EvaluationResult + ) -> dict[str, dict[str, float]]: + field_scores: dict[str, list[float]] = {} + field_weights: dict[str, list[float]] = {} + for result in evaluation.results: + field = result["field"] + if field == "tool_selection": + continue + field_scores.setdefault(field, []).append(result["score"]) + field_weights.setdefault(field, []).append(result["weight"]) + + run_scores: dict[str, dict[str, float]] = {} + for field, scores in field_scores.items(): + weights = field_weights.get(field, []) + run_scores[field] = { + "score": mean(scores) if scores else 0.0, + "weight": mean(weights) if weights else 0.0, + } + return run_scores + + async def _run_case_with_stats( + self, + case: "EvalCase", + client: Any, + model: str, + provider: ProviderName, + *, + num_runs: int, + seed: str | int | None, + pass_rule: str, + registry: EvalSuiteToolRegistry | None = None, + ) -> dict[str, Any]: + if num_runs < 1: + raise ValueError("num_runs must be >= 1") + + seed_policy, seed_value = _resolve_seed_spec(seed) + seed_policy_display = seed_policy + if provider == "openai": + if seed_policy == "random": + run_seeds: list[int | None] = [ + random.randint(0, 2**31 - 1) # noqa: S311 + for _ in range(num_runs) + ] + else: + run_seeds = [seed_value for _ in range(num_runs)] + else: + seed_policy_display = f"{seed_policy} (ignored)" + run_seeds = [None for _ in range(num_runs)] + + run_evaluations: list[EvaluationResult] = [] + run_scores: list[float] = [] + run_passed: list[bool] = [] + run_warned: list[bool] = [] + run_field_scores: list[dict[str, dict[str, float]]] = [] + last_processed_calls: list[tuple[str, dict[str, Any]]] = [] + run_details: list[dict[str, Any]] = [] + + for run_index in range(num_runs): + run_seed = run_seeds[run_index] + if provider == "anthropic": + predicted_args = await self._run_anthropic(client, model, case, registry=registry) + else: + predicted_args = await self._run_openai( + client, model, case, registry=registry, seed=run_seed + ) + + processed_calls = self._process_tool_calls(predicted_args, registry=registry) + evaluation = case.evaluate(processed_calls) + + run_evaluations.append(evaluation) + run_scores.append(evaluation.score) + run_passed.append(evaluation.passed) + run_warned.append(evaluation.warning) + run_field_scores.append(self._compute_run_field_scores(evaluation)) + last_processed_calls = processed_calls + run_details.append({ + "score": evaluation.score, + "passed": evaluation.passed, + "warning": evaluation.warning, + "failure_reason": evaluation.failure_reason, + "details": evaluation.results, + }) + + mean_score, std_dev = _compute_mean_std(run_scores) + passed, warning = _resolve_pass_rule(run_evaluations, mean_score, pass_rule, case.rubric) + + # Determine aggregate failure_reason: + # - PASS_RULE_LAST: use the last run's failure reason + # - Other rules: if ALL runs failed with the same reason, surface it + if not run_evaluations: + aggregate_failure_reason = None + elif pass_rule == PASS_RULE_LAST: + # Only surface failure_reason when the aggregate didn't pass + aggregate_failure_reason = run_evaluations[-1].failure_reason if not passed else None + elif not passed and not warning: + # For non-last rules, surface the failure reason if all runs share the same one + failure_reasons = [ev.failure_reason for ev in run_evaluations if ev.failure_reason] + unique_reasons = set(failure_reasons) + if len(unique_reasons) == 1 and len(failure_reasons) == len(run_evaluations): + aggregate_failure_reason = failure_reasons[0] + else: + aggregate_failure_reason = None + else: + aggregate_failure_reason = None + + aggregate = EvaluationResult( + score=mean_score, + passed=passed, + warning=warning, + results=run_evaluations[-1].results if run_evaluations else [], + failure_reason=aggregate_failure_reason, + ) + + run_stats = { + "num_runs": num_runs, + "scores": run_scores, + "mean_score": mean_score, + "std_deviation": std_dev, + "passed": run_passed, + "warned": run_warned, + "seed_policy": seed_policy_display, + "run_seeds": run_seeds, + "pass_rule": pass_rule, + "runs": run_details, + } + + return { + "evaluation": aggregate, + "predicted_tool_calls": last_processed_calls, + "run_stats": run_stats, + "critic_stats": _aggregate_critic_stats(run_field_scores), + } + async def run( self, client: Any, # AsyncOpenAI | AsyncAnthropic - use Any to avoid import dependency model: str, provider: ProviderName = "openai", + num_runs: int = 1, + seed: str | int | None = "constant", + multi_run_pass_rule: str = PASS_RULE_LAST, ) -> dict[str, Any]: """ Run the evaluation suite. @@ -758,10 +986,22 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo client: The LLM client instance (AsyncOpenAI or AsyncAnthropic). model: The model to evaluate. provider: The provider name ("openai" or "anthropic"). + num_runs: Number of runs per case. + seed: Seed policy ("constant", "random", or an integer seed). + multi_run_pass_rule: How to determine pass/warn for multi-run cases. Returns: A dictionary containing the evaluation results. """ + # Validate upfront before making any API calls + if num_runs < 1: + raise ValueError("num_runs must be >= 1") + if multi_run_pass_rule not in _VALID_PASS_RULES: + raise ValueError( + f"Invalid multi-run pass rule '{multi_run_pass_rule}'. " + f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}" + ) + results: dict[str, Any] = { "model": model, "suite_name": self.name, @@ -779,17 +1019,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo "No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog." ) - # Get tool calls based on provider - if provider == "anthropic": - predicted_args = await self._run_anthropic(client, model, case) - else: - predicted_args = await self._run_openai(client, model, case) - - # Process tool calls (resolve names, fill defaults) - filled_actual_tool_calls = self._process_tool_calls(predicted_args) - - # Evaluate the case - evaluation = case.evaluate(filled_actual_tool_calls) + case_result = await self._run_case_with_stats( + case, + client, + model, + provider, + num_runs=num_runs, + seed=seed, + pass_rule=multi_run_pass_rule, + ) # Prepare the result result = { @@ -801,10 +1039,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo {"name": tc.name, "args": tc.args} for tc in case.expected_tool_calls ], "predicted_tool_calls": [ - {"name": name, "args": args} for name, args in filled_actual_tool_calls + {"name": name, "args": args} + for name, args in case_result["predicted_tool_calls"] ], - "evaluation": evaluation, + "evaluation": case_result["evaluation"], } + if num_runs > 1: + result["run_stats"] = case_result["run_stats"] + if case_result["critic_stats"]: + result["critic_stats"] = case_result["critic_stats"] return result tasks = [sem_task(case) for case in self.cases] @@ -819,6 +1062,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo model: str, case: "EvalCase", registry: EvalSuiteToolRegistry | None = None, + seed: int | None = None, ) -> list[tuple[str, dict[str, Any]]]: """Run evaluation using OpenAI client. @@ -843,15 +1087,18 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo tools = effective_registry.list_tools_for_model(tool_format="openai") # Get the model response - response = await client.chat.completions.create( # type: ignore[arg-type] - model=model, - messages=messages, - tool_choice="auto", - tools=tools, - user="eval_user", - seed=42, - stream=False, - ) + request_params: dict[str, Any] = { + "model": model, + "messages": messages, + "tool_choice": "auto", + "tools": tools, + "user": "eval_user", + "stream": False, + } + if seed is not None: + request_params["seed"] = seed + + response = await client.chat.completions.create(**request_params) return get_tool_args(response, normalize_names=False) @@ -985,6 +1232,9 @@ def tool_eval() -> Callable[[Callable], Callable]: provider: ProviderName = "openai", capture_mode: bool = False, include_context: bool = False, + num_runs: int = 1, + seed: str | int | None = "constant", + multi_run_pass_rule: str = PASS_RULE_LAST, ) -> list[Any]: """ Run evaluation or capture mode. @@ -1015,19 +1265,43 @@ def tool_eval() -> Callable[[Callable], Callable]: # Run in capture mode if provider == "anthropic": capture_result = await _capture_with_anthropic( - suite, provider_api_key, model, include_context + suite, + provider_api_key, + model, + include_context=include_context, + num_runs=num_runs, + seed=seed, ) else: capture_result = await _capture_with_openai( - suite, provider_api_key, model, include_context + suite, + provider_api_key, + model, + include_context=include_context, + num_runs=num_runs, + seed=seed, ) return [capture_result] else: # Run in evaluation mode if provider == "anthropic": - eval_result = await _run_with_anthropic(suite, provider_api_key, model) + eval_result = await _run_with_anthropic( + suite, + provider_api_key, + model, + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) else: - eval_result = await _run_with_openai(suite, provider_api_key, model) + eval_result = await _run_with_openai( + suite, + provider_api_key, + model, + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) # For comparative evaluations, eval_result is already a list of track results # For regular evaluations, it's a single dict that needs wrapping @@ -1042,7 +1316,13 @@ def tool_eval() -> Callable[[Callable], Callable]: async def _run_with_openai( - suite: "EvalSuite", api_key: str, model: str + suite: "EvalSuite", + api_key: str, + model: str, + *, + num_runs: int = 1, + seed: str | int | None = "constant", + multi_run_pass_rule: str = PASS_RULE_LAST, ) -> dict[str, Any] | list[dict[str, Any]]: """Run evaluation suite with OpenAI client. @@ -1054,16 +1334,36 @@ async def _run_with_openai( # Check if this suite has comparative cases if suite._comparative_case_builders: # Run comparative evaluation - returns dict[track_name, result] - track_results = await suite.run_comparative(client, model, provider="openai") + track_results = await suite.run_comparative( + client, + model, + provider="openai", + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) # Convert to list of results for consistent handling return list(track_results.values()) else: # Run regular evaluation - return await suite.run(client, model, provider="openai") + return await suite.run( + client, + model, + provider="openai", + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) async def _run_with_anthropic( - suite: "EvalSuite", api_key: str, model: str + suite: "EvalSuite", + api_key: str, + model: str, + *, + num_runs: int = 1, + seed: str | int | None = "constant", + multi_run_pass_rule: str = PASS_RULE_LAST, ) -> dict[str, Any] | list[dict[str, Any]]: """Run evaluation suite with Anthropic client. @@ -1083,9 +1383,23 @@ async def _run_with_anthropic( # Check if this suite has comparative cases if suite._comparative_case_builders: # Run comparative evaluation - returns dict[track_name, result] - track_results = await suite.run_comparative(client, model, provider="anthropic") + track_results = await suite.run_comparative( + client, + model, + provider="anthropic", + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) # Convert to list of results for consistent handling return list(track_results.values()) else: # Run regular evaluation - return await suite.run(client, model, provider="anthropic") + return await suite.run( + client, + model, + provider="anthropic", + num_runs=num_runs, + seed=seed, + multi_run_pass_rule=multi_run_pass_rule, + ) diff --git a/libs/tests/cli/test_capture_formatters.py b/libs/tests/cli/test_capture_formatters.py index 832ab8e5..fc93b5f4 100644 --- a/libs/tests/cli/test_capture_formatters.py +++ b/libs/tests/cli/test_capture_formatters.py @@ -57,6 +57,20 @@ def _create_mock_capture_result( # Explicitly set track_name to None unless specified (avoids MagicMock) case.track_name = case_data.get("track_name") + # Create mock runs if provided + runs = [] + for run_data in case_data.get("runs", []): + run = MagicMock() + run_tool_calls = [] + for tc_data in run_data.get("tool_calls", []): + tc = MagicMock() + tc.name = tc_data["name"] + tc.args = tc_data.get("args", {}) + run_tool_calls.append(tc) + run.tool_calls = run_tool_calls + runs.append(run) + case.runs = runs + # Create mock tool calls tool_calls = [] for tc_data in case_data.get("tool_calls", []): @@ -84,6 +98,11 @@ def _create_mock_capture_result( "user_message": case.user_message, "tool_calls": [{"name": tc.name, "args": tc.args} for tc in case.tool_calls], } + if case.runs: + case_dict["runs"] = [ + {"tool_calls": [{"name": tc.name, "args": tc.args} for tc in run.tool_calls]} + for run in case.runs + ] if include_context: case_dict["system_message"] = case.system_message case_dict["additional_messages"] = case.additional_messages @@ -170,6 +189,29 @@ class TestCaptureJsonFormatter: assert case["tool_calls"][0]["name"] == "GetWeather" assert case["tool_calls"][0]["args"]["city"] == "NYC" + def test_format_includes_runs(self) -> None: + """Test that runs are included when present.""" + formatter = CaptureJsonFormatter() + capture = _create_mock_capture_result( + cases=[ + { + "case_name": "multi_run_case", + "user_message": "Hello", + "tool_calls": [], + "runs": [ + {"tool_calls": [{"name": "A", "args": {"x": 1}}]}, + {"tool_calls": [{"name": "B", "args": {"x": 2}}]}, + ], + } + ] + ) + + output = formatter.format([capture]) + parsed = json.loads(output) + runs = parsed["captures"][0]["captured_cases"][0]["runs"] + assert len(runs) == 2 + assert runs[0]["tool_calls"][0]["name"] == "A" + def test_format_with_context(self) -> None: """Test formatting with context included.""" formatter = CaptureJsonFormatter() @@ -309,6 +351,28 @@ class TestCaptureMarkdownFormatter: assert "**Total Cases:** 1" in output assert "**Total Tool Calls:** 1" in output + def test_format_includes_runs(self) -> None: + """Should include per-run tool calls when runs are present.""" + formatter = CaptureMarkdownFormatter() + capture = _create_mock_capture_result( + cases=[ + { + "case_name": "multi_run_case", + "user_message": "Hello", + "tool_calls": [], + "runs": [ + {"tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}]}, + {"tool_calls": [{"name": "GetWeather", "args": {"city": "SF"}}]}, + ], + } + ] + ) + + output = formatter.format([capture]) + assert "Run 1" in output + assert "Run 2" in output + assert "`GetWeather`" in output + class TestCaptureHtmlFormatter: """Tests for CaptureHtmlFormatter.""" @@ -607,14 +671,26 @@ class TestMultiModelTextCaptureFormatter: def test_text_multi_model_output(self) -> None: """Should produce multi-model text output.""" capture1 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4o", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4o", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool1", "args": {}}], + } + ], ) capture2 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4-turbo", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4-turbo", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool2", "args": {}}], + } + ], ) formatter = CaptureTextFormatter() @@ -647,14 +723,26 @@ class TestMultiModelHtmlCaptureFormatter: def test_html_multi_model_output(self) -> None: """Should produce multi-model HTML output.""" capture1 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4o", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4o", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool1", "args": {}}], + } + ], ) capture2 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4-turbo", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4-turbo", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool2", "args": {}}], + } + ], ) formatter = CaptureHtmlFormatter() @@ -687,14 +775,26 @@ class TestMultiModelJsonCaptureFormatter: def test_json_multi_model_output(self) -> None: """Should produce structured multi-model JSON.""" capture1 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4o", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4o", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool1", "args": {}}], + } + ], ) capture2 = _create_mock_capture_result( - suite_name="TestSuite", model="gpt-4-turbo", cases=[ - {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]} - ] + suite_name="TestSuite", + model="gpt-4-turbo", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "Tool2", "args": {}}], + } + ], ) formatter = CaptureJsonFormatter() @@ -795,6 +895,7 @@ def _create_mock_capture_with_tracks( mock_tc.args = tc["args"] mock_tool_calls.append(mock_tc) mock_case.tool_calls = mock_tool_calls + mock_case.runs = [] # Explicitly set runs to empty for single-run captures captured_cases.append(mock_case) @@ -924,3 +1025,365 @@ class TestCaptureWithTracks: # Should include track info in markdown assert "[track_a]" in output or "track_a" in output + + +# ===================================================================== +# Capture formatter multi-run tests +# ===================================================================== + + +def _create_mock_capture_with_runs( + num_runs: int = 3, +) -> CaptureResult: + """Create a mock CaptureResult with multiple runs per case.""" + cases = [ + { + "case_name": "multi_run_case", + "user_message": "What's the weather in NYC?", + "tool_calls": [ + {"name": "GetWeather", "args": {"city": "NYC"}}, + ], + "system_message": "You are a weather assistant", + "additional_messages": [], + "runs": [ + { + "tool_calls": [ + {"name": "GetWeather", "args": {"city": "NYC", "seed": str(i)}}, + ] + } + for i in range(1, num_runs + 1) + ], + } + ] + + return _create_mock_capture_result( + suite_name="MultiRunCaptureSuite", + cases=cases, + ) + + +def _create_mock_capture_no_runs() -> CaptureResult: + """Create a mock CaptureResult with a case that has no tool calls and no runs.""" + cases = [ + { + "case_name": "empty_case", + "user_message": "Do nothing", + "tool_calls": [], + "system_message": None, + "additional_messages": [], + } + ] + return _create_mock_capture_result( + suite_name="EmptyCaptureSuite", + cases=cases, + ) + + +class TestCaptureMultiRunText: + """Tests for multi-run capture in the text formatter.""" + + def test_text_shows_run_headers(self) -> None: + """Text capture output should show 'Run 1', 'Run 2', etc.""" + capture = _create_mock_capture_with_runs(num_runs=3) + formatter = CaptureTextFormatter() + output = formatter.format([capture]) + assert "Run 1:" in output + assert "Run 2:" in output + assert "Run 3:" in output + + def test_text_shows_tool_calls_per_run(self) -> None: + """Each run should display its tool calls.""" + capture = _create_mock_capture_with_runs(num_runs=2) + formatter = CaptureTextFormatter() + output = formatter.format([capture]) + assert "GetWeather" in output + + def test_text_no_runs_shows_top_level_calls(self) -> None: + """When runs is empty, should fall through to top-level tool_calls.""" + capture = _create_mock_capture_result() # default: no runs + formatter = CaptureTextFormatter() + output = formatter.format([capture]) + assert "GetWeather" in output + + def test_text_empty_case_no_tool_calls(self) -> None: + """Case with no tool calls should show appropriate message.""" + capture = _create_mock_capture_no_runs() + formatter = CaptureTextFormatter() + output = formatter.format([capture]) + assert "no tool calls" in output.lower() + + +class TestCaptureMultiRunMarkdown: + """Tests for multi-run capture in the markdown formatter.""" + + def test_markdown_shows_run_headers(self) -> None: + """Markdown capture should show run headers.""" + capture = _create_mock_capture_with_runs(num_runs=3) + formatter = CaptureMarkdownFormatter() + output = formatter.format([capture]) + assert "Run 1" in output + assert "Run 2" in output + assert "Run 3" in output + + def test_markdown_shows_tool_call_json(self) -> None: + """Markdown capture should show tool call args as JSON.""" + capture = _create_mock_capture_with_runs(num_runs=2) + formatter = CaptureMarkdownFormatter() + output = formatter.format([capture]) + assert "```json" in output + assert "GetWeather" in output + + def test_markdown_empty_runs_shows_no_calls(self) -> None: + """Markdown capture with no tool calls shows appropriate message.""" + capture = _create_mock_capture_no_runs() + formatter = CaptureMarkdownFormatter() + output = formatter.format([capture]) + assert "No tool calls" in output + + +class TestCaptureMultiRunHTML: + """Tests for multi-run capture in the HTML formatter.""" + + def test_html_shows_capture_run_details(self) -> None: + """HTML capture should show capture-run details elements.""" + capture = _create_mock_capture_with_runs(num_runs=3) + formatter = CaptureHtmlFormatter() + output = formatter.format([capture]) + assert "capture-run" in output + assert "Run 1" in output + assert "Run 2" in output + assert "Run 3" in output + + def test_html_tool_calls_escaped(self) -> None: + """HTML capture should escape tool call content.""" + capture = _create_mock_capture_with_runs(num_runs=1) + formatter = CaptureHtmlFormatter() + output = formatter.format([capture]) + assert "GetWeather" in output + + def test_html_empty_case_no_calls(self) -> None: + """HTML capture with no tool calls shows appropriate message.""" + capture = _create_mock_capture_no_runs() + formatter = CaptureHtmlFormatter() + output = formatter.format([capture]) + assert "No tool calls" in output or "no-calls" in output + + +class TestCaptureMultiRunJSON: + """Tests for multi-run capture in the JSON formatter.""" + + def test_json_includes_runs_array(self) -> None: + """JSON capture should include runs array for multi-run cases.""" + capture = _create_mock_capture_with_runs(num_runs=3) + formatter = CaptureJsonFormatter() + output = formatter.format([capture]) + data = json.loads(output) + captures = data["captures"] + assert len(captures) == 1 + case = captures[0]["captured_cases"][0] + assert "runs" in case + assert len(case["runs"]) == 3 + + def test_json_no_runs_for_single_run(self) -> None: + """JSON capture should not include runs for single-run cases.""" + capture = _create_mock_capture_result() # default: no runs + formatter = CaptureJsonFormatter() + output = formatter.format([capture]) + data = json.loads(output) + case = data["captures"][0]["captured_cases"][0] + assert "runs" not in case + + def test_json_run_tool_calls_structure(self) -> None: + """Each run in JSON should have tool_calls with name and args.""" + capture = _create_mock_capture_with_runs(num_runs=2) + formatter = CaptureJsonFormatter() + output = formatter.format([capture]) + data = json.loads(output) + run = data["captures"][0]["captured_cases"][0]["runs"][0] + assert "tool_calls" in run + assert run["tool_calls"][0]["name"] == "GetWeather" + + +# ===================================================================== +# Coverage gap tests — CaptureTextFormatter +# ===================================================================== + + +class TestCaptureTextFormatterCoverageGaps: + """Tests for CaptureTextFormatter methods that lacked coverage.""" + + def test_format_value_truncation(self) -> None: + """_format_value should truncate values longer than 60 chars.""" + formatter = CaptureTextFormatter() + short = formatter._format_value("hello") + assert short == "hello" + + long_val = "x" * 100 + truncated = formatter._format_value(long_val) + assert len(truncated) == 60 + assert truncated.endswith("...") + + def test_format_value_exactly_60(self) -> None: + """_format_value should NOT truncate values of exactly 60 chars.""" + formatter = CaptureTextFormatter() + exact = "a" * 60 + result = formatter._format_value(exact) + assert result == exact + + def test_conversation_text_format(self) -> None: + """CaptureTextFormatter._format_conversation_text should format messages.""" + formatter = CaptureTextFormatter() + messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi!"}, + { + "role": "assistant", + "content": None, + "tool_calls": [{"function": {"name": "get_data", "arguments": '{"id": 1}'}}], + }, + {"role": "tool", "name": "get_data", "content": '{"result": "ok"}'}, + ] + lines = formatter._format_conversation_text(messages) + text = "\n".join(lines) + + assert "[USER]" in text + assert "[ASSISTANT]" in text + assert "[TOOL]" in text + assert "get_data" in text + assert "Hello" in text + + def test_conversation_text_invalid_json_content(self) -> None: + """Should gracefully handle non-JSON tool content.""" + formatter = CaptureTextFormatter() + messages = [ + {"role": "tool", "name": "raw", "content": "plain text output"}, + ] + lines = formatter._format_conversation_text(messages) + text = "\n".join(lines) + + assert "plain text output" in text + + def test_conversation_text_invalid_json_args(self) -> None: + """Should gracefully handle non-JSON tool call arguments.""" + formatter = CaptureTextFormatter() + messages = [ + { + "role": "assistant", + "content": "", + "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}], + }, + ] + lines = formatter._format_conversation_text(messages) + text = "\n".join(lines) + + assert "broken" in text + assert "not json" in text + + def test_conversation_text_separator_between_messages(self) -> None: + """Should add separator between messages (not before first).""" + formatter = CaptureTextFormatter() + messages = [ + {"role": "user", "content": "First"}, + {"role": "assistant", "content": "Second"}, + ] + lines = formatter._format_conversation_text(messages) + text = "\n".join(lines) + + # Separator should appear between messages + assert "----" in text + + def test_multi_model_with_tracks_and_context(self) -> None: + """Multi-model capture with tracks should render correctly with context.""" + capture1 = _create_mock_capture_with_tracks(model="gpt-4o") + capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo") + + formatter = CaptureTextFormatter() + output = formatter.format([capture1, capture2], include_context=True) + + assert "MULTI-MODEL CAPTURE RESULTS" in output + assert "gpt-4o" in output + assert "gpt-4-turbo" in output + # Should show track sections + assert "TRACK:" in output or "track_a" in output + + def test_multi_model_no_data_model(self) -> None: + """Multi-model capture should handle a model with no data for a case.""" + # Model A has case1, model B has case1 with different tools + capture1 = _create_mock_capture_result( + suite_name="Suite", + model="model-a", + cases=[ + { + "case_name": "case1", + "user_message": "Hi", + "tool_calls": [{"name": "T1", "args": {}}], + } + ], + ) + capture2 = _create_mock_capture_result( + suite_name="Suite", + model="model-b", + cases=[{"case_name": "case1", "user_message": "Hi", "tool_calls": []}], + ) + + formatter = CaptureTextFormatter() + output = formatter.format([capture1, capture2]) + + assert "model-a" in output + assert "model-b" in output + assert "MULTI-MODEL CAPTURE RESULTS" in output + + +# ===================================================================== +# Coverage gap tests — CaptureMarkdownFormatter +# ===================================================================== + + +class TestCaptureMarkdownFormatterCoverageGaps: + """Tests for CaptureMarkdownFormatter methods that lacked coverage.""" + + def test_multi_model_with_tracks_and_context(self) -> None: + """Multi-model markdown capture with tracks should render correctly.""" + capture1 = _create_mock_capture_with_tracks(model="gpt-4o") + capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo") + + formatter = CaptureMarkdownFormatter() + output = formatter.format([capture1, capture2], include_context=True) + + assert "Multi-Model Capture Results" in output + assert "gpt-4o" in output + assert "gpt-4-turbo" in output + + def test_conversation_md_standalone(self) -> None: + """CaptureMarkdownFormatter._format_conversation_md should format messages.""" + formatter = CaptureMarkdownFormatter() + messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi!"}, + { + "role": "assistant", + "content": None, + "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "x"}'}}], + }, + {"role": "tool", "name": "search", "content": '{"r": 1}'}, + ] + lines = formatter._format_conversation_md(messages) + text = "\n".join(lines) + + assert "👤" in text or "User" in text + assert "search" in text + + def test_conversation_md_invalid_json(self) -> None: + """Should handle invalid JSON in tool call args.""" + formatter = CaptureMarkdownFormatter() + messages = [ + { + "role": "assistant", + "content": None, + "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}], + }, + ] + lines = formatter._format_conversation_md(messages) + text = "\n".join(lines) + + assert "broken" in text diff --git a/libs/tests/cli/test_evals_runner.py b/libs/tests/cli/test_evals_runner.py index af470a26..a1ef380e 100644 --- a/libs/tests/cli/test_evals_runner.py +++ b/libs/tests/cli/test_evals_runner.py @@ -1,5 +1,6 @@ """Tests for evals_runner error handling.""" +from typing import Any, cast from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -14,6 +15,9 @@ from arcade_cli.evals_runner import ( run_evaluations, ) from arcade_cli.utils import ModelSpec, Provider +from arcade_evals import CaptureResult + +RUN_RULE_LAST = "last" class TestEvalTaskResult: @@ -68,7 +72,13 @@ class TestCaptureTaskResult: def test_from_success(self) -> None: """Test creating a successful capture result.""" - mock_captures = [MagicMock(), MagicMock()] + mock_captures = cast( + list[CaptureResult], + [ + MagicMock(spec=CaptureResult), + MagicMock(spec=CaptureResult), + ], + ) result = CaptureTaskResult.from_success("test_suite", "gpt-4o", "openai", mock_captures) assert result.success is True assert result.suite_name == "test_suite" @@ -107,6 +117,9 @@ class TestRunEvalTask: suite_func=mock_suite, model_spec=model_spec, max_concurrent=1, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) assert result.success is True @@ -126,9 +139,13 @@ class TestRunEvalTask: suite_func=mock_suite, model_spec=model_spec, max_concurrent=1, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) assert result.success is False + assert result.error is not None assert "API error" in result.error assert result.error_type == "ValueError" assert result.result is None @@ -145,6 +162,9 @@ class TestRunEvalTask: model_spec=model_spec, max_concurrent=5, include_context=False, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) mock_suite.assert_called_once_with( @@ -153,6 +173,9 @@ class TestRunEvalTask: max_concurrency=5, provider="anthropic", include_context=False, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) @@ -172,6 +195,8 @@ class TestRunCaptureTask: model_spec=model_spec, max_concurrent=1, include_context=True, + num_runs=1, + seed="constant", ) assert result.success is True @@ -189,9 +214,12 @@ class TestRunCaptureTask: model_spec=model_spec, max_concurrent=1, include_context=False, + num_runs=1, + seed="constant", ) assert result.success is False + assert result.error is not None assert "Network failed" in result.error assert result.error_type == "ConnectionError" @@ -207,6 +235,8 @@ class TestRunCaptureTask: model_spec=model_spec, max_concurrent=2, include_context=True, + num_runs=1, + seed="constant", ) mock_suite.assert_called_once_with( @@ -216,6 +246,8 @@ class TestRunCaptureTask: provider="openai", capture_mode=True, include_context=True, + num_runs=1, + seed="constant", ) @@ -253,6 +285,9 @@ class TestRunEvaluationsErrorHandling: output_format="txt", failed_only=False, console=console, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) # Verify both were attempted @@ -277,6 +312,9 @@ class TestRunEvaluationsErrorHandling: output_format="txt", failed_only=False, console=console, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) # Should print "No evaluations completed successfully" (with emoji) @@ -302,6 +340,9 @@ class TestRunEvaluationsErrorHandling: output_format="txt", failed_only=False, console=console, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) # Check that failure count is printed @@ -327,6 +368,9 @@ class TestRunEvaluationsErrorHandling: output_format="txt", failed_only=False, console=console, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) # Check that no failure warning is printed @@ -338,7 +382,7 @@ class TestRunEvaluationsErrorHandling: """Test partial failure with multiple models.""" # Suite that fails on one model but succeeds on another - async def conditional_suite(**kwargs): + async def conditional_suite(**kwargs: Any) -> MagicMock: if kwargs["model"] == "bad-model": raise RuntimeError("Model not supported") return MagicMock() @@ -371,6 +415,9 @@ class TestRunEvaluationsErrorHandling: output_format="txt", failed_only=False, console=console, + num_runs=1, + seed="constant", + multi_run_pass_rule=RUN_RULE_LAST, ) # Should have been called twice @@ -397,6 +444,8 @@ class TestRunCaptureErrorHandling: output_file=None, output_format="json", console=console, + num_runs=1, + seed="constant", ) # Error message includes emoji @@ -436,6 +485,8 @@ class TestRunCaptureErrorHandling: output_file=None, output_format="json", console=console, + num_runs=1, + seed="constant", ) # Both should have been attempted @@ -463,6 +514,8 @@ class TestRunCaptureErrorHandling: output_file=None, output_format="json", console=console, + num_runs=1, + seed="constant", ) # Check error details are printed diff --git a/libs/tests/cli/test_formatter_edge_cases.py b/libs/tests/cli/test_formatter_edge_cases.py index 50ca1e1a..9838f68e 100644 --- a/libs/tests/cli/test_formatter_edge_cases.py +++ b/libs/tests/cli/test_formatter_edge_cases.py @@ -140,11 +140,16 @@ class TestFormatterEdgeCases: formatter = HtmlFormatter() output = formatter.format(results) - # Should NOT contain raw script tags or other unescaped HTML - assert "