diff --git a/examples/evals/README.md b/examples/evals/README.md
index cf1371f6..118c12a7 100644
--- a/examples/evals/README.md
+++ b/examples/evals/README.md
@@ -40,7 +40,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \
 
 # Compare multiple models
 arcade evals examples/evals/eval_stdio_mcp_server.py \
-    -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+    -p openai:gpt-4o \
+    -p anthropic:claude-sonnet-4-5-20250929 \
     -k openai:YOUR_OPENAI_KEY \
     -k anthropic:YOUR_ANTHROPIC_KEY
 
@@ -205,7 +206,8 @@ export ARCADE_API_KEY=your_key
 export ARCADE_USER_ID=your_user_id
 
 arcade evals examples/evals/eval_comprehensive_comparison.py \
-    -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+    -p openai:gpt-4o \
+    -p anthropic:claude-sonnet-4-5-20250929 \
     -k openai:YOUR_KEY \
     -k anthropic:YOUR_KEY \
     -o comparison.html -d
@@ -213,19 +215,22 @@ arcade evals examples/evals/eval_comprehensive_comparison.py \
 
 ## 🎯 CLI Reference
 
-### New v2.0.0 Flags
+### Flags
 
 
-| Flag                | Short | Description                                      | Example                                         |
-| --------------------- | ------- | -------------------------------------------------- | ------------------------------------------------- |
-| `--use-provider`    | `-p`  | Provider(s) and models (space-separated)         | `-p "openai:gpt-4o anthropic:claude-sonnet"`    |
-| `--api-key`         | `-k`  | API key in`provider:key` format (repeatable)     | `-k openai:sk-... -k anthropic:sk-ant-...`      |
-| `--output`          | `-o`  | Output file (auto-detects format from extension) | `-o results.html` or `-o results` (all formats) |
-| `--only-failed`     | `-f`  | Show only failed evaluations                     | `--only-failed`                                 |
-| `--include-context` |       | Include system messages and conversation history | `--include-context`                             |
-| `--details`         | `-d`  | Show detailed output                             | `-d`                                            |
-| `--max-concurrent`  |       | Max concurrent evaluations                       | `--max-concurrent 5`                            |
-| `--capture`         |       | Capture mode (record tool calls without scoring) | `--capture`                                     |
+| Flag                    | Short | Description                                           | Example                                             |
+| ----------------------- | ----- | ----------------------------------------------------- | --------------------------------------------------- |
+| `--use-provider`        | `-p`  | Provider and models (repeatable)                      | `-p openai:gpt-4o -p anthropic:claude-sonnet`       |
+| `--api-key`             | `-k`  | API key in `provider:key` format (repeatable)         | `-k openai:sk-... -k anthropic:sk-ant-...`          |
+| `--output`              | `-o`  | Output file (auto-detects format from extension)      | `-o results.html` or `-o results` (all formats)     |
+| `--only-failed`         | `-f`  | Show only failed evaluations                          | `--only-failed`                                     |
+| `--include-context`     |       | Include system messages and conversation history      | `--include-context`                                 |
+| `--details`             | `-d`  | Show detailed output                                  | `-d`                                                |
+| `--max-concurrent`      |       | Max concurrent evaluations                            | `--max-concurrent 5`                                |
+| `--capture`             |       | Capture mode (record tool calls without scoring)      | `--capture`                                         |
+| `--num-runs`            | `-n`  | Number of runs per case (default: 1)                  | `-n 5`                                              |
+| `--seed`                |       | Seed policy: `constant`, `random`, or an integer      | `--seed random` or `--seed 42`                      |
+| `--multi-run-pass-rule` |       | Pass rule: `last` (default), `mean`, or `majority`    | `--multi-run-pass-rule majority`                    |
 
 ### Provider & Model Selection
 
@@ -238,14 +243,15 @@ arcade evals eval_file.py -p openai -k openai:YOUR_KEY
 **Single provider with specific models:**
 
 ```bash
-arcade evals eval_file.py -p "openai:gpt-4o,gpt-4o-mini" -k openai:YOUR_KEY
+arcade evals eval_file.py -p openai:gpt-4o,gpt-4o-mini -k openai:YOUR_KEY
 ```
 
-**Multiple providers (space-separated):**
+**Multiple providers (use separate `-p` flags):**
 
 ```bash
 arcade evals eval_file.py \
-    -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+    -p openai:gpt-4o \
+    -p anthropic:claude-sonnet-4-5-20250929 \
     -k openai:YOUR_KEY \
     -k anthropic:YOUR_KEY
 ```
@@ -279,7 +285,7 @@ arcade evals eval_file.py \
 
 ```bash
 arcade evals examples/evals/eval_arcade_gateway.py \
-    -p "openai:gpt-4o,gpt-4o-mini,gpt-3.5-turbo" \
+    -p openai:gpt-4o,gpt-4o-mini \
     -k openai:YOUR_KEY \
     -o comparison.html -d
 ```
@@ -288,7 +294,8 @@ arcade evals examples/evals/eval_arcade_gateway.py \
 
 ```bash
 arcade evals examples/evals/eval_stdio_mcp_server.py \
-    -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+    -p openai:gpt-4o \
+    -p anthropic:claude-sonnet-4-5-20250929 \
     -k openai:YOUR_OPENAI_KEY \
     -k anthropic:YOUR_ANTHROPIC_KEY \
     -o battle.html -d
@@ -307,7 +314,8 @@ arcade evals examples/evals/eval_http_mcp_server.py \
 ```bash
 # Compare performance across multiple tool sources
 arcade evals examples/evals/eval_comprehensive_comparison.py \
-    -p "openai:gpt-4o anthropic:claude-sonnet-4-5-20250929" \
+    -p openai:gpt-4o \
+    -p anthropic:claude-sonnet-4-5-20250929 \
     -k openai:YOUR_KEY \
     -k anthropic:YOUR_KEY \
     -o comparison.html -d
@@ -332,6 +340,46 @@ arcade evals examples/evals/eval_stdio_mcp_server.py \
     -o full_results.html -d
 ```
 
+### Pattern 7: Multi-Run Evaluation
+
+Run each case multiple times to measure consistency and reduce variance:
+
+```bash
+# Run each case 5 times with random seeds, pass if majority of runs pass
+arcade evals examples/evals/eval_arcade_gateway.py \
+    --api-key openai:YOUR_KEY \
+    --num-runs 5 \
+    --seed random \
+    --multi-run-pass-rule majority \
+    -o stability.html -d
+```
+
+The output will include per-case statistics: mean score, standard deviation,
+individual run results, and per-critic field breakdowns.
+
+**Seed policies:**
+- `constant` (default) — Uses a fixed seed (42) for reproducible results
+- `random` — Uses a different random seed per run for variance testing
+- An integer (e.g., `--seed 123`) — Uses the given seed for all runs
+
+**Pass rules:**
+- `last` (default) — Uses the last run's pass/fail result
+- `mean` — Passes if the mean score meets the rubric threshold
+- `majority` — Passes if more than half of the runs pass
+
+### Pattern 8: Multi-Run Capture Mode
+
+Capture mode also supports multiple runs:
+
+```bash
+arcade evals examples/evals/eval_arcade_gateway.py \
+    --capture \
+    --num-runs 3 \
+    --seed random \
+    --api-key openai:YOUR_KEY \
+    -o captured.json
+```
+
 ## 🐛 Troubleshooting
 
 ### Error: "No module named 'openai'"
diff --git a/libs/arcade-cli/arcade_cli/evals_runner.py b/libs/arcade-cli/arcade_cli/evals_runner.py
index 4c16d5ec..ed327325 100644
--- a/libs/arcade-cli/arcade_cli/evals_runner.py
+++ b/libs/arcade-cli/arcade_cli/evals_runner.py
@@ -159,6 +159,9 @@ async def _run_eval_task(
     suite_func: Callable[..., Any],
     model_spec: ModelSpec,
     max_concurrent: int,
+    num_runs: int,
+    seed: str | int,
+    multi_run_pass_rule: str,
     include_context: bool = False,
 ) -> EvalTaskResult:
     """
@@ -175,6 +178,9 @@ async def _run_eval_task(
             max_concurrency=max_concurrent,
             provider=model_spec.provider.value,
             include_context=include_context,
+            num_runs=num_runs,
+            seed=seed,
+            multi_run_pass_rule=multi_run_pass_rule,
         )
         return EvalTaskResult.from_success(
             suite_name, model_spec.model, model_spec.provider.value, result
@@ -198,6 +204,8 @@ async def _run_capture_task(
     model_spec: ModelSpec,
     max_concurrent: int,
     include_context: bool,
+    num_runs: int,
+    seed: str | int,
 ) -> CaptureTaskResult:
     """
     Run a single capture task with error handling.
@@ -214,6 +222,8 @@ async def _run_capture_task(
             provider=model_spec.provider.value,
             capture_mode=True,
             include_context=include_context,
+            num_runs=num_runs,
+            seed=seed,
         )
         return CaptureTaskResult.from_success(
             suite_name, model_spec.model, model_spec.provider.value, result
@@ -246,6 +256,9 @@ async def run_evaluations(
     output_format: str,
     failed_only: bool,
     console: Console,
+    num_runs: int,
+    seed: str | int,
+    multi_run_pass_rule: str,
     include_context: bool = False,
 ) -> None:
     """
@@ -262,6 +275,9 @@ async def run_evaluations(
         output_format: Format for file output ('txt', 'md').
         failed_only: Whether to show only failed evaluations.
         console: Rich console for output.
+        num_runs: Number of runs per case.
+        seed: Seed policy ("constant", "random", or an integer seed).
+        multi_run_pass_rule: Multi-run pass/warn rule ("last", "mean", or "majority").
         include_context: Whether to include system_message and additional_messages.
     """
     tasks = []
@@ -280,6 +296,9 @@ async def run_evaluations(
                     model_spec=model_spec,
                     max_concurrent=max_concurrent,
                     include_context=include_context,
+                    num_runs=num_runs,
+                    seed=seed,
+                    multi_run_pass_rule=multi_run_pass_rule,
                 )
             )
             tasks.append(task)
@@ -370,6 +389,8 @@ async def run_capture(
     output_file: str | None,
     output_format: str,
     console: Console,
+    num_runs: int,
+    seed: str | int,
 ) -> None:
     """
     Run evaluation suites in capture mode and output results.
@@ -385,6 +406,8 @@ async def run_capture(
         output_file: Optional file path to write results.
         output_format: Output format ('json', 'txt', 'md', 'html').
         console: Rich console for output.
+        num_runs: Number of runs per case.
+        seed: Seed policy ("constant", "random", or an integer seed).
     """
     tasks = []
 
@@ -402,6 +425,8 @@ async def run_capture(
                     model_spec=model_spec,
                     max_concurrent=max_concurrent,
                     include_context=include_context,
+                    num_runs=num_runs,
+                    seed=seed,
                 )
             )
             tasks.append(task)
diff --git a/libs/arcade-cli/arcade_cli/formatters/base.py b/libs/arcade-cli/arcade_cli/formatters/base.py
index 3b1d6166..ebe6a2de 100644
--- a/libs/arcade-cli/arcade_cli/formatters/base.py
+++ b/libs/arcade-cli/arcade_cli/formatters/base.py
@@ -214,11 +214,21 @@ def group_comparative_by_case(
                     }
 
                 # Store this track's result for this case
-                comparative_groups[model][base_suite][case_name]["tracks"][track_name] = {
+                track_result: dict[str, Any] = {
                     "evaluation": evaluation,
                     "name": case_name,
                     "input": case.get("input", ""),
                 }
+                run_stats = case.get("run_stats")
+                if run_stats:
+                    track_result["run_stats"] = run_stats
+                critic_stats = case.get("critic_stats")
+                if critic_stats:
+                    track_result["critic_stats"] = critic_stats
+
+                comparative_groups[model][base_suite][case_name]["tracks"][track_name] = (
+                    track_result
+                )
 
     return (
         comparative_groups,
@@ -414,11 +424,19 @@ def group_comparative_by_case_first(
                     }
 
                 # Store this track's result
-                case_groups[base_suite][case_name][model]["tracks"][track_name] = {
+                track_result: dict[str, Any] = {
                     "evaluation": evaluation,
                     "name": case_name,
                     "input": case.get("input", ""),
                 }
+                run_stats = case.get("run_stats")
+                if run_stats:
+                    track_result["run_stats"] = run_stats
+                critic_stats = case.get("critic_stats")
+                if critic_stats:
+                    track_result["critic_stats"] = critic_stats
+
+                case_groups[base_suite][case_name][model]["tracks"][track_name] = track_result
 
     return (
         case_groups,
@@ -539,11 +557,17 @@ def group_eval_for_comparison(
                     comparison_data[suite_name][case_name] = {}
 
                 # Store this model's result for this case
-                comparison_data[suite_name][case_name][model] = {
+                case_entry: dict[str, Any] = {
                     "evaluation": evaluation,
                     "input": case.get("input", ""),
                     "name": case_name,
                 }
+                # Propagate multi-run stats if present
+                if case.get("run_stats"):
+                    case_entry["run_stats"] = case["run_stats"]
+                if case.get("critic_stats"):
+                    case_entry["critic_stats"] = case["critic_stats"]
+                comparison_data[suite_name][case_name][model] = case_entry
 
     # Calculate pass rates
     for _model, stats in per_model_stats.items():
diff --git a/libs/arcade-cli/arcade_cli/formatters/html.py b/libs/arcade-cli/arcade_cli/formatters/html.py
index ab900aa2..c267e70b 100644
--- a/libs/arcade-cli/arcade_cli/formatters/html.py
+++ b/libs/arcade-cli/arcade_cli/formatters/html.py
@@ -163,10 +163,18 @@ class HtmlFormatter(EvalResultFormatter):
 
                 # Show summary table only when NOT showing details (avoid duplication)
                 if not show_details:
-                    html_parts.append('<table class="results-table">')
-                    html_parts.append(
-                        "<thead><tr><th>Status</th><th>Case</th><th>Score</th></tr></thead>"
+                    has_run_stats = any(
+                        (case.get("run_stats") or {}).get("num_runs", 1) > 1 for case in cases
+                    )
+                    html_parts.append('<table class="results-table">')
+                    if has_run_stats:
+                        html_parts.append(
+                            "<thead><tr><th>Status</th><th>Case</th><th>Score</th><th>Runs</th></tr></thead>"
+                        )
+                    else:
+                        html_parts.append(
+                            "<thead><tr><th>Status</th><th>Case</th><th>Score</th></tr></thead>"
+                        )
                     html_parts.append("<tbody>")
 
                     for case in cases:
@@ -183,11 +191,20 @@ class HtmlFormatter(EvalResultFormatter):
 
                         score_pct = evaluation.score * 100
                         case_name = self._escape_html(case["name"])
+                        run_stats = case.get("run_stats") or {}
+                        score_display = f"{score_pct:.1f}%"
+                        runs_display = ""
+                        if run_stats.get("num_runs", 1) > 1:
+                            std_pct = run_stats.get("std_deviation", 0.0) * 100
+                            score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%"
+                            runs_display = str(run_stats.get("num_runs", 1))
 
                         html_parts.append(f'<tr class="{status_class}">')
                         html_parts.append(f'<td class="status-cell">{status_text}</td>')
                         html_parts.append(f"<td>{case_name}</td>")
-                        html_parts.append(f'<td class="score-cell">{score_pct:.1f}%</td>')
+                        html_parts.append(f'<td class="score-cell">{score_display}</td>')
+                        if has_run_stats:
+                            html_parts.append(f"<td>{runs_display or '-'}</td>")
                         html_parts.append("</tr>")
 
                     html_parts.append("</tbody></table>")
@@ -254,7 +271,15 @@ class HtmlFormatter(EvalResultFormatter):
                                 html_parts.append("</div>")
 
                         # Evaluation details
-                        html_parts.append(self._format_evaluation_details(evaluation))
+                        run_id = self._make_safe_id(suite_name, case["name"], model)
+                        html_parts.append(
+                            self._format_evaluation_details(
+                                evaluation,
+                                case.get("run_stats"),
+                                case.get("critic_stats"),
+                                run_id=run_id,
+                            )
+                        )
                         html_parts.append("</div>")
                         html_parts.append("</details>")
 
@@ -267,18 +292,50 @@ class HtmlFormatter(EvalResultFormatter):
 
         return "\n".join(html_parts)
 
-    def _format_evaluation_details(self, evaluation: Any) -> str:
+    def _format_evaluation_details(
+        self,
+        evaluation: Any,
+        run_stats: dict[str, Any] | None = None,
+        critic_stats: dict[str, Any] | None = None,
+        run_id: str | None = None,
+    ) -> str:
         """Format evaluation details as HTML table."""
-        if evaluation.failure_reason:
-            return f'<div class="failure-reason">❌ <strong>Failure Reason:</strong> {self._escape_html(evaluation.failure_reason)}</div>'
+        parts: list[str] = []
 
+        run_stats_html = self._format_run_stats_html(run_stats, evaluation)
+        if run_stats_html:
+            parts.append(run_stats_html)
+
+        run_tabs_html = self._format_run_tabs_html(run_stats, run_id)
+        if run_tabs_html:
+            parts.append(run_tabs_html)
+
+        critic_stats_html = self._format_critic_stats_html(critic_stats)
+        if critic_stats_html:
+            parts.append(critic_stats_html)
+
+        if evaluation.failure_reason:
+            parts.append(
+                f'<div class="failure-reason">❌ <strong>Failure Reason:</strong> '
+                f"{self._escape_html(evaluation.failure_reason)}</div>"
+            )
+            return "\n".join(parts)
+
+        # Only show field details table when there are NO run tabs
+        # (run tabs already show per-run field details, and Critic Stats shows the aggregation)
+        if not run_tabs_html:
+            details_table = self._format_critic_results_table(evaluation.results)
+            parts.append(details_table)
+        return "\n".join(parts)
+
+    def _format_critic_results_table(self, results: list[dict[str, Any]]) -> str:
         lines = ['<table class="detail-table">']
         lines.append(
             "<thead><tr><th>Field</th><th>Match</th><th>Score</th><th>Expected</th><th>Actual</th></tr></thead>"
         )
         lines.append("<tbody>")
 
-        for critic_result in evaluation.results:
+        for critic_result in results:
             is_criticized = critic_result.get("is_criticized", True)
             field = self._escape_html(critic_result["field"])
             score = critic_result["score"]
@@ -314,6 +371,177 @@ class HtmlFormatter(EvalResultFormatter):
         lines.append("</tbody></table>")
         return "\n".join(lines)
 
+    def _format_run_stats_html(self, run_stats: dict[str, Any] | None, evaluation: Any) -> str:
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return ""
+        if evaluation.passed:
+            status_label = "PASSED"
+            status_icon = "✅"
+            status_class = "passed"
+        elif evaluation.warning:
+            status_label = "WARNED"
+            status_icon = "⚠️"
+            status_class = "warned"
+        else:
+            status_label = "FAILED"
+            status_icon = "❌"
+            status_class = "failed"
+        mean_pct = run_stats.get("mean_score", 0.0) * 100
+        std_pct = run_stats.get("std_deviation", 0.0) * 100
+        num_runs = run_stats.get("num_runs", 0)
+        scores = run_stats.get("scores", [])
+        seed_policy = run_stats.get("seed_policy", "")
+        run_seeds = run_stats.get("run_seeds") or []
+        pass_rule = run_stats.get("pass_rule", "")
+
+        # Build score pills for each run
+        score_pills = []
+        for i, score in enumerate(scores, 1):
+            score_pct = score * 100
+            if score >= 0.8:
+                pill_class = "score-pill high"
+            elif score >= 0.6:
+                pill_class = "score-pill mid"
+            else:
+                pill_class = "score-pill low"
+            score_pills.append(f'<span class="{pill_class}">R{i}: {score_pct:.0f}%</span>')
+        scores_html = " ".join(score_pills) if score_pills else ""
+
+        # Build seeds display
+        seeds_html = ""
+        if run_seeds and any(seed is not None for seed in run_seeds):
+            seeds_display = ", ".join(str(seed) for seed in run_seeds)
+            seeds_html = f'<div class="run-meta-item"><span class="meta-label">🎲 Seeds</span><span class="meta-value mono">{seeds_display}</span></div>'
+
+        html = f"""<div class="run-stats-card {status_class}">
+            <div class="run-stats-header">
+                <div class="run-status-badge {status_class}">
+                    <span class="status-icon">{status_icon}</span>
+                    <span class="status-text">{status_label}</span>
+                </div>
+                <div class="run-count">{num_runs} runs</div>
+            </div>
+            <div class="run-stats-body">
+                <div class="score-display">
+                    <div class="score-main">
+                        <span class="score-value">{mean_pct:.1f}%</span>
+                        <span class="score-label">mean score</span>
+                    </div>
+                    <div class="score-deviation">
+                        <span class="deviation-value">± {std_pct:.1f}%</span>
+                        <span class="deviation-label">std dev</span>
+                    </div>
+                </div>
+                <div class="score-bar-container">
+                    <div class="score-bar {status_class}" style="width: {min(mean_pct, 100):.1f}%"></div>
+                </div>
+                <div class="run-scores">{scores_html}</div>
+            </div>
+            <div class="run-stats-footer">
+                <div class="run-meta-item">
+                    <span class="meta-label">📋 Pass Rule</span>
+                    <span class="meta-value">{self._escape_html(pass_rule)}</span>
+                </div>
+                <div class="run-meta-item">
+                    <span class="meta-label">🌱 Seed Policy</span>
+                    <span class="meta-value">{self._escape_html(seed_policy)}</span>
+                </div>
+                {seeds_html}
+            </div>
+        </div>"""
+        return html
+
+    def _format_critic_stats_html(self, critic_stats: dict[str, Any] | None) -> str:
+        if not critic_stats:
+            return ""
+        lines = ['<div class="critic-stats"><h4>📊 Critic Stats</h4>']
+        lines.append('<table class="detail-table critic-stats-table">')
+        lines.append(
+            "<thead><tr><th>Field</th><th>Weight</th><th>Mean (norm %)</th>"
+            "<th>Std (norm %)</th><th>Mean (weighted %)</th>"
+            "<th>Std (weighted %)</th></tr></thead>"
+        )
+        lines.append("<tbody>")
+        for field, stats in critic_stats.items():
+            weight = stats.get("weight", 0.0)
+            mean_norm = stats.get("mean_score_normalized", 0.0) * 100
+            std_norm = stats.get("std_deviation_normalized", 0.0) * 100
+            mean_weighted = stats.get("mean_score", 0.0) * 100
+            std_weighted = stats.get("std_deviation", 0.0) * 100
+            # Color coding based on normalized mean: <60 red, 60 to <80 yellow, >=80 green
+            if mean_norm < 60:
+                score_class = "score-low"
+            elif mean_norm < 80:
+                score_class = "score-mid"
+            else:
+                score_class = "score-high"
+            lines.append(
+                f'<tr class="{score_class}">'
+                f"<td>{self._escape_html(field)}</td>"
+                f"<td>{weight:.2f}</td>"
+                f'<td class="score-value">{mean_norm:.2f}%</td>'
+                f"<td>{std_norm:.2f}%</td>"
+                f"<td>{mean_weighted:.2f}%</td>"
+                f"<td>{std_weighted:.2f}%</td>"
+                "</tr>"
+            )
+        lines.append("</tbody></table></div>")
+        return "\n".join(lines)
+
+    def _format_run_tabs_html(self, run_stats: dict[str, Any] | None, run_id: str | None) -> str:
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return ""
+        runs = run_stats.get("runs", [])
+        if not runs or run_id is None:
+            return ""
+
+        tabs = ['<div class="run-tabs">', '<div class="run-tab-list">']
+        for idx, run in enumerate(runs, start=1):
+            active = "active" if idx == 1 else ""
+            if run.get("passed"):
+                status_class = "passed"
+            elif run.get("warning"):
+                status_class = "warned"
+            else:
+                status_class = "failed"
+            tabs.append(
+                f'<button class="run-tab {status_class} {active}" data-run-group="{run_id}" '
+                f'data-run-index="{idx}">Run {idx}</button>'
+            )
+        tabs.append("</div>")
+
+        panels = ['<div class="run-panels">']
+        for idx, run in enumerate(runs, start=1):
+            active = "active" if idx == 1 else ""
+            if run.get("passed"):
+                status = "✅ PASSED"
+                status_class = "passed"
+            elif run.get("warning"):
+                status = "⚠️ WARNED"
+                status_class = "warned"
+            else:
+                status = "❌ FAILED"
+                status_class = "failed"
+            score_pct = run.get("score", 0.0) * 100
+            details = run.get("details", [])
+            panels.append(
+                f'<div class="run-panel {status_class} {active}" data-run-group="{run_id}" '
+                f'data-run-index="{idx}">'
+            )
+            panels.append(f"<p><strong>Run {idx}:</strong> {status} — {score_pct:.2f}%</p>")
+            failure_reason = run.get("failure_reason")
+            if failure_reason:
+                panels.append(
+                    f'<div class="failure-reason">❌ <strong>Failure Reason:</strong> '
+                    f"{self._escape_html(str(failure_reason))}</div>"
+                )
+            if details:
+                panels.append(self._format_critic_results_table(details))
+            panels.append("</div>")
+        panels.append("</div></div>")
+
+        return "\n".join(tabs + panels)
+
     def _escape_html(self, text: str) -> str:
         """Escape HTML special characters."""
         return (
@@ -534,6 +762,7 @@ class HtmlFormatter(EvalResultFormatter):
                 for model in model_order:
                     if model in case_models:
                         evaluation = case_models[model]["evaluation"]
+                        run_stats = case_models[model].get("run_stats")
                         score = evaluation.score * 100
                         if evaluation.passed:
                             cell_class = "passed"
@@ -544,7 +773,15 @@ class HtmlFormatter(EvalResultFormatter):
                         else:
                             cell_class = "failed"
                             icon = "✗"
-                        html_parts.append(f'<td class="{cell_class}">{icon} {score:.0f}%</td>')
+                        if run_stats and run_stats.get("num_runs", 1) > 1:
+                            std_pct = run_stats.get("std_deviation", 0.0) * 100
+                            runs = run_stats.get("num_runs", 1)
+                            html_parts.append(
+                                f'<td class="{cell_class}">{icon} '
+                                f"{score:.0f}% ± {std_pct:.0f}%<br><small>n={runs}</small></td>"
+                            )
+                        else:
+                            html_parts.append(f'<td class="{cell_class}">{icon} {score:.0f}%</td>')
                     else:
                         html_parts.append('<td class="no-data">-</td>')
 
@@ -582,7 +819,15 @@ class HtmlFormatter(EvalResultFormatter):
                         html_parts.append(
                             f"<strong>{self._escape_html(model)}</strong>: Score {evaluation.score * 100:.1f}%"
                         )
-                        html_parts.append(self._format_evaluation_details(evaluation))
+                        run_id = self._make_safe_id(suite_name, case_name, model)
+                        html_parts.append(
+                            self._format_evaluation_details(
+                                evaluation,
+                                case_result.get("run_stats"),
+                                case_result.get("critic_stats"),
+                                run_id=run_id,
+                            )
+                        )
                         html_parts.append("</div>")
 
                     html_parts.append("</div>")
@@ -609,9 +854,9 @@ class HtmlFormatter(EvalResultFormatter):
             .multi-model-summary .pass-rate { font-weight: bold; }
             .multi-model-summary .best-model { background-color: rgba(76, 175, 80, 0.1); }
             .best-overall { margin-top: 15px; padding: 10px; background: #1e1e1e; border-radius: 4px; }
-            .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; }
-            .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; }
-            .comparison-table th { background-color: #252525; }
+        .comparison-table { width: 100%; border-collapse: collapse; margin: 15px 0; }
+        .comparison-table th, .comparison-table td { padding: 10px; border: 1px solid #333; text-align: center; }
+        .comparison-table th { background: linear-gradient(90deg, rgba(137, 180, 250, 0.25), rgba(203, 166, 247, 0.25)); }
             .comparison-table .case-name { text-align: left; font-weight: bold; }
             .comparison-table .passed { background-color: rgba(76, 175, 80, 0.2); color: #4caf50; }
             .comparison-table .failed { background-color: rgba(244, 67, 54, 0.2); color: #f44336; }
@@ -1201,7 +1446,15 @@ class HtmlFormatter(EvalResultFormatter):
                         f'<span class="track-badge">{self._escape_html(track_name)}</span>'
                     )
                     lines.append("</div>")
-                    lines.append(self._format_evaluation_details(evaluation))
+                    run_id = self._make_safe_id(suite_name, case_name, f"{track_name}")
+                    lines.append(
+                        self._format_evaluation_details(
+                            evaluation,
+                            track_result.get("run_stats"),
+                            track_result.get("critic_stats"),
+                            run_id=run_id,
+                        )
+                    )
 
                 lines.append("</div>")  # track-panel
             lines.append("</div>")  # track-panels-container
@@ -1256,6 +1509,9 @@ document.querySelectorAll('.track-tab').forEach(tab => {
             --blue: #89b4fa;
             --purple: #cba6f7;
             --cyan: #94e2d5;
+            --accent: #89b4fa;
+            --accent-2: #cba6f7;
+            --shadow: rgba(0, 0, 0, 0.35);
         }
 
         * {
@@ -1282,6 +1538,23 @@ document.querySelectorAll('.track-tab').forEach(tab => {
             padding-bottom: 10px;
         }
 
+        /* Critic stats score coloring: red <60%, yellow 60-80%, green >80% */
+        .critic-stats-table tr.score-low td.score-value {
+            color: var(--red);
+            font-weight: bold;
+        }
+        .critic-stats-table tr.score-mid td.score-value {
+            color: var(--yellow);
+            font-weight: bold;
+        }
+        .critic-stats-table tr.score-high td.score-value {
+            color: var(--green);
+            font-weight: bold;
+        }
+        .critic-stats-table tr.score-low { background: rgba(243, 139, 168, 0.08); }
+        .critic-stats-table tr.score-mid { background: rgba(249, 226, 175, 0.08); }
+        .critic-stats-table tr.score-high { background: rgba(166, 227, 161, 0.08); }
+
         h2 {
             color: var(--blue);
             margin-top: 30px;
@@ -1480,6 +1753,234 @@ document.querySelectorAll('.track-tab').forEach(tab => {
             font-size: 0.9em;
         }
 
+        .critic-stats {
+            margin: 10px 0;
+            padding: 10px;
+            background: #202020;
+            border-radius: 6px;
+        }
+
+        /* Run Stats Card - Modern Design */
+        .run-stats-card {
+            margin: 15px 0;
+            border-radius: 12px;
+            background: linear-gradient(145deg, #252535, #1a1a2a);
+            border: 1px solid var(--border-color);
+            overflow: hidden;
+        }
+        .run-stats-card.passed { border-left: 4px solid var(--green); }
+        .run-stats-card.warned { border-left: 4px solid var(--yellow); }
+        .run-stats-card.failed { border-left: 4px solid var(--red); }
+
+        .run-stats-header {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            padding: 12px 16px;
+            background: rgba(0, 0, 0, 0.2);
+            border-bottom: 1px solid var(--border-color);
+        }
+
+        .run-status-badge {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            padding: 6px 14px;
+            border-radius: 20px;
+            font-weight: bold;
+            font-size: 0.9em;
+        }
+        .run-status-badge.passed { background: rgba(166, 227, 161, 0.2); color: var(--green); }
+        .run-status-badge.warned { background: rgba(249, 226, 175, 0.2); color: var(--yellow); }
+        .run-status-badge.failed { background: rgba(243, 139, 168, 0.2); color: var(--red); }
+
+        .run-count {
+            color: var(--text-color);
+            font-size: 0.9em;
+            opacity: 0.8;
+        }
+
+        .run-stats-body {
+            padding: 16px;
+        }
+
+        .score-display {
+            display: flex;
+            align-items: flex-end;
+            gap: 20px;
+            margin-bottom: 12px;
+        }
+
+        .score-main {
+            display: flex;
+            flex-direction: column;
+        }
+        .score-main .score-value {
+            font-size: 2.2em;
+            font-weight: bold;
+            color: var(--blue);
+            line-height: 1;
+        }
+        .score-main .score-label {
+            font-size: 0.75em;
+            color: #888;
+            text-transform: uppercase;
+            letter-spacing: 0.05em;
+        }
+
+        .score-deviation {
+            display: flex;
+            flex-direction: column;
+        }
+        .score-deviation .deviation-value {
+            font-size: 1.3em;
+            font-weight: 600;
+            color: var(--purple);
+        }
+        .score-deviation .deviation-label {
+            font-size: 0.7em;
+            color: #888;
+            text-transform: uppercase;
+        }
+
+        .score-bar-container {
+            height: 8px;
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 4px;
+            overflow: hidden;
+            margin-bottom: 14px;
+        }
+        .score-bar {
+            height: 100%;
+            border-radius: 4px;
+            transition: width 0.3s ease;
+        }
+        .score-bar.passed { background: linear-gradient(90deg, var(--green), #7ecf7e); }
+        .score-bar.warned { background: linear-gradient(90deg, var(--yellow), #f5d67a); }
+        .score-bar.failed { background: linear-gradient(90deg, var(--red), #e87a94); }
+
+        .run-scores {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+        }
+
+        .score-pill {
+            padding: 4px 10px;
+            border-radius: 12px;
+            font-size: 0.8em;
+            font-weight: 600;
+        }
+        .score-pill.high { background: rgba(166, 227, 161, 0.2); color: var(--green); }
+        .score-pill.mid { background: rgba(249, 226, 175, 0.2); color: var(--yellow); }
+        .score-pill.low { background: rgba(243, 139, 168, 0.2); color: var(--red); }
+
+        .run-stats-footer {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 16px;
+            padding: 12px 16px;
+            background: rgba(0, 0, 0, 0.15);
+            border-top: 1px solid var(--border-color);
+        }
+
+        .run-meta-item {
+            display: flex;
+            flex-direction: column;
+            gap: 2px;
+        }
+        .meta-label {
+            font-size: 0.7em;
+            color: #888;
+            text-transform: uppercase;
+        }
+        .meta-value {
+            font-size: 0.85em;
+            color: var(--text-color);
+        }
+        .meta-value.mono {
+            font-family: 'Consolas', 'Monaco', monospace;
+            font-size: 0.75em;
+            color: var(--cyan);
+        }
+
+        .run-tabs {
+            margin: 12px 0;
+            border: 1px solid var(--border-color);
+            border-radius: 6px;
+            background: #1f1f2b;
+        }
+
+        .run-tab-list {
+            display: flex;
+            gap: 6px;
+            padding: 8px;
+            border-bottom: 1px solid var(--border-color);
+            flex-wrap: wrap;
+        }
+
+        .run-tab {
+            background: #2a2a3a;
+            color: var(--text-color);
+            border: 1px solid var(--border-color);
+            border-radius: 4px;
+            padding: 6px 10px;
+            cursor: pointer;
+        }
+
+        .run-tab.active {
+            background: var(--blue);
+            color: #111;
+            border-color: var(--blue);
+        }
+
+        .run-tab.passed {
+            border-color: var(--green);
+        }
+
+        .run-tab.warned {
+            border-color: var(--yellow);
+        }
+
+        .run-tab.failed {
+            border-color: var(--red);
+        }
+
+        .run-panels {
+            padding: 10px;
+        }
+
+        .run-panel {
+            display: none;
+        }
+
+        .run-panel.active {
+            display: block;
+        }
+
+        .run-panel.passed {
+            border-left: 3px solid var(--green);
+            padding-left: 10px;
+        }
+
+        .run-panel.warned {
+            border-left: 3px solid var(--yellow);
+            padding-left: 10px;
+        }
+
+        .run-panel.failed {
+            border-left: 3px solid var(--red);
+            padding-left: 10px;
+        }
+
+        .run-status.passed { color: var(--green); }
+        .run-status.warned { color: var(--yellow); }
+        .run-status.failed { color: var(--red); }
+
+        .aggregate-details {
+            margin-top: 10px;
+        }
+
         .field-name {
             color: var(--purple);
             font-weight: 600;
@@ -1954,6 +2455,39 @@ document.querySelectorAll('.track-tab').forEach(tab => {
             margin: 8px 0;
         }
     </style>
+    <script>
+    // Run tabs: a single delegated click listener on the document handles
+    // every .run-tabs group on the page.
+    document.addEventListener("click", (event) => {
+        const clicked = event.target;
+        if (!(clicked instanceof HTMLElement)) {
+            return;
+        }
+        // Resolve the tab that was hit and the tab group that owns it.
+        const runTab = clicked.closest(".run-tab");
+        const container = runTab ? runTab.closest(".run-tabs") : null;
+        if (!container) {
+            return;
+        }
+        // Tabs and panels are paired through their data-run-index attribute.
+        const index = runTab.dataset.runIndex;
+        if (!index) {
+            return;
+        }
+        // Deactivate everything in this group, then activate the selected pair.
+        for (const element of container.querySelectorAll(".run-tab, .run-panel")) {
+            element.classList.remove("active");
+        }
+        runTab.classList.add("active");
+        const panelSelector = `.run-panel[data-run-index="${index}"]`;
+        const panel = container.querySelector(panelSelector);
+        // A tab without a matching panel simply leaves no panel shown.
+        if (panel) {
+            panel.classList.add("active");
+        }
+    });
+    </script>
 </head>
 <body>
 """
@@ -1994,19 +2528,48 @@ class CaptureHtmlFormatter(CaptureFormatter):
             for case in capture.captured_cases:
                 total_cases += 1
                 tool_calls_html = []
+                runs = getattr(case, "runs", None)
 
-                for tc in case.tool_calls:
-                    total_calls += 1
-                    args_html = ""
-                    if tc.args:
-                        args_json = json.dumps(tc.args, indent=2)
-                        args_html = f'<pre class="args">{self._escape_html(args_json)}</pre>'
-                    tool_calls_html.append(
-                        f'<div class="tool-call">'
-                        f'<span class="tool-name">{self._escape_html(tc.name)}</span>'
-                        f"{args_html}"
-                        f"</div>"
-                    )
+                if runs:
+                    for run_index, run in enumerate(runs, start=1):
+                        run_calls_html = []
+                        for tc in run.tool_calls:
+                            total_calls += 1
+                            args_html = ""
+                            if tc.args:
+                                args_json = json.dumps(tc.args, indent=2)
+                                args_html = (
+                                    f'<pre class="args">{self._escape_html(args_json)}</pre>'
+                                )
+                            run_calls_html.append(
+                                f'<div class="tool-call">'
+                                f'<span class="tool-name">{self._escape_html(tc.name)}</span>'
+                                f"{args_html}"
+                                f"</div>"
+                            )
+                        if not run_calls_html:
+                            run_calls_html.append(
+                                '<div class="no-calls">No tool calls captured</div>'
+                            )
+                        tool_calls_html.append(
+                            f'<details class="capture-run" open>'
+                            f'<summary>Run {run_index}</summary>'
+                            f'{"".join(run_calls_html)}'
+                            f"</details>"
+                        )
+                else:
+                    for tc in case.tool_calls:
+                        total_calls += 1
+                        args_html = ""
+                        if tc.args:
+                            args_json = json.dumps(tc.args, indent=2)
+                            args_html = f'<pre class="args">{self._escape_html(args_json)}</pre>'
+                        tool_calls_html.append(
+                            f'<div class="tool-call">'
+                            f'<span class="tool-name">{self._escape_html(tc.name)}</span>'
+                            f"{args_html}"
+                            f"</div>"
+                        )
 
                 if not tool_calls_html:
                     tool_calls_html.append('<div class="no-calls">No tool calls captured</div>')
@@ -2498,7 +3061,31 @@ class CaptureHtmlFormatter(CaptureFormatter):
                                 f'<div class="model-label">{self._escape_html(model)}</div>'
                             )
 
-                            if captured_case.tool_calls:
+                            runs = getattr(captured_case, "runs", None)
+                            if runs:
+                                for run_index, run in enumerate(runs, start=1):
+                                    html_parts.append(
+                                        f'<details class="capture-run" open>'
+                                        f"<summary>Run {run_index}</summary>"
+                                    )
+                                    if run.tool_calls:
+                                        for tc in run.tool_calls:
+                                            total_calls += 1
+                                            args_html = ""
+                                            if tc.args:
+                                                args_json = json.dumps(tc.args, indent=2)
+                                                args_html = f'<pre class="args">{self._escape_html(args_json)}</pre>'
+                                            html_parts.append(
+                                                f'<div class="tool-call">'
+                                                f'<span class="tool-name">{self._escape_html(tc.name)}</span>'
+                                                f"{args_html}</div>"
+                                            )
+                                    else:
+                                        html_parts.append(
+                                            '<div class="no-calls">No tool calls</div>'
+                                        )
+                                    html_parts.append("</details>")
+                            elif captured_case.tool_calls:
                                 for tc in captured_case.tool_calls:
                                     total_calls += 1
                                     args_html = ""
@@ -2539,7 +3126,29 @@ class CaptureHtmlFormatter(CaptureFormatter):
                             f'<div class="model-label">{self._escape_html(model)}</div>'
                         )
 
-                        if captured_case.tool_calls:
+                        runs = getattr(captured_case, "runs", None)
+                        if runs:
+                            for run_index, run in enumerate(runs, start=1):
+                                html_parts.append(
+                                    f'<details class="capture-run" open>'
+                                    f"<summary>Run {run_index}</summary>"
+                                )
+                                if run.tool_calls:
+                                    for tc in run.tool_calls:
+                                        total_calls += 1
+                                        args_html = ""
+                                        if tc.args:
+                                            args_json = json.dumps(tc.args, indent=2)
+                                            args_html = f'<pre class="args">{self._escape_html(args_json)}</pre>'
+                                        html_parts.append(
+                                            f'<div class="tool-call">'
+                                            f'<span class="tool-name">{self._escape_html(tc.name)}</span>'
+                                            f"{args_html}</div>"
+                                        )
+                                else:
+                                    html_parts.append('<div class="no-calls">No tool calls</div>')
+                                html_parts.append("</details>")
+                        elif captured_case.tool_calls:
                             for tc in captured_case.tool_calls:
                                 total_calls += 1
                                 args_html = ""
@@ -2704,6 +3313,19 @@ document.querySelectorAll('.track-tab').forEach(tab => {{
             text-transform: uppercase;
             margin-bottom: 0.5rem;
         }}
+        .capture-run {{
+            margin-bottom: 0.75rem;
+            background: var(--bg-primary);
+            border: 1px solid var(--border);
+            border-radius: 6px;
+            padding: 0.5rem 0.75rem;
+        }}
+        .capture-run summary {{
+            cursor: pointer;
+            font-weight: 600;
+            color: var(--accent);
+            margin-bottom: 0.5rem;
+        }}
         .tool-call {{
             background: var(--bg-primary);
             border-left: 3px solid var(--accent);
diff --git a/libs/arcade-cli/arcade_cli/formatters/json.py b/libs/arcade-cli/arcade_cli/formatters/json.py
index 361974b2..e5ebcb0b 100644
--- a/libs/arcade-cli/arcade_cli/formatters/json.py
+++ b/libs/arcade-cli/arcade_cli/formatters/json.py
@@ -249,6 +249,13 @@ class JsonFormatter(EvalResultFormatter):
                         if evaluation.failure_reason:
                             track_data["failure_reason"] = evaluation.failure_reason
 
+                        run_stats = track_result.get("run_stats")
+                        if run_stats:
+                            track_data["run_stats"] = run_stats
+                        critic_stats = track_result.get("critic_stats")
+                        if critic_stats:
+                            track_data["critic_stats"] = critic_stats
+
                         if show_details and evaluation.results:
                             track_data["details"] = self._serialize_critic_results(
                                 evaluation.results
@@ -375,6 +382,13 @@ class JsonFormatter(EvalResultFormatter):
                         if evaluation.failure_reason:
                             track_data["failure_reason"] = evaluation.failure_reason
 
+                        run_stats = track_result.get("run_stats")
+                        if run_stats:
+                            track_data["run_stats"] = run_stats
+                        critic_stats = track_result.get("critic_stats")
+                        if critic_stats:
+                            track_data["critic_stats"] = critic_stats
+
                         if show_details and evaluation.results:
                             track_data["details"] = self._serialize_critic_results(
                                 evaluation.results
@@ -496,6 +510,13 @@ class JsonFormatter(EvalResultFormatter):
                     if evaluation.failure_reason:
                         model_data["failure_reason"] = evaluation.failure_reason
 
+                    run_stats = case_result.get("run_stats")
+                    if run_stats:
+                        model_data["run_stats"] = run_stats
+                    critic_stats = case_result.get("critic_stats")
+                    if critic_stats:
+                        model_data["critic_stats"] = critic_stats
+
                     if show_details and evaluation.results:
                         model_data["details"] = self._serialize_critic_results(evaluation.results)
 
@@ -537,6 +558,13 @@ class JsonFormatter(EvalResultFormatter):
         if evaluation.failure_reason:
             case_data["failure_reason"] = evaluation.failure_reason
 
+        run_stats = case.get("run_stats")
+        if run_stats:
+            case_data["run_stats"] = run_stats
+        critic_stats = case.get("critic_stats")
+        if critic_stats:
+            case_data["critic_stats"] = critic_stats
+
         if show_details and evaluation.results:
             case_data["details"] = self._serialize_critic_results(evaluation.results)
 
@@ -657,12 +685,24 @@ class CaptureJsonFormatter(CaptureFormatter):
                                 continue
 
                             captured_case = models_dict[model]
-                            track_output["models"][model] = {
+                            model_output: dict[str, Any] = {
                                 "tool_calls": [
                                     {"name": tc.name, "args": tc.args}
                                     for tc in captured_case.tool_calls
                                 ],
                             }
+                            runs = getattr(captured_case, "runs", None)
+                            if runs:
+                                model_output["runs"] = [
+                                    {
+                                        "tool_calls": [
+                                            {"name": tc.name, "args": tc.args}
+                                            for tc in run.tool_calls
+                                        ]
+                                    }
+                                    for run in runs
+                                ]
+                            track_output["models"][model] = model_output
 
                         case_output["tracks"][track_display] = track_output
                 else:
@@ -678,12 +718,23 @@ class CaptureJsonFormatter(CaptureFormatter):
                             continue
 
                         captured_case = models_dict[model]
-                        case_output["models"][model] = {
+                        model_output = {
                             "tool_calls": [
                                 {"name": tc.name, "args": tc.args}
                                 for tc in captured_case.tool_calls
                             ],
                         }
+                        runs = getattr(captured_case, "runs", None)
+                        if runs:
+                            model_output["runs"] = [
+                                {
+                                    "tool_calls": [
+                                        {"name": tc.name, "args": tc.args} for tc in run.tool_calls
+                                    ]
+                                }
+                                for run in runs
+                            ]
+                        case_output["models"][model] = model_output
 
                 output["grouped_by_case"][suite_name][case_name] = case_output
 
diff --git a/libs/arcade-cli/arcade_cli/formatters/markdown.py b/libs/arcade-cli/arcade_cli/formatters/markdown.py
index ea5b3486..f6fef9c8 100644
--- a/libs/arcade-cli/arcade_cli/formatters/markdown.py
+++ b/libs/arcade-cli/arcade_cli/formatters/markdown.py
@@ -89,37 +89,16 @@ class MarkdownFormatter(EvalResultFormatter):
         lines.append("## Summary")
         lines.append("")
 
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
-            lines.append("")
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {orig_total} |")
-            lines.append(f"| ✅ Passed | {orig_passed} |")
-            if orig_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {orig_warned} |")
-            lines.append(f"| ❌ Failed | {orig_failed} |")
-        else:
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {total_cases} |")
-            lines.append(f"| ✅ Passed | {total_passed} |")
-            if total_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {total_warned} |")
-            if total_failed > 0:
-                lines.append(f"| ❌ Failed | {total_failed} |")
-
-        # Pass rate
-        if total_cases > 0:
-            if failed_only and original_counts and original_counts[0] > 0:
-                pass_rate = (original_counts[1] / original_counts[0]) * 100
-            else:
-                pass_rate = (total_passed / total_cases) * 100
-            lines.append("")
-            lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
-        lines.append("")
+        lines.extend(
+            self._format_summary_table_md(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         # Results by model
         lines.append("## Results by Model")
@@ -134,8 +113,15 @@ class MarkdownFormatter(EvalResultFormatter):
                 lines.append("")
 
                 # Results table
-                lines.append("| Status | Case | Score |")
-                lines.append("|--------|------|-------|")
+                has_run_stats = any(
+                    (case.get("run_stats") or {}).get("num_runs", 1) > 1 for case in cases
+                )  # "run_stats" may be present but None, so the .get default is not enough
+                if has_run_stats:
+                    lines.append("| Status | Case | Score | Runs |")
+                    lines.append("|--------|------|-------|------|")
+                else:
+                    lines.append("| Status | Case | Score |")
+                    lines.append("|--------|------|-------|")
 
                 for case in cases:
                     evaluation = case["evaluation"]
@@ -148,7 +134,15 @@ class MarkdownFormatter(EvalResultFormatter):
 
                     score_pct = evaluation.score * 100
                     case_name = case["name"].replace("|", "\\|")
-                    lines.append(f"| {status} | {case_name} | {score_pct:.1f}% |")
+                    run_stats = case.get("run_stats") or {}
+                    score_display = f"{score_pct:.1f}%"
+                    if run_stats.get("num_runs", 1) > 1:
+                        std_pct = run_stats.get("std_deviation", 0.0) * 100
+                        score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%"
+                        runs_value = run_stats.get("num_runs", 1)
+                        lines.append(f"| {status} | {case_name} | {score_display} | {runs_value} |")
+                    else:
+                        lines.append(f"| {status} | {case_name} | {score_display} |")
 
                 lines.append("")
 
@@ -175,6 +169,16 @@ class MarkdownFormatter(EvalResultFormatter):
                         lines.append(f"**Input:** `{case['input']}`")
                         lines.append("")
 
+                        run_stats = case.get("run_stats")
+                        lines.extend(self._format_run_stats_summary(run_stats))
+
+                        run_detail_lines = self._format_run_details_md(run_stats)
+                        lines.extend(run_detail_lines)
+
+                        critic_stats = case.get("critic_stats")
+                        if critic_stats:
+                            lines.extend(self._format_critic_stats_summary(critic_stats))
+
                         # Context section (if include_context is True)
                         if include_context:
                             system_msg = case.get("system_message")
@@ -194,8 +198,10 @@ class MarkdownFormatter(EvalResultFormatter):
                                     lines.append("</details>")
                                     lines.append("")
 
-                        # Evaluation details
-                        lines.append(self._format_evaluation_details(evaluation))
+                        # Only show the critic results table when there are no per-run
+                        # details (run details already include per-run field tables)
+                        if not run_detail_lines:
+                            lines.append(self._format_evaluation_details(evaluation))
                         lines.append("")
                         lines.append("---")
                         lines.append("")
@@ -212,31 +218,120 @@ class MarkdownFormatter(EvalResultFormatter):
         if evaluation.failure_reason:
             lines.append(f"**Failure Reason:** {evaluation.failure_reason}")
         else:
-            lines.append("| Field | Match | Score | Expected | Actual |")
-            lines.append("|-------|-------|-------|----------|--------|")
-
-            for critic_result in evaluation.results:
-                is_criticized = critic_result.get("is_criticized", True)
-                field = critic_result["field"]
-                score = critic_result["score"]
-                weight = critic_result["weight"]
-                expected = str(critic_result["expected"]).replace("|", "\\|")
-                actual = str(critic_result["actual"]).replace("|", "\\|")
-
-                # Truncate long values for table readability
-                expected = truncate_field_value(expected, MD_MAX_FIELD_LENGTH)
-                actual = truncate_field_value(actual, MD_MAX_FIELD_LENGTH)
-
-                if is_criticized:
-                    match_icon = "✅" if critic_result["match"] else "❌"
-                    lines.append(
-                        f"| {field} | {match_icon} | {score:.2f}/{weight:.2f} | `{expected}` | `{actual}` |"
-                    )
-                else:
-                    lines.append(f"| {field} | — | - | `{expected}` | `{actual}` |")
+            lines.extend(self._format_critic_results_table_md(evaluation.results))
 
         return "\n".join(lines)
 
+    def _format_critic_results_table_md(self, results: list[dict[str, Any]]) -> list[str]:
+        """Render per-field critic results as the lines of a Markdown table."""
+        lines: list[str] = [
+            "| Field | Match | Score | Expected | Actual |",
+            "|-------|-------|-------|----------|--------|",
+        ]
+        for critic_result in results:
+            # A literal pipe would split the cell, so escape it in every text column.
+            field = str(critic_result["field"]).replace("|", "\\|")
+            expected = str(critic_result["expected"]).replace("|", "\\|")
+            actual = str(critic_result["actual"]).replace("|", "\\|")
+
+            # Truncate long values for table readability
+            expected = truncate_field_value(expected, MD_MAX_FIELD_LENGTH)
+            actual = truncate_field_value(actual, MD_MAX_FIELD_LENGTH)
+
+            if critic_result.get("is_criticized", True):
+                match_icon = "✅" if critic_result["match"] else "❌"
+                score, weight = critic_result["score"], critic_result["weight"]
+                lines.append(
+                    f"| {field} | {match_icon} | {score:.2f}/{weight:.2f} | `{expected}` | `{actual}` |"
+                )
+            else:
+                lines.append(f"| {field} | — | - | `{expected}` | `{actual}` |")
+
+        return lines
+
+    def _format_critic_stats_summary(self, critic_stats: dict[str, Any]) -> list[str]:
+        """Render per-field critic statistics as the lines of a Markdown table."""
+        lines: list[str] = [
+            "**Critic Stats (normalized & weighted):**  ",
+            "| Field | Weight | Mean (norm %) | Std (norm %) | Mean (weighted %) | Std (weighted %) |",
+            "|-------|--------|---------------|--------------|-------------------|------------------|",
+        ]
+        for field, stats in critic_stats.items():
+            # Escape pipes so a field name cannot break the table row apart.
+            field_cell = str(field).replace("|", "\\|")
+            weight = stats.get("weight", 0.0)
+            mean_norm = stats.get("mean_score_normalized", 0.0) * 100
+            std_norm = stats.get("std_deviation_normalized", 0.0) * 100
+            mean_weighted = stats.get("mean_score", 0.0) * 100
+            std_weighted = stats.get("std_deviation", 0.0) * 100
+            lines.append(
+                f"| {field_cell} | {weight:.2f} | {mean_norm:.2f}% | {std_norm:.2f}% | "
+                f"{mean_weighted:.2f}% | {std_weighted:.2f}% |"
+            )
+        lines.append("")
+        return lines
+
+    def _format_run_stats_summary(self, run_stats: dict[str, Any] | None) -> list[str]:
+        """Format multi-run statistics as Markdown bullets (empty for fewer than 2 runs)."""
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return []
+        scores = run_stats.get("scores", [])
+        joined_scores = ", ".join(f"{value * 100:.2f}%" for value in scores)
+        summary: list[str] = [
+            "**Run Stats:**  ",
+            f"- Runs: {run_stats.get('num_runs', len(scores))}  ",
+            f"- Mean Score: {run_stats.get('mean_score', 0.0) * 100:.2f}%  ",
+            f"- Std Deviation: {run_stats.get('std_deviation', 0.0) * 100:.2f}%  ",
+        ]
+        # The remaining bullets are optional and appear only when they carry a value.
+        if joined_scores:
+            summary.append(f"- Scores: {joined_scores}  ")
+        seed_policy = run_stats.get("seed_policy")
+        if seed_policy:
+            summary.append(f"- Seed Policy: {seed_policy}  ")
+        run_seeds = run_stats.get("run_seeds")
+        # Seeds are listed only if at least one run actually has a seed.
+        if run_seeds and not all(seed is None for seed in run_seeds):
+            summary.append(f"- Run Seeds: {', '.join(map(str, run_seeds))}  ")
+        pass_rule = run_stats.get("pass_rule")
+        if pass_rule:
+            summary.append(f"- Pass Rule: {pass_rule}  ")
+        summary.append("")
+        return summary
+
+    def _format_run_details_md(self, run_stats: dict[str, Any] | None) -> list[str]:
+        """List each run's status and score as Markdown bullets.
+
+        Returns an empty list unless ``run_stats`` reports 2+ runs and has ``runs`` entries.
+        """
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return []
+        runs = run_stats.get("runs", [])
+        if not runs:
+            return []
+        output: list[str] = ["**Run Details:**  "]
+        for number, run in enumerate(runs, start=1):
+            # A passing run wins over a warning; anything else counts as failed.
+            if run.get("passed"):
+                status = "✅ PASSED"
+            else:
+                status = "⚠️ WARNED" if run.get("warning") else "❌ FAILED"
+            reason = run.get("failure_reason")
+            suffix = f" ({reason})" if reason else ""
+            score_pct = run.get("score", 0.0) * 100
+            output.append(f"- Run {number}: {status} — {score_pct:.2f}%{suffix}")
+            details = run.get("details", [])
+            if not details:
+                continue
+            # Per-run critic results sit in a collapsible block, set off by blank lines.
+            output.extend(
+                ["", "<details>", f"<summary>Run {number} details</summary>", ""]
+            )
+            output.extend(self._format_critic_results_table_md(details))
+            output.extend(["", "</details>"])
+        output.append("")
+        return output
+
     # =========================================================================
     # MULTI-MODEL EVALUATION FORMATTING
     # =========================================================================
@@ -371,7 +466,19 @@ class MarkdownFormatter(EvalResultFormatter):
 
                         lines.append(f"**{model}:** Score {evaluation.score * 100:.1f}%")
                         lines.append("")
-                        lines.append(self._format_evaluation_details(evaluation))
+                        run_stats = case_result.get("run_stats")
+                        lines.extend(self._format_run_stats_summary(run_stats))
+
+                        run_detail_lines = self._format_run_details_md(run_stats)
+                        lines.extend(run_detail_lines)
+
+                        critic_stats = case_result.get("critic_stats")
+                        if critic_stats:
+                            lines.extend(self._format_critic_stats_summary(critic_stats))
+                        # Only show the critic results table when there are no per-run
+                        # details (run details already include per-run field tables)
+                        if not run_detail_lines:
+                            lines.append(self._format_evaluation_details(evaluation))
                         lines.append("")
 
                     lines.append("---")
@@ -471,37 +578,16 @@ class MarkdownFormatter(EvalResultFormatter):
         lines.append(f"**Tracks compared:** {', '.join(f'`{t}`' for t in all_tracks)}")
         lines.append("")
 
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
-            lines.append("")
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {orig_total} |")
-            lines.append(f"| ✅ Passed | {orig_passed} |")
-            if orig_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {orig_warned} |")
-            lines.append(f"| ❌ Failed | {orig_failed} |")
-        else:
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {total_cases} |")
-            lines.append(f"| ✅ Passed | {total_passed} |")
-            if total_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {total_warned} |")
-            if total_failed > 0:
-                lines.append(f"| ❌ Failed | {total_failed} |")
-
-        # Pass rate
-        if total_cases > 0:
-            if failed_only and original_counts and original_counts[0] > 0:
-                pass_rate = (original_counts[1] / original_counts[0]) * 100
-            else:
-                pass_rate = (total_passed / total_cases) * 100
-            lines.append("")
-            lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
-        lines.append("")
+        lines.extend(
+            self._format_summary_table_md(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         # Results by model
         lines.append("## Results by Model")
@@ -522,77 +608,13 @@ class MarkdownFormatter(EvalResultFormatter):
 
                 # List all cases with summary comparison
                 for case_name, case_data in cases.items():
-                    # Context section (if include_context is True)
                     if include_context:
-                        system_msg = case_data.get("system_message")
-                        addl_msgs = case_data.get("additional_messages")
-                        if system_msg or addl_msgs:
-                            lines.append("<details>")
-                            lines.append("<summary>📋 <strong>Context</strong></summary>")
-                            lines.append("")
-                            if system_msg:
-                                lines.append(f"**System Message:** {system_msg}")
-                                lines.append("")
-                            if addl_msgs:
-                                lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**")
-                                lines.append("")
-                                for msg in addl_msgs:
-                                    role = msg.get("role", "unknown")
-                                    content = msg.get("content", "")
-                                    name = msg.get("name", "")
-                                    role_icons = {
-                                        "user": "👤",
-                                        "assistant": "🤖",
-                                        "tool": "🔧",
-                                        "system": "⚙️",
-                                    }
-                                    icon = role_icons.get(role, "💬")
-                                    label = (
-                                        f"{icon} **{role.title()}**"
-                                        if not name
-                                        else f"{icon} **{role.title()}** (`{name}`)"
-                                    )
-                                    lines.append(f"> {label}")
-                                    if content:
-                                        if role == "tool":
-                                            try:
-                                                import json
-
-                                                parsed = json.loads(content)
-                                                formatted = json.dumps(parsed, indent=2)
-                                                lines.append("> ```json")
-                                                for json_line in formatted.split("\n"):
-                                                    lines.append(f"> {json_line}")
-                                                lines.append("> ```")
-                                            except (json.JSONDecodeError, TypeError):
-                                                lines.append(f"> {content}")
-                                        else:
-                                            lines.append(f"> {content}")
-                                    tool_calls = msg.get("tool_calls", [])
-                                    if tool_calls:
-                                        for tc in tool_calls:
-                                            func = tc.get("function", {})
-                                            tc_name = func.get("name", "unknown")
-                                            tc_args = func.get("arguments", "{}")
-                                            lines.append(f"> 🔧 **{tc_name}**")
-                                            try:
-                                                import json
-
-                                                args_dict = (
-                                                    json.loads(tc_args)
-                                                    if isinstance(tc_args, str)
-                                                    else tc_args
-                                                )
-                                                formatted = json.dumps(args_dict, indent=2)
-                                                lines.append("> ```json")
-                                                for arg_line in formatted.split("\n"):
-                                                    lines.append(f"> {arg_line}")
-                                                lines.append("> ```")
-                                            except (json.JSONDecodeError, TypeError):
-                                                lines.append(f"> `{tc_args}`")
-                                    lines.append(">")
-                            lines.append("</details>")
-                            lines.append("")
+                        lines.extend(
+                            self._format_context_section_md(
+                                case_data.get("system_message"),
+                                case_data.get("additional_messages"),
+                            )
+                        )
 
                     lines.extend(
                         self._format_comparative_case(
@@ -647,37 +669,16 @@ class MarkdownFormatter(EvalResultFormatter):
         lines.append("## Summary")
         lines.append("")
 
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)")
-            lines.append("")
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {orig_total} |")
-            lines.append(f"| ✅ Passed | {orig_passed} |")
-            if orig_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {orig_warned} |")
-            lines.append(f"| ❌ Failed | {orig_failed} |")
-        else:
-            lines.append("| Metric | Count |")
-            lines.append("|--------|-------|")
-            lines.append(f"| **Total** | {total_cases} |")
-            lines.append(f"| ✅ Passed | {total_passed} |")
-            if total_warned > 0:
-                lines.append(f"| ⚠️ Warnings | {total_warned} |")
-            if total_failed > 0:
-                lines.append(f"| ❌ Failed | {total_failed} |")
-
-        # Pass rate
-        if total_cases > 0:
-            if failed_only and original_counts and original_counts[0] > 0:
-                pass_rate = (original_counts[1] / original_counts[0]) * 100
-            else:
-                pass_rate = (total_passed / total_cases) * 100
-            lines.append("")
-            lines.append(f"**Pass Rate:** {pass_rate:.1f}%")
-
-        lines.append("")
+        lines.extend(
+            self._format_summary_table_md(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         # Results grouped by case
         lines.append("## Results by Case")
@@ -705,77 +706,12 @@ class MarkdownFormatter(EvalResultFormatter):
 
                 # Context section (if include_context is True)
                 if include_context:
-                    system_msg = first_model_data.get("system_message")
-                    addl_msgs = first_model_data.get("additional_messages")
-                    if system_msg or addl_msgs:
-                        lines.append("<details>")
-                        lines.append("<summary>📋 <strong>Context</strong></summary>")
-                        lines.append("")
-                        if system_msg:
-                            lines.append(f"**System Message:** {system_msg}")
-                            lines.append("")
-                        if addl_msgs:
-                            lines.append(f"**💬 Conversation ({len(addl_msgs)} messages):**")
-                            lines.append("")
-                            for msg in addl_msgs:
-                                role = msg.get("role", "unknown")
-                                content = msg.get("content", "")
-                                name = msg.get("name", "")
-                                role_icons = {
-                                    "user": "👤",
-                                    "assistant": "🤖",
-                                    "tool": "🔧",
-                                    "system": "⚙️",
-                                }
-                                icon = role_icons.get(role, "💬")
-                                label = (
-                                    f"{icon} **{role.title()}**"
-                                    if not name
-                                    else f"{icon} **{role.title()}** (`{name}`)"
-                                )
-                                lines.append(f"> {label}")
-                                if content:
-                                    # For tool responses, format as JSON code block
-                                    if role == "tool":
-                                        try:
-                                            import json
-
-                                            parsed = json.loads(content)
-                                            formatted = json.dumps(parsed, indent=2)
-                                            lines.append("> ```json")
-                                            for json_line in formatted.split("\n"):
-                                                lines.append(f"> {json_line}")
-                                            lines.append("> ```")
-                                        except (json.JSONDecodeError, TypeError):
-                                            lines.append(f"> {content}")
-                                    else:
-                                        lines.append(f"> {content}")
-                                # Handle tool calls
-                                tool_calls = msg.get("tool_calls", [])
-                                if tool_calls:
-                                    for tc in tool_calls:
-                                        func = tc.get("function", {})
-                                        tc_name = func.get("name", "unknown")
-                                        tc_args = func.get("arguments", "{}")
-                                        lines.append(f"> 🔧 **{tc_name}**")
-                                        try:
-                                            import json
-
-                                            args_dict = (
-                                                json.loads(tc_args)
-                                                if isinstance(tc_args, str)
-                                                else tc_args
-                                            )
-                                            formatted = json.dumps(args_dict, indent=2)
-                                            lines.append("> ```json")
-                                            for arg_line in formatted.split("\n"):
-                                                lines.append(f"> {arg_line}")
-                                            lines.append("> ```")
-                                        except (json.JSONDecodeError, TypeError):
-                                            lines.append(f"> `{tc_args}`")
-                                lines.append(">")
-                        lines.append("</details>")
-                        lines.append("")
+                    lines.extend(
+                        self._format_context_section_md(
+                            first_model_data.get("system_message"),
+                            first_model_data.get("additional_messages"),
+                        )
+                    )
 
                 # Show each model's results for this case
                 for model in model_order:
@@ -876,7 +812,20 @@ class MarkdownFormatter(EvalResultFormatter):
                 lines.append("<details>")
                 lines.append(f"<summary>📋 <b>{track_name}</b> — Detailed Results</summary>")
                 lines.append("")
-                lines.append(self._format_evaluation_details(evaluation))
+                run_stats = track_result.get("run_stats")
+                lines.extend(self._format_run_stats_summary(run_stats))
+
+                run_detail_lines = self._format_run_details_md(run_stats)
+                lines.extend(run_detail_lines)
+
+                critic_stats = track_result.get("critic_stats")
+                if critic_stats:
+                    lines.extend(self._format_critic_stats_summary(critic_stats))
+
+                # Only show the critic results table when there are no per-run
+                # details (run details already include per-run field tables)
+                if not run_detail_lines:
+                    lines.append(self._format_evaluation_details(evaluation))
                 lines.append("")
                 lines.append("</details>")
                 lines.append("")
@@ -886,6 +835,81 @@ class MarkdownFormatter(EvalResultFormatter):
 
         return lines
 
+    def _format_summary_table_md(
+        self,
+        total_cases: int,
+        total_passed: int,
+        total_failed: int,
+        total_warned: int,
+        failed_only: bool,
+        original_counts: tuple[int, int, int, int] | None,
+    ) -> list[str]:
+        """Build the Markdown summary table plus the pass-rate line.
+
+        When ``failed_only`` is set and ``original_counts`` (total, passed, failed,
+        warned) is given, the table and pass rate use those counts and a note
+        states that only ``total_cases`` failed evaluations are shown; otherwise
+        the ``total_*`` arguments are used. The result always ends with "".
+        """
+        header = ["| Metric | Count |", "|--------|-------|"]
+        use_original = bool(failed_only and original_counts)
+        if use_original:
+            n_total, n_passed, n_failed, n_warned = original_counts
+            out = [f"> ⚠️ **Note:** Showing only {total_cases} failed evaluation(s)", ""]
+            show_failed_row = True  # the filtered view lists failures even when zero
+        else:
+            n_total, n_passed = total_cases, total_passed
+            n_failed, n_warned = total_failed, total_warned
+            out = []
+            show_failed_row = total_failed > 0
+        out += [*header, f"| **Total** | {n_total} |", f"| ✅ Passed | {n_passed} |"]
+        if n_warned > 0:
+            out.append(f"| ⚠️ Warnings | {n_warned} |")
+        if show_failed_row:
+            out.append(f"| ❌ Failed | {n_failed} |")
+
+        if total_cases > 0:
+            # Prefer the original totals for the rate, unless that total is zero.
+            if use_original and original_counts[0] > 0:
+                rate = (original_counts[1] / original_counts[0]) * 100
+            else:
+                rate = (total_passed / total_cases) * 100
+            out += ["", f"**Pass Rate:** {rate:.1f}%"]
+
+        out.append("")
+        return out
+
+    def _format_context_section_md(
+        self,
+        system_msg: str | None,
+        additional_messages: list[dict] | None,
+    ) -> list[str]:
+        """Render the system message and conversation inside a ``<details>`` block.
+
+        Args:
+            system_msg: System message text; its paragraph is skipped when falsy.
+            additional_messages: Conversation messages; rendered through
+                ``_format_conversation_md`` and skipped when falsy.
+
+        Returns:
+            Markdown lines for the collapsible section, or ``[]`` when both
+            inputs are falsy.
+        """
+        if not (system_msg or additional_messages):
+            return []
+        section: list[str] = [
+            "<details>",
+            "<summary>📋 <strong>Context</strong></summary>",
+            "",
+        ]
+        if system_msg:
+            section += [f"**System Message:** {system_msg}", ""]
+        if additional_messages:
+            section += [f"**💬 Conversation ({len(additional_messages)} messages):**", ""]
+            section.extend(self._format_conversation_md(additional_messages))
+        section += ["</details>", ""]
+        return section
+
     def _format_conversation_md(self, messages: list[dict]) -> list[str]:
         """Format conversation messages as Markdown for context display."""
         lines: list[str] = []
@@ -1003,7 +1027,25 @@ class CaptureMarkdownFormatter(CaptureFormatter):
                 lines.append("#### Tool Calls")
                 lines.append("")
 
-                if case.tool_calls:
+                runs = getattr(case, "runs", None)
+                if runs:
+                    for run_index, run in enumerate(runs, start=1):
+                        lines.append(f"**Run {run_index}**")
+                        lines.append("")
+                        if run.tool_calls:
+                            for tc in run.tool_calls:
+                                total_calls += 1
+                                lines.append(f"**`{tc.name}`**")
+                                if tc.args:
+                                    lines.append("")
+                                    lines.append("```json")
+                                    lines.append(json.dumps(tc.args, indent=2))
+                                    lines.append("```")
+                                lines.append("")
+                        else:
+                            lines.append("*No tool calls captured*")
+                            lines.append("")
+                elif case.tool_calls:
                     for tc in case.tool_calls:
                         total_calls += 1
                         lines.append(f"**`{tc.name}`**")
@@ -1104,7 +1146,11 @@ class CaptureMarkdownFormatter(CaptureFormatter):
                                 continue
 
                             captured_case = models_dict[model]
-                            if captured_case.tool_calls:
+                            runs = getattr(captured_case, "runs", None)
+                            if runs:
+                                tool_names = f"{len(runs)} run(s)"
+                                total_calls += sum(len(run.tool_calls) for run in runs)
+                            elif captured_case.tool_calls:
                                 tool_names = ", ".join(
                                     f"`{tc.name}`" for tc in captured_case.tool_calls
                                 )
@@ -1121,21 +1167,39 @@ class CaptureMarkdownFormatter(CaptureFormatter):
                                 continue
 
                             captured_case = models_dict[model]
-                            if not captured_case.tool_calls:
+                            runs = getattr(captured_case, "runs", None)
+                            if not runs and not captured_case.tool_calls:
                                 continue
 
                             lines.append("<details>")
                             lines.append(f"<summary>🤖 {model} - Details</summary>")
                             lines.append("")
 
-                            for tc in captured_case.tool_calls:
-                                lines.append(f"**`{tc.name}`**")
-                                if tc.args:
+                            if runs:
+                                for run_index, run in enumerate(runs, start=1):
+                                    lines.append(f"**Run {run_index}**")
+                                    lines.append("")
+                                    if run.tool_calls:
+                                        for tc in run.tool_calls:
+                                            lines.append(f"**`{tc.name}`**")
+                                            if tc.args:
+                                                lines.append("")
+                                                lines.append("```json")
+                                                lines.append(json.dumps(tc.args, indent=2))
+                                                lines.append("```")
+                                            lines.append("")
+                                    else:
+                                        lines.append("*No tool calls captured*")
+                                        lines.append("")
+                            else:
+                                for tc in captured_case.tool_calls:
+                                    lines.append(f"**`{tc.name}`**")
+                                    if tc.args:
+                                        lines.append("")
+                                        lines.append("```json")
+                                        lines.append(json.dumps(tc.args, indent=2))
+                                        lines.append("```")
                                     lines.append("")
-                                    lines.append("```json")
-                                    lines.append(json.dumps(tc.args, indent=2))
-                                    lines.append("```")
-                                lines.append("")
 
                             lines.append("</details>")
                             lines.append("")
@@ -1160,7 +1224,11 @@ class CaptureMarkdownFormatter(CaptureFormatter):
                             continue
 
                         captured_case = models_dict[model]
-                        if captured_case.tool_calls:
+                        runs = getattr(captured_case, "runs", None)
+                        if runs:
+                            tool_names = f"{len(runs)} run(s)"
+                            total_calls += sum(len(run.tool_calls) for run in runs)
+                        elif captured_case.tool_calls:
                             tool_names = ", ".join(
                                 f"`{tc.name}`" for tc in captured_case.tool_calls
                             )
@@ -1177,21 +1245,39 @@ class CaptureMarkdownFormatter(CaptureFormatter):
                             continue
 
                         captured_case = models_dict[model]
-                        if not captured_case.tool_calls:
+                        runs = getattr(captured_case, "runs", None)
+                        if not runs and not captured_case.tool_calls:
                             continue
 
                         lines.append("<details>")
                         lines.append(f"<summary>🤖 <b>{model}</b> - Tool Call Details</summary>")
                         lines.append("")
 
-                        for tc in captured_case.tool_calls:
-                            lines.append(f"**`{tc.name}`**")
-                            if tc.args:
+                        if runs:
+                            for run_index, run in enumerate(runs, start=1):
+                                lines.append(f"**Run {run_index}**")
+                                lines.append("")
+                                if run.tool_calls:
+                                    for tc in run.tool_calls:
+                                        lines.append(f"**`{tc.name}`**")
+                                        if tc.args:
+                                            lines.append("")
+                                            lines.append("```json")
+                                            lines.append(json.dumps(tc.args, indent=2))
+                                            lines.append("```")
+                                        lines.append("")
+                                else:
+                                    lines.append("*No tool calls captured*")
+                                    lines.append("")
+                        else:
+                            for tc in captured_case.tool_calls:
+                                lines.append(f"**`{tc.name}`**")
+                                if tc.args:
+                                    lines.append("")
+                                    lines.append("```json")
+                                    lines.append(json.dumps(tc.args, indent=2))
+                                    lines.append("```")
                                 lines.append("")
-                                lines.append("```json")
-                                lines.append(json.dumps(tc.args, indent=2))
-                                lines.append("```")
-                            lines.append("")
 
                         lines.append("</details>")
                         lines.append("")
diff --git a/libs/arcade-cli/arcade_cli/formatters/text.py b/libs/arcade-cli/arcade_cli/formatters/text.py
index 88bf3bc0..e8c5bcbc 100644
--- a/libs/arcade-cli/arcade_cli/formatters/text.py
+++ b/libs/arcade-cli/arcade_cli/formatters/text.py
@@ -91,7 +91,14 @@ class TextFormatter(EvalResultFormatter):
                         status = "FAILED"
 
                     score_percentage = evaluation.score * 100
-                    lines.append(f"    {status} {case['name']} -- Score: {score_percentage:.2f}%")
+                    run_stats = case.get("run_stats") or {}
+                    stats_suffix = ""
+                    if run_stats.get("num_runs", 1) > 1:
+                        std_pct = run_stats.get("std_deviation", 0.0) * 100
+                        stats_suffix = f" (n={run_stats['num_runs']}, sd={std_pct:.2f}%)"
+                    lines.append(
+                        f"    {status} {case['name']} -- Score: {score_percentage:.2f}%{stats_suffix}"
+                    )
 
                     if show_details:
                         lines.append(f"    User Input: {case['input']}")
@@ -112,6 +119,10 @@ class TextFormatter(EvalResultFormatter):
                                 lines.append("")
 
                         lines.append("    Details:")
+                        for stat_line in self._format_run_stats(case):
+                            lines.append(f"    {stat_line}")
+                        for stat_line in self._format_critic_stats(case):
+                            lines.append(f"    {stat_line}")
                         for detail_line in self._format_evaluation(evaluation).split("\n"):
                             lines.append(f"    {detail_line}")
                         lines.append("    " + "-" * 52)
@@ -121,23 +132,16 @@ class TextFormatter(EvalResultFormatter):
             lines.append("")
 
         # Summary
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
-            summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
-            if orig_warned > 0:
-                summary += f" -- Warnings: {orig_warned}"
-            if orig_failed > 0:
-                summary += f" -- Failed: {orig_failed}"
-        else:
-            summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
-            if total_warned > 0:
-                summary += f" -- Warnings: {total_warned}"
-            if total_failed > 0:
-                summary += f" -- Failed: {total_failed}"
-
-        lines.append(summary)
-        lines.append("")
+        lines.extend(
+            self._format_summary_lines(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         return "\n".join(lines)
 
@@ -169,6 +173,70 @@ class TextFormatter(EvalResultFormatter):
                     )
         return "\n".join(result_lines)
 
+    def _format_run_stats(self, case: dict[str, Any]) -> list[str]:
+        run_stats = case.get("run_stats")
+        if not run_stats or run_stats.get("num_runs", 1) < 2:
+            return []
+        scores = run_stats.get("scores", [])
+        scores_display = ", ".join(f"{score * 100:.2f}%" for score in scores)
+        mean_pct = run_stats.get("mean_score", 0.0) * 100
+        std_pct = run_stats.get("std_deviation", 0.0) * 100
+        lines = [
+            "Run Stats:",
+            f"  Runs: {run_stats.get('num_runs', len(scores))}",
+            f"  Mean Score: {mean_pct:.2f}%",
+            f"  Std Deviation: {std_pct:.2f}%",
+        ]
+        if scores_display:
+            lines.append(f"  Scores: {scores_display}")
+        seed_policy = run_stats.get("seed_policy")
+        run_seeds = run_stats.get("run_seeds")
+        if seed_policy:
+            lines.append(f"  Seed Policy: {seed_policy}")
+        if run_seeds and any(seed is not None for seed in run_seeds):
+            seeds_display = ", ".join(str(seed) for seed in run_seeds)
+            lines.append(f"  Run Seeds: {seeds_display}")
+        pass_rule = run_stats.get("pass_rule")
+        if pass_rule:
+            lines.append(f"  Pass Rule: {pass_rule}")
+
+        runs = run_stats.get("runs", [])
+        if runs:
+            lines.append("  Run Results:")
+            for idx, run in enumerate(runs, start=1):
+                if run.get("passed"):
+                    status = "PASSED"
+                elif run.get("warning"):
+                    status = "WARNED"
+                else:
+                    status = "FAILED"
+                score_pct = run.get("score", 0.0) * 100
+                run_line = f"    Run {idx}: {status} ({score_pct:.2f}%)"
+                failure_reason = run.get("failure_reason")
+                if failure_reason:
+                    run_line += f" -- {failure_reason}"
+                lines.append(run_line)
+        lines.append("")
+        return lines
+
+    def _format_critic_stats(self, case: dict[str, Any]) -> list[str]:
+        critic_stats = case.get("critic_stats")
+        if not critic_stats:
+            return []
+        lines = ["Critic Stats:"]
+        for field, stats in critic_stats.items():
+            weight = stats.get("weight", 0.0)
+            mean_norm = stats.get("mean_score_normalized", 0.0) * 100
+            std_norm = stats.get("std_deviation_normalized", 0.0) * 100
+            mean_weighted = stats.get("mean_score", 0.0) * 100
+            std_weighted = stats.get("std_deviation", 0.0) * 100
+            lines.append(
+                f"  {field}: norm {mean_norm:.2f}% ± {std_norm:.2f}% | "
+                f"weighted {mean_weighted:.2f}% ± {std_weighted:.2f}% (w={weight:.2f})"
+            )
+        lines.append("")
+        return lines
+
     # =========================================================================
     # MULTI-MODEL EVALUATION FORMATTING
     # =========================================================================
@@ -312,6 +380,11 @@ class TextFormatter(EvalResultFormatter):
 
                         lines.append(f"    [{model}] Score: {evaluation.score * 100:.1f}%")
 
+                        for stat_line in self._format_run_stats(case_result):
+                            lines.append(f"      {stat_line}")
+                        for stat_line in self._format_critic_stats(case_result):
+                            lines.append(f"      {stat_line}")
+
                         # Show evaluation details indented
                         eval_details = self._format_evaluation(evaluation)
                         for line in eval_details.split("\n"):
@@ -420,60 +493,13 @@ class TextFormatter(EvalResultFormatter):
                 lines.append("  " + "-" * 72)
 
                 for case_name, case_data in cases.items():
-                    # Context section (if include_context is True)
                     if include_context:
-                        system_msg = case_data.get("system_message")
-                        addl_msgs = case_data.get("additional_messages")
-                        if system_msg or addl_msgs:
-                            lines.append("  " + "-" * 40)
-                            lines.append("  📋 CONTEXT")
-                            lines.append("  " + "-" * 40)
-                            if system_msg:
-                                lines.append(f"  System Message: {system_msg}")
-                            if addl_msgs:
-                                lines.append(f"  💬 Conversation ({len(addl_msgs)} messages):")
-                                for msg in addl_msgs:
-                                    role = msg.get("role", "unknown").upper()
-                                    content = msg.get("content", "")
-                                    name = msg.get("name", "")
-                                    role_label = f"[{role}]" if not name else f"[{role}: {name}]"
-                                    lines.append(f"    {role_label}")
-                                    if content:
-                                        # For tool responses, try to format JSON
-                                        if role.lower() == "tool":
-                                            try:
-                                                import json
-
-                                                parsed = json.loads(content)
-                                                formatted = json.dumps(parsed, indent=2)
-                                                for json_line in formatted.split("\n"):
-                                                    lines.append(f"      {json_line}")
-                                            except (json.JSONDecodeError, TypeError):
-                                                lines.append(f"      {content}")
-                                        else:
-                                            lines.append(f"      {content}")
-                                    # Handle tool calls
-                                    tool_calls = msg.get("tool_calls", [])
-                                    if tool_calls:
-                                        for tc in tool_calls:
-                                            func = tc.get("function", {})
-                                            tc_name = func.get("name", "unknown")
-                                            tc_args = func.get("arguments", "{}")
-                                            lines.append(f"      🔧 {tc_name}")
-                                            try:
-                                                import json
-
-                                                args_dict = (
-                                                    json.loads(tc_args)
-                                                    if isinstance(tc_args, str)
-                                                    else tc_args
-                                                )
-                                                formatted = json.dumps(args_dict, indent=2)
-                                                for arg_line in formatted.split("\n"):
-                                                    lines.append(f"        {arg_line}")
-                                            except (json.JSONDecodeError, TypeError):
-                                                lines.append(f"        {tc_args}")
-                            lines.append("  " + "-" * 40)
+                        lines.extend(
+                            self._format_context_block(
+                                case_data.get("system_message"),
+                                case_data.get("additional_messages"),
+                            )
+                        )
 
                     lines.extend(
                         self._format_comparative_case_text(
@@ -484,23 +510,16 @@ class TextFormatter(EvalResultFormatter):
             lines.append("")
 
         # Summary
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
-            summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
-            if orig_warned > 0:
-                summary += f" -- Warnings: {orig_warned}"
-            if orig_failed > 0:
-                summary += f" -- Failed: {orig_failed}"
-        else:
-            summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
-            if total_warned > 0:
-                summary += f" -- Warnings: {total_warned}"
-            if total_failed > 0:
-                summary += f" -- Failed: {total_failed}"
-
-        lines.append(summary)
-        lines.append("")
+        lines.extend(
+            self._format_summary_lines(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         return "\n".join(lines)
 
@@ -563,61 +582,14 @@ class TextFormatter(EvalResultFormatter):
                 if case_input:
                     lines.append(f"  Input: {case_input}")
 
-                # Context section (if include_context is True)
                 if include_context:
-                    system_msg = first_model_data.get("system_message")
-                    addl_msgs = first_model_data.get("additional_messages")
-                    if system_msg or addl_msgs:
+                    context_lines = self._format_context_block(
+                        first_model_data.get("system_message"),
+                        first_model_data.get("additional_messages"),
+                    )
+                    if context_lines:
                         lines.append("")
-                        lines.append("  " + "-" * 40)
-                        lines.append("  📋 CONTEXT")
-                        lines.append("  " + "-" * 40)
-                        if system_msg:
-                            lines.append(f"  System Message: {system_msg}")
-                        if addl_msgs:
-                            lines.append(f"  💬 Conversation ({len(addl_msgs)} messages):")
-                            for msg in addl_msgs:
-                                role = msg.get("role", "unknown").upper()
-                                content = msg.get("content", "")
-                                name = msg.get("name", "")
-                                role_label = f"[{role}]" if not name else f"[{role}: {name}]"
-                                lines.append(f"    {role_label}")
-                                if content:
-                                    # For tool responses, try to format JSON
-                                    if role.lower() == "tool":
-                                        try:
-                                            import json
-
-                                            parsed = json.loads(content)
-                                            formatted = json.dumps(parsed, indent=2)
-                                            for json_line in formatted.split("\n"):
-                                                lines.append(f"      {json_line}")
-                                        except (json.JSONDecodeError, TypeError):
-                                            lines.append(f"      {content}")
-                                    else:
-                                        lines.append(f"      {content}")
-                                # Handle tool calls in assistant messages
-                                tool_calls = msg.get("tool_calls", [])
-                                if tool_calls:
-                                    for tc in tool_calls:
-                                        func = tc.get("function", {})
-                                        tc_name = func.get("name", "unknown")
-                                        tc_args = func.get("arguments", "{}")
-                                        lines.append(f"      🔧 {tc_name}")
-                                        try:
-                                            import json
-
-                                            args_dict = (
-                                                json.loads(tc_args)
-                                                if isinstance(tc_args, str)
-                                                else tc_args
-                                            )
-                                            formatted = json.dumps(args_dict, indent=2)
-                                            for arg_line in formatted.split("\n"):
-                                                lines.append(f"        {arg_line}")
-                                        except (json.JSONDecodeError, TypeError):
-                                            lines.append(f"        {tc_args}")
-                        lines.append("  " + "-" * 40)
+                        lines.extend(context_lines)
 
                 lines.append("")
 
@@ -643,23 +615,16 @@ class TextFormatter(EvalResultFormatter):
 
         # Summary
         lines.append("=" * 78)
-        if failed_only and original_counts:
-            orig_total, orig_passed, orig_failed, orig_warned = original_counts
-            lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
-            summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
-            if orig_warned > 0:
-                summary += f" -- Warnings: {orig_warned}"
-            if orig_failed > 0:
-                summary += f" -- Failed: {orig_failed}"
-        else:
-            summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
-            if total_warned > 0:
-                summary += f" -- Warnings: {total_warned}"
-            if total_failed > 0:
-                summary += f" -- Failed: {total_failed}"
-
-        lines.append(summary)
-        lines.append("")
+        lines.extend(
+            self._format_summary_lines(
+                total_cases,
+                total_passed,
+                total_failed,
+                total_warned,
+                failed_only,
+                original_counts,
+            )
+        )
 
         return "\n".join(lines)
 
@@ -753,12 +718,76 @@ class TextFormatter(EvalResultFormatter):
                     continue
 
                 lines.append(f"    [{track_name}] Details:")
+                for stat_line in self._format_run_stats(track_result):
+                    lines.append(f"      {stat_line}")
+                for stat_line in self._format_critic_stats(track_result):
+                    lines.append(f"      {stat_line}")
                 for detail_line in self._format_evaluation(evaluation).split("\n"):
                     lines.append(f"      {detail_line}")
                 lines.append("")
 
         return lines
 
+    def _format_summary_lines(
+        self,
+        total_cases: int,
+        total_passed: int,
+        total_failed: int,
+        total_warned: int,
+        failed_only: bool,
+        original_counts: tuple[int, int, int, int] | None,
+    ) -> list[str]:
+        """Build the summary lines used by regular and comparative formatters."""
+        lines: list[str] = []
+        if failed_only and original_counts:
+            orig_total, orig_passed, orig_failed, orig_warned = original_counts
+            lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
+            summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
+            if orig_warned > 0:
+                summary += f" -- Warnings: {orig_warned}"
+            if orig_failed > 0:
+                summary += f" -- Failed: {orig_failed}"
+        else:
+            summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
+            if total_warned > 0:
+                summary += f" -- Warnings: {total_warned}"
+            if total_failed > 0:
+                summary += f" -- Failed: {total_failed}"
+        lines.append(summary)
+        lines.append("")
+        return lines
+
+    def _format_context_block(
+        self,
+        system_msg: str | None,
+        additional_messages: list[dict] | None,
+        indent: str = "  ",
+    ) -> list[str]:
+        """Build the context section lines for comparative display.
+
+        Args:
+            system_msg: The system message, if any.
+            additional_messages: Conversation messages, if any.
+            indent: Base indentation prefix for each line.
+
+        Returns:
+            List of formatted lines (empty if no context data).
+        """
+        if not system_msg and not additional_messages:
+            return []
+        lines: list[str] = []
+        lines.append(indent + "-" * 40)
+        lines.append(indent + "📋 CONTEXT")
+        lines.append(indent + "-" * 40)
+        if system_msg:
+            lines.append(f"{indent}System Message: {system_msg}")
+        if additional_messages:
+            lines.append(f"{indent}💬 Conversation ({len(additional_messages)} messages):")
+            for conv_line in self._format_conversation_text(additional_messages):
+                lines.append(f"{indent}{conv_line}")
+        lines.append(indent + "-" * 40)
+        return lines
+
     def _format_conversation_text(self, messages: list[dict]) -> list[str]:
         """Format conversation messages as plain text for context display."""
         lines: list[str] = []
@@ -858,7 +887,22 @@ class CaptureTextFormatter(CaptureFormatter):
 
                 lines.append("")
                 lines.append("  Tool Calls:")
-                if case.tool_calls:
+                runs = getattr(case, "runs", None)
+                if runs:
+                    for run_index, run in enumerate(runs, start=1):
+                        lines.append(f"    Run {run_index}:")
+                        if run.tool_calls:
+                            for tc in run.tool_calls:
+                                total_calls += 1
+                                lines.append(f"      - {tc.name}")
+                                if tc.args:
+                                    for key, value in tc.args.items():
+                                        lines.append(
+                                            f"          {key}: {self._format_value(value)}"
+                                        )
+                        else:
+                            lines.append("      (no tool calls)")
+                elif case.tool_calls:
                     for tc in case.tool_calls:
                         total_calls += 1
                         lines.append(f"    - {tc.name}")
@@ -949,7 +993,21 @@ class CaptureTextFormatter(CaptureFormatter):
                             captured_case = models_dict[model]
                             lines.append(f"  │   [{model}]")
 
-                            if captured_case.tool_calls:
+                            runs = getattr(captured_case, "runs", None)
+                            if runs:
+                                for run_index, run in enumerate(runs, start=1):
+                                    lines.append(f"  │     Run {run_index}:")
+                                    if run.tool_calls:
+                                        for tc in run.tool_calls:
+                                            lines.append(f"  │       - {tc.name}")
+                                            if tc.args:
+                                                for key, value in tc.args.items():
+                                                    lines.append(
+                                                        f"  │           {key}: {self._format_value(value)}"
+                                                    )
+                                    else:
+                                        lines.append("  │       (no tool calls)")
+                            elif captured_case.tool_calls:
                                 for tc in captured_case.tool_calls:
                                     lines.append(f"  │     - {tc.name}")
                                     if tc.args:
@@ -980,7 +1038,21 @@ class CaptureTextFormatter(CaptureFormatter):
                         captured_case = models_dict[model]
                         lines.append(f"    [{model}]")
 
-                        if captured_case.tool_calls:
+                        runs = getattr(captured_case, "runs", None)
+                        if runs:
+                            for run_index, run in enumerate(runs, start=1):
+                                lines.append(f"      Run {run_index}:")
+                                if run.tool_calls:
+                                    for tc in run.tool_calls:
+                                        lines.append(f"        - {tc.name}")
+                                        if tc.args:
+                                            for key, value in tc.args.items():
+                                                lines.append(
+                                                    f"          {key}: {self._format_value(value)}"
+                                                )
+                                else:
+                                    lines.append("        (no tool calls)")
+                        elif captured_case.tool_calls:
                             for tc in captured_case.tool_calls:
                                 lines.append(f"      - {tc.name}")
                                 if tc.args:
diff --git a/libs/arcade-cli/arcade_cli/main.py b/libs/arcade-cli/arcade_cli/main.py
index 78b52b3a..d39413c7 100644
--- a/libs/arcade-cli/arcade_cli/main.py
+++ b/libs/arcade-cli/arcade_cli/main.py
@@ -405,13 +405,29 @@ def evals(
         "-c",
         help="Maximum number of concurrent evaluations (default: 1)",
     ),
-    use_provider: Optional[str] = typer.Option(
+    num_runs: int = typer.Option(
+        1,
+        "--num-runs",
+        "-n",
+        help="Number of runs per case (default: 1).",
+    ),
+    seed: str = typer.Option(
+        "constant",
+        "--seed",
+        help="Seed policy for OpenAI runs (ignored for Anthropic): "
+        "'constant' (default), 'random', or an integer.",
+    ),
+    multi_run_pass_rule: str = typer.Option(
+        "last",
+        "--multi-run-pass-rule",
+        help="Pass/fail aggregation for multi-run cases: 'last' (default), 'mean', or 'majority'.",
+    ),
+    use_provider: Optional[list[str]] = typer.Option(
         None,
         "--use-provider",
         "-p",
         help="Provider(s) and models to use. Format: 'provider' or 'provider:model1,model2'. "
-        "Multiple providers: separate with spaces. "
-        "Examples: 'openai' or 'openai:gpt-4o anthropic:claude-sonnet-4-5-20250929'",
+        "Can be repeated. Examples: --use-provider openai or --use-provider openai:gpt-4o --use-provider anthropic:claude-sonnet-4-5-20250929",
     ),
     api_key: Optional[list[str]] = typer.Option(
         None,
@@ -476,6 +492,39 @@ def evals(
         pip_install_command=r"pip install arcade-tdk",
     )
 
+    # --- Validate multi-run parameters upfront (before any API calls) ---
+    if num_runs < 1:
+        handle_cli_error("--num-runs must be >= 1", should_exit=True)
+        return
+
+    seed_value: str | int
+    seed_lower = seed.strip().lower()
+    if seed_lower in {"constant", "random"}:
+        seed_value = seed_lower
+    else:
+        try:
+            seed_value = int(seed)
+        except ValueError:
+            handle_cli_error(
+                "Invalid --seed value. Use 'constant', 'random', or an integer.", should_exit=True
+            )
+            return
+        if seed_value < 0:
+            handle_cli_error("--seed must be a non-negative integer.", should_exit=True)
+            return
+
+    pass_rule = multi_run_pass_rule.strip().lower()
+    # Lazy import: arcade_evals requires optional deps (openai) that aren't
+    # available when the CLI is installed without the [evals] extra.
+    from arcade_evals._evalsuite._types import _VALID_PASS_RULES
+
+    if pass_rule not in _VALID_PASS_RULES:
+        handle_cli_error(
+            f"Invalid --multi-run-pass-rule. Valid values: {', '.join(sorted(_VALID_PASS_RULES))}.",
+            should_exit=True,
+        )
+        return
+
     # --- Build model specs from flags ---
     model_specs: list[ModelSpec] = []
 
@@ -483,11 +532,10 @@ def evals(
     api_keys = resolve_provider_api_keys(api_keys_specs=api_key)
 
     if use_provider:
-        # Parse provider specs - supports space-separated values
-        # e.g., "openai:gpt-4o anthropic:claude"
-        provider_specs = use_provider.split()
+        # Parse provider specs - supports multiple --use-provider flags
+        # e.g., --use-provider openai:gpt-4o --use-provider anthropic:claude
         try:
-            provider_configs = [parse_provider_spec(spec) for spec in provider_specs]
+            provider_configs = [parse_provider_spec(spec) for spec in use_provider]
         except ValueError as e:
             handle_cli_error(str(e), should_exit=True)
             return  # For type checker
@@ -594,6 +642,8 @@ def evals(
                     output_file=final_output_file,
                     output_format=",".join(final_output_formats) if final_output_formats else "txt",
                     console=console,
+                    num_runs=num_runs,
+                    seed=seed_value,
                 )
             )
         else:
@@ -608,6 +658,9 @@ def evals(
                     failed_only=only_failed,
                     include_context=include_context,
                     console=console,
+                    num_runs=num_runs,
+                    seed=seed_value,
+                    multi_run_pass_rule=pass_rule,
                 )
             )
     except Exception as e:
diff --git a/libs/arcade-evals/README.md b/libs/arcade-evals/README.md
index 97ec57c4..79b89b21 100644
--- a/libs/arcade-evals/README.md
+++ b/libs/arcade-evals/README.md
@@ -9,7 +9,10 @@ Arcade Evals provides comprehensive evaluation capabilities for Arcade tools:
 - **Evaluation Framework**: Cases, suites, and rubrics for systematic testing
 - **Critics**: Different types of comparisons (binary, numeric, similarity, datetime)
 - **Tool Evaluation**: Decorators and utilities for evaluating tool performance
-- **Result Analysis**: Comprehensive evaluation results and reporting
+- **Multi-Run Statistics**: Run each case multiple times with configurable seed policies and pass rules to measure consistency
+- **Comparative Evaluation**: Compare tool performance across multiple sources/tracks side-by-side
+- **Capture Mode**: Record model tool calls without scoring for debugging and baseline generation
+- **Result Analysis**: Comprehensive evaluation results and reporting in multiple formats (text, markdown, HTML, JSON)
 
 ## Installation
 
@@ -81,6 +84,31 @@ rubric = EvalRubric(
 suite = EvalSuite(cases=[case1], rubric=rubric)
 ```
 
+### Multi-Run Evaluation
+
+Run each case multiple times to measure consistency:
+
+```python
+# Run via the CLI
+# arcade evals eval_file.py --num-runs 5 --seed random --multi-run-pass-rule majority
+
+# Or programmatically
+result = await suite.run(
+    client,
+    model="gpt-4o",
+    num_runs=5,            # Run each case 5 times
+    seed="random",         # Different seed per run
+    multi_run_pass_rule="majority",  # Pass if >50% of runs pass
+)
+```
+
+Multi-run results include per-case statistics:
+- **Mean score** and **standard deviation** across runs
+- **Per-run status** (passed, warned, or failed) with individual scores and any failure reason
+- **Per-critic field** score breakdowns across runs (mean ± standard deviation, both normalized and weighted, plus the critic weight)
+- Configurable **pass rules**: `last` (default), `mean`, or `majority`
+- Configurable **seed policies**: `constant` (default; fixed seed 42), `random` (a new seed for each run), or a specific integer — seeds apply to OpenAI runs only and are ignored for Anthropic
+
 ## License
 
 MIT License - see LICENSE file for details.
diff --git a/libs/arcade-evals/arcade_evals/__init__.py b/libs/arcade-evals/arcade_evals/__init__.py
index 83d1c092..2531b19a 100644
--- a/libs/arcade-evals/arcade_evals/__init__.py
+++ b/libs/arcade-evals/arcade_evals/__init__.py
@@ -1,6 +1,6 @@
 from ._evalsuite._providers import ProviderName
 from ._evalsuite._tool_registry import MCPToolDefinition
-from .capture import CapturedCase, CapturedToolCall, CaptureResult
+from .capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult
 from .critic import BinaryCritic, DatetimeCritic, NoneCritic, NumericCritic, SimilarityCritic
 from .eval import (
     AnyExpectedToolCall,
@@ -25,6 +25,7 @@ __all__ = [
     "BinaryCritic",
     "CaptureResult",
     "CapturedCase",
+    "CapturedRun",
     "CapturedToolCall",
     "DatetimeCritic",
     "EvalRubric",
@@ -41,8 +42,8 @@ __all__ = [
     "Weight",
     "clear_tools_cache",
     "load_arcade_mcp_gateway_async",
-    "load_mcp_remote_async",
     "load_from_stdio_async",
+    "load_mcp_remote_async",
     "load_stdio_arcade_async",
     "tool_eval",
     "validate_and_normalize_critic_weights",
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
index 711f9e8e..69b2079b 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_capture.py
@@ -7,9 +7,11 @@ keeping it separate from the main evaluation logic in eval.py.
 from __future__ import annotations
 
 import asyncio
+import random
 from typing import TYPE_CHECKING, Any
 
-from arcade_evals.capture import CapturedCase, CapturedToolCall, CaptureResult
+from arcade_evals._evalsuite._types import _resolve_seed_spec
+from arcade_evals.capture import CapturedCase, CapturedRun, CapturedToolCall, CaptureResult
 
 if TYPE_CHECKING:
     from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder
@@ -39,6 +41,7 @@ class _EvalSuiteCaptureMixin:
         model: str,
         case: EvalCase,
         registry: EvalSuiteToolRegistry | None = None,
+        seed: int | None = None,
     ) -> list[tuple[str, dict[str, Any]]]:
         raise NotImplementedError  # Implemented in EvalSuite
 
@@ -67,6 +70,8 @@ class _EvalSuiteCaptureMixin:
         model: str,
         provider: ProviderName = "openai",
         include_context: bool = False,
+        num_runs: int = 1,
+        seed: str | int | None = "constant",
     ) -> CaptureResult:
         """
         Run the evaluation suite in capture mode - records tool calls without scoring.
@@ -86,10 +91,15 @@ class _EvalSuiteCaptureMixin:
             provider: The provider name ("openai" or "anthropic").
             include_context: Whether to include system_message and additional_messages
                            in the output.
+            num_runs: Number of runs per case.
+            seed: Seed policy ("constant", "random", or an integer seed).
 
         Returns:
             A CaptureResult containing all captured tool calls.
         """
+        if num_runs < 1:
+            raise ValueError("num_runs must be >= 1")
+
         all_captured: list[CapturedCase] = []
         semaphore = asyncio.Semaphore(self.max_concurrent)
 
@@ -106,34 +116,54 @@ class _EvalSuiteCaptureMixin:
                         "No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog."
                     )
 
-                # Get tool calls based on provider
-                if provider == "anthropic":
-                    predicted_args = await self._run_anthropic(
-                        client, model, case, registry=use_registry
-                    )
+                seed_policy, seed_value = _resolve_seed_spec(seed)
+                if provider == "openai":
+                    if seed_policy == "random":
+                        run_seeds: list[int | None] = [
+                            random.randint(0, 2**31 - 1)  # noqa: S311
+                            for _ in range(num_runs)
+                        ]
+                    else:
+                        run_seeds = [seed_value for _ in range(num_runs)]
                 else:
-                    predicted_args = await self._run_openai(
-                        client, model, case, registry=use_registry
+                    run_seeds = [None for _ in range(num_runs)]
+
+                runs: list[CapturedRun] = []
+                for run_index in range(num_runs):
+                    run_seed = run_seeds[run_index]
+                    # Get tool calls based on provider
+                    if provider == "anthropic":
+                        predicted_args = await self._run_anthropic(
+                            client, model, case, registry=use_registry
+                        )
+                    else:
+                        predicted_args = await self._run_openai(
+                            client, model, case, registry=use_registry, seed=run_seed
+                        )
+
+                    # Process tool calls (resolve names, fill defaults)
+                    filled_actual_tool_calls = self._process_tool_calls(
+                        predicted_args, registry=use_registry
                     )
 
-                # Process tool calls (resolve names, fill defaults)
-                filled_actual_tool_calls = self._process_tool_calls(
-                    predicted_args, registry=use_registry
-                )
+                    # Convert to CapturedToolCall objects
+                    tool_calls = [
+                        CapturedToolCall(name=name, args=args)
+                        for name, args in filled_actual_tool_calls
+                    ]
 
-                # Convert to CapturedToolCall objects
-                tool_calls = [
-                    CapturedToolCall(name=name, args=args)
-                    for name, args in filled_actual_tool_calls
-                ]
+                    runs.append(CapturedRun(tool_calls=tool_calls))
+
+                primary_tool_calls = runs[0].tool_calls if runs else []
 
                 return CapturedCase(
                     case_name=case.name,
                     user_message=case.user_message,
-                    tool_calls=tool_calls,
+                    tool_calls=primary_tool_calls,
                     system_message=case.system_message if include_context else None,
                     additional_messages=case.additional_messages if include_context else None,
                     track_name=track,
+                    runs=runs if len(runs) > 1 else [],
                 )
 
         # Capture regular cases (using default registry)
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
index fc2027e1..66f3f71e 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative.py
@@ -6,6 +6,7 @@ multiple tool tracks with track-specific expected results and critics.
 
 from __future__ import annotations
 
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any
 
 from arcade_evals._evalsuite._types import (
@@ -45,7 +46,7 @@ class ComparativeCaseBuilder:
         name: str,
         user_message: str,
         system_message: str = "",
-        additional_messages: list[dict[str, str]] | None = None,
+        additional_messages: list[dict[str, Any]] | None = None,
         rubric: EvalRubric | None = None,
     ) -> None:
         """Initialize the builder.
@@ -70,7 +71,7 @@ class ComparativeCaseBuilder:
     def for_track(
         self,
         track_name: str,
-        expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall],
+        expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall],
         critics: list[Critic] | None = None,
     ) -> ComparativeCaseBuilder:
         """Add track-specific configuration.
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
index a0e69251..bf8c7a16 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_comparative_execution.py
@@ -7,17 +7,25 @@ allowing the same cases to be run against multiple tool tracks.
 from __future__ import annotations
 
 import asyncio
+import logging
 import time
 from typing import TYPE_CHECKING, Any
 
 from arcade_evals._evalsuite._comparative import ComparativeCaseBuilder
-from arcade_evals._evalsuite._types import ComparativeCase, EvalRubric
+from arcade_evals._evalsuite._types import (
+    _VALID_PASS_RULES,
+    PASS_RULE_LAST,
+    ComparativeCase,
+    EvalRubric,
+)
 
 if TYPE_CHECKING:
     from arcade_evals._evalsuite._providers import ProviderName
     from arcade_evals._evalsuite._tool_registry import EvalSuiteToolRegistry
     from arcade_evals._evalsuite._tracks import TrackManager
 
+logger = logging.getLogger(__name__)
+
 
 class _EvalSuiteComparativeMixin:
     """Mixin providing comparative evaluation execution methods."""
@@ -36,12 +44,26 @@ class _EvalSuiteComparativeMixin:
     _run_openai: Any  # Method from EvalSuite
     _run_anthropic: Any  # Method from EvalSuite
 
+    async def _run_case_with_stats(
+        self,
+        case: Any,
+        client: Any,
+        model: str,
+        provider: ProviderName,
+        *,
+        num_runs: int,
+        seed: str | int | None,
+        pass_rule: str,
+        registry: EvalSuiteToolRegistry | None = None,
+    ) -> dict[str, Any]:  # signature mirrors EvalSuite._run_case_with_stats
+        raise NotImplementedError  # Placeholder only; the working method is implemented in EvalSuite
+
+
     def add_comparative_case(
         self,
         name: str,
         user_message: str,
         system_message: str | None = None,
-        additional_messages: list[dict[str, str]] | None = None,
+        additional_messages: list[dict[str, Any]] | None = None,
         rubric: EvalRubric | None = None,
     ) -> ComparativeCaseBuilder:
         """Create a comparative case that runs against multiple tool tracks.
@@ -90,6 +112,9 @@ class _EvalSuiteComparativeMixin:
         client: Any,
         model: str,
         provider: ProviderName = "openai",
+        num_runs: int = 1,
+        seed: str | int | None = "constant",
+        multi_run_pass_rule: str = PASS_RULE_LAST,
     ) -> dict[str, dict[str, Any]]:
         """Run comparative cases across all configured tracks.
 
@@ -97,6 +122,9 @@ class _EvalSuiteComparativeMixin:
             client: The LLM client instance.
             model: The model to evaluate.
             provider: The provider name.
+            num_runs: Number of runs per case (must be >= 1).
+            seed: Seed policy ("constant", "random", or an int); applied to OpenAI runs only.
+            multi_run_pass_rule: Multi-run pass/warn rule: "last", "mean", or "majority".
 
         Returns:
             Dictionary mapping track names to their results.
@@ -116,6 +144,15 @@ class _EvalSuiteComparativeMixin:
                 "No comparative cases defined. Use add_comparative_case() to add cases."
             )
 
+        # Validate upfront before making any API calls
+        if num_runs < 1:
+            raise ValueError("num_runs must be >= 1")
+        if multi_run_pass_rule not in _VALID_PASS_RULES:
+            raise ValueError(
+                f"Invalid multi-run pass rule '{multi_run_pass_rule}'. "
+                f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+            )
+
         # Build and validate all cases upfront
         comparative_cases: list[ComparativeCase] = []
         all_required_tracks: set[str] = set()
@@ -183,27 +220,21 @@ class _EvalSuiteComparativeMixin:
                 ) -> dict[str, Any]:
                     async with semaphore:
                         start = time.time()
-                        print(f"    [TASK START] {_case.name} @ {_t_name}", flush=True)
-                        if provider == "anthropic":
-                            predicted_args = await self._run_anthropic(
-                                client, model, _case, registry=_reg
-                            )
-                        else:
-                            predicted_args = await self._run_openai(
-                                client, model, _case, registry=_reg
-                            )
+                        logger.debug("[TASK START] %s @ %s", _case.name, _t_name)
+                        case_result = await self._run_case_with_stats(
+                            _case,
+                            client,
+                            model,
+                            provider,
+                            num_runs=num_runs,
+                            seed=seed,
+                            pass_rule=multi_run_pass_rule,
+                            registry=_reg,
+                        )
                         elapsed = time.time() - start
-                        print(
-                            f"    [TASK DONE] {_case.name} @ {_t_name} ({elapsed:.1f}s)",
-                            flush=True,
-                        )
+                        logger.debug("[TASK DONE] %s @ %s (%.1fs)", _case.name, _t_name, elapsed)
 
-                        filled_actual_tool_calls = self._process_tool_calls(
-                            predicted_args, registry=_reg
-                        )
-                        evaluation = _case.evaluate(filled_actual_tool_calls)
-
-                        return {
+                        result = {
                             "name": _case.name,
                             "track": _t_name,
                             "input": _case.user_message,
@@ -215,10 +246,15 @@ class _EvalSuiteComparativeMixin:
                             ],
                             "predicted_tool_calls": [
                                 {"name": name, "args": args}
-                                for name, args in filled_actual_tool_calls
+                                for name, args in case_result["predicted_tool_calls"]
                             ],
-                            "evaluation": evaluation,
+                            "evaluation": case_result["evaluation"],
                         }
+                        if num_runs > 1:
+                            result["run_stats"] = case_result["run_stats"]
+                            if case_result["critic_stats"]:
+                                result["critic_stats"] = case_result["critic_stats"]
+                        return result
 
                 task = run_track_case(eval_case, registry, track_name)
                 tasks.append((track_name, task))
diff --git a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
index a43063d5..1aa29303 100644
--- a/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
+++ b/libs/arcade-evals/arcade_evals/_evalsuite/_types.py
@@ -6,9 +6,44 @@ eval.py and the _evalsuite submodules, avoiding circular imports.
 
 from __future__ import annotations
 
+from collections.abc import Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Callable
 
+DEFAULT_EVAL_SEED = 42
+
+# Pass-rule constants (shared across eval.py & _comparative_execution.py)
+PASS_RULE_LAST = "last"  # noqa: S105
+PASS_RULE_MEAN = "mean"  # noqa: S105
+PASS_RULE_MAJORITY = "majority"  # noqa: S105
+_VALID_PASS_RULES: frozenset[str] = frozenset({PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY})
+
+
+def _resolve_seed_spec(seed: str | int | None) -> tuple[str, int | None]:
+    """Resolve a seed specification into a (policy, value) pair.
+
+    Args:
+        seed: 'constant', 'random', an int, a numeric string, or None (same as 'constant').
+
+    Returns:
+        (policy_name, seed_value): ('constant', DEFAULT_EVAL_SEED), ('random', None),
+        or ('custom', <int>). Any other string raises ValueError.
+    """
+    if seed is None:
+        return "constant", DEFAULT_EVAL_SEED
+    if isinstance(seed, int):
+        return "custom", seed
+    seed_value = seed.strip().lower()  # match keywords regardless of case/surrounding whitespace
+    if seed_value == "constant":
+        return "constant", DEFAULT_EVAL_SEED
+    if seed_value == "random":
+        return "random", None  # no value here; callers draw the per-run seeds themselves
+    try:
+        return "custom", int(seed_value)
+    except ValueError as exc:
+        raise ValueError("Invalid seed. Use 'constant', 'random', or an integer value.") from exc
+
+
 if TYPE_CHECKING:
     from arcade_evals.critic import Critic
 
@@ -117,7 +152,7 @@ class TrackConfig:
         critics: Critics to evaluate tool arguments for this track.
     """
 
-    expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall]
+    expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall]
     critics: list[Critic] = field(default_factory=list)
 
 
@@ -140,14 +175,14 @@ class ComparativeCase:
     name: str
     user_message: str
     system_message: str = ""
-    additional_messages: list[dict[str, str]] = field(default_factory=list)
+    additional_messages: list[dict[str, Any]] = field(default_factory=list)
     rubric: EvalRubric | None = None
     track_configs: dict[str, TrackConfig] = field(default_factory=dict)
 
     def add_track_config(
         self,
         track_name: str,
-        expected_tool_calls: list[ExpectedToolCall | ExpectedMCPToolCall],
+        expected_tool_calls: Sequence[ExpectedToolCall | ExpectedMCPToolCall],
         critics: list[Critic] | None = None,
     ) -> None:
         """Add configuration for a track.
diff --git a/libs/arcade-evals/arcade_evals/capture.py b/libs/arcade-evals/arcade_evals/capture.py
index d5ad4aeb..19b98033 100644
--- a/libs/arcade-evals/arcade_evals/capture.py
+++ b/libs/arcade-evals/arcade_evals/capture.py
@@ -38,6 +38,22 @@ class CapturedToolCall:
         return {"name": self.name, "args": self.args}
 
 
+@dataclass
+class CapturedRun:
+    """
+    Tool calls captured from one run of a case.
+
+    Attributes:
+        tool_calls: Tool calls recorded for this run (defaults to an empty list).
+    """
+
+    tool_calls: list[CapturedToolCall] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return {"tool_calls": [...]}, serializing each call with its own to_dict()."""
+        return {"tool_calls": [tc.to_dict() for tc in self.tool_calls]}
+
+
 @dataclass
 class CapturedCase:
     """
@@ -50,6 +66,7 @@ class CapturedCase:
         system_message: The system message (included if include_context is True).
         additional_messages: Additional messages (included if include_context is True).
         track_name: The track name for comparative captures (None for regular cases).
+        runs: Per-run captures; left empty unless more than one run was captured.
     """
 
     case_name: str
@@ -58,6 +75,7 @@ class CapturedCase:
     system_message: str | None = None
     additional_messages: list[dict[str, Any]] | None = None
     track_name: str | None = None
+    runs: list[CapturedRun] = field(default_factory=list)
 
     @staticmethod
     def _try_parse_json(value: str) -> Any:
@@ -109,6 +127,8 @@ class CapturedCase:
             "user_message": self.user_message,
             "tool_calls": [tc.to_dict() for tc in self.tool_calls],
         }
+        if self.runs:
+            result["runs"] = [run.to_dict() for run in self.runs]
         if self.track_name:
             result["track_name"] = self.track_name
         if include_context:
@@ -159,17 +179,32 @@ class CaptureResult:
 
 
 async def _capture_with_openai(
-    suite: EvalSuite, api_key: str, model: str, include_context: bool = False
+    suite: EvalSuite,
+    api_key: str,
+    model: str,
+    include_context: bool = False,
+    num_runs: int = 1,
+    seed: str | int | None = "constant",
 ) -> CaptureResult:
     """Run capture mode with OpenAI client."""
     async with AsyncOpenAI(api_key=api_key) as client:
         return await suite.capture(
-            client, model, provider="openai", include_context=include_context
+            client,
+            model,
+            provider="openai",
+            include_context=include_context,
+            num_runs=num_runs,
+            seed=seed,
         )
 
 
 async def _capture_with_anthropic(
-    suite: EvalSuite, api_key: str, model: str, include_context: bool = False
+    suite: EvalSuite,
+    api_key: str,
+    model: str,
+    include_context: bool = False,
+    num_runs: int = 1,
+    seed: str | int | None = "constant",
 ) -> CaptureResult:
     """Run capture mode with Anthropic client."""
     try:
@@ -182,5 +217,10 @@ async def _capture_with_anthropic(
 
     async with AsyncAnthropic(api_key=api_key) as client:
         return await suite.capture(
-            client, model, provider="anthropic", include_context=include_context
+            client,
+            model,
+            provider="anthropic",
+            include_context=include_context,
+            num_runs=num_runs,
+            seed=seed,
         )
diff --git a/libs/arcade-evals/arcade_evals/eval.py b/libs/arcade-evals/arcade_evals/eval.py
index 27d926a6..a8dc9e09 100644
--- a/libs/arcade-evals/arcade_evals/eval.py
+++ b/libs/arcade-evals/arcade_evals/eval.py
@@ -3,7 +3,10 @@ import functools
 import inspect
 import json
 import logging
+import random
+from collections.abc import Sequence
 from dataclasses import dataclass, field
+from statistics import mean, pstdev
 from typing import TYPE_CHECKING, Any, Callable
 
 import numpy as np
@@ -24,11 +27,16 @@ from arcade_evals._evalsuite._tracks import TrackManager
 
 # Import shared types from _types module (breaks circular dependencies)
 from arcade_evals._evalsuite._types import (
+    _VALID_PASS_RULES,
+    PASS_RULE_LAST,
+    PASS_RULE_MAJORITY,
+    PASS_RULE_MEAN,
     AnyExpectedToolCall,
     EvalRubric,
     ExpectedMCPToolCall,
     ExpectedToolCall,
     NamedExpectedToolCall,
+    _resolve_seed_spec,
 )
 from arcade_evals.critic import NoneCritic
 from arcade_evals.weights import validate_and_normalize_critic_weights
@@ -140,6 +148,88 @@ class EvaluationResult:
         self.score = total_score / total_weight if total_weight > 0 else 0.0
 
 
+# PASS_RULE_LAST, PASS_RULE_MEAN, PASS_RULE_MAJORITY, and _VALID_PASS_RULES
+# are imported from _types (see top-level imports) to keep a single source of truth.
+
+
+def _compute_mean_std(values: list[float]) -> tuple[float, float]:  # -> (mean, population std dev)
+    if not values:
+        return 0.0, 0.0  # empty input: report zeros instead of letting mean() raise
+    avg = mean(values)
+    if len(values) < 2:
+        return avg, 0.0  # a single value has no spread
+    return avg, pstdev(values)  # pstdev is the population (not sample) standard deviation
+
+
+def _resolve_pass_rule(
+    run_evaluations: list[EvaluationResult],
+    mean_score: float,
+    pass_rule: str,
+    rubric: EvalRubric,
+) -> tuple[bool, bool]:  # -> (passed, warning) for the aggregated multi-run result
+    if pass_rule not in _VALID_PASS_RULES:
+        raise ValueError(
+            f"Invalid multi-run pass rule '{pass_rule}'. "
+            f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+        )
+    if not run_evaluations:
+        return False, False  # nothing was run: neither passed nor warned
+    if pass_rule == PASS_RULE_MEAN:
+        passed = mean_score >= rubric.fail_threshold  # "mean": judge the mean score, not any one run
+        warning = not passed and mean_score >= rubric.warn_threshold  # NOTE(review): True only if warn_threshold < fail_threshold - confirm
+        return passed, warning
+    if pass_rule == PASS_RULE_MAJORITY:
+        majority = len(run_evaluations) // 2 + 1  # strict majority, e.g. 2 of 3 or 3 of 4
+        passed_count = sum(1 for ev in run_evaluations if ev.passed)
+        warned_count = sum(1 for ev in run_evaluations if ev.warning)
+        if passed_count >= majority:
+            return True, False
+        if (passed_count + warned_count) >= majority:
+            return False, True  # too few passes alone, but passes + warnings reach a majority
+        return False, False
+    last_eval = run_evaluations[-1]  # remaining valid rule is PASS_RULE_LAST: mirror the final run
+    return last_eval.passed, last_eval.warning
+
+
+def _aggregate_critic_stats(
+    run_field_scores: list[dict[str, dict[str, float]]],
+) -> dict[str, dict[str, Any]]:  # -> per-field stats across runs, keyed by critic field name
+    if not run_field_scores:
+        return {}
+    all_fields: set[str] = set()
+    for field_scores in run_field_scores:
+        all_fields.update(field_scores.keys())  # union: a field may be absent from some runs
+
+    critic_stats: dict[str, dict[str, Any]] = {}
+    for critic_field in sorted(all_fields):  # sorted for a stable key order in the output
+        weighted_scores = [
+            run_scores.get(critic_field, {}).get("score", 0.0) for run_scores in run_field_scores
+        ]  # one entry per run; runs lacking this field contribute 0.0
+        weights = [
+            run_scores.get(critic_field, {}).get("weight", 0.0) for run_scores in run_field_scores
+        ]
+        normalized_scores = [
+            (score / weight) if weight > 0 else 0.0  # guard against dividing by a zero weight
+            for score, weight in zip(weighted_scores, weights)
+        ]
+        avg, std_dev = _compute_mean_std(weighted_scores)
+        avg_norm, std_norm = _compute_mean_std(normalized_scores)
+        non_zero_weights = [w for w in weights if w > 0]
+        # Use mean of non-zero weights as the representative weight.
+        # Weights are typically constant across runs, but mean handles edge cases.
+        representative_weight = mean(non_zero_weights) if non_zero_weights else 0.0
+        critic_stats[critic_field] = {
+            "run_scores": weighted_scores,
+            "mean_score": avg,
+            "std_deviation": std_dev,
+            "run_scores_normalized": normalized_scores,
+            "mean_score_normalized": avg_norm,
+            "std_deviation_normalized": std_norm,
+            "weight": representative_weight,
+        }
+    return critic_stats
+
+
 # Import capture mode helpers (defined in capture.py to keep this file focused)
 from arcade_evals.capture import (  # noqa: E402
     _capture_with_anthropic,
@@ -167,7 +257,7 @@ class EvalCase:
     user_message: str
     expected_tool_calls: list[NamedExpectedToolCall]
     critics: list["Critic"] | None = None
-    additional_messages: list[dict[str, str]] = field(default_factory=list)
+    additional_messages: list[dict[str, Any]] = field(default_factory=list)
     rubric: EvalRubric = field(default_factory=EvalRubric)
 
     def __post_init__(self) -> None:
@@ -520,7 +610,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
         expected_tool_calls: list[NamedExpectedToolCall],
         rubric: EvalRubric,
         critics: list["Critic"],
-        additional_messages: list[dict[str, str]],
+        additional_messages: list[dict[str, Any]],
     ) -> "EvalCase":
         """Factory method to create EvalCase instances.
 
@@ -540,11 +630,12 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
         self,
         name: str,
         user_message: str,
-        expected_tool_calls: list[AnyExpectedToolCall] | list[tuple[Callable, dict[str, Any]]],
+        expected_tool_calls: Sequence[AnyExpectedToolCall]
+        | Sequence[tuple[Callable, dict[str, Any]]],
         critics: list["Critic"] | None = None,
         system_message: str | None = None,
         rubric: EvalRubric | None = None,
-        additional_messages: list[dict[str, str]] | None = None,
+        additional_messages: list[dict[str, Any]] | None = None,
     ) -> None:
         """
         Add a new evaluation case to the suite.
@@ -660,7 +751,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
         | None = None,
         rubric: EvalRubric | None = None,
         critics: list["Critic"] | None = None,
-        additional_messages: list[dict[str, str]] | None = None,
+        additional_messages: list[dict[str, Any]] | None = None,
     ) -> None:
         """
         Extend the last added case with new information.
@@ -745,11 +836,148 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
             processed_calls.append((resolved_name, args_with_defaults))
         return processed_calls
 
+    def _compute_run_field_scores(
+        self, evaluation: EvaluationResult
+    ) -> dict[str, dict[str, float]]:  # -> {field: {"score": mean, "weight": mean}} for one run
+        field_scores: dict[str, list[float]] = {}
+        field_weights: dict[str, list[float]] = {}
+        for result in evaluation.results:
+            field = result["field"]  # NOTE(review): local name shadows dataclasses.field in this method
+            if field == "tool_selection":
+                continue  # tool_selection entries are left out of the per-critic statistics
+            field_scores.setdefault(field, []).append(result["score"])  # group entries by field
+            field_weights.setdefault(field, []).append(result["weight"])
+
+        run_scores: dict[str, dict[str, float]] = {}
+        for field, scores in field_scores.items():
+            weights = field_weights.get(field, [])
+            run_scores[field] = {
+                "score": mean(scores) if scores else 0.0,  # average when a field has several entries
+                "weight": mean(weights) if weights else 0.0,
+            }
+        return run_scores
+
+    async def _run_case_with_stats(
+        self,
+        case: "EvalCase",
+        client: Any,
+        model: str,
+        provider: ProviderName,
+        *,
+        num_runs: int,
+        seed: str | int | None,
+        pass_rule: str,
+        registry: EvalSuiteToolRegistry | None = None,
+    ) -> dict[str, Any]:  # keys: "evaluation", "predicted_tool_calls", "run_stats", "critic_stats"
+        if num_runs < 1:
+            raise ValueError("num_runs must be >= 1")
+
+        seed_policy, seed_value = _resolve_seed_spec(seed)
+        seed_policy_display = seed_policy
+        if provider == "openai":  # only the OpenAI call below receives a seed
+            if seed_policy == "random":
+                run_seeds: list[int | None] = [
+                    random.randint(0, 2**31 - 1)  # noqa: S311
+                    for _ in range(num_runs)
+                ]
+            else:
+                run_seeds = [seed_value for _ in range(num_runs)]  # same seed reused for every run
+        else:
+            seed_policy_display = f"{seed_policy} (ignored)"  # reported so output shows the seed had no effect
+            run_seeds = [None for _ in range(num_runs)]
+
+        run_evaluations: list[EvaluationResult] = []
+        run_scores: list[float] = []
+        run_passed: list[bool] = []
+        run_warned: list[bool] = []
+        run_field_scores: list[dict[str, dict[str, float]]] = []
+        last_processed_calls: list[tuple[str, dict[str, Any]]] = []
+        run_details: list[dict[str, Any]] = []
+
+        for run_index in range(num_runs):  # runs are awaited one after another, not concurrently
+            run_seed = run_seeds[run_index]
+            if provider == "anthropic":
+                predicted_args = await self._run_anthropic(client, model, case, registry=registry)
+            else:
+                predicted_args = await self._run_openai(
+                    client, model, case, registry=registry, seed=run_seed
+                )
+
+            processed_calls = self._process_tool_calls(predicted_args, registry=registry)
+            evaluation = case.evaluate(processed_calls)
+
+            run_evaluations.append(evaluation)
+            run_scores.append(evaluation.score)
+            run_passed.append(evaluation.passed)
+            run_warned.append(evaluation.warning)
+            run_field_scores.append(self._compute_run_field_scores(evaluation))
+            last_processed_calls = processed_calls  # overwritten each run: only the final run's calls are returned
+            run_details.append({
+                "score": evaluation.score,
+                "passed": evaluation.passed,
+                "warning": evaluation.warning,
+                "failure_reason": evaluation.failure_reason,
+                "details": evaluation.results,
+            })
+
+        mean_score, std_dev = _compute_mean_std(run_scores)
+        passed, warning = _resolve_pass_rule(run_evaluations, mean_score, pass_rule, case.rubric)
+
+        # Determine aggregate failure_reason:
+        # - PASS_RULE_LAST: use the last run's failure reason (only if the aggregate did not pass)
+        # - Other rules: only for a plain failure where EVERY run reported the same reason
+        if not run_evaluations:
+            aggregate_failure_reason = None
+        elif pass_rule == PASS_RULE_LAST:
+            # Only surface failure_reason when the aggregate didn't pass
+            aggregate_failure_reason = run_evaluations[-1].failure_reason if not passed else None
+        elif not passed and not warning:
+            # For non-last rules, surface the failure reason if all runs share the same one
+            failure_reasons = [ev.failure_reason for ev in run_evaluations if ev.failure_reason]
+            unique_reasons = set(failure_reasons)
+            if len(unique_reasons) == 1 and len(failure_reasons) == len(run_evaluations):
+                aggregate_failure_reason = failure_reasons[0]
+            else:
+                aggregate_failure_reason = None
+        else:
+            aggregate_failure_reason = None
+
+        aggregate = EvaluationResult(
+            score=mean_score,  # aggregate score is always the mean, whatever the pass rule
+            passed=passed,
+            warning=warning,
+            results=run_evaluations[-1].results if run_evaluations else [],  # critic details come from the last run
+            failure_reason=aggregate_failure_reason,
+        )
+
+        run_stats = {
+            "num_runs": num_runs,
+            "scores": run_scores,
+            "mean_score": mean_score,
+            "std_deviation": std_dev,
+            "passed": run_passed,
+            "warned": run_warned,
+            "seed_policy": seed_policy_display,
+            "run_seeds": run_seeds,
+            "pass_rule": pass_rule,
+            "runs": run_details,
+        }
+
+        return {
+            "evaluation": aggregate,
+            "predicted_tool_calls": last_processed_calls,
+            "run_stats": run_stats,
+            "critic_stats": _aggregate_critic_stats(run_field_scores),
+        }
+
     async def run(
         self,
         client: Any,  # AsyncOpenAI | AsyncAnthropic - use Any to avoid import dependency
         model: str,
         provider: ProviderName = "openai",
+        num_runs: int = 1,
+        seed: str | int | None = "constant",
+        multi_run_pass_rule: str = PASS_RULE_LAST,
     ) -> dict[str, Any]:
         """
         Run the evaluation suite.
@@ -758,10 +986,22 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
             client: The LLM client instance (AsyncOpenAI or AsyncAnthropic).
             model: The model to evaluate.
             provider: The provider name ("openai" or "anthropic").
+            num_runs: Number of runs per case (must be >= 1).
+            seed: Seed policy ("constant", "random", or an int); applied to OpenAI runs only.
+            multi_run_pass_rule: Multi-run pass/warn rule: "last", "mean", or "majority".
 
         Returns:
             A dictionary containing the evaluation results.
         """
+        # Validate upfront before making any API calls
+        if num_runs < 1:
+            raise ValueError("num_runs must be >= 1")
+        if multi_run_pass_rule not in _VALID_PASS_RULES:
+            raise ValueError(
+                f"Invalid multi-run pass rule '{multi_run_pass_rule}'. "
+                f"Valid values: {', '.join(sorted(_VALID_PASS_RULES))}"
+            )
+
         results: dict[str, Any] = {
             "model": model,
             "suite_name": self.name,
@@ -779,17 +1019,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
                         "No tools registered. Use add_* convenience methods or pass catalog=ToolCatalog."
                     )
 
-                # Get tool calls based on provider
-                if provider == "anthropic":
-                    predicted_args = await self._run_anthropic(client, model, case)
-                else:
-                    predicted_args = await self._run_openai(client, model, case)
-
-                # Process tool calls (resolve names, fill defaults)
-                filled_actual_tool_calls = self._process_tool_calls(predicted_args)
-
-                # Evaluate the case
-                evaluation = case.evaluate(filled_actual_tool_calls)
+                case_result = await self._run_case_with_stats(
+                    case,
+                    client,
+                    model,
+                    provider,
+                    num_runs=num_runs,
+                    seed=seed,
+                    pass_rule=multi_run_pass_rule,
+                )
 
                 # Prepare the result
                 result = {
@@ -801,10 +1039,15 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
                         {"name": tc.name, "args": tc.args} for tc in case.expected_tool_calls
                     ],
                     "predicted_tool_calls": [
-                        {"name": name, "args": args} for name, args in filled_actual_tool_calls
+                        {"name": name, "args": args}
+                        for name, args in case_result["predicted_tool_calls"]
                     ],
-                    "evaluation": evaluation,
+                    "evaluation": case_result["evaluation"],
                 }
+                if num_runs > 1:
+                    result["run_stats"] = case_result["run_stats"]
+                    if case_result["critic_stats"]:
+                        result["critic_stats"] = case_result["critic_stats"]
                 return result
 
         tasks = [sem_task(case) for case in self.cases]
@@ -819,6 +1062,7 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
         model: str,
         case: "EvalCase",
         registry: EvalSuiteToolRegistry | None = None,
+        seed: int | None = None,
     ) -> list[tuple[str, dict[str, Any]]]:
         """Run evaluation using OpenAI client.
 
@@ -843,15 +1087,18 @@ class EvalSuite(_EvalSuiteCaptureMixin, _EvalSuiteConvenienceMixin, _EvalSuiteCo
         tools = effective_registry.list_tools_for_model(tool_format="openai")
 
         # Get the model response
-        response = await client.chat.completions.create(  # type: ignore[arg-type]
-            model=model,
-            messages=messages,
-            tool_choice="auto",
-            tools=tools,
-            user="eval_user",
-            seed=42,
-            stream=False,
-        )
+        request_params: dict[str, Any] = {
+            "model": model,
+            "messages": messages,
+            "tool_choice": "auto",
+            "tools": tools,
+            "user": "eval_user",
+            "stream": False,
+        }
+        if seed is not None:
+            request_params["seed"] = seed
+
+        response = await client.chat.completions.create(**request_params)
 
         return get_tool_args(response, normalize_names=False)
 
@@ -985,6 +1232,9 @@ def tool_eval() -> Callable[[Callable], Callable]:
             provider: ProviderName = "openai",
             capture_mode: bool = False,
             include_context: bool = False,
+            num_runs: int = 1,
+            seed: str | int | None = "constant",
+            multi_run_pass_rule: str = PASS_RULE_LAST,
         ) -> list[Any]:
             """
             Run evaluation or capture mode.
@@ -1015,19 +1265,43 @@ def tool_eval() -> Callable[[Callable], Callable]:
                 # Run in capture mode
                 if provider == "anthropic":
                     capture_result = await _capture_with_anthropic(
-                        suite, provider_api_key, model, include_context
+                        suite,
+                        provider_api_key,
+                        model,
+                        include_context=include_context,
+                        num_runs=num_runs,
+                        seed=seed,
                     )
                 else:
                     capture_result = await _capture_with_openai(
-                        suite, provider_api_key, model, include_context
+                        suite,
+                        provider_api_key,
+                        model,
+                        include_context=include_context,
+                        num_runs=num_runs,
+                        seed=seed,
                     )
                 return [capture_result]
             else:
                 # Run in evaluation mode
                 if provider == "anthropic":
-                    eval_result = await _run_with_anthropic(suite, provider_api_key, model)
+                    eval_result = await _run_with_anthropic(
+                        suite,
+                        provider_api_key,
+                        model,
+                        num_runs=num_runs,
+                        seed=seed,
+                        multi_run_pass_rule=multi_run_pass_rule,
+                    )
                 else:
-                    eval_result = await _run_with_openai(suite, provider_api_key, model)
+                    eval_result = await _run_with_openai(
+                        suite,
+                        provider_api_key,
+                        model,
+                        num_runs=num_runs,
+                        seed=seed,
+                        multi_run_pass_rule=multi_run_pass_rule,
+                    )
 
                 # For comparative evaluations, eval_result is already a list of track results
                 # For regular evaluations, it's a single dict that needs wrapping
@@ -1042,7 +1316,13 @@ def tool_eval() -> Callable[[Callable], Callable]:
 
 
 async def _run_with_openai(
-    suite: "EvalSuite", api_key: str, model: str
+    suite: "EvalSuite",
+    api_key: str,
+    model: str,
+    *,
+    num_runs: int = 1,
+    seed: str | int | None = "constant",
+    multi_run_pass_rule: str = PASS_RULE_LAST,
 ) -> dict[str, Any] | list[dict[str, Any]]:
     """Run evaluation suite with OpenAI client.
 
@@ -1054,16 +1334,36 @@ async def _run_with_openai(
         # Check if this suite has comparative cases
         if suite._comparative_case_builders:
             # Run comparative evaluation - returns dict[track_name, result]
-            track_results = await suite.run_comparative(client, model, provider="openai")
+            track_results = await suite.run_comparative(
+                client,
+                model,
+                provider="openai",
+                num_runs=num_runs,
+                seed=seed,
+                multi_run_pass_rule=multi_run_pass_rule,
+            )
             # Convert to list of results for consistent handling
             return list(track_results.values())
         else:
             # Run regular evaluation
-            return await suite.run(client, model, provider="openai")
+            return await suite.run(
+                client,
+                model,
+                provider="openai",
+                num_runs=num_runs,
+                seed=seed,
+                multi_run_pass_rule=multi_run_pass_rule,
+            )
 
 
 async def _run_with_anthropic(
-    suite: "EvalSuite", api_key: str, model: str
+    suite: "EvalSuite",
+    api_key: str,
+    model: str,
+    *,
+    num_runs: int = 1,
+    seed: str | int | None = "constant",
+    multi_run_pass_rule: str = PASS_RULE_LAST,
 ) -> dict[str, Any] | list[dict[str, Any]]:
     """Run evaluation suite with Anthropic client.
 
@@ -1083,9 +1383,23 @@ async def _run_with_anthropic(
         # Check if this suite has comparative cases
         if suite._comparative_case_builders:
             # Run comparative evaluation - returns dict[track_name, result]
-            track_results = await suite.run_comparative(client, model, provider="anthropic")
+            track_results = await suite.run_comparative(
+                client,
+                model,
+                provider="anthropic",
+                num_runs=num_runs,
+                seed=seed,
+                multi_run_pass_rule=multi_run_pass_rule,
+            )
             # Convert to list of results for consistent handling
             return list(track_results.values())
         else:
             # Run regular evaluation
-            return await suite.run(client, model, provider="anthropic")
+            return await suite.run(
+                client,
+                model,
+                provider="anthropic",
+                num_runs=num_runs,
+                seed=seed,
+                multi_run_pass_rule=multi_run_pass_rule,
+            )
diff --git a/libs/tests/cli/test_capture_formatters.py b/libs/tests/cli/test_capture_formatters.py
index 832ab8e5..fc93b5f4 100644
--- a/libs/tests/cli/test_capture_formatters.py
+++ b/libs/tests/cli/test_capture_formatters.py
@@ -57,6 +57,20 @@ def _create_mock_capture_result(
         # Explicitly set track_name to None unless specified (avoids MagicMock)
         case.track_name = case_data.get("track_name")
 
+        # Create mock runs if provided
+        runs = []
+        for run_data in case_data.get("runs", []):
+            run = MagicMock()
+            run_tool_calls = []
+            for tc_data in run_data.get("tool_calls", []):
+                tc = MagicMock()
+                tc.name = tc_data["name"]
+                tc.args = tc_data.get("args", {})
+                run_tool_calls.append(tc)
+            run.tool_calls = run_tool_calls
+            runs.append(run)
+        case.runs = runs
+
         # Create mock tool calls
         tool_calls = []
         for tc_data in case_data.get("tool_calls", []):
@@ -84,6 +98,11 @@ def _create_mock_capture_result(
                 "user_message": case.user_message,
                 "tool_calls": [{"name": tc.name, "args": tc.args} for tc in case.tool_calls],
             }
+            if case.runs:
+                case_dict["runs"] = [
+                    {"tool_calls": [{"name": tc.name, "args": tc.args} for tc in run.tool_calls]}
+                    for run in case.runs
+                ]
             if include_context:
                 case_dict["system_message"] = case.system_message
                 case_dict["additional_messages"] = case.additional_messages
@@ -170,6 +189,29 @@ class TestCaptureJsonFormatter:
         assert case["tool_calls"][0]["name"] == "GetWeather"
         assert case["tool_calls"][0]["args"]["city"] == "NYC"
 
+    def test_format_includes_runs(self) -> None:
+        """JSON output should list a case's runs under "runs", each with its own tool calls."""
+        formatter = CaptureJsonFormatter()
+        capture = _create_mock_capture_result(
+            cases=[
+                {
+                    "case_name": "multi_run_case",
+                    "user_message": "Hello",
+                    "tool_calls": [],
+                    "runs": [
+                        {"tool_calls": [{"name": "A", "args": {"x": 1}}]},
+                        {"tool_calls": [{"name": "B", "args": {"x": 2}}]},
+                    ],
+                }
+            ]
+        )
+
+        output = formatter.format([capture])
+        parsed = json.loads(output)
+        runs = parsed["captures"][0]["captured_cases"][0]["runs"]
+        assert len(runs) == 2
+        assert runs[0]["tool_calls"][0]["name"] == "A"
+
     def test_format_with_context(self) -> None:
         """Test formatting with context included."""
         formatter = CaptureJsonFormatter()
@@ -309,6 +351,28 @@ class TestCaptureMarkdownFormatter:
         assert "**Total Cases:** 1" in output
         assert "**Total Tool Calls:** 1" in output
 
+    def test_format_includes_runs(self) -> None:
+        """Markdown output should label each run and render its tool name in backticks."""
+        formatter = CaptureMarkdownFormatter()
+        capture = _create_mock_capture_result(
+            cases=[
+                {
+                    "case_name": "multi_run_case",
+                    "user_message": "Hello",
+                    "tool_calls": [],
+                    "runs": [
+                        {"tool_calls": [{"name": "GetWeather", "args": {"city": "NYC"}}]},
+                        {"tool_calls": [{"name": "GetWeather", "args": {"city": "SF"}}]},
+                    ],
+                }
+            ]
+        )
+
+        output = formatter.format([capture])
+        assert "Run 1" in output
+        assert "Run 2" in output
+        assert "`GetWeather`" in output
+
 
 class TestCaptureHtmlFormatter:
     """Tests for CaptureHtmlFormatter."""
@@ -607,14 +671,26 @@ class TestMultiModelTextCaptureFormatter:
     def test_text_multi_model_output(self) -> None:
         """Should produce multi-model text output."""
         capture1 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4o", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4o",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool1", "args": {}}],
+                }
+            ],
         )
         capture2 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4-turbo", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4-turbo",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool2", "args": {}}],
+                }
+            ],
         )
 
         formatter = CaptureTextFormatter()
@@ -647,14 +723,26 @@ class TestMultiModelHtmlCaptureFormatter:
     def test_html_multi_model_output(self) -> None:
         """Should produce multi-model HTML output."""
         capture1 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4o", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4o",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool1", "args": {}}],
+                }
+            ],
         )
         capture2 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4-turbo", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4-turbo",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool2", "args": {}}],
+                }
+            ],
         )
 
         formatter = CaptureHtmlFormatter()
@@ -687,14 +775,26 @@ class TestMultiModelJsonCaptureFormatter:
     def test_json_multi_model_output(self) -> None:
         """Should produce structured multi-model JSON."""
         capture1 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4o", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool1", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4o",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool1", "args": {}}],
+                }
+            ],
         )
         capture2 = _create_mock_capture_result(
-            suite_name="TestSuite", model="gpt-4-turbo", cases=[
-                {"case_name": "case1", "user_message": "Hi", "tool_calls": [{"name": "Tool2", "args": {}}]}
-            ]
+            suite_name="TestSuite",
+            model="gpt-4-turbo",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "Tool2", "args": {}}],
+                }
+            ],
         )
 
         formatter = CaptureJsonFormatter()
@@ -795,6 +895,7 @@ def _create_mock_capture_with_tracks(
             mock_tc.args = tc["args"]
             mock_tool_calls.append(mock_tc)
         mock_case.tool_calls = mock_tool_calls
+        mock_case.runs = []  # Explicitly set runs to empty for single-run captures
 
         captured_cases.append(mock_case)
 
@@ -924,3 +1025,365 @@ class TestCaptureWithTracks:
 
         # Should include track info in markdown
         assert "[track_a]" in output or "track_a" in output
+
+
+# =====================================================================
+# Capture formatter multi-run tests
+# =====================================================================
+
+
+def _create_mock_capture_with_runs(
+    num_runs: int = 3,
+) -> CaptureResult:
+    """Create a mock CaptureResult with one case and ``num_runs`` runs (index stored in args)."""
+    cases = [
+        {
+            "case_name": "multi_run_case",
+            "user_message": "What's the weather in NYC?",
+            "tool_calls": [
+                {"name": "GetWeather", "args": {"city": "NYC"}},
+            ],
+            "system_message": "You are a weather assistant",
+            "additional_messages": [],
+            "runs": [
+                {
+                    "tool_calls": [
+                        {"name": "GetWeather", "args": {"city": "NYC", "seed": str(i)}},
+                    ]
+                }
+                for i in range(1, num_runs + 1)  # 1-based run index
+            ],
+        }
+    ]
+
+    return _create_mock_capture_result(
+        suite_name="MultiRunCaptureSuite",
+        cases=cases,
+    )
+
+
+def _create_mock_capture_no_runs() -> CaptureResult:
+    """Create a mock CaptureResult with one case that has no tool calls and no "runs" entry."""
+    cases = [
+        {
+            "case_name": "empty_case",
+            "user_message": "Do nothing",
+            "tool_calls": [],
+            "system_message": None,
+            "additional_messages": [],
+        }
+    ]
+    return _create_mock_capture_result(
+        suite_name="EmptyCaptureSuite",
+        cases=cases,
+    )
+
+
+class TestCaptureMultiRunText:
+    """Tests for multi-run capture in the text formatter."""
+
+    def test_text_shows_run_headers(self) -> None:
+        """Text capture output should show 'Run 1:', 'Run 2:', etc."""
+        capture = _create_mock_capture_with_runs(num_runs=3)
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture])
+        assert "Run 1:" in output
+        assert "Run 2:" in output
+        assert "Run 3:" in output
+
+    def test_text_shows_tool_calls_per_run(self) -> None:
+        """Output for a multi-run case should mention the captured tool name."""
+        capture = _create_mock_capture_with_runs(num_runs=2)
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture])
+        assert "GetWeather" in output
+
+    def test_text_no_runs_shows_top_level_calls(self) -> None:
+        """When runs is empty, should fall through to top-level tool_calls."""
+        capture = _create_mock_capture_result()  # default: no runs
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture])
+        assert "GetWeather" in output
+
+    def test_text_empty_case_no_tool_calls(self) -> None:
+        """Case with no tool calls should show a 'no tool calls' message (case-insensitive)."""
+        capture = _create_mock_capture_no_runs()
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture])
+        assert "no tool calls" in output.lower()
+
+
+class TestCaptureMultiRunMarkdown:
+    """Tests for multi-run capture in the markdown formatter."""
+
+    def test_markdown_shows_run_headers(self) -> None:
+        """Markdown capture should show a 'Run N' label for every run."""
+        capture = _create_mock_capture_with_runs(num_runs=3)
+        formatter = CaptureMarkdownFormatter()
+        output = formatter.format([capture])
+        assert "Run 1" in output
+        assert "Run 2" in output
+        assert "Run 3" in output
+
+    def test_markdown_shows_tool_call_json(self) -> None:
+        """Markdown capture should include a ```json fence and the tool name."""
+        capture = _create_mock_capture_with_runs(num_runs=2)
+        formatter = CaptureMarkdownFormatter()
+        output = formatter.format([capture])
+        assert "```json" in output
+        assert "GetWeather" in output
+
+    def test_markdown_empty_runs_shows_no_calls(self) -> None:
+        """Markdown capture of a case with no tool calls and no runs says 'No tool calls'."""
+        capture = _create_mock_capture_no_runs()
+        formatter = CaptureMarkdownFormatter()
+        output = formatter.format([capture])
+        assert "No tool calls" in output
+
+
+class TestCaptureMultiRunHTML:
+    """Tests for multi-run capture in the HTML formatter."""
+
+    def test_html_shows_capture_run_details(self) -> None:
+        """HTML capture should emit 'capture-run' markup and a label for every run."""
+        capture = _create_mock_capture_with_runs(num_runs=3)
+        formatter = CaptureHtmlFormatter()
+        output = formatter.format([capture])
+        assert "capture-run" in output
+        assert "Run 1" in output
+        assert "Run 2" in output
+        assert "Run 3" in output
+
+    def test_html_tool_calls_escaped(self) -> None:
+        """HTML capture should include the tool name (escaping itself is not asserted here)."""
+        capture = _create_mock_capture_with_runs(num_runs=1)
+        formatter = CaptureHtmlFormatter()
+        output = formatter.format([capture])
+        assert "GetWeather" in output
+
+    def test_html_empty_case_no_calls(self) -> None:
+        """HTML capture with no tool calls shows 'No tool calls' text or a 'no-calls' marker."""
+        capture = _create_mock_capture_no_runs()
+        formatter = CaptureHtmlFormatter()
+        output = formatter.format([capture])
+        assert "No tool calls" in output or "no-calls" in output
+
+
+class TestCaptureMultiRunJSON:
+    """Tests for multi-run capture in the JSON formatter."""
+
+    def test_json_includes_runs_array(self) -> None:
+        """JSON capture should include one "runs" entry per run for multi-run cases."""
+        capture = _create_mock_capture_with_runs(num_runs=3)
+        formatter = CaptureJsonFormatter()
+        output = formatter.format([capture])
+        data = json.loads(output)
+        captures = data["captures"]
+        assert len(captures) == 1
+        case = captures[0]["captured_cases"][0]
+        assert "runs" in case
+        assert len(case["runs"]) == 3
+
+    def test_json_no_runs_for_single_run(self) -> None:
+        """JSON capture should omit the "runs" key for single-run cases."""
+        capture = _create_mock_capture_result()  # default: no runs
+        formatter = CaptureJsonFormatter()
+        output = formatter.format([capture])
+        data = json.loads(output)
+        case = data["captures"][0]["captured_cases"][0]
+        assert "runs" not in case
+
+    def test_json_run_tool_calls_structure(self) -> None:
+        """Each run in JSON should have tool_calls whose entries carry the tool name."""
+        capture = _create_mock_capture_with_runs(num_runs=2)
+        formatter = CaptureJsonFormatter()
+        output = formatter.format([capture])
+        data = json.loads(output)
+        run = data["captures"][0]["captured_cases"][0]["runs"][0]
+        assert "tool_calls" in run
+        assert run["tool_calls"][0]["name"] == "GetWeather"
+
+
+# =====================================================================
+# Coverage gap tests — CaptureTextFormatter
+# =====================================================================
+
+
+class TestCaptureTextFormatterCoverageGaps:
+    """Tests for CaptureTextFormatter methods that lacked coverage."""
+
+    def test_format_value_truncation(self) -> None:
+        """_format_value should cut values over 60 chars down to 60 chars ending in '...'."""
+        formatter = CaptureTextFormatter()
+        short = formatter._format_value("hello")
+        assert short == "hello"
+
+        long_val = "x" * 100
+        truncated = formatter._format_value(long_val)
+        assert len(truncated) == 60
+        assert truncated.endswith("...")
+
+    def test_format_value_exactly_60(self) -> None:
+        """_format_value should NOT truncate values of exactly 60 chars."""
+        formatter = CaptureTextFormatter()
+        exact = "a" * 60
+        result = formatter._format_value(exact)
+        assert result == exact
+
+    def test_conversation_text_format(self) -> None:
+        """CaptureTextFormatter._format_conversation_text should format messages."""
+        formatter = CaptureTextFormatter()
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi!"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "get_data", "arguments": '{"id": 1}'}}],
+            },
+            {"role": "tool", "name": "get_data", "content": '{"result": "ok"}'},
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "[USER]" in text
+        assert "[ASSISTANT]" in text
+        assert "[TOOL]" in text
+        assert "get_data" in text
+        assert "Hello" in text
+
+    def test_conversation_text_invalid_json_content(self) -> None:
+        """Should gracefully handle non-JSON tool content."""
+        formatter = CaptureTextFormatter()
+        messages = [
+            {"role": "tool", "name": "raw", "content": "plain text output"},
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "plain text output" in text
+
+    def test_conversation_text_invalid_json_args(self) -> None:
+        """Should gracefully handle non-JSON tool call arguments."""
+        formatter = CaptureTextFormatter()
+        messages = [
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+            },
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "broken" in text
+        assert "not json" in text
+
+    def test_conversation_text_separator_between_messages(self) -> None:
+        """Should emit a '----' separator when formatting more than one message."""
+        formatter = CaptureTextFormatter()
+        messages = [
+            {"role": "user", "content": "First"},
+            {"role": "assistant", "content": "Second"},
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        # Only the separator's presence is checked here, not its position
+        assert "----" in text
+
+    def test_multi_model_with_tracks_and_context(self) -> None:
+        """Multi-model capture with tracks should render correctly with context."""
+        capture1 = _create_mock_capture_with_tracks(model="gpt-4o")
+        capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo")
+
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture1, capture2], include_context=True)
+
+        assert "MULTI-MODEL CAPTURE RESULTS" in output
+        assert "gpt-4o" in output
+        assert "gpt-4-turbo" in output
+        # Should show track sections
+        assert "TRACK:" in output or "track_a" in output
+
+    def test_multi_model_no_data_model(self) -> None:
+        """Multi-model capture should handle a model that made no tool calls for a case."""
+        # Model A has case1 with one tool call; model B has case1 with no tool calls
+        capture1 = _create_mock_capture_result(
+            suite_name="Suite",
+            model="model-a",
+            cases=[
+                {
+                    "case_name": "case1",
+                    "user_message": "Hi",
+                    "tool_calls": [{"name": "T1", "args": {}}],
+                }
+            ],
+        )
+        capture2 = _create_mock_capture_result(
+            suite_name="Suite",
+            model="model-b",
+            cases=[{"case_name": "case1", "user_message": "Hi", "tool_calls": []}],
+        )
+
+        formatter = CaptureTextFormatter()
+        output = formatter.format([capture1, capture2])
+
+        assert "model-a" in output
+        assert "model-b" in output
+        assert "MULTI-MODEL CAPTURE RESULTS" in output
+
+
+# =====================================================================
+# Coverage gap tests — CaptureMarkdownFormatter
+# =====================================================================
+
+
+class TestCaptureMarkdownFormatterCoverageGaps:
+    """Tests for CaptureMarkdownFormatter methods that lacked coverage."""
+
+    def test_multi_model_with_tracks_and_context(self) -> None:
+        """Multi-model markdown capture with tracks should name every model in its output."""
+        capture1 = _create_mock_capture_with_tracks(model="gpt-4o")
+        capture2 = _create_mock_capture_with_tracks(model="gpt-4-turbo")
+
+        formatter = CaptureMarkdownFormatter()
+        output = formatter.format([capture1, capture2], include_context=True)
+
+        assert "Multi-Model Capture Results" in output
+        assert "gpt-4o" in output
+        assert "gpt-4-turbo" in output
+
+    def test_conversation_md_standalone(self) -> None:
+        """CaptureMarkdownFormatter._format_conversation_md should format messages."""
+        formatter = CaptureMarkdownFormatter()
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi!"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "x"}'}}],
+            },
+            {"role": "tool", "name": "search", "content": '{"r": 1}'},
+        ]
+        lines = formatter._format_conversation_md(messages)
+        text = "\n".join(lines)
+
+        assert "👤" in text or "User" in text
+        assert "search" in text
+
+    def test_conversation_md_invalid_json(self) -> None:
+        """Non-JSON tool call arguments should not stop the tool name from being rendered."""
+        formatter = CaptureMarkdownFormatter()
+        messages = [
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+            },
+        ]
+        lines = formatter._format_conversation_md(messages)
+        text = "\n".join(lines)
+
+        assert "broken" in text
diff --git a/libs/tests/cli/test_evals_runner.py b/libs/tests/cli/test_evals_runner.py
index af470a26..a1ef380e 100644
--- a/libs/tests/cli/test_evals_runner.py
+++ b/libs/tests/cli/test_evals_runner.py
@@ -1,5 +1,6 @@
 """Tests for evals_runner error handling."""
 
+from typing import Any, cast
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
@@ -14,6 +15,9 @@ from arcade_cli.evals_runner import (
     run_evaluations,
 )
 from arcade_cli.utils import ModelSpec, Provider
+from arcade_evals import CaptureResult
+
+RUN_RULE_LAST = "last"
 
 
 class TestEvalTaskResult:
@@ -68,7 +72,13 @@ class TestCaptureTaskResult:
 
     def test_from_success(self) -> None:
         """Test creating a successful capture result."""
-        mock_captures = [MagicMock(), MagicMock()]
+        mock_captures = cast(
+            list[CaptureResult],
+            [
+                MagicMock(spec=CaptureResult),
+                MagicMock(spec=CaptureResult),
+            ],
+        )
         result = CaptureTaskResult.from_success("test_suite", "gpt-4o", "openai", mock_captures)
         assert result.success is True
         assert result.suite_name == "test_suite"
@@ -107,6 +117,9 @@ class TestRunEvalTask:
             suite_func=mock_suite,
             model_spec=model_spec,
             max_concurrent=1,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
         assert result.success is True
@@ -126,9 +139,13 @@ class TestRunEvalTask:
             suite_func=mock_suite,
             model_spec=model_spec,
             max_concurrent=1,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
         assert result.success is False
+        assert result.error is not None
         assert "API error" in result.error
         assert result.error_type == "ValueError"
         assert result.result is None
@@ -145,6 +162,9 @@ class TestRunEvalTask:
             model_spec=model_spec,
             max_concurrent=5,
             include_context=False,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
         mock_suite.assert_called_once_with(
@@ -153,6 +173,9 @@ class TestRunEvalTask:
             max_concurrency=5,
             provider="anthropic",
             include_context=False,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
 
@@ -172,6 +195,8 @@ class TestRunCaptureTask:
             model_spec=model_spec,
             max_concurrent=1,
             include_context=True,
+            num_runs=1,
+            seed="constant",
         )
 
         assert result.success is True
@@ -189,9 +214,12 @@ class TestRunCaptureTask:
             model_spec=model_spec,
             max_concurrent=1,
             include_context=False,
+            num_runs=1,
+            seed="constant",
         )
 
         assert result.success is False
+        assert result.error is not None
         assert "Network failed" in result.error
         assert result.error_type == "ConnectionError"
 
@@ -207,6 +235,8 @@ class TestRunCaptureTask:
             model_spec=model_spec,
             max_concurrent=2,
             include_context=True,
+            num_runs=1,
+            seed="constant",
         )
 
         mock_suite.assert_called_once_with(
@@ -216,6 +246,8 @@ class TestRunCaptureTask:
             provider="openai",
             capture_mode=True,
             include_context=True,
+            num_runs=1,
+            seed="constant",
         )
 
 
@@ -253,6 +285,9 @@ class TestRunEvaluationsErrorHandling:
                 output_format="txt",
                 failed_only=False,
                 console=console,
+                num_runs=1,
+                seed="constant",
+                multi_run_pass_rule=RUN_RULE_LAST,
             )
 
         # Verify both were attempted
@@ -277,6 +312,9 @@ class TestRunEvaluationsErrorHandling:
             output_format="txt",
             failed_only=False,
             console=console,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
         # Should print "No evaluations completed successfully" (with emoji)
@@ -302,6 +340,9 @@ class TestRunEvaluationsErrorHandling:
             output_format="txt",
             failed_only=False,
             console=console,
+            num_runs=1,
+            seed="constant",
+            multi_run_pass_rule=RUN_RULE_LAST,
         )
 
         # Check that failure count is printed
@@ -327,6 +368,9 @@ class TestRunEvaluationsErrorHandling:
                 output_format="txt",
                 failed_only=False,
                 console=console,
+                num_runs=1,
+                seed="constant",
+                multi_run_pass_rule=RUN_RULE_LAST,
             )
 
         # Check that no failure warning is printed
@@ -338,7 +382,7 @@ class TestRunEvaluationsErrorHandling:
         """Test partial failure with multiple models."""
 
         # Suite that fails on one model but succeeds on another
-        async def conditional_suite(**kwargs):
+        async def conditional_suite(**kwargs: Any) -> MagicMock:
             if kwargs["model"] == "bad-model":
                 raise RuntimeError("Model not supported")
             return MagicMock()
@@ -371,6 +415,9 @@ class TestRunEvaluationsErrorHandling:
                 output_format="txt",
                 failed_only=False,
                 console=console,
+                num_runs=1,
+                seed="constant",
+                multi_run_pass_rule=RUN_RULE_LAST,
             )
 
         # Should have been called twice
@@ -397,6 +444,8 @@ class TestRunCaptureErrorHandling:
             output_file=None,
             output_format="json",
             console=console,
+            num_runs=1,
+            seed="constant",
         )
 
         # Error message includes emoji
@@ -436,6 +485,8 @@ class TestRunCaptureErrorHandling:
                 output_file=None,
                 output_format="json",
                 console=console,
+                num_runs=1,
+                seed="constant",
             )
 
         # Both should have been attempted
@@ -463,6 +514,8 @@ class TestRunCaptureErrorHandling:
             output_file=None,
             output_format="json",
             console=console,
+            num_runs=1,
+            seed="constant",
         )
 
         # Check error details are printed
diff --git a/libs/tests/cli/test_formatter_edge_cases.py b/libs/tests/cli/test_formatter_edge_cases.py
index 50ca1e1a..9838f68e 100644
--- a/libs/tests/cli/test_formatter_edge_cases.py
+++ b/libs/tests/cli/test_formatter_edge_cases.py
@@ -140,11 +140,16 @@ class TestFormatterEdgeCases:
         formatter = HtmlFormatter()
         output = formatter.format(results)
 
-        # Should NOT contain raw script tags or other unescaped HTML
-        assert "<script>" not in output
-        assert "onerror" not in output or "&" in output  # Should be escaped
-        # Should contain escaped versions
+        # The template includes a legitimate <script> tag for run-tabs JS,
+        # but user-provided content must be properly escaped.
+        # Verify that injected XSS payloads are escaped (not rendered raw)
+        assert "<script>alert" not in output  # User payload must be escaped
+        assert "<script>malicious" not in output  # Failure reason must be escaped
+        # <img must be escaped to &lt;img so it doesn't render as an HTML element
+        assert "<img src=x" not in output
+        # Should contain escaped versions of user-provided content
         assert "&lt;script&gt;" in output or "&lt;" in output
+        assert "&lt;img" in output  # The img tag should be escaped
         assert "&amp;" in output  # & should be escaped
 
     def test_json_formatter_produces_valid_json_for_all_cases(self) -> None:
diff --git a/libs/tests/cli/test_formatters.py b/libs/tests/cli/test_formatters.py
index ae42a420..334c87af 100644
--- a/libs/tests/cli/test_formatters.py
+++ b/libs/tests/cli/test_formatters.py
@@ -289,6 +289,77 @@ class TestMarkdownFormatter:
         assert "</details>" in output
         assert "#### detailed_case" in output
 
+    def test_markdown_run_details_include_per_run_tables(self) -> None:
+        """Should include per-run detail tables when available."""
+        cases = [
+            {
+                "name": "multi_run_case",
+                "input": "Test input",
+                "evaluation": MockEvaluation(
+                    passed=True,
+                    score=0.9,
+                    results=[
+                        {
+                            "field": "param1",
+                            "match": True,
+                            "score": 0.5,
+                            "weight": 0.5,
+                            "expected": "exp",
+                            "actual": "act",
+                            "is_criticized": True,
+                        }
+                    ],
+                ),
+                "run_stats": {
+                    "num_runs": 2,
+                    "scores": [0.9, 0.7],
+                    "mean_score": 0.8,
+                    "std_deviation": 0.1,
+                    "runs": [
+                        {
+                            "score": 0.9,
+                            "passed": True,
+                            "warning": False,
+                            "details": [
+                                {
+                                    "field": "param1",
+                                    "match": True,
+                                    "score": 0.5,
+                                    "weight": 0.5,
+                                    "expected": "exp",
+                                    "actual": "act",
+                                    "is_criticized": True,
+                                }
+                            ],
+                        },
+                        {
+                            "score": 0.7,
+                            "passed": False,
+                            "warning": False,
+                            "details": [
+                                {
+                                    "field": "param1",
+                                    "match": False,
+                                    "score": 0.0,
+                                    "weight": 0.5,
+                                    "expected": "exp",
+                                    "actual": "wrong",
+                                    "is_criticized": True,
+                                }
+                            ],
+                        },
+                    ],
+                },
+            }
+        ]
+        formatter = MarkdownFormatter()
+        output = formatter.format(make_mock_results(cases=cases), show_details=True)
+
+        assert "**Run Details:**" in output
+        assert "<summary>Run 1 details</summary>" in output
+        assert "| Field | Match | Score | Expected | Actual |" in output
+        assert "| param1 | ✅ | 0.50/0.50 | `exp` | `act` |" in output
+
     def test_format_pass_rate(self) -> None:
         """Should include pass rate percentage."""
         formatter = MarkdownFormatter()
@@ -1477,7 +1548,7 @@ class TestMcpServerComparison:
         assert "linear_arcade" in tracks
         assert "linear_community" in tracks
 
-        # Should have 6 total cases (2 cases × 3 servers)
+        # Should have 6 total cases (2 cases x 3 servers)
         assert total == 6
 
         # Community server has failures (score-based, not all passed)
@@ -1816,8 +1887,8 @@ class TestComparativeJsonFormatter:
 
         parsed = json.loads(output)
         # Find the suite
-        suite_data = list(parsed["models"].values())[0]["suites"]
-        suite = list(suite_data.values())[0]
+        suite_data = next(iter(parsed["models"].values()))["suites"]
+        suite = next(iter(suite_data.values()))
 
         # Cases should be grouped
         assert "cases" in suite
@@ -1830,9 +1901,9 @@ class TestComparativeJsonFormatter:
         output = formatter.format(results)
 
         parsed = json.loads(output)
-        suite_data = list(parsed["models"].values())[0]["suites"]
-        suite = list(suite_data.values())[0]
-        case = list(suite["cases"].values())[0]
+        suite_data = next(iter(parsed["models"].values()))["suites"]
+        suite = next(iter(suite_data.values()))
+        case = next(iter(suite["cases"].values()))
 
         assert "tracks" in case
         # Should have track1 and track2
@@ -1845,12 +1916,12 @@ class TestComparativeJsonFormatter:
         output = formatter.format(results, show_details=True)
 
         parsed = json.loads(output)
-        suite_data = list(parsed["models"].values())[0]["suites"]
-        suite = list(suite_data.values())[0]
-        case = list(suite["cases"].values())[0]
+        suite_data = next(iter(parsed["models"].values()))["suites"]
+        suite = next(iter(suite_data.values()))
+        case = next(iter(suite["cases"].values()))
 
         # Each track should have details
-        for track_name, track_data in case["tracks"].items():
+        for _track_name, track_data in case["tracks"].items():
             assert "details" in track_data
 
 
@@ -2144,8 +2215,8 @@ class TestMultiModelJsonFormatter:
         data = json.loads(output)
 
         # Each case in comparison should have best_model
-        for suite_name, cases in data["comparison"].items():
-            for case_name, case_data in cases.items():
+        for _suite_name, cases in data["comparison"].items():
+            for _case_name, case_data in cases.items():
                 assert "best_model" in case_data
                 assert "best_score" in case_data
 
@@ -2792,3 +2863,914 @@ class TestHtmlSafeId:
         assert "additional_messages" in case_data
         tool_msg = next(m for m in case_data["additional_messages"] if m.get("role") == "tool")
         assert "temp" in tool_msg["content"]
+
+
+# =====================================================================
+# Multi-run stats formatter tests
+# =====================================================================
+
+
+def _make_multi_run_case(
+    name: str = "multi_run_case",
+    score: float = 0.75,
+    passed: bool = True,
+    num_runs: int = 3,
+) -> dict:
+    """Create a multi-run case dict; per-run lists stay at 3 entries whatever num_runs is."""
+    return {
+        "name": name,
+        "input": "multi run test input",
+        "evaluation": MockEvaluation(
+            passed=passed,
+            score=score,
+            results=[
+                {
+                    "field": "arg_a",
+                    "match": True,
+                    "score": score,
+                    "weight": 1.0,
+                    "expected": "foo",
+                    "actual": "foo",
+                    "is_criticized": True,
+                }
+            ],
+        ),
+        "run_stats": {
+            "num_runs": num_runs,
+            "scores": [0.8, 0.7, 0.75],
+            "mean_score": 0.75,
+            "std_deviation": 0.041,
+            "passed": [True, True, True],
+            "warned": [False, False, False],
+            "seed_policy": "constant",
+            "run_seeds": [42, 42, 42],
+            "pass_rule": "last",
+            "runs": [
+                {
+                    "score": 0.8,
+                    "passed": True,
+                    "warning": False,
+                    "failure_reason": None,
+                    "details": [],
+                },
+                {
+                    "score": 0.7,
+                    "passed": True,
+                    "warning": False,
+                    "failure_reason": None,
+                    "details": [],
+                },
+                {
+                    "score": 0.75,
+                    "passed": True,
+                    "warning": False,
+                    "failure_reason": None,
+                    "details": [],
+                },
+            ],
+        },
+        "critic_stats": {
+            "arg_a": {
+                "run_scores": [0.8, 0.7, 0.75],
+                "mean_score": 0.75,
+                "std_deviation": 0.041,
+                "run_scores_normalized": [0.8, 0.7, 0.75],
+                "mean_score_normalized": 0.75,
+                "std_deviation_normalized": 0.041,
+                "weight": 1.0,
+            },
+        },
+        "expected_tool_calls": [],
+        "predicted_tool_calls": [],
+    }
+
+
+def _make_multi_run_results() -> list[list[dict]]:
+    """Create evaluation results with multi-run stats for a single model."""
+    return [
+        [
+            {
+                "model": "gpt-4o",
+                "suite_name": "MultiRunSuite",
+                "rubric": MockEvaluation(),
+                "cases": [_make_multi_run_case()],
+            }
+        ]
+    ]
+
+
+class TestMarkdownMultiRunStats:
+    """Tests for multi-run stats in the Markdown formatter."""
+
+    def test_run_stats_summary_in_output(self) -> None:
+        """Markdown should include Run Stats summary for multi-run cases."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "**Run Stats:**" in output
+        assert "Runs: 3" in output
+        assert "Mean Score:" in output
+        assert "Std Deviation:" in output
+        assert "Seed Policy: constant" in output
+        assert "Pass Rule: last" in output
+
+    def test_runs_column_in_summary_table(self) -> None:
+        """Summary table should include a Runs column for multi-run cases."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=False)
+        assert "| Runs |" in output
+        assert "±" in output  # Score should show ± std dev
+
+    def test_no_run_stats_for_single_run(self) -> None:
+        """Single-run cases should not show Run Stats or Runs column."""
+        formatter = MarkdownFormatter()
+        results = make_mock_results()  # standard single-run mock
+        output = formatter.format(results, show_details=True)
+        assert "**Run Stats:**" not in output
+        assert "| Runs |" not in output
+
+    def test_critic_stats_in_output(self) -> None:
+        """Markdown should include Critic Stats table for multi-run cases."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Critic Stats" in output
+        assert "arg_a" in output
+
+    def test_format_run_stats_summary_returns_empty_for_single_run(self) -> None:
+        """_format_run_stats_summary returns [] for missing stats or num_runs < 2."""
+        formatter = MarkdownFormatter()
+        assert formatter._format_run_stats_summary(None) == []
+        assert formatter._format_run_stats_summary({}) == []
+        assert formatter._format_run_stats_summary({"num_runs": 1}) == []
+
+
+class TestTextMultiRunStats:
+    """Tests for multi-run stats in the Text formatter."""
+
+    def test_stats_suffix_in_score_line(self) -> None:
+        """Text score line should include (n=X, sd=Y%) for multi-run."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "(n=3" in output
+        assert "sd=" in output
+
+    def test_no_stats_suffix_for_single_run(self) -> None:
+        """Single-run text output should not include multi-run stats suffix."""
+        formatter = TextFormatter()
+        results = make_mock_results()
+        output = formatter.format(results, show_details=True)
+        assert "(n=" not in output
+        assert "sd=" not in output
+
+    def test_run_stats_block_in_details(self) -> None:
+        """Text details should include Run Stats block for multi-run."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Run Stats:" in output
+        assert "Seed Policy: constant" in output
+
+
+class TestHtmlMultiRunStats:
+    """Tests for multi-run stats in the HTML formatter."""
+
+    def test_run_stats_card_in_output(self) -> None:
+        """HTML should include run-stats-card for multi-run cases."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "run-stats-card" in output
+        assert "mean score" in output
+        assert "std dev" in output
+
+    def test_run_tabs_in_output(self) -> None:
+        """HTML should include run-tab for each run."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Run 1" in output
+        assert "Run 2" in output
+        assert "Run 3" in output
+
+    def test_no_run_stats_card_for_single_run(self) -> None:
+        """Single-run HTML should not include an active run-stats-card div."""
+        formatter = HtmlFormatter()
+        results = make_mock_results()
+        output = formatter.format(results, show_details=True)
+        # The CSS class definition exists in the template, but no div should use it
+        assert '<div class="run-stats-card' not in output
+
+
+class TestJsonMultiRunStats:
+    """Tests for multi-run stats in the JSON formatter."""
+
+    def test_run_stats_in_json_output(self) -> None:
+        """JSON output should include run_stats for multi-run cases."""
+        formatter = JsonFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
+        # cases is a list in single-model format
+        case_data = suites["cases"][0]
+        assert "run_stats" in case_data
+        assert case_data["run_stats"]["num_runs"] == 3
+        assert "critic_stats" in case_data
+
+    def test_no_run_stats_in_json_for_single_run(self) -> None:
+        """JSON output should not include run_stats for single-run cases."""
+        formatter = JsonFormatter()
+        results = make_mock_results()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        suites = data["models"]["gpt-4o"]["suites"]["test_eval_suite"]
+        case_data = suites["cases"][0]
+        assert "run_stats" not in case_data
+
+
+# =====================================================================
+# Extended multi-run formatter coverage tests
+# =====================================================================
+
+
+def _make_multi_run_case_failed(
+    name: str = "failed_multi_run",
+    score: float = 0.25,
+    num_runs: int = 3,
+) -> dict:
+    """Create a failing multi-run case (always 3 runs of data) with per-run failure_reason."""
+    return {
+        "name": name,
+        "input": "multi run fail input",
+        "evaluation": MockEvaluation(
+            passed=False,
+            score=score,
+            failure_reason="All runs failed completely",
+            results=[
+                {
+                    "field": "arg_a",
+                    "match": False,
+                    "score": 0.25,
+                    "weight": 1.0,
+                    "expected": "bar",
+                    "actual": "baz",
+                    "is_criticized": True,
+                }
+            ],
+        ),
+        "run_stats": {
+            "num_runs": num_runs,
+            "scores": [0.3, 0.2, 0.25],
+            "mean_score": 0.25,
+            "std_deviation": 0.041,
+            "passed": [False, False, False],
+            "warned": [False, False, False],
+            "seed_policy": "random",
+            "run_seeds": [100, 200, 300],
+            "pass_rule": "majority",
+            "runs": [
+                {
+                    "score": 0.3,
+                    "passed": False,
+                    "warning": False,
+                    "failure_reason": "Tool selection mismatch",
+                    "details": [
+                        {
+                            "field": "arg_a",
+                            "match": False,
+                            "score": 0.3,
+                            "weight": 1.0,
+                            "expected": "bar",
+                            "actual": "baz",
+                            "is_criticized": True,
+                        }
+                    ],
+                },
+                {
+                    "score": 0.2,
+                    "passed": False,
+                    "warning": False,
+                    "failure_reason": "Tool selection mismatch",
+                    "details": [],
+                },
+                {
+                    "score": 0.25,
+                    "passed": False,
+                    "warning": False,
+                    "failure_reason": "Tool selection mismatch",
+                    "details": [],
+                },
+            ],
+        },
+        "critic_stats": {
+            "arg_a": {
+                "run_scores": [0.3, 0.2, 0.25],
+                "mean_score": 0.25,
+                "std_deviation": 0.041,
+                "run_scores_normalized": [0.3, 0.2, 0.25],
+                "mean_score_normalized": 0.25,
+                "std_deviation_normalized": 0.041,
+                "weight": 1.0,
+            },
+        },
+        "expected_tool_calls": [],
+        "predicted_tool_calls": [],
+    }
+
+
+def _make_multi_run_results_failed() -> list[list[dict]]:
+    """Create evaluation results with failed multi-run stats for a single model."""
+    return [
+        [
+            {
+                "model": "gpt-4o",
+                "suite_name": "FailingSuite",
+                "rubric": MockEvaluation(),
+                "cases": [_make_multi_run_case_failed()],
+            }
+        ]
+    ]
+
+
+def _make_multi_model_multi_run_results() -> list[list[dict]]:
+    """Create evaluation results with multi-run stats for TWO models."""
+    return [
+        [
+            {
+                "model": "gpt-4o",
+                "suite_name": "MultiRunSuite",
+                "rubric": MockEvaluation(),
+                "cases": [_make_multi_run_case()],
+            },
+            {
+                "model": "claude-3.5-sonnet",
+                "suite_name": "MultiRunSuite",
+                "rubric": MockEvaluation(),
+                "cases": [
+                    _make_multi_run_case(
+                        name="multi_run_case",
+                        score=0.6,
+                        passed=False,
+                        num_runs=3,
+                    )
+                ],
+            },
+        ]
+    ]
+
+
+class TestTextMultiRunCoverage:
+    """Extended text formatter tests for multi-run coverage."""
+
+    def test_run_stats_details_with_failure_reason(self) -> None:
+        """Run details should include failure_reason when present."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Tool selection mismatch" in output
+
+    def test_run_stats_displays_seeds(self) -> None:
+        """Text should display the run seeds and the seed policy."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Run Seeds: 100, 200, 300" in output
+        assert "Seed Policy: random" in output
+
+    def test_run_stats_displays_pass_rule(self) -> None:
+        """Text should display the pass rule."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Pass Rule: majority" in output
+
+    def test_critic_stats_block(self) -> None:
+        """Text should display critic stats block with ± notation."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Critic Stats:" in output
+        assert "arg_a:" in output
+        assert "±" in output
+
+    def test_multi_model_with_run_stats(self) -> None:
+        """Multi-model text should include run stats in detail view."""
+        formatter = TextFormatter()
+        results = _make_multi_model_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        # Multi-model detail view calls _format_run_stats for each case_result
+        assert "Run Stats:" in output
+        assert "Runs: 3" in output
+        assert "gpt-4o" in output
+        assert "claude-3.5-sonnet" in output
+
+    def test_run_results_per_run_status(self) -> None:
+        """Text should show each run with status and score."""
+        formatter = TextFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Run 1:" in output
+        assert "Run 2:" in output
+        assert "Run 3:" in output
+
+
+class TestMarkdownMultiRunCoverage:
+    """Extended markdown formatter tests for multi-run coverage."""
+
+    def test_run_details_with_failure_reason(self) -> None:
+        """Markdown run details should include failure_reason."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Tool selection mismatch" in output
+
+    def test_run_details_with_critic_details(self) -> None:
+        """Markdown should include per-run critic details when present."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        # Run 1 carries an "arg_a" detail row, but only the section heading is asserted here.
+        assert "Run Details:" in output
+
+    def test_run_seeds_displayed(self) -> None:
+        """Markdown should display the run seeds and the seed policy."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Run Seeds: 100, 200, 300" in output
+        assert "Seed Policy: random" in output
+
+    def test_critic_stats_table(self) -> None:
+        """Markdown should display critic stats table with all columns."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "Critic Stats" in output
+        assert "Weight" in output
+        assert "Mean" in output
+
+    def test_multi_model_with_run_stats(self) -> None:
+        """Multi-model markdown should include run stats."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_model_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        # Multi-model detail view calls _format_run_stats_summary for each case
+        assert "**Run Stats:**" in output
+        assert "Runs: 3" in output
+        assert "gpt-4o" in output
+        assert "claude-3.5-sonnet" in output
+
+    def test_no_duplicate_eval_details_for_multi_run(self) -> None:
+        """When run details are present, should not also show eval details."""
+        formatter = MarkdownFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        # "Run Details:" section should exist
+        assert "Run Details:" in output
+        # NOTE: only the heading is asserted; that the field-level critic table is
+        # not duplicated as a standalone section is not verified here.
+
+
+class TestHtmlMultiRunCoverage:
+    """Extended HTML formatter tests for multi-run coverage."""
+
+    def test_run_stats_card_fields(self) -> None:
+        """HTML run stats card should include mean score, std dev, pass rule, seed policy."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "mean score" in output
+        assert "std dev" in output
+        assert "Pass Rule" in output
+        assert "Seed Policy" in output
+
+    def test_run_tabs_with_failure_reason(self) -> None:
+        """HTML run tabs should include failure reason per run."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "Failure Reason" in output
+        assert "Tool selection mismatch" in output
+
+    def test_run_tabs_status_classes(self) -> None:
+        """HTML run tabs should have status classes (passed/failed)."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "run-tab failed" in output or "run-tab  failed" in output
+
+    def test_critic_stats_html_table(self) -> None:
+        """HTML should include critic stats table."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "critic-stats" in output
+        assert "Critic Stats" in output
+
+    def test_score_pills_in_run_stats(self) -> None:
+        """HTML run stats card should show score pills for each run."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "score-pill" in output
+        assert "R1:" in output
+        assert "R2:" in output
+        assert "R3:" in output
+
+    def test_multi_model_with_run_stats(self) -> None:
+        """Multi-model HTML should include run stats."""
+        formatter = HtmlFormatter()
+        results = _make_multi_model_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        assert "run-stats-card" in output
+        assert "gpt-4o" in output
+        assert "claude-3.5-sonnet" in output
+
+    def test_random_seeds_in_card(self) -> None:
+        """HTML output should contain each run seed (loose substring check only)."""
+        formatter = HtmlFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        assert "100" in output
+        assert "200" in output
+        assert "300" in output
+
+
+class TestJsonMultiRunCoverage:
+    """Extended JSON formatter tests for multi-run coverage."""
+
+    def test_run_stats_fields(self) -> None:
+        """JSON run_stats should include all expected fields."""
+        formatter = JsonFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
+        rs = suites["cases"][0]["run_stats"]
+        assert rs["num_runs"] == 3
+        assert "scores" in rs
+        assert "mean_score" in rs
+        assert "std_deviation" in rs
+        assert "seed_policy" in rs
+        assert "run_seeds" in rs
+        assert "pass_rule" in rs
+        assert "runs" in rs
+
+    def test_critic_stats_fields(self) -> None:
+        """JSON critic_stats should include all expected fields."""
+        formatter = JsonFormatter()
+        results = _make_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        suites = data["models"]["gpt-4o"]["suites"]["MultiRunSuite"]
+        cs = suites["cases"][0]["critic_stats"]
+        assert "arg_a" in cs
+        assert "run_scores" in cs["arg_a"]
+        assert "mean_score" in cs["arg_a"]
+        assert "weight" in cs["arg_a"]
+
+    def test_multi_model_json_with_run_stats(self) -> None:
+        """Multi-model JSON should include run_stats for each model."""
+        formatter = JsonFormatter()
+        results = _make_multi_model_multi_run_results()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        # Multi-model JSON uses comparison structure: {suite: {case: {results_by_model: ...}}}
+        comparison = data["comparison"]
+        case_data = comparison["MultiRunSuite"]["multi_run_case"]
+        for model_name in ["gpt-4o", "claude-3.5-sonnet"]:
+            model_result = case_data["results_by_model"][model_name]
+            assert "run_stats" in model_result, f"Missing run_stats for {model_name}"
+
+    def test_per_run_details_in_json(self) -> None:
+        """JSON per-run details should include failure_reason."""
+        formatter = JsonFormatter()
+        results = _make_multi_run_results_failed()
+        output = formatter.format(results, show_details=True)
+        data = json.loads(output)
+        suites = data["models"]["gpt-4o"]["suites"]["FailingSuite"]
+        runs = suites["cases"][0]["run_stats"]["runs"]
+        assert len(runs) == 3
+        assert runs[0]["failure_reason"] == "Tool selection mismatch"
+
+
+# =====================================================================
+# Coverage gap tests — TextFormatter
+# =====================================================================
+
+
+class TestTextFormatterCoverageGaps:
+    """Tests for TextFormatter methods that lacked coverage."""
+
+    def test_format_evaluation_uncriticized_field(self) -> None:
+        """_format_evaluation should show 'Un-criticized' for is_criticized=False."""
+        formatter = TextFormatter()
+        evaluation = MockEvaluation(
+            passed=True,
+            score=1.0,
+            results=[
+                {
+                    "field": "optional_param",
+                    "match": False,
+                    "score": 0.0,
+                    "weight": 0.0,
+                    "expected": "abc",
+                    "actual": "xyz",
+                    "is_criticized": False,
+                },
+            ],
+        )
+        output = formatter._format_evaluation(evaluation)
+        assert "Un-criticized" in output
+        assert "optional_param" in output
+        assert "Expected: abc" in output
+        assert "Actual: xyz" in output
+
+    def test_format_evaluation_mixed_criticized_and_uncriticized(self) -> None:
+        """_format_evaluation should handle a mix of criticized and uncriticized fields."""
+        formatter = TextFormatter()
+        evaluation = MockEvaluation(
+            passed=True,
+            score=0.5,
+            results=[
+                {
+                    "field": "required_field",
+                    "match": True,
+                    "score": 1.0,
+                    "weight": 1.0,
+                    "expected": "foo",
+                    "actual": "foo",
+                    "is_criticized": True,
+                },
+                {
+                    "field": "info_field",
+                    "match": False,
+                    "score": 0.0,
+                    "weight": 0.0,
+                    "expected": "bar",
+                    "actual": "baz",
+                    "is_criticized": False,
+                },
+            ],
+        )
+        output = formatter._format_evaluation(evaluation)
+        assert "Match" in output
+        assert "Un-criticized" in output
+        assert "required_field" in output
+        assert "info_field" in output
+
+    def test_format_conversation_text_standalone(self) -> None:
+        """_format_conversation_text should format conversation messages correctly."""
+        formatter = TextFormatter()
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "test"}'}}],
+            },
+            {
+                "role": "tool",
+                "name": "search",
+                "content": '{"results": [1, 2]}',
+            },
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "[USER]" in text
+        assert "[ASSISTANT]" in text
+        assert "[TOOL: search]" in text
+        assert "Hello" in text
+        assert "Hi there!" in text
+        assert "search" in text
+        assert "results" in text
+
+    def test_format_conversation_text_invalid_json_tool_content(self) -> None:
+        """_format_conversation_text should handle non-JSON tool content gracefully."""
+        formatter = TextFormatter()
+        messages = [
+            {"role": "tool", "name": "raw", "content": "not valid json"},
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "not valid json" in text
+
+    def test_format_conversation_text_invalid_json_tool_call_args(self) -> None:
+        """_format_conversation_text should handle non-JSON tool call args gracefully."""
+        formatter = TextFormatter()
+        messages = [
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+            },
+        ]
+        lines = formatter._format_conversation_text(messages)
+        text = "\n".join(lines)
+
+        assert "broken" in text
+        assert "not json" in text
+
+    def test_multi_model_failed_only_with_original_counts(self) -> None:
+        """TextFormatter multi-model output should show original counts with --only-failed."""
+        formatter = TextFormatter()
+        results = make_multi_model_results()
+        output = formatter.format(results, failed_only=True, original_counts=(20, 15, 5, 0))
+
+        assert "Note: Showing only failed evaluations" in output
+        assert "Total: 20" in output
+        assert "Passed: 15" in output
+        assert "Failed: 5" in output
+
+    def test_multi_model_details_with_run_and_critic_stats(self) -> None:
+        """TextFormatter multi-model detail view should show run/critic stats per model."""
+        formatter = TextFormatter()
+        results = _make_multi_model_multi_run_results()
+        output = formatter.format(results, show_details=True)
+
+        # Should show per-model detail sections with run stats
+        assert "Run Stats:" in output
+        assert "Critic Stats:" in output
+        assert "gpt-4o" in output
+        assert "claude-3.5-sonnet" in output
+
+    def test_comparative_single_model_with_context_conversation(self) -> None:
+        """Comparative single-model should render context with conversation formatting."""
+        formatter = TextFormatter()
+        results = make_comparative_results_with_context()
+        output = formatter.format(results, show_details=True, include_context=True)
+
+        # Context section should be present
+        assert "You are a weather bot" in output
+        # Tool responses should be JSON-formatted
+        assert "temp" in output
+        assert "I need weather info" in output
+
+    def test_comparative_case_first_with_context_conversation(self) -> None:
+        """Comparative case-first (multi-model) should render context with conversation."""
+        # Create multi-model comparative results with context
+        results = make_comparative_results_with_context()
+        # Add another model to trigger case-first grouping
+        results[0].append({
+            "model": "gpt-4o-mini",
+            "suite_name": "weather_suite [track_a]",
+            "track_name": "track_a",
+            "rubric": "Test",
+            "cases": [
+                {
+                    "name": "weather_test",
+                    "input": "Get weather for NYC",
+                    "system_message": "You are a weather bot.",
+                    "additional_messages": [
+                        {"role": "user", "content": "I need weather info"},
+                    ],
+                    "evaluation": MockEvaluation(passed=True, score=0.95),
+                }
+            ],
+        })
+
+        formatter = TextFormatter()
+        output = formatter.format(results, show_details=True, include_context=True)
+
+        assert "MULTI-MODEL" in output
+        assert "You are a weather bot" in output
+        assert "I need weather info" in output
+
+
+# =====================================================================
+# Coverage gap tests — MarkdownFormatter
+# =====================================================================
+
+
+class TestMarkdownFormatterCoverageGaps:
+    """Tests for MarkdownFormatter methods that lacked coverage."""
+
+    def test_format_conversation_md_standalone(self) -> None:
+        """_format_conversation_md should format messages as markdown blockquotes."""
+        formatter = MarkdownFormatter()
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi!"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "search", "arguments": '{"q": "test"}'}}],
+            },
+            {
+                "role": "tool",
+                "name": "search",
+                "content": '{"results": [1]}',
+            },
+        ]
+        lines = formatter._format_conversation_md(messages)
+        text = "\n".join(lines)
+
+        assert "👤" in text
+        assert "🤖" in text
+        assert "🔧" in text
+        assert "Hello" in text
+        assert "search" in text
+
+    def test_format_conversation_md_invalid_json(self) -> None:
+        """_format_conversation_md should handle non-JSON tool content gracefully."""
+        formatter = MarkdownFormatter()
+        messages = [
+            {"role": "tool", "name": "raw", "content": "plain text"},
+        ]
+        lines = formatter._format_conversation_md(messages)
+        text = "\n".join(lines)
+
+        assert "plain text" in text
+
+    def test_format_conversation_md_invalid_json_args(self) -> None:
+        """_format_conversation_md should handle non-JSON tool call args gracefully."""
+        formatter = MarkdownFormatter()
+        messages = [
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [{"function": {"name": "broken", "arguments": "not json"}}],
+            },
+        ]
+        lines = formatter._format_conversation_md(messages)
+        text = "\n".join(lines)
+
+        assert "broken" in text
+
+    def test_comparative_single_model_with_context_uses_conversation(self) -> None:
+        """Comparative single-model should render context including tool/assistant messages."""
+        formatter = MarkdownFormatter()
+        results = make_comparative_results_with_context()
+        output = formatter.format(results, show_details=True, include_context=True)
+
+        # Context should be rendered
+        assert "You are a weather bot" in output
+        assert "I need weather info" in output
+        # Tool response should be JSON-formatted
+        assert "temp" in output
+
+    def test_comparative_case_first_with_context_uses_conversation(self) -> None:
+        """Comparative case-first should render context with conversation messages."""
+        results = make_comparative_results_with_context()
+        results[0].append({
+            "model": "gpt-4o-mini",
+            "suite_name": "weather_suite [track_a]",
+            "track_name": "track_a",
+            "rubric": "Test",
+            "cases": [
+                {
+                    "name": "weather_test",
+                    "input": "Get weather for NYC",
+                    "system_message": "You are a weather bot.",
+                    "additional_messages": [
+                        {"role": "user", "content": "I need weather info"},
+                    ],
+                    "evaluation": MockEvaluation(passed=True, score=0.95),
+                }
+            ],
+        })
+
+        formatter = MarkdownFormatter()
+        output = formatter.format(results, show_details=True, include_context=True)
+
+        assert "Multi-Model" in output
+        assert "You are a weather bot" in output
+        assert "I need weather info" in output
+
+    def test_multi_model_failed_only_with_original_counts(self) -> None:
+        """MarkdownFormatter multi-model should show original counts with --only-failed."""
+        formatter = MarkdownFormatter()
+        results = make_multi_model_results()
+        output = formatter.format(results, failed_only=True, original_counts=(20, 15, 5, 0))
+
+        assert "Showing only failed evaluations" in output
+        assert "20" in output
+        assert "15" in output
+
+    def test_evaluation_details_uncriticized_field(self) -> None:
+        """_format_evaluation_details should show an em-dash for uncriticized fields."""
+        formatter = MarkdownFormatter()
+        evaluation = MockEvaluation(
+            passed=True,
+            score=1.0,
+            results=[
+                {
+                    "field": "optional",
+                    "match": False,
+                    "score": 0.0,
+                    "weight": 0.0,
+                    "expected": "abc",
+                    "actual": "xyz",
+                    "is_criticized": False,
+                },
+            ],
+        )
+        output = formatter._format_evaluation_details(evaluation)
+        assert "—" in output  # un-criticized uses em-dash
+        assert "optional" in output
diff --git a/libs/tests/cli/test_main_evals.py b/libs/tests/cli/test_main_evals.py
index bcb3d27d..42990d7a 100644
--- a/libs/tests/cli/test_main_evals.py
+++ b/libs/tests/cli/test_main_evals.py
@@ -349,3 +349,74 @@ def test_evals_help_shows_port_flag() -> None:
     assert result.exit_code == 0
     output = _strip_ansi(result.output)
     assert "--port" in output
+
+
+def test_evals_help_shows_use_provider_flag() -> None:
+    """Test that --use-provider is documented in help and described as repeatable."""
+    result = runner.invoke(cli, ["evals", "--help"])
+    assert result.exit_code == 0
+    output = _strip_ansi(result.output)
+    assert "--use-provider" in output  # a bare "-p" check would also match "--port"
+    assert "repeatable" in output.lower() or "can be repeated" in output.lower()
+
+
+def test_evals_help_shows_num_runs_flag() -> None:
+    """Test that --num-runs flag is documented in help."""
+    result = runner.invoke(cli, ["evals", "--help"])
+    assert result.exit_code == 0
+    output = _strip_ansi(result.output)
+    assert "--num-runs" in output  # a bare "-n" check matches any "--n..." option
+
+
+def test_evals_help_shows_seed_flag() -> None:
+    """Test that --seed flag is documented in help."""
+    result = runner.invoke(cli, ["evals", "--help"])
+    assert result.exit_code == 0
+    output = _strip_ansi(result.output)
+    assert "--seed" in output
+
+
+def test_evals_help_shows_multi_run_pass_rule_flag() -> None:
+    """Test that --multi-run-pass-rule flag is documented in help."""
+    result = runner.invoke(cli, ["evals", "--help"])
+    assert result.exit_code == 0
+    output = _strip_ansi(result.output)
+    assert "--multi-run-pass-rule" in output
+
+
+# --- CLI Validation Tests for Multi-Run Flags ---
+
+
+def test_evals_rejects_num_runs_zero() -> None:
+    """--num-runs 0 should produce a CLI error."""
+    result = runner.invoke(cli, ["evals", "--num-runs", "0", "."])
+    output = _strip_ansi(result.output)
+    assert "--num-runs must be >= 1" in output
+
+
+def test_evals_rejects_num_runs_negative() -> None:
+    """--num-runs with a negative value should produce a CLI error."""
+    result = runner.invoke(cli, ["evals", "--num-runs", "-1", "."])
+    output = _strip_ansi(result.output)
+    assert "--num-runs must be >= 1" in output
+
+
+def test_evals_rejects_invalid_seed() -> None:
+    """--seed with an invalid string should produce a CLI error."""
+    result = runner.invoke(cli, ["evals", "--seed", "foobar", "."])
+    output = _strip_ansi(result.output)
+    assert "invalid" in output.lower() and "seed" in output.lower()
+
+
+def test_evals_rejects_negative_seed() -> None:
+    """--seed with a negative integer should produce a CLI error."""
+    result = runner.invoke(cli, ["evals", "--seed", "-5", "."])
+    output = _strip_ansi(result.output)
+    assert "seed" in output.lower() and ("non-negative" in output.lower() or "must be" in output.lower())
+
+
+def test_evals_rejects_invalid_pass_rule() -> None:
+    """--multi-run-pass-rule with an invalid value should produce a CLI error."""
+    result = runner.invoke(cli, ["evals", "--multi-run-pass-rule", "bogus", "."])
+    output = _strip_ansi(result.output)
+    assert "invalid" in output.lower() and "pass-rule" in output.lower().replace("_", "-")
diff --git a/libs/tests/cli/test_utils_multi_provider.py b/libs/tests/cli/test_utils_multi_provider.py
index 0e9a033e..3c8fab8f 100644
--- a/libs/tests/cli/test_utils_multi_provider.py
+++ b/libs/tests/cli/test_utils_multi_provider.py
@@ -112,7 +112,7 @@ class TestExpandProviderConfigs:
     def test_single_provider_single_model(self) -> None:
         """Test expanding single provider with single model."""
         configs = [ProviderConfig(provider=Provider.OPENAI, models=["gpt-4o"])]
-        api_keys = {Provider.OPENAI: "openai-key"}
+        api_keys: dict[Provider, str | None] = {Provider.OPENAI: "openai-key"}
 
         specs = expand_provider_configs(configs, api_keys)
 
@@ -124,7 +124,7 @@ class TestExpandProviderConfigs:
     def test_single_provider_multiple_models(self) -> None:
         """Test expanding single provider with multiple models."""
         configs = [ProviderConfig(provider=Provider.OPENAI, models=["gpt-4o", "gpt-4o-mini"])]
-        api_keys = {Provider.OPENAI: "openai-key"}
+        api_keys: dict[Provider, str | None] = {Provider.OPENAI: "openai-key"}
 
         specs = expand_provider_configs(configs, api_keys)
 
@@ -138,7 +138,7 @@ class TestExpandProviderConfigs:
             ProviderConfig(provider=Provider.OPENAI, models=["gpt-4o"]),
             ProviderConfig(provider=Provider.ANTHROPIC, models=["claude-3-sonnet"]),
         ]
-        api_keys = {
+        api_keys: dict[Provider, str | None] = {
             Provider.OPENAI: "openai-key",
             Provider.ANTHROPIC: "anthropic-key",
         }
@@ -154,7 +154,7 @@ class TestExpandProviderConfigs:
     def test_missing_api_key_raises(self) -> None:
         """Test that missing API key raises ValueError."""
         configs = [ProviderConfig(provider=Provider.OPENAI, models=["gpt-4o"])]
-        api_keys = {Provider.OPENAI: None}  # No key
+        api_keys: dict[Provider, str | None] = {Provider.OPENAI: None}  # No key
 
         with pytest.raises(ValueError) as exc_info:
             expand_provider_configs(configs, api_keys)
@@ -165,7 +165,7 @@ class TestExpandProviderConfigs:
     def test_uses_default_model_when_empty(self) -> None:
         """Test that empty models list uses default."""
         configs = [ProviderConfig(provider=Provider.OPENAI, models=[])]
-        api_keys = {Provider.OPENAI: "openai-key"}
+        api_keys: dict[Provider, str | None] = {Provider.OPENAI: "openai-key"}
 
         specs = expand_provider_configs(configs, api_keys)
 
@@ -204,12 +204,13 @@ class TestResolveProviderApiKeys:
             if "ANTHROPIC_API_KEY" in env_copy:
                 del env_copy["ANTHROPIC_API_KEY"]
 
-            with patch.dict(os.environ, env_copy, clear=True):
-                with patch("dotenv.dotenv_values", return_value={}):
-                    keys = resolve_provider_api_keys()
-                    # Check structure - values should be None when not found
-                    assert Provider.OPENAI in keys
-                    assert Provider.ANTHROPIC in keys
+            with patch.dict(os.environ, env_copy, clear=True), patch(
+                "dotenv.dotenv_values", return_value={}
+            ):
+                keys = resolve_provider_api_keys()
+                # Check structure - values should be None when not found
+                assert Provider.OPENAI in keys
+                assert Provider.ANTHROPIC in keys
 
     def test_multiple_api_key_specs(self) -> None:
         """Test parsing multiple --api-key specs."""
@@ -244,10 +245,8 @@ class TestIntegration:
         config = parse_provider_spec("openai:gpt-4o")
 
         # Expand with key
-        specs = expand_provider_configs(
-            [config],
-            {Provider.OPENAI: "test-key"},
-        )
+        api_keys: dict[Provider, str | None] = {Provider.OPENAI: "test-key"}
+        specs = expand_provider_configs([config], api_keys)
 
         assert len(specs) == 1
         assert specs[0].display_name == "openai/gpt-4o"
@@ -260,7 +259,7 @@ class TestIntegration:
         configs = [parse_provider_spec(s) for s in specs_str]
 
         # Expand with keys
-        api_keys = {
+        api_keys: dict[Provider, str | None] = {
             Provider.OPENAI: "openai-key",
             Provider.ANTHROPIC: "anthropic-key",
         }
@@ -271,6 +270,30 @@ class TestIntegration:
         assert specs[1].display_name == "openai/gpt-4o-mini"
         assert specs[2].display_name == "anthropic/claude-3-sonnet"
 
+    def test_multiple_provider_flags_parsing(self) -> None:
+        """Test that multiple --use-provider flags are parsed correctly (simulating CLI behavior)."""
+        # Simulate what happens when CLI receives multiple -p flags
+        # e.g., -p openai:gpt-4o -p anthropic:claude-3-sonnet
+        provider_specs = ["openai:gpt-4o", "anthropic:claude-3-sonnet"]
+        configs = [parse_provider_spec(spec) for spec in provider_specs]
+
+        assert len(configs) == 2
+        assert configs[0].provider == Provider.OPENAI
+        assert configs[0].models == ["gpt-4o"]
+        assert configs[1].provider == Provider.ANTHROPIC
+        assert configs[1].models == ["claude-3-sonnet"]
+
+        # Expand with keys
+        api_keys: dict[Provider, str | None] = {
+            Provider.OPENAI: "openai-key",
+            Provider.ANTHROPIC: "anthropic-key",
+        }
+        specs = expand_provider_configs(configs, api_keys)
+
+        assert len(specs) == 2
+        assert specs[0].display_name == "openai/gpt-4o"
+        assert specs[1].display_name == "anthropic/claude-3-sonnet"
+
 
 class TestParseApiKeySpec:
     """Tests for parse_api_key_spec function."""
diff --git a/libs/tests/sdk/test_eval_capture.py b/libs/tests/sdk/test_eval_capture.py
index b25f5e35..5513b8e6 100644
--- a/libs/tests/sdk/test_eval_capture.py
+++ b/libs/tests/sdk/test_eval_capture.py
@@ -732,7 +732,7 @@ class TestToolEvalCaptureMode:
 
             # Verify include_context was passed
             call_args = mock_capture.call_args
-            assert call_args[0][3] is True  # include_context is 4th positional arg
+            assert call_args.kwargs["include_context"] is True
 
 
 # --- Multiple Tool Calls per Case Tests ---
diff --git a/libs/tests/sdk/test_eval_multi_run.py b/libs/tests/sdk/test_eval_multi_run.py
new file mode 100644
index 00000000..ccbda141
--- /dev/null
+++ b/libs/tests/sdk/test_eval_multi_run.py
@@ -0,0 +1,439 @@
+import pytest
+from arcade_evals._evalsuite._types import (
+    DEFAULT_EVAL_SEED,
+    PASS_RULE_LAST,
+    PASS_RULE_MAJORITY,
+    PASS_RULE_MEAN,
+    _resolve_seed_spec,
+)
+from arcade_evals.capture import CapturedRun, CapturedToolCall
+from arcade_evals.eval import (
+    EvalRubric,
+    EvaluationResult,
+    _aggregate_critic_stats,
+    _compute_mean_std,
+    _resolve_pass_rule,
+)
+
+
+# ========================================================================
+# _compute_mean_std tests
+# ========================================================================
+
+
+class TestComputeMeanStd:
+    def test_empty_list(self) -> None:  # no samples at all
+        mean_value, spread = _compute_mean_std([])
+        assert spread == 0.0
+        assert mean_value == 0.0
+
+    def test_single_value(self) -> None:  # one sample: it is the mean, spread is zero
+        mean_value, spread = _compute_mean_std([0.75])
+        assert spread == 0.0
+        assert mean_value == pytest.approx(0.75)
+
+    def test_multiple_values(self) -> None:  # identical samples
+        mean_value, spread = _compute_mean_std([0.5, 0.5])
+        assert spread == pytest.approx(0.0)
+        assert mean_value == pytest.approx(0.5)
+
+    def test_varying_values(self) -> None:  # differing samples must report some spread
+        mean_value, spread = _compute_mean_std([0.0, 1.0])
+        assert spread > 0.0
+        assert mean_value == pytest.approx(0.5)
+
+
+# ========================================================================
+# _resolve_seed_spec tests
+# ========================================================================
+
+
+class TestResolveSeedSpec:
+    def test_constant_string(self) -> None:  # "constant" -> the default seed
+        seed_mode, seed_value = _resolve_seed_spec("constant")
+        assert seed_value == DEFAULT_EVAL_SEED
+        assert seed_mode == "constant"
+
+    def test_random_string(self) -> None:  # "random" -> no fixed seed value
+        seed_mode, seed_value = _resolve_seed_spec("random")
+        assert seed_value is None
+        assert seed_mode == "random"
+
+    def test_integer(self) -> None:  # an int comes back unchanged as a custom seed
+        seed_mode, seed_value = _resolve_seed_spec(123)
+        assert seed_value == 123
+        assert seed_mode == "custom"
+
+    def test_numeric_string(self) -> None:  # digits in a string are converted to int
+        seed_mode, seed_value = _resolve_seed_spec("456")
+        assert seed_value == 456
+        assert seed_mode == "custom"
+
+    def test_none_defaults_to_constant(self) -> None:  # None behaves like "constant"
+        seed_mode, seed_value = _resolve_seed_spec(None)
+        assert seed_value == DEFAULT_EVAL_SEED
+        assert seed_mode == "constant"
+
+    def test_invalid_string_raises(self) -> None:  # neither a keyword nor a number
+        with pytest.raises(ValueError, match="Invalid seed"):
+            _resolve_seed_spec("not-a-seed")
+
+    def test_case_insensitive(self) -> None:  # upper-case keywords are accepted too
+        constant_mode, _constant_value = _resolve_seed_spec("CONSTANT")
+        assert constant_mode == "constant"
+        random_mode, random_value = _resolve_seed_spec("RANDOM")
+        assert random_value is None
+        assert random_mode == "random"
+
+
+# ========================================================================
+# _resolve_pass_rule tests
+# ========================================================================
+
+
+class TestResolvePassRule:
+    def test_last_rule_returns_last_eval(self) -> None:
+        """PASS_RULE_LAST: a failed first run is ignored when the final run passed."""
+        runs = [
+            EvaluationResult(score=0.3, passed=False),
+            EvaluationResult(score=0.9, passed=True),
+        ]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.6, pass_rule=PASS_RULE_LAST, rubric=EvalRubric()
+        )
+        assert did_pass is True
+        assert has_warning is False
+
+    def test_mean_rule_passes_when_mean_above_threshold(self) -> None:
+        limits = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)  # mean 0.7 exceeds both
+        runs = [EvaluationResult(score=0.5), EvaluationResult(score=0.9)]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.7, pass_rule=PASS_RULE_MEAN, rubric=limits
+        )
+        assert did_pass is True
+        assert has_warning is False
+
+    def test_mean_rule_warning(self) -> None:
+        limits = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)  # mean 0.5 lies in between
+        runs = [EvaluationResult(score=0.2), EvaluationResult(score=0.8)]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.5, pass_rule=PASS_RULE_MEAN, rubric=limits
+        )
+        assert did_pass is False
+        assert has_warning is True
+
+    def test_mean_rule_fails_below_warn(self) -> None:
+        limits = EvalRubric(fail_threshold=0.6, warn_threshold=0.4)  # mean 0.15 is below both
+        runs = [EvaluationResult(score=0.1), EvaluationResult(score=0.2)]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.15, pass_rule=PASS_RULE_MEAN, rubric=limits
+        )
+        assert did_pass is False
+        assert has_warning is False
+
+    def test_majority_rule_passes(self) -> None:
+        """PASS_RULE_MAJORITY: two passing runs out of three pass the case."""
+        runs = [
+            EvaluationResult(score=0.9, passed=True),
+            EvaluationResult(score=0.9, passed=True),
+            EvaluationResult(score=0.1, passed=False),
+        ]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.63, pass_rule=PASS_RULE_MAJORITY, rubric=EvalRubric()
+        )
+        assert did_pass is True
+        assert has_warning is False
+
+    def test_majority_rule_warning(self) -> None:
+        """PASS_RULE_MAJORITY: one pass, one warning and one failure yields a warning."""
+        runs = [
+            EvaluationResult(score=0.8, passed=True),
+            EvaluationResult(score=0.5, warning=True),
+            EvaluationResult(score=0.1, passed=False),
+        ]
+        did_pass, has_warning = _resolve_pass_rule(
+            runs, mean_score=0.46, pass_rule=PASS_RULE_MAJORITY, rubric=EvalRubric()
+        )
+        assert did_pass is False
+        assert has_warning is True
+
+    def test_empty_evaluations_returns_false(self) -> None:
+        """With no runs at all the case neither passes nor warns."""
+        did_pass, has_warning = _resolve_pass_rule(
+            [], mean_score=0.0, pass_rule=PASS_RULE_LAST, rubric=EvalRubric()
+        )
+        assert did_pass is False
+        assert has_warning is False
+
+    def test_invalid_rule_raises(self) -> None:
+        """An unrecognised pass rule is rejected with ValueError."""
+        with pytest.raises(ValueError, match="Invalid multi-run pass rule"):
+            _resolve_pass_rule(
+                [EvaluationResult(score=0.5)],
+                mean_score=0.5,
+                pass_rule="invalid",
+                rubric=EvalRubric(),
+            )
+
+
+# ========================================================================
+# _aggregate_critic_stats tests
+# ========================================================================
+
+
+class TestAggregateCriticStats:
+    def test_basic_aggregation(self) -> None:
+        per_run_scores = [
+            {"arg_a": {"score": 0.5, "weight": 0.5}},  # first run has no arg_b entry
+            {
+                "arg_a": {"score": 0.0, "weight": 0.5},
+                "arg_b": {"score": 0.25, "weight": 0.5},
+            },
+        ]
+        aggregated = _aggregate_critic_stats(per_run_scores)
+        assert aggregated["arg_a"]["weight"] == pytest.approx(0.5)
+        assert aggregated["arg_a"]["run_scores"] == [0.5, 0.0]
+        assert aggregated["arg_a"]["run_scores_normalized"] == [1.0, 0.0]
+        assert aggregated["arg_b"]["weight"] == pytest.approx(0.5)
+        assert aggregated["arg_b"]["run_scores"] == [0.0, 0.25]
+        assert aggregated["arg_b"]["run_scores_normalized"] == [0.0, 0.5]
+
+    def test_empty_input(self) -> None:
+        assert _aggregate_critic_stats([]) == {}  # nothing to aggregate
+
+    def test_single_run(self) -> None:  # one run: mean equals the score, deviation is zero
+        aggregated = _aggregate_critic_stats([{"field_x": {"score": 0.8, "weight": 1.0}}])
+        field_x = aggregated["field_x"]
+        assert field_x["run_scores"] == [0.8]
+        assert field_x["weight"] == pytest.approx(1.0)
+        assert field_x["mean_score"] == pytest.approx(0.8)
+        assert field_x["std_deviation"] == pytest.approx(0.0)
+
+
+# ========================================================================
+# CapturedRun tests
+# ========================================================================
+
+
+class TestCapturedRun:
+    def test_to_dict_empty(self) -> None:
+        """A run created without tool calls serialises to an empty tool_calls list."""
+        assert CapturedRun().to_dict() == {"tool_calls": []}
+
+    def test_to_dict_with_calls(self) -> None:
+        """Every captured call appears as a name/args mapping, in captured order."""
+        calls = [
+            CapturedToolCall(name="GetWeather", args={"city": "NYC"}),
+            CapturedToolCall(name="GetTime", args={"tz": "UTC"}),
+        ]
+        serialized_calls = CapturedRun(tool_calls=calls).to_dict()["tool_calls"]
+        # Positions matter: index 0 and 1 are checked against the construction order.
+        assert len(serialized_calls) == 2
+        assert serialized_calls[0] == {"name": "GetWeather", "args": {"city": "NYC"}}
+        assert serialized_calls[1] == {"name": "GetTime", "args": {"tz": "UTC"}}
+
+
+# ========================================================================
+# CapturedCase.to_dict tests
+# ========================================================================
+from arcade_evals.capture import CapturedCase
+
+
+class TestCapturedCaseToDict:
+    def test_single_run_no_runs_key(self) -> None:
+        """An empty `runs` list must not produce a 'runs' key in the dict."""
+        captured = CapturedCase(
+            case_name="test",
+            user_message="Hello",
+            tool_calls=[CapturedToolCall(name="Greet", args={})],
+            runs=[],
+        )
+        payload = captured.to_dict()
+        assert payload["tool_calls"] == [{"name": "Greet", "args": {}}]
+        assert "runs" not in payload
+
+    def test_multi_run_includes_runs(self) -> None:
+        """A non-empty `runs` list is serialised under the 'runs' key, one entry per run."""
+        captured = CapturedCase(
+            case_name="test",
+            user_message="Hello",
+            tool_calls=[CapturedToolCall(name="Greet", args={})],
+            runs=[
+                CapturedRun(tool_calls=[CapturedToolCall(name="Greet", args={"seed": "1"})]),
+                CapturedRun(tool_calls=[CapturedToolCall(name="Greet", args={"seed": "2"})]),
+            ],
+        )
+        payload = captured.to_dict()
+        assert "runs" in payload
+        assert len(payload["runs"]) == 2
+        assert payload["runs"][0]["tool_calls"][0]["args"]["seed"] == "1"
+
+    def test_to_dict_with_context(self) -> None:
+        """Passing include_context=True adds the system_message to the dict."""
+        captured = CapturedCase(
+            case_name="test",
+            user_message="Hello",
+            tool_calls=[],
+            system_message="You are helpful",
+            additional_messages=[],
+        )
+        payload = captured.to_dict(include_context=True)
+        assert "system_message" in payload
+        assert payload["system_message"] == "You are helpful"
+
+    def test_to_dict_with_track_name(self) -> None:
+        """A track_name given at construction is carried into the dict."""
+        captured = CapturedCase(
+            case_name="test",
+            user_message="Hello",
+            tool_calls=[],
+            track_name="track_a",
+        )
+        payload = captured.to_dict()
+        assert payload["track_name"] == "track_a"
+
+    def test_to_dict_no_track_name_omits_key(self) -> None:
+        """Without a track_name argument the dict has no 'track_name' key."""
+        captured = CapturedCase(
+            case_name="test",
+            user_message="Hello",
+            tool_calls=[],
+        )
+        payload = captured.to_dict()
+        assert "track_name" not in payload
+
+
+# ========================================================================
+# _aggregate_critic_stats extended tests
+# ========================================================================
+
+
+class TestAggregateCriticStatsExtended:
+    def test_zero_weight_field(self) -> None:
+        """A field whose weight is 0.0 in every run is still aggregated."""
+        per_run_scores = [
+            {"field_a": {"score": 0.5, "weight": 0.0}},
+            {"field_a": {"score": 0.7, "weight": 0.0}},
+        ]
+        field_a = _aggregate_critic_stats(per_run_scores)["field_a"]
+        assert field_a["weight"] == pytest.approx(0.0)
+        # A zero weight is expected to give 0.0 for each normalized score.
+        assert field_a["run_scores_normalized"] == [0.0, 0.0]
+
+    def test_mixed_presence_across_runs(self) -> None:
+        """A field absent from a run is scored 0.0 for that run."""
+        per_run_scores = [
+            {"field_a": {"score": 0.8, "weight": 1.0}},
+            {"field_b": {"score": 0.5, "weight": 0.5}},
+        ]
+        aggregated = _aggregate_critic_stats(per_run_scores)
+        # field_a: reported by the first run only, so the second slot is 0.0
+        assert aggregated["field_a"]["run_scores"] == [0.8, 0.0]
+        # field_b: reported by the second run only, so the first slot is 0.0
+        assert aggregated["field_b"]["run_scores"] == [0.0, 0.5]
+
+    def test_consistency_of_mean_and_std(self) -> None:
+        """mean_score / std_deviation agree with statistics.mean / pstdev of the scores."""
+        from statistics import mean, pstdev
+
+        samples = [0.2, 0.6, 0.4]
+        per_run_scores = [{"f": {"score": sample, "weight": 0.5}} for sample in samples]
+        aggregated = _aggregate_critic_stats(per_run_scores)
+
+        # pstdev (population deviation) is the reference here, not the sample
+        # deviation `stdev`, so the expected value divides by the number of runs.
+        assert aggregated["f"]["mean_score"] == pytest.approx(mean(samples))
+        assert aggregated["f"]["std_deviation"] == pytest.approx(pstdev(samples))
+
+
+# ========================================================================
+# PASS_RULE_LAST failure_reason defensive guard test
+# ========================================================================
+
+
+class TestPassRuleLastFailureReasonGuard:
+    """With PASS_RULE_LAST, a passing aggregate must not carry a failure_reason."""
+
+    def test_last_passed_no_failure_reason(self) -> None:
+        """A passing final run leaves the aggregate failure_reason as None."""
+        # The guard under test (in eval.py) has this shape:
+        #   aggregate_failure_reason = run_evaluations[-1].failure_reason if not passed else None
+        #
+        # Driving _run_case_with_stats would need a mocked LLM, so this test
+        # mirrors that expression on top of _resolve_pass_rule instead.
+        default_rubric = EvalRubric()
+        runs = [
+            EvaluationResult(score=0.3, passed=False, failure_reason="bad"),
+            EvaluationResult(score=0.9, passed=True, failure_reason=None),
+        ]
+        passed, _warning = _resolve_pass_rule(runs, 0.6, PASS_RULE_LAST, default_rubric)
+        assert passed is True
+        # The earlier run's "bad" reason must not leak into a passing aggregate;
+        # the expression below is a copy of the guard, not a call into eval.py.
+        surfaced_reason = None if passed else runs[-1].failure_reason
+        assert surfaced_reason is None
+
+    def test_last_failed_surfaces_failure_reason(self) -> None:
+        """A failing final run exposes its own failure_reason on the aggregate."""
+        default_rubric = EvalRubric()
+        runs = [
+            EvaluationResult(score=0.9, passed=True),
+            EvaluationResult(score=0.3, passed=False, failure_reason="tool mismatch"),
+        ]
+        passed, _warning = _resolve_pass_rule(runs, 0.6, PASS_RULE_LAST, default_rubric)
+        assert passed is False
+        surfaced_reason = None if passed else runs[-1].failure_reason
+        assert surfaced_reason == "tool mismatch"
+
+
+# ========================================================================
+# _resolve_pass_rule with MAJORITY edge cases
+# ========================================================================
+
+
+class TestResolvePassRuleMajorityEdgeCases:
+    """Edge cases for PASS_RULE_MAJORITY: unanimous runs, an even split, a minority of passes."""
+
+    def test_majority_all_warned(self) -> None:
+        """When all runs have warnings, majority should return warning."""
+        rubric = EvalRubric()
+        evals = [
+            EvaluationResult(score=0.5, passed=False, warning=True),
+            EvaluationResult(score=0.5, passed=False, warning=True),
+            EvaluationResult(score=0.5, passed=False, warning=True),
+        ]
+        passed, warning = _resolve_pass_rule(evals, 0.5, PASS_RULE_MAJORITY, rubric)
+        assert passed is False
+        assert warning is True
+
+    def test_majority_all_failed(self) -> None:
+        """When all runs fail, majority should return fail."""
+        rubric = EvalRubric()
+        evals = [
+            EvaluationResult(score=0.1, passed=False),
+            EvaluationResult(score=0.2, passed=False),
+            EvaluationResult(score=0.15, passed=False),
+        ]
+        passed, warning = _resolve_pass_rule(evals, 0.15, PASS_RULE_MAJORITY, rubric)
+        assert passed is False
+        assert warning is False
+
+    def test_majority_tie_does_not_pass(self) -> None:
+        """With a 50/50 even split, there is no majority, so it fails."""
+        evals = [
+            EvaluationResult(score=0.9, passed=True),
+            EvaluationResult(score=0.1, passed=False),
+        ]
+        # majority = 2 // 2 + 1 = 2, passed_count=1 < 2
+        passed, _ = _resolve_pass_rule(evals, 0.5, PASS_RULE_MAJORITY, EvalRubric())
+        assert passed is False
+
+    def test_majority_minority_of_passes_fails(self) -> None:
+        """With one pass against two failures there is no majority, so it fails."""
+        evals = [
+            EvaluationResult(score=0.9, passed=True),
+            EvaluationResult(score=0.1, passed=False),
+            EvaluationResult(score=0.1, passed=False),
+        ]
+        passed, _ = _resolve_pass_rule(evals, 0.36, PASS_RULE_MAJORITY, EvalRubric())
+        assert passed is False
diff --git a/pyproject.toml b/pyproject.toml
index 3125433a..40e18aeb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "arcade-mcp"
-version = "1.8.3"
+version = "1.9.0"
 description = "Arcade.dev - Tool Calling platform for Agents"
 readme = "README.md"
 license = { file = "LICENSE" }
diff --git a/toolkits/linkedin/evals/eval_linkedin.py b/toolkits/linkedin/evals/eval_linkedin.py
index 5a4c005f..4c25d50b 100644
--- a/toolkits/linkedin/evals/eval_linkedin.py
+++ b/toolkits/linkedin/evals/eval_linkedin.py
@@ -21,7 +21,7 @@ catalog.add_module(arcade_linkedin)
 
 
 @tool_eval()
-def linkedin_eval_suite():
+def linkedin_eval_suite() -> EvalSuite:
     suite = EvalSuite(
         name="LinkedIn Tools Evaluation",
         system_message="You are an AI assistant with access to LinkedIn tools. Use them to help the user with their tasks.",
diff --git a/toolkits/math/evals/eval_math_tools.py b/toolkits/math/evals/eval_math_tools.py
index f1646b0a..c1ff3859 100644
--- a/toolkits/math/evals/eval_math_tools.py
+++ b/toolkits/math/evals/eval_math_tools.py
@@ -1,3 +1,6 @@
+from collections.abc import Callable
+from typing import Any
+
 from arcade_evals import (
     BinaryCritic,
     EvalRubric,
@@ -44,6 +47,9 @@ from arcade_math.tools.trigonometry import (
     rad_to_deg,
 )
 
+# Type alias for test case tuples: (function, prompt_template, params)
+TestCase = tuple[Callable[..., Any], str, dict[str, Any]]
+
 # Evaluation rubric
 rubric = EvalRubric(
     fail_threshold=0.85,
@@ -56,7 +62,7 @@ catalog.add_module(arcade_math)
 
 
 @tool_eval()
-def math_eval_suite():
+def math_eval_suite() -> EvalSuite:
     suite = EvalSuite(
         name="Math Tools Evaluation",
         system_message="You're an AI assistant with access to math tools. Use them to help the user with their math-related tasks.",
@@ -65,7 +71,7 @@ def math_eval_suite():
     )
 
     list_param = ["1", "2", "3", "4", "5"]
-    funcs_to_expression_and_params = [
+    funcs_to_expression_and_params: list[TestCase] = [
         # unary
         (sqrt, "What's the square root of {a}?", {"a": "25"}),
         (abs_val, "What's the absolute value of {a}?", {"a": "-10"}),