@EricGustin, you can use this CLI command:
```
uv run arcade evals mcp_building_evals_results/eval_toolkit_iteration_dict.py \
-p openai:gpt-4o,gpt-4o-mini \
-p anthropic:claude-sonnet-4-20250514 \
-k openai:$OPENAI_API_KEY \
-k anthropic:$ANTHROPIC_API_KEY \
-d \
--num-runs 3 \
--seed random \
--multi-run-pass-rule majority \
--max-concurrent 6 \
-o mcp_building_evals_results/results
```
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Touches core eval execution and all result formatters while adding new
CLI inputs and output schema (`run_stats`/`critic_stats` and capture
`runs`), so regressions could affect evaluation results and report
compatibility despite being additive and validated.
>
> **Overview**
> Adds **multi-run evaluation support** to `arcade evals` via new flags
`--num-runs`, `--seed`, and `--multi-run-pass-rule`, with upfront
validation and plumbing through the CLI runner into eval/capture suite
execution.
>
> Fixes provider selection UX/bug by making `--use-provider/-p`
**repeatable** (instead of a space-delimited string), updates
docs/examples accordingly, and extends capture mode to optionally record
**per-run tool calls** (`CapturedRun`) when `num_runs > 1`.
>
> Enhances all output formatters (HTML/Markdown/Text/JSON) to
**propagate and display** per-case `run_stats` and `critic_stats`,
including new HTML UI for run tabs/cards and comparative tables showing
mean ± stddev when multi-run data is present.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
2ee1654b7d1fbb9538373507355636164b16a066. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
1158 lines
45 KiB
Python
1158 lines
45 KiB
Python
"""Plain text formatter for evaluation and capture results."""
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
from arcade_cli.formatters.base import (
|
|
CaptureFormatter,
|
|
CaptureResults,
|
|
ComparativeCaseData,
|
|
EvalResultFormatter,
|
|
compute_track_differences,
|
|
find_best_model,
|
|
group_comparative_by_case,
|
|
group_comparative_by_case_first,
|
|
group_eval_for_comparison,
|
|
group_results_by_model,
|
|
is_comparative_result,
|
|
is_multi_model_capture,
|
|
is_multi_model_comparative,
|
|
is_multi_model_eval,
|
|
)
|
|
|
|
|
|
class TextFormatter(EvalResultFormatter):
|
|
"""
|
|
Plain text formatter for evaluation results.
|
|
|
|
Produces output similar to pytest's format with simple ASCII formatting.
|
|
"""
|
|
|
|
@property
def file_extension(self) -> str:
    """Return the file extension (no leading dot) for plain-text reports."""
    extension = "txt"
    return extension
|
|
|
|
def format(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render evaluation results as plain text.

    Picks one of three renderers based on the shape of ``results``:
    comparative results take precedence, then multi-model results, and
    everything else goes through the regular renderer. All arguments are
    forwarded unchanged to the selected renderer.
    """
    if is_comparative_result(results):
        renderer = self._format_comparative
    elif is_multi_model_eval(results):
        renderer = self._format_multi_model
    else:
        renderer = self._format_regular

    return renderer(results, show_details, failed_only, original_counts, include_context)
|
|
|
|
def _format_regular(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render regular (non-comparative) results grouped by model, then suite.

    Every case produces a one-line status/score entry. With ``show_details``
    the user input, optional context (when ``include_context`` is set),
    run/critic statistics and the per-critic breakdown follow. The summary
    lines are appended last.
    """
    grouped, passed_total, failed_total, warned_total, case_total = group_results_by_model(
        results
    )
    out: list[str] = []

    for model_name, suite_map in grouped.items():
        out += [f"Model: {model_name}", "=" * 60]

        for suite_label, suite_cases in suite_map.items():
            out += [f" Suite: {suite_label}", " " + "-" * 56]

            for entry in suite_cases:
                outcome = entry["evaluation"]
                if outcome.passed:
                    verdict = "PASSED"
                elif outcome.warning:
                    verdict = "WARNED"
                else:
                    verdict = "FAILED"

                pct = outcome.score * 100
                multi = entry.get("run_stats") or {}
                # Only cases that were executed more than once get the
                # "(n=..., sd=...)" suffix on their status line.
                suffix = ""
                if multi.get("num_runs", 1) > 1:
                    spread = multi.get("std_deviation", 0.0) * 100
                    suffix = f" (n={multi['num_runs']}, sd={spread:.2f}%)"
                out.append(f" {verdict} {entry['name']} -- Score: {pct:.2f}%{suffix}")

                if not show_details:
                    continue

                out += [f" User Input: {entry['input']}", ""]

                if include_context:
                    system_text = entry.get("system_message")
                    history = entry.get("additional_messages")
                    if system_text or history:
                        out.append(" Context:")
                        if system_text:
                            out.append(f" System: {system_text}")
                        if history:
                            out.append(f" Conversation ({len(history)} messages):")
                            out.extend(
                                f" {text}" for text in self._format_conversation_text(history)
                            )
                        out.append("")

                out.append(" Details:")
                out.extend(f" {text}" for text in self._format_run_stats(entry))
                out.extend(f" {text}" for text in self._format_critic_stats(entry))
                out.extend(
                    f" {text}" for text in self._format_evaluation(outcome).split("\n")
                )
                out.append(" " + "-" * 52)

            out.append("")

        out.append("")

    out.extend(
        self._format_summary_lines(
            case_total,
            passed_total,
            failed_total,
            warned_total,
            failed_only,
            original_counts,
        )
    )
    return "\n".join(out)
|
|
|
|
def _format_evaluation(self, evaluation: Any) -> str:
|
|
"""Format evaluation details."""
|
|
result_lines = []
|
|
if evaluation.failure_reason:
|
|
result_lines.append(f"Failure Reason: {evaluation.failure_reason}")
|
|
else:
|
|
for critic_result in evaluation.results:
|
|
is_criticized = critic_result.get("is_criticized", True)
|
|
field = critic_result["field"]
|
|
score = critic_result["score"]
|
|
weight = critic_result["weight"]
|
|
expected = critic_result["expected"]
|
|
actual = critic_result["actual"]
|
|
|
|
if is_criticized:
|
|
match_str = "Match" if critic_result["match"] else "No Match"
|
|
result_lines.append(
|
|
f"{field}: {match_str}\n"
|
|
f" Score: {score:.2f}/{weight:.2f}\n"
|
|
f" Expected: {expected}\n"
|
|
f" Actual: {actual}"
|
|
)
|
|
else:
|
|
result_lines.append(
|
|
f"{field}: Un-criticized\n Expected: {expected}\n Actual: {actual}"
|
|
)
|
|
return "\n".join(result_lines)
|
|
|
|
def _format_run_stats(self, case: dict[str, Any]) -> list[str]:
|
|
run_stats = case.get("run_stats")
|
|
if not run_stats or run_stats.get("num_runs", 1) < 2:
|
|
return []
|
|
scores = run_stats.get("scores", [])
|
|
scores_display = ", ".join(f"{score * 100:.2f}%" for score in scores)
|
|
mean_pct = run_stats.get("mean_score", 0.0) * 100
|
|
std_pct = run_stats.get("std_deviation", 0.0) * 100
|
|
lines = [
|
|
"Run Stats:",
|
|
f" Runs: {run_stats.get('num_runs', len(scores))}",
|
|
f" Mean Score: {mean_pct:.2f}%",
|
|
f" Std Deviation: {std_pct:.2f}%",
|
|
]
|
|
if scores_display:
|
|
lines.append(f" Scores: {scores_display}")
|
|
seed_policy = run_stats.get("seed_policy")
|
|
run_seeds = run_stats.get("run_seeds")
|
|
if seed_policy:
|
|
lines.append(f" Seed Policy: {seed_policy}")
|
|
if run_seeds and any(seed is not None for seed in run_seeds):
|
|
seeds_display = ", ".join(str(seed) for seed in run_seeds)
|
|
lines.append(f" Run Seeds: {seeds_display}")
|
|
pass_rule = run_stats.get("pass_rule")
|
|
if pass_rule:
|
|
lines.append(f" Pass Rule: {pass_rule}")
|
|
|
|
runs = run_stats.get("runs", [])
|
|
if runs:
|
|
lines.append(" Run Results:")
|
|
for idx, run in enumerate(runs, start=1):
|
|
if run.get("passed"):
|
|
status = "PASSED"
|
|
elif run.get("warning"):
|
|
status = "WARNED"
|
|
else:
|
|
status = "FAILED"
|
|
score_pct = run.get("score", 0.0) * 100
|
|
run_line = f" Run {idx}: {status} ({score_pct:.2f}%)"
|
|
failure_reason = run.get("failure_reason")
|
|
if failure_reason:
|
|
run_line += f" -- {failure_reason}"
|
|
lines.append(run_line)
|
|
lines.append("")
|
|
return lines
|
|
|
|
def _format_critic_stats(self, case: dict[str, Any]) -> list[str]:
|
|
critic_stats = case.get("critic_stats")
|
|
if not critic_stats:
|
|
return []
|
|
lines = ["Critic Stats:"]
|
|
for field, stats in critic_stats.items():
|
|
weight = stats.get("weight", 0.0)
|
|
mean_norm = stats.get("mean_score_normalized", 0.0) * 100
|
|
std_norm = stats.get("std_deviation_normalized", 0.0) * 100
|
|
mean_weighted = stats.get("mean_score", 0.0) * 100
|
|
std_weighted = stats.get("std_deviation", 0.0) * 100
|
|
lines.append(
|
|
f" {field}: norm {mean_norm:.2f}% ± {std_norm:.2f}% | "
|
|
f"weighted {mean_weighted:.2f}% ± {std_weighted:.2f}% (w={weight:.2f})"
|
|
)
|
|
lines.append("")
|
|
return lines
|
|
|
|
# =========================================================================
|
|
# MULTI-MODEL EVALUATION FORMATTING
|
|
# =========================================================================
|
|
|
|
def _format_multi_model(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render multi-model results as summary and comparison tables.

    The report contains a per-model summary table, a per-suite cross-model
    comparison table (one row per case, one column per model plus a "Best"
    column), optional per-case details, and an overall summary.

    NOTE(review): ``include_context`` is accepted but not consulted anywhere
    in this method -- confirm whether context should be shown in the details.
    """
    by_suite, models, stats_by_model = group_eval_for_comparison(results)
    banner = "=" * 78
    rule = "-" * 78

    out: list[str] = [
        banner,
        "MULTI-MODEL EVALUATION RESULTS",
        banner,
        "",
        f"Models: {', '.join(models)}",
        "",
        rule,
        "PER-MODEL SUMMARY",
        rule,
        "",
    ]

    summary_header = f"{'Model':<20} {'Passed':>8} {'Failed':>8} {'Warned':>8} {'Total':>8} {'Pass Rate':>10}"
    out += [summary_header, "-" * len(summary_header)]

    # Strict ">" means the first model reaching the top pass rate is kept.
    leader = None
    leader_rate = -1.0
    for name in models:
        model_stats = stats_by_model[name]
        pass_rate = model_stats["pass_rate"]
        if pass_rate > leader_rate:
            leader_rate, leader = pass_rate, name
        out.append(
            f"{name:<20} {model_stats['passed']:>8} {model_stats['failed']:>8} "
            f"{model_stats['warned']:>8} {model_stats['total']:>8} {pass_rate:>9.1f}%"
        )

    out.append("")
    if leader:
        out += [f"Best Overall: {leader} ({leader_rate:.1f}% pass rate)", ""]

    out += [rule, "CROSS-MODEL COMPARISON", rule, ""]

    # Fixed column widths; names longer than (width - 1) are cut to fit.
    case_w, model_w, best_w = 30, 12, 15

    for suite_label, suite_cases in by_suite.items():
        out += [f"Suite: {suite_label}", ""]

        columns = [f"{'Case':<{case_w}}"]
        columns += [f"{name[: model_w - 1]:>{model_w}}" for name in models]
        columns.append(f"{'Best':>{best_w}}")
        table_header = " ".join(columns)
        out += [table_header, "-" * len(table_header)]

        for case_label, per_model in suite_cases.items():
            cells = [f"{case_label[: case_w - 1]:<{case_w}}"]

            for name in models:
                if name in per_model:
                    outcome = per_model[name]["evaluation"]
                    pct = outcome.score * 100
                    if outcome.passed:
                        tag = "OK"
                    elif outcome.warning:
                        tag = "WN"
                    else:
                        tag = "FL"
                    cell = f"{tag} {pct:.0f}%"
                else:
                    cell = "-"
                cells.append(f"{cell:>{model_w}}")

            winner, _ = find_best_model(per_model)
            # "Tie" is shorter than the column, so slicing leaves it intact.
            winner_cell = winner[: best_w - 1] if winner else "-"
            cells.append(f"{winner_cell:>{best_w}}")

            out.append(" ".join(cells))

        out.append("")

        if show_details:
            out += [" Detailed Results:", " " + "-" * 70]

            for case_label, per_model in suite_cases.items():
                out.append(f" Case: {case_label}")

                for name in models:
                    if name not in per_model:
                        continue

                    record = per_model[name]
                    outcome = record["evaluation"]
                    out.append(f" [{name}] Score: {outcome.score * 100:.1f}%")
                    out.extend(f" {text}" for text in self._format_run_stats(record))
                    out.extend(f" {text}" for text in self._format_critic_stats(record))
                    out.extend(
                        f" {text}" for text in self._format_evaluation(outcome).split("\n")
                    )
                    out.append("")

                out.append("")

    # Totals are summed across models, so a case counts once per model.
    evaluations_total = sum(entry["total"] for entry in stats_by_model.values())
    passed_total = sum(entry["passed"] for entry in stats_by_model.values())
    failed_total = sum(entry["failed"] for entry in stats_by_model.values())
    warned_total = sum(entry["warned"] for entry in stats_by_model.values())

    out.append(banner)
    if failed_only and original_counts:
        orig_total, orig_passed, orig_failed, orig_warned = original_counts
        out.append("Note: Showing only failed evaluations (--only-failed)")
        out.append(
            f"Summary -- Total: {orig_total} -- Passed: {orig_passed} -- "
            f"Failed: {orig_failed} -- Warned: {orig_warned}"
        )
    else:
        distinct_cases = sum(len(suite_cases) for suite_cases in by_suite.values())
        out.append(
            f"Summary -- Unique Cases: {distinct_cases} -- "
            f"Total Evaluations: {evaluations_total} ({len(models)} models)"
        )
        out.append(
            f" Passed: {passed_total} -- Failed: {failed_total} -- Warned: {warned_total}"
        )
    out.append("")

    return "\n".join(out)
|
|
|
|
# =========================================================================
|
|
# COMPARATIVE EVALUATION FORMATTING
|
|
# =========================================================================
|
|
|
|
def _format_comparative(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render comparative results, showing tracks side by side.

    Multi-model comparative results use the case-first layout; everything
    else uses the single-model layout. All arguments are forwarded unchanged.
    """
    renderer = (
        self._format_comparative_case_first
        if is_multi_model_comparative(results)
        else self._format_comparative_single_model
    )
    return renderer(results, show_details, failed_only, original_counts, include_context)
|
|
|
|
def _format_comparative_single_model(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render single-model comparative results grouped by model and suite.

    Each case is rendered as a track comparison table, preceded by its
    context block when ``include_context`` is set. Summary lines close the
    report.
    """
    (
        groups,
        passed_total,
        failed_total,
        warned_total,
        case_total,
        tracks_by_suite,
    ) = group_comparative_by_case(results)

    # Union of track names across suites, in first-seen order.
    every_track = list(
        dict.fromkeys(name for names in tracks_by_suite.values() for name in names)
    )

    banner = "=" * 76
    out: list[str] = [
        banner,
        "COMPARATIVE EVALUATION RESULTS",
        banner,
        "",
        f"All Tracks: {' vs '.join(every_track)}",
        "",
    ]

    for model_name, suite_map in groups.items():
        out += [f"Model: {model_name}", banner]

        for suite_label, suite_cases in suite_map.items():
            # Track order is specific to each suite.
            ordered_tracks = tracks_by_suite.get(suite_label, [])
            out += [
                f" Suite: {suite_label} (Comparative)",
                f" Tracks: {' vs '.join(ordered_tracks)}",
                " " + "-" * 72,
            ]

            for case_label, payload in suite_cases.items():
                if include_context:
                    out.extend(
                        self._format_context_block(
                            payload.get("system_message"),
                            payload.get("additional_messages"),
                        )
                    )
                out.extend(
                    self._format_comparative_case_text(
                        case_label, payload, ordered_tracks, show_details
                    )
                )

            out.append("")

    out.extend(
        self._format_summary_lines(
            case_total,
            passed_total,
            failed_total,
            warned_total,
            failed_only,
            original_counts,
        )
    )
    return "\n".join(out)
|
|
|
|
def _format_comparative_case_first(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """Render multi-model comparative results grouped by suite, then case.

    For each case the input (and optional context) is taken from the first
    model's data, then every model's track comparison is listed in model
    order. Models with no data for a case get a "(no data)" line.
    """
    (
        by_suite,
        models,
        tracks_by_suite,
        passed_total,
        failed_total,
        warned_total,
        case_total,
    ) = group_comparative_by_case_first(results)

    # Union of track names across suites, in first-seen order.
    every_track = list(
        dict.fromkeys(name for names in tracks_by_suite.values() for name in names)
    )

    banner = "=" * 78
    rule = "-" * 78
    case_rule = " " + "=" * 72

    out: list[str] = [
        banner,
        "COMPARATIVE EVALUATION RESULTS (MULTI-MODEL)",
        banner,
        "",
        f"Models: {', '.join(models)}",
        f"Tracks: {', '.join(every_track)}",
        "",
    ]

    for suite_label, suite_cases in by_suite.items():
        ordered_tracks = tracks_by_suite.get(suite_label, [])
        out += [
            rule,
            f"SUITE: {suite_label}",
            f"Tracks: {' vs '.join(ordered_tracks)}",
            rule,
            "",
        ]

        for case_label, per_model in suite_cases.items():
            out += [case_rule, f" CASE: {case_label}", case_rule]

            # Input and context are read from whichever model comes first.
            sample = next(iter(per_model.values()), {})
            prompt = sample.get("input", "")
            if prompt:
                out.append(f" Input: {prompt}")

            if include_context:
                context = self._format_context_block(
                    sample.get("system_message"),
                    sample.get("additional_messages"),
                )
                if context:
                    out.append("")
                    out.extend(context)

            out.append("")

            for name in models:
                if name not in per_model:
                    out += [f" [{name}] (no data)", ""]
                    continue

                out.append(f" [{name}]")
                rendered = self._format_comparative_case_text(
                    case_label, per_model[name], ordered_tracks, show_details
                )
                # Nest the per-model table one level under the model label.
                out.extend(" " + text for text in rendered)
                out.append("")

    out.append(banner)
    out.extend(
        self._format_summary_lines(
            case_total,
            passed_total,
            failed_total,
            warned_total,
            failed_only,
            original_counts,
        )
    )
    return "\n".join(out)
|
|
|
|
def _format_comparative_case_text(
    self,
    case_name: str,
    case_data: ComparativeCaseData,
    track_order: list[str],
    show_details: bool,
) -> list[str]:
    """Return the text lines for one comparative case.

    Produces a case header, a boxed table with one row per track (status,
    score, and the fields that differ from the first track, which is
    labelled as the baseline), and, when ``show_details`` is set, per-track
    run/critic statistics and critic details.
    """
    per_track = case_data.get("tracks", {})
    thin_rule = " " + "─" * 68
    row_template = " │ {:20s} │ {:8s} │ {:8s} │ {:24s} │"
    # Horizontal bars sized to the four column widths (content + padding).
    bars = ["─" * width for width in (22, 10, 10, 26)]

    out: list[str] = [
        "",
        thin_rule,
        f" CASE: {case_name}",
        thin_rule,
        f" Input: {case_data.get('input', 'N/A')}",
        "",
    ]

    changed_fields = compute_track_differences(case_data, track_order)

    out.append(" ┌─ COMPARISON ─────────────────────────────────────────────────────┐")
    out.append(row_template.format("Track", "Status", "Score", "Differences"))
    out.append(" ├" + "┼".join(bars) + "┤")

    for track in track_order:
        if track not in per_track:
            out.append(row_template.format(track[:20], "N/A", "N/A", "No data"))
            continue

        outcome = per_track[track].get("evaluation")
        if not outcome:
            out.append(row_template.format(track[:20], "N/A", "N/A", "No evaluation"))
            continue

        if outcome.passed:
            verdict = "PASSED"
        elif outcome.warning:
            verdict = "WARNED"
        else:
            verdict = "FAILED"

        score_text = f"{outcome.score * 100:.1f}%"

        differing = changed_fields.get(track, [])
        if track == track_order[0]:
            note = "(baseline)"
        elif differing:
            note = ", ".join(differing)[:24]
        else:
            note = "—"

        out.append(row_template.format(track[:20], verdict, score_text, note[:24]))

    out.append(" └" + "┴".join(bars) + "┘")
    out.append("")

    if show_details:
        for track in track_order:
            if track not in per_track:
                continue

            record = per_track[track]
            outcome = record.get("evaluation")
            if not outcome:
                continue

            out.append(f" [{track}] Details:")
            out.extend(f" {text}" for text in self._format_run_stats(record))
            out.extend(f" {text}" for text in self._format_critic_stats(record))
            out.extend(f" {text}" for text in self._format_evaluation(outcome).split("\n"))
            out.append("")

    return out
|
|
|
|
def _format_summary_lines(
|
|
self,
|
|
total_cases: int,
|
|
total_passed: int,
|
|
total_failed: int,
|
|
total_warned: int,
|
|
failed_only: bool,
|
|
original_counts: tuple[int, int, int, int] | None,
|
|
) -> list[str]:
|
|
"""Build the summary lines used by regular and comparative formatters."""
|
|
lines: list[str] = []
|
|
if failed_only and original_counts:
|
|
orig_total, orig_passed, orig_failed, orig_warned = original_counts
|
|
lines.append(f"Note: Showing only {total_cases} failed evaluation(s) (--only-failed)")
|
|
summary = f"Summary -- Total: {orig_total} -- Passed: {orig_passed}"
|
|
if orig_warned > 0:
|
|
summary += f" -- Warnings: {orig_warned}"
|
|
if orig_failed > 0:
|
|
summary += f" -- Failed: {orig_failed}"
|
|
else:
|
|
summary = f"Summary -- Total: {total_cases} -- Passed: {total_passed}"
|
|
if total_warned > 0:
|
|
summary += f" -- Warnings: {total_warned}"
|
|
if total_failed > 0:
|
|
summary += f" -- Failed: {total_failed}"
|
|
lines.append(summary)
|
|
lines.append("")
|
|
return lines
|
|
|
|
def _format_context_block(
|
|
self,
|
|
system_msg: str | None,
|
|
additional_messages: list[dict] | None,
|
|
indent: str = " ",
|
|
) -> list[str]:
|
|
"""Build the context section lines for comparative display.
|
|
|
|
Args:
|
|
system_msg: The system message, if any.
|
|
additional_messages: Conversation messages, if any.
|
|
indent: Base indentation prefix for each line.
|
|
|
|
Returns:
|
|
List of formatted lines (empty if no context data).
|
|
"""
|
|
if not system_msg and not additional_messages:
|
|
return []
|
|
lines: list[str] = []
|
|
lines.append(indent + "-" * 40)
|
|
lines.append(indent + "📋 CONTEXT")
|
|
lines.append(indent + "-" * 40)
|
|
if system_msg:
|
|
lines.append(f"{indent}System Message: {system_msg}")
|
|
if additional_messages:
|
|
lines.append(f"{indent}💬 Conversation ({len(additional_messages)} messages):")
|
|
for conv_line in self._format_conversation_text(additional_messages):
|
|
lines.append(f"{indent}{conv_line}")
|
|
lines.append(indent + "-" * 40)
|
|
return lines
|
|
|
|
def _format_conversation_text(self, messages: list[dict]) -> list[str]:
|
|
"""Format conversation messages as plain text for context display."""
|
|
lines: list[str] = []
|
|
|
|
for msg in messages:
|
|
role = msg.get("role", "unknown").upper()
|
|
content = msg.get("content", "")
|
|
tool_calls = msg.get("tool_calls", [])
|
|
name = msg.get("name", "")
|
|
|
|
role_label = f"[{role}]" if not name else f"[{role}: {name}]"
|
|
lines.append(f" {role_label}")
|
|
|
|
if content:
|
|
# For tool responses, try to format JSON nicely
|
|
if role.lower() == "tool":
|
|
try:
|
|
parsed = json.loads(content)
|
|
formatted = json.dumps(parsed, indent=2)
|
|
for json_line in formatted.split("\n"):
|
|
lines.append(f" {json_line}")
|
|
except (json.JSONDecodeError, TypeError):
|
|
lines.append(f" {content}")
|
|
else:
|
|
lines.append(f" {content}")
|
|
|
|
# Handle tool calls in assistant messages
|
|
if tool_calls:
|
|
for tc in tool_calls:
|
|
func = tc.get("function", {})
|
|
tc_name = func.get("name", "unknown")
|
|
tc_args = func.get("arguments", "{}")
|
|
lines.append(f" 🔧 {tc_name}")
|
|
try:
|
|
args_dict = json.loads(tc_args) if isinstance(tc_args, str) else tc_args
|
|
formatted = json.dumps(args_dict, indent=2)
|
|
for arg_line in formatted.split("\n"):
|
|
lines.append(f" {arg_line}")
|
|
except (json.JSONDecodeError, TypeError):
|
|
lines.append(f" {tc_args}")
|
|
|
|
return lines
|
|
|
|
|
|
class CaptureTextFormatter(CaptureFormatter):
|
|
"""Plain text formatter for capture results."""
|
|
|
|
@property
def file_extension(self) -> str:
    """Return the file extension (no leading dot) for plain-text capture reports."""
    extension = "txt"
    return extension
|
|
|
|
def format(
    self,
    captures: CaptureResults,
    include_context: bool = False,
) -> str:
    """Format capture results as plain text.

    Multi-model captures use the multi-model layout; all others use the
    single-model layout. Both arguments are forwarded unchanged.
    """
    renderer = (
        self._format_multi_model
        if is_multi_model_capture(captures)
        else self._format_single_model
    )
    return renderer(captures, include_context)
|
|
|
|
def _format_single_model(
|
|
self,
|
|
captures: CaptureResults,
|
|
include_context: bool = False,
|
|
) -> str:
|
|
"""Format single-model capture results."""
|
|
lines: list[str] = []
|
|
lines.append("=" * 70)
|
|
lines.append("CAPTURE RESULTS")
|
|
lines.append("=" * 70)
|
|
lines.append("")
|
|
|
|
total_cases = 0
|
|
total_calls = 0
|
|
|
|
for capture in captures:
|
|
lines.append(f"Suite: {capture.suite_name}")
|
|
lines.append(f"Model: {capture.model}")
|
|
lines.append(f"Provider: {capture.provider}")
|
|
lines.append("-" * 70)
|
|
|
|
for case in capture.captured_cases:
|
|
total_cases += 1
|
|
lines.append("")
|
|
lines.append(f" Case: {case.case_name}")
|
|
# track_name is set for comparative cases
|
|
track_name = getattr(case, "track_name", None)
|
|
if track_name:
|
|
lines.append(f" Track: {track_name}")
|
|
lines.append(f" User Message: {case.user_message}")
|
|
|
|
if include_context and case.system_message:
|
|
lines.append(f" System Message: {case.system_message}")
|
|
|
|
lines.append("")
|
|
lines.append(" Tool Calls:")
|
|
runs = getattr(case, "runs", None)
|
|
if runs:
|
|
for run_index, run in enumerate(runs, start=1):
|
|
lines.append(f" Run {run_index}:")
|
|
if run.tool_calls:
|
|
for tc in run.tool_calls:
|
|
total_calls += 1
|
|
lines.append(f" - {tc.name}")
|
|
if tc.args:
|
|
for key, value in tc.args.items():
|
|
lines.append(
|
|
f" {key}: {self._format_value(value)}"
|
|
)
|
|
else:
|
|
lines.append(" (no tool calls)")
|
|
elif case.tool_calls:
|
|
for tc in case.tool_calls:
|
|
total_calls += 1
|
|
lines.append(f" - {tc.name}")
|
|
if tc.args:
|
|
for key, value in tc.args.items():
|
|
lines.append(f" {key}: {self._format_value(value)}")
|
|
else:
|
|
lines.append(" (no tool calls)")
|
|
|
|
if include_context and case.additional_messages:
|
|
lines.append("")
|
|
lines.append(
|
|
f" Conversation Context ({len(case.additional_messages)} messages):"
|
|
)
|
|
lines.extend(self._format_conversation_text(case.additional_messages))
|
|
|
|
lines.append("")
|
|
|
|
lines.append("")
|
|
|
|
lines.append("=" * 70)
|
|
lines.append(f"Summary: {total_calls} tool calls across {total_cases} cases")
|
|
lines.append("")
|
|
|
|
return "\n".join(lines)
|
|
|
|
def _format_multi_model(
    self,
    captures: CaptureResults,
    include_context: bool = False,
) -> str:
    """Render captures from several models as a single plain-text report.

    The captures are regrouped by ``group_captures_by_case_then_track`` into
    suite -> case -> track -> model. For every case, each model listed in
    ``model_order`` gets its recorded tool calls printed (one sub-list per
    run when the captured case carries ``runs``). Cases with named tracks get
    one boxed section per track; otherwise the models are listed directly.

    Args:
        captures: Capture results to render.
        include_context: When true, also print each case's system message
            and additional conversation messages, if it has any.

    Returns:
        The whole report as one newline-joined string.
    """
    from arcade_cli.formatters.base import group_captures_by_case_then_track

    grouped_data, model_order, track_order = group_captures_by_case_then_track(captures)
    has_tracks = len(track_order) > 1 or (track_order and track_order[0] is not None)

    heavy_rule = "=" * 78
    suite_rule = "-" * 78
    case_rule = " " + "=" * 72
    box_top = " " + "┌" + "─" * 70 + "┐"
    box_mid = " " + "├" + "─" * 70 + "┤"
    box_bottom = " " + "└" + "─" * 70 + "┘"

    out: list[str] = [
        heavy_rule,
        "MULTI-MODEL CAPTURE RESULTS",
        heavy_rule,
        "",
        f"Models: {', '.join(model_order)}",
    ]
    if has_tracks:
        named = [t for t in track_order if t is not None]
        out.append(f"Tracks: {' | '.join(named)}")
    out.append("")

    def write_calls(tool_calls, gutter: str) -> None:
        # One line per call, followed by one line per argument.
        for call in tool_calls:
            out.append(f"{gutter} - {call.name}")
            if call.args:
                out.extend(
                    f"{gutter} {key}: {self._format_value(value)}"
                    for key, value in call.args.items()
                )

    def write_case(captured_case, gutter: str) -> None:
        # Per-run listing when ``runs`` is present and non-empty, otherwise
        # the case's own tool calls.
        runs = getattr(captured_case, "runs", None)
        if not runs:
            if captured_case.tool_calls:
                write_calls(captured_case.tool_calls, gutter)
            else:
                out.append(f"{gutter} (no tool calls)")
            return
        for number, run in enumerate(runs, start=1):
            out.append(f"{gutter} Run {number}:")
            if run.tool_calls:
                write_calls(run.tool_calls, gutter)
            else:
                out.append(f"{gutter} (no tool calls)")

    def write_models(models_dict, gutter: str, spacer: str) -> None:
        # Models are always walked in ``model_order`` so every case lines up;
        # a model without data gets a placeholder and no spacer line.
        for model in model_order:
            if model in models_dict:
                out.append(f"{gutter} [{model}]")
                write_case(models_dict[model], gutter)
                out.append(spacer)
            else:
                out.append(f"{gutter} [{model}] (no data)")

    for suite_name, cases in grouped_data.items():
        out.extend([suite_rule, f"SUITE: {suite_name}", suite_rule, ""])

        for case_name, case_data in cases.items():
            out.extend([case_rule, f" CASE: {case_name}", case_rule])

            user_msg = case_data.get("user_message", "")
            if user_msg:
                out.extend([f" User Message: {user_msg}", ""])

            tracks_data = case_data.get("tracks", {})
            track_keys = list(tracks_data)
            only_default = len(track_keys) == 1 and track_keys[0] == "_default"

            if track_keys and not only_default:
                # One boxed section per track.
                for track_key in track_keys:
                    track_display = "Default" if track_key == "_default" else track_key
                    out.extend([
                        box_top,
                        f" │ 🏷️ TRACK: {track_display:<57s} │",
                        box_mid,
                    ])
                    write_models(tracks_data[track_key].get("models", {}), " │", " │")
                    out.extend([box_bottom, ""])
            else:
                # No named tracks: list the models of the single/default track.
                track_key = track_keys[0] if track_keys else "_default"
                models_dict = tracks_data.get(track_key, {}).get("models", {})
                out.extend([" Tool Calls by Model:", " " + "-" * 70])
                write_models(models_dict, "", "")

            system_msg = case_data.get("system_message")
            addl_msgs = case_data.get("additional_messages")
            if include_context and (system_msg or addl_msgs):
                out.append(" 📋 Context:")
                if system_msg:
                    out.append(f" System: {system_msg}")
                if addl_msgs:
                    out.append(f" Conversation ({len(addl_msgs)} messages):")
                    out.extend(self._format_conversation_text(addl_msgs))
                out.append("")

            out.append("")

    # Closing summary line.
    total_cases = sum(len(cases) for cases in grouped_data.values())
    if has_tracks:
        track_info = f", {len([t for t in track_order if t])} track(s)"
    else:
        track_info = ""

    out.append(heavy_rule)
    out.append(
        f"Summary: {total_cases} cases across {len(grouped_data)} suite(s), "
        f"{len(model_order)} model(s){track_info}"
    )
    out.append("")

    return "\n".join(out)
|
|
|
|
def _format_conversation_text(self, messages: list[dict]) -> list[str]:
|
|
"""Format conversation messages as plain text."""
|
|
lines: list[str] = []
|
|
|
|
for i, msg in enumerate(messages):
|
|
role = msg.get("role", "unknown")
|
|
content = msg.get("content", "")
|
|
tool_calls = msg.get("tool_calls", [])
|
|
name = msg.get("name", "")
|
|
|
|
# Role indicators
|
|
role_prefix = {
|
|
"user": " [USER]",
|
|
"assistant": " [ASSISTANT]",
|
|
"tool": " [TOOL]",
|
|
"system": " [SYSTEM]",
|
|
}.get(role, f" [{role.upper()}]")
|
|
|
|
# Add separator between messages
|
|
if i > 0:
|
|
lines.append(" " + "-" * 50)
|
|
|
|
# Header
|
|
if role == "tool" and name:
|
|
lines.append(f"{role_prefix} ({name})")
|
|
else:
|
|
lines.append(role_prefix)
|
|
|
|
# Content
|
|
if content:
|
|
# Indent content lines
|
|
for line in content.split("\n"):
|
|
if line.strip():
|
|
lines.append(f" {line}")
|
|
elif role == "assistant" and not content and tool_calls:
|
|
lines.append(" (calling tools...)")
|
|
|
|
# Tool calls for assistant messages
|
|
if tool_calls:
|
|
for tc in tool_calls:
|
|
func = tc.get("function", {})
|
|
tc_name = func.get("name", "unknown")
|
|
tc_args = func.get("arguments", "{}")
|
|
|
|
lines.append(f" -> {tc_name}")
|
|
|
|
# Parse and format arguments
|
|
try:
|
|
args_dict = json.loads(tc_args) if isinstance(tc_args, str) else tc_args
|
|
args_formatted = json.dumps(args_dict, indent=2)
|
|
for arg_line in args_formatted.split("\n"):
|
|
lines.append(f" {arg_line}")
|
|
except (json.JSONDecodeError, TypeError):
|
|
lines.append(f" {tc_args}")
|
|
|
|
return lines
|
|
|
|
def _format_value(self, value: Any) -> str:
|
|
"""Format a value for display, truncating if too long."""
|
|
str_value = str(value)
|
|
if len(str_value) > 60:
|
|
return str_value[:57] + "..."
|
|
return str_value
|