"""HTML formatter for evaluation and capture results with full color support."""
import json
from datetime import datetime, timezone
from typing import Any
from arcade_cli.formatters.base import (
CaptureFormatter,
CaptureResults,
ComparativeCaseData,
EvalResultFormatter,
compute_track_differences,
find_best_model,
group_comparative_by_case,
group_comparative_by_case_first,
group_eval_for_comparison,
group_results_by_model,
is_comparative_result,
is_multi_model_capture,
is_multi_model_comparative,
is_multi_model_eval,
truncate_field_value,
)
class HtmlFormatter(EvalResultFormatter):
"""
HTML formatter for evaluation results.
Produces a styled HTML document with colors matching the terminal output.
Security Note: All user-controllable data MUST be escaped via _escape_html()
before being inserted into HTML. This includes case names, inputs, model names,
suite names, and any evaluation results or error messages.
"""
def __init__(self) -> None:
    """Set up the base formatter and empty element-ID bookkeeping."""
    super().__init__()
    # Every ID string handed out so far; starts empty for each formatter.
    self._used_ids: set[str] = set()
    # Memo from a triple of strings to a previously generated ID.
    # NOTE(review): presumably keyed by (suite name, case name, model), the
    # arguments given to _make_safe_id -- confirm against that helper.
    self._id_cache: dict[tuple[str, str, str], str] = {}
@property
def file_extension(self) -> str:
    """Return the file extension (without a leading dot) for this format."""
    extension = "html"
    return extension
def format(
    self,
    results: list[list[dict[str, Any]]],
    show_details: bool = False,
    failed_only: bool = False,
    original_counts: tuple[int, int, int, int] | None = None,
    include_context: bool = False,
) -> str:
    """
    Render evaluation results as an HTML string.

    Picks one of three renderers based on the shape of ``results`` and
    forwards every argument to it unchanged:

    * comparative results  -> ``_format_comparative``
    * multi-model results  -> ``_format_multi_model``
    * anything else        -> ``_format_regular``

    The comparative check takes precedence; the multi-model check is only
    consulted when the results are not comparative.

    Args:
        results: Nested evaluation results to render.
        show_details: Forwarded to the selected renderer.
        failed_only: Forwarded to the selected renderer.
        original_counts: Optional 4-tuple of counts, forwarded to the
            selected renderer.
        include_context: Forwarded to the selected renderer.

    Returns:
        The string produced by the selected renderer.
    """
    if is_comparative_result(results):
        renderer = self._format_comparative
    elif is_multi_model_eval(results):
        renderer = self._format_multi_model
    else:
        renderer = self._format_regular

    return renderer(results, show_details, failed_only, original_counts, include_context)
def _format_regular(
self,
results: list[list[dict[str, Any]]],
show_details: bool = False,
failed_only: bool = False,
original_counts: tuple[int, int, int, int] | None = None,
include_context: bool = False,
) -> str:
"""Format regular (non-comparative) evaluation results."""
# Use shared grouping logic
model_groups, total_passed, total_failed, total_warned, total_cases = (
group_results_by_model(results)
)
# Calculate pass rate
if total_cases > 0:
if failed_only and original_counts and original_counts[0] > 0:
pass_rate = (original_counts[1] / original_counts[0]) * 100
else:
pass_rate = (total_passed / total_cases) * 100
else:
pass_rate = 0
# Build HTML
html_parts = [self._get_html_header()]
# Title and timestamp
html_parts.append('
')
html_parts.append("
🎯 Evaluation Results
")
html_parts.append(
f'
Generated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")}
'
)
# Summary section
html_parts.append('
')
html_parts.append("
📊 Summary
")
if failed_only and original_counts:
orig_total, orig_passed, orig_failed, orig_warned = original_counts
html_parts.append(
f'
⚠️ Showing only {total_cases} failed evaluation(s)
'
)
html_parts.append('
')
html_parts.append(
f'
Total{orig_total}
'
)
html_parts.append(
f'
Passed{orig_passed}
'
)
if orig_warned > 0:
html_parts.append(
f'
Warnings{orig_warned}
'
)
html_parts.append(
f'
Failed{orig_failed}
'
)
else:
html_parts.append('
')
html_parts.append(
f'
Total{total_cases}
'
)
html_parts.append(
f'
Passed{total_passed}
'
)
if total_warned > 0:
html_parts.append(
f'
Warnings{total_warned}
'
)
if total_failed > 0:
html_parts.append(
f'
Failed{total_failed}
'
)
html_parts.append("
") # stats-grid
html_parts.append(
f'
Pass Rate: {pass_rate:.1f}%
'
)
html_parts.append("
") # summary-section
# Results by model
html_parts.append("
📋 Results by Model
")
for model, suites in model_groups.items():
html_parts.append('
')
html_parts.append(f"
🤖 {self._escape_html(model)}
")
for suite_name, cases in suites.items():
# Show suite/file name
html_parts.append('
')
html_parts.append(
f''
)
# Show summary table only when NOT showing details (avoid duplication)
if not show_details:
has_run_stats = any(
case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases
)
html_parts.append('
')
if has_run_stats:
html_parts.append(
"| Status | Case | Score | Runs |
"
)
else:
html_parts.append(
"| Status | Case | Score |
"
)
html_parts.append("")
for case in cases:
evaluation = case["evaluation"]
if evaluation.passed:
status_class = "passed"
status_text = "✅ PASSED"
elif evaluation.warning:
status_class = "warned"
status_text = "⚠️ WARNED"
else:
status_class = "failed"
status_text = "❌ FAILED"
score_pct = evaluation.score * 100
case_name = self._escape_html(case["name"])
run_stats = case.get("run_stats") or {}
score_display = f"{score_pct:.1f}%"
runs_display = ""
if run_stats.get("num_runs", 1) > 1:
std_pct = run_stats.get("std_deviation", 0.0) * 100
score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%"
runs_display = str(run_stats.get("num_runs", 1))
html_parts.append(f'')
html_parts.append(f'| {status_text} | ')
html_parts.append(f"{case_name} | ")
html_parts.append(f'{score_display} | ')
if has_run_stats:
html_parts.append(f"{runs_display or '-'} | ")
html_parts.append("
")
html_parts.append("
")
# Detailed results - each case is individually expandable
if show_details:
html_parts.append(
'
💡 Click on any case below to expand details
'
)
for case in cases:
evaluation = case["evaluation"]
if evaluation.passed:
status_class = "passed"
status_badge = '
PASSED'
status_icon = "✅"
elif evaluation.warning:
status_class = "warned"
status_badge = '
WARNED'
status_icon = "⚠️"
else:
status_class = "failed"
status_badge = '
FAILED'
status_icon = "❌"
case_name = self._escape_html(case["name"])
score_pct = evaluation.score * 100
# Each case is a collapsible details element (collapsed by default)
html_parts.append(f'
')
html_parts.append(
f''
f"{status_icon} {case_name} "
f'{score_pct:.1f}% '
f"{status_badge}"
f"
"
)
html_parts.append('')
html_parts.append(
f"
Input: {self._escape_html(case['input'])}
"
)
# Context section (if include_context is True)
if include_context:
system_msg = case.get("system_message")
addl_msgs = case.get("additional_messages")
if system_msg or addl_msgs:
html_parts.append('
')
html_parts.append("
📋 Context
")
if system_msg:
html_parts.append(
f'
'
f"System Message: "
f"{self._escape_html(system_msg)}"
f"
"
)
if addl_msgs:
conversation_html = self._format_conversation(addl_msgs)
html_parts.append(
f'
'
f"💬 Conversation Context ({len(addl_msgs)} messages)
"
f"{conversation_html}"
f" "
)
html_parts.append("
")
# Evaluation details
run_id = self._make_safe_id(suite_name, case["name"], model)
html_parts.append(
self._format_evaluation_details(
evaluation,
case.get("run_stats"),
case.get("critic_stats"),
run_id=run_id,
)
)
html_parts.append("
")
html_parts.append(" ")
html_parts.append("
") # suite-section
html_parts.append("
") # model-section
html_parts.append("
") # container
html_parts.append("