') html_parts.append("

🎯 Evaluation Results

") html_parts.append( f'

Generated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")}

' ) # Summary section html_parts.append('

') html_parts.append("

📊 Summary

") if failed_only and original_counts: orig_total, orig_passed, orig_failed, orig_warned = original_counts html_parts.append( f'' ) html_parts.append('

') html_parts.append( f'

Total{orig_total}

' ) html_parts.append( f'

Passed{orig_passed}

' ) if orig_warned > 0: html_parts.append( f'

Warnings{orig_warned}

' ) html_parts.append( f'

Failed{orig_failed}

' ) else: html_parts.append('

') html_parts.append( f'

Total{total_cases}

' ) html_parts.append( f'

Passed{total_passed}

' ) if total_warned > 0: html_parts.append( f'

Warnings{total_warned}

' ) if total_failed > 0: html_parts.append( f'

Failed{total_failed}

' ) html_parts.append("

") # stats-grid html_parts.append( f'

Pass Rate: {pass_rate:.1f}%

' ) html_parts.append("

") # summary-section # Results by model html_parts.append("

📋 Results by Model

") for model, suites in model_groups.items(): html_parts.append('

') html_parts.append(f"

🤖 {self._escape_html(model)}

") for suite_name, cases in suites.items(): # Show suite/file name html_parts.append('

') html_parts.append( f'

📁 {self._escape_html(suite_name)}

' ) # Show summary table only when NOT showing details (avoid duplication) if not show_details: has_run_stats = any( case.get("run_stats", {}).get("num_runs", 1) > 1 for case in cases ) html_parts.append('') if has_run_stats: html_parts.append( "" ) else: html_parts.append( "" ) html_parts.append("") for case in cases: evaluation = case["evaluation"] if evaluation.passed: status_class = "passed" status_text = "✅ PASSED" elif evaluation.warning: status_class = "warned" status_text = "⚠️ WARNED" else: status_class = "failed" status_text = "❌ FAILED" score_pct = evaluation.score * 100 case_name = self._escape_html(case["name"]) run_stats = case.get("run_stats") or {} score_display = f"{score_pct:.1f}%" runs_display = "" if run_stats.get("num_runs", 1) > 1: std_pct = run_stats.get("std_deviation", 0.0) * 100 score_display = f"{score_pct:.1f}% ± {std_pct:.1f}%" runs_display = str(run_stats.get("num_runs", 1)) html_parts.append(f'') html_parts.append(f'') html_parts.append(f"") html_parts.append(f'') if has_run_stats: html_parts.append(f"") html_parts.append("") html_parts.append("

Status	Case	Score	Runs
Status	Case	Score
{status_text}	{case_name}	{score_display}	{runs_display or '-'}

") # Detailed results - each case is individually expandable if show_details: html_parts.append( '

💡 Click on any case below to expand details

' ) for case in cases: evaluation = case["evaluation"] if evaluation.passed: status_class = "passed" status_badge = 'PASSED' status_icon = "✅" elif evaluation.warning: status_class = "warned" status_badge = 'WARNED' status_icon = "⚠️" else: status_class = "failed" status_badge = 'FAILED' status_icon = "❌" case_name = self._escape_html(case["name"]) score_pct = evaluation.score * 100 # Each case is a collapsible details element (collapsed by default) html_parts.append(f'

') html_parts.append( f'

' f"{status_icon} {case_name} " f'{score_pct:.1f}% ' f"{status_badge}" f"

" ) html_parts.append('

') html_parts.append( f"

Input: {self._escape_html(case['input'])}

" ) # Context section (if include_context is True) if include_context: system_msg = case.get("system_message") addl_msgs = case.get("additional_messages") if system_msg or addl_msgs: html_parts.append('

') html_parts.append("

📋 Context

") if system_msg: html_parts.append( f'

' f"System Message: " f"{self._escape_html(system_msg)}" f"

" ) if addl_msgs: conversation_html = self._format_conversation(addl_msgs) html_parts.append( f'

' f"

💬 Conversation Context ({len(addl_msgs)} messages)

" f"{conversation_html}" f"

" ) html_parts.append("

") # Evaluation details run_id = self._make_safe_id(suite_name, case["name"], model) html_parts.append( self._format_evaluation_details( evaluation, case.get("run_stats"), case.get("critic_stats"), run_id=run_id, ) ) html_parts.append("

") html_parts.append("

") # suite-section html_parts.append("

") # model-section html_parts.append("

") # container html_parts.append("") return "\n".join(html_parts) def _format_evaluation_details( self, evaluation: Any, run_stats: dict[str, Any] | None = None, critic_stats: dict[str, Any] | None = None, run_id: str | None = None, ) -> str: """Format evaluation details as HTML table.""" parts: list[str] = [] run_stats_html = self._format_run_stats_html(run_stats, evaluation) if run_stats_html: parts.append(run_stats_html) run_tabs_html = self._format_run_tabs_html(run_stats, run_id) if run_tabs_html: parts.append(run_tabs_html) critic_stats_html = self._format_critic_stats_html(critic_stats) if critic_stats_html: parts.append(critic_stats_html) if evaluation.failure_reason: parts.append( f'

❌ Failure Reason: ' f"{self._escape_html(evaluation.failure_reason)}

" ) return "\n".join(parts) # Only show field details table when there are NO run tabs # (run tabs already show per-run field details, and Critic Stats shows the aggregation) if not run_tabs_html: details_table = self._format_critic_results_table(evaluation.results) parts.append(details_table) return "\n".join(parts) def _format_critic_results_table(self, results: list[dict[str, Any]]) -> str: lines = [''] lines.append( "" ) lines.append("") for critic_result in results: is_criticized = critic_result.get("is_criticized", True) field = self._escape_html(critic_result["field"]) score = critic_result["score"] weight = critic_result["weight"] expected = self._escape_html(str(critic_result["expected"])) actual = self._escape_html(str(critic_result["actual"])) # Truncate long values for table readability expected = truncate_field_value(expected) actual = truncate_field_value(actual) if is_criticized: if critic_result["match"]: match_cell = '✅ Match' row_class = "match-row" else: match_cell = '❌ No Match' row_class = "nomatch-row" score_cell = f"{score:.2f}/{weight:.2f}" else: match_cell = '— Un-criticized' row_class = "uncriticized-row" score_cell = "-" lines.append(f'') lines.append(f'') lines.append(f"") lines.append(f'') lines.append(f"") lines.append(f"") lines.append("") lines.append("

Field	Match	Score	Expected	Actual
{field}	{match_cell}	{score_cell}	`{expected}`	`{actual}`

") return "\n".join(lines) def _format_run_stats_html(self, run_stats: dict[str, Any] | None, evaluation: Any) -> str: if not run_stats or run_stats.get("num_runs", 1) < 2: return "" if evaluation.passed: status_label = "PASSED" status_icon = "✅" status_class = "passed" elif evaluation.warning: status_label = "WARNED" status_icon = "⚠️" status_class = "warned" else: status_label = "FAILED" status_icon = "❌" status_class = "failed" mean_pct = run_stats.get("mean_score", 0.0) * 100 std_pct = run_stats.get("std_deviation", 0.0) * 100 num_runs = run_stats.get("num_runs", 0) scores = run_stats.get("scores", []) seed_policy = run_stats.get("seed_policy", "") run_seeds = run_stats.get("run_seeds") or [] pass_rule = run_stats.get("pass_rule", "") # Build score pills for each run score_pills = [] for i, score in enumerate(scores, 1): score_pct = score * 100 if score >= 0.8: pill_class = "score-pill high" elif score >= 0.6: pill_class = "score-pill mid" else: pill_class = "score-pill low" score_pills.append(f'R{i}: {score_pct:.0f}%') scores_html = " ".join(score_pills) if score_pills else "" # Build seeds display seeds_html = "" if run_seeds and any(seed is not None for seed in run_seeds): seeds_display = ", ".join(str(seed) for seed in run_seeds) seeds_html = f'

🎲 Seeds{seeds_display}

' html = f"""

{status_icon} {status_label}

{num_runs} runs

{mean_pct:.1f}% mean score

± {std_pct:.1f}% std dev

{scores_html}

""" return html def _format_critic_stats_html(self, critic_stats: dict[str, Any] | None) -> str: if not critic_stats: return "" lines = ['

📊 Critic Stats

'] lines.append('') lines.append( "" "" "" ) lines.append("") for field, stats in critic_stats.items(): weight = stats.get("weight", 0.0) mean_norm = stats.get("mean_score_normalized", 0.0) * 100 std_norm = stats.get("std_deviation_normalized", 0.0) * 100 mean_weighted = stats.get("mean_score", 0.0) * 100 std_weighted = stats.get("std_deviation", 0.0) * 100 # Color coding based on normalized mean: <60 red, 60-80 yellow, >80 green if mean_norm < 60: score_class = "score-low" elif mean_norm < 80: score_class = "score-mid" else: score_class = "score-high" lines.append( f'' f"" f"" f'' f"" f"" f"" "" ) lines.append("

Field	Weight	Mean (norm %)	Std (norm %)	Mean (weighted %)	Std (weighted %)
{self._escape_html(field)}	{weight:.2f}	{mean_norm:.2f}%	{std_norm:.2f}%	{mean_weighted:.2f}%	{std_weighted:.2f}%

") return "\n".join(lines) def _format_run_tabs_html(self, run_stats: dict[str, Any] | None, run_id: str | None) -> str: if not run_stats or run_stats.get("num_runs", 1) < 2: return "" runs = run_stats.get("runs", []) if not runs or run_id is None: return "" tabs = ['

', '

'] for idx, run in enumerate(runs, start=1): active = "active" if idx == 1 else "" if run.get("passed"): status_class = "passed" elif run.get("warning"): status_class = "warned" else: status_class = "failed" tabs.append( f'' ) tabs.append("

") panels = ['

'] for idx, run in enumerate(runs, start=1): active = "active" if idx == 1 else "" if run.get("passed"): status = "✅ PASSED" status_class = "passed" elif run.get("warning"): status = "⚠️ WARNED" status_class = "warned" else: status = "❌ FAILED" status_class = "failed" score_pct = run.get("score", 0.0) * 100 details = run.get("details", []) panels.append( f'

' ) panels.append(f"

Run {idx}: {status} — {score_pct:.2f}%

") failure_reason = run.get("failure_reason") if failure_reason: panels.append( f'

❌ Failure Reason: ' f"{self._escape_html(str(failure_reason))}

" ) if details: panels.append(self._format_critic_results_table(details)) panels.append("

") panels.append("

") return "\n".join(tabs + panels) def _escape_html(self, text: str) -> str: """Escape HTML special characters.""" return ( text.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'") ) def _make_safe_id(self, suite_name: str, case_name: str, model_name: str = "") -> str: """Generate a safe ID for HTML attributes and CSS selectors. Removes or replaces characters that could break HTML attributes or CSS selectors, including quotes, brackets, and special characters. Ensures uniqueness by appending a counter when duplicates are detected. Args: suite_name: The suite name. case_name: The case name. model_name: Optional model name. Returns: A sanitized string safe for use in HTML id/data attributes, guaranteed unique. """ import re def sanitize(s: str) -> str: # Replace common separators with underscores s = s.replace(" ", "_").replace("-", "_") # Remove brackets and parentheses s = s.replace("[", "").replace("]", "").replace("(", "").replace(")", "") # Remove quotes that would break HTML attributes s = s.replace('"', "").replace("'", "") # Remove any remaining non-alphanumeric characters except underscores s = re.sub(r"[^\w]", "", s) return s # Check cache for idempotence - same inputs should return same ID cache_key = (suite_name, case_name, model_name) if cache_key in self._id_cache: return self._id_cache[cache_key] suite_id = sanitize(suite_name) case_id_part = sanitize(case_name) base_id = f"{suite_id}__{case_id_part}" if model_name: model_id = sanitize(model_name) base_id = f"{model_id}__{base_id}" # Ensure uniqueness by appending a counter if this ID already exists unique_id = base_id counter = 1 while unique_id in self._used_ids: unique_id = f"{base_id}_{counter}" counter += 1 # Cache the result and mark ID as used self._id_cache[cache_key] = unique_id self._used_ids.add(unique_id) return unique_id def _format_conversation(self, messages: list[dict]) -> str: """Format conversation messages as rich HTML for context display.""" html_parts = ['

'] for msg in messages: role = msg.get("role", "unknown") content = msg.get("content") tool_calls = msg.get("tool_calls", []) tool_name = msg.get("name", "") # For tool responses role_class = f"msg msg-{role}" role_label = { "user": "👤 User", "assistant": "🤖 Assistant", "tool": "🔧 Tool", "system": "⚙️ System", }.get(role, f"💬 {role.title()}") # Add tool name to label for tool responses if role == "tool" and tool_name: role_label = f"🔧 Tool ({tool_name})" html_parts.append(f'

') html_parts.append(f'

{role_label}

') if content: # For tool responses, try to format JSON nicely if role == "tool": try: parsed_content = json.loads(content) formatted_content = json.dumps(parsed_content, indent=2) html_parts.append( f'

{self._escape_html(formatted_content)}

' ) except (json.JSONDecodeError, TypeError): # Not valid JSON, show as regular content html_parts.append( f'

{self._escape_html(str(content))}

' ) else: html_parts.append( f'

{self._escape_html(str(content))}

' ) # Handle tool calls in assistant messages if tool_calls: html_parts.append('

') for tc in tool_calls: tc_func = tc.get("function", {}) tc_name = tc_func.get("name", "unknown") tc_args = tc_func.get("arguments", "{}") try: args_formatted = json.dumps(json.loads(tc_args), indent=2) except (json.JSONDecodeError, TypeError): args_formatted = str(tc_args) html_parts.append( f'

' f'🛠️ {self._escape_html(tc_name)}' f'

{self._escape_html(args_formatted)}

' f"

" ) html_parts.append("

") html_parts.append("

") return "\n".join(html_parts) # ========================================================================= # MULTI-MODEL EVALUATION FORMATTING # ========================================================================= def _format_multi_model( self, results: list[list[dict[str, Any]]], show_details: bool = False, failed_only: bool = False, original_counts: tuple[int, int, int, int] | None = None, include_context: bool = False, ) -> str: """Format multi-model evaluation results with comparison tables.""" comparison_data, model_order, per_model_stats = group_eval_for_comparison(results) # Build HTML html_parts = [self._get_html_header()] html_parts.append(self._get_multi_model_styles()) # Container html_parts.append('

') html_parts.append("

🔄 Multi-Model Evaluation Results

") html_parts.append( f'

Generated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")}

' ) html_parts.append(f'

Models: {", ".join(model_order)}

') # Per-Model Summary Section html_parts.append('

') html_parts.append("

📊 Per-Model Summary

") html_parts.append('') html_parts.append("") html_parts.append( "" ) html_parts.append("") best_model = None best_rate = -1.0 for model in model_order: stats = per_model_stats[model] rate = stats["pass_rate"] if rate > best_rate: best_rate = rate best_model = model row_class = "best-model" if rate == best_rate and best_model == model else "" html_parts.append(f'') html_parts.append(f'') html_parts.append(f'') html_parts.append(f'') html_parts.append(f'') html_parts.append(f"") html_parts.append(f'') html_parts.append("") html_parts.append("

Model	Passed	Failed	Warned	Total	Pass Rate
{self._escape_html(model)}	{stats["passed"]}	{stats["failed"]}	{stats["warned"]}	{stats['total']}	{rate:.1f}%

") if best_model: html_parts.append( f'

🏆 Best Overall: {self._escape_html(best_model)} ({best_rate:.1f}% pass rate)

' ) html_parts.append("

") # Cross-Model Comparison Section html_parts.append('

') html_parts.append("

⚔️ Cross-Model Comparison

") for suite_name, cases in comparison_data.items(): html_parts.append('

') html_parts.append(f"

Suite: {self._escape_html(suite_name)}

") # Comparison table html_parts.append('') html_parts.append("") html_parts.append("") for model in model_order: html_parts.append(f"") html_parts.append("") html_parts.append("") for case_name, case_models in cases.items(): html_parts.append("") html_parts.append(f'') for model in model_order: if model in case_models: evaluation = case_models[model]["evaluation"] run_stats = case_models[model].get("run_stats") score = evaluation.score * 100 if evaluation.passed: cell_class = "passed" icon = "✓" elif evaluation.warning: cell_class = "warned" icon = "⚠" else: cell_class = "failed" icon = "✗" if run_stats and run_stats.get("num_runs", 1) > 1: std_pct = run_stats.get("std_deviation", 0.0) * 100 runs = run_stats.get("num_runs", 1) html_parts.append( f'" ) else: html_parts.append(f'') else: html_parts.append('') # Best model best, _ = find_best_model(case_models) if best == "Tie": html_parts.append('') elif best and best != "N/A": html_parts.append(f'') else: html_parts.append('') html_parts.append("") html_parts.append("

Case	{self._escape_html(model)}	Best
{self._escape_html(case_name)}	{icon} ' f"{score:.0f}% ± {std_pct:.0f}% n={runs}	{icon} {score:.0f}%	-	🤝 Tie	🏆 {self._escape_html(best)}	-

") html_parts.append("

") # Detailed results if show_details: html_parts.append('

') html_parts.append("

Detailed Results

") for case_name, case_models in cases.items(): html_parts.append('

') html_parts.append(f"

{self._escape_html(case_name)}

") for model in model_order: if model not in case_models: continue case_result = case_models[model] evaluation = case_result["evaluation"] html_parts.append('

') html_parts.append( f"{self._escape_html(model)}: Score {evaluation.score * 100:.1f}%" ) run_id = self._make_safe_id(suite_name, case_name, model) html_parts.append( self._format_evaluation_details( evaluation, case_result.get("run_stats"), case_result.get("critic_stats"), run_id=run_id, ) ) html_parts.append("

") html_parts.append("

") # Footer html_parts.append("

") # container html_parts.append("") return "\n".join(html_parts) def _get_multi_model_styles(self) -> str: """Return additional CSS for multi-model views.""" return """ """ # ========================================================================= # COMPARATIVE EVALUATION FORMATTING # ========================================================================= def _format_comparative( self, results: list[list[dict[str, Any]]], show_details: bool = False, failed_only: bool = False, original_counts: tuple[int, int, int, int] | None = None, include_context: bool = False, ) -> str: """Format comparative evaluation results with tabbed track view.""" # Check if this is multi-model comparative - use case-first grouping if is_multi_model_comparative(results): return self._format_comparative_case_first( results, show_details, failed_only, original_counts, include_context ) return self._format_comparative_single_model( results, show_details, failed_only, original_counts, include_context ) def _format_comparative_single_model( self, results: list[list[dict[str, Any]]], show_details: bool = False, failed_only: bool = False, original_counts: tuple[int, int, int, int] | None = None, include_context: bool = False, ) -> str: """Format single-model comparative evaluation results.""" # Use comparative grouping ( comparative_groups, total_passed, total_failed, total_warned, total_cases, suite_track_order, ) = group_comparative_by_case(results) # Collect all unique tracks for header all_tracks: list[str] = [] for tracks in suite_track_order.values(): for t in tracks: if t not in all_tracks: all_tracks.append(t) # Calculate pass rate if total_cases > 0: if failed_only and original_counts and original_counts[0] > 0: pass_rate = (original_counts[1] / original_counts[0]) * 100 else: pass_rate = (total_passed / total_cases) * 100 else: pass_rate = 0 # Build HTML html_parts = [self._get_html_header()] # Title and timestamp html_parts.append('

') html_parts.append("

📊 Comparative Evaluation Results

") html_parts.append( f'

Generated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")}

' ) # Tracks list (only show if there are multiple tracks) if len(all_tracks) > 1: html_parts.append('

') html_parts.append("All Tracks:") for track in all_tracks: html_parts.append(f'{self._escape_html(track)}') html_parts.append("

") # Summary section html_parts.append('

') html_parts.append("

📊 Summary

") if failed_only and original_counts: orig_total, orig_passed, orig_failed, orig_warned = original_counts html_parts.append( f'' ) html_parts.append('

') html_parts.append( f'

Total{orig_total}

' ) html_parts.append( f'

Passed{orig_passed}

' ) if orig_warned > 0: html_parts.append( f'

Warnings{orig_warned}

' ) html_parts.append( f'

Failed{orig_failed}

' ) else: html_parts.append('

') html_parts.append( f'

Total{total_cases}

' ) html_parts.append( f'

Passed{total_passed}

' ) if total_warned > 0: html_parts.append( f'

Warnings{total_warned}

' ) if total_failed > 0: html_parts.append( f'

Failed{total_failed}

' ) html_parts.append("

") # stats-grid html_parts.append( f'

Pass Rate: {pass_rate:.1f}%

' ) html_parts.append("

") # summary-section # Results by model html_parts.append("

📋 Comparative Results by Model

") for model, suites in comparative_groups.items(): html_parts.append('

') html_parts.append(f"

🤖 {self._escape_html(model)}

") for suite_name, cases in suites.items(): # Get track order for this specific suite track_order = suite_track_order.get(suite_name, []) html_parts.append('

') # Only show COMPARATIVE badge if there are multiple tracks badge = ( 'COMPARATIVE' if len(track_order) > 1 else "" ) html_parts.append( f'

📁 {self._escape_html(suite_name)} {badge}

' ) # Show tracks for this suite (only if multiple) if len(track_order) > 1: html_parts.append('

') html_parts.append("Tracks:") for track in track_order: html_parts.append( f'{self._escape_html(track)}' ) html_parts.append("

") for case_name, case_data in cases.items(): # Context section (if include_context is True) if include_context: system_msg = case_data.get("system_message") addl_msgs = case_data.get("additional_messages") if system_msg or addl_msgs: html_parts.append('

') html_parts.append("

📋 Context

") if system_msg: html_parts.append( f'

' f"System Message: " f"{self._escape_html(system_msg)}" f"

" ) if addl_msgs: conversation_html = self._format_conversation(addl_msgs) html_parts.append( f'

' f"

💬 Conversation Context ({len(addl_msgs)} messages)

" f"{conversation_html}" f"

" ) html_parts.append("

") html_parts.append( self._format_comparative_case_html( case_name, case_data, track_order, show_details, suite_name ) ) html_parts.append("

") # suite-section html_parts.append("

") # model-section # JavaScript for tab switching html_parts.append(self._get_tab_script()) html_parts.append("

") # container html_parts.append("") return "\n".join(html_parts) def _format_comparative_case_first( self, results: list[list[dict[str, Any]]], show_details: bool = False, failed_only: bool = False, original_counts: tuple[int, int, int, int] | None = None, include_context: bool = False, ) -> str: """Format multi-model comparative evaluation grouped by case first.""" # Get case-first grouping ( case_groups, model_order, suite_track_order, total_passed, total_failed, total_warned, total_cases, ) = group_comparative_by_case_first(results) # Collect all unique tracks all_tracks: list[str] = [] for tracks in suite_track_order.values(): for t in tracks: if t not in all_tracks: all_tracks.append(t) # Calculate pass rate if total_cases > 0: if failed_only and original_counts and original_counts[0] > 0: pass_rate = (original_counts[1] / original_counts[0]) * 100 else: pass_rate = (total_passed / total_cases) * 100 else: pass_rate = 0 # Build HTML html_parts = [self._get_html_header()] html_parts.append(self._get_multi_model_styles()) html_parts.append('

') html_parts.append("

📊 Comparative Evaluation Results (Multi-Model)

") html_parts.append( f'

Generated: {datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")}

' ) # Models and tracks info html_parts.append('

') html_parts.append(f"

Models: {', '.join(model_order)}

") # Only show tracks list if there are multiple tracks if len(all_tracks) > 1: html_parts.append('

') html_parts.append("Tracks:") for track in all_tracks: html_parts.append(f'{self._escape_html(track)}') html_parts.append("

") html_parts.append("

") # Summary section html_parts.append('

') html_parts.append("

📊 Summary

") if failed_only and original_counts: orig_total, orig_passed, orig_failed, orig_warned = original_counts html_parts.append( f'' ) html_parts.append('

') html_parts.append( f'

Total{orig_total}

' ) html_parts.append( f'

Passed{orig_passed}

' ) if orig_warned > 0: html_parts.append( f'

Warnings{orig_warned}

' ) html_parts.append( f'

Failed{orig_failed}

' ) else: html_parts.append('

') html_parts.append( f'

Total{total_cases}

' ) html_parts.append( f'

Passed{total_passed}

' ) if total_warned > 0: html_parts.append( f'

Warnings{total_warned}

' ) if total_failed > 0: html_parts.append( f'

Failed{total_failed}

' ) html_parts.append("

") # stats-grid html_parts.append( f'

Pass Rate: {pass_rate:.1f}%

' ) html_parts.append("

") # summary-section # Results grouped by case html_parts.append("

📋 Results by Case

") for suite_name, cases in case_groups.items(): track_order = suite_track_order.get(suite_name, []) html_parts.append('

') # Only show COMPARATIVE badge if there are multiple tracks badge = ( 'COMPARATIVE' if len(track_order) > 1 else "" ) html_parts.append( f'

📁 {self._escape_html(suite_name)} {badge}

' ) # Show tracks for this suite (only if multiple) if len(track_order) > 1: html_parts.append('

') html_parts.append("Tracks:") for track in track_order: html_parts.append( f'{self._escape_html(track)}' ) html_parts.append("

") for case_name, model_data in cases.items(): # Case container html_parts.append('

') html_parts.append(f"

📋 Case: {self._escape_html(case_name)}

") # Get input and context from first model first_model_data = next(iter(model_data.values()), {}) case_input = first_model_data.get("input", "") if case_input: html_parts.append( f'

Input: {self._escape_html(case_input)}

' ) # Context section (if include_context is True) if include_context: system_msg = first_model_data.get("system_message") addl_msgs = first_model_data.get("additional_messages") if system_msg or addl_msgs: html_parts.append('

') html_parts.append("

📋 Context

") if system_msg: html_parts.append( f'

' f"System Message: " f"{self._escape_html(system_msg)}" f"

" ) if addl_msgs: conversation_html = self._format_conversation(addl_msgs) html_parts.append( f'

' f"

💬 Conversation Context ({len(addl_msgs)} messages)

" f"{conversation_html}" f"

" ) html_parts.append("

") # Show each model's results for this case for model in model_order: if model not in model_data: html_parts.append('

') html_parts.append( f'

🤖 {self._escape_html(model)}

' ) html_parts.append('

No data

') html_parts.append("

") continue model_case_data = model_data[model] html_parts.append('

') html_parts.append( f'

🤖 {self._escape_html(model)}

' ) # Show track comparison for this model html_parts.append( self._format_comparative_case_html( case_name, model_case_data, track_order, show_details, suite_name, model ) ) html_parts.append("

") # model-panel html_parts.append("

") # case-group html_parts.append("

") # suite-section # JavaScript for tab switching html_parts.append(self._get_tab_script()) html_parts.append("

") # container html_parts.append("") return "\n".join(html_parts) def _format_comparative_case_html( self, case_name: str, case_data: ComparativeCaseData, track_order: list[str], show_details: bool, suite_name: str = "", model_name: str = "", ) -> str: """Format a single comparative case as HTML with tabbed details.""" lines: list[str] = [] tracks = case_data.get("tracks", {}) # Compute differences from baseline differences = compute_track_differences(case_data, track_order) # Generate unique ID for this case's tabs - include suite name and model for uniqueness # Sanitize all parts for use in HTML attributes and CSS selectors case_id = self._make_safe_id(suite_name, case_name, model_name) lines.append('

') # Case header lines.append('

') lines.append(f"

{self._escape_html(case_name)}

") lines.append( f'

Input: ' f"{self._escape_html(case_data.get('input', 'N/A'))}

" ) lines.append("

") # Comparison summary table lines.append('') lines.append( "" ) lines.append("") for i, track_name in enumerate(track_order): is_baseline = i == 0 row_class = "baseline" if is_baseline else "" if track_name not in tracks: lines.append(f'') lines.append(f"") lines.append('') lines.append('') lines.append('') lines.append("") continue track_result = tracks[track_name] evaluation = track_result.get("evaluation") if not evaluation: lines.append(f'') lines.append(f"") lines.append('') lines.append('') lines.append('') lines.append("") continue # Status if evaluation.passed: status_class = "passed" status_text = "✅ PASSED" elif evaluation.warning: status_class = "warned" status_text = "⚠️ WARNED" else: status_class = "failed" status_text = "❌ FAILED" # Score score_pct = evaluation.score * 100 # Differences diff_fields = differences.get(track_name, []) if is_baseline: diff_html = '(baseline)' elif diff_fields: diff_html = " ".join( f'{self._escape_html(f)}' for f in diff_fields ) else: diff_html = '—' lines.append(f'') lines.append(f"") lines.append(f'') lines.append(f'') lines.append(f"") lines.append("") lines.append("

Track	Status	Score	Differences
`{self._escape_html(track_name)}`	⚠️ N/A	—	No data
`{self._escape_html(track_name)}`	⚠️ N/A	—	No evaluation
`{self._escape_html(track_name)}`	{status_text}	{score_pct:.1f}%	{diff_html}

") # Detailed results with tabs (if show_details) if show_details: # Find tracks with data for proper active tab handling tracks_with_data = [ (i, tn) for i, tn in enumerate(track_order) if tn in tracks and tracks[tn].get("evaluation") ] # Tab buttons - show all tracks, style N/A differently but keep clickable lines.append('

') first_with_data = tracks_with_data[0][0] if tracks_with_data else 0 for i, track_name in enumerate(track_order): has_data = track_name in tracks and tracks[track_name].get("evaluation") active = "active" if i == first_with_data else "" na_class = "" if has_data else "na-track" diff_class = "has-diff" if differences.get(track_name) else "" lines.append( f'" ) lines.append("

") # track-tabs # Tab panels container - include panels for ALL tracks lines.append('

') for i, track_name in enumerate(track_order): has_data = track_name in tracks and tracks[track_name].get("evaluation") active = "active" if i == first_with_data else "" lines.append( f'

' ) if not has_data: # Show informative N/A panel lines.append('

') lines.append('Viewing track:') lines.append( f'{self._escape_html(track_name)}' ) lines.append("

") lines.append('

') lines.append('

ℹ

') # noqa: RUF001 lines.append("

Track Not Configured

") lines.append( f"

The {self._escape_html(track_name)} track " f"was not configured for this test case.

" ) lines.append("

") lines.append( "This happens when a comparative case uses .for_track() " "to define expectations only for specific tracks. " "Tracks without expectations are skipped during evaluation." ) lines.append("

") lines.append('

') lines.append("To include this track:") lines.append("

case.for_track(")
                    lines.append(f'    "{self._escape_html(track_name)}",')
                    lines.append("    expected_tool_calls=[...],")
                    lines.append("    critics=[...]")
                    lines.append(")

") lines.append("

") # na-panel-content else: # Show normal evaluation panel track_result = tracks[track_name] evaluation = track_result.get("evaluation") lines.append('

') lines.append('Viewing track:') lines.append( f'{self._escape_html(track_name)}' ) lines.append("

") run_id = self._make_safe_id(suite_name, case_name, f"{track_name}") lines.append( self._format_evaluation_details( evaluation, track_result.get("run_stats"), track_result.get("critic_stats"), run_id=run_id, ) ) lines.append("

") # track-panel lines.append("

") # track-panels-container lines.append("

") # comparative-case return "\n".join(lines) def _get_tab_script(self) -> str: """Return JavaScript for tab switching.""" return """ """ def _get_html_header(self) -> str: """Return HTML header with embedded CSS for styling.""" return """ Evaluation Results """ class CaptureHtmlFormatter(CaptureFormatter): """HTML formatter for capture results.""" @property def file_extension(self) -> str: return "html" def format( self, captures: CaptureResults, include_context: bool = False, ) -> str: """Format capture results as HTML.""" # Check for multi-model captures if is_multi_model_capture(captures): return self._format_multi_model(captures, include_context) return self._format_single_model(captures, include_context) def _format_single_model( self, captures: CaptureResults, include_context: bool = False, ) -> str: """Format single-model capture results as HTML.""" total_cases = 0 total_calls = 0 # Build captures HTML captures_html = [] for capture in captures: cases_html = [] for case in capture.captured_cases: total_cases += 1 tool_calls_html = [] runs = getattr(case, "runs", None) if runs: for run_index, run in enumerate(runs, start=1): run_calls_html = [] for tc in run.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = ( f'

{self._escape_html(args_json)}

' ) run_calls_html.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}" f"

" ) if not run_calls_html: run_calls_html.append( '

No tool calls captured

' ) tool_calls_html.append( f'

' f'

Run {run_index}

' f'{"".join(run_calls_html)}' f"

" ) else: for tc in case.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = f'

{self._escape_html(args_json)}

' tool_calls_html.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}" f"

" ) if not tool_calls_html: tool_calls_html.append('

No tool calls captured

') context_html = "" if include_context: context_parts = [] if case.system_message: context_parts.append( f'

' f"System Message: " f"{self._escape_html(case.system_message)}" f"

" ) if case.additional_messages: conversation_html = self._format_conversation(case.additional_messages) context_parts.append( f'

' f"

💬 Conversation Context ({len(case.additional_messages)} messages)

" f"{conversation_html}" f"

" ) if context_parts: context_html = f'

{"".join(context_parts)}

' # track_name is set for comparative cases track_name = getattr(case, "track_name", None) track_html = "" if track_name: track_html = f'{self._escape_html(track_name)}' cases_html.append( f'

' f'

{self._escape_html(case.case_name)} {track_html}

' f'

' f"User: {self._escape_html(case.user_message)}" f"

" f"{context_html}" f'

Tool Calls

{"".join(tool_calls_html)}

' f"

" ) captures_html.append( f'

' f'

{self._escape_html(capture.suite_name)}

' f'

' f"Model: {self._escape_html(capture.model)}" f"Provider: {self._escape_html(capture.provider)}" f"

" f'

{"".join(cases_html)}

' f"

" ) return self._get_capture_html(captures_html, total_cases, total_calls) def _escape_html(self, text: str) -> str: """Escape HTML special characters.""" return ( text.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'") ) def _format_conversation(self, messages: list[dict]) -> str: """Format conversation messages as a rich HTML conversation view.""" html_parts = ['

'] for msg in messages: role = msg.get("role", "unknown") content = msg.get("content", "") tool_calls = msg.get("tool_calls", []) name = msg.get("name", "") # Role-specific styling role_class = f"msg-{role}" role_icon = { "user": "👤", "assistant": "🤖", "tool": "🔧", "system": "⚙️", }.get(role, "💬") role_label = role.capitalize() html_parts.append(f'

') html_parts.append( f'

' f'{role_icon}' f'{role_label}' ) # Show tool name for tool responses if role == "tool" and name: html_parts.append(f'({self._escape_html(name)})') html_parts.append("

") # Close msg-header # Message content if content: # For tool responses, try to format JSON nicely if role == "tool": try: parsed_content = json.loads(content) formatted_content = json.dumps(parsed_content, indent=2) html_parts.append( f'

{self._escape_html(formatted_content)}

' ) except (json.JSONDecodeError, TypeError): # Not valid JSON, show as regular content html_parts.append( f'

{self._escape_html(str(content))}

' ) else: html_parts.append( f'

{self._escape_html(str(content))}

' ) # Tool calls (for assistant messages) if tool_calls: html_parts.append('

') for tc in tool_calls: func = tc.get("function", {}) tc_name = func.get("name", "unknown") tc_args = func.get("arguments", "{}") # Parse and pretty-print arguments try: args_dict = json.loads(tc_args) if isinstance(tc_args, str) else tc_args args_formatted = json.dumps(args_dict, indent=2) except (json.JSONDecodeError, TypeError): args_formatted = str(tc_args) html_parts.append( f'

' f'📞 {self._escape_html(tc_name)}' f'

{self._escape_html(args_formatted)}

' f"

" ) html_parts.append("

") html_parts.append("

") # Close msg html_parts.append("

") # Close conversation return "\n".join(html_parts) def _format_multi_model( self, captures: CaptureResults, include_context: bool = False, ) -> str: """Format multi-model capture results with track tabs.""" from arcade_cli.formatters.base import group_captures_by_case_then_track grouped_data, model_order, track_order = group_captures_by_case_then_track(captures) html_parts: list[str] = [] # HTML head with track tab styles html_parts.append(""" Multi-Model Capture Results """) html_parts.append("

🔄 Multi-Model Capture Results

") html_parts.append( f'

Models: {", ".join(self._escape_html(m) for m in model_order)}

' ) total_cases = 0 total_calls = 0 case_idx = 0 for suite_name, cases in grouped_data.items(): html_parts.append('

') html_parts.append(f"

{self._escape_html(suite_name)}

") for case_name, case_data in cases.items(): total_cases += 1 case_idx += 1 case_id = f"case_{case_idx}" html_parts.append('

') user_msg = case_data.get("user_message", "") tracks_data = case_data.get("tracks", {}) html_parts.append('

') html_parts.append(f"

{self._escape_html(case_name)}

") if user_msg: html_parts.append( f"

User: {self._escape_html(user_msg)}

" ) html_parts.append("

") # Check if we have multiple tracks track_keys = list(tracks_data.keys()) has_multiple_tracks = len(track_keys) > 1 or ( len(track_keys) == 1 and track_keys[0] != "_default" ) if has_multiple_tracks: # Render track tabs html_parts.append('

') for i, track_key in enumerate(track_keys): active = "active" if i == 0 else "" display_name = track_key if track_key != "_default" else "Default" html_parts.append( f'" ) html_parts.append("

") # Render track panels html_parts.append('

') for i, track_key in enumerate(track_keys): active = "active" if i == 0 else "" track_data = tracks_data[track_key] html_parts.append( f'

' ) display_name = track_key if track_key != "_default" else "Default" html_parts.append( f'

🏷️ {self._escape_html(display_name)}

' ) # Render model panels within track models_dict = track_data.get("models", {}) for model in model_order: if model not in models_dict: html_parts.append('

') html_parts.append( f'

{self._escape_html(model)}

' ) html_parts.append('

No data

') html_parts.append("

") continue captured_case = models_dict[model] html_parts.append('

') html_parts.append( f'

{self._escape_html(model)}

' ) runs = getattr(captured_case, "runs", None) if runs: for run_index, run in enumerate(runs, start=1): html_parts.append( f'

' f"

Run {run_index}

" ) if run.tool_calls: for tc in run.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = f'

{self._escape_html(args_json)}

' html_parts.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}

" ) else: html_parts.append( '

No tool calls

' ) html_parts.append("

") elif captured_case.tool_calls: for tc in captured_case.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = f'

{self._escape_html(args_json)}

' html_parts.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}

" ) else: html_parts.append('

No tool calls

') html_parts.append("

") # model-panel html_parts.append("

") # track-panel html_parts.append("

") # track-panels else: # No tracks - render models directly track_key = track_keys[0] if track_keys else "_default" track_data = tracks_data.get(track_key, {}) models_dict = track_data.get("models", {}) for model in model_order: if model not in models_dict: html_parts.append('

') html_parts.append( f'

{self._escape_html(model)}

' ) html_parts.append('

No data

') html_parts.append("

") continue captured_case = models_dict[model] html_parts.append('

') html_parts.append( f'

{self._escape_html(model)}

' ) runs = getattr(captured_case, "runs", None) if runs: for run_index, run in enumerate(runs, start=1): html_parts.append( f'

' f"

Run {run_index}

" ) if run.tool_calls: for tc in run.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = f'

{self._escape_html(args_json)}

' html_parts.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}

" ) else: html_parts.append('

No tool calls

') html_parts.append("

") elif captured_case.tool_calls: for tc in captured_case.tool_calls: total_calls += 1 args_html = "" if tc.args: args_json = json.dumps(tc.args, indent=2) args_html = ( f'

{self._escape_html(args_json)}

' ) html_parts.append( f'

' f'{self._escape_html(tc.name)}' f"{args_html}

" ) else: html_parts.append('

No tool calls

') html_parts.append("

") # Context section system_msg = case_data.get("system_message") addl_msgs = case_data.get("additional_messages") if include_context and (system_msg or addl_msgs): html_parts.append('

') html_parts.append("

Context

") if system_msg: html_parts.append( f"

System: {self._escape_html(system_msg)}

" ) if addl_msgs: html_parts.append(self._format_conversation(addl_msgs)) html_parts.append("

") html_parts.append("

") # case-group html_parts.append("

") # suite-section # Summary total_suites = len(grouped_data) html_parts.append(f"""

Summary

Suites: {total_suites} | Cases: {total_cases} | Models: {len(model_order)} | Tool Calls: {total_calls}

""") return "\n".join(html_parts) def _get_capture_html( self, captures_html: list[str], total_cases: int, total_calls: int ) -> str: """Return complete HTML document for capture results.""" return f""" Capture Results

🎯 Capture Results

{"".join(captures_html)}

Summary

{total_cases}

Total Cases

{total_calls}

Tool Calls

"""