From db948125d5b44efbbf2b9b4cc3771cf9caaa157e Mon Sep 17 00:00:00 2001 From: Sam Partee Date: Thu, 19 Sep 2024 03:36:44 -0700 Subject: [PATCH] Tool Evaluation SDK (#35) 1. New Eval SDK (`arcade/sdk/eval/`): - Introduces `EvalSuite`, `EvalCase`, and `EvalRubric` classes for structured evaluation. - Implements various Critic classes (Binary, Numeric, Similarity) for flexible scoring. - Adds a `tool_eval` decorator for easy integration with existing tools. 2. CLI Integration (`arcade/cli/main.py` and `arcade/cli/utils.py`): - Adds an `evals` command to run evaluation suites from the CLI. - Implements result display functionality for evaluation outcomes. 3. Toolkit Updates: - Adds evaluation scripts for Gmail ([toolkits/gmail/evals/eval_gmail_tools.py](file:///Users/spartee/Dropbox/Arcade/platform/Team/arcade-ai/toolkits/gmail/evals/eval_gmail_tools.py#1%2C1-1%2C1)) and Slack ([toolkits/slack/evals/eval_slack_messaging.py](file:///Users/spartee/Dropbox/Arcade/platform/Team/arcade-ai/toolkits/slack/evals/eval_slack_messaging.py#1%2C1-1%2C1)) toolkits. - Demonstrates practical usage of the Eval SDK with real-world scenarios. 4. Miscellaneous: - Updates `arcade/cli/new.py` to optionally generate an `evals` directory for new toolkits. 
--------- Co-authored-by: Nate Barbettini --- Makefile | 32 +- arcade/arcade/cli/main.py | 58 +- arcade/arcade/cli/new.py | 5 + arcade/arcade/cli/utils.py | 93 +++ arcade/arcade/core/catalog.py | 14 +- arcade/arcade/sdk/__init__.py | 16 + arcade/arcade/sdk/error.py | 6 + arcade/arcade/sdk/eval/__init__.py | 12 + arcade/arcade/sdk/eval/critic.py | 154 +++++ arcade/arcade/sdk/eval/eval.py | 632 ++++++++++++++++++ arcade/pyproject.toml | 17 +- arcade/tests/client/test_client.py | 4 +- arcade/tests/sdk/test_eval.py | 342 ++++++++++ .../tests/tool/test_create_tool_definition.py | 27 + examples/modal-deploy.py | 44 ++ toolkits/gmail/evals/eval_gmail_tools.py | 133 ++++ toolkits/slack/evals/eval_slack_messaging.py | 191 ++++++ 17 files changed, 1768 insertions(+), 12 deletions(-) create mode 100644 arcade/arcade/sdk/error.py create mode 100644 arcade/arcade/sdk/eval/__init__.py create mode 100644 arcade/arcade/sdk/eval/critic.py create mode 100644 arcade/arcade/sdk/eval/eval.py create mode 100644 arcade/tests/sdk/test_eval.py create mode 100644 examples/modal-deploy.py create mode 100644 toolkits/gmail/evals/eval_gmail_tools.py create mode 100644 toolkits/slack/evals/eval_slack_messaging.py diff --git a/Makefile b/Makefile index cfc856d4..0e90047d 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,8 @@ .PHONY: install install: ## Install the poetry environment and install the pre-commit hooks @echo "🚀 Creating virtual environment using pyenv and poetry" - @cd arcade && poetry install + @cd arcade && poetry install --all-extras @cd arcade && poetry run pre-commit install - @cd arcade && poetry shell .PHONY: check check: ## Run code quality tools. 
@@ -54,6 +53,35 @@ docker: ## Build and run the Docker container @cd docker && make docker-build @cd docker && make docker-run +.PHONY: full-dist +full-dist: clean-dist ## Build all projects and copy wheels to arcade/dist + @echo "🚀 Building all projects and copying wheels to arcade/dist" + + # Build the main arcade project + @echo "Building arcade project..." + @cd arcade && poetry build + + # Create the arcade/dist directory if it doesn't exist + @mkdir -p arcade/dist + + # Build and copy wheels for each toolkit + @for toolkit_dir in toolkits/*; do \ + if [ -d "$$toolkit_dir" ]; then \ + toolkit_name=$$(basename "$$toolkit_dir"); \ + echo "Building $$toolkit_name project..."; \ + cd "$$toolkit_dir" && poetry build; \ + cp dist/*.whl ../../arcade/dist; \ + cd -; \ + fi; \ + done + + @echo "✅ All projects built and wheels copied to arcade/dist" + +.PHONY: clean-dist +clean-dist: ## Clean the arcade/dist directory + @echo "🗑️ Cleaning arcade/dist directory" + @rm -rf arcade/dist + .PHONY: help help: @echo "🛠️ Arcade AI Dev Commands:\n" diff --git a/arcade/arcade/cli/main.py b/arcade/arcade/cli/main.py index e484b888..317d0177 100644 --- a/arcade/arcade/cli/main.py +++ b/arcade/arcade/cli/main.py @@ -1,3 +1,4 @@ +import importlib.util import os import readline import threading @@ -18,6 +19,7 @@ from arcade.cli.utils import ( OrderCommands, apply_config_overrides, create_cli_catalog, + display_eval_results, display_streamed_markdown, markdownify_urls, validate_and_get_config, @@ -107,6 +109,7 @@ def show( None, "-t", "--toolkit", help="The toolkit to show the tools of" ), actor: Optional[str] = typer.Option(None, help="A running actor address to list tools from"), + debug: bool = typer.Option(False, "--debug", "-d", help="Show debug information"), ) -> None: """ Show the available tools in an actor or toolkit @@ -128,7 +131,8 @@ def show( console.print(table) except Exception as e: - # better error message here + if debug: + raise error_message = f"❌ Failed to List 
tools: {escape(str(e))}" console.print(error_message, style="bold red") @@ -380,3 +384,55 @@ def display_config_as_table(config) -> None: # type: ignore[no-untyped-def] table.add_row("", "", "") console.print(table) + + +@cli.command(help="Run evaluation suites in a directory") +def evals( + directory: str = typer.Argument(".", help="Directory containing evaluation files"), + show_details: bool = typer.Option(False, "--details", "-d", help="Show detailed results"), + max_concurrent: int = typer.Option( + 1, + "--max-concurrent", + "-c", + help="Maximum number of concurrent evaluations (default: 1)", + ), + models: str = typer.Option( + "gpt-4o", "--models", "-m", help="The models to use for evaluation (default: gpt-4o)" + ), +) -> None: + """ + Find all files starting with 'eval_' in the given directory, + execute any functions decorated with @tool_eval, and display the results. + """ + models = models.split(",") # type: ignore[assignment] + eval_files = [f for f in os.listdir(directory) if f.startswith("eval_") and f.endswith(".py")] + + if not eval_files: + console.print("No evaluation files found.", style="bold yellow") + return + + for file in eval_files: + file_path = os.path.join(directory, file) + module_name = file[:-3] # Remove .py extension + + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + console.print(f"Failed to load {file}", style="bold red") + continue + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) # type: ignore[union-attr] + + eval_functions = [ + obj + for name, obj in module.__dict__.items() + if callable(obj) and hasattr(obj, "__tool_eval__") + ] + + if not eval_functions: + console.print(f"No @tool_eval functions found in {file}", style="bold yellow") + continue + + for func in eval_functions: + console.print(f"\nRunning evaluation from {file}: {func.__name__}", style="bold blue") + results = func(models=models, max_concurrency=max_concurrent) + 
display_eval_results(results, show_details=show_details) diff --git a/arcade/arcade/cli/new.py b/arcade/arcade/cli/new.py index 885e9ecd..bf566874 100644 --- a/arcade/arcade/cli/new.py +++ b/arcade/arcade/cli/new.py @@ -97,6 +97,7 @@ def create_new_toolkit(directory: str) -> None: author = f"{author_name} <{author_email}>" generate_test_dir = ask_question("Generate test directory? (yes/no)", "yes") == "yes" + generate_eval_dir = ask_question("Generate eval directory? (yes/no)", "yes") == "yes" top_level_dir = os.path.join(directory, name) toolkit_dir = os.path.join(directory, name, toolkit_name) @@ -140,4 +141,8 @@ def create_new_toolkit(directory: str) -> None: if generate_test_dir: create_directory(os.path.join(top_level_dir, "tests")) + # If the user wants to generate an eval directory + if generate_eval_dir: + create_directory(os.path.join(top_level_dir, "evals")) + console.print(f"[green]Toolkit {toolkit_name} has been created.[/green]") diff --git a/arcade/arcade/cli/utils.py b/arcade/arcade/cli/utils.py index cc8b325e..67dbc2d0 100644 --- a/arcade/arcade/cli/utils.py +++ b/arcade/arcade/cli/utils.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Any + import typer from openai.resources.chat.completions import ChatCompletionChunk, Stream from rich.console import Console @@ -9,6 +11,9 @@ from arcade.core.catalog import ToolCatalog from arcade.core.config_model import Config from arcade.core.toolkit import Toolkit +if TYPE_CHECKING: + from arcade.sdk.eval.eval import EvaluationResult + console = Console() @@ -150,3 +155,91 @@ def apply_config_overrides( if tls_input is not None: config.engine.tls = tls_input + + +def display_eval_results(results: list[dict[str, Any]], show_details: bool = False) -> None: + """ + Display evaluation results in a format inspired by pytest's output. + + Args: + results: List of dictionaries containing evaluation results for each model. + show_details: Whether to show detailed results for each case. 
+ """ + total_passed = 0 + total_failed = 0 + total_warned = 0 + total_cases = 0 + + for model_results in results: + model = model_results.get("model", "Unknown Model") + rubric = model_results.get("rubric", "Unknown Rubric") + cases = model_results.get("cases", []) + total_cases += len(cases) + + console.print(f"\n[bold magenta]Model: {model}[/bold magenta]\n") + console.print(f"[bold magenta]{rubric}[/bold magenta]\n") + + for case in cases: + evaluation = case["evaluation"] + status = ( + "[green]PASSED[/green]" + if evaluation.passed + else "[yellow]WARNED[/yellow]" + if evaluation.warning + else "[red]FAILED[/red]" + ) + if evaluation.passed: + total_passed += 1 + elif evaluation.warning: + total_warned += 1 + else: + total_failed += 1 + + # Display one-line summary for each case + console.print(f"{status} {case['name']} -- Score: {evaluation.score:.2f}") + + if show_details: + # Show detailed information for each case + console.print(f"[bold]User Input:[/bold] {case['input']}\n") + console.print("[bold]Details:[/bold]") + console.print(_format_evaluation(evaluation)) + console.print("-" * 80) + + # Summary + console.print("\n[bold]Summary:[/bold]") + console.print(f"Total Cases: {total_cases}") + console.print(f"[green]Passed: {total_passed}[/green]") + console.print(f"[yellow]Warnings: {total_warned}[/yellow]") + console.print(f"[red]Failed: {total_failed}[/red]\n") + + +def _format_evaluation(evaluation: "EvaluationResult") -> str: + """ + Format evaluation results with color-coded matches and scores. + + Args: + evaluation: An EvaluationResult object containing the evaluation results. + + Returns: + A formatted string representation of the evaluation details. 
+ """ + result_lines = [] + + # Include overall final score + result_lines.append(f"[bold]Final Score:[/bold] {evaluation.score:.2f}\n") + + for critic_result in evaluation.results: + match_color = "green" if critic_result["match"] else "red" + field = critic_result["field"] + score = critic_result["score"] + weight = critic_result["weight"] + expected = critic_result["expected"] + actual = critic_result["actual"] + result_lines.append( + f"[bold]{field}:[/bold] " + f"[{match_color}]Match: {critic_result['match']}, " + f"Score: {score:.2f}/{weight:.2f}[/{match_color}]" + f"\n Expected: {expected}" + f"\n Actual: {actual}" + ) + return "\n".join(result_lines) diff --git a/arcade/arcade/core/catalog.py b/arcade/arcade/core/catalog.py index 7bc2e271..d47cc4d5 100644 --- a/arcade/arcade/core/catalog.py +++ b/arcade/arcade/core/catalog.py @@ -1,5 +1,6 @@ import asyncio import inspect +import typing from collections.abc import Iterator from dataclasses import dataclass from datetime import datetime @@ -501,6 +502,7 @@ def get_wire_type( """ Mapping between Python types and HTTP/JSON types """ + # TODO ensure Any is not allowed type_mapping: dict[type, WireType] = { str: "string", bool: "boolean", @@ -513,7 +515,6 @@ def get_wire_type( list: "array", dict: "json", } - wire_type = type_mapping.get(_type) if wire_type: return wire_type @@ -580,6 +581,17 @@ def determine_output_model(func: Callable) -> type[BaseModel]: output_model_name, result=(field_type, Field(description=str(description))), ) + # Handle Union types + origin = return_annotation.__origin__ + if origin is typing.Union: + # For union types, create a model with the first non-None argument + # TODO handle multiple non-None arguments. Raise error? 
+ for arg in get_args(return_annotation): + if arg is not type(None): + return create_model( + output_model_name, + result=(arg, Field(description="No description provided.")), + ) # when the return_annotation has an __origin__ attribute # and does not have a __metadata__ attribute. return create_model( diff --git a/arcade/arcade/sdk/__init__.py b/arcade/arcade/sdk/__init__.py index 26c0fc27..f9ddc08d 100644 --- a/arcade/arcade/sdk/__init__.py +++ b/arcade/arcade/sdk/__init__.py @@ -1,5 +1,21 @@ +from .eval import ( + BinaryCritic, + EvalRubric, + EvalSuite, + ExpectedToolCall, + NumericCritic, + SimilarityCritic, + tool_eval, +) from .tool import tool __all__ = [ "tool", + "EvalRubric", + "EvalSuite", + "ExpectedToolCall", + "tool_eval", + "BinaryCritic", + "SimilarityCritic", + "NumericCritic", ] diff --git a/arcade/arcade/sdk/error.py b/arcade/arcade/sdk/error.py new file mode 100644 index 00000000..977075d6 --- /dev/null +++ b/arcade/arcade/sdk/error.py @@ -0,0 +1,6 @@ +class SDKError(Exception): + """Base class for all SDK errors.""" + + +class WeightError(SDKError): + """Raised when the critic weights do not abide by SDK weight constraints.""" diff --git a/arcade/arcade/sdk/eval/__init__.py b/arcade/arcade/sdk/eval/__init__.py new file mode 100644 index 00000000..b5a686af --- /dev/null +++ b/arcade/arcade/sdk/eval/__init__.py @@ -0,0 +1,12 @@ +from .critic import BinaryCritic, NumericCritic, SimilarityCritic +from .eval import EvalRubric, EvalSuite, ExpectedToolCall, tool_eval + +__all__ = [ + "BinaryCritic", + "SimilarityCritic", + "NumericCritic", + "EvalRubric", + "EvalSuite", + "ExpectedToolCall", + "tool_eval", +] diff --git a/arcade/arcade/sdk/eval/critic.py b/arcade/arcade/sdk/eval/critic.py new file mode 100644 index 00000000..0ed3f847 --- /dev/null +++ b/arcade/arcade/sdk/eval/critic.py @@ -0,0 +1,154 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, ClassVar + +from arcade.sdk.error import 
WeightError + + +@dataclass +class Critic(ABC): + critic_field: str + weight: float + + def __post_init__(self) -> None: + if self.weight < 0 or self.weight > 1: + raise WeightError(f"Critic weight must be between 0 and 1, got {self.weight}") + + @abstractmethod + def evaluate(self, expected: Any, actual: Any) -> dict[str, Any]: + pass + + +@dataclass +class BinaryCritic(Critic): + """ + A critic for performing exact equality comparisons between expected and actual values. + + This critic evaluates whether the expected and actual values are exactly equal. + It's useful for scenarios where only an exact match is acceptable. + + Returns: + A dict with: + - "match": True if expected == actual, otherwise False. + - "score": The full weight if there's a match, otherwise 0.0. + """ + + def evaluate(self, expected: Any, actual: Any) -> dict[str, float | bool]: + match = expected == actual + return {"match": match, "score": self.weight if match else 0.0} + + +@dataclass +class NumericCritic(Critic): + """ + A critic for evaluating numeric values within a specified range. + + This critic performs a "fuzzy" comparison of numeric values, where values closer + to each other (relative to the specified range) result in higher scores. It's + useful for scenarios where exact matches aren't necessary, but closeness within + a certain tolerance is rewarded. + + Attributes: + value_range: The min and max values of the expected range. + match_threshold: The threshold for considering a match (default 0.8). + + The evaluation process: + 1. Normalizes both expected and actual values to a 0-1 scale based on value_range. + 2. Calculates the absolute difference between these normalized values. + 3. Subtracts this difference from 1 to get a similarity score (closer to 1 is more similar). + 4. Multiplies the similarity by the critic's weight for the final score. + + Returns: + A dict with: + - "match": True if the score >= match_threshold, otherwise False. 
+ - "score": The calculated score (similarity * weight). + """ + + value_range: tuple[float, float] + match_threshold: float = 0.8 + + def __init__( + self, + critic_field: str, + weight: float, + value_range: tuple[float, float], + match_threshold: float = 0.8, + ): + super().__init__(critic_field, weight) + if value_range[0] >= value_range[1]: + raise ValueError("Invalid value_range: minimum must be less than maximum.") + self.value_range = value_range + self.match_threshold = match_threshold + + def evaluate(self, expected: Any, actual: Any) -> dict[str, Any]: + min_val, max_val = self.value_range + normalized_expected = float((float(expected) - min_val) / (max_val - min_val)) + normalized_actual = float((float(actual) - min_val) / (max_val - min_val)) + score = float(1 - abs(normalized_expected - normalized_actual)) + return {"match": bool(score >= self.match_threshold), "score": float(score * self.weight)} + + +@dataclass +class SimilarityCritic(Critic): + """ + A critic for evaluating the similarity between two strings. + + This critic uses a specified similarity metric to compare the expected and actual + string values. Currently, it supports cosine similarity using TF-IDF vectorization. + + Args: + metric: The similarity metric to use (default is "cosine"). + similarity_threshold: The threshold for considering a match (default 0.8). + + The evaluation process: + 1. Converts both expected and actual values to strings. + 2. Calculates the similarity score using the specified metric. + 3. Determines a match based on the similarity_threshold. + 4. Calculates the final score by multiplying the similarity by the critic's weight. + + Returns: + A dict with: + - "match": True if similarity >= similarity_threshold, otherwise False. + - "score": The calculated score (similarity * weight). + + Raises: + ImportError: If scikit-learn is not installed (required for cosine similarity). + ValueError: If an unsupported similarity metric is specified. 
+ """ + + metric: str = "cosine" + similarity_threshold: float = 0.8 + + SUPPORTED_METRICS: ClassVar[list[str]] = ["cosine"] + + def __init__( + self, + critic_field: str, + weight: float, + similarity_threshold: float = 0.8, + metric: str = "cosine", + ): + super().__init__(critic_field, weight) + if metric not in self.SUPPORTED_METRICS: + raise ValueError(f"Unsupported similarity metric: {metric}") + self.similarity_threshold = similarity_threshold + self.metric = metric + + def evaluate(self, expected: str, actual: str) -> dict[str, float | bool]: + if self.metric == "cosine": + try: + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import cosine_similarity + except ImportError: + raise ImportError( + "Use `pip install arcade[evals]` to install the required dependencies for similarity metrics." + ) + vectorizer = TfidfVectorizer() + tfidf_matrix = vectorizer.fit_transform([expected, actual]) + similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0] + else: + raise ValueError(f"Unsupported similarity metric: {self.metric}") + return { + "match": similarity >= self.similarity_threshold, + "score": min(similarity * self.weight, self.weight), + } diff --git a/arcade/arcade/sdk/eval/eval.py b/arcade/arcade/sdk/eval/eval.py new file mode 100644 index 00000000..54fe9eee --- /dev/null +++ b/arcade/arcade/sdk/eval/eval.py @@ -0,0 +1,632 @@ +import asyncio +import functools +import json +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Callable + +try: + import numpy as np + from scipy.optimize import linear_sum_assignment +except ImportError: + raise ImportError( + "Use `pip install arcade[evals]` to install the required dependencies for evaluation." 
+ ) + +from arcade.client.client import Arcade, AsyncArcade +from arcade.core.config import config +from arcade.sdk.error import WeightError + +if TYPE_CHECKING: + from arcade.core.catalog import ToolCatalog + from arcade.sdk.eval.critic import Critic + + +@dataclass +class ExpectedToolCall: + """ + Represents an expected tool call with its name and arguments. + + Attributes: + name: The name of the tool. + args: A dictionary containing the expected arguments for the tool. + """ + + name: str + args: dict[str, Any] + + +@dataclass +class EvalRubric: + """ + Defines the rubric for evaluating an AI model's performance on a task. + + Attributes: + fail_threshold: The minimum score required to pass the evaluation (between 0.0 and 1.0). + warn_threshold: The score threshold for issuing a warning (between 0.0 and 1.0). + fail_on_tool_selection: Whether to fail the evaluation if the tool selection is incorrect. + fail_on_tool_call_quantity: Whether to fail the evaluation if the number of tool calls is incorrect. + tool_selection_weight: The weight assigned to the tool selection score (between 0.0 and 1.0). + """ + + fail_threshold: float = 0.8 + warn_threshold: float = 0.9 + fail_on_tool_selection: bool = True + fail_on_tool_call_quantity: bool = True + tool_selection_weight: float = 1.0 + + def __str__(self) -> str: + return f"Fail threshold: {self.fail_threshold}\nWarn threshold: {self.warn_threshold}\n" + + +@dataclass +class EvaluationResult: + """ + Represents the result of an evaluation case. + + Attributes: + score: The normalized evaluation score (0.0-1.0). + passed: Whether the evaluation passed based on the fail_threshold. + warning: Whether the evaluation issued a warning based on the warn_threshold. + results: A list of dictionaries containing the results for each critic. 
+ """ + + score: float = 0.0 + passed: bool = False + warning: bool = False + results: list[dict[str, Any]] = field(default_factory=list) + + @property + def fail(self) -> bool: + return not self.passed and not self.warning + + def add( + self, + field: str, + result: dict[str, Any], + weight: float, + expected: Any, + actual: Any, + ) -> None: + """ + Add a critic result to the list of critic results. + + Args: + field: The field name for the critic result. + result: A dictionary containing the critic result. + weight: The weight of the critic. + expected: The expected value for the critic. + actual: The actual value for the critic. + """ + self.results.append( + { + "field": field, + **result, + "weight": weight, + "expected": expected, + "actual": actual, + } + ) + + def score_tool_selection(self, expected: str, actual: str, weight: float) -> float: + """ + Score and record tool selection in results. + + Args: + expected: The expected tool name. + actual: The actual tool name. + weight: The weight for tool selection. + + Returns: + The score for the tool selection. + """ + score = weight if expected == actual else 0.0 + self.add( + "tool_selection", + {"match": expected == actual, "score": score}, + weight, + expected, + actual, + ) + return score + + def compute_final_score(self, total_weight: float) -> None: + """ + Compute the final score by normalizing the total score with the total weight. + """ + total_score = sum(result["score"] for result in self.results) + self.score = total_score / total_weight if total_weight > 0 else 0.0 + + +@dataclass +class EvalCase: + """ + Represents a single evaluation case within an EvalSuite. + + Attributes: + name: A descriptive name for this evaluation case. + system_message: The system message to be sent to the AI model. + user_message: The user input to be sent to the AI model. + expected_tool_calls: A list of ExpectedToolCall objects representing the expected tool calls. 
+ critics: A list of Critic objects used to evaluate tool arguments. + additional_messages: Optional list of additional context messages. + rubric: An EvalRubric object defining pass/fail criteria and tool selection behavior. + """ + + name: str + system_message: str + user_message: str + expected_tool_calls: list[ExpectedToolCall] + critics: list["Critic"] + additional_messages: list[dict[str, str]] = field(default_factory=list) + rubric: EvalRubric = field(default_factory=EvalRubric) + + def __post_init__(self) -> None: + self._validate_critics() + + def _validate_critics(self) -> None: + """ + Validate the sum of critic weights. + + Raises: + WeightError: If the sum of critic weights exceeds 1.0. + """ + total_weight = sum(critic.weight for critic in self.critics) + if total_weight > 1.0: + raise WeightError(f"Sum of critic weights must not exceed 1.0, got {total_weight}") + + for critic in self.critics: + if critic.weight < 0.1: + raise WeightError(f"Critic weights should be at least 0.1, got {critic.weight}") + + def check_tool_selection_failure(self, actual_tools: list[str]) -> bool: + """ + Check if tool selection failure should occur. + + Args: + actual_tools: The list of actual tool names used. + + Returns: + True if tool selection failure should occur, False otherwise. + """ + expected_tools = [tc.name for tc in self.expected_tool_calls] + return self.rubric.fail_on_tool_selection and set(expected_tools) != set(actual_tools) + + def check_tool_call_quantity_failure(self, actual_count: int) -> bool: + """ + Check if tool call quantity failure should occur. + + Args: + actual_count: The number of actual tool calls made. + + Returns: + True if tool call quantity failure should occur, False otherwise. 
+ """ + expected_count = len(self.expected_tool_calls) + return self.rubric.fail_on_tool_call_quantity and expected_count != actual_count + + def evaluate(self, actual_tool_calls: list[tuple[str, dict[str, Any]]]) -> EvaluationResult: + """ + Evaluate the actual tool calls against the expected tool calls and critics. + + Args: + actual_tool_calls: A list of tuples containing the actual tool name and arguments. + + Returns: + An EvaluationResult object containing the evaluation results. + """ + evaluation_result = EvaluationResult() + actual_tools = [tool for tool, _ in actual_tool_calls] + + if self.check_tool_selection_failure(actual_tools): + evaluation_result.score = 0.0 + evaluation_result.passed = False + evaluation_result.warning = False + return evaluation_result + + actual_count = len(actual_tool_calls) + if self.check_tool_call_quantity_failure(actual_count): + evaluation_result.score = 0.0 + evaluation_result.passed = False + evaluation_result.warning = False + return evaluation_result + + # Create a cost matrix for the assignment problem + cost_matrix = self._create_cost_matrix(actual_tool_calls) + + # Use the Linear Sum Assignment (LSA) algorithm to find the optimal assignment + # The algorithm minimizes the cost of assigning each expected tool call to an actual tool call + row_ind, col_ind = linear_sum_assignment(cost_matrix, maximize=True) + + total_score = 0.0 + total_weight = 0.0 + + for i, j in zip(row_ind, col_ind): + if i < len(self.expected_tool_calls) and j < len(actual_tool_calls): + expected = self.expected_tool_calls[i] + actual_tool, actual_args = actual_tool_calls[j] + + tool_selection_score = evaluation_result.score_tool_selection( + expected.name, actual_tool, self.rubric.tool_selection_weight + ) + total_score += tool_selection_score + total_weight += self.rubric.tool_selection_weight + + # Evaluate arguments using critics + for critic in self.critics: + expected_value = expected.args.get(critic.critic_field) + actual_value = 
actual_args.get(critic.critic_field) + if expected_value is not None and actual_value is not None: + result = critic.evaluate(expected_value, actual_value) + total_score += result["score"] + total_weight += critic.weight + evaluation_result.add( + critic.critic_field, result, critic.weight, expected_value, actual_value + ) + + # Compute the final score using the method from EvaluationResult + evaluation_result.compute_final_score(total_weight) + + # Set the pass/fail status based on the fail_threshold + evaluation_result.passed = evaluation_result.score >= self.rubric.fail_threshold + + # Set the warning status based on the warn_threshold + evaluation_result.warning = ( + not evaluation_result.passed and evaluation_result.score >= self.rubric.warn_threshold + ) + + return evaluation_result + + def _create_cost_matrix( + self, actual_tool_calls: list[tuple[str, dict[str, Any]]] + ) -> np.ndarray: + """ + Create a cost matrix for the Hungarian algorithm. + + This method computes the score for each possible pairing of expected and actual tool calls. + The resulting matrix is used by the Hungarian algorithm to find the optimal assignment. + + Args: + actual_tool_calls: A list of tuples containing the actual tool calls and their arguments. + + Returns: + A numpy array representing the cost matrix. 
+ """ + n = max(len(self.expected_tool_calls), len(actual_tool_calls)) + cost_matrix = np.zeros((n, n)) + + for i, expected in enumerate(self.expected_tool_calls): + for j, (actual_tool, actual_args) in enumerate(actual_tool_calls): + score = 0.0 + if expected.name == actual_tool: + score += self.rubric.tool_selection_weight + + for critic in self.critics: + expected_value = expected.args.get(critic.critic_field) + actual_value = actual_args.get(critic.critic_field) + if expected_value is not None and actual_value is not None: + result = critic.evaluate(expected_value, actual_value) + score += result["score"] + cost_matrix[i, j] = score + + return cost_matrix + + async def run_async( + self, client: AsyncArcade, model: str, tool_names: list[str] + ) -> dict[str, Any]: + """ + Run the evaluation case asynchronously. + + Args: + client: The AsyncArcade client instance. + model: The model to evaluate. + tool_names: The list of tool names to use for the evaluation. + Returns: + A dictionary containing the evaluation result for the case. + """ + messages = [{"role": "system", "content": self.system_message}] + messages.extend(list(self.additional_messages)) + messages.append({"role": "user", "content": self.user_message}) + + response = await client.chat.completions.create( # type: ignore[call-overload] + model=model, + messages=messages, + tool_choice="auto", + tools=tool_names, + user="eval_user", + stream=False, + ) + + predicted_args = get_tool_args(response) + + evaluation = self.evaluate(predicted_args) + + result = { + "name": self.name, + "input": self.user_message, + "expected_tool_calls": [ + {"name": tc.name, "args": tc.args} for tc in self.expected_tool_calls + ], + "predicted_tool_calls": [{"name": tool, "args": args} for tool, args in predicted_args], + "evaluation": evaluation, + } + + return result + + def run_sync(self, client: Arcade, model: str, tool_names: list[str]) -> dict[str, Any]: + """ + Run the evaluation case synchronously. 
@dataclass
class EvalSuite:
    """
    A suite for evaluating AI model performance on specific tasks or scenarios.

    EvalSuite manages a collection of EvalCases, each representing a specific test scenario.
    It provides methods to add cases and to run them against a specified model, either
    one after another or concurrently.

    Attributes:
        name: The name of the evaluation suite.
        system_message: The default system message for cases that do not supply their own.
        catalog: A ToolCatalog object containing registered tools.
        cases: A list of EvalCase objects representing individual test scenarios.
        rubric: The default evaluation rubric for cases that do not supply their own.
        max_concurrent: Maximum number of concurrent evaluations. Values above 1 select
            the async client and concurrent execution; otherwise cases run sequentially.
    """

    name: str
    system_message: str
    catalog: "ToolCatalog"
    cases: list[EvalCase] = field(default_factory=list)
    rubric: EvalRubric = field(default_factory=EvalRubric)
    max_concurrent: int = 1  # Default to sequential execution
    # Created by initialize_client(); async or sync depending on max_concurrent.
    _client: AsyncArcade | Arcade | None = None

    def initialize_client(self) -> None:
        """
        Initialize the client instance for the EvalSuite.

        An AsyncArcade client is created when ``max_concurrent`` is greater than 1,
        otherwise a synchronous Arcade client. Both read the API key and engine URL
        from ``config``.
        """
        if self.max_concurrent > 1:
            self._client = AsyncArcade(
                api_key=config.api.key,
                base_url=config.engine_url,
            )
        else:
            self._client = Arcade(
                api_key=config.api.key,
                base_url=config.engine_url,
            )

    def add_case(
        self,
        name: str,
        user_message: str,
        expected_tool_calls: list[ExpectedToolCall],
        critics: list["Critic"],
        system_message: str | None = None,
        rubric: EvalRubric | None = None,
        additional_messages: list[dict[str, str]] | None = None,
    ) -> None:
        """
        Add a new evaluation case to the suite.

        Args:
            name: The name of the evaluation case.
            user_message: The user's input message.
            expected_tool_calls: A list of expected tool calls.
            critics: List of critics to evaluate the tool arguments.
            system_message: The system message to be sent to the AI model.
                Defaults to the suite's system message.
            rubric: The evaluation rubric for this case. Defaults to the suite's rubric.
            additional_messages: Optional list of additional messages for context.
        """
        case = EvalCase(
            name=name,
            system_message=system_message or self.system_message,
            user_message=user_message,
            expected_tool_calls=expected_tool_calls,
            rubric=rubric or self.rubric,
            critics=critics,
            additional_messages=additional_messages or [],
        )
        self.cases.append(case)

    def extend_case(
        self,
        name: str,
        user_message: str,
        system_message: str | None = None,
        expected_tool_calls: list[ExpectedToolCall] | None = None,
        rubric: EvalRubric | None = None,
        critics: list["Critic"] | None = None,
        additional_messages: list[dict[str, str]] | None = None,
    ) -> None:
        """
        Extend the last added case with new information.

        The new case continues the conversation of the last case: its context is the
        last case's additional messages, then the last case's user message, then any
        ``additional_messages`` given here, followed by the new ``user_message``.

        Args:
            name: The name of the extended case.
            user_message: The new user message for this extended case.
            system_message: The new system message for this extended case.
                Defaults to the last case's system message.
            expected_tool_calls: New or updated expected tool calls.
                Defaults to the last case's expected tool calls.
            rubric: A new rubric. Defaults to the suite's rubric.
            critics: New critics (if different from the last case).
            additional_messages: Extra messages to be added after the previous
                conversation and before the new user message.

        Raises:
            ValueError: If the suite has no case to extend.
        """
        if not self.cases:
            raise ValueError("No cases to extend. Add a case first.")

        last_case = self.cases[-1]

        # Carry the previous conversation forward, including the previous user turn,
        # so a follow-up such as "show me more of those" has something to refer to.
        new_additional_messages = [
            *last_case.additional_messages,
            {"role": "user", "content": last_case.user_message},
        ]
        if additional_messages:
            new_additional_messages.extend(additional_messages)

        # Create a new case, copying from the last one and updating fields.
        # Inherited lists are copied so the two cases never share mutable state.
        new_case = EvalCase(
            name=name,
            system_message=system_message or last_case.system_message,
            user_message=user_message,
            expected_tool_calls=expected_tool_calls or list(last_case.expected_tool_calls),
            rubric=rubric or self.rubric,
            critics=critics or last_case.critics.copy(),
            additional_messages=new_additional_messages,
        )

        self.cases.append(new_case)

    async def run_async(self, model: str) -> dict[str, Any]:
        """
        Run the evaluation suite asynchronously.

        At most ``max_concurrent`` cases are in flight at any time.

        Args:
            model: The model to evaluate.

        Returns:
            A dictionary containing the model, the suite rubric, and the case results.

        Raises:
            ValueError: If the client has not been initialized.
        """
        if not self._client:
            raise ValueError("Client not initialized. Call initialize_client() first.")

        results: dict[str, Any] = {"model": model, "rubric": self.rubric, "cases": []}

        semaphore = asyncio.Semaphore(self.max_concurrent)
        tool_names = list(self.catalog.tools.keys())

        async def sem_task(case: EvalCase) -> dict[str, Any]:
            # The semaphore caps how many cases call the model concurrently.
            async with semaphore:
                return await case.run_async(self._client, model, tool_names)  # type: ignore[arg-type]

        tasks = [sem_task(case) for case in self.cases]
        case_results = await asyncio.gather(*tasks)

        results["cases"] = case_results
        return results

    def run_sync(self, model: str) -> dict[str, Any]:
        """
        Run the evaluation suite synchronously.

        Args:
            model: The model to evaluate.

        Returns:
            A dictionary containing the model, the suite rubric, and the case results.

        Raises:
            ValueError: If the client has not been initialized.
        """
        if not self._client:
            raise ValueError("Client not initialized. Call initialize_client() first.")

        cases: list[dict[str, Any]] = []
        results = {"model": model, "rubric": self.rubric, "cases": cases}
        tool_names = list(self.catalog.tools.keys())
        for case in self.cases:
            result = case.run_sync(self._client, model, tool_names)  # type: ignore[arg-type]
            cases.append(result)

        return results

    def run(self, model: str) -> dict[str, Any]:
        """
        Run the evaluation suite.

        Initializes the client on first use, then runs concurrently when
        ``max_concurrent`` is greater than 1 and sequentially otherwise.

        Args:
            model: The model to evaluate.

        Returns:
            A dictionary containing the evaluation results.
        """
        if not self._client:
            self.initialize_client()

        if self.max_concurrent > 1:
            # Run asynchronously with concurrency
            return asyncio.run(self.run_async(model))
        else:
            # Run synchronously
            return self.run_sync(model)
def get_tool_args(chat_completion: Any) -> list[tuple[str, dict[str, Any]]]:
    """
    Returns the tool arguments from the chat completion object.

    Only the first choice of the completion is inspected.

    Args:
        chat_completion: The chat completion object.

    Returns:
        A list of tuples containing the tool name and arguments. The list is empty
        when the completion has no choices or the message made no tool calls.

    Raises:
        json.JSONDecodeError: If a tool call carries non-empty arguments that are
            not valid JSON.
    """
    # A completion without choices carries no tool calls; do not index into it.
    if not chat_completion.choices:
        return []

    message = chat_completion.choices[0].message
    return [
        (
            tool_call.function.name,
            # A tool called without arguments may carry an empty string, which
            # json.loads rejects; treat it as "no arguments".
            json.loads(tool_call.function.arguments) if tool_call.function.arguments else {},
        )
        for tool_call in (message.tool_calls or [])
    ]


def tool_eval() -> Callable[[Callable], Callable]:
    """
    Build a decorator that turns an EvalSuite factory into a runnable evaluation.

    The decorated function must take no arguments and return an EvalSuite. The
    resulting wrapper accepts ``models`` and ``max_concurrency``, runs the suite
    once per model, and returns the list of per-model results. The wrapper is
    tagged with ``__tool_eval__ = True``.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(
            models: list[str],
            max_concurrency: int = 1,
        ) -> list[dict[str, Any]]:
            suite = func()
            if not isinstance(suite, EvalSuite):
                raise TypeError("Eval function must return an EvalSuite")
            suite.max_concurrent = max_concurrency
            return [suite.run(model) for model in models]

        wrapper.__tool_eval__ = True  # type: ignore[attr-defined]
        return wrapper

    return decorator
openai = "^1.36.0" # TODO: relax to an earlier version that still has what we need pyjwt = "^2.8.0" -fastapi = { version = "^0.110.0", optional = true } -flask = { version = "^3.0.3", optional = true } +[tool.poetry.group.fastapi.dependencies] +fastapi = "^0.110.0" + +[tool.poetry.group.flask.dependencies] +flask = "^3.0.3" [tool.poetry.group.dev.dependencies] pytest = "^8.1.1" @@ -41,16 +44,17 @@ mkdocs = ">=1.5.2" mkdocs-material = ">=9.3.0" mkdocstrings = {extras = ["python"], version = ">=0.23.1"} -[tool.poetry.extras] -fastapi = ["fastapi"] -flask = ["flask"] - +[tool.poetry.group.evals.dependencies] +scipy = "^1.14.0" +numpy = "^2.0.0" +scikit-learn = "^1.5.0" [tool.poetry.scripts] arcade = "arcade.cli.main:cli" [tool.mypy] files = ["arcade"] +python_version = "3.10" disallow_untyped_defs = "True" disallow_any_unimported = "True" no_implicit_optional = "True" @@ -58,6 +62,7 @@ check_untyped_defs = "True" warn_return_any = "True" warn_unused_ignores = "True" show_error_codes = "True" +ignore_missing_imports = "True" [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/arcade/tests/client/test_client.py b/arcade/tests/client/test_client.py index 4cb8f7c7..4db7d258 100644 --- a/arcade/tests/client/test_client.py +++ b/arcade/tests/client/test_client.py @@ -40,9 +40,9 @@ TOOL_DEFINITION_DATA = { "input_schema": {"type": "object", "properties": {"n_emails": {"type": "integer"}}}, "output_schema": {"type": "array", "items": {"type": "string"}}, "version": "0.1.0", - "inputs": {"parameters": []}, # Update this line + "inputs": {"parameters": []}, "output": {}, - "requirements": {"auth_requirements": []}, # Update this line + "requirements": {"auth_requirements": []}, } TOOL_AUTHORIZE_RESPONSE_DATA = { diff --git a/arcade/tests/sdk/test_eval.py b/arcade/tests/sdk/test_eval.py new file mode 100644 index 00000000..f6281aaa --- /dev/null +++ b/arcade/tests/sdk/test_eval.py @@ -0,0 +1,342 @@ +import pytest + +from arcade.sdk.error import WeightError +from 
@pytest.mark.parametrize(
    "expected, actual, weight, expected_match, expected_score",
    [
        ("value", "value", 1.0, True, 1.0),
        ("value", "different", 1.0, False, 0.0),
        (10, 10, 0.5, True, 0.5),
        (10, 20, 0.5, False, 0.0),
    ],
)
def test_binary_critic_evaluate(expected, actual, weight, expected_match, expected_score):
    """
    Check that BinaryCritic.evaluate reports the expected match flag and score
    for each expected/actual pair at the given weight.
    """
    binary_critic = BinaryCritic(critic_field="test_field", weight=weight)
    outcome = binary_critic.evaluate(expected=expected, actual=actual)

    matched, score = outcome["match"], outcome["score"]
    assert matched == expected_match
    assert score == expected_score
@pytest.mark.parametrize(
    "expected, actual, weight, similarity_threshold, expected_match, min_expected_score",
    [
        ("hello world", "hello world", 1.0, 0.8, True, 1.0),
        ("hello world", "hello", 1.0, 0.8, False, 0.0),
        ("The quick brown fox", "The quick brown fox jumps over the lazy dog", 1.0, 0.5, True, 0.5),
        ("data science", "machine learning", 0.5, 0.3, False, 0.0),
    ],
)
def test_similarity_critic_evaluate(
    expected, actual, weight, similarity_threshold, expected_match, min_expected_score
):
    """
    Check that SimilarityCritic.evaluate decides the match against the similarity
    threshold and returns a score between the expected minimum and the weight.
    """
    similarity_critic = SimilarityCritic(
        critic_field="text",
        weight=weight,
        similarity_threshold=similarity_threshold,
    )
    outcome = similarity_critic.evaluate(expected=expected, actual=actual)
    score = outcome["score"]

    assert outcome["match"] == expected_match
    assert score >= min_expected_score
    # The score is bounded by zero and the weight; the epsilon absorbs float rounding.
    assert 0.0 <= score <= weight + 1e-6
def test_eval_case_evaluate():
    """
    Check that EvalCase.evaluate combines tool selection and critic scores into
    the overall score and applies the rubric thresholds to the pass status.
    """
    # Both tools are selected correctly; only the second call has a wrong argument.
    expected_calls = [
        ExpectedToolCall(name=tool, args={"param": value})
        for tool, value in (("ToolA", "value1"), ("ToolB", "value2"))
    ]
    actual_calls = [
        ("ToolA", {"param": "value1"}),
        ("ToolB", {"param": "wrong_value"}),
    ]

    eval_case = EvalCase(
        name="TestCase",
        system_message="System message",
        user_message="User message",
        expected_tool_calls=expected_calls,
        critics=[BinaryCritic(critic_field="param", weight=1.0)],
        rubric=EvalRubric(fail_threshold=0.75, warn_threshold=0.9, tool_selection_weight=1.0),
    )

    outcome = eval_case.evaluate(actual_calls)

    # Expected arithmetic:
    # - tool selection: 2 * 1.0 = 2.0 (both tools correct)
    # - critics: 1.0 (match) + 0.0 (no match) = 1.0
    # - total weight: 2.0 (tool selection) + 2.0 (critics) = 4.0
    # - score: (2.0 + 1.0) / 4.0 = 0.75
    assert outcome.score == 0.75
    assert outcome.passed is True
def test_eval_case_multiple_critics():
    """
    Check that EvalCase.evaluate folds several critics with different weights
    into a single total score.
    """
    eval_case = EvalCase(
        name="TestCase",
        system_message="",
        user_message="",
        expected_tool_calls=[
            ExpectedToolCall(name="ToolA", args={"param1": "value1", "param2": "value2"}),
        ],
        critics=[
            BinaryCritic(critic_field="param1", weight=0.6),
            SimilarityCritic(critic_field="param2", weight=0.4, similarity_threshold=0.8),
        ],
        rubric=EvalRubric(fail_threshold=0.7),
    )

    outcome = eval_case.evaluate(
        [("ToolA", {"param1": "value1", "param2": "wrong_value"})],
    )

    # Expected arithmetic:
    # - tool selection: 1.0
    # - param1: match (0.6); param2: likely no match (~0.0)
    # - score: (1.0 + 0.6 + 0.0) / (1.0 + 0.6 + 0.4) = 1.6 / 2.0 = 0.8
    assert pytest.approx(outcome.score, 0.01) == 0.8
    assert outcome.passed
@pytest.mark.parametrize(
    "critic_class, weight",
    [
        (BinaryCritic, -0.1),
        (BinaryCritic, 1.1),
        (NumericCritic, -0.5),
        (SimilarityCritic, 1.5),
    ],
)
def test_critic_invalid_weight(critic_class, weight):
    """
    Check that constructing any critic with an out-of-range weight raises WeightError.
    """
    # NumericCritic is the only critic constructed with a value range here.
    extra_kwargs = {"value_range": (0, 1)} if critic_class == NumericCritic else {}

    with pytest.raises(WeightError):
        critic_class(critic_field="test_field", weight=weight, **extra_kwargs)
def test_similarity_critic_unsupported_metric():
    """
    Check that SimilarityCritic rejects an unknown metric name with a ValueError.
    """
    critic_kwargs = {"critic_field": "text", "weight": 1.0, "metric": "unsupported_metric"}

    with pytest.raises(ValueError):
        SimilarityCritic(**critic_kwargs)
@app.function(image=image)
@asgi_app()
def fastapi_app():
    """
    Build the FastAPI application served by the Modal function.

    Creates a FastAPI app, wraps it in a FastAPIActor, registers every toolkit
    returned by ``Toolkit.find_all_arcade_toolkits()``, and returns the app.
    """
    # Imported here rather than at module level, as in the original function.
    from fastapi import FastAPI

    from arcade.actor.fastapi.actor import FastAPIActor
    from arcade.core.toolkit import Toolkit

    api = FastAPI()
    arcade_actor = FastAPIActor(api)

    # Register toolkits we've installed
    for installed_toolkit in Toolkit.find_all_arcade_toolkits():
        arcade_actor.register_toolkit(installed_toolkit)

    return api
@tool_eval()
def gmail_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the Gmail tools."""
    # Imported locally so this function brings its own dependency into scope.
    from arcade.core.catalog import ToolCatalog

    # Register the Gmail tools
    catalog = ToolCatalog()
    catalog.add_tool(write_draft)
    catalog.add_tool(search_emails_by_header)
    catalog.add_tool(get_emails)

    # The rubric is set on the suite so every case uses the same thresholds,
    # including the case added through extend_case.
    suite = EvalSuite(
        name="Gmail Tools Evaluation",
        system_message="You are an AI assistant with access to Gmail tools. Use them to help the user with their email-related tasks.",
        catalog=catalog,
        rubric=rubric,
    )

    # Write Draft Scenarios
    suite.add_case(
        name="Write Draft with specified recipient, subject, and body",
        user_message="Draft and email to john@example.com asking if we can meet tomorrow at 2 PM",
        expected_tool_calls=[
            ExpectedToolCall(
                name="WriteDraft",
                args={
                    "recipient": "john@example.com",
                    "subject": "Meeting Tomorrow",
                    "body": "Hi John, Can we meet tomorrow at 2 PM? Thanks, Alice",
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="recipient", weight=0.5),
            SimilarityCritic(critic_field="subject", weight=0.2),
            SimilarityCritic(critic_field="body", weight=0.3),
        ],
    )

    # Search Emails by Header Scenarios
    suite.add_case(
        name="Search for emails from a specific sender and time period",
        user_message="Find emails from alice@example.com sent last week",
        expected_tool_calls=[
            ExpectedToolCall(
                name="SearchEmailsByHeader",
                args={
                    "sender": "alice@example.com",
                    "date_range": DateRange.LAST_7_DAYS.value,
                    "limit": 25,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="sender", weight=0.5),
            BinaryCritic(critic_field="date_range", weight=0.4),
            NumericCritic(critic_field="limit", weight=0.1, value_range=(1, 100)),
        ],
    )

    suite.add_case(
        name="Search by subject and date range",
        user_message="Search for emails with 'Urgent' in the subject from the last 30 days",
        expected_tool_calls=[
            ExpectedToolCall(
                name="SearchEmailsByHeader",
                args={
                    "subject": "Urgent",
                    "date_range": DateRange.LAST_30_DAYS.value,
                    "limit": 25,
                },
            )
        ],
        critics=[
            SimilarityCritic(critic_field="subject", weight=0.4),
            BinaryCritic(critic_field="date_range", weight=0.4),
            NumericCritic(critic_field="limit", weight=0.2, value_range=(1, 100)),
        ],
    )

    # Follow-up to the previous case; critics are inherited from it.
    suite.extend_case(
        name="Followup search by subject and date range",
        user_message="show me more of those",
        expected_tool_calls=[
            ExpectedToolCall(
                name="SearchEmailsByHeader",
                args={
                    "subject": "Urgent",
                    "date_range": DateRange.LAST_30_DAYS.value,
                    "limit": 50,
                },
            )
        ],
    )

    suite.add_case(
        name="Retrieve specific number of emails",
        user_message="Retrieve the last 10 emails in my inbox",
        expected_tool_calls=[
            ExpectedToolCall(
                name="GetEmails",
                args={"n_emails": 10},
            )
        ],
        critics=[
            BinaryCritic(critic_field="n_emails", weight=0.8),
            NumericCritic(critic_field="n_emails", weight=0.2, value_range=(1, 20)),
        ],
    )

    return suite
) + + # Send DM to User Scenarios + suite.add_case( + name="Send DM to user with clear username", + user_message="Send a direct message to johndoe saying 'Hello, can we meet at 3 PM?'", + expected_tool_calls=[ + ExpectedToolCall( + name="SendDmToUser", + args={ + "user_name": "johndoe", + "message": "Hello, can we meet at 3 PM?", + }, + ) + ], + critics=[ + BinaryCritic(critic_field="user_name", weight=0.5), + SimilarityCritic(critic_field="message", weight=0.5), + ], + ) + + suite.add_case( + name="Send DM with ambiguous username", + user_message="Message John about the project deadline", + expected_tool_calls=[ + ExpectedToolCall( + name="SendDmToUser", + args={ + "user_name": "john", + "message": "Hi John, I wanted to check about the project deadline. Can you provide an update?", + }, + ) + ], + critics=[ + SimilarityCritic(critic_field="user_name", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + suite.add_case( + name="Send DM with username in different format", + user_message="DM Jane.Doe to reschedule our meeting", + expected_tool_calls=[ + ExpectedToolCall( + name="SendDmToUser", + args={ + "user_name": "jane.doe", + "message": "Hi Jane, I need to reschedule our meeting. When are you available?", + }, + ) + ], + critics=[ + BinaryCritic(critic_field="user_name", weight=0.5), + SimilarityCritic(critic_field="message", weight=0.5), + ], + ) + + # Send Message to Channel Scenarios + suite.add_case( + name="Send message to channel with clear name", + user_message="Post 'The new feature is now live!' 
in the #announcements channel", + expected_tool_calls=[ + ExpectedToolCall( + name="SendMessageToChannel", + args={ + "channel_name": "announcements", + "message": "The new feature is now live!", + }, + ) + ], + critics=[ + BinaryCritic(critic_field="channel_name", weight=0.5), + SimilarityCritic(critic_field="message", weight=0.5), + ], + ) + + suite.add_case( + name="Send message to channel with ambiguous name", + user_message="Inform the engineering team about the upcoming maintenance in the general channel", + expected_tool_calls=[ + ExpectedToolCall( + name="SendMessageToChannel", + args={ + "channel_name": "engineering", + "message": "Attention team: There will be upcoming maintenance. Please save your work and expect some downtime.", + }, + ) + ], + critics=[ + SimilarityCritic(critic_field="channel_name", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + # Adversarial Scenarios + suite.add_case( + name="Ambiguous between DM and channel message", + user_message="Send 'Great job on the presentation!' to the team", + expected_tool_calls=[ + ExpectedToolCall( + name="SendMessageToChannel", + args={ + "channel_name": "general", + "message": "Great job on the presentation!", + }, + ) + ], + critics=[ + SimilarityCritic(critic_field="channel_name", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + # Multiple recipients in DM request + suite.add_case( + name="Multiple recipients in DM request", + user_message="Send a DM to Alice and Bob about pushing the meeting tomorrow. I have to much work to do.", + expected_tool_calls=[ + ExpectedToolCall( + name="SendDmToUser", + args={ + "user_name": "alice", + "message": "Hi Alice, about our meeting tomorrow, let's reschedule? I am swamped with work.", + }, + ), + ExpectedToolCall( + name="SendDmToUser", + args={ + "user_name": "bob", + "message": "Hi Bob, about our meeting tomorrow, let's reschedule? 
I am swamped with work.", + }, + ), + ], + critics=[ + SimilarityCritic(critic_field="user_name", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + suite.add_case( + name="Channel name similar to username", + user_message="Post 'sounds great!' in john-project channel", + expected_tool_calls=[ + ExpectedToolCall( + name="SendMessageToChannel", + args={ + "channel_name": "john-project", + "message": "Sounds great!", + }, + ) + ], + critics=[ + BinaryCritic(critic_field="channel_name", weight=0.5), + SimilarityCritic(critic_field="message", weight=0.5), + ], + ) + + return suite