arcade-mcp/libs/arcade-evals/arcade_evals/critic.py
Sam Partee b6b4cd0a4c
🏗️ Restructure: Multi-Package Architecture + uv Migration (#412)
### Overview
Major restructuring from monolithic `arcade-ai` package to modular
library architecture with standardized uv-based dependency management.

![arcade-ai Monorepo
(2)](https://github.com/user-attachments/assets/25f102b0-bb87-4a04-9701-d227d05664b1)

### New Package Structure
- **`arcade-tdk`** - Lightweight toolkit development kit (core
decorators, auth)
- **`arcade-core`** - Core execution engine and catalog functionality  
- **`arcade-serve`** - FastAPI/MCP server components
- **`arcade-ai`** - Meta package that includes CLI functionality.
Optionally include evals via the `evals` extra. Optionally include all
packages via the `all` extra.

### Key Benefits
- **Lighter Dependencies**: Toolkits now depend only on `arcade-tdk` (~2
deps) vs full `arcade-ai` (~30+ deps)
- **Faster Builds**: uv provides 10-100x faster dependency resolution
and installation
- **Better Modularity**: Clear separation of concerns, consumers import
only what they need
- **Standard Tooling**: Eliminates custom poetry scripts, uses standard
Python packaging

### Migration Impact
- All 20 toolkits converted from poetry → uv with `arcade-tdk`
dependencies plus `arcade-ai[evals]` and `arcade-serve` dev
dependencies. When developing locally, devs should install toolkits via
`make install-local`.
- Modern Python 3.10+ type hints throughout
- Standardized build system with hatchling backend
- Enhanced Makefile with robust toolkit management commands
- Removed `arcade dev` CLI command
- Reduce the number of files created by `arcade new` and add an option
to not generate a tests and evals folder.

This foundation enables faster development cycles and cleaner dependency
chains for the growing toolkit ecosystem.

### Todo After this PR is merged
- [ ] Post-merge workflow(s) (release & publish containers, etc)
- [ ] Release order plan. @EricGustin suggests releasing in the
following order:
    1. `arcade-core` version 0.1.0
    2. `arcade-serve` version 0.1.0 and `arcade-tdk` version 0.1.0
    3. `arcade-ai` version 2.0.0
4. Patch release for all toolkits (all changes in toolkits are internal
refactors)
- [ ] [Update docs](https://github.com/ArcadeAI/docs/pull/318)

---------

Co-authored-by: Eric Gustin <eric@arcade.dev>
Co-authored-by: Eric Gustin <34000337+EricGustin@users.noreply.github.com>
2025-06-11 16:48:17 -07:00

291 lines
11 KiB
Python

from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import timedelta
from typing import Any, ClassVar
import pytz
from dateutil import parser
from arcade_evals.errors import WeightError
@dataclass
class Critic(ABC):
critic_field: str
weight: float
def __post_init__(self) -> None:
if self.weight < 0 or self.weight > 1:
raise WeightError(f"Critic weight must be between 0 and 1, got {self.weight}")
@abstractmethod
def evaluate(self, expected: Any, actual: Any) -> dict[str, Any]:
pass
@dataclass
class NoneCritic(Critic):
"""
A critic that has no effect on the evaluation results and does not actually evaluate.
If a critic is not found for an evaluation case's field, then
a NoneCritic is used to indicate that the field was not criticized.
"""
weight: float = 0.0
def __post_init__(self) -> None:
self.weight = 0.0
super().__post_init__()
def evaluate(self, expected: Any, actual: Any) -> dict[str, Any]:
return {"match": None, "score": self.weight, "is_criticized": False}
@dataclass
class BinaryCritic(Critic):
"""
A critic for performing exact equality comparisons between expected and actual values.
This critic evaluates whether the expected and actual values are exactly equal.
It's useful for scenarios where only an exact match is acceptable.
Returns:
A dict with:
- "match": True if expected == actual, otherwise False.
- "score": The full weight if there's a match, otherwise 0.0.
"""
def cast_actual(self, expected: Any, actual: Any) -> Any:
"""
Casts the actual value to the type of the expected value.
Args:
expected (Any): The expected value whose type will be used for casting.
actual (Any): The actual value to be cast.
Returns:
Any: The actual value cast to the type of the expected value.
Raises:
TypeError: If the casting is not possible.
"""
# In case both are strings.
if actual == "None":
actual = None
if expected == "None":
expected = None
if expected is None:
# No need to cast; return actual as is
return actual
if actual is None:
# No need to cast; return None
return None
expected_type = type(expected)
try:
return expected_type(actual)
except (ValueError, TypeError) as e:
raise TypeError(
f"Cannot cast actual value '{actual}' to type {expected_type.__name__}: {e}"
) from e
def evaluate(self, expected: Any, actual: Any) -> dict[str, float | bool]:
"""
Evaluates whether the expected and actual values are exactly equal after casting.
Args:
expected: The expected value.
actual: The actual value to compare, cast to the type of expected.
Returns:
dict: A dictionary containing the match status and score.
"""
# Cast actual to the type of expected
try:
actual_casted = self.cast_actual(expected, actual)
# TODO log or something better here
except TypeError:
actual_casted = actual
match = expected == actual_casted
return {"match": match, "score": self.weight if match else 0.0}
@dataclass
class NumericCritic(Critic):
"""
A critic for evaluating numeric values within a specified range.
This critic performs a "fuzzy" comparison of numeric values, where values closer
to each other (relative to the specified range) result in higher scores. It's
useful for scenarios where exact matches aren't necessary, but closeness within
a certain tolerance is rewarded.
Attributes:
value_range: The min and max values of the expected range.
match_threshold: The threshold for considering a match (default 0.8).
The evaluation process:
1. Normalizes both expected and actual values to a 0-1 scale based on value_range.
2. Calculates the absolute difference between these normalized values.
3. Subtracts this difference from 1 to get a similarity score (closer to 1 is more similar).
4. Multiplies the similarity by the critic's weight for the final score.
Returns:
A dict with:
- "match": True if the score >= match_threshold, otherwise False.
- "score": The calculated score (similarity * weight).
"""
value_range: tuple[float, float]
match_threshold: float = 0.8
def __init__(
self,
critic_field: str,
weight: float,
value_range: tuple[float, float],
match_threshold: float = 0.8,
):
super().__init__(critic_field, weight)
if value_range[0] >= value_range[1]:
raise ValueError("Invalid value_range: minimum must be less than maximum.")
self.value_range = value_range
self.match_threshold = match_threshold
def evaluate(self, expected: Any, actual: Any) -> dict[str, Any]:
min_val, max_val = self.value_range
normalized_expected = float((float(expected) - min_val) / (max_val - min_val))
normalized_actual = float((float(actual) - min_val) / (max_val - min_val))
score = float(1 - abs(normalized_expected - normalized_actual))
return {"match": bool(score >= self.match_threshold), "score": float(score * self.weight)}
@dataclass
class SimilarityCritic(Critic):
"""
A critic for evaluating the similarity between two strings.
This critic uses a specified similarity metric to compare the expected and actual
string values. Currently, it supports cosine similarity using TF-IDF vectorization.
Args:
metric: The similarity metric to use (default is "cosine").
similarity_threshold: The threshold for considering a match (default 0.8).
The evaluation process:
1. Converts both expected and actual values to strings.
2. Calculates the similarity score using the specified metric.
3. Determines a match based on the similarity_threshold.
4. Calculates the final score by multiplying the similarity by the critic's weight.
Returns:
A dict with:
- "match": True if similarity >= similarity_threshold, otherwise False.
- "score": The calculated score (similarity * weight).
Raises:
ImportError: If scikit-learn is not installed (required for cosine similarity).
ValueError: If an unsupported similarity metric is specified.
"""
metric: str = "cosine"
similarity_threshold: float = 0.8
SUPPORTED_METRICS: ClassVar[list[str]] = ["cosine"]
def __init__(
self,
critic_field: str,
weight: float,
similarity_threshold: float = 0.8,
metric: str = "cosine",
):
super().__init__(critic_field, weight)
if metric not in self.SUPPORTED_METRICS:
raise ValueError(f"Unsupported similarity metric: {metric}")
self.similarity_threshold = similarity_threshold
self.metric = metric
def evaluate(self, expected: str, actual: str) -> dict[str, float | bool]:
if self.metric == "cosine":
try:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
except ImportError:
raise ImportError(
"Use `pip install 'arcade-evals` to install the required dependencies for similarity metrics."
)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([expected, actual])
similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
else:
raise ValueError(f"Unsupported similarity metric: {self.metric}")
return {
"match": similarity >= self.similarity_threshold,
"score": min(similarity * self.weight, self.weight),
}
@dataclass
class DatetimeCritic(Critic):
"""
A critic that evaluates the closeness of datetime values within a specified tolerance.
Attributes:
tolerance: Acceptable timedelta between expected and actual datetimes.
max_difference: Maximum timedelta for a partial score.
"""
critic_field: str
weight: float
tolerance: timedelta = timedelta(seconds=500)
max_difference: timedelta = timedelta(hours=2)
def evaluate(self, expected: str, actual: str) -> dict[str, float | bool]:
"""Evaluates the closeness of datetime values within a specified tolerance."""
# Attempt to parse expected and actual datetime strings
try:
expected_dt = parser.parse(expected)
actual_dt = parser.parse(actual)
except (ValueError, TypeError):
# If parsing fails, return score 0
return {"match": False, "score": 0.0}
# Handle cases based on presence of tzinfo
if expected_dt.tzinfo is None and actual_dt.tzinfo is None:
# Both datetimes are naive, compare directly
time_diff_seconds = abs((expected_dt - actual_dt).total_seconds())
elif expected_dt.tzinfo is not None and actual_dt.tzinfo is not None:
# Both datetimes have tzinfo, compare in UTC
expected_utc = expected_dt.astimezone(pytz.utc)
actual_utc = actual_dt.astimezone(pytz.utc)
time_diff_seconds = abs((expected_utc - actual_utc).total_seconds())
else:
# One datetime has tzinfo and the other doesn't
# Compare naive datetime with the other's naive equivalent
if expected_dt.tzinfo is not None:
expected_naive = expected_dt.replace(tzinfo=None)
time_diff_seconds = abs((expected_naive - actual_dt).total_seconds())
else:
actual_naive = actual_dt.replace(tzinfo=None)
time_diff_seconds = abs((expected_dt - actual_naive).total_seconds())
# Convert tolerances to seconds
tolerance_seconds = self.tolerance.total_seconds()
max_difference_seconds = self.max_difference.total_seconds()
if time_diff_seconds <= tolerance_seconds:
# Full score if within tolerance
return {"match": True, "score": self.weight}
elif time_diff_seconds >= max_difference_seconds:
# No score if beyond max_difference
return {"match": False, "score": 0.0}
else:
# Partial score based on time difference
ratio = 1 - (time_diff_seconds / max_difference_seconds)
# Ensure ratio is not negative
ratio = max(ratio, 0)
score = self.weight * ratio
return {"match": False, "score": score}