arcade-mcp/libs/tests/sdk/test_eval.py
Sam Partee b6b4cd0a4c
🏗️ Restructure: Multi-Package Architecture + uv Migration (#412)
### Overview
Major restructuring from monolithic `arcade-ai` package to modular
library architecture with standardized uv-based dependency management.

![arcade-ai Monorepo
(2)](https://github.com/user-attachments/assets/25f102b0-bb87-4a04-9701-d227d05664b1)

### New Package Structure
- **`arcade-tdk`** - Lightweight toolkit development kit (core
decorators, auth)
- **`arcade-core`** - Core execution engine and catalog functionality  
- **`arcade-serve`** - FastAPI/MCP server components
- **`arcade-ai`** - Meta package that includes CLI functionality.
Optionally include evals via the `evals` extra. Optionally include all
packages via the `all` extra.

### Key Benefits
- **Lighter Dependencies**: Toolkits now depend only on `arcade-tdk` (~2
deps) vs full `arcade-ai` (~30+ deps)
- **Faster Builds**: uv provides 10-100x faster dependency resolution
and installation
- **Better Modularity**: Clear separation of concerns, consumers import
only what they need
- **Standard Tooling**: Eliminates custom poetry scripts, uses standard
Python packaging

### Migration Impact
- All 20 toolkits converted from poetry → uv with `arcade-tdk`
dependencies plus `arcade-ai[evals]` and `arcade-serve` dev
dependencies. When developing locally, devs should install toolkits via
`make install-local`.
- Modern Python 3.10+ type hints throughout
- Standardized build system with hatchling backend
- Enhanced Makefile with robust toolkit management commands
- Removed `arcade dev` CLI command
- Reduce the number of files created by `arcade new` and add an option
to not generate a tests and evals folder.

This foundation enables faster development cycles and cleaner dependency
chains for the growing toolkit ecosystem.

### Todo After this PR is merged
- [ ] Post-merge workflow(s) (release & publish containers, etc)
- [ ] Release order plan. @EricGustin suggests releasing in the
following order:
    1. `arcade-core` version 0.1.0
    2. `arcade-serve` version 0.1.0 and `arcade-tdk` version 0.1.0
    3. `arcade-ai` version 2.0.0
4. Patch release for all toolkits (all changes in toolkits are internal
refactors)
- [ ] [Update docs](https://github.com/ArcadeAI/docs/pull/318)

---------

Co-authored-by: Eric Gustin <eric@arcade.dev>
Co-authored-by: Eric Gustin <34000337+EricGustin@users.noreply.github.com>
2025-06-11 16:48:17 -07:00

412 lines
13 KiB
Python

from unittest.mock import Mock
import pytest
from arcade_evals import (
BinaryCritic,
EvalRubric,
ExpectedToolCall,
NamedExpectedToolCall,
NoneCritic,
SimilarityCritic,
)
from arcade_evals.eval import EvalCase, EvalSuite, EvaluationResult
from arcade_tdk import tool
@tool
def mock_tool(param1: str):
pass
@tool
def mock_tool_no_args():
pass
@tool
def mock_tool_multiple_args(
param1: str, param2: str, param3: str = "value3", param4: str = "value4"
):
pass
# Test EvaluationResult accumulation and pass/fail logic
def test_evaluation_result_accumulation():
"""
Test that EvaluationResult correctly accumulates scores and determines
pass/fail status based on thresholds.
"""
evaluation = EvaluationResult()
evaluation.add(
field="field1",
result={"match": True, "score": 0.8},
weight=1.0,
expected="expected_value",
actual="actual_value",
)
evaluation.add(
field="field2",
result={"match": False, "score": 0.0},
weight=0.5,
expected="expected_value",
actual="actual_value",
)
total_weight = 1.5
expected_score = (0.8 * 1.0 + 0.0 * 0.5) / total_weight
evaluation.compute_final_score(total_weight)
assert evaluation.score == expected_score
# Test EvalCase.evaluate()
def test_eval_case_evaluate():
"""
Test EvalCase's evaluate method to ensure it calculates the overall score
correctly based on tool selection and critics, and applies the rubric's
thresholds to determine pass/fail/warning status.
"""
# Define expected tool calls and actual tool calls
expected_tool_calls = [
NamedExpectedToolCall(name="ToolA", args={"param": "value1"}),
NamedExpectedToolCall(name="ToolB", args={"param": "value2"}),
]
actual_tool_calls = [
("ToolA", {"param": "value1"}),
("ToolB", {"param": "wrong_value"}),
]
# Define critics
critics = [
BinaryCritic(critic_field="param", weight=1.0),
]
# Create EvalCase with a rubric
case = EvalCase(
name="TestCase",
system_message="System message",
user_message="User message",
expected_tool_calls=expected_tool_calls,
critics=critics,
rubric=EvalRubric(fail_threshold=0.75, warn_threshold=0.9, tool_selection_weight=1.0),
)
# Evaluate the case
result = case.evaluate(actual_tool_calls)
# Expected calculations:
# - Tool selection score should be 2 * 1.0 = 2.0 (both tools are correct)
# - First critic score: match (1.0)
# - Second critic score: no match (0.0)
# - Total critic score: 1.0 + 0.0 = 1.0
# - Total weight: tool selection (2.0) + critics (2.0) = 4.0
# - Total score: (2.0 + 1.0) / 4.0 = 0.75
assert result.score == 0.75
assert result.passed is True
# Test EvalCase with mismatched tool calls
def test_eval_case_evaluate_mismatched_tools():
"""
Test EvalCase's evaluate method when the actual tool calls do not match
the expected tool calls to ensure tool selection scoring is correct.
"""
expected_tool_calls = [
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
]
actual_tool_calls = [
("ToolB", {"param": "value"}),
]
critics = [BinaryCritic(critic_field="param", weight=1.0)]
case = EvalCase(
name="TestCase",
system_message="",
user_message="",
expected_tool_calls=expected_tool_calls,
critics=critics,
rubric=EvalRubric(tool_selection_weight=1.0),
)
result = case.evaluate(actual_tool_calls)
# Tool selection score should be 0.0 since the tools don't match
# Critic is not evaluated since the tool selection failed
# Total score: 0.0
assert result.score == 0.0
assert result.passed is False
# Test EvalCase with multiple critics and weights
def test_eval_case_multiple_critics():
"""
Test EvalCase's evaluate method with multiple critics having different weights
to ensure individual critic scores are correctly combined into the total score.
"""
expected_tool_calls = [
NamedExpectedToolCall(name="ToolA", args={"param1": "value1", "param2": "value2"}),
]
actual_tool_calls = [
("ToolA", {"param1": "value1", "param2": "wrong_value"}),
]
critics = [
BinaryCritic(critic_field="param1", weight=0.6),
SimilarityCritic(critic_field="param2", weight=0.4, similarity_threshold=0.8),
]
case = EvalCase(
name="TestCase",
system_message="",
user_message="",
expected_tool_calls=expected_tool_calls,
critics=critics,
rubric=EvalRubric(fail_threshold=0.7),
)
result = case.evaluate(actual_tool_calls)
# Tool selection score: 1.0
# Critic scores:
# - param1: match (score 0.6)
# - param2: likely not match (score ~0.0)
# Total score: (1.0 + 0.6 + 0.0) / (1.0 + 0.6 + 0.4) = 1.6 / 2.0 = 0.8
assert pytest.approx(result.score, 0.01) == 0.8
assert result.passed
# Test EvalCase with missing expected and actual values in args
def test_eval_case_with_none_values():
"""
Test that when expected or actual values are None, the critic evaluates them appropriately.
"""
expected_args = {"param": None}
actual_args = {"param": None}
expected_tool_calls = [NamedExpectedToolCall(name="ToolA", args=expected_args)]
actual_tool_calls = [("ToolA", actual_args)]
critics = [BinaryCritic(critic_field="param", weight=1.0)]
case = EvalCase(
name="TestCase",
system_message="",
user_message="",
expected_tool_calls=expected_tool_calls,
critics=critics,
rubric=EvalRubric(tool_selection_weight=1.0),
)
result = case.evaluate(actual_tool_calls)
# Both values are None, so the critic should return a match
assert result.score == 2.0 / 2.0 # Full score (tool selection + critic score)
# Test EvalSuite.add_case()
def test_eval_suite_add_case():
"""
Test that add_case correctly adds a new evaluation case to the suite.
"""
mock_catalog = Mock()
mock_catalog.find_tool_by_func.return_value.get_fully_qualified_name.return_value = "MockTool"
suite = EvalSuite(name="TestSuite", system_message="System message", catalog=mock_catalog)
expected_tool_calls = [
ExpectedToolCall(
func=mock_tool,
args={"param1": "value"},
),
(
mock_tool,
{"param1": "value"},
),
]
suite.add_case(
name="TestCase",
user_message="User message",
expected_tool_calls=expected_tool_calls,
)
assert len(suite.cases) == 1
case = suite.cases[0]
assert len(case.expected_tool_calls) == 2
assert case.name == "TestCase"
assert case.user_message == "User message"
assert case.system_message == "System message"
assert case.expected_tool_calls[0] == NamedExpectedToolCall(
name="MockTool", args={"param1": "value"}
)
assert case.expected_tool_calls[1] == NamedExpectedToolCall(
name="MockTool", args={"param1": "value"}
)
# Test EvalSuite.extend_case()
def test_eval_suite_extend_case():
"""
Test that extend_case correctly extends the last added case with new information.
"""
mock_catalog = Mock()
mock_catalog.find_tool_by_func.return_value.get_fully_qualified_name.return_value = "MockTool"
suite = EvalSuite(name="TestSuite", system_message="System message", catalog=mock_catalog)
expected_tool_calls = [
ExpectedToolCall(
func=mock_tool,
args={"param1": "value"},
),
(
mock_tool,
{"param1": "value"},
),
]
suite.add_case(
name="InitialCase",
user_message="Initial user message",
expected_tool_calls=expected_tool_calls,
)
suite.extend_case(
name="ExtendedCase",
user_message="Extended user message",
expected_tool_calls=expected_tool_calls,
)
assert len(suite.cases) == 2
initial_case = suite.cases[0]
extended_case = suite.cases[1]
assert initial_case.name == "InitialCase"
assert extended_case.name == "ExtendedCase"
assert extended_case.user_message == "Extended user message"
assert extended_case.system_message == "System message"
assert len(extended_case.expected_tool_calls) == 2
assert extended_case.expected_tool_calls[0] == NamedExpectedToolCall(
name="MockTool", args={"param1": "value"}
)
assert extended_case.expected_tool_calls[1] == NamedExpectedToolCall(
name="MockTool", args={"param1": "value"}
)
def test_eval_suite_validate_critics_raises_value_error():
"""
Test that validate_critics raises a ValueError if multiple critics are detected for the same field.
"""
mock_catalog = Mock()
suite = EvalSuite(name="TestSuite", system_message="System message", catalog=mock_catalog)
case_name = "TestCase"
critics = [
BinaryCritic(critic_field="param", weight=0.5),
SimilarityCritic(critic_field="param", weight=0.5),
]
with pytest.raises(ValueError):
suite._validate_critics(critics, case_name)
def test_eval_suite_validate_critics_no_error():
"""
Test that validate_critics does not raise an error when critics are valid.
"""
mock_catalog = Mock()
suite = EvalSuite(name="TestSuite", system_message="System message", catalog=mock_catalog)
case_name = "TestCase"
critics = [
BinaryCritic(critic_field="param1", weight=0.5),
]
suite._validate_critics(critics, case_name)
@pytest.mark.parametrize(
"expected_tool_calls, critics, expected_critics_count, expected_critics_types",
[
(
# Test case 1: No arguments, expect no critics
[NamedExpectedToolCall(name="MockToolNoArgs", args={})],
None,
0,
[],
),
(
# Test case 2: Single argument, expect one NoneCritic
[NamedExpectedToolCall(name="MockTool", args={"param1": "value"})],
None,
1,
[(NoneCritic, "param1")],
),
(
# Test case 3: Multiple arguments with some critics, expect BinaryCritics for specified fields and NoneCritics for others
[
NamedExpectedToolCall(
name="MockToolMultipleArgs",
args={
"param1": "value1",
"param2": "value2",
"param3": "value3",
"param4": "value4",
},
)
],
[
BinaryCritic(critic_field="param1", weight=0.5),
BinaryCritic(critic_field="param2", weight=0.5),
],
4,
[
(BinaryCritic, "param1"),
(BinaryCritic, "param2"),
(NoneCritic, "param3"),
(NoneCritic, "param4"),
],
),
(
# Test case 4: Mixed tool calls with multiple critics, expect BinaryCritics for specified fields and NoneCritics for others
[
NamedExpectedToolCall(name="MockTool", args={"param1": "value"}),
NamedExpectedToolCall(name="MockToolNoArgs", args={}),
NamedExpectedToolCall(
name="MockToolMultipleArgs",
args={
"param1": "value1",
"param2": "value2",
"param3": "value3",
"param4": "value4",
},
),
],
[
BinaryCritic(critic_field="param1", weight=0.3),
BinaryCritic(critic_field="param2", weight=0.3),
BinaryCritic(critic_field="param3", weight=0.3),
],
4,
[
(BinaryCritic, "param1"),
(BinaryCritic, "param2"),
(BinaryCritic, "param3"),
(NoneCritic, "param4"),
],
),
],
)
def test_eval_suite_add_none_critics(
expected_tool_calls, critics, expected_critics_count, expected_critics_types
):
mock_catalog = Mock()
suite = EvalSuite(name="TestSuite", system_message="System message", catalog=mock_catalog)
critics_with_none = suite._add_none_critics(expected_tool_calls, critics)
assert len(critics_with_none) == expected_critics_count
for i, (expected_type, expected_field) in enumerate(expected_critics_types):
assert isinstance(critics_with_none[i], expected_type)
assert critics_with_none[i].critic_field == expected_field