arcade-mcp/arcade/tests/sdk/test_eval.py
Eric Gustin be2539602f
Evals New Features (#208)
# PR Description
This PR adds ~~four~~ three improvements to evals.

~~## 1. Add parameterized eval cases~~
~~Adds a new method named `add_parameterized_case`. Just like pytest’s
parameterized tests, eval cases can be parameterized with multiple user
messages. Adds a case to the `EvalSuite` for each user message. All
cases have the same expected tool call(s), params, additional_messages.
This reduces duplicate code and makes it easy to observe how a model
performs based on increasingly more difficult prompts.~~
```python
""" NO LONGER IN THIS PR
user_messages = [
    "Call the delete tweet by id tool with the tweet ID '148975632'.",
    "Delete the tweet with ID '148975632'.",
    "I don't want to have this tweet (148975632) on my account anymore.",
    "do the opposite of post for https://x.com/x/status/148975632",
]

suite.add_parameterized_case(
    name="Delete a tweet by ID",
    user_messages=user_messages,
    expected_tool_calls=[
        ExpectedToolCall(
            func=delete_tweet_by_id,
            args={"tweet_id": "148975632"},
        )
    ],
    critics=[
        BinaryCritic(
            critic_field="tweet_id",
            weight=1.0,
        ),
    ],
)
"""
```

~~PASSED Delete a tweet by ID (user_message 1 of 4) -- Score: 100.00%~~
~~PASSED Delete a tweet by ID (user_message 2 of 4) -- Score: 100.00%~~
~~PASSED Delete a tweet by ID (user_message 3 of 4) -- Score: 100.00%~~
~~FAILED Delete a tweet by ID (user_message 4 of 4) -- Score: 0.00%~~
~~Summary -- Total: 4 -- Passed: 3 -- Failed: 1~~

## 2. Parameters that are not explicitly criticized are assigned a `NoneCritic`.
A `NoneCritic` has no effect on the evaluation results and does not
actually evaluate its parameter. Parameters that have a `NoneCritic` will
be displayed as ‘un-criticized’ in the evaluation summary (if the `-d` flag is used).

![image](https://github.com/user-attachments/assets/300756ec-9b53-436a-9cf9-fc61d0b00c01)


## 3. Add a hardcoded `seed` parameter for evals.
The seed parameter aids in receiving (mostly) consistent outputs,
which improves the reproducibility of evaluations.

## 4. Disallow more than one critic for the same field.
Raises a `ValueError` if more than one critic is assigned to a field.

---------

Co-authored-by: Eric Gustin <eric@arcade-ai.com>
2025-02-05 15:22:08 -08:00

413 lines
13 KiB
Python

from unittest.mock import Mock
import pytest
from arcade.sdk import tool
from arcade.sdk.eval import (
BinaryCritic,
EvalRubric,
ExpectedToolCall,
NamedExpectedToolCall,
NoneCritic,
SimilarityCritic,
)
from arcade.sdk.eval.eval import EvalCase, EvalSuite, EvaluationResult
# Stub tool with a single string parameter. It only needs to exist so the
# suite tests below can reference it as the target of expected tool calls.
# Documented with comments rather than a docstring so the function object
# handed to @tool is unchanged.
@tool
def mock_tool(param1: str):
    return None
# Stub tool that takes no parameters. It is not referenced by the tests in
# this part of the file; the "MockToolNoArgs" cases further down refer to a
# tool by name only.
@tool
def mock_tool_no_args():
    return None
# Stub tool with four string parameters: two required, two with defaults.
@tool
def mock_tool_multiple_args(
    param1: str,
    param2: str,
    param3: str = "value3",
    param4: str = "value4",
):
    return None
# Test EvaluationResult score accumulation
def test_evaluation_result_accumulation():
    """
    Test that EvaluationResult combines per-field results into a weighted score.

    Two field results are recorded with weights 1.0 and 0.5. After
    compute_final_score(total_weight) the score is expected to be the weighted
    sum of the field scores divided by the total weight.
    """
    evaluation = EvaluationResult()
    evaluation.add(
        field="field1",
        result={"match": True, "score": 0.8},
        weight=1.0,
        expected="expected_value",
        actual="actual_value",
    )
    evaluation.add(
        field="field2",
        result={"match": False, "score": 0.0},
        weight=0.5,
        expected="expected_value",
        actual="actual_value",
    )

    total_weight = 1.5
    expected_score = (0.8 * 1.0 + 0.0 * 0.5) / total_weight

    evaluation.compute_final_score(total_weight)

    # The score is a float quotient; compare with a tolerance so the test does
    # not depend on the exact order of floating-point operations used inside
    # EvaluationResult.
    assert evaluation.score == pytest.approx(expected_score)
# Test EvalCase.evaluate()
def test_eval_case_evaluate():
    """
    Check the overall score produced by EvalCase.evaluate() when both expected
    tools are selected but only one of the two argument values is correct, and
    check the pass verdict derived from the rubric's fail threshold.
    """
    rubric = EvalRubric(fail_threshold=0.75, warn_threshold=0.9, tool_selection_weight=1.0)

    eval_case = EvalCase(
        name="TestCase",
        system_message="System message",
        user_message="User message",
        expected_tool_calls=[
            NamedExpectedToolCall(name="ToolA", args={"param": "value1"}),
            NamedExpectedToolCall(name="ToolB", args={"param": "value2"}),
        ],
        critics=[BinaryCritic(critic_field="param", weight=1.0)],
        rubric=rubric,
    )

    # Same tools in the same order; only ToolB's argument value is wrong.
    tool_calls_made = [
        ("ToolA", {"param": "value1"}),
        ("ToolB", {"param": "wrong_value"}),
    ]
    outcome = eval_case.evaluate(tool_calls_made)

    # Expected breakdown:
    #   tool selection : 2 correct tools * weight 1.0        = 2.0
    #   critic, ToolA  : values match                        = 1.0
    #   critic, ToolB  : values differ                       = 0.0
    #   total weight   : selection (2.0) + critics (2.0)     = 4.0
    #   final score    : (2.0 + 1.0 + 0.0) / 4.0             = 0.75
    assert outcome.score == 0.75
    # The score sits exactly on fail_threshold, so this also pins the
    # threshold comparison as inclusive.
    assert outcome.passed is True
# Test EvalCase with mismatched tool calls
def test_eval_case_evaluate_mismatched_tools():
    """
    Check that calling a different tool than the expected one yields a zero
    score and a failed result, even though the argument values are identical.
    """
    eval_case = EvalCase(
        name="TestCase",
        system_message="",
        user_message="",
        expected_tool_calls=[NamedExpectedToolCall(name="ToolA", args={"param": "value"})],
        critics=[BinaryCritic(critic_field="param", weight=1.0)],
        rubric=EvalRubric(tool_selection_weight=1.0),
    )

    # "ToolB" was called where "ToolA" was expected.
    outcome = eval_case.evaluate([("ToolB", {"param": "value"})])

    # Tool selection contributes 0.0 because the names differ, and the critic
    # is not evaluated once tool selection has failed, so nothing adds to the
    # total.
    assert outcome.score == 0.0
    assert outcome.passed is False
# Test EvalCase with multiple critics and weights
def test_eval_case_multiple_critics():
    """
    Test EvalCase's evaluate method with multiple critics having different weights
    to ensure individual critic scores are correctly combined into the total score.

    The tool name matches, param1 matches exactly (BinaryCritic, weight 0.6) and
    param2 differs (SimilarityCritic, weight 0.4, similarity_threshold 0.8).
    """
    expected_tool_calls = [
        NamedExpectedToolCall(name="ToolA", args={"param1": "value1", "param2": "value2"}),
    ]
    actual_tool_calls = [
        ("ToolA", {"param1": "value1", "param2": "wrong_value"}),
    ]
    critics = [
        BinaryCritic(critic_field="param1", weight=0.6),
        SimilarityCritic(critic_field="param2", weight=0.4, similarity_threshold=0.8),
    ]
    case = EvalCase(
        name="TestCase",
        system_message="",
        user_message="",
        expected_tool_calls=expected_tool_calls,
        critics=critics,
        rubric=EvalRubric(fail_threshold=0.7),
    )

    result = case.evaluate(actual_tool_calls)

    # Tool selection score: 1.0
    #   NOTE(review): assumes the rubric's default tool_selection_weight is 1.0 - confirm.
    # Critic scores:
    # - param1: match (score 0.6)
    # - param2: likely not match (score ~0.0)
    # Total score: (1.0 + 0.6 + 0.0) / (1.0 + 0.6 + 0.4) = 1.6 / 2.0 = 0.8
    #
    # approx() wraps the *expected* value so the 1% relative tolerance is
    # anchored to 0.8 rather than to whatever score was actually produced.
    assert result.score == pytest.approx(0.8, rel=0.01)
    assert result.passed
# Test EvalCase with None as both the expected and the actual argument value
def test_eval_case_with_none_values():
    """
    Check that a critic treats an expected value of None and an actual value of
    None as a match, giving the case a full score.
    """
    eval_case = EvalCase(
        name="TestCase",
        system_message="",
        user_message="",
        expected_tool_calls=[NamedExpectedToolCall(name="ToolA", args={"param": None})],
        critics=[BinaryCritic(critic_field="param", weight=1.0)],
        rubric=EvalRubric(tool_selection_weight=1.0),
    )

    outcome = eval_case.evaluate([("ToolA", {"param": None})])

    # Full score: tool selection (1.0) + critic match (1.0) over a total
    # weight of 2.0, i.e. 2.0 / 2.0.
    assert outcome.score == 1.0
# Test EvalSuite.add_case()
def test_eval_suite_add_case():
    """
    Check that add_case appends one case to the suite, carrying the suite's
    system message, and that expected tool calls given either as
    ExpectedToolCall objects or as (func, args) tuples are stored as
    NamedExpectedToolCall entries named by the catalog.
    """
    # The catalog resolves every function to the same fully qualified name.
    catalog = Mock()
    resolved_tool = catalog.find_tool_by_func.return_value
    resolved_tool.get_fully_qualified_name.return_value = "MockTool"

    suite = EvalSuite(name="TestSuite", system_message="System message", catalog=catalog)

    suite.add_case(
        name="TestCase",
        user_message="User message",
        expected_tool_calls=[
            # Object form
            ExpectedToolCall(func=mock_tool, args={"param1": "value"}),
            # Tuple form
            (mock_tool, {"param1": "value"}),
        ],
    )

    assert len(suite.cases) == 1
    added = suite.cases[0]
    assert len(added.expected_tool_calls) == 2

    assert added.name == "TestCase"
    assert added.user_message == "User message"
    assert added.system_message == "System message"

    # Both input forms must be normalised to the same named call.
    named_call = NamedExpectedToolCall(name="MockTool", args={"param1": "value"})
    assert added.expected_tool_calls[0] == named_call
    assert added.expected_tool_calls[1] == named_call
# Test EvalSuite.extend_case()
def test_eval_suite_extend_case():
    """
    Check that extend_case, called after add_case, adds a second case to the
    suite with its own name and user message, the suite's system message, and
    the expected tool calls stored as NamedExpectedToolCall entries.
    """
    # The catalog resolves every function to the same fully qualified name.
    catalog = Mock()
    resolved_tool = catalog.find_tool_by_func.return_value
    resolved_tool.get_fully_qualified_name.return_value = "MockTool"

    suite = EvalSuite(name="TestSuite", system_message="System message", catalog=catalog)

    # One call in object form and one in tuple form, reused for both cases.
    tool_calls = [
        ExpectedToolCall(func=mock_tool, args={"param1": "value"}),
        (mock_tool, {"param1": "value"}),
    ]

    suite.add_case(
        name="InitialCase",
        user_message="Initial user message",
        expected_tool_calls=tool_calls,
    )
    suite.extend_case(
        name="ExtendedCase",
        user_message="Extended user message",
        expected_tool_calls=tool_calls,
    )

    assert len(suite.cases) == 2
    first_case = suite.cases[0]
    second_case = suite.cases[1]

    # The original case keeps its name; the extension is a separate entry.
    assert first_case.name == "InitialCase"

    assert second_case.name == "ExtendedCase"
    assert second_case.user_message == "Extended user message"
    assert second_case.system_message == "System message"
    assert len(second_case.expected_tool_calls) == 2

    named_call = NamedExpectedToolCall(name="MockTool", args={"param1": "value"})
    assert second_case.expected_tool_calls[0] == named_call
    assert second_case.expected_tool_calls[1] == named_call
def test_eval_suite_validate_critics_raises_value_error():
    """
    Check that _validate_critics raises ValueError when two critics are
    assigned to the same field.
    """
    suite = EvalSuite(name="TestSuite", system_message="System message", catalog=Mock())

    # Two different critic types, both targeting the field "param".
    duplicate_field_critics = [
        BinaryCritic(critic_field="param", weight=0.5),
        SimilarityCritic(critic_field="param", weight=0.5),
    ]

    with pytest.raises(ValueError):
        suite._validate_critics(duplicate_field_critics, "TestCase")
def test_eval_suite_validate_critics_no_error():
    """
    Check that _validate_critics accepts a critic list in which no field is
    repeated; the test passes as long as no exception escapes the call.
    """
    suite = EvalSuite(name="TestSuite", system_message="System message", catalog=Mock())

    single_critic = [BinaryCritic(critic_field="param1", weight=0.5)]

    suite._validate_critics(single_critic, "TestCase")
@pytest.mark.parametrize(
    "expected_tool_calls, critics, expected_critics_count, expected_critics_types",
    [
        # Case 1: the tool call has no arguments, so no critics are produced.
        (
            [NamedExpectedToolCall(name="MockToolNoArgs", args={})],
            None,
            0,
            [],
        ),
        # Case 2: one argument and no explicit critics -> a single NoneCritic.
        (
            [NamedExpectedToolCall(name="MockTool", args={"param1": "value"})],
            None,
            1,
            [(NoneCritic, "param1")],
        ),
        # Case 3: four arguments, two with explicit critics -> the explicit
        # BinaryCritics come first, then NoneCritics for the remaining fields.
        (
            [
                NamedExpectedToolCall(
                    name="MockToolMultipleArgs",
                    args={
                        "param1": "value1",
                        "param2": "value2",
                        "param3": "value3",
                        "param4": "value4",
                    },
                )
            ],
            [
                BinaryCritic(critic_field="param1", weight=0.5),
                BinaryCritic(critic_field="param2", weight=0.5),
            ],
            4,
            [
                (BinaryCritic, "param1"),
                (BinaryCritic, "param2"),
                (NoneCritic, "param3"),
                (NoneCritic, "param4"),
            ],
        ),
        # Case 4: several tool calls whose arguments overlap ("param1" appears
        # in two of them). Three fields have explicit critics; only "param4"
        # receives a NoneCritic, for a total of four critics.
        (
            [
                NamedExpectedToolCall(name="MockTool", args={"param1": "value"}),
                NamedExpectedToolCall(name="MockToolNoArgs", args={}),
                NamedExpectedToolCall(
                    name="MockToolMultipleArgs",
                    args={
                        "param1": "value1",
                        "param2": "value2",
                        "param3": "value3",
                        "param4": "value4",
                    },
                ),
            ],
            [
                BinaryCritic(critic_field="param1", weight=0.3),
                BinaryCritic(critic_field="param2", weight=0.3),
                BinaryCritic(critic_field="param3", weight=0.3),
            ],
            4,
            [
                (BinaryCritic, "param1"),
                (BinaryCritic, "param2"),
                (BinaryCritic, "param3"),
                (NoneCritic, "param4"),
            ],
        ),
    ],
)
def test_eval_suite_add_none_critics(
    expected_tool_calls, critics, expected_critics_count, expected_critics_types
):
    """
    Check that _add_none_critics returns the expected number of critics and
    that each position holds the expected critic type for the expected field.
    """
    suite = EvalSuite(name="TestSuite", system_message="System message", catalog=Mock())

    completed_critics = suite._add_none_critics(expected_tool_calls, critics)

    assert len(completed_critics) == expected_critics_count

    # Order matters: each expectation is compared with the critic at the same index.
    for position, (critic_cls, field_name) in enumerate(expected_critics_types):
        critic = completed_critics[position]
        assert isinstance(critic, critic_cls)
        assert critic.critic_field == field_name