# MCP Server Tool Evaluation Support
## Overview
Add support for evaluating tools from remote MCP servers without
requiring Python callables. Enables direct evaluation of any
MCP-compatible tool server.
## What's New
### Core Features
- **`MCPToolRegistry`**: Evaluate tools from a single MCP server
- **`CompositeMCPRegistry`**: Evaluate tools from multiple MCP servers
simultaneously
- **Automatic loaders**: `load_from_stdio()` and `load_from_http()` to
fetch tools from running servers
- **Automatic namespacing**: Tools prefixed with server name (e.g.,
`server_tool_name`)
- **Smart name resolution**: Use short names if unique, full names if
ambiguous
- **OpenAI strict mode**: Automatic schema conversion prevents parameter
hallucinations
### Usage
**Automatic Loading:**
```python
from arcade_evals import load_from_stdio, MCPToolRegistry
# Load tools automatically from MCP server
tools = load_from_stdio(["npx", "-y", "@modelcontextprotocol/server-github"])
registry = MCPToolRegistry(tools)
```
**Single MCP Server:**
```python
from arcade_evals import MCPToolRegistry, ExpectedToolCall
registry = MCPToolRegistry(mcp_tools)
suite = EvalSuite(catalog=registry)
suite.add_case(
expected_tool_calls=[
ExpectedToolCall(tool_name="tool_name", args={...})
]
)
```
**Multiple MCP Servers:**
```python
from arcade_evals import CompositeMCPRegistry, load_from_stdio
# Load from multiple servers
github_tools = load_from_stdio(["npx", "-y", "@modelcontextprotocol/server-github"])
slack_tools = load_from_stdio(["npx", "-y", "@modelcontextprotocol/server-slack"])
composite = CompositeMCPRegistry(
tool_lists={
"github": github_tools,
"slack": slack_tools,
}
)
suite = EvalSuite(catalog=composite)
suite.add_case(
expected_tool_calls=[
ExpectedToolCall(tool_name="github_list_issues", args={...})
]
)
```
## Implementation
### Files Changed
- **`libs/arcade-evals/arcade_evals/registry.py`** (NEW): Registry
abstractions and implementations
- **`libs/arcade-evals/arcade_evals/loaders.py`** (NEW): Automatic tool
loading from MCP servers
- **`libs/arcade-evals/arcade_evals/eval.py`** (MODIFIED): Enhanced
`ExpectedToolCall` and evaluation logic
- **`libs/arcade-evals/arcade_evals/__init__.py`** (MODIFIED): Exported
new registries and loaders
### Key Technical Details
- Added `BaseToolRegistry` interface for abstraction
- `MCPToolRegistry` handles single server tools
- `CompositeMCPRegistry` manages multiple servers with collision
detection
- `load_from_stdio()` and `load_from_http()` for automatic tool
discovery
- Fixed name normalization bug: MCP tools use underscores (not dots)
- Optimized tool copying: 2.5x faster via shallow copy
## Testing
- ✅ 41 tests passing (25 new tests added)
- ✅ `test_eval_mcp_registry.py`: MCPToolRegistry functionality
- ✅ `test_eval_composite_mcp.py`: CompositeMCPRegistry with multiple
servers
- ✅ Verified backward compatibility with Python tools
## Backward Compatibility
✅ **100% backward compatible** - No breaking changes
## Breaking Changes
**None**
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> Adds end-to-end eval UX: examples, a robust CLI runner, and rich
outputs.
>
> - **New examples**: `eval_arcade_gateway.py`,
`eval_stdio_mcp_server.py`, `eval_http_mcp_server.py`,
`eval_comprehensive_comparison.py` with timeouts, error handling, and
track-based comparisons; detailed `README.md`
> - **CLI runner**: `arcade_cli/evals_runner.py` to execute
evals/capture in parallel with progress, error isolation, failed-only
filtering, context inclusion, and multi-provider/model support
> - **Output formatters**: `arcade_cli/formatters/` (txt, md, html,
json) for evals and capture; comparative and multi-model HTML with tabs
and context rendering
> - **Display refactor**: `display.py` now supports writing multiple
formats, failed-only disclaimers, include-context, and improved console
summaries
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
ff8acf9c34a6b61462a019a1ee9df081006517d0. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
---------
Co-authored-by: Francisco Liberal <francisco@arcade.dev>
Co-authored-by: Mateo Torres <torresmateo@gmail.com>
846 lines
29 KiB
Python
846 lines
29 KiB
Python
import pytest
|
|
from arcade_evals import (
|
|
BinaryCritic,
|
|
EvalRubric,
|
|
ExpectedMCPToolCall,
|
|
ExpectedToolCall,
|
|
NamedExpectedToolCall,
|
|
NoneCritic,
|
|
SimilarityCritic,
|
|
)
|
|
from arcade_evals.eval import EvalCase, EvalSuite, EvaluationResult
|
|
from arcade_tdk import tool
|
|
|
|
# Mark all tests in this module as requiring evals dependencies
|
|
pytestmark = pytest.mark.evals
|
|
|
|
|
|
@tool
|
|
def mock_tool(param1: str):
|
|
pass
|
|
|
|
|
|
@tool
|
|
def mock_tool_no_args():
|
|
pass
|
|
|
|
|
|
@tool
|
|
def mock_tool_multiple_args(
|
|
param1: str, param2: str, param3: str = "value3", param4: str = "value4"
|
|
):
|
|
pass
|
|
|
|
|
|
# Test EvaluationResult accumulation and pass/fail logic
|
|
def test_evaluation_result_accumulation():
|
|
"""
|
|
Test that EvaluationResult correctly accumulates scores and determines
|
|
pass/fail status based on thresholds.
|
|
"""
|
|
evaluation = EvaluationResult()
|
|
evaluation.add(
|
|
field="field1",
|
|
result={"match": True, "score": 0.8},
|
|
weight=1.0,
|
|
expected="expected_value",
|
|
actual="actual_value",
|
|
)
|
|
evaluation.add(
|
|
field="field2",
|
|
result={"match": False, "score": 0.0},
|
|
weight=0.5,
|
|
expected="expected_value",
|
|
actual="actual_value",
|
|
)
|
|
total_weight = 1.5
|
|
expected_score = (0.8 * 1.0 + 0.0 * 0.5) / total_weight
|
|
evaluation.compute_final_score(total_weight)
|
|
assert evaluation.score == expected_score
|
|
|
|
|
|
class TestEvaluationResultProperties:
|
|
"""Tests for EvaluationResult.passed, .fail, and .warn properties."""
|
|
|
|
def test_passed_true_no_warning(self):
|
|
"""Test .passed=True, .fail=False, .warn=False for passing evaluation."""
|
|
evaluation = EvaluationResult()
|
|
evaluation.passed = True
|
|
evaluation.warning = False
|
|
|
|
assert evaluation.passed is True
|
|
assert evaluation.fail is False
|
|
assert evaluation.warn is False
|
|
|
|
def test_passed_true_with_warning(self):
|
|
"""Test .passed=True, .fail=False, .warn=True for passing with warning."""
|
|
evaluation = EvaluationResult()
|
|
evaluation.passed = True
|
|
evaluation.warning = True
|
|
|
|
assert evaluation.passed is True
|
|
assert evaluation.fail is False
|
|
assert evaluation.warn is True
|
|
|
|
def test_not_passed_with_warning(self):
|
|
"""Test that warning=True does NOT classify as fail."""
|
|
evaluation = EvaluationResult()
|
|
evaluation.passed = False
|
|
evaluation.warning = True
|
|
|
|
# This is the key case: warning should NOT be a fail
|
|
assert evaluation.passed is False
|
|
assert evaluation.fail is False # Not passed but warning, so not a fail
|
|
assert evaluation.warn is True
|
|
|
|
def test_not_passed_no_warning_is_fail(self):
|
|
"""Test .passed=False, .warning=False means it's a real fail."""
|
|
evaluation = EvaluationResult()
|
|
evaluation.passed = False
|
|
evaluation.warning = False
|
|
|
|
assert evaluation.passed is False
|
|
assert evaluation.fail is True
|
|
assert evaluation.warn is False
|
|
|
|
def test_fail_property_distinguishes_warnings_from_failures(self):
|
|
"""Test that the fail property correctly excludes warnings from failures."""
|
|
# Case 1: Passed - not a fail
|
|
passed_eval = EvaluationResult()
|
|
passed_eval.passed = True
|
|
passed_eval.warning = False
|
|
assert passed_eval.fail is False
|
|
|
|
# Case 2: Warning (not passed but warning set) - not a fail
|
|
warning_eval = EvaluationResult()
|
|
warning_eval.passed = False
|
|
warning_eval.warning = True
|
|
assert warning_eval.fail is False
|
|
|
|
# Case 3: Actual failure (not passed and not warning) - is a fail
|
|
failed_eval = EvaluationResult()
|
|
failed_eval.passed = False
|
|
failed_eval.warning = False
|
|
assert failed_eval.fail is True
|
|
|
|
|
|
# Test EvalCase.evaluate()
|
|
def test_eval_case_evaluate():
|
|
"""
|
|
Test EvalCase's evaluate method to ensure it calculates the overall score
|
|
correctly based on tool selection and critics, and applies the rubric's
|
|
thresholds to determine pass/fail/warning status.
|
|
"""
|
|
# Define expected tool calls and actual tool calls
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value1"}),
|
|
NamedExpectedToolCall(name="ToolB", args={"param": "value2"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolA", {"param": "value1"}),
|
|
("ToolB", {"param": "wrong_value"}),
|
|
]
|
|
|
|
# Define critics
|
|
critics = [
|
|
BinaryCritic(critic_field="param", weight=1.0),
|
|
]
|
|
|
|
# Create EvalCase with a rubric
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="System message",
|
|
user_message="User message",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=critics,
|
|
rubric=EvalRubric(fail_threshold=0.75, warn_threshold=0.9, tool_selection_weight=1.0),
|
|
)
|
|
|
|
# Evaluate the case
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Expected calculations:
|
|
# - Tool selection score should be 2 * 1.0 = 2.0 (both tools are correct)
|
|
# - First critic score: match (1.0)
|
|
# - Second critic score: no match (0.0)
|
|
# - Total critic score: 1.0 + 0.0 = 1.0
|
|
# - Total weight: tool selection (2.0) + critics (2.0) = 4.0
|
|
# - Total score: (2.0 + 1.0) / 4.0 = 0.75
|
|
|
|
assert result.score == 0.75
|
|
assert result.passed is True
|
|
|
|
|
|
# Test EvalCase with mismatched tool calls
|
|
def test_eval_case_evaluate_mismatched_tools():
|
|
"""
|
|
Test EvalCase's evaluate method when the actual tool calls do not match
|
|
the expected tool calls to ensure tool selection scoring is correct.
|
|
"""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolB", {"param": "value"}),
|
|
]
|
|
|
|
critics = [BinaryCritic(critic_field="param", weight=1.0)]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=critics,
|
|
rubric=EvalRubric(tool_selection_weight=1.0),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Tool selection score should be 0.0 since the tools don't match
|
|
# Critic is not evaluated since the tool selection failed
|
|
# Total score: 0.0
|
|
|
|
assert result.score == 0.0
|
|
assert result.passed is False
|
|
|
|
|
|
# Test EvalCase with multiple critics and weights
|
|
def test_eval_case_multiple_critics():
|
|
"""
|
|
Test EvalCase's evaluate method with multiple critics having different weights
|
|
to ensure individual critic scores are correctly combined into the total score.
|
|
"""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param1": "value1", "param2": "value2"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolA", {"param1": "value1", "param2": "wrong_value"}),
|
|
]
|
|
|
|
critics = [
|
|
BinaryCritic(critic_field="param1", weight=0.6),
|
|
SimilarityCritic(critic_field="param2", weight=0.4, similarity_threshold=0.8),
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=critics,
|
|
rubric=EvalRubric(fail_threshold=0.7),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Tool selection score: 1.0
|
|
# Critic scores:
|
|
# - param1: match (score 0.6)
|
|
# - param2: likely not match (score ~0.0)
|
|
# Total score: (1.0 + 0.6 + 0.0) / (1.0 + 0.6 + 0.4) = 1.6 / 2.0 = 0.8
|
|
|
|
assert pytest.approx(result.score, 0.01) == 0.8
|
|
assert result.passed
|
|
|
|
|
|
# Test EvalCase with missing expected and actual values in args
|
|
def test_eval_case_with_none_values():
|
|
"""
|
|
Test that when expected or actual values are None, the critic evaluates them appropriately.
|
|
"""
|
|
expected_args = {"param": None}
|
|
actual_args = {"param": None}
|
|
|
|
expected_tool_calls = [NamedExpectedToolCall(name="ToolA", args=expected_args)]
|
|
actual_tool_calls = [("ToolA", actual_args)]
|
|
|
|
critics = [BinaryCritic(critic_field="param", weight=1.0)]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=critics,
|
|
rubric=EvalRubric(tool_selection_weight=1.0),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Both values are None, so the critic should return a match
|
|
assert result.score == 2.0 / 2.0 # Full score (tool selection + critic score)
|
|
|
|
|
|
# Test EvalSuite.add_case()
|
|
def test_eval_suite_add_case():
|
|
"""
|
|
Test that add_case correctly adds a new evaluation case to the suite.
|
|
"""
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
expected_tool_calls = [
|
|
ExpectedMCPToolCall(tool_name="MockTool", args={"param1": "value"}),
|
|
ExpectedMCPToolCall(tool_name="MockTool", args={"param1": "value"}),
|
|
]
|
|
|
|
suite.add_case(
|
|
name="TestCase",
|
|
user_message="User message",
|
|
expected_tool_calls=expected_tool_calls,
|
|
)
|
|
|
|
assert len(suite.cases) == 1
|
|
case = suite.cases[0]
|
|
assert len(case.expected_tool_calls) == 2
|
|
assert case.name == "TestCase"
|
|
assert case.user_message == "User message"
|
|
assert case.system_message == "System message"
|
|
assert case.expected_tool_calls[0] == NamedExpectedToolCall(
|
|
name="MockTool", args={"param1": "value"}
|
|
)
|
|
assert case.expected_tool_calls[1] == NamedExpectedToolCall(
|
|
name="MockTool", args={"param1": "value"}
|
|
)
|
|
|
|
|
|
# Test EvalSuite.extend_case()
|
|
def test_eval_suite_extend_case():
|
|
"""
|
|
Test that extend_case correctly extends the last added case with new information.
|
|
"""
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
expected_tool_calls = [
|
|
ExpectedMCPToolCall(tool_name="MockTool", args={"param1": "value"}),
|
|
ExpectedMCPToolCall(tool_name="MockTool", args={"param1": "value"}),
|
|
]
|
|
|
|
suite.add_case(
|
|
name="InitialCase",
|
|
user_message="Initial user message",
|
|
expected_tool_calls=expected_tool_calls,
|
|
)
|
|
|
|
suite.extend_case(
|
|
name="ExtendedCase",
|
|
user_message="Extended user message",
|
|
expected_tool_calls=expected_tool_calls,
|
|
)
|
|
|
|
assert len(suite.cases) == 2
|
|
initial_case = suite.cases[0]
|
|
extended_case = suite.cases[1]
|
|
|
|
assert initial_case.name == "InitialCase"
|
|
assert extended_case.name == "ExtendedCase"
|
|
assert extended_case.user_message == "Extended user message"
|
|
assert extended_case.system_message == "System message"
|
|
assert len(extended_case.expected_tool_calls) == 2
|
|
assert extended_case.expected_tool_calls[0] == NamedExpectedToolCall(
|
|
name="MockTool", args={"param1": "value"}
|
|
)
|
|
assert extended_case.expected_tool_calls[1] == NamedExpectedToolCall(
|
|
name="MockTool", args={"param1": "value"}
|
|
)
|
|
|
|
|
|
def test_eval_suite_validate_critics_raises_value_error():
|
|
"""
|
|
Test that validate_critics raises a ValueError if multiple critics are detected for the same field.
|
|
"""
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
case_name = "TestCase"
|
|
critics = [
|
|
BinaryCritic(critic_field="param", weight=0.5),
|
|
SimilarityCritic(critic_field="param", weight=0.5),
|
|
]
|
|
with pytest.raises(ValueError):
|
|
suite._validate_critics(critics, case_name)
|
|
|
|
|
|
def test_eval_suite_validate_critics_no_error():
|
|
"""
|
|
Test that validate_critics does not raise an error when critics are valid.
|
|
"""
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
case_name = "TestCase"
|
|
critics = [
|
|
BinaryCritic(critic_field="param1", weight=0.5),
|
|
]
|
|
|
|
suite._validate_critics(critics, case_name)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"expected_tool_calls, critics, expected_critics_count, expected_critics_types",
|
|
[
|
|
(
|
|
# Test case 1: No arguments, expect no critics
|
|
[NamedExpectedToolCall(name="MockToolNoArgs", args={})],
|
|
None,
|
|
0,
|
|
[],
|
|
),
|
|
(
|
|
# Test case 2: Single argument, expect one NoneCritic
|
|
[NamedExpectedToolCall(name="MockTool", args={"param1": "value"})],
|
|
None,
|
|
1,
|
|
[(NoneCritic, "param1")],
|
|
),
|
|
(
|
|
# Test case 3: Multiple arguments with some critics, expect BinaryCritics for specified fields and NoneCritics for others
|
|
[
|
|
NamedExpectedToolCall(
|
|
name="MockToolMultipleArgs",
|
|
args={
|
|
"param1": "value1",
|
|
"param2": "value2",
|
|
"param3": "value3",
|
|
"param4": "value4",
|
|
},
|
|
)
|
|
],
|
|
[
|
|
BinaryCritic(critic_field="param1", weight=0.5),
|
|
BinaryCritic(critic_field="param2", weight=0.5),
|
|
],
|
|
4,
|
|
[
|
|
(BinaryCritic, "param1"),
|
|
(BinaryCritic, "param2"),
|
|
(NoneCritic, "param3"),
|
|
(NoneCritic, "param4"),
|
|
],
|
|
),
|
|
(
|
|
# Test case 4: Mixed tool calls with multiple critics, expect BinaryCritics for specified fields and NoneCritics for others
|
|
[
|
|
NamedExpectedToolCall(name="MockTool", args={"param1": "value"}),
|
|
NamedExpectedToolCall(name="MockToolNoArgs", args={}),
|
|
NamedExpectedToolCall(
|
|
name="MockToolMultipleArgs",
|
|
args={
|
|
"param1": "value1",
|
|
"param2": "value2",
|
|
"param3": "value3",
|
|
"param4": "value4",
|
|
},
|
|
),
|
|
],
|
|
[
|
|
BinaryCritic(critic_field="param1", weight=0.3),
|
|
BinaryCritic(critic_field="param2", weight=0.3),
|
|
BinaryCritic(critic_field="param3", weight=0.3),
|
|
],
|
|
4,
|
|
[
|
|
(BinaryCritic, "param1"),
|
|
(BinaryCritic, "param2"),
|
|
(BinaryCritic, "param3"),
|
|
(NoneCritic, "param4"),
|
|
],
|
|
),
|
|
],
|
|
)
|
|
def test_eval_suite_add_none_critics(
|
|
expected_tool_calls, critics, expected_critics_count, expected_critics_types
|
|
):
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
critics_with_none = suite._add_none_critics(expected_tool_calls, critics)
|
|
assert len(critics_with_none) == expected_critics_count
|
|
for i, (expected_type, expected_field) in enumerate(expected_critics_types):
|
|
assert isinstance(critics_with_none[i], expected_type)
|
|
assert critics_with_none[i].critic_field == expected_field
|
|
|
|
|
|
# =============================================================================
|
|
# Tests for ExpectedToolCall and ExpectedMCPToolCall classes
|
|
# =============================================================================
|
|
|
|
|
|
class TestExpectedToolCall:
|
|
"""Tests for the ExpectedToolCall dataclass (Python tools)."""
|
|
|
|
def test_keyword_args(self):
|
|
"""Test creating with keyword arguments."""
|
|
tc = ExpectedToolCall(func=mock_tool, args={"param1": "value"})
|
|
assert tc.func == mock_tool
|
|
assert tc.args == {"param1": "value"}
|
|
|
|
def test_positional_args(self):
|
|
"""Test creating with positional arguments (restored feature)."""
|
|
tc = ExpectedToolCall(mock_tool, {"param1": "value"})
|
|
assert tc.func == mock_tool
|
|
assert tc.args == {"param1": "value"}
|
|
|
|
def test_default_empty_args(self):
|
|
"""Test that args defaults to empty dict."""
|
|
tc = ExpectedToolCall(func=mock_tool)
|
|
assert tc.func == mock_tool
|
|
assert tc.args == {}
|
|
|
|
def test_func_is_required(self):
|
|
"""Test that func is required (no default)."""
|
|
with pytest.raises(TypeError):
|
|
ExpectedToolCall() # type: ignore[call-arg]
|
|
|
|
def test_func_with_positional_only(self):
|
|
"""Test creating with just func as positional."""
|
|
tc = ExpectedToolCall(mock_tool)
|
|
assert tc.func == mock_tool
|
|
assert tc.args == {}
|
|
|
|
|
|
class TestExpectedMCPToolCall:
|
|
"""Tests for the ExpectedMCPToolCall dataclass (MCP tools)."""
|
|
|
|
def test_keyword_args(self):
|
|
"""Test creating with keyword arguments."""
|
|
tc = ExpectedMCPToolCall(tool_name="Calculator_Add", args={"a": 5, "b": 3})
|
|
assert tc.tool_name == "Calculator_Add"
|
|
assert tc.args == {"a": 5, "b": 3}
|
|
|
|
def test_positional_args(self):
|
|
"""Test creating with positional arguments."""
|
|
tc = ExpectedMCPToolCall("Calculator_Add", {"a": 5, "b": 3})
|
|
assert tc.tool_name == "Calculator_Add"
|
|
assert tc.args == {"a": 5, "b": 3}
|
|
|
|
def test_default_empty_args(self):
|
|
"""Test that args defaults to empty dict."""
|
|
tc = ExpectedMCPToolCall(tool_name="Calculator_Add")
|
|
assert tc.tool_name == "Calculator_Add"
|
|
assert tc.args == {}
|
|
|
|
def test_tool_name_is_required(self):
|
|
"""Test that tool_name is required (no default)."""
|
|
with pytest.raises(TypeError):
|
|
ExpectedMCPToolCall() # type: ignore[call-arg]
|
|
|
|
def test_tool_name_with_positional_only(self):
|
|
"""Test creating with just tool_name as positional."""
|
|
tc = ExpectedMCPToolCall("MyTool")
|
|
assert tc.tool_name == "MyTool"
|
|
assert tc.args == {}
|
|
|
|
|
|
class TestAnyExpectedToolCall:
|
|
"""Tests for mixed usage of ExpectedToolCall and ExpectedMCPToolCall."""
|
|
|
|
def test_import_any_expected_tool_call(self):
|
|
"""Test that AnyExpectedToolCall can be imported."""
|
|
from arcade_evals import AnyExpectedToolCall
|
|
|
|
# Type alias should work with both types
|
|
python_tc: AnyExpectedToolCall = ExpectedToolCall(func=mock_tool, args={"param1": "v"})
|
|
mcp_tc: AnyExpectedToolCall = ExpectedMCPToolCall(tool_name="MyTool", args={"a": 1})
|
|
assert python_tc is not None
|
|
assert mcp_tc is not None
|
|
|
|
def test_mixed_list(self):
|
|
"""Test that mixed lists work correctly."""
|
|
from arcade_evals import AnyExpectedToolCall
|
|
|
|
mixed_list: list[AnyExpectedToolCall] = [
|
|
ExpectedToolCall(func=mock_tool, args={"param1": "value"}),
|
|
ExpectedMCPToolCall(tool_name="RemoteTool", args={"x": 1}),
|
|
]
|
|
assert len(mixed_list) == 2
|
|
assert isinstance(mixed_list[0], ExpectedToolCall)
|
|
assert isinstance(mixed_list[1], ExpectedMCPToolCall)
|
|
|
|
def test_isinstance_checks(self):
|
|
"""Test that isinstance works correctly for type narrowing."""
|
|
from arcade_evals import AnyExpectedToolCall
|
|
|
|
tc: AnyExpectedToolCall = ExpectedToolCall(func=mock_tool, args={})
|
|
|
|
if isinstance(tc, ExpectedToolCall):
|
|
assert tc.func == mock_tool
|
|
elif isinstance(tc, ExpectedMCPToolCall):
|
|
pytest.fail("Should not reach here")
|
|
|
|
|
|
class TestExpectedToolCallConversion:
|
|
"""Tests for conversion of ExpectedToolCall types to NamedExpectedToolCall."""
|
|
|
|
def test_add_case_with_expected_mcp_tool_call(self):
|
|
"""Test add_case correctly converts ExpectedMCPToolCall."""
|
|
suite = EvalSuite(name="TestSuite", system_message="System message")
|
|
|
|
suite.add_case(
|
|
name="MCP Test",
|
|
user_message="Test message",
|
|
expected_tool_calls=[
|
|
ExpectedMCPToolCall(tool_name="RemoteTool", args={"x": 1}),
|
|
],
|
|
)
|
|
|
|
assert len(suite.cases) == 1
|
|
assert suite.cases[0].expected_tool_calls[0].name == "RemoteTool"
|
|
assert suite.cases[0].expected_tool_calls[0].args == {"x": 1}
|
|
|
|
def test_add_case_with_mixed_tool_calls(self):
|
|
"""Test add_case with both Python and MCP tools in same case."""
|
|
from typing import Annotated
|
|
|
|
from arcade_core import ToolCatalog
|
|
|
|
@tool
|
|
def annotated_tool(value: Annotated[str, "The input value"]) -> str:
|
|
"""A tool with proper annotations."""
|
|
return value
|
|
|
|
catalog = ToolCatalog()
|
|
catalog.add_tool(annotated_tool, "Test")
|
|
|
|
suite = EvalSuite(
|
|
name="MixedSuite",
|
|
system_message="System message",
|
|
catalog=catalog,
|
|
)
|
|
|
|
suite.add_case(
|
|
name="Mixed Test",
|
|
user_message="Test message",
|
|
expected_tool_calls=[
|
|
ExpectedToolCall(func=annotated_tool, args={"value": "test"}),
|
|
ExpectedMCPToolCall(tool_name="RemoteTool", args={"x": 1}),
|
|
],
|
|
)
|
|
|
|
assert len(suite.cases) == 1
|
|
case = suite.cases[0]
|
|
assert len(case.expected_tool_calls) == 2
|
|
# First is Python tool - name should be toolkit_name + tool_name (PascalCase)
|
|
assert case.expected_tool_calls[0].name == "Test_AnnotatedTool"
|
|
# Second is MCP tool - name should be as-is
|
|
assert case.expected_tool_calls[1].name == "RemoteTool"
|
|
|
|
|
|
class TestToolSelectionFailure:
|
|
"""Tests for tool selection failure scenarios and partial matching."""
|
|
|
|
def test_tool_mismatch_with_fail_on_tool_selection_true(self):
|
|
"""Test that tool mismatch fails immediately when fail_on_tool_selection=True (default)."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolB", {"param": "value"}), # Wrong tool
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(fail_on_tool_selection=True),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
assert result.score == 0.0
|
|
assert result.passed is False
|
|
assert result.failure_reason is not None
|
|
assert "Tool selection mismatch" in result.failure_reason
|
|
|
|
def test_tool_mismatch_with_fail_on_tool_selection_false_partial_scoring(self):
|
|
"""Test that tool mismatch allows partial scoring when fail_on_tool_selection=False."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolB", {"param": "value"}), # Wrong tool but correct param
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(
|
|
fail_on_tool_selection=False,
|
|
tool_selection_weight=1.0,
|
|
fail_threshold=0.3,
|
|
),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Tool selection: 0.0 (wrong tool)
|
|
# Critic (param match): 1.0
|
|
# Total: 1.0 / 2.0 = 0.5
|
|
assert result.score == pytest.approx(0.5)
|
|
assert result.failure_reason is None # No early failure
|
|
assert result.passed is True # 0.5 >= 0.3 threshold
|
|
|
|
|
|
class TestToolCallQuantityFailure:
|
|
"""Tests for tool call quantity mismatch scenarios."""
|
|
|
|
def test_more_tool_calls_than_expected_fails(self):
|
|
"""Test that calling the right tool more times than expected fails by default."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolA", {"param": "value"}),
|
|
("ToolA", {"param": "value2"}), # Extra call
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(fail_on_tool_call_quantity=True),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
assert result.score == 0.0
|
|
assert result.passed is False
|
|
assert result.failure_reason is not None
|
|
assert "Expected 1 tool call(s), but got 2" in result.failure_reason
|
|
|
|
def test_fewer_tool_calls_than_expected_fails(self):
|
|
"""Test that calling fewer tools than expected fails by default."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value1"}),
|
|
NamedExpectedToolCall(name="ToolB", args={"param": "value2"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolA", {"param": "value1"}), # Only one call
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(fail_on_tool_call_quantity=True),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
assert result.score == 0.0
|
|
assert result.passed is False
|
|
assert result.failure_reason is not None
|
|
assert "Expected 2 tool call(s), but got 1" in result.failure_reason
|
|
|
|
def test_quantity_mismatch_with_fail_on_quantity_false(self):
|
|
"""Test partial scoring when fail_on_tool_call_quantity=False."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [
|
|
("ToolA", {"param": "value"}),
|
|
("ToolA", {"param": "extra"}), # Extra call - should be ignored
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(
|
|
fail_on_tool_call_quantity=False,
|
|
fail_on_tool_selection=False,
|
|
tool_selection_weight=1.0,
|
|
),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Should not fail early - evaluation continues
|
|
assert result.failure_reason is None
|
|
# Score depends on matching logic (Hungarian algorithm matches best pairs)
|
|
assert result.score > 0.0
|
|
|
|
def test_right_tool_called_multiple_times_partial_score(self):
|
|
"""Test calling the right tool multiple times with quantity check disabled."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="Calculator_Add", args={"a": 5, "b": 3}),
|
|
]
|
|
actual_tool_calls = [
|
|
("Calculator_Add", {"a": 5, "b": 3}), # Correct call
|
|
("Calculator_Add", {"a": 10, "b": 20}), # Extra call with different args
|
|
]
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[
|
|
BinaryCritic(critic_field="a", weight=0.5),
|
|
BinaryCritic(critic_field="b", weight=0.5),
|
|
],
|
|
rubric=EvalRubric(
|
|
fail_on_tool_call_quantity=False,
|
|
fail_on_tool_selection=False,
|
|
tool_selection_weight=1.0,
|
|
fail_threshold=0.5,
|
|
),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
# Should not fail immediately
|
|
assert result.failure_reason is None
|
|
# The Hungarian algorithm will match expected[0] with the best actual call
|
|
# First actual call matches perfectly: tool(1.0) + a(0.5) + b(0.5) = 2.0
|
|
assert result.score > 0.0
|
|
|
|
def test_no_tool_calls_when_one_expected_fails(self):
|
|
"""Test that zero tool calls when some expected fails by default."""
|
|
expected_tool_calls = [
|
|
NamedExpectedToolCall(name="ToolA", args={"param": "value"}),
|
|
]
|
|
actual_tool_calls = [] # No calls
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[BinaryCritic(critic_field="param", weight=1.0)],
|
|
rubric=EvalRubric(fail_on_tool_call_quantity=True),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
assert result.score == 0.0
|
|
assert result.passed is False
|
|
assert "Expected 1 tool call(s), but got 0" in result.failure_reason
|
|
|
|
def test_both_empty_passes(self):
|
|
"""Test that no expected and no actual tool calls results in pass."""
|
|
expected_tool_calls = []
|
|
actual_tool_calls = []
|
|
|
|
case = EvalCase(
|
|
name="TestCase",
|
|
system_message="",
|
|
user_message="",
|
|
expected_tool_calls=expected_tool_calls,
|
|
critics=[],
|
|
rubric=EvalRubric(),
|
|
)
|
|
|
|
result = case.evaluate(actual_tool_calls)
|
|
|
|
assert result.score == 1.0
|
|
assert result.passed is True
|
|
assert result.failure_reason is None
|