<!-- CURSOR_SUMMARY --> > [!NOTE] > **Medium Risk** > Touches authentication/login flow, credentials-file permissions, and subprocess lifecycle behavior across platforms; while mostly defensive, regressions could impact login or process management on Windows/macOS runners. > > **Overview** > Improves Windows/cross-platform reliability across the CLI and MCP server: OAuth login now binds the callback server to `127.0.0.1`, avoids slow loopback reverse-DNS, adds a configurable callback timeout (`--timeout` + env default), and opens URLs via a Windows-friendly `_open_browser` to avoid flashing console windows. > > Centralizes CLI output via a shared `console` that forces UTF-8 on Windows, standardizes UTF-8 file reads/writes throughout, tightens credentials-file permissions on Windows using `icacls`, and adds shared Windows subprocess helpers for **no-window** process creation and graceful termination (used by `deploy`, MCP reload, and usage-tracking worker). > > Updates client configuration UX/robustness (Windows AppData resolution via `platformdirs`, Cursor config path fallbacks + compatibility writes, overwrite warnings, absolute `uv` path for GUI clients, safer path display) and improves `deploy` child-process handling to avoid pipe-buffer deadlocks while giving better debug-aware error messages. > > Expands CI to run tests on Linux/Windows/macOS, adds a no-auth CLI integration workflow, disables usage tracking in toolkits CI, and adds extensive regression tests for Windows signals, subprocess cleanup, UTF-8, and config-path edge cases; bumps `arcade-core` to `4.4.2` and `arcade-mcp-server` to `1.17.2` (with updated dependency pin). > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 0fabd8ca1cd647039ba6ddbdf3f7809c330bab9e. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY -->
893 lines
31 KiB
Python
893 lines
31 KiB
Python
"""
Tests for EvalSuite capture mode functionality.

Capture mode allows running evaluations without scoring - it simply records
the tool calls made by the model for debugging or generating expected calls.
"""
|
|
|
|
import json
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
from arcade_evals import (
|
|
CapturedCase,
|
|
CapturedToolCall,
|
|
CaptureResult,
|
|
EvalSuite,
|
|
)
|
|
|
|
# Mark all tests in this module as requiring evals dependencies
|
|
pytestmark = pytest.mark.evals
|
|
|
|
# --- CapturedToolCall Tests ---
|
|
|
|
|
|
class TestCapturedToolCall:
    """Construction and serialization checks for CapturedToolCall."""

    def test_create_with_name_and_args(self):
        """An instance built with a name and args exposes both unchanged."""
        call = CapturedToolCall(name="Weather_GetCurrent", args={"location": "London"})

        assert call.name == "Weather_GetCurrent"
        assert call.args == {"location": "London"}

    def test_create_with_name_only(self):
        """Omitting args yields an empty args dict."""
        call = CapturedToolCall(name="Weather_GetCurrent")

        assert call.name == "Weather_GetCurrent"
        assert call.args == {}

    def test_to_dict(self):
        """to_dict() returns the name and args under their own keys."""
        serialized = CapturedToolCall(name="MyTool", args={"key": "value"}).to_dict()

        assert serialized == {"name": "MyTool", "args": {"key": "value"}}

    def test_to_dict_empty_args(self):
        """to_dict() reports an empty args dict when none were given."""
        serialized = CapturedToolCall(name="MyTool").to_dict()

        assert serialized == {"name": "MyTool", "args": {}}
|
|
|
|
|
|
# --- CapturedCase Tests ---
|
|
|
|
|
|
class TestCapturedCase:
    """Construction and to_dict() serialization checks for CapturedCase."""

    @staticmethod
    def _context_case(messages):
        """Build a CapturedCase with fixed fields and *messages* as additional_messages."""
        return CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[],
            system_message="Sys",
            additional_messages=messages,
        )

    def test_create_basic(self):
        """Only the required fields are given; context fields default to None."""
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1")],
        )

        assert captured.case_name == "test_case"
        assert captured.user_message == "Hello"
        assert len(captured.tool_calls) == 1
        assert captured.system_message is None
        assert captured.additional_messages is None

    def test_create_with_context(self):
        """System message and additional messages are stored when supplied."""
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1")],
            system_message="You are an assistant",
            additional_messages=[{"role": "assistant", "content": "Hi"}],
        )

        assert captured.system_message == "You are an assistant"
        assert captured.additional_messages == [{"role": "assistant", "content": "Hi"}]

    def test_to_dict_without_context(self):
        """With include_context=False the context keys are left out of the dict."""
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1", args={"x": 1})],
            system_message="System message",
            additional_messages=[{"role": "user", "content": "msg"}],
        )

        serialized = captured.to_dict(include_context=False)

        expected = {
            "case_name": "test_case",
            "user_message": "Hello",
            "tool_calls": [{"name": "Tool1", "args": {"x": 1}}],
        }
        assert serialized == expected
        # Neither context key may leak into the output.
        for context_key in ("system_message", "additional_messages"):
            assert context_key not in serialized

    def test_to_dict_with_context(self):
        """With include_context=True the context keys appear alongside the rest."""
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1", args={"x": 1})],
            system_message="System message",
            additional_messages=[{"role": "user", "content": "msg"}],
        )

        serialized = captured.to_dict(include_context=True)

        expected = {
            "case_name": "test_case",
            "user_message": "Hello",
            "tool_calls": [{"name": "Tool1", "args": {"x": 1}}],
            "system_message": "System message",
            "additional_messages": [{"role": "user", "content": "msg"}],
        }
        assert serialized == expected

    def test_to_dict_with_context_null_messages(self):
        """additional_messages=None is serialized as an empty list."""
        serialized = self._context_case(None).to_dict(include_context=True)

        assert serialized["additional_messages"] == []

    def test_to_dict_normalizes_json_string_arguments(self):
        """JSON-string tool-call arguments in additional_messages come back as objects."""
        # Mirrors OpenAI's wire format, where "arguments" is a JSON string.
        assistant_turn = {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_123",
                    "type": "function",
                    "function": {
                        "name": "Linear_ListProjects",
                        "arguments": '{"state": "started"}',  # JSON string
                    },
                }
            ],
        }
        tool_turn = {
            "role": "tool",
            "content": '{"projects": []}',
            "tool_call_id": "call_123",
        }
        messages = [
            {"role": "user", "content": "List projects"},
            assistant_turn,
            tool_turn,
        ]

        serialized = self._context_case(messages).to_dict(include_context=True)

        # The string must have been decoded into a dict.
        assistant_out = serialized["additional_messages"][1]
        assert assistant_out["tool_calls"][0]["function"]["arguments"] == {"state": "started"}

    def test_to_dict_handles_invalid_json_arguments(self):
        """Arguments that are not valid JSON are passed through as the raw string."""
        messages = [
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_123",
                        "type": "function",
                        "function": {
                            "name": "SomeTool",
                            "arguments": "not valid json {",  # Invalid JSON
                        },
                    }
                ],
            },
        ]

        serialized = self._context_case(messages).to_dict(include_context=True)

        # Undecodable input stays exactly as it was.
        assistant_out = serialized["additional_messages"][0]
        assert assistant_out["tool_calls"][0]["function"]["arguments"] == "not valid json {"

    def test_to_dict_normalizes_tool_response_content(self):
        """JSON content of a tool-role message is decoded into an object."""
        assistant_turn = {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_get_init",
                    "type": "function",
                    "function": {
                        "name": "Linear_GetInitiative",
                        "arguments": '{"id": "init_123"}',
                    },
                }
            ],
        }
        tool_turn = {
            "role": "tool",
            "content": '{"id": "init_123", "name": "Q1 Goals", "status": "Planned"}',
            "tool_call_id": "call_get_init",
            "name": "Linear_GetInitiative",
        }
        messages = [
            {"role": "user", "content": "Get the initiative"},
            assistant_turn,
            tool_turn,
        ]

        serialized = self._context_case(messages).to_dict(include_context=True)
        out_messages = serialized["additional_messages"]

        # Tool-call arguments are decoded.
        assert out_messages[1]["tool_calls"][0]["function"]["arguments"] == {"id": "init_123"}

        # Tool response content is decoded as well.
        assert out_messages[2]["content"] == {
            "id": "init_123",
            "name": "Q1 Goals",
            "status": "Planned",
        }

    def test_to_dict_keeps_non_json_tool_content_as_string(self):
        """Plain-text tool content is left as a string."""
        messages = [
            {
                "role": "tool",
                "content": "Error: Tool not found",  # Plain text, not JSON
                "tool_call_id": "call_123",
                "name": "SomeTool",
            },
        ]

        serialized = self._context_case(messages).to_dict(include_context=True)

        # Non-JSON text is not transformed.
        assert serialized["additional_messages"][0]["content"] == "Error: Tool not found"

    def test_empty_tool_calls(self):
        """A case without tool calls serializes an empty tool_calls list."""
        captured = CapturedCase(
            case_name="no_tools",
            user_message="Just chat",
            tool_calls=[],
        )

        assert captured.to_dict()["tool_calls"] == []
|
|
|
|
|
|
# --- CaptureResult Tests ---
|
|
|
|
|
|
class TestCaptureResult:
    """Tests for the CaptureResult dataclass: fields, to_dict, to_json, write_to_file."""

    def test_create_basic(self):
        """Constructor arguments are exposed as attributes."""
        result = CaptureResult(
            suite_name="My Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[CapturedToolCall(name="Tool1")],
                )
            ],
        )
        assert result.suite_name == "My Suite"
        assert result.model == "gpt-4o"
        assert result.provider == "openai"
        assert len(result.captured_cases) == 1

    def test_to_dict_without_context(self):
        """to_dict(include_context=False) keeps metadata and omits system_message per case."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[CapturedToolCall(name="Tool1", args={"a": 1})],
                    system_message="System",
                )
            ],
        )
        d = result.to_dict(include_context=False)
        assert d["suite_name"] == "Suite"
        assert d["model"] == "gpt-4o"
        assert d["provider"] == "openai"
        assert len(d["captured_cases"]) == 1
        assert "system_message" not in d["captured_cases"][0]

    def test_to_dict_with_context(self):
        """to_dict(include_context=True) carries each case's system_message."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[],
                    system_message="System",
                    additional_messages=[],
                )
            ],
        )
        d = result.to_dict(include_context=True)
        assert d["captured_cases"][0]["system_message"] == "System"

    def test_to_json(self):
        """to_json() output parses as JSON and contains the suite metadata."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[CapturedToolCall(name="Tool1")],
                )
            ],
        )
        json_str = result.to_json(include_context=False)
        parsed = json.loads(json_str)
        assert parsed["suite_name"] == "Suite"
        assert parsed["model"] == "gpt-4o"

    def test_to_json_with_indent(self):
        """to_json(indent=4) indents top-level keys by four spaces."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[],
        )
        json_str = result.to_json(indent=4)
        # A bare `" " in json_str` check passes for any JSON that uses ": " as
        # a separator, so it would not notice a missing indent. Require a
        # newline, exactly four spaces, then a quoted key instead.
        # NOTE(review): assumes to_json delegates to json.dumps(indent=...);
        # confirm against the implementation.
        assert '\n    "' in json_str

    def test_write_to_file(self):
        """write_to_file() creates a JSON file holding the suite name and cases."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[CapturedToolCall(name="Tool1", args={"x": 1})],
                )
            ],
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "capture_output.json"
            result.write_to_file(str(filepath))

            # Verify file was created and has valid content
            assert filepath.exists()
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            assert data["suite_name"] == "Suite"
            assert len(data["captured_cases"]) == 1

    def test_write_to_file_with_context(self):
        """write_to_file(include_context=True) persists each case's system_message."""
        result = CaptureResult(
            suite_name="Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[],
                    system_message="System",
                    additional_messages=[{"role": "user", "content": "x"}],
                )
            ],
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "capture_output.json"
            result.write_to_file(str(filepath), include_context=True)

            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
            assert data["captured_cases"][0]["system_message"] == "System"

    def test_empty_captured_cases(self):
        """A result with no cases serializes an empty captured_cases list."""
        result = CaptureResult(
            suite_name="Empty Suite",
            model="gpt-4o",
            provider="openai",
            captured_cases=[],
        )
        d = result.to_dict()
        assert d["captured_cases"] == []
|
|
|
|
|
|
# --- Imports Test ---
|
|
|
|
|
|
class TestCaptureImports:
    """Import-surface checks for the capture-mode classes."""

    def test_import_from_arcade_evals(self):
        """The three capture classes can be imported from the arcade_evals package."""
        from arcade_evals import CapturedCase, CapturedToolCall, CaptureResult

        # Same order as the original assertions: tool call, case, result.
        for exported in (CapturedToolCall, CapturedCase, CaptureResult):
            assert exported is not None
|
|
|
|
|
|
# --- EvalSuite.capture() Tests ---
|
|
|
|
|
|
class TestEvalSuiteCapture:
    """Behavioral tests for the EvalSuite.capture() coroutine."""

    @pytest.fixture
    def mock_openai_client(self):
        """Provide an async client whose completion returns one Weather_GetCurrent call."""
        tool_call = MagicMock()
        tool_call.function.name = "Weather_GetCurrent"
        tool_call.function.arguments = '{"location": "London"}'

        choice = MagicMock()
        choice.message.tool_calls = [tool_call]

        response = MagicMock()
        response.choices = [choice]

        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)
        return client

    @pytest.fixture
    def basic_suite(self):
        """Provide an EvalSuite holding one tool definition and one case."""
        suite = EvalSuite(
            name="Test Suite",
            system_message="You are a helpful assistant",
        )
        weather_tool = {
            "name": "Weather_GetCurrent",
            "description": "Get weather",
            "inputSchema": {},
        }
        suite.add_tool_definitions([weather_tool])
        suite.add_case(
            name="test_case",
            user_message="What's the weather in London?",
            expected_tool_calls=[],  # No expectations in capture mode
        )
        return suite

    @pytest.mark.asyncio
    async def test_capture_returns_capture_result(self, basic_suite, mock_openai_client):
        """capture() hands back a CaptureResult carrying suite name, model and provider."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert isinstance(outcome, CaptureResult)
        assert outcome.suite_name == "Test Suite"
        assert outcome.model == "gpt-4o"
        assert outcome.provider == "openai"

    @pytest.mark.asyncio
    async def test_capture_records_tool_calls(self, basic_suite, mock_openai_client):
        """The tool call returned by the mocked model shows up in the captured case."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        captured = outcome.captured_cases[0]
        assert captured.case_name == "test_case"
        assert len(captured.tool_calls) == 1

        recorded = captured.tool_calls[0]
        assert recorded.name == "Weather_GetCurrent"
        assert recorded.args == {"location": "London"}

    @pytest.mark.asyncio
    async def test_capture_without_context(self, basic_suite, mock_openai_client):
        """include_context=False leaves both context fields as None."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
            include_context=False,
        )

        captured = outcome.captured_cases[0]
        assert captured.system_message is None
        assert captured.additional_messages is None

    @pytest.mark.asyncio
    async def test_capture_with_context(self, basic_suite, mock_openai_client):
        """include_context=True populates the system message and additional messages."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
            include_context=True,
        )

        captured = outcome.captured_cases[0]
        assert captured.system_message == "You are a helpful assistant"
        assert captured.additional_messages is not None

    @pytest.mark.asyncio
    async def test_capture_requires_tools(self):
        """capture() raises ValueError when the suite has no tools registered."""
        toolless_suite = EvalSuite(
            name="Empty Suite",
            system_message="Test",
        )
        toolless_suite.add_case(
            name="test_case",
            user_message="Hello",
            expected_tool_calls=[],
        )
        client = AsyncMock()

        with pytest.raises(ValueError, match="No tools registered"):
            await toolless_suite.capture(client, "gpt-4o", provider="openai")

    @pytest.mark.asyncio
    async def test_capture_multiple_cases(self, mock_openai_client):
        """Each case added to the suite yields one captured case, in order."""
        suite = EvalSuite(
            name="Multi Case Suite",
            system_message="You are an assistant",
        )
        suite.add_tool_definitions([
            {"name": "Tool1", "description": "Tool 1"},
            {"name": "Tool2", "description": "Tool 2"},
        ])
        suite.add_case(name="case1", user_message="Do thing 1", expected_tool_calls=[])
        suite.add_case(name="case2", user_message="Do thing 2", expected_tool_calls=[])

        outcome = await suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 2
        assert outcome.captured_cases[0].case_name == "case1"
        assert outcome.captured_cases[1].case_name == "case2"

    @pytest.mark.asyncio
    async def test_capture_no_tool_calls(self):
        """A response with tool_calls=None produces a case with no tool calls."""
        suite = EvalSuite(
            name="No Calls Suite",
            system_message="Test",
        )
        suite.add_tool_definitions([{"name": "Tool1", "description": "Tool 1"}])
        suite.add_case(name="case1", user_message="Hello", expected_tool_calls=[])

        # Client whose response carries no tool calls at all.
        choice = MagicMock()
        choice.message.tool_calls = None
        response = MagicMock()
        response.choices = [choice]
        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        assert len(outcome.captured_cases[0].tool_calls) == 0

    @pytest.mark.asyncio
    async def test_capture_normalizes_tool_calls(self, mock_openai_client):
        """capture() maps the underscored name back and fills schema defaults."""
        suite = EvalSuite(
            name="Normalization Suite",
            system_message="Test",
        )
        # Tool whose schema declares a default for arg1.
        tool_definition = {
            "name": "My.Tool",
            "description": "Tool with default",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "arg1": {"type": "string", "default": "default_val"},
                    "arg2": {"type": "string"},
                },
            },
        }
        suite.add_tool_definitions([tool_definition])
        suite.add_case(name="case1", user_message="Call it", expected_tool_calls=[])

        # Response uses the underscored name and leaves out the defaulted arg.
        returned_call = MagicMock()
        returned_call.function.name = "My_Tool"  # Normalized name
        returned_call.function.arguments = '{"arg2": "provided"}'  # Missing arg1
        choice = MagicMock()
        choice.message.tool_calls = [returned_call]
        response = MagicMock()
        response.choices = [choice]

        mock_openai_client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        recorded = outcome.captured_cases[0].tool_calls[0]
        # Name is resolved to original format (My_Tool -> My.Tool),
        # keeping it consistent with expected tool names.
        assert recorded.name == "My.Tool"
        # Args should include the default value.
        assert recorded.args == {"arg1": "default_val", "arg2": "provided"}
|
|
|
|
|
|
# --- tool_eval decorator capture mode Tests ---
|
|
|
|
|
|
class TestToolEvalCaptureMode:
    """Checks that the tool_eval decorator routes capture-mode calls correctly."""

    @pytest.mark.asyncio
    async def test_tool_eval_capture_mode_flag(self):
        """With capture_mode=True the wrapper calls the capture helper once."""
        from arcade_evals import tool_eval

        @tool_eval()
        def my_eval():
            suite = EvalSuite(
                name="Test Suite",
                system_message="Test",
            )
            suite.add_tool_definitions([{"name": "Tool1", "description": "D"}])
            suite.add_case(name="case1", user_message="Hello", expected_tool_calls=[])
            return suite

        stub_result = CaptureResult(
            suite_name="Test",
            model="gpt-4o",
            provider="openai",
            captured_cases=[],
        )

        # Replace the underlying capture helper so no provider is contacted.
        with patch("arcade_evals.eval._capture_with_openai") as mock_capture:
            mock_capture.return_value = stub_result

            results = await my_eval(
                provider_api_key="test-key",
                model="gpt-4o",
                capture_mode=True,
                include_context=False,
            )

        mock_capture.assert_called_once()
        assert len(results) == 1
        assert isinstance(results[0], CaptureResult)

    @pytest.mark.asyncio
    async def test_tool_eval_capture_mode_with_context(self):
        """include_context=True is forwarded to the capture helper as a keyword."""
        from arcade_evals import tool_eval

        @tool_eval()
        def my_eval():
            suite = EvalSuite(
                name="Test Suite",
                system_message="Test",
            )
            suite.add_tool_definitions([{"name": "Tool1", "description": "D"}])
            suite.add_case(name="case1", user_message="Hello", expected_tool_calls=[])
            return suite

        stub_result = CaptureResult(
            suite_name="Test",
            model="gpt-4o",
            provider="openai",
            captured_cases=[],
        )

        with patch("arcade_evals.eval._capture_with_openai") as mock_capture:
            mock_capture.return_value = stub_result

            await my_eval(
                provider_api_key="test-key",
                model="gpt-4o",
                capture_mode=True,
                include_context=True,
            )

        # The flag must have reached the helper.
        forwarded = mock_capture.call_args
        assert forwarded.kwargs["include_context"] is True
|
|
|
|
|
|
# --- Multiple Tool Calls per Case Tests ---
|
|
|
|
|
|
class TestMultipleToolCalls:
    """Checks that several tool calls in one response are all captured."""

    @pytest.mark.asyncio
    async def test_capture_multiple_tool_calls(self):
        """Two tool calls in a single response are recorded in order."""
        suite = EvalSuite(
            name="Multi Tool Suite",
            system_message="Test",
        )
        suite.add_tool_definitions([
            {"name": "Tool1", "description": "D1"},
            {"name": "Tool2", "description": "D2"},
        ])
        suite.add_case(name="case1", user_message="Do both", expected_tool_calls=[])

        # Two distinct tool calls returned by the mocked model.
        first_call = MagicMock()
        first_call.function.name = "Tool1"
        first_call.function.arguments = '{"arg1": "val1"}'

        second_call = MagicMock()
        second_call.function.name = "Tool2"
        second_call.function.arguments = '{"arg2": "val2"}'

        choice = MagicMock()
        choice.message.tool_calls = [first_call, second_call]
        response = MagicMock()
        response.choices = [choice]

        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        recorded = outcome.captured_cases[0].tool_calls
        assert len(recorded) == 2
        assert recorded[0].name == "Tool1"
        assert recorded[0].args == {"arg1": "val1"}
        assert recorded[1].name == "Tool2"
        assert recorded[1].args == {"arg2": "val2"}
|
|
|
|
|
|
class TestCaptureWithAnthropic:
    """Capture-mode checks when the provider is Anthropic."""

    @pytest.mark.asyncio
    async def test_capture_with_anthropic_provider(self):
        """A tool_use block is captured and its name resolved to the dotted form."""
        suite = EvalSuite(
            name="Anthropic Capture Suite",
            system_message="Test system message",
        )
        suite.add_tool_definitions([
            {"name": "Google.Search", "description": "Search"},
        ])
        suite.add_case(
            name="test_case",
            user_message="Search for something",
            expected_tool_calls=[],
        )

        # Anthropic responses carry tool_use content blocks.
        tool_use_block = MagicMock()
        tool_use_block.type = "tool_use"
        tool_use_block.name = "Google_Search"  # Anthropic uses underscores
        tool_use_block.input = {"query": "test"}

        response = MagicMock()
        response.content = [tool_use_block]

        client = AsyncMock()
        client.messages.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="claude-3-opus",
            provider="anthropic",
        )

        assert outcome.provider == "anthropic"
        assert len(outcome.captured_cases) == 1
        # Google_Search should be resolved back to Google.Search.
        assert outcome.captured_cases[0].tool_calls[0].name == "Google.Search"
|
|
|
|
|
|
class TestCaptureHelperFunctions:
    """Checks for the _capture_with_openai and _capture_with_anthropic helpers."""

    @pytest.mark.asyncio
    async def test_capture_with_openai_helper(self):
        """_capture_with_openai forwards model, provider and include_context to suite.capture."""
        from arcade_evals.capture import _capture_with_openai

        suite = EvalSuite(
            name="OpenAI Helper Test",
            system_message="Test",
        )
        suite.add_tool_definitions([{"name": "TestTool", "description": "A test tool"}])
        suite.add_case(name="case1", user_message="Test", expected_tool_calls=[])

        # suite.capture is stubbed directly rather than mocking AsyncOpenAI.
        stub_case = CapturedCase(
            case_name="case1",
            user_message="Test",
            tool_calls=[],
            system_message="Test",
            additional_messages=[],
        )
        stub_result = CaptureResult(
            suite_name="OpenAI Helper Test",
            provider="openai",
            model="gpt-4o",
            captured_cases=[stub_case],
        )

        with patch.object(suite, "capture", return_value=stub_result) as mock_capture:
            outcome = await _capture_with_openai(
                suite=suite,
                api_key="test-key",
                model="gpt-4o",
                include_context=True,
            )

        assert outcome.suite_name == "OpenAI Helper Test"
        assert outcome.provider == "openai"

        # capture must have been invoked exactly once, as
        # (client, model, provider=..., include_context=...).
        mock_capture.assert_called_once()
        forwarded = mock_capture.call_args
        assert forwarded.args[1] == "gpt-4o"  # model
        assert forwarded.kwargs.get("provider") == "openai"
        assert forwarded.kwargs.get("include_context") is True

    def test_capture_with_anthropic_function_exists(self):
        """_capture_with_anthropic is importable and declares the expected parameters."""
        import inspect

        from arcade_evals.capture import _capture_with_anthropic

        # Only the signature is inspected; the helper is never called.
        param_names = list(inspect.signature(_capture_with_anthropic).parameters.keys())
        for required in ("suite", "api_key", "model", "include_context"):
            assert required in param_names
|