arcade-mcp/libs/tests/sdk/test_eval_capture.py
jottakka fe8ddfd500
[TOO-326] Windows papercuts (#768)
<!-- CURSOR_SUMMARY -->
> [!NOTE]
> **Medium Risk**
> Touches authentication/login flow, credentials-file permissions, and
subprocess lifecycle behavior across platforms; while mostly defensive,
regressions could impact login or process management on Windows/macOS
runners.
> 
> **Overview**
> Improves Windows/cross-platform reliability across the CLI and MCP
server: OAuth login now binds the callback server to `127.0.0.1`, avoids
slow loopback reverse-DNS, adds a configurable callback timeout
(`--timeout` + env default), and opens URLs via a Windows-friendly
`_open_browser` to avoid flashing console windows.
> 
> Centralizes CLI output via a shared `console` that forces UTF-8 on
Windows, standardizes UTF-8 file reads/writes throughout, tightens
credentials-file permissions on Windows using `icacls`, and adds shared
Windows subprocess helpers for **no-window** process creation and
graceful termination (used by `deploy`, MCP reload, and usage-tracking
worker).
> 
> Updates client configuration UX/robustness (Windows AppData resolution
via `platformdirs`, Cursor config path fallbacks + compatibility writes,
overwrite warnings, absolute `uv` path for GUI clients, safer path
display) and improves `deploy` child-process handling to avoid
pipe-buffer deadlocks while giving better debug-aware error messages.
> 
> Expands CI to run tests on Linux/Windows/macOS, adds a no-auth CLI
integration workflow, disables usage tracking in toolkits CI, and adds
extensive regression tests for Windows signals, subprocess cleanup,
UTF-8, and config-path edge cases; bumps `arcade-core` to `4.4.2` and
`arcade-mcp-server` to `1.17.2` (with updated dependency pin).
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
0fabd8ca1cd647039ba6ddbdf3f7809c330bab9e. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2026-02-25 13:18:16 -03:00

893 lines
31 KiB
Python

"""
Tests for EvalSuite capture mode functionality.
Capture mode allows running evaluations without scoring - it simply records
the tool calls made by the model for debugging or generating expected calls.
"""
import json
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from arcade_evals import (
CapturedCase,
CapturedToolCall,
CaptureResult,
EvalSuite,
)
# Module-wide pytest marker: every test collected from this file carries the
# `evals` mark, flagging it as requiring the evals dependencies.
pytestmark = pytest.mark.evals
# --- CapturedToolCall Tests ---
class TestCapturedToolCall:
    """Construction and ``to_dict`` behaviour of ``CapturedToolCall``."""

    def test_create_with_name_and_args(self):
        """Explicit name and args are stored unchanged."""
        supplied_args = {"location": "London"}
        call = CapturedToolCall(name="Weather_GetCurrent", args=supplied_args)

        assert call.name == "Weather_GetCurrent"
        assert call.args == {"location": "London"}

    def test_create_with_name_only(self):
        """Omitting args yields an empty dict."""
        call = CapturedToolCall(name="Weather_GetCurrent")

        assert call.name == "Weather_GetCurrent"
        assert call.args == {}

    def test_to_dict(self):
        """Serialization exposes both ``name`` and ``args`` keys."""
        serialized = CapturedToolCall(name="MyTool", args={"key": "value"}).to_dict()

        assert serialized == {"name": "MyTool", "args": {"key": "value"}}

    def test_to_dict_empty_args(self):
        """Serialization of a call without args reports an empty ``args`` dict."""
        serialized = CapturedToolCall(name="MyTool").to_dict()

        assert serialized == {"name": "MyTool", "args": {}}
# --- CapturedCase Tests ---
class TestCapturedCase:
    """Construction and ``to_dict`` behaviour of ``CapturedCase``."""

    # --- private builders shared by the tests below ---

    @staticmethod
    def _case_with_messages(messages):
        """Return a case with no tool calls whose context holds ``messages``."""
        return CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[],
            system_message="Sys",
            additional_messages=messages,
        )

    @staticmethod
    def _case_with_full_context():
        """Return a case with one tool call, a system message and one extra message."""
        return CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1", args={"x": 1})],
            system_message="System message",
            additional_messages=[{"role": "user", "content": "msg"}],
        )

    @staticmethod
    def _assistant_tool_call(call_id, tool_name, raw_arguments):
        """Return an assistant message carrying a single function tool call."""
        return {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {
                        "name": tool_name,
                        "arguments": raw_arguments,
                    },
                }
            ],
        }

    # --- construction ---

    def test_create_basic(self):
        """Only the required fields are given; context fields default to None."""
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1")],
        )

        assert captured.case_name == "test_case"
        assert captured.user_message == "Hello"
        assert len(captured.tool_calls) == 1
        assert captured.system_message is None
        assert captured.additional_messages is None

    def test_create_with_context(self):
        """System message and additional messages are stored when supplied."""
        history = [{"role": "assistant", "content": "Hi"}]
        captured = CapturedCase(
            case_name="test_case",
            user_message="Hello",
            tool_calls=[CapturedToolCall(name="Tool1")],
            system_message="You are an assistant",
            additional_messages=history,
        )

        assert captured.system_message == "You are an assistant"
        assert captured.additional_messages == [{"role": "assistant", "content": "Hi"}]

    # --- to_dict: context inclusion ---

    def test_to_dict_without_context(self):
        """With ``include_context=False`` the context keys are left out."""
        serialized = self._case_with_full_context().to_dict(include_context=False)

        expected = {
            "case_name": "test_case",
            "user_message": "Hello",
            "tool_calls": [{"name": "Tool1", "args": {"x": 1}}],
        }
        assert serialized == expected
        # Spelled out explicitly: neither context key may leak through.
        for context_key in ("system_message", "additional_messages"):
            assert context_key not in serialized

    def test_to_dict_with_context(self):
        """With ``include_context=True`` the context keys are emitted too."""
        serialized = self._case_with_full_context().to_dict(include_context=True)

        assert serialized == {
            "case_name": "test_case",
            "user_message": "Hello",
            "tool_calls": [{"name": "Tool1", "args": {"x": 1}}],
            "system_message": "System message",
            "additional_messages": [{"role": "user", "content": "msg"}],
        }

    def test_to_dict_with_context_null_messages(self):
        """A ``None`` message list serializes as an empty list."""
        serialized = self._case_with_messages(None).to_dict(include_context=True)

        assert serialized["additional_messages"] == []

    # --- to_dict: normalization of embedded JSON ---

    def test_to_dict_normalizes_json_string_arguments(self):
        """Tool-call arguments given as a JSON string come back as an object."""
        # OpenAI-style payload: `arguments` is a JSON-encoded string.
        history = [
            {"role": "user", "content": "List projects"},
            self._assistant_tool_call("call_123", "Linear_ListProjects", '{"state": "started"}'),
            {
                "role": "tool",
                "content": '{"projects": []}',
                "tool_call_id": "call_123",
            },
        ]

        serialized = self._case_with_messages(history).to_dict(include_context=True)

        function_payload = serialized["additional_messages"][1]["tool_calls"][0]["function"]
        assert function_payload["arguments"] == {"state": "started"}

    def test_to_dict_handles_invalid_json_arguments(self):
        """Arguments that are not valid JSON are passed through as the raw string."""
        history = [self._assistant_tool_call("call_123", "SomeTool", "not valid json {")]

        serialized = self._case_with_messages(history).to_dict(include_context=True)

        function_payload = serialized["additional_messages"][0]["tool_calls"][0]["function"]
        assert function_payload["arguments"] == "not valid json {"

    def test_to_dict_normalizes_tool_response_content(self):
        """JSON content of a tool-role message comes back as an object."""
        history = [
            {"role": "user", "content": "Get the initiative"},
            self._assistant_tool_call("call_get_init", "Linear_GetInitiative", '{"id": "init_123"}'),
            {
                "role": "tool",
                "content": '{"id": "init_123", "name": "Q1 Goals", "status": "Planned"}',
                "tool_call_id": "call_get_init",
                "name": "Linear_GetInitiative",
            },
        ]

        serialized = self._case_with_messages(history).to_dict(include_context=True)
        _, assistant_entry, tool_entry = serialized["additional_messages"]

        # The assistant's call arguments are parsed ...
        assert assistant_entry["tool_calls"][0]["function"]["arguments"] == {"id": "init_123"}
        # ... and so is the tool's JSON response body.
        assert tool_entry["content"] == {"id": "init_123", "name": "Q1 Goals", "status": "Planned"}

    def test_to_dict_keeps_non_json_tool_content_as_string(self):
        """Plain-text tool content is passed through untouched."""
        history = [
            {
                "role": "tool",
                "content": "Error: Tool not found",
                "tool_call_id": "call_123",
                "name": "SomeTool",
            },
        ]

        serialized = self._case_with_messages(history).to_dict(include_context=True)

        assert serialized["additional_messages"][0]["content"] == "Error: Tool not found"

    def test_empty_tool_calls(self):
        """A case without tool calls serializes an empty ``tool_calls`` list."""
        captured = CapturedCase(
            case_name="no_tools",
            user_message="Just chat",
            tool_calls=[],
        )

        assert captured.to_dict()["tool_calls"] == []
# --- CaptureResult Tests ---
class TestCaptureResult:
    """Construction, dict/JSON serialization and file output of ``CaptureResult``."""

    @staticmethod
    def _make_result(cases, suite_name="Suite"):
        """Return a ``CaptureResult`` for model ``gpt-4o`` / provider ``openai``."""
        return CaptureResult(
            suite_name=suite_name,
            model="gpt-4o",
            provider="openai",
            captured_cases=cases,
        )

    def test_create_basic(self):
        """Constructor arguments are exposed as attributes."""
        result = self._make_result(
            [
                CapturedCase(
                    case_name="case1",
                    user_message="Hello",
                    tool_calls=[CapturedToolCall(name="Tool1")],
                )
            ],
            suite_name="My Suite",
        )

        assert result.suite_name == "My Suite"
        assert result.model == "gpt-4o"
        assert result.provider == "openai"
        assert len(result.captured_cases) == 1

    def test_to_dict_without_context(self):
        """``include_context=False`` drops the system message from each case."""
        result = self._make_result([
            CapturedCase(
                case_name="case1",
                user_message="Hello",
                tool_calls=[CapturedToolCall(name="Tool1", args={"a": 1})],
                system_message="System",
            )
        ])

        d = result.to_dict(include_context=False)

        assert d["suite_name"] == "Suite"
        assert d["model"] == "gpt-4o"
        assert d["provider"] == "openai"
        assert len(d["captured_cases"]) == 1
        assert "system_message" not in d["captured_cases"][0]

    def test_to_dict_with_context(self):
        """``include_context=True`` keeps the system message on each case."""
        result = self._make_result([
            CapturedCase(
                case_name="case1",
                user_message="Hello",
                tool_calls=[],
                system_message="System",
                additional_messages=[],
            )
        ])

        d = result.to_dict(include_context=True)

        assert d["captured_cases"][0]["system_message"] == "System"

    def test_to_json(self):
        """``to_json`` output parses back to the suite and model names."""
        result = self._make_result([
            CapturedCase(
                case_name="case1",
                user_message="Hello",
                tool_calls=[CapturedToolCall(name="Tool1")],
            )
        ])

        parsed = json.loads(result.to_json(include_context=False))

        assert parsed["suite_name"] == "Suite"
        assert parsed["model"] == "gpt-4o"

    def test_to_json_with_indent(self):
        """``indent=4`` produces lines indented by exactly four spaces."""
        result = self._make_result([])

        json_str = result.to_json(indent=4)

        # A bare `" " in json_str` check is vacuous: pretty-printed JSON always
        # contains a space after each ":" separator, whatever the indent width.
        # Measure the leading spaces of every line instead and require that
        # some line sits at exactly four (the top-level keys, assuming to_json
        # emits standard json.dumps-style output).
        indent_widths = {
            len(line) - len(line.lstrip(" ")) for line in json_str.splitlines()
        }
        assert 4 in indent_widths

    def test_write_to_file(self):
        """``write_to_file`` creates a file holding valid JSON for the result."""
        result = self._make_result([
            CapturedCase(
                case_name="case1",
                user_message="Hello",
                tool_calls=[CapturedToolCall(name="Tool1", args={"x": 1})],
            )
        ])

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "capture_output.json"
            result.write_to_file(str(filepath))

            # The file must exist and round-trip through the JSON parser.
            assert filepath.exists()
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)

        assert data["suite_name"] == "Suite"
        assert len(data["captured_cases"]) == 1

    def test_write_to_file_with_context(self):
        """``include_context=True`` is honoured when writing to a file."""
        result = self._make_result([
            CapturedCase(
                case_name="case1",
                user_message="Hello",
                tool_calls=[],
                system_message="System",
                additional_messages=[{"role": "user", "content": "x"}],
            )
        ])

        with tempfile.TemporaryDirectory() as tmpdir:
            filepath = Path(tmpdir) / "capture_output.json"
            result.write_to_file(str(filepath), include_context=True)
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)

        assert data["captured_cases"][0]["system_message"] == "System"

    def test_empty_captured_cases(self):
        """A result without cases serializes an empty ``captured_cases`` list."""
        result = self._make_result([], suite_name="Empty Suite")

        d = result.to_dict()

        assert d["captured_cases"] == []
# --- Imports Test ---
class TestCaptureImports:
    """Smoke test for the public capture-mode names."""

    def test_import_from_arcade_evals(self):
        """The three capture classes can be imported from the package root."""
        from arcade_evals import CapturedCase, CapturedToolCall, CaptureResult

        for exported in (CapturedToolCall, CapturedCase, CaptureResult):
            assert exported is not None
# --- EvalSuite.capture() Tests ---
class TestEvalSuiteCapture:
    """Behaviour of ``EvalSuite.capture()`` against mocked OpenAI-style clients."""

    @pytest.fixture
    def mock_openai_client(self):
        """Async client whose completion contains one ``Weather_GetCurrent`` call."""
        # Assemble the response bottom-up: tool call -> choice -> response.
        tool_call = MagicMock()
        tool_call.function.name = "Weather_GetCurrent"
        tool_call.function.arguments = '{"location": "London"}'

        choice = MagicMock()
        choice.message.tool_calls = [tool_call]

        response = MagicMock()
        response.choices = [choice]

        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)
        return client

    @pytest.fixture
    def basic_suite(self):
        """Suite with a single weather tool and a single case."""
        suite = EvalSuite(
            name="Test Suite",
            system_message="You are a helpful assistant",
        )
        weather_tool = {"name": "Weather_GetCurrent", "description": "Get weather", "inputSchema": {}}
        suite.add_tool_definitions([weather_tool])
        # Capture mode records calls only, so no expectations are declared.
        suite.add_case(
            name="test_case",
            user_message="What's the weather in London?",
            expected_tool_calls=[],
        )
        return suite

    @pytest.mark.asyncio
    async def test_capture_returns_capture_result(self, basic_suite, mock_openai_client):
        """``capture()`` yields a ``CaptureResult`` echoing suite, model and provider."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert isinstance(outcome, CaptureResult)
        assert outcome.suite_name == "Test Suite"
        assert outcome.model == "gpt-4o"
        assert outcome.provider == "openai"

    @pytest.mark.asyncio
    async def test_capture_records_tool_calls(self, basic_suite, mock_openai_client):
        """The tool call returned by the client is recorded on the captured case."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        captured = outcome.captured_cases[0]
        assert captured.case_name == "test_case"
        assert len(captured.tool_calls) == 1
        recorded = captured.tool_calls[0]
        assert recorded.name == "Weather_GetCurrent"
        assert recorded.args == {"location": "London"}

    @pytest.mark.asyncio
    async def test_capture_without_context(self, basic_suite, mock_openai_client):
        """With ``include_context=False`` both context fields stay ``None``."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
            include_context=False,
        )

        captured = outcome.captured_cases[0]
        assert captured.system_message is None
        assert captured.additional_messages is None

    @pytest.mark.asyncio
    async def test_capture_with_context(self, basic_suite, mock_openai_client):
        """With ``include_context=True`` the suite's system message is captured."""
        outcome = await basic_suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
            include_context=True,
        )

        captured = outcome.captured_cases[0]
        assert captured.system_message == "You are a helpful assistant"
        assert captured.additional_messages is not None

    @pytest.mark.asyncio
    async def test_capture_requires_tools(self):
        """A suite without registered tools makes ``capture()`` raise ``ValueError``."""
        toolless_suite = EvalSuite(
            name="Empty Suite",
            system_message="Test",
        )
        toolless_suite.add_case(
            name="test_case",
            user_message="Hello",
            expected_tool_calls=[],
        )
        client = AsyncMock()

        with pytest.raises(ValueError, match="No tools registered"):
            await toolless_suite.capture(client, "gpt-4o", provider="openai")

    @pytest.mark.asyncio
    async def test_capture_multiple_cases(self, mock_openai_client):
        """Every case in the suite is captured, in the order it was added."""
        suite = EvalSuite(
            name="Multi Case Suite",
            system_message="You are an assistant",
        )
        suite.add_tool_definitions([
            {"name": "Tool1", "description": "Tool 1"},
            {"name": "Tool2", "description": "Tool 2"},
        ])
        suite.add_case(name="case1", user_message="Do thing 1", expected_tool_calls=[])
        suite.add_case(name="case2", user_message="Do thing 2", expected_tool_calls=[])

        outcome = await suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 2
        first, second = outcome.captured_cases
        assert first.case_name == "case1"
        assert second.case_name == "case2"

    @pytest.mark.asyncio
    async def test_capture_no_tool_calls(self):
        """A response whose ``tool_calls`` is ``None`` yields an empty call list."""
        suite = EvalSuite(
            name="No Calls Suite",
            system_message="Test",
        )
        suite.add_tool_definitions([{"name": "Tool1", "description": "Tool 1"}])
        suite.add_case(name="case1", user_message="Hello", expected_tool_calls=[])

        # Client whose single choice carries no tool calls at all.
        choice = MagicMock()
        choice.message.tool_calls = None
        response = MagicMock()
        response.choices = [choice]
        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        assert len(outcome.captured_cases[0].tool_calls) == 0

    @pytest.mark.asyncio
    async def test_capture_normalizes_tool_calls(self, mock_openai_client):
        """Captured calls use the registered tool name and include schema defaults."""
        suite = EvalSuite(
            name="Normalization Suite",
            system_message="Test",
        )
        # Tool whose `arg1` declares a default value in its schema.
        dotted_tool = {
            "name": "My.Tool",
            "description": "Tool with default",
            "inputSchema": {
                "type": "object",
                "properties": {
                    "arg1": {"type": "string", "default": "default_val"},
                    "arg2": {"type": "string"},
                },
            },
        }
        suite.add_tool_definitions([dotted_tool])
        suite.add_case(name="case1", user_message="Call it", expected_tool_calls=[])

        # The model answers with the underscored name and omits `arg1`.
        returned_call = MagicMock()
        returned_call.function.name = "My_Tool"
        returned_call.function.arguments = '{"arg2": "provided"}'
        choice = MagicMock()
        choice.message.tool_calls = [returned_call]
        response = MagicMock()
        response.choices = [choice]
        mock_openai_client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=mock_openai_client,
            model="gpt-4o",
            provider="openai",
        )

        recorded = outcome.captured_cases[0].tool_calls[0]
        # My_Tool is mapped back to the registered name My.Tool so captured
        # names line up with the names used in expected tool calls.
        assert recorded.name == "My.Tool"
        # The omitted argument is filled in from the schema default.
        assert recorded.args == {"arg1": "default_val", "arg2": "provided"}
# --- tool_eval decorator capture mode Tests ---
class TestToolEvalCaptureMode:
    """Capture-mode behaviour of functions wrapped by the ``tool_eval`` decorator."""

    # Dotted path of the OpenAI capture helper that both tests replace.
    _PATCH_TARGET = "arcade_evals.eval._capture_with_openai"

    @staticmethod
    def _one_case_suite():
        """Return a suite with one tool (``Tool1``) and one case (``case1``)."""
        suite = EvalSuite(
            name="Test Suite",
            system_message="Test",
        )
        suite.add_tool_definitions([{"name": "Tool1", "description": "D"}])
        suite.add_case(name="case1", user_message="Hello", expected_tool_calls=[])
        return suite

    @staticmethod
    def _empty_result():
        """Return the case-less ``CaptureResult`` handed back by the patched helper."""
        return CaptureResult(
            suite_name="Test",
            model="gpt-4o",
            provider="openai",
            captured_cases=[],
        )

    @pytest.mark.asyncio
    async def test_tool_eval_capture_mode_flag(self):
        """``capture_mode=True`` calls the capture helper once and returns its result."""
        from arcade_evals import tool_eval

        @tool_eval()
        def my_eval():
            return self._one_case_suite()

        # Replace the underlying capture function so no provider is contacted.
        with patch(self._PATCH_TARGET) as mock_capture:
            mock_capture.return_value = self._empty_result()

            results = await my_eval(
                provider_api_key="test-key",
                model="gpt-4o",
                capture_mode=True,
                include_context=False,
            )

            mock_capture.assert_called_once()
            assert len(results) == 1
            assert isinstance(results[0], CaptureResult)

    @pytest.mark.asyncio
    async def test_tool_eval_capture_mode_with_context(self):
        """``include_context=True`` is forwarded to the capture helper as a keyword."""
        from arcade_evals import tool_eval

        @tool_eval()
        def my_eval():
            return self._one_case_suite()

        with patch(self._PATCH_TARGET) as mock_capture:
            mock_capture.return_value = self._empty_result()

            await my_eval(
                provider_api_key="test-key",
                model="gpt-4o",
                capture_mode=True,
                include_context=True,
            )

            # Inspect the keyword arguments of the recorded helper call.
            forwarded = mock_capture.call_args
            assert forwarded.kwargs["include_context"] is True
# --- Multiple Tool Calls per Case Tests ---
class TestMultipleToolCalls:
    """Capture of several tool calls returned in a single model response."""

    @pytest.mark.asyncio
    async def test_capture_multiple_tool_calls(self):
        """Both tool calls of one response are recorded, in response order."""
        suite = EvalSuite(
            name="Multi Tool Suite",
            system_message="Test",
        )
        suite.add_tool_definitions([
            {"name": "Tool1", "description": "D1"},
            {"name": "Tool2", "description": "D2"},
        ])
        suite.add_case(name="case1", user_message="Do both", expected_tool_calls=[])

        # One mocked tool call per (name, raw JSON arguments) pair.
        returned_calls = []
        for tool_name, raw_arguments in (
            ("Tool1", '{"arg1": "val1"}'),
            ("Tool2", '{"arg2": "val2"}'),
        ):
            call = MagicMock()
            call.function.name = tool_name
            call.function.arguments = raw_arguments
            returned_calls.append(call)

        choice = MagicMock()
        choice.message.tool_calls = returned_calls
        response = MagicMock()
        response.choices = [choice]
        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="gpt-4o",
            provider="openai",
        )

        assert len(outcome.captured_cases) == 1
        recorded = outcome.captured_cases[0].tool_calls
        assert len(recorded) == 2
        first, second = recorded
        assert first.name == "Tool1"
        assert first.args == {"arg1": "val1"}
        assert second.name == "Tool2"
        assert second.args == {"arg2": "val2"}
class TestCaptureWithAnthropic:
    """Capture mode driven through the ``anthropic`` provider path."""

    @pytest.mark.asyncio
    async def test_capture_with_anthropic_provider(self):
        """A ``tool_use`` block is captured under the tool's registered dotted name."""
        suite = EvalSuite(
            name="Anthropic Capture Suite",
            system_message="Test system message",
        )
        suite.add_tool_definitions([
            {"name": "Google.Search", "description": "Search"},
        ])
        suite.add_case(
            name="test_case",
            user_message="Search for something",
            expected_tool_calls=[],
        )

        # Anthropic-style response: content is a list of blocks, here a single
        # tool_use block. `name` is assigned after construction on purpose:
        # passing name= to MagicMock() would name the mock itself instead.
        tool_use_block = MagicMock()
        tool_use_block.type = "tool_use"
        tool_use_block.name = "Google_Search"
        tool_use_block.input = {"query": "test"}

        response = MagicMock()
        response.content = [tool_use_block]

        client = AsyncMock()
        client.messages.create = AsyncMock(return_value=response)

        outcome = await suite.capture(
            client=client,
            model="claude-3-opus",
            provider="anthropic",
        )

        assert outcome.provider == "anthropic"
        assert len(outcome.captured_cases) == 1
        # The underscored Google_Search is resolved back to Google.Search.
        assert outcome.captured_cases[0].tool_calls[0].name == "Google.Search"
class TestCaptureHelperFunctions:
    """Checks on the ``_capture_with_openai`` / ``_capture_with_anthropic`` helpers."""

    @pytest.mark.asyncio
    async def test_capture_with_openai_helper(self):
        """``_capture_with_openai`` delegates to ``suite.capture`` with the right arguments."""
        from arcade_evals.capture import _capture_with_openai

        suite = EvalSuite(
            name="OpenAI Helper Test",
            system_message="Test",
        )
        suite.add_tool_definitions([{"name": "TestTool", "description": "A test tool"}])
        suite.add_case(name="case1", user_message="Test", expected_tool_calls=[])

        # suite.capture itself is stubbed, so no OpenAI client is exercised.
        canned_case = CapturedCase(
            case_name="case1",
            user_message="Test",
            tool_calls=[],
            system_message="Test",
            additional_messages=[],
        )
        canned_result = CaptureResult(
            suite_name="OpenAI Helper Test",
            provider="openai",
            model="gpt-4o",
            captured_cases=[canned_case],
        )

        with patch.object(suite, "capture", return_value=canned_result) as mock_capture:
            outcome = await _capture_with_openai(
                suite=suite,
                api_key="test-key",
                model="gpt-4o",
                include_context=True,
            )

            assert outcome.suite_name == "OpenAI Helper Test"
            assert outcome.provider == "openai"

            # Expected call shape: capture(client, model, provider=..., include_context=...)
            mock_capture.assert_called_once()
            recorded_call = mock_capture.call_args
            assert recorded_call.args[1] == "gpt-4o"  # second positional is the model
            assert recorded_call.kwargs.get("provider") == "openai"
            assert recorded_call.kwargs.get("include_context") is True

    def test_capture_with_anthropic_function_exists(self):
        """``_capture_with_anthropic`` is importable and accepts the expected parameters."""
        import inspect

        from arcade_evals.capture import _capture_with_anthropic

        # Only the presence of each parameter name in the signature is checked.
        accepted = list(inspect.signature(_capture_with_anthropic).parameters.keys())
        for expected_name in ("suite", "api_key", "model", "include_context"):
            assert expected_name in accepted