## Summary - Improve tool call error messages across 4 libraries (arcade-core, arcade-tdk, arcade-mcp-server, arcade-serve) so agents can self-correct and Datadog can facet on structured fields - Guard empty error messages, enrich input validation errors with field-level detail, fix `@tool` decorator fallback formatting, surface `additional_prompt_content` in MCP responses, and add structured log extras for Datadog - Addresses the 3 worst error patterns: generic "Error in tool input deserialization", bare `KeyError` values, and empty `FatalToolError` messages **Linear:** TOO-627 **Plan:** `docs/plans/2026-04-08-improve-error-messages-handoff.md` ## Tasks - [ ] Task 1: Guard empty error messages (arcade-core) - [ ] Task 2: Enrich input validation error messages (arcade-core) - [ ] Task 3: Improve `@tool` decorator error fallback (arcade-tdk) - [ ] Task 4: Fix MCP agent-facing error response (arcade-mcp-server) - [ ] Task 5: Add structured log extras in BaseWorker (arcade-serve) - [ ] Task 6: Add structured log extras in MCP server (arcade-mcp-server) ## Test plan - [ ] Each task has dedicated unit tests verifying the new behavior - [ ] `make test` passes after all tasks - [ ] `make check` (ruff + mypy) passes - [ ] Verify the 3 worst error patterns now produce actionable messages 🤖 Generated with [Claude Code](https://claude.com/claude-code) <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Medium Risk** > Touches cross-library error formatting and logging behavior used in production tool execution paths; while mostly additive/guardrails, it changes agent-visible messages and Datadog log facets, which could impact client expectations and alerting. > > **Overview** > Improves tool-call error handling across core/runtime, MCP transport, worker transport, and the TDK to make agent-visible failures more actionable while *reducing sensitive-data leakage*. 
> > In `arcade-core`, empty error messages now get placeholders, `ToolOutputFactory.fail*` defaults blank messages, and input validation errors are rewritten as field-level summaries that intentionally omit rejected values (avoiding Pydantic echo of secrets). The `@tool` fallback in `arcade-tdk` no longer surfaces `str(exception)` to agents; it returns exception *type-only* in `message` while preserving full detail in `developer_message`. > > Adds a shared `build_tool_error_log_extra` helper and updates `arcade-serve` + `arcade-mcp-server` to emit consistent structured WARNING logs (`error_*`, `tool_name`, optional toolkit/version) for Datadog, while MCP error responses now append `additional_prompt_content` and force `structuredContent=None` on failures per spec. Includes extensive new tests and bumps package versions (`arcade-core` 4.6.2, `arcade-tdk` 3.6.1, `arcade-mcp-server` 1.19.3, `arcade-serve` 3.2.3). > > <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit e5c7ebcaf56176cfbd8e6d1f2b6295352abd0ec0. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup> <!-- /CURSOR_SUMMARY --> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
333 lines
12 KiB
Python
333 lines
12 KiB
Python
from typing import Annotated
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
from arcade_core.errors import ToolDefinitionError
|
|
from arcade_core.schema import (
|
|
ToolCallRequest,
|
|
ToolCallResponse,
|
|
ToolContext,
|
|
ToolReference,
|
|
)
|
|
from arcade_serve.core.base import BaseWorker
|
|
from arcade_serve.core.common import RequestData, Router
|
|
from arcade_serve.core.components import (
|
|
CallToolComponent,
|
|
CatalogComponent,
|
|
HealthCheckComponent,
|
|
)
|
|
from arcade_tdk import tool
|
|
|
|
|
|
@tool()
def sample_tool(
    context: ToolContext, a: Annotated[int, "a"], b: Annotated[int, "b"]
) -> Annotated[int, "output"]:
    """Sample tool for testing."""
    # NOTE(review): signature and docstring are kept verbatim because the
    # ``@tool`` decorator presumably derives the tool definition from them.
    return a + b
|
|
|
|
|
|
# Define error tool at module level to avoid indentation issues with getsource
|
|
@tool()
def error_tool(context: ToolContext) -> int:
    """This tool always raises an error."""
    # Unconditional failure: the tests use this to exercise the worker's
    # error-handling and logging paths.
    raise ValueError("Something went wrong")
|
|
|
|
|
|
@pytest.fixture
def mock_router():
    """Provide a ``Router``-spec'd mock whose ``add_route`` calls are recorded."""
    router_double = MagicMock(spec=Router)
    # Replace add_route with a plain MagicMock so tests can inspect its calls.
    router_double.add_route = MagicMock()
    return router_double
|
|
|
|
|
|
@pytest.fixture
def base_worker(mock_router, monkeypatch):
    """Build a ``BaseWorker`` and register its routes on ``mock_router``."""
    # No explicit secret is passed below, so provide one through the
    # environment for the duration of the test.
    monkeypatch.setenv("ARCADE_WORKER_SECRET", "test_secret_env")
    instance = BaseWorker()
    # Route registration goes through the mock router so tests can inspect it.
    instance.register_routes(mock_router)
    return instance
|
|
|
|
|
|
@pytest.fixture
def base_worker_no_auth():
    """Provide a ``BaseWorker`` constructed with ``disable_auth=True``."""
    worker = BaseWorker(disable_auth=True)
    return worker
|
|
|
|
|
|
# --- BaseWorker Tests ---
|
|
|
|
|
|
def test_base_worker_init_with_secret():
    """A secret passed to the constructor is stored and auth stays enabled."""
    worker = BaseWorker(secret="explicit_secret")  # noqa: S106

    assert not worker.disable_auth
    assert worker.secret == "explicit_secret"  # noqa: S105
|
|
|
|
|
|
def test_base_worker_init_with_env_secret(monkeypatch):
    """With no explicit secret, the worker takes it from ``ARCADE_WORKER_SECRET``."""
    monkeypatch.setenv("ARCADE_WORKER_SECRET", "env_secret_value")

    worker = BaseWorker()

    assert not worker.disable_auth
    assert worker.secret == "env_secret_value"  # noqa: S105
|
|
|
|
|
|
def test_base_worker_init_no_secret_raises_error(monkeypatch):
    """Constructing a worker with no secret available raises ``ValueError``."""
    # raising=False keeps the removal a no-op when the variable is already unset.
    monkeypatch.delenv("ARCADE_WORKER_SECRET", raising=False)

    expected_failure = pytest.raises(ValueError, match="No secret provided for worker")
    with expected_failure:
        BaseWorker()
|
|
|
|
|
|
def test_base_worker_init_disable_auth():
    """Disabling auth leaves the worker with an empty secret."""
    worker = BaseWorker(disable_auth=True)

    assert worker.disable_auth
    assert worker.secret == ""
|
|
|
|
|
|
def test_register_tool(base_worker_no_auth):
    """Registering a tool grows the catalog by one entry with the expected names."""
    worker = base_worker_no_auth
    assert len(worker.catalog) == 0

    worker.register_tool(sample_tool, toolkit_name="test_kit")
    assert len(worker.catalog) == 1

    # The catalog entry is expected to expose ``SampleTool`` / ``TestKit``
    # for the ``sample_tool`` function registered under ``test_kit``.
    registered = worker.get_catalog()[0]
    assert registered.name == "SampleTool"
    assert registered.toolkit.name == "TestKit"
|
|
|
|
|
|
def test_get_catalog(base_worker_no_auth):
    """``get_catalog`` returns a list holding the single registered tool."""
    base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")

    entries = base_worker_no_auth.get_catalog()

    assert isinstance(entries, list)
    assert len(entries) == 1
    first_entry = entries[0]
    assert first_entry.name == "SampleTool"
|
|
|
|
|
|
def test_health_check(base_worker_no_auth):
    """The health payload reports ``ok`` and the tool count as a string."""
    base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")

    expected_payload = {"status": "ok", "tool_count": "1"}
    assert base_worker_no_auth.health_check() == expected_payload
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_success(base_worker_no_auth):
    """Calling a registered tool returns its value and echoes the execution id."""
    worker = base_worker_no_auth
    worker.register_tool(sample_tool, toolkit_name="test_kit")

    # The reference deliberately carries no version: register_tool does not
    # appear to set one, so a versionless lookup is what matches.
    request = ToolCallRequest(
        execution_id="test_exec_id",
        tool=ToolReference(toolkit="TestKit", name="SampleTool"),
        inputs={"a": 5, "b": 3},
    )

    response = await worker.call_tool(request)

    assert response.success is True
    result = response.output
    assert result.value == 8
    assert result.error is None
    assert response.execution_id == "test_exec_id"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_success_and_error_logs_use_same_tool_identifiers(
    base_worker_no_auth, caplog
):
    """Success and error log lines must use identical tool identifier strings
    so logs can be correlated with a single grep pattern.

    One successful and one failing call are made while capturing the
    ``arcade_serve.core.base`` logger at DEBUG; the matching log line for each
    call is then checked for the bare tool name and a ``version`` token.
    """
    import logging

    base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
    base_worker_no_auth.register_tool(error_tool, toolkit_name="test_kit")

    success_req = ToolCallRequest(
        execution_id="exec_consistency_ok",
        tool=ToolReference(toolkit="TestKit", name="SampleTool"),
        inputs={"a": 1, "b": 2},
    )
    error_req = ToolCallRequest(
        execution_id="exec_consistency_err",
        tool=ToolReference(toolkit="TestKit", name="ErrorTool"),
        inputs={},
    )

    with caplog.at_level(logging.DEBUG, logger="arcade_serve.core.base"):
        await base_worker_no_auth.call_tool(success_req)
        await base_worker_no_auth.call_tool(error_req)

    messages = [record.getMessage() for record in caplog.records]

    # Use a ``None`` default: a bare ``next()`` that finds nothing raises
    # StopIteration, which inside a coroutine surfaces as an opaque
    # RuntimeError rather than a readable test failure.
    success_line = next(
        (m for m in messages if "exec_consistency_ok" in m and "success" in m),
        None,
    )
    error_line = next(
        (m for m in messages if "exec_consistency_err" in m and "failed:" in m),
        None,
    )
    assert success_line is not None, f"no success log line captured; got: {messages}"
    assert error_line is not None, f"no failure log line captured; got: {messages}"

    # Both must use the bare tool name (".name"), NOT the full ``Toolkit.Tool`` fqname.
    assert "Tool SampleTool " in success_line
    assert "Tool ErrorTool " in error_line
    # Neither line should contain the full-fqname form ``TestKit.SampleTool``.
    assert "TestKit.SampleTool" not in success_line
    assert "TestKit.ErrorTool" not in error_line
    # Both must use the same "version <X>" word — proves the same source
    # (``tool_fqname.toolkit_version``) is read on both paths.
    assert "version " in success_line
    assert "version " in error_line
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_execution_error(base_worker_no_auth):
    """A tool that raises yields an unsuccessful response carrying an error."""
    worker = base_worker_no_auth

    # error_tool lives at module level; a registration problem is reported
    # as an explicit test failure instead of an unexpected exception.
    try:
        worker.register_tool(error_tool, toolkit_name="error_kit")
    except ToolDefinitionError as e:
        pytest.fail(f"Failed to register error_tool: {e}")

    # The tool reference is built without a version.
    request = ToolCallRequest(
        execution_id="test_exec_error",
        tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
        inputs={},
    )

    response = await worker.call_tool(request)

    assert response.success is False
    outcome = response.output
    assert outcome.value is None
    assert outcome.error is not None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_error_log_text_matches_structured_extras(base_worker_no_auth, caplog):
    """The primary failure warning's f-string must use the same resolved
    ``tool_fqname.name`` / ``tool_fqname.toolkit_version`` values that
    ``log_extra`` exposes — otherwise the human-readable text and the
    Datadog facets disagree on which tool/version produced the error.

    Previously the f-string used ``tool_request.tool.version`` (the *requested*
    version, often ``None``) while the extras used the resolved version."""
    base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
    tool_request = ToolCallRequest(
        execution_id="exec_log_check",
        tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
        inputs={},
    )

    with caplog.at_level("WARNING", logger="arcade_serve.core.base"):
        await base_worker_no_auth.call_tool(tool_request)

    # Use a ``None`` default: a bare ``next()`` that finds nothing raises
    # StopIteration, which inside a coroutine surfaces as an opaque
    # RuntimeError rather than a readable test failure.
    primary = next(
        (
            r
            for r in caplog.records
            if "exec_log_check" in r.getMessage() and "failed:" in r.getMessage()
        ),
        None,
    )
    assert primary is not None, (
        "no primary failure log line captured; got: "
        f"{[r.getMessage() for r in caplog.records]}"
    )

    # Text and structured extra must agree on name + version.
    primary_text = primary.getMessage()
    assert "Tool ErrorTool " in primary_text
    assert getattr(primary, "tool_name", None) == "ErrorTool"
    extra_version = getattr(primary, "toolkit_version", None)
    assert f"version {extra_version}" in primary_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_error_secondary_log_carries_full_exception_content(
    base_worker_no_auth, caplog
):
    """The secondary ``"Developer message: ..."`` warning must fire exactly once
    and carry the full exception content.

    Under the strict data-leak policy, the @tool fallback puts the verbose
    ``str(exception)`` content into ``developer_message`` (server-side only,
    never returned to the MCP client), so this log is where on-call engineers
    retain debugging context. Leakage through the agent-facing ``message`` is
    covered by the dedicated leak tests in
    ``libs/tests/tool/test_error_fallback.py``."""
    base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
    request = ToolCallRequest(
        execution_id="exec_dev_msg",
        tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
        inputs={},
    )

    with caplog.at_level("WARNING", logger="arcade_serve.core.base"):
        await base_worker_no_auth.call_tool(request)

    # Keep only the records for this execution that carry the developer message.
    secondary = []
    for record in caplog.records:
        text = record.getMessage()
        if "exec_dev_msg" in text and "Developer message:" in text:
            secondary.append(record)

    assert len(secondary) == 1, "secondary 'Developer message:' log should fire once"
    # The full exception content is in the secondary log (and in Datadog facets).
    developer_text = secondary[0].getMessage()
    assert "ValueError" in developer_text
    assert "Something went wrong" in developer_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_not_found(base_worker_no_auth):
    """Calling a tool that was never registered raises ``ValueError``."""
    # Versionless reference, consistent with how the other tests look tools up.
    request = ToolCallRequest(
        execution_id="test_exec_notfound",
        tool=ToolReference(toolkit="nonexistent", name="nosuchtool"),
        inputs={},
    )

    # Only the exception type is checked; the message text is not matched.
    with pytest.raises(ValueError):
        await base_worker_no_auth.call_tool(request)
|
|
|
|
|
|
# --- Component Tests (tested via BaseWorker registration) ---
|
|
|
|
|
|
def test_register_routes_registers_default_components(base_worker, mock_router):
    """Route registration adds one route per default component."""
    # The ``base_worker`` fixture has already called register_routes(mock_router).
    recorded = mock_router.add_route.call_args_list
    assert mock_router.add_route.call_count == len(BaseWorker.default_components)

    # First positional argument of each add_route call is the endpoint path.
    registered_paths = sorted(entry.args[0] for entry in recorded)
    assert registered_paths == sorted(["tools", "tools/invoke", "health"])

    # Second positional argument is the component instance; each default
    # component type must appear at least once.
    components = [entry.args[1] for entry in recorded]
    assert any(isinstance(c, CatalogComponent) for c in components)
    assert any(isinstance(c, CallToolComponent) for c in components)
    assert any(isinstance(c, HealthCheckComponent) for c in components)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_catalog_component_call(base_worker_no_auth):
    """Awaiting ``CatalogComponent`` returns the worker's catalog as a list."""
    base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
    catalog_component = CatalogComponent(base_worker_no_auth)

    # A placeholder request is enough; the test sets no fields on it.
    placeholder_request = MagicMock(spec=RequestData)
    listing = await catalog_component(placeholder_request)

    assert isinstance(listing, list)
    assert len(listing) == 1
    assert listing[0].name == "SampleTool"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_call_tool_component_call(base_worker_no_auth):
    """``CallToolComponent`` runs the tool described by the request's JSON body."""
    base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
    call_component = CallToolComponent(base_worker_no_auth)

    # The tool reference is serialized without a version.
    reference = ToolReference(toolkit="TestKit", name="SampleTool")
    payload = {
        "execution_id": "comp_test_exec",
        "tool": reference.model_dump(),
        "inputs": {"a": 10, "b": 5},
    }
    incoming = MagicMock(spec=RequestData)
    incoming.body_json = payload

    response = await call_component(incoming)

    assert isinstance(response, ToolCallResponse)
    assert response.success is True
    assert response.output.value == 15
    assert response.execution_id == "comp_test_exec"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_health_check_component_call(base_worker_no_auth):
    """With no tools registered, the health component reports a zero tool count."""
    health_component = HealthCheckComponent(base_worker_no_auth)
    placeholder_request = MagicMock(spec=RequestData)

    payload = await health_component(placeholder_request)

    expected_payload = {"status": "ok", "tool_count": "0"}
    assert payload == expected_payload
|