arcade-mcp/libs/tests/worker/test_worker_base.py
Francisco Or Something dc4607daa4
feat(telemetry): add developer messages to tool error spans (#831)
## Summary
- Add shared span attributes for tool error diagnostics, including
developer-facing messages when present.
- Wire those attributes through MCP server, worker RunTool, and HTTP
CallTool spans while keeping default MCP response content public-only.
- Cover no-leak response behavior, non-recording spans, outputless
worker responses, and the shared attribute contract.

## Verification
- `uv run ruff format ...`
- `uv run ruff check ...`
- `uv run pytest -W ignore
libs/tests/arcade_mcp_server/test_debug_exposure_integration.py
libs/tests/core/test_log_extras.py
libs/tests/worker/test_worker_base.py`

Made with [Cursor](https://cursor.com)

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Adds new telemetry attributes that propagate tool error messages
(including optional developer_message) into active spans across MCP
server and worker execution paths; risk is mainly around potential
leakage of sensitive developer messages into tracing backends and
changes to observability contracts.
> 
> **Overview**
> Adds a shared
`arcade_core.log_extras.build_tool_error_span_attributes()` helper and
wires it into tool error paths so the current OpenTelemetry span is
annotated with stable `tool_error_*` attributes (including
`developer_message` when present).
> 
> MCP tool calls now record these span attributes on failure while
keeping default MCP response content sanitized, and `arcade-serve`
records the same attributes on both `RunTool` and HTTP `CallTool` spans
(handling `output=None`). Versions and dependency constraints are bumped
to consume the new core helper, with tests added/updated to lock the
span-attribute contract and verify behavior for non-recording spans and
no-leak responses.
> 
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit
33a53991d72140a662152f508dc53e9b769b9f07. Bugbot is set up for automated
code reviews on this repo. Configure
[here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2026-04-29 20:41:07 -03:00

467 lines
16 KiB
Python

from typing import Annotated
from unittest.mock import MagicMock
import pytest
from arcade_core.errors import ErrorKind, ToolDefinitionError
from arcade_core.schema import (
ToolCallError,
ToolCallOutput,
ToolCallRequest,
ToolCallResponse,
ToolContext,
ToolReference,
)
from arcade_serve.core import base as base_module
from arcade_serve.core import components as components_module
from arcade_serve.core.base import BaseWorker
from arcade_serve.core.common import RequestData, Router
from arcade_serve.core.components import (
CallToolComponent,
CatalogComponent,
HealthCheckComponent,
)
from arcade_tdk import tool
@tool()
def sample_tool(
context: ToolContext, a: Annotated[int, "a"], b: Annotated[int, "b"]
) -> Annotated[int, "output"]:
"""Sample tool for testing."""
return a + b
# Define error tool at module level to avoid indentation issues with getsource
@tool()
def error_tool(context: ToolContext) -> int:
"""This tool always raises an error."""
raise ValueError("Something went wrong")
class FakeSpan:
def __init__(self, name: str):
self.name = name
self.attributes: dict[str, object] = {}
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def set_attribute(self, key: str, value: object) -> None:
self.attributes[key] = value
class FakeTracer:
def __init__(self):
self.spans: list[FakeSpan] = []
def start_as_current_span(self, name: str) -> FakeSpan:
span = FakeSpan(name)
self.spans.append(span)
return span
@pytest.fixture
def mock_router():
router = MagicMock(spec=Router)
router.add_route = MagicMock()
return router
@pytest.fixture
def base_worker(mock_router, monkeypatch):
# Set env var temporarily for testing secret loading
monkeypatch.setenv("ARCADE_WORKER_SECRET", "test_secret_env")
worker = BaseWorker()
worker.register_routes(mock_router) # Register routes using the mock router
return worker
@pytest.fixture
def base_worker_no_auth():
return BaseWorker(disable_auth=True)
@pytest.fixture
def fake_tracer():
return FakeTracer()
# --- BaseWorker Tests ---
def test_base_worker_init_with_secret():
worker = BaseWorker(secret="explicit_secret") # noqa: S106
assert worker.secret == "explicit_secret" # noqa: S105
assert not worker.disable_auth
def test_base_worker_init_with_env_secret(monkeypatch):
monkeypatch.setenv("ARCADE_WORKER_SECRET", "env_secret_value")
worker = BaseWorker()
assert worker.secret == "env_secret_value" # noqa: S105
assert not worker.disable_auth
def test_base_worker_init_no_secret_raises_error(monkeypatch):
# Ensure env var is not set
monkeypatch.delenv("ARCADE_WORKER_SECRET", raising=False)
with pytest.raises(ValueError, match="No secret provided for worker"):
BaseWorker()
def test_base_worker_init_disable_auth():
worker = BaseWorker(disable_auth=True)
assert worker.secret == ""
assert worker.disable_auth
def test_register_tool(base_worker_no_auth):
assert len(base_worker_no_auth.catalog) == 0
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
assert len(base_worker_no_auth.catalog) == 1
tool_def = base_worker_no_auth.get_catalog()[0]
assert tool_def.name == "SampleTool"
assert tool_def.toolkit.name == "TestKit"
def test_get_catalog(base_worker_no_auth):
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
catalog = base_worker_no_auth.get_catalog()
assert isinstance(catalog, list)
assert len(catalog) == 1
assert catalog[0].name == "SampleTool"
def test_health_check(base_worker_no_auth):
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
health = base_worker_no_auth.health_check()
assert health == {"status": "ok", "tool_count": "1"}
@pytest.mark.asyncio
async def test_call_tool_success(base_worker_no_auth):
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
# Create ToolReference WITHOUT version, as register_tool doesn't seem to set it
tool_ref = ToolReference(toolkit="TestKit", name="SampleTool")
tool_request = ToolCallRequest(
execution_id="test_exec_id",
tool=tool_ref,
inputs={"a": 5, "b": 3},
)
response = await base_worker_no_auth.call_tool(tool_request)
assert response.success is True
assert response.output.value == 8
assert response.output.error is None
assert response.execution_id == "test_exec_id"
@pytest.mark.asyncio
async def test_call_tool_success_and_error_logs_use_same_tool_identifiers(
base_worker_no_auth, caplog
):
"""Success and error log lines must use identical tool identifier strings
so logs can be correlated with a single grep pattern."""
import logging
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
base_worker_no_auth.register_tool(error_tool, toolkit_name="test_kit")
success_req = ToolCallRequest(
execution_id="exec_consistency_ok",
tool=ToolReference(toolkit="TestKit", name="SampleTool"),
inputs={"a": 1, "b": 2},
)
error_req = ToolCallRequest(
execution_id="exec_consistency_err",
tool=ToolReference(toolkit="TestKit", name="ErrorTool"),
inputs={},
)
with caplog.at_level(logging.DEBUG, logger="arcade_serve.core.base"):
await base_worker_no_auth.call_tool(success_req)
await base_worker_no_auth.call_tool(error_req)
success_line = next(
r
for r in caplog.records
if "exec_consistency_ok" in r.getMessage() and "success" in r.getMessage()
)
error_line = next(
r
for r in caplog.records
if "exec_consistency_err" in r.getMessage() and "failed:" in r.getMessage()
)
# Both must use the bare tool name (".name"), NOT the full ``Toolkit.Tool`` fqname.
assert "Tool SampleTool " in success_line.getMessage()
assert "Tool ErrorTool " in error_line.getMessage()
# Neither line should contain the full-fqname form ``TestKit.SampleTool``.
assert "TestKit.SampleTool" not in success_line.getMessage()
assert "TestKit.ErrorTool" not in error_line.getMessage()
# Both must use the same "version <X>" word — proves the same source
# (``tool_fqname.toolkit_version``) is read on both paths.
assert "version " in success_line.getMessage()
assert "version " in error_line.getMessage()
@pytest.mark.asyncio
async def test_call_tool_execution_error(base_worker_no_auth):
# Tool is now defined at module level
try:
base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
except ToolDefinitionError as e:
pytest.fail(f"Failed to register error_tool: {e}")
# Create ToolReference WITHOUT version
tool_ref = ToolReference(toolkit="ErrorKit", name="ErrorTool")
tool_request = ToolCallRequest(
execution_id="test_exec_error",
tool=tool_ref,
inputs={},
)
response = await base_worker_no_auth.call_tool(tool_request)
assert response.success is False
assert response.output.value is None
assert response.output.error is not None
@pytest.mark.asyncio
async def test_call_tool_error_records_run_tool_span_attributes(
base_worker_no_auth, fake_tracer, monkeypatch
):
monkeypatch.setattr(base_module.trace, "get_tracer", lambda name: fake_tracer)
base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
tool_request = ToolCallRequest(
execution_id="exec_span_attrs",
tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
inputs={},
)
response = await base_worker_no_auth.call_tool(tool_request)
assert response.success is False
run_tool_span = next(span for span in fake_tracer.spans if span.name == "RunTool")
assert run_tool_span.attributes["tool_error_kind"] == "TOOL_RUNTIME_FATAL"
assert run_tool_span.attributes["tool_error_message"].startswith(
"[TOOL_RUNTIME_FATAL] FatalToolError"
)
assert "ValueError" in run_tool_span.attributes["tool_error_developer_message"]
assert "Something went wrong" in run_tool_span.attributes["tool_error_developer_message"]
@pytest.mark.asyncio
async def test_call_tool_error_log_text_matches_structured_extras(base_worker_no_auth, caplog):
"""The primary failure warning's f-string must use the same resolved
``tool_fqname.name`` / ``tool_fqname.toolkit_version`` values that
``log_extra`` exposes — otherwise the human-readable text and the
Datadog facets disagree on which tool/version produced the error.
Previously the f-string used ``tool_request.tool.version`` (the *requested*
version, often ``None``) while the extras used the resolved version."""
base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
tool_request = ToolCallRequest(
execution_id="exec_log_check",
tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
inputs={},
)
with caplog.at_level("WARNING", logger="arcade_serve.core.base"):
await base_worker_no_auth.call_tool(tool_request)
primary = next(
r
for r in caplog.records
if "exec_log_check" in r.getMessage() and "failed:" in r.getMessage()
)
# Text and structured extra must agree on name + version.
assert "Tool ErrorTool " in primary.getMessage()
assert getattr(primary, "tool_name", None) == "ErrorTool"
extra_version = getattr(primary, "toolkit_version", None)
assert f"version {extra_version}" in primary.getMessage()
@pytest.mark.asyncio
async def test_call_tool_error_secondary_log_carries_full_exception_content(
base_worker_no_auth, caplog
):
"""Under the strict data-leak policy, the @tool fallback puts the verbose
``str(exception)`` content into ``developer_message`` (server-side only,
never returned to the MCP client). The secondary ``"Developer message: ..."``
warning must therefore fire and carry that full content so on-call
engineers retain debugging context — the channel where leakage WOULD
matter (agent-facing ``message``) is covered by the dedicated leak tests
in ``libs/tests/tool/test_error_fallback.py``."""
base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit")
tool_request = ToolCallRequest(
execution_id="exec_dev_msg",
tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"),
inputs={},
)
with caplog.at_level("WARNING", logger="arcade_serve.core.base"):
await base_worker_no_auth.call_tool(tool_request)
secondary = [
r
for r in caplog.records
if "exec_dev_msg" in r.getMessage() and "Developer message:" in r.getMessage()
]
assert len(secondary) == 1, "secondary 'Developer message:' log should fire once"
# The full exception content is in the secondary log (and in Datadog facets).
assert "ValueError" in secondary[0].getMessage()
assert "Something went wrong" in secondary[0].getMessage()
@pytest.mark.asyncio
async def test_call_tool_not_found(base_worker_no_auth):
# Use ToolReference without version for lookup consistency
tool_ref = ToolReference(toolkit="nonexistent", name="nosuchtool")
tool_request = ToolCallRequest(
execution_id="test_exec_notfound",
tool=tool_ref,
inputs={},
)
# Update regex to match actual error format
with pytest.raises(ValueError):
await base_worker_no_auth.call_tool(tool_request)
# --- Component Tests (tested via BaseWorker registration) ---
def test_register_routes_registers_default_components(base_worker, mock_router):
# BaseWorker calls register_routes in its init via the fixture
assert mock_router.add_route.call_count == len(BaseWorker.default_components)
calls = mock_router.add_route.call_args_list
expected_paths = ["tools", "tools/invoke", "health"]
registered_paths = [
call[0][0] for call in calls
] # call[0] are positional args, call[0][0] is endpoint_path
assert sorted(registered_paths) == sorted(expected_paths)
# Check if components were instantiated and passed to add_route
assert any(isinstance(call[0][1], CatalogComponent) for call in calls)
assert any(isinstance(call[0][1], CallToolComponent) for call in calls)
assert any(isinstance(call[0][1], HealthCheckComponent) for call in calls)
@pytest.mark.asyncio
async def test_catalog_component_call(base_worker_no_auth):
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
component = CatalogComponent(base_worker_no_auth)
# Mock request data - not actually used by this component's __call__
mock_request = MagicMock(spec=RequestData)
catalog_response = await component(mock_request)
assert isinstance(catalog_response, list)
assert len(catalog_response) == 1
assert catalog_response[0].name == "SampleTool"
@pytest.mark.asyncio
async def test_call_tool_component_call(base_worker_no_auth):
base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit")
component = CallToolComponent(base_worker_no_auth)
# Create ToolReference WITHOUT version
tool_ref = ToolReference(toolkit="TestKit", name="SampleTool")
request_body = {
"execution_id": "comp_test_exec",
"tool": tool_ref.model_dump(),
"inputs": {"a": 10, "b": 5},
}
mock_request = MagicMock(spec=RequestData)
mock_request.body_json = request_body
response = await component(mock_request)
assert isinstance(response, ToolCallResponse)
assert response.success is True
assert response.output.value == 15
assert response.execution_id == "comp_test_exec"
@pytest.mark.asyncio
async def test_call_tool_component_allows_missing_output():
class OutputlessWorker:
async def call_tool(self, call_tool_request):
return ToolCallResponse(
execution_id="comp_outputless_exec",
duration=1,
finished_at="2026-01-01T00:00:00",
success=False,
output=None,
)
component = CallToolComponent(OutputlessWorker())
mock_request = MagicMock(spec=RequestData)
mock_request.body_json = {
"execution_id": "comp_outputless_exec",
"tool": ToolReference(toolkit="TestKit", name="SampleTool").model_dump(),
"inputs": {},
}
response = await component(mock_request)
assert response.success is False
assert response.output is None
@pytest.mark.asyncio
async def test_call_tool_component_error_records_call_tool_span_attributes(
fake_tracer, monkeypatch
):
monkeypatch.setattr(components_module.trace, "get_tracer", lambda name: fake_tracer)
class ErrorWorker:
environment = "test"
async def call_tool(self, call_tool_request):
return ToolCallResponse(
execution_id="comp_error_exec",
duration=1,
finished_at="2026-01-01T00:00:00",
success=False,
output=ToolCallOutput(
error=ToolCallError(
kind=ErrorKind.TOOL_RUNTIME_FATAL,
message="public component failure",
developer_message="component developer details",
),
),
)
component = CallToolComponent(ErrorWorker())
mock_request = MagicMock(spec=RequestData)
mock_request.body_json = {
"execution_id": "comp_error_exec",
"tool": ToolReference(toolkit="TestKit", name="SampleTool").model_dump(),
"inputs": {},
}
response = await component(mock_request)
assert response.success is False
call_tool_span = next(span for span in fake_tracer.spans if span.name == "CallTool")
assert call_tool_span.attributes["tool_error_kind"] == "TOOL_RUNTIME_FATAL"
assert call_tool_span.attributes["tool_error_message"] == "public component failure"
assert (
call_tool_span.attributes["tool_error_developer_message"] == "component developer details"
)
@pytest.mark.asyncio
async def test_health_check_component_call(base_worker_no_auth):
component = HealthCheckComponent(base_worker_no_auth)
mock_request = MagicMock(spec=RequestData)
health_response = await component(mock_request)
assert health_response == {"status": "ok", "tool_count": "0"}