## Summary

- Return an explicit `[DEBUG] stacktrace: unavailable ...` note when the stacktrace debug flag is enabled but the tool error payload has no stacktrace.
- Preserve existing behavior for real stacktraces and for developer messages, including not leaking developer details unless the developer-message flag is enabled.
- Clarify the toolkit-author docs around when stacktraces exist, such as unhandled exceptions or chained `raise ... from exc` errors.

## Test plan

- `pre-commit run --files CLAUDE.md libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`
- `uv run --with pytest --with pytest-asyncio --with pytest-cov pytest libs/tests/arcade_mcp_server/test_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure_integration.py -v`
- `ruff format --check libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`
- `ruff check libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure.py libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Low Risk**
> Low risk: changes are limited to debug-only error-message augmentation when an explicit env flag is enabled; default runtime behavior is unchanged. Main risk is only in local debugging scenarios where the new note could affect log parsing or expected error text.
>
> **Overview**
> When `ARCADE_DEBUG_EXPOSE_STACKTRACE_IN_TOOL_ERROR_RESPONSES` is enabled, tool error messages now **always include a stacktrace debug section**: either the actual stacktrace (when present) or an explicit `[DEBUG] stacktrace: unavailable ...` note when the tool error payload had no stacktrace.
>
> Adds/updates unit + integration coverage for the missing-stacktrace case and adjusts expectations around “flag enabled but no content.” Updates toolkit-author docs to clarify when stacktraces exist, and bumps `arcade-mcp-server` patch version to `1.21.2`.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 7d85196a30d8d29be98ffb252a13ef2a78057742. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>

<!-- /CURSOR_SUMMARY -->
335 lines
12 KiB
Python
335 lines
12 KiB
Python
"""End-to-end integration tests for the MCP debug-exposure escape hatch.
|
|
|
|
These complement the pure-function unit tests in ``test_debug_exposure.py`` by
|
|
exercising the full MCP tool-call path:
|
|
|
|
tool raises -> ToolExecutor.run -> ToolOutputFactory.fail ->
|
|
MCPServer._call_tool -> augment_error_message_for_debug ->
|
|
CallToolResult.content[0].text
|
|
|
|
This is the path every real MCP client hits, so it's where regressions in the
|
|
wire-up (wrong call site, wrong argument order, missing import, etc.) would
|
|
actually surface. The unit tests can't catch those because they call the pure
|
|
function directly.
|
|
"""
|
|
|
|
from typing import Annotated
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from arcade_core.catalog import MaterializedTool, ToolCatalog, ToolMeta, create_func_models
|
|
from arcade_core.errors import FatalToolError
|
|
from arcade_core.schema import (
|
|
InputParameter,
|
|
ToolDefinition,
|
|
ToolInput,
|
|
ToolkitDefinition,
|
|
ToolOutput,
|
|
ToolRequirements,
|
|
ValueSchema,
|
|
)
|
|
from arcade_mcp_server import _debug_exposure as debug_exposure
|
|
from arcade_mcp_server import tool
|
|
from arcade_mcp_server.server import MCPServer
|
|
from arcade_mcp_server.settings import MCPSettings
|
|
from arcade_mcp_server.types import CallToolRequest, CallToolResult, JSONRPCResponse
|
|
|
|
# Value the tests set a debug-exposure env var to when they expect leaking to
# be enabled; the boolean-looking value "true" is expected to be rejected.
_LEAK_MAGIC = "yes-i-accept-leaking-internals-to-the-agent"

# Names of the two environment variables exercised by this module: one for the
# developer message, one for the stacktrace.
_ENV_DEV_MSG = "ARCADE_DEBUG_EXPOSE_DEVELOPER_MESSAGE_IN_TOOL_ERROR_RESPONSES"
_ENV_STACKTRACE = "ARCADE_DEBUG_EXPOSE_STACKTRACE_IN_TOOL_ERROR_RESPONSES"
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _reset_leak_state(monkeypatch):
    """Give every test a clean debug-exposure state.

    Both debug env vars are removed before the test runs, and the module-level
    ``_warned_rejected`` / ``_warned_activated`` collections of
    ``_debug_exposure`` are emptied both before and after the test.
    """
    for env_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.delenv(env_name, raising=False)

    def _clear_warning_records():
        debug_exposure._warned_rejected.clear()
        debug_exposure._warned_activated.clear()

    _clear_warning_records()
    yield
    _clear_warning_records()
|
|
|
|
|
|
# ---- Tool definitions used by the integration tests -------------------------
|
|
|
|
|
|
@tool
def raises_fatal_tool_error(
    query: Annotated[str, "A query"],
) -> Annotated[str, "Result"]:
    """Simulates a toolkit author's tool failing with a rich error."""
    # The user-facing message and the developer-only detail are deliberately
    # different so tests can tell which of the two reached the response.
    upstream_detail = f"HTTP 503 on upstream endpoint for query={query!r}"
    raise FatalToolError(
        message="Failed to fetch results",
        developer_message=upstream_detail,
    )
|
|
|
|
|
|
@tool
def raises_unhandled_exception(
    query: Annotated[str, "A query"],
) -> Annotated[str, "Result"]:
    """Simulates a toolkit author's tool crashing with an unexpected exception.

    The executor's generic `except Exception` branch populates the stacktrace
    via `traceback.format_exc()`, which is what the stacktrace flag leaks.
    """
    # A plain built-in exception (not a tool error type) is raised on purpose.
    crash_message = f"unexpected crash for query={query!r}"
    raise ValueError(crash_message)
|
|
|
|
|
|
def _materialized(func, name: str) -> MaterializedTool:
    """Build a ``MaterializedTool`` for ``func`` under the ``TestToolkit`` toolkit.

    The tool definition is written out by hand: a single required string
    parameter called ``query`` and a string output. The input/output models
    come from ``create_func_models(func)``.

    Args:
        func: The tool callable to wrap.
        name: Tool name; also used for the fully qualified name
            ``TestToolkit.<name>`` and the description.

    Returns:
        The assembled ``MaterializedTool``.
    """
    query_parameter = InputParameter(
        name="query",
        required=True,
        description="A query",
        value_schema=ValueSchema(val_type="string"),
    )
    result_output = ToolOutput(
        description="Result",
        value_schema=ValueSchema(val_type="string"),
    )
    toolkit = ToolkitDefinition(name="TestToolkit", description="", version="1.0.0")

    definition = ToolDefinition(
        name=name,
        fully_qualified_name=f"TestToolkit.{name}",
        description=f"{name} integration fixture",
        toolkit=toolkit,
        input=ToolInput(parameters=[query_parameter]),
        output=result_output,
        requirements=ToolRequirements(),
    )

    input_model, output_model = create_func_models(func)
    meta = ToolMeta(module=func.__module__, toolkit="TestToolkit")
    return MaterializedTool(
        tool=func,
        definition=definition,
        meta=meta,
        input_model=input_model,
        output_model=output_model,
    )
|
|
|
|
|
|
@pytest.fixture
def erroring_catalog() -> ToolCatalog:
    """Return a ``ToolCatalog`` holding the two always-failing fixture tools.

    Each tool is stored directly in ``catalog._tools`` under the fully
    qualified name reported by its definition.
    """
    catalog = ToolCatalog()
    failing_tools = [
        _materialized(raises_fatal_tool_error, "raises_fatal_tool_error"),
        _materialized(raises_unhandled_exception, "raises_unhandled_exception"),
    ]
    for materialized in failing_tools:
        key = materialized.definition.get_fully_qualified_name()
        catalog._tools[key] = materialized
    return catalog
|
|
|
|
|
|
@pytest_asyncio.fixture
async def erroring_server(erroring_catalog) -> MCPServer:
    """Yield a started ``MCPServer`` serving the failing tools.

    ``middleware.mask_error_details`` is switched off on the settings passed to
    the server. The server is stopped once the test is done, even if the test
    fails.
    """
    unmasked_settings = MCPSettings()
    unmasked_settings.middleware.mask_error_details = False

    server_kwargs = {
        "catalog": erroring_catalog,
        "name": "Integration Debug Exposure Server",
        "version": "0.0.0",
        "settings": unmasked_settings,
    }
    mcp_server = MCPServer(**server_kwargs)

    await mcp_server.start()
    try:
        yield mcp_server
    finally:
        await mcp_server.stop()
|
|
|
|
|
|
async def _call(erroring_server: MCPServer, tool_name: str) -> CallToolResult:
    """Call ``TestToolkit.<tool_name>`` with ``query="ping"`` and return the error result.

    The request goes through ``erroring_server._handle_call_tool``. Before
    returning, this helper asserts that the response is a ``JSONRPCResponse``
    wrapping a ``CallToolResult`` with ``isError`` set and no
    ``structuredContent``.
    """
    call_params = {"name": f"TestToolkit.{tool_name}", "arguments": {"query": "ping"}}
    request = CallToolRequest(
        jsonrpc="2.0",
        id=1,
        method="tools/call",
        params=call_params,
    )

    response = await erroring_server._handle_call_tool(request)
    assert isinstance(response, JSONRPCResponse)

    result = response.result
    assert isinstance(result, CallToolResult)
    assert result.isError is True
    assert result.structuredContent is None
    return result
|
|
|
|
|
|
# ---- Integration tests ------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_baseline_no_leak(erroring_server):
    """With neither flag set, the response carries only the sanitized message."""
    call_result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = call_result.content[0].text

    assert "Failed to fetch results" in response_text
    # Neither a debug section nor any piece of the developer message may show up.
    for forbidden in ("[DEBUG]", "HTTP 503", "query='ping'"):
        assert forbidden not in response_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_tool_error_records_developer_message_on_current_span(
    erroring_server, monkeypatch
):
    """The developer message is absent from MCP content but set on the current span."""

    class FakeSpan:
        """Recording span stand-in that keeps its attributes in a plain dict."""

        def __init__(self):
            self.attributes = {}

        def is_recording(self):
            return True

        def set_attribute(self, key, value):
            self.attributes[key] = value

    span = FakeSpan()
    # Make the server module see the fake as the current span.
    monkeypatch.setattr("arcade_mcp_server.server.trace.get_current_span", lambda: span)

    call_result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = call_result.content[0].text

    assert "Failed to fetch results" in response_text
    assert "HTTP 503" not in response_text

    recorded = span.attributes
    expected_developer_message = (
        "[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
        "'raises_fatal_tool_error': HTTP 503 on upstream endpoint for query='ping'"
    )
    assert recorded["tool_error_kind"] == "TOOL_RUNTIME_FATAL"
    assert recorded["tool_error_message"].startswith("[TOOL_RUNTIME_FATAL] FatalToolError")
    assert recorded["tool_error_developer_message"] == expected_developer_message
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_tool_error_skips_span_attributes_when_span_not_recording(
    erroring_server, monkeypatch
):
    """No attributes are set on a span that reports it is not recording."""

    class FakeSpan:
        """Non-recording span stand-in; any attribute write would land in the dict."""

        def __init__(self):
            self.attributes = {}

        def is_recording(self):
            return False

        def set_attribute(self, key, value):
            self.attributes[key] = value

    span = FakeSpan()
    # Make the server module see the fake as the current span.
    monkeypatch.setattr("arcade_mcp_server.server.trace.get_current_span", lambda: span)

    call_result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = call_result.content[0].text

    assert "Failed to fetch results" in response_text
    assert "HTTP 503" not in response_text
    # The dict staying empty shows set_attribute was never called.
    assert span.attributes == {}
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_boolean_rejected_no_leak(erroring_server, monkeypatch, caplog):
    """Setting the flag to ``"true"`` leaks nothing through MCP and logs a warning."""
    import logging

    monkeypatch.setenv(_ENV_DEV_MSG, "true")

    with caplog.at_level(logging.WARNING, logger="arcade_mcp_server._debug_exposure"):
        call_result = await _call(erroring_server, "raises_fatal_tool_error")

    response_text = call_result.content[0].text
    assert "Failed to fetch results" in response_text
    for forbidden in ("[DEBUG]", "HTTP 503"):
        assert forbidden not in response_text

    # The rejection must also be visible in the captured log records.
    warning_fragment = "set to a truthy value but not to the required"
    assert any(warning_fragment in record.message for record in caplog.records)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_developer_message_flag_leaks_through_mcp(erroring_server, monkeypatch):
    """With the developer-message flag set to the magic value, the MCP
    response `content` includes `developer_message` next to the sanitized
    message, and no stacktrace section."""
    monkeypatch.setenv(_ENV_DEV_MSG, _LEAK_MAGIC)

    call_result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = call_result.content[0].text

    expected_fragments = (
        "Failed to fetch results",
        "[DEBUG] developer_message:",
        "HTTP 503 on upstream endpoint for query='ping'",
    )
    for fragment in expected_fragments:
        assert fragment in response_text

    # Only the developer-message flag is on, so no stacktrace section may appear.
    assert "[DEBUG] stacktrace:" not in response_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_stacktrace_flag_leaks_traceback_through_mcp(
    erroring_server, monkeypatch
):
    """An unhandled exception is expected to reach the executor's generic
    except branch, which fills in a real stacktrace. With the stacktrace flag
    on, that stacktrace has to show up in the MCP response content."""
    monkeypatch.setenv(_ENV_STACKTRACE, _LEAK_MAGIC)

    call_result = await _call(erroring_server, "raises_unhandled_exception")
    response_text = call_result.content[0].text

    expected_fragments = (
        # The generic-exception branch wraps the message with the tool name.
        "raises_unhandled_exception",
        "[DEBUG] stacktrace:",
        "Traceback",
        "ValueError",
        "unexpected crash for query='ping'",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_stacktrace_flag_reports_missing_traceback(erroring_server, monkeypatch):
    """A directly raised ToolRuntimeError value may come without a stacktrace.

    With the stacktrace flag enabled, the MCP response is expected to state
    that explicitly rather than silently dropping the stacktrace debug section.
    """
    monkeypatch.setenv(_ENV_STACKTRACE, _LEAK_MAGIC)

    call_result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = call_result.content[0].text

    expected_fragments = (
        "Failed to fetch results",
        "[DEBUG] stacktrace: unavailable",
        "tool error payload did not include one",
    )
    for fragment in expected_fragments:
        assert fragment in response_text

    # The developer-message flag is off, so its content must stay hidden.
    assert "HTTP 503" not in response_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_both_flags_leak_through_mcp(erroring_server, monkeypatch):
    """Both flags on, unhandled exception: the developer_message (from
    `str(e)` in the executor) and the stacktrace both appear in the MCP
    content."""
    for flag_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.setenv(flag_name, _LEAK_MAGIC)

    call_result = await _call(erroring_server, "raises_unhandled_exception")
    response_text = call_result.content[0].text

    expected_fragments = (
        "[DEBUG] developer_message:",
        "unexpected crash for query='ping'",
        "[DEBUG] stacktrace:",
        "Traceback",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_success_path_unaffected_by_flags(
    tool_catalog, mcp_settings, monkeypatch
):
    """Sanity check: a SUCCESSFUL tool response carries no `[DEBUG]` text even
    with both flags on. The augmentation is only expected on the error branch."""
    for flag_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.setenv(flag_name, _LEAK_MAGIC)

    success_server = MCPServer(
        catalog=tool_catalog,
        name="Success Path Server",
        version="0.0.0",
        settings=mcp_settings,
    )
    await success_server.start()
    try:
        request = CallToolRequest(
            jsonrpc="2.0",
            id=1,
            method="tools/call",
            params={"name": "TestToolkit.test_tool", "arguments": {"text": "hi"}},
        )
        response = await success_server._handle_call_tool(request)
    finally:
        # Stop the server whether or not the call itself succeeded.
        await success_server.stop()

    assert isinstance(response, JSONRPCResponse)
    call_result = response.result
    assert isinstance(call_result, CallToolResult)
    assert call_result.isError is False
    assert call_result.structuredContent is not None

    # Content items without a `text` attribute are treated as empty text.
    for content_item in call_result.content:
        item_text = getattr(content_item, "text", "")
        assert "[DEBUG]" not in item_text
|