arcade-mcp/libs/tests/arcade_mcp_server/test_debug_exposure_integration.py
Francisco Or Something c866620435
fix(arcade-mcp-server): report missing debug stacktraces (#836)
## Summary
- Return an explicit `[DEBUG] stacktrace: unavailable ...` note when the
stacktrace debug flag is enabled but the tool error payload has no
stacktrace.
- Preserve existing behavior for real stacktraces and for developer
messages, including not leaking developer details unless the
developer-message flag is enabled.
- Clarify the toolkit-author docs around when stacktraces exist, such as
unhandled exceptions or chained `raise ... from exc` errors.

## Test plan
- `pre-commit run --files CLAUDE.md
libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`
- `uv run --with pytest --with pytest-asyncio --with pytest-cov pytest
libs/tests/arcade_mcp_server/test_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure_integration.py -v`
- `ruff format --check
libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`
- `ruff check
libs/arcade-mcp-server/arcade_mcp_server/_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure.py
libs/tests/arcade_mcp_server/test_debug_exposure_integration.py`

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Low Risk**
> Low risk: changes are limited to debug-only error-message augmentation
when an explicit env flag is enabled; default runtime behavior is
unchanged. Main risk is only in local debugging scenarios where the new
note could affect log parsing or expected error text.
> 
> **Overview**
> When `ARCADE_DEBUG_EXPOSE_STACKTRACE_IN_TOOL_ERROR_RESPONSES` is
enabled, tool error messages now **always include a stacktrace debug
section**: either the actual stacktrace (when present) or an explicit
`[DEBUG] stacktrace: unavailable ...` note when the tool error payload
had no stacktrace.
> 
> Adds/updates unit + integration coverage for the missing-stacktrace
case and adjusts expectations around “flag enabled but no content.”
Updates toolkit-author docs to clarify when stacktraces exist, and bumps
`arcade-mcp-server` patch version to `1.21.2`.
> 
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit
7d85196a30d8d29be98ffb252a13ef2a78057742. Bugbot is set up for automated
code reviews on this repo. Configure
[here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2026-04-30 20:03:53 -03:00

335 lines
12 KiB
Python

"""End-to-end integration tests for the MCP debug-exposure escape hatch.

These complement the pure-function unit tests in ``test_debug_exposure.py`` by
exercising the full MCP tool-call path:

    tool raises -> ToolExecutor.run -> ToolOutputFactory.fail ->
    MCPServer._call_tool -> augment_error_message_for_debug ->
    CallToolResult.content[0].text

This is the path every real MCP client hits, so it's where regressions in the
wire-up (wrong call site, wrong argument order, missing import, etc.) would
actually surface. The unit tests can't catch those because they call the pure
function directly.
"""
from typing import Annotated
import pytest
import pytest_asyncio
from arcade_core.catalog import MaterializedTool, ToolCatalog, ToolMeta, create_func_models
from arcade_core.errors import FatalToolError
from arcade_core.schema import (
InputParameter,
ToolDefinition,
ToolInput,
ToolkitDefinition,
ToolOutput,
ToolRequirements,
ValueSchema,
)
from arcade_mcp_server import _debug_exposure as debug_exposure
from arcade_mcp_server import tool
from arcade_mcp_server.server import MCPServer
from arcade_mcp_server.settings import MCPSettings
from arcade_mcp_server.types import CallToolRequest, CallToolResult, JSONRPCResponse
# Opt-in value the tests assign to the debug-exposure env vars to switch a leak
# on; a merely boolean-looking value such as "true" is expected to be rejected
# (see test_integration_boolean_rejected_no_leak).
_LEAK_MAGIC = "yes-i-accept-leaking-internals-to-the-agent"
# Env var the tests set to expose a tool error's developer_message in MCP content.
_ENV_DEV_MSG = "ARCADE_DEBUG_EXPOSE_DEVELOPER_MESSAGE_IN_TOOL_ERROR_RESPONSES"
# Env var the tests set to expose a tool error's stacktrace in MCP content.
_ENV_STACKTRACE = "ARCADE_DEBUG_EXPOSE_STACKTRACE_IN_TOOL_ERROR_RESPONSES"
@pytest.fixture(autouse=True)
def _reset_leak_state(monkeypatch):
    """Give every test a clean debug-exposure starting point.

    Both exposure env vars are removed (missing vars are tolerated) and the
    ``_warned_rejected`` / ``_warned_activated`` collections on the
    ``_debug_exposure`` module are emptied before the test runs and again
    after it finishes.
    """
    for env_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.delenv(env_name, raising=False)

    def _forget_warnings():
        # NOTE(review): presumably warn-once bookkeeping; clearing it lets each
        # test observe its own warnings — confirm against _debug_exposure.
        debug_exposure._warned_rejected.clear()
        debug_exposure._warned_activated.clear()

    _forget_warnings()
    yield
    _forget_warnings()
# ---- Tool definitions used by the integration tests -------------------------
@tool
def raises_fatal_tool_error(
    query: Annotated[str, "A query"],
) -> Annotated[str, "Result"]:
    """Simulates a toolkit author's tool failing with a rich error."""
    # The developer-facing detail embeds the query so tests can tell whether it
    # reached the MCP response; the agent-facing message stays generic.
    upstream_detail = f"HTTP 503 on upstream endpoint for query={query!r}"
    raise FatalToolError(
        message="Failed to fetch results",
        developer_message=upstream_detail,
    )
# NOTE(review): the docstring is left untouched — `@tool` may use it as the
# tool description; confirm before rewording.
@tool
def raises_unhandled_exception(
    query: Annotated[str, "A query"],
) -> Annotated[str, "Result"]:
    """Simulates a toolkit author's tool crashing with an unexpected exception.

    The executor's generic `except Exception` branch populates the stacktrace
    via `traceback.format_exc()`, which is what the stacktrace flag leaks.
    """
    # Deliberately a plain ValueError rather than a tool error type such as
    # FatalToolError; the message embeds the query so tests can look for it.
    raise ValueError(f"unexpected crash for query={query!r}")
def _materialized(func, name: str) -> MaterializedTool:
    """Wrap ``func`` as a ``MaterializedTool`` belonging to ``TestToolkit``.

    Args:
        func: The tool callable; its input/output models come from
            ``create_func_models``.
        name: Tool name, also used to derive the fully qualified name
            (``TestToolkit.<name>``) and the description.

    Returns:
        A ``MaterializedTool`` whose definition declares one required string
        parameter named ``query`` and a string output.
    """
    toolkit = ToolkitDefinition(name="TestToolkit", description="", version="1.0.0")
    query_parameter = InputParameter(
        name="query",
        required=True,
        description="A query",
        value_schema=ValueSchema(val_type="string"),
    )
    # A separate ValueSchema instance is built for the output on purpose, so the
    # input and output never share one schema object.
    result_output = ToolOutput(
        description="Result",
        value_schema=ValueSchema(val_type="string"),
    )
    definition = ToolDefinition(
        name=name,
        fully_qualified_name=f"TestToolkit.{name}",
        description=f"{name} integration fixture",
        toolkit=toolkit,
        input=ToolInput(parameters=[query_parameter]),
        output=result_output,
        requirements=ToolRequirements(),
    )

    input_model, output_model = create_func_models(func)
    tool_meta = ToolMeta(module=func.__module__, toolkit="TestToolkit")
    return MaterializedTool(
        tool=func,
        definition=definition,
        meta=tool_meta,
        input_model=input_model,
        output_model=output_model,
    )
@pytest.fixture
def erroring_catalog() -> ToolCatalog:
    """Return a ``ToolCatalog`` holding only the two always-failing tools.

    The materialized tools are stored directly in ``catalog._tools`` under
    their fully qualified names.
    """
    failing_tools = (
        (raises_fatal_tool_error, "raises_fatal_tool_error"),
        (raises_unhandled_exception, "raises_unhandled_exception"),
    )
    # Materialize everything first, then register, matching the order in which
    # the tools are built and inserted.
    materialized_tools = [_materialized(func, name) for func, name in failing_tools]

    catalog = ToolCatalog()
    for materialized in materialized_tools:
        catalog._tools[materialized.definition.get_fully_qualified_name()] = materialized
    return catalog
@pytest_asyncio.fixture
async def erroring_server(erroring_catalog) -> MCPServer:
    """Yield a started ``MCPServer`` serving the erroring catalog.

    The server is created with ``middleware.mask_error_details`` switched off
    and is always stopped once the test is done with it.
    """
    unmasked_settings = MCPSettings()
    # NOTE(review): presumably lets the real tool error text reach the response
    # content instead of a masked placeholder — confirm against the middleware.
    unmasked_settings.middleware.mask_error_details = False

    mcp_server = MCPServer(
        name="Integration Debug Exposure Server",
        version="0.0.0",
        catalog=erroring_catalog,
        settings=unmasked_settings,
    )
    await mcp_server.start()
    try:
        yield mcp_server
    finally:
        await mcp_server.stop()
async def _call(erroring_server: MCPServer, tool_name: str) -> CallToolResult:
    """Invoke ``TestToolkit.<tool_name>`` with ``query="ping"`` and return the result.

    Also asserts the invariants every erroring call in this module shares: the
    reply is a ``JSONRPCResponse`` wrapping a ``CallToolResult`` that is flagged
    as an error and carries no structured content.
    """
    request = CallToolRequest(
        jsonrpc="2.0",
        id=1,
        method="tools/call",
        params={
            "name": f"TestToolkit.{tool_name}",
            "arguments": {"query": "ping"},
        },
    )
    response = await erroring_server._handle_call_tool(request)
    assert isinstance(response, JSONRPCResponse)

    result = response.result
    assert isinstance(result, CallToolResult)
    assert result.isError is True
    assert result.structuredContent is None
    return result
# ---- Integration tests ------------------------------------------------------
@pytest.mark.asyncio
async def test_integration_baseline_no_leak(erroring_server):
    """With no exposure flag set, only the sanitized message reaches the agent."""
    result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = result.content[0].text

    assert "Failed to fetch results" in response_text
    # Neither a debug section nor any part of the developer message may appear.
    for forbidden in ("[DEBUG]", "HTTP 503", "query='ping'"):
        assert forbidden not in response_text
@pytest.mark.asyncio
async def test_integration_tool_error_records_developer_message_on_current_span(
    erroring_server, monkeypatch
):
    """A recording span receives the error kind, message and developer message,
    while the MCP response text still carries only the sanitized message."""

    class RecordingSpan:
        """Span stand-in that reports itself as recording and stores every
        attribute set on it."""

        def __init__(self):
            self.attributes = {}

        def is_recording(self):
            return True

        def set_attribute(self, key, value):
            self.attributes[key] = value

    fake_span = RecordingSpan()
    monkeypatch.setattr("arcade_mcp_server.server.trace.get_current_span", lambda: fake_span)

    result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = result.content[0].text
    recorded = fake_span.attributes

    assert "Failed to fetch results" in response_text
    assert "HTTP 503" not in response_text

    expected_developer_message = (
        "[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
        "'raises_fatal_tool_error': HTTP 503 on upstream endpoint for query='ping'"
    )
    assert recorded["tool_error_kind"] == "TOOL_RUNTIME_FATAL"
    assert recorded["tool_error_message"].startswith("[TOOL_RUNTIME_FATAL] FatalToolError")
    assert recorded["tool_error_developer_message"] == expected_developer_message
@pytest.mark.asyncio
async def test_integration_tool_error_skips_span_attributes_when_span_not_recording(
    erroring_server, monkeypatch
):
    """When the current span says it is not recording, no attribute is set on it."""

    class NonRecordingSpan:
        """Span stand-in that reports itself as not recording; any attribute
        written to it would still be captured and fail the test."""

        def __init__(self):
            self.attributes = {}

        def is_recording(self):
            return False

        def set_attribute(self, key, value):
            self.attributes[key] = value

    fake_span = NonRecordingSpan()
    monkeypatch.setattr("arcade_mcp_server.server.trace.get_current_span", lambda: fake_span)

    result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = result.content[0].text

    assert "Failed to fetch results" in response_text
    assert "HTTP 503" not in response_text
    assert fake_span.attributes == {}
@pytest.mark.asyncio
async def test_integration_boolean_rejected_no_leak(erroring_server, monkeypatch, caplog):
    """Setting the developer-message flag to "true" leaks nothing and logs a
    warning about the value not being the required one."""
    import logging

    monkeypatch.setenv(_ENV_DEV_MSG, "true")

    with caplog.at_level(logging.WARNING, logger="arcade_mcp_server._debug_exposure"):
        result = await _call(erroring_server, "raises_fatal_tool_error")

    response_text = result.content[0].text
    assert "Failed to fetch results" in response_text
    assert "[DEBUG]" not in response_text
    assert "HTTP 503" not in response_text

    rejection_logged = any(
        "set to a truthy value but not to the required" in record.message
        for record in caplog.records
    )
    assert rejection_logged
@pytest.mark.asyncio
async def test_integration_developer_message_flag_leaks_through_mcp(erroring_server, monkeypatch):
    """With the developer-message flag set to the magic value, the response
    text contains the developer message next to the sanitized message."""
    monkeypatch.setenv(_ENV_DEV_MSG, _LEAK_MAGIC)

    result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = result.content[0].text

    expected_fragments = (
        "Failed to fetch results",
        "[DEBUG] developer_message:",
        "HTTP 503 on upstream endpoint for query='ping'",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
    # Only the developer-message flag is on, so no stacktrace section may appear.
    assert "[DEBUG] stacktrace:" not in response_text
@pytest.mark.asyncio
async def test_integration_stacktrace_flag_leaks_traceback_through_mcp(
    erroring_server, monkeypatch
):
    """With the stacktrace flag set to the magic value, an unhandled exception's
    traceback shows up in the MCP response text."""
    monkeypatch.setenv(_ENV_STACKTRACE, _LEAK_MAGIC)

    result = await _call(erroring_server, "raises_unhandled_exception")
    response_text = result.content[0].text

    expected_fragments = (
        # The tool name is expected in the message produced for this failure.
        "raises_unhandled_exception",
        "[DEBUG] stacktrace:",
        "Traceback",
        "ValueError",
        "unexpected crash for query='ping'",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
@pytest.mark.asyncio
async def test_integration_stacktrace_flag_reports_missing_traceback(erroring_server, monkeypatch):
    """With the stacktrace flag on and a tool error that has no stacktrace, the
    response states that the stacktrace is unavailable.

    The developer message must still stay hidden, because only the stacktrace
    flag is enabled here.
    """
    monkeypatch.setenv(_ENV_STACKTRACE, _LEAK_MAGIC)

    result = await _call(erroring_server, "raises_fatal_tool_error")
    response_text = result.content[0].text

    expected_fragments = (
        "Failed to fetch results",
        "[DEBUG] stacktrace: unavailable",
        "tool error payload did not include one",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
    assert "HTTP 503" not in response_text
@pytest.mark.asyncio
async def test_integration_both_flags_leak_through_mcp(erroring_server, monkeypatch):
    """With both flags set to the magic value, an unhandled exception yields
    both a developer-message section and a stacktrace section."""
    for env_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.setenv(env_name, _LEAK_MAGIC)

    result = await _call(erroring_server, "raises_unhandled_exception")
    response_text = result.content[0].text

    expected_fragments = (
        "[DEBUG] developer_message:",
        "unexpected crash for query='ping'",
        "[DEBUG] stacktrace:",
        "Traceback",
    )
    for fragment in expected_fragments:
        assert fragment in response_text
@pytest.mark.asyncio
async def test_integration_success_path_unaffected_by_flags(
    tool_catalog, mcp_settings, monkeypatch
):
    """With both flags on, a successful tool call still returns a non-error
    result with structured content and no ``[DEBUG]`` text in any item."""
    for env_name in (_ENV_DEV_MSG, _ENV_STACKTRACE):
        monkeypatch.setenv(env_name, _LEAK_MAGIC)

    healthy_server = MCPServer(
        name="Success Path Server",
        version="0.0.0",
        catalog=tool_catalog,
        settings=mcp_settings,
    )
    await healthy_server.start()
    try:
        request = CallToolRequest(
            jsonrpc="2.0",
            id=1,
            method="tools/call",
            params={"name": "TestToolkit.test_tool", "arguments": {"text": "hi"}},
        )
        response = await healthy_server._handle_call_tool(request)
    finally:
        await healthy_server.stop()

    assert isinstance(response, JSONRPCResponse)
    result = response.result
    assert isinstance(result, CallToolResult)
    assert result.isError is False
    assert result.structuredContent is not None

    # Content items without a ``text`` attribute are treated as empty text.
    item_texts = [getattr(item, "text", "") for item in result.content]
    assert not any("[DEBUG]" in item_text for item_text in item_texts)