diff --git a/libs/arcade-core/arcade_core/errors.py b/libs/arcade-core/arcade_core/errors.py index 707bf880..e9df316c 100644 --- a/libs/arcade-core/arcade_core/errors.py +++ b/libs/arcade-core/arcade_core/errors.py @@ -69,7 +69,10 @@ class ToolkitError(Exception, ABC): The error with the context added to the message. """ prefix = self.create_message_prefix(name) - self.message = f"{prefix}{self.message}" # type: ignore[has-type] + # Substitute placeholder when message is empty so the prefixed result + # always has a non-empty diagnostic body. + body = self.message if self.message and self.message.strip() else "(no details provided)" # type: ignore[has-type] + self.message = f"{prefix}{body}" if hasattr(self, "developer_message") and self.developer_message: # type: ignore[has-type] self.developer_message = f"{prefix}{self.developer_message}" # type: ignore[has-type] @@ -153,6 +156,13 @@ class ToolRuntimeError(ToolError, RuntimeError): Any failure starting from when the tool call begins until the tool call returns. Note: This class should typically not be instantiated directly, but rather subclassed. + + Runtime error payload contract: + - ``message`` is agent/client-facing. It is returned to tool consumers and can be shown to + agents or end users, so it should contain only intentional, safe context. + - ``developer_message`` is for server-side diagnostics. It is preserved in structured + error payloads and logs for debugging, and may contain more implementation detail than + ``message``. """ kind: ErrorKind = ErrorKind.TOOL_RUNTIME_FATAL @@ -167,6 +177,16 @@ class ToolRuntimeError(ToolError, RuntimeError): *, extra: dict[str, Any] | None = None, ): + """ + Args: + message: Safe summary intended for the agent/client. This value may be surfaced + directly in tool-call error responses, so do not include secrets, tokens, raw API + responses, or other sensitive details unless you explicitly want them exposed. + developer_message: Additional server-side debugging context. Use this for verbose + diagnostics such as raw upstream errors, stack-oriented clues, or internal + identifiers that should not be the primary message shown to the agent/client. + extra: Structured metadata to preserve alongside the error. + """ super().__init__(message) self.message = message self.developer_message = developer_message # type: ignore[assignment] diff --git a/libs/arcade-core/arcade_core/executor.py b/libs/arcade-core/arcade_core/executor.py index 1dfc2e3f..3589ba0b 100644 --- a/libs/arcade-core/arcade_core/executor.py +++ b/libs/arcade-core/arcade_core/executor.py @@ -101,9 +101,23 @@ class ToolExecutor: inputs = input_model(**kwargs) except ValidationError as e: + # IMPORTANT: do NOT include err["input"] or str(e) anywhere in the + # surfaced message/developer_message. Pydantic's ``str(e)`` and + # ``err["input"]`` echo the offending input value verbatim — which + # may contain user secrets (passwords, tokens, PII). Both fields + # below intentionally carry only field path + reason + Pydantic + # error type code, never the rejected value itself. + summary = "; ".join( + f"{'.'.join(str(loc) for loc in err['loc']) or ''}: {err['msg']}" + for err in e.errors() + ) + developer_summary = "; ".join( + f"{'.'.join(str(loc) for loc in err['loc']) or ''}[{err['type']}]" + for err in e.errors() + ) raise ToolInputError( - message="Error in tool input deserialization", - developer_message=str(e), + message=f"Invalid input: {summary}", + developer_message=f"Pydantic validation failed: {developer_summary}", ) from e return inputs diff --git a/libs/arcade-core/arcade_core/log_extras.py b/libs/arcade-core/arcade_core/log_extras.py new file mode 100644 index 00000000..8ab40bf9 --- /dev/null +++ b/libs/arcade-core/arcade_core/log_extras.py @@ -0,0 +1,60 @@ +"""Shared helper for tool-call-error structured-log extras. + +Field names (``error_*``, ``tool_name``, ``toolkit_version``) are load-bearing +for Datadog dashboards — renaming is a breaking change for downstream alerts. +""" + +from __future__ import annotations + +from typing import Any + +from arcade_core.schema import ToolCallError + + +def build_tool_error_log_extra( + error: ToolCallError, + *, + tool_name: str, + toolkit_name: str | None = None, + toolkit_version: str | None = None, + **additional: Any, +) -> dict[str, Any]: + """Build the structured ``extra`` dict for a failed-tool-call WARNING log. + + Args: + error: The ``ToolCallError``. ``kind`` is normalized via ``.value`` + so Datadog facets on a stable string, not ``"ErrorKind.X"``. + tool_name: Resolved tool identifier — callers should pass the same + value OTel spans / metrics use (e.g. ``tool_fqname.name``) so + logs correlate across signals. + toolkit_name: Optional toolkit identifier. + toolkit_version: Optional resolved toolkit version. + **additional: Caller-specific extras (e.g. ``execution_id``). Cannot + override canonical fields. + + Returns: + Dict suitable for ``logger.warning(..., extra=)``. Optional + *kwargs* are omitted when not supplied so Datadog can distinguish + "unset" from "set-to-None"; canonical ``error_*`` fields are always + present (``None`` when unset on the error). + """ + kind_value = error.kind.value if hasattr(error.kind, "value") else str(error.kind) + + extra: dict[str, Any] = { + "error_kind": kind_value, + "error_message": error.message, + "error_developer_message": error.developer_message, + "error_status_code": error.status_code, + "error_can_retry": error.can_retry, + "tool_name": tool_name, + } + if toolkit_name is not None: + extra["toolkit_name"] = toolkit_name + if toolkit_version is not None: + extra["toolkit_version"] = toolkit_version + + for k, v in additional.items(): + if k in extra: + continue + extra[k] = v + return extra diff --git a/libs/arcade-core/arcade_core/output.py b/libs/arcade-core/arcade_core/output.py index d5863802..349aac0d 100644 --- a/libs/arcade-core/arcade_core/output.py +++ b/libs/arcade-core/arcade_core/output.py @@ -71,6 +71,8 @@ class ToolOutputFactory: status_code: int | None = None, extra: dict[str, Any] | None = None, ) -> ToolCallOutput: + if not message or not message.strip(): + message = "Unspecified error during tool execution" return ToolCallOutput( error=ToolCallError( message=message, @@ -103,7 +105,8 @@ class ToolOutputFactory: DEPRECATED: Use ToolOutputFactory.fail instead. This method will be removed in version 3.0.0 """ - + if not message or not message.strip(): + message = "Unspecified error during tool execution" return ToolCallOutput( error=ToolCallError( message=message, diff --git a/libs/arcade-core/pyproject.toml b/libs/arcade-core/pyproject.toml index b579d40d..33a3baa1 100644 --- a/libs/arcade-core/pyproject.toml +++ b/libs/arcade-core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "arcade-core" -version = "4.6.1" +version = "4.6.2" description = "Arcade Core - Core library for Arcade platform" readme = "README.md" license = { text = "MIT" } diff --git a/libs/arcade-mcp-server/arcade_mcp_server/logging_utils.py b/libs/arcade-mcp-server/arcade_mcp_server/logging_utils.py index 0b59ebea..88119283 100644 --- a/libs/arcade-mcp-server/arcade_mcp_server/logging_utils.py +++ b/libs/arcade-mcp-server/arcade_mcp_server/logging_utils.py @@ -2,6 +2,7 @@ import logging import sys +from typing import ClassVar from loguru import logger @@ -11,15 +12,80 @@ class LoguruInterceptHandler(logging.Handler): This handler bridges the standard Python logging module with Loguru, ensuring all logs (from both systems) use the same formatting. + Preserves custom 'extra' fields (used for structured logging). """ + # Standard fields populated on every ``logging.LogRecord`` by the stdlib + # logging module itself. They are NOT custom extras passed via + # ``logger.warning(..., extra={...})`` and forwarding them to loguru would + # both clobber loguru's own keys and pollute the structured log payload + # with redundant data. + # + # The set is the union of: + # * Attributes assigned in ``logging.LogRecord.__init__`` (CPython source: + # Lib/logging/__init__.py — ``LogRecord.__init__``). This covers + # ``name, msg, args, levelname, levelno, pathname, filename, module, + # exc_info, exc_text, stack_info, lineno, funcName, created, msecs, + # relativeCreated, thread, threadName, processName, process``. + # * ``taskName`` — added in Python 3.12 by ``LogRecord.__init__`` for + # asyncio task identification. Included unconditionally so the set + # stays correct on 3.10-3.11 (key just won't be present at runtime). + # * ``message`` — set lazily by ``LogRecord.getMessage()``; appears on + # ``record.__dict__`` after formatting. + # * ``asctime`` — set lazily by formatters that reference ``%(asctime)s``. + # + # Anything else in ``record.__dict__`` is, by stdlib convention, a custom + # extra passed via the ``extra={...}`` kwarg, and IS forwarded to loguru. + _STANDARD_FIELDS: ClassVar[set[str]] = { + "name", + "msg", + "args", + "created", + "msecs", + "levelname", + "levelno", + "pathname", + "filename", + "module", + "exc_info", + "exc_text", + "stack_info", + "lineno", + "funcName", + "relativeCreated", + "thread", + "threadName", + "processName", + "process", + "taskName", + "message", + "asctime", + } + def emit(self, record: logging.LogRecord) -> None: + # Resolve the loguru level: prefer the named level (DEBUG/INFO/WARNING/ + # ERROR/CRITICAL — all built into loguru), and for custom or otherwise + # unknown levels fall back to the integer ``levelno``. We pass an int + # (not ``str(levelno)``) because loguru only resolves stringified + # levels via name lookup — passing ``"42"`` raises ValueError, while + # passing the integer ``42`` is accepted directly per loguru's API. + level: str | int try: level = logger.level(record.levelname).name except ValueError: - level = str(record.levelno) + level = record.levelno - logger.opt(exception=record.exc_info).log(level, record.getMessage()) + # Extract custom extra fields from the log record. + # These are fields added via logger.warning(..., extra={...}). + # By preserving them and passing to loguru via bind(), they become + # part of the log record's extra dict, enabling Datadog faceting. + extras = {k: v for k, v in record.__dict__.items() if k not in self._STANDARD_FIELDS} + + # Bind extras to loguru context, then log the message. + # logger.bind() merges extras into the record["extra"] dict, + # making them available to sinks (e.g., Datadog, ELK) that support + # structured logging. + logger.bind(**extras).opt(exception=record.exc_info).log(level, record.getMessage()) def intercept_standard_logging() -> None: diff --git a/libs/arcade-mcp-server/arcade_mcp_server/server.py b/libs/arcade-mcp-server/arcade_mcp_server/server.py index 2c9770e6..22c63028 100644 --- a/libs/arcade-mcp-server/arcade_mcp_server/server.py +++ b/libs/arcade-mcp-server/arcade_mcp_server/server.py @@ -22,8 +22,9 @@ from typing import Any, Callable, cast from arcade_core.auth_tokens import get_valid_access_token from arcade_core.catalog import MaterializedTool, ToolCatalog from arcade_core.executor import ToolExecutor +from arcade_core.log_extras import build_tool_error_log_extra from arcade_core.network.org_transport import build_org_scoped_async_http_client -from arcade_core.schema import ToolAuthorizationContext, ToolContext +from arcade_core.schema import ToolAuthorizationContext, ToolCallError, ToolContext from arcade_core.schema import ToolAuthRequirement as CoreToolAuthRequirement from arcadepy import ArcadeError, AsyncArcade from arcadepy.types.auth_authorize_params import AuthRequirement, AuthRequirementOauth2 @@ -927,10 +928,21 @@ class MCPServer: ), ) else: - error = result.error or "Error calling tool" - content = convert_to_mcp_content(str(error)) + error = result.error + if error: + error_text = error.message + if error.additional_prompt_content: + error_text += f"\n\n{error.additional_prompt_content}" + content = convert_to_mcp_content(error_text) + self._log_tool_call_error(tool_name, error) + else: + content = convert_to_mcp_content("Error calling tool") self._tracker.track_tool_call(False, "error during tool execution") + # NOTE: structuredContent must be None on error responses. + # Per the MCP spec, structuredContent MUST validate against outputSchema — + # but error payloads will violate a tool's declared output schema. + # The error message is conveyed via ``content`` (TextContent) instead. return JSONRPCResponse( id=message.id, result=CallToolResult( @@ -980,6 +992,13 @@ class MCPServer: }, ) + def _log_tool_call_error(self, tool_name: str, error: ToolCallError) -> None: + """Emit a structured WARNING log for a failed tool call.""" + logger.warning( + f"Tool {tool_name} error: {error.message}", + extra=build_tool_error_log_extra(error, tool_name=tool_name), + ) + def _create_error_response( self, message: CallToolRequest, tool_response: dict[str, Any] ) -> JSONRPCResponse[CallToolResult]: diff --git a/libs/arcade-mcp-server/pyproject.toml b/libs/arcade-mcp-server/pyproject.toml index 41e30446..bebdffa2 100644 --- a/libs/arcade-mcp-server/pyproject.toml +++ b/libs/arcade-mcp-server/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "arcade-mcp-server" -version = "1.19.2" +version = "1.19.3" description = "Model Context Protocol (MCP) server framework for Arcade.dev" readme = "README.md" authors = [{ name = "Arcade.dev" }] diff --git a/libs/arcade-serve/arcade_serve/core/base.py b/libs/arcade-serve/arcade_serve/core/base.py index 5a5b8e7f..40b5dae5 100644 --- a/libs/arcade-serve/arcade_serve/core/base.py +++ b/libs/arcade-serve/arcade_serve/core/base.py @@ -6,6 +6,7 @@ from typing import Any, Callable, ClassVar from arcade_core.catalog import ToolCatalog, Toolkit from arcade_core.executor import ToolExecutor +from arcade_core.log_extras import build_tool_error_log_extra from arcade_core.schema import ( ToolCallRequest, ToolCallResponse, @@ -109,8 +110,9 @@ class BaseWorker(Worker): try: materialized_tool = self.catalog.get_tool(tool_fqname) except KeyError: + # Use resolved fqname components to match OTel span / metric labels. raise ValueError( - f"Tool {tool_fqname} not found in catalog with toolkit version {tool_request.tool.version}." + f"Tool {tool_fqname.name} not found in catalog with toolkit version {tool_fqname.toolkit_version}." ) start_time = time.time() @@ -127,7 +129,8 @@ class BaseWorker(Worker): ) execution_id = tool_request.execution_id or "" logger.info( - f"{execution_id} | Calling tool: {tool_fqname} version: {tool_request.tool.version}" + f"{execution_id} | Calling tool: {tool_fqname.name} " + f"version: {tool_fqname.toolkit_version}" ) logger.debug(f"{execution_id} | Tool inputs: {tool_request.inputs}") @@ -151,21 +154,31 @@ class BaseWorker(Worker): duration_ms = (end_time - start_time) * 1000 # Convert to milliseconds if output.error: - logger.warning( - f"{execution_id} | Tool {tool_fqname} version {tool_request.tool.version} failed" + log_extra = build_tool_error_log_extra( + output.error, + tool_name=str(tool_fqname.name), + toolkit_name=str(tool_fqname.toolkit_name), + toolkit_version=str(tool_fqname.toolkit_version), + execution_id=execution_id, ) - logger.warning(f"{execution_id} | Tool error: {output.error.message}") logger.warning( - f"{execution_id} | Tool developer message: {output.error.developer_message}" + f"{execution_id} | Tool {tool_fqname.name} version {tool_fqname.toolkit_version} failed: {output.error.message}", + extra=log_extra, ) + if output.error.developer_message: + logger.warning( + f"{execution_id} | Developer message: {output.error.developer_message}", + ) logger.debug( f"{execution_id} | duration: {duration_ms}ms | Tool output: {output.value}" ) if output.error.stacktrace: logger.debug(f"{execution_id} | Tool traceback: {output.error.stacktrace}") else: + # Match the failure-path identifiers for log correlation. logger.info( - f"{execution_id} | Tool {tool_fqname} version {tool_request.tool.version} success" + f"{execution_id} | Tool {tool_fqname.name} " + f"version {tool_fqname.toolkit_version} success" ) logger.debug( f"{execution_id} | duration: {duration_ms}ms | Tool output: {output.value}" diff --git a/libs/arcade-serve/pyproject.toml b/libs/arcade-serve/pyproject.toml index e72b95e8..b4082daa 100644 --- a/libs/arcade-serve/pyproject.toml +++ b/libs/arcade-serve/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "arcade-serve" -version = "3.2.2" +version = "3.2.3" description = "Arcade Serve - Serving infrastructure for Arcade tools and workers" readme = "README.md" license = {text = "MIT"} diff --git a/libs/arcade-tdk/arcade_tdk/tool.py b/libs/arcade-tdk/arcade_tdk/tool.py index 09f599e8..d553fde6 100644 --- a/libs/arcade-tdk/arcade_tdk/tool.py +++ b/libs/arcade-tdk/arcade_tdk/tool.py @@ -88,6 +88,22 @@ def _raise_as_arcade_error( Raises: ToolRuntimeError or some subclass thereof + + Data-leak policy (strict): + The fallback path below NEVER places ``str(exception)`` content into + the agent-facing ``message`` field. Tool authors commonly embed user + input in raised exception messages + (e.g. ``raise ValueError(f"Bad password: {password}")``) and the + framework cannot reliably distinguish secrets/PII from safe context. + + Resolution: + * ``message`` (agent-facing) — exception **type only**, no + ``str(exception)`` content. + * ``developer_message`` (server-side logs only — ``error_developer_message`` + Datadog facet, never returned to the client) — carries the + full ``f"{ExceptionType}: {str(exception)}"`` so on-call engineers + retain debugging context. Authorized log access is the security + boundary, not the agent transport. """ for adapter in adapter_chain: try: @@ -100,9 +116,17 @@ def _raise_as_arcade_error( if isinstance(mapped, ToolRuntimeError): raise mapped from exception + exc_type = type(exception).__name__ + exc_str = str(exception).strip() + # Agent-facing: type only — never str(exception). See "Data-leak policy". + message = f"An unhandled {exc_type} was raised by the tool." + # Server-side debugging: full content goes to logs only. + developer_message = ( + f"{exc_type}: {exc_str}" if exc_str else f"{exc_type} (no exception message)" + ) raise FatalToolError( - message=f"{exception!s}", - developer_message=f"{exception!s}", + message=message, + developer_message=developer_message, ) from exception diff --git a/libs/arcade-tdk/pyproject.toml b/libs/arcade-tdk/pyproject.toml index e0dcc61a..e14de148 100644 --- a/libs/arcade-tdk/pyproject.toml +++ b/libs/arcade-tdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "arcade-tdk" -version = "3.6.0" +version = "3.6.1" description = "Arcade TDK - Toolkit Development Kit for building Arcade tools" readme = "README.md" license = { text = "MIT" } diff --git a/libs/tests/arcade_mcp_server/test_logging_utils.py b/libs/tests/arcade_mcp_server/test_logging_utils.py new file mode 100644 index 00000000..b5c7364c --- /dev/null +++ b/libs/tests/arcade_mcp_server/test_logging_utils.py @@ -0,0 +1,140 @@ +"""Integration tests for ``LoguruInterceptHandler``. + +Exercises the full stdlib-logging → handler → loguru → sink path with a real +loguru sink (no mocks). The contract under test: + +* ``extra={"foo": "bar"}`` from a stdlib ``logger.warning`` call must reach + the loguru sink as ``record["extra"]["foo"] == "bar"`` so downstream + structured-log shippers (Datadog, ELK, etc.) can facet on it. +* Stdlib-internal LogRecord fields (``levelname``, ``pathname``, ``thread``, + ``message``, ``asctime``, …) must NOT be forwarded — that would both + clobber loguru's own keys and bloat the payload. +* Exception info must be preserved and rendered by loguru. +* The renderable message body (``record.getMessage()``) must reach the sink + intact. +""" + +from __future__ import annotations + +import logging + +import pytest +from arcade_mcp_server.logging_utils import LoguruInterceptHandler +from loguru import logger as loguru_logger + + +@pytest.fixture +def loguru_sink(): + """Capture every loguru record into a list and tear down at end.""" + captured: list[dict] = [] + + def sink(message): # message is loguru's "Message" wrapping a record dict + captured.append(dict(message.record)) + + sink_id = loguru_logger.add(sink, level="DEBUG", format="{message}") + yield captured + loguru_logger.remove(sink_id) + + +@pytest.fixture +def stdlib_logger_with_intercept(): + """A real stdlib logger wired through LoguruInterceptHandler — no mocks.""" + log = logging.getLogger("arcade_mcp_server.tests.intercept_integration") + log.handlers.clear() + log.addHandler(LoguruInterceptHandler()) + log.setLevel(logging.DEBUG) + log.propagate = False + yield log + log.handlers.clear() + + +def test_extra_fields_reach_loguru_sink(stdlib_logger_with_intercept, loguru_sink): + """The whole point of the handler: extras must survive the bridge.""" + stdlib_logger_with_intercept.warning( + "tool failure", + extra={ + "error_kind": "TOOL_RUNTIME_FATAL", + "tool_name": "MyToolkit.MyTool", + "error_status_code": 500, + }, + ) + + assert len(loguru_sink) == 1 + extra = loguru_sink[0]["extra"] + assert extra["error_kind"] == "TOOL_RUNTIME_FATAL" + assert extra["tool_name"] == "MyToolkit.MyTool" + assert extra["error_status_code"] == 500 + + +def test_stdlib_internal_fields_not_forwarded(stdlib_logger_with_intercept, loguru_sink): + """``record.__dict__`` carries dozens of stdlib-internal attributes + (``levelname``, ``pathname``, ``thread``, ``module``, …). Forwarding any + of them via ``logger.bind`` would clobber loguru's own equivalents and + pollute the structured payload.""" + stdlib_logger_with_intercept.warning("hello", extra={"my_extra": "kept"}) + + assert len(loguru_sink) == 1 + extra = loguru_sink[0]["extra"] + # The legitimate user-supplied extra survives. + assert extra.get("my_extra") == "kept" + # None of the stdlib internals leak into the bound extras. + for forbidden in ( + "levelname", + "levelno", + "pathname", + "filename", + "module", + "lineno", + "funcName", + "thread", + "threadName", + "processName", + "process", + "msg", + "args", + "exc_info", + "exc_text", + "stack_info", + "created", + "msecs", + "relativeCreated", + "name", + ): + assert forbidden not in extra, f"stdlib field {forbidden!r} leaked into loguru extra" + + +def test_message_body_preserved(stdlib_logger_with_intercept, loguru_sink): + """``record.getMessage()`` (which performs ``%`` interpolation of args) + must reach the sink — losing the renderable body would defeat the bridge.""" + stdlib_logger_with_intercept.warning("user %s did %s", "alice", "logout") + + assert loguru_sink[0]["message"] == "user alice did logout" + + +def test_exception_info_preserved(stdlib_logger_with_intercept, loguru_sink): + """``exc_info`` from ``logger.exception(...)`` must propagate so loguru + can render the traceback.""" + try: + raise ValueError("a deliberately raised test exception") + except ValueError: + stdlib_logger_with_intercept.exception("handled exception in test") + + assert len(loguru_sink) == 1 + exc = loguru_sink[0]["exception"] + assert exc is not None + # ValueError is a builtin so loguru's exception record carries the type. + assert exc.type is ValueError + assert "deliberately raised" in str(exc.value) + + +def test_unknown_level_falls_back_to_numeric(stdlib_logger_with_intercept, loguru_sink): + """A custom numeric level not registered with loguru must still be + accepted (handler falls back to ``str(record.levelno)`` when + ``logger.level(name)`` raises ValueError).""" + custom_level_no = 42 + logging.addLevelName(custom_level_no, "CUSTOM_TEST_LEVEL") + stdlib_logger_with_intercept.log(custom_level_no, "custom level emit") + + # The fallback path goes through ``logger.log(str(record.levelno), ...)``, + # which loguru accepts as a numeric level. The record reaches the sink. + assert any(r["message"] == "custom level emit" for r in loguru_sink) diff --git a/libs/tests/arcade_mcp_server/test_server.py b/libs/tests/arcade_mcp_server/test_server.py index 0b5387f6..49dfd8cc 100644 --- a/libs/tests/arcade_mcp_server/test_server.py +++ b/libs/tests/arcade_mcp_server/test_server.py @@ -4,16 +4,18 @@ import asyncio import contextlib import json from typing import Annotated -from unittest.mock import AsyncMock, Mock +from unittest.mock import AsyncMock, Mock, patch import pytest from arcade_core.auth import OAuth2 from arcade_core.catalog import MaterializedTool, ToolMeta, create_func_models -from arcade_core.errors import ToolRuntimeError +from arcade_core.errors import ErrorKind, ToolRuntimeError from arcade_core.schema import ( InputParameter, OAuth2Requirement, ToolAuthRequirement, + ToolCallError, + ToolCallOutput, ToolContext, ToolDefinition, ToolInput, @@ -1799,3 +1801,188 @@ class TestToolMetaExtensionEdgeCases: assert "skipped: tool not found" in caplog.text finally: await server.stop() + + +class TestToolErrorResponse: + """Test that tool error responses surface structured error data to agents.""" + + def _make_error_output( + self, + message: str = "Something went wrong", + developer_message: str | None = None, + additional_prompt_content: str | None = None, + kind: ErrorKind = ErrorKind.TOOL_RUNTIME_FATAL, + ) -> ToolCallOutput: + return ToolCallOutput( + error=ToolCallError( + message=message, + developer_message=developer_message, + additional_prompt_content=additional_prompt_content, + kind=kind, + ) + ) + + @pytest.mark.asyncio + async def test_tool_error_content_includes_additional_prompt(self, mcp_server): + error_output = self._make_error_output( + message="Spreadsheet not found", + additional_prompt_content="Available options: X, Y", + ) + message = CallToolRequest( + jsonrpc="2.0", + id=1, + method="tools/call", + params={"name": "TestToolkit.test_tool", "arguments": {"text": "test"}}, + ) + + with patch( + "arcade_mcp_server.server.ToolExecutor.run", + new_callable=AsyncMock, + return_value=error_output, + ): + response = await mcp_server._handle_call_tool(message) + + assert response.result.isError is True + text = response.result.content[0].text + assert "Available options: X, Y" in text + + @pytest.mark.asyncio + async def test_tool_error_structured_content_is_none(self, mcp_server): + """Per MCP spec, structuredContent must be None on error responses so + consumers don't attempt to validate an error payload against the tool's + declared outputSchema. Error details are conveyed via content text.""" + error_output = self._make_error_output(message="fail") + message = CallToolRequest( + jsonrpc="2.0", + id=1, + method="tools/call", + params={"name": "TestToolkit.test_tool", "arguments": {"text": "test"}}, + ) + + with patch( + "arcade_mcp_server.server.ToolExecutor.run", + new_callable=AsyncMock, + return_value=error_output, + ): + response = await mcp_server._handle_call_tool(message) + + assert response.result.isError is True + assert response.result.structuredContent is None + assert "fail" in response.result.content[0].text + + @pytest.mark.asyncio + async def test_tool_error_content_no_pydantic_repr(self, mcp_server): + error_output = self._make_error_output(message="fail") + message = CallToolRequest( + jsonrpc="2.0", + id=1, + method="tools/call", + params={"name": "TestToolkit.test_tool", "arguments": {"text": "test"}}, + ) + + with patch( + "arcade_mcp_server.server.ToolExecutor.run", + new_callable=AsyncMock, + return_value=error_output, + ): + response = await mcp_server._handle_call_tool(message) + + text = response.result.content[0].text + assert "kind= Annotated[str, "output"]: + """Tool with multiple required fields""" + return f"{name} is {age}" + + +catalog.add_tool(multi_field_tool, "MultiFieldToolkit") + + +@pytest.mark.asyncio +async def test_multiple_bad_fields_in_input_error(): + tool_definition = catalog.find_tool_by_func(multi_field_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + dummy_context = ToolContext() + + output = await ToolExecutor.run( + func=multi_field_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=dummy_context, + name=123, # wrong type + age="not_an_int", # wrong type + ) + + assert output.error is not None + assert "Invalid input:" in output.error.message + assert "name" in output.error.message + assert "age" in output.error.message + + +@pytest.mark.asyncio +async def test_input_validation_error_does_not_leak_input_values(): + """Pydantic's ``str(e)`` and ``err["input"]`` echo offending values back + verbatim — which may contain user secrets (passwords, tokens, PII). + Neither ``message`` (agent-facing) nor ``developer_message`` (Datadog log + facet) must contain the rejected input values.""" + tool_definition = catalog.find_tool_by_func(multi_field_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + dummy_context = ToolContext() + + secret = "SECRET_PASSWORD_DO_NOT_LEAK_42" + output = await ToolExecutor.run( + func=multi_field_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=dummy_context, + name=12345, # wrong type, ignored for the leak check + age=secret, # wrong type AND a "secret" value we want to ensure is not echoed + ) + + assert output.error is not None + # Field path + reason must be present (so agents can self-correct). + assert "age" in output.error.message + # But the actual rejected input value must NOT be anywhere in either field. + assert secret not in output.error.message + assert output.error.developer_message is not None + assert secret not in output.error.developer_message + # The integer wrong-type value also must not appear. + assert "12345" not in output.error.message + assert "12345" not in output.error.developer_message diff --git a/libs/tests/core/test_log_extras.py b/libs/tests/core/test_log_extras.py new file mode 100644 index 00000000..5b81c513 --- /dev/null +++ b/libs/tests/core/test_log_extras.py @@ -0,0 +1,126 @@ +"""Tests for ``build_tool_error_log_extra``. + +Locks the Datadog-facet contract shared by the MCP and worker transports. +Field names are load-bearing for dashboards. +""" + +import pytest +from arcade_core.errors import ErrorKind +from arcade_core.log_extras import build_tool_error_log_extra +from arcade_core.schema import ToolCallError + + +def _err(**kw): + defaults = { + "message": "Spreadsheet not found", + "kind": ErrorKind.TOOL_RUNTIME_FATAL, + "developer_message": None, + "status_code": 404, + "can_retry": False, + } + defaults.update(kw) + return ToolCallError(**defaults) + + +def test_canonical_field_contract(): + """Locks the canonical field names every Datadog dashboard depends on.""" + extra = build_tool_error_log_extra(_err(developer_message="dev: x"), tool_name="MyTool") + assert extra == { + "error_kind": "TOOL_RUNTIME_FATAL", + "error_message": "Spreadsheet not found", + "error_developer_message": "dev: x", + "error_status_code": 404, + "error_can_retry": False, + "tool_name": "MyTool", + } + + +def test_kind_enum_value_used_not_repr(): + """``error_kind`` must be the stable string code (Datadog facets on + ``"TOOL_RUNTIME_FATAL"``, NOT the Python repr ``"ErrorKind.TOOL_RUNTIME_FATAL"``).""" + extra = build_tool_error_log_extra( + _err(kind=ErrorKind.UPSTREAM_RUNTIME_RATE_LIMIT), tool_name="t" + ) + assert extra["error_kind"] == "UPSTREAM_RUNTIME_RATE_LIMIT" + assert "ErrorKind." not in extra["error_kind"] + + +def test_kind_string_fallback_when_not_enum(): + """If something passes a raw string in ``kind`` (not a ToolCallError that + would normalize it), the helper still produces a sensible value.""" + + class FakeError: + kind = "CUSTOM_KIND" + message = "x" + developer_message = None + status_code = None + can_retry = False + + extra = build_tool_error_log_extra(FakeError(), tool_name="t") + assert extra["error_kind"] == "CUSTOM_KIND" + + +def test_optional_toolkit_fields_omitted_by_default(): + """Optional toolkit fields are only present when supplied — Datadog can + distinguish 'not set' from 'set to None'.""" + extra = build_tool_error_log_extra(_err(), tool_name="MyTool") + assert "toolkit_name" not in extra + assert "toolkit_version" not in extra + + +def test_optional_toolkit_fields_included_when_supplied(): + extra = build_tool_error_log_extra( + _err(), + tool_name="MyTool", + toolkit_name="MyKit", + toolkit_version="1.2.3", + ) + assert extra["toolkit_name"] == "MyKit" + assert extra["toolkit_version"] == "1.2.3" + + +def test_additional_extras_merged(): + """Worker passes ``execution_id``; MCP server may add others. The helper + accepts arbitrary additions via **kwargs.""" + extra = build_tool_error_log_extra( + _err(), + tool_name="t", + execution_id="exec_42", + ) + assert extra["execution_id"] == "exec_42" + + +def test_additional_extras_cannot_clobber_canonical_fields(): + """A caller passing a key that collides with the canonical contract must + NOT be able to override the canonical value — that would let one call site + silently change the Datadog facet for every dashboard.""" + extra = build_tool_error_log_extra( + _err(message="canonical"), + tool_name="t", + error_message="OVERRIDE_ATTEMPT", + tool_name_override="ignored too", + ) + assert extra["error_message"] == "canonical" + # Non-canonical-name additions still pass through. + assert extra["tool_name_override"] == "ignored too" + + +def test_developer_message_none_propagates(): + """``error_developer_message`` is present and explicitly None when the + error has no developer_message — Datadog needs to distinguish 'unset' + from 'set to None' on this field specifically.""" + extra = build_tool_error_log_extra(_err(developer_message=None), tool_name="t") + assert "error_developer_message" in extra + assert extra["error_developer_message"] is None + + +def test_status_code_none_propagates(): + extra = build_tool_error_log_extra(_err(status_code=None), tool_name="t") + assert "error_status_code" in extra + assert extra["error_status_code"] is None + + +@pytest.mark.parametrize("can_retry", [True, False]) +def test_can_retry_propagates_as_bool(can_retry): + extra = build_tool_error_log_extra(_err(can_retry=can_retry), tool_name="t") + assert extra["error_can_retry"] is can_retry diff --git a/libs/tests/core/test_output.py b/libs/tests/core/test_output.py index 7b390144..d3680825 100644 --- a/libs/tests/core/test_output.py +++ b/libs/tests/core/test_output.py @@ -108,6 +108,36 @@ def test_fail(output_factory, message, developer_message): assert output.error.can_retry is False +def test_fail_empty_message_gets_default(output_factory): + output = output_factory.fail(message="") + assert output.error is not None + assert output.error.message == "Unspecified error during tool execution" + + +def test_fail_whitespace_message_gets_default(output_factory): + output = output_factory.fail(message=" ") + assert output.error is not None + assert output.error.message == "Unspecified error during tool execution" + + +def test_fail_nonempty_message_unchanged(output_factory): + output = output_factory.fail(message="real error") + assert output.error is not None + assert output.error.message == "real error" + + +def test_fail_retry_empty_message_gets_default(output_factory): + output = output_factory.fail_retry(message="") + assert output.error is not None + assert output.error.message == "Unspecified error during tool execution" + + +def test_fail_retry_whitespace_message_gets_default(output_factory): + output = output_factory.fail_retry(message=" ") + assert output.error is not None + assert output.error.message == "Unspecified error during tool execution" + + @pytest.mark.parametrize( "message, developer_message, additional_prompt_content, retry_after_ms", [ diff --git a/libs/tests/tool/test_error_fallback.py b/libs/tests/tool/test_error_fallback.py new file mode 100644 index 00000000..1aa99f22 --- /dev/null +++ b/libs/tests/tool/test_error_fallback.py @@ -0,0 +1,141 @@ +from typing import Annotated + +import pytest +from arcade_core.catalog import ToolCatalog +from arcade_core.executor import ToolExecutor +from arcade_core.schema import ToolContext +from arcade_tdk import tool + + +catalog = ToolCatalog() + + +@tool +def keyerror_tool() -> Annotated[str, "output"]: + """Tool that raises a bare KeyError""" + raise KeyError("x") + + +@tool +def empty_exception_tool() -> Annotated[str, "output"]: + """Tool that raises an exception with no message""" + raise Exception() + + +@tool +def fallback_with_secret_in_exception_tool() -> Annotated[str, "output"]: + """Tool that raises an exception whose ``str()`` contains a fake secret. + + Used to verify the strict data-leak policy: the secret must NEVER appear + in the agent-facing ``message``. It is allowed in ``developer_message``, + which is server-side only (Datadog facet, never returned to the MCP client). + """ + raise RuntimeError("Bad credentials for alice: password=hunter2_SECRET_PII") + + +catalog.add_tool(keyerror_tool, "ErrorFallbackToolkit") +catalog.add_tool(empty_exception_tool, "ErrorFallbackToolkit") +catalog.add_tool(fallback_with_secret_in_exception_tool, "ErrorFallbackToolkit") + + +@pytest.mark.asyncio +async def test_fallback_message_contains_type_but_not_exception_string(): + """The agent-facing ``message`` must include the exception **type** + (so agents know what class of failure occurred) but must NOT include + ``str(exception)`` content — that's what the data-leak policy reserves + for ``developer_message``.""" + tool_definition = catalog.find_tool_by_func(keyerror_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + + output = await ToolExecutor.run( + func=keyerror_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=ToolContext(), + ) + + assert output.error is not None + # Type IS present (safe: class names are source-defined). + assert "KeyError" in output.error.message + # The hint to use FatalToolError is present. + assert "FatalToolError" in output.error.message + # The exception string content (``str(KeyError('x'))`` == ``"'x'"``) is NOT. + # Stricter: no quoted argument representation should appear. + assert "'x'" not in output.error.message + + +@pytest.mark.asyncio +async def test_fallback_message_never_leaks_exception_str_content(): + """Strict data-leak policy: even if the tool author embeds secrets in + ``str(exception)``, the agent-facing ``message`` MUST NOT contain them. + The full content is preserved in ``developer_message`` (server-side + logs only) so on-call engineers retain debugging context.""" + tool_definition = catalog.find_tool_by_func(fallback_with_secret_in_exception_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + + output = await ToolExecutor.run( + func=fallback_with_secret_in_exception_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=ToolContext(), + ) + + assert output.error is not None + + # ── Agent-facing channel: nothing from str(exception) leaks ── + assert "hunter2_SECRET_PII" not in output.error.message + assert "alice" not in output.error.message + assert "password=" not in output.error.message + assert "Bad credentials" not in output.error.message + # Type still present so the agent knows what failed. + assert "RuntimeError" in output.error.message + + # ── Server-side log channel: full content preserved for debugging ── + assert output.error.developer_message is not None + assert "hunter2_SECRET_PII" in output.error.developer_message + assert "RuntimeError" in output.error.developer_message + + +@pytest.mark.asyncio +async def test_fallback_developer_message_carries_full_exception_content(): + """``developer_message`` (server-side log only) is where verbose exception + content lives — type + str(exception) — so engineers can debug.""" + tool_definition = catalog.find_tool_by_func(keyerror_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + + output = await ToolExecutor.run( + func=keyerror_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=ToolContext(), + ) + + assert output.error is not None + assert output.error.developer_message is not None + assert "KeyError" in output.error.developer_message + # The KeyError argument IS present in developer_message (vs absent in message). + assert "'x'" in output.error.developer_message + + +@pytest.mark.asyncio +async def test_fallback_empty_exception_developer_message_marks_no_details(): + """Empty exceptions (``raise Exception()``) get a ``(no exception message)`` + marker in ``developer_message`` so on-call engineers can distinguish + 'nothing to log' from 'log was lost'.""" + tool_definition = catalog.find_tool_by_func(empty_exception_tool) + full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name()) + + output = await ToolExecutor.run( + func=empty_exception_tool, + definition=tool_definition, + input_model=full_tool.input_model, + output_model=full_tool.output_model, + context=ToolContext(), + ) + + assert output.error is not None + assert output.error.developer_message is not None + assert "Exception (no exception message)" in output.error.developer_message diff --git a/libs/tests/worker/test_worker_base.py b/libs/tests/worker/test_worker_base.py index f17cd99e..fafe738c 100644 --- a/libs/tests/worker/test_worker_base.py +++ b/libs/tests/worker/test_worker_base.py @@ -124,6 +124,52 @@ async def test_call_tool_success(base_worker_no_auth): assert response.output.value == 8 assert response.output.error is None assert response.execution_id == "test_exec_id" + + +@pytest.mark.asyncio +async def test_call_tool_success_and_error_logs_use_same_tool_identifiers( + base_worker_no_auth, caplog +): + """Success and error log lines must use identical tool identifier strings + so logs can be correlated with a single grep pattern.""" + import logging + + base_worker_no_auth.register_tool(sample_tool, toolkit_name="test_kit") + base_worker_no_auth.register_tool(error_tool, toolkit_name="test_kit") + + success_req = ToolCallRequest( + execution_id="exec_consistency_ok", + tool=ToolReference(toolkit="TestKit", name="SampleTool"), + inputs={"a": 1, "b": 2}, + ) + error_req = ToolCallRequest( + execution_id="exec_consistency_err", + tool=ToolReference(toolkit="TestKit", name="ErrorTool"), + inputs={}, + ) + + with caplog.at_level(logging.DEBUG, logger="arcade_serve.core.base"): + await base_worker_no_auth.call_tool(success_req) + await base_worker_no_auth.call_tool(error_req) + + success_line = next( + r for r in caplog.records if "exec_consistency_ok" in r.getMessage() and "success" in r.getMessage() + ) + error_line = next( + r for r in caplog.records if "exec_consistency_err" in r.getMessage() and "failed:" in r.getMessage() + ) + # Both must use the bare tool name (".name"), NOT the full ``Toolkit.Tool`` fqname. + assert "Tool SampleTool " in success_line.getMessage() + assert "Tool ErrorTool " in error_line.getMessage() + # Neither line should contain the full-fqname form ``TestKit.SampleTool``. + assert "TestKit.SampleTool" not in success_line.getMessage() + assert "TestKit.ErrorTool" not in error_line.getMessage() + # Both must use the same "version " word — proves the same source + # (``tool_fqname.toolkit_version``) is read on both paths. + assert "version " in success_line.getMessage() + assert "version " in error_line.getMessage() + + @pytest.mark.asyncio async def test_call_tool_execution_error(base_worker_no_auth): # Tool is now defined at module level @@ -147,6 +193,65 @@ async def test_call_tool_execution_error(base_worker_no_auth): assert response.output.error is not None +@pytest.mark.asyncio +async def test_call_tool_error_log_text_matches_structured_extras(base_worker_no_auth, caplog): + """The primary failure warning's f-string must use the same resolved + ``tool_fqname.name`` / ``tool_fqname.toolkit_version`` values that + ``log_extra`` exposes — otherwise the human-readable text and the + Datadog facets disagree on which tool/version produced the error. + Previously the f-string used ``tool_request.tool.version`` (the *requested* + version, often ``None``) while the extras used the resolved version.""" + base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit") + tool_request = ToolCallRequest( + execution_id="exec_log_check", + tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"), + inputs={}, + ) + + with caplog.at_level("WARNING", logger="arcade_serve.core.base"): + await base_worker_no_auth.call_tool(tool_request) + + primary = next( + r for r in caplog.records if "exec_log_check" in r.getMessage() and "failed:" in r.getMessage() + ) + # Text and structured extra must agree on name + version. + assert "Tool ErrorTool " in primary.getMessage() + assert getattr(primary, "tool_name", None) == "ErrorTool" + extra_version = getattr(primary, "toolkit_version", None) + assert f"version {extra_version}" in primary.getMessage() + + +@pytest.mark.asyncio +async def test_call_tool_error_secondary_log_carries_full_exception_content( + base_worker_no_auth, caplog +): + """Under the strict data-leak policy, the @tool fallback puts the verbose + ``str(exception)`` content into ``developer_message`` (server-side only, + never returned to the MCP client). The secondary ``"Developer message: ..."`` + warning must therefore fire and carry that full content so on-call + engineers retain debugging context — the channel where leakage WOULD + matter (agent-facing ``message``) is covered by the dedicated leak tests + in ``libs/tests/tool/test_error_fallback.py``.""" + base_worker_no_auth.register_tool(error_tool, toolkit_name="error_kit") + tool_request = ToolCallRequest( + execution_id="exec_dev_msg", + tool=ToolReference(toolkit="ErrorKit", name="ErrorTool"), + inputs={}, + ) + + with caplog.at_level("WARNING", logger="arcade_serve.core.base"): + await base_worker_no_auth.call_tool(tool_request) + + secondary = [ + r for r in caplog.records + if "exec_dev_msg" in r.getMessage() and "Developer message:" in r.getMessage() + ] + assert len(secondary) == 1, "secondary 'Developer message:' log should fire once" + # The full exception content is in the secondary log (and in Datadog facets). + assert "ValueError" in secondary[0].getMessage() + assert "Something went wrong" in secondary[0].getMessage() + + @pytest.mark.asyncio async def test_call_tool_not_found(base_worker_no_auth): # Use ToolReference without version for lookup consistency