## Summary - Improve tool call error messages across 4 libraries (arcade-core, arcade-tdk, arcade-mcp-server, arcade-serve) so agents can self-correct and Datadog can facet on structured fields - Guard empty error messages, enrich input validation errors with field-level detail, fix `@tool` decorator fallback formatting, surface `additional_prompt_content` in MCP responses, and add structured log extras for Datadog - Addresses the 3 worst error patterns: generic "Error in tool input deserialization", bare `KeyError` values, and empty `FatalToolError` messages **Linear:** TOO-627 **Plan:** `docs/plans/2026-04-08-improve-error-messages-handoff.md` ## Tasks - [ ] Task 1: Guard empty error messages (arcade-core) - [ ] Task 2: Enrich input validation error messages (arcade-core) - [ ] Task 3: Improve `@tool` decorator error fallback (arcade-tdk) - [ ] Task 4: Fix MCP agent-facing error response (arcade-mcp-server) - [ ] Task 5: Add structured log extras in BaseWorker (arcade-serve) - [ ] Task 6: Add structured log extras in MCP server (arcade-mcp-server) ## Test plan - [ ] Each task has dedicated unit tests verifying the new behavior - [ ] `make test` passes after all tasks - [ ] `make check` (ruff + mypy) passes - [ ] Verify the 3 worst error patterns now produce actionable messages 🤖 Generated with [Claude Code](https://claude.com/claude-code) <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Medium Risk** > Touches cross-library error formatting and logging behavior used in production tool execution paths; while mostly additive/guardrails, it changes agent-visible messages and Datadog log facets, which could impact client expectations and alerting. > > **Overview** > Improves tool-call error handling across core/runtime, MCP transport, worker transport, and the TDK to make agent-visible failures more actionable while *reducing sensitive-data leakage*. 
> > In `arcade-core`, empty error messages now get placeholders, `ToolOutputFactory.fail*` defaults blank messages, and input validation errors are rewritten as field-level summaries that intentionally omit rejected values (avoiding Pydantic echo of secrets). The `@tool` fallback in `arcade-tdk` no longer surfaces `str(exception)` to agents; it returns exception *type-only* in `message` while preserving full detail in `developer_message`. > > Adds a shared `build_tool_error_log_extra` helper and updates `arcade-serve` + `arcade-mcp-server` to emit consistent structured WARNING logs (`error_*`, `tool_name`, optional toolkit/version) for Datadog, while MCP error responses now append `additional_prompt_content` and force `structuredContent=None` on failures per spec. Includes extensive new tests and bumps package versions (`arcade-core` 4.6.2, `arcade-tdk` 3.6.1, `arcade-mcp-server` 1.19.3, `arcade-serve` 3.2.3). > > <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit e5c7ebcaf56176cfbd8e6d1f2b6295352abd0ec0. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup> <!-- /CURSOR_SUMMARY --> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
437 lines
15 KiB
Python
437 lines
15 KiB
Python
from typing import Annotated
|
|
|
|
import pytest
|
|
from arcade_core.catalog import ToolCatalog
|
|
from arcade_core.errors import (
|
|
ContextRequiredToolError,
|
|
ErrorKind,
|
|
ToolRuntimeError,
|
|
UpstreamError,
|
|
UpstreamRateLimitError,
|
|
)
|
|
from arcade_core.executor import ToolExecutor
|
|
from arcade_core.schema import ToolCallError, ToolCallLog, ToolCallOutput, ToolContext
|
|
from arcade_tdk import tool
|
|
from arcade_tdk.errors import (
|
|
RetryableToolError,
|
|
ToolExecutionError,
|
|
)
|
|
from typing_extensions import TypedDict
|
|
|
|
|
|
# Happy-path fixture: echoes its single string argument unchanged.
@tool
def simple_tool(inp: Annotated[str, "input"]) -> Annotated[str, "output"]:
    """Simple tool"""
    # No transformation on purpose, so tests can assert output == input.
    return inp
|
|
|
|
|
|
# Same echo behavior as ``simple_tool``, additionally wrapped with
# ``tool.deprecated`` so the executor tests can check the deprecation log.
@tool.deprecated("Use simple_tool instead")
@tool
def simple_deprecated_tool(inp: Annotated[str, "input"]) -> Annotated[str, "output"]:
    """Simple tool that is deprecated"""
    return inp
|
|
|
|
|
|
# Failure fixture: always raises ``RetryableToolError``.
@tool
def retryable_error_tool() -> Annotated[str, "output"]:
    """Tool that raises a retryable error"""
    # Positional arguments are passed exactly as the parametrized test expects
    # them to surface on the resulting error.
    raise RetryableToolError(
        "test",
        "test developer message",
        "additional prompt content",
        1000,
    )
|
|
|
|
|
|
# Failure fixture: always raises ``ToolExecutionError``.
@tool
def tool_execution_error_tool() -> Annotated[str, "output"]:
    """Tool that raises an error"""
    raise ToolExecutionError(
        "test",
        "test developer message",
    )
|
|
|
|
|
|
# Failure fixture: raises a plain built-in exception rather than one of the
# arcade error types.
@tool
def unexpected_error_tool() -> Annotated[str, "output"]:
    """Tool that raises an unexpected error"""
    raise RuntimeError("test")
|
|
|
|
|
|
# Failure fixture: always raises ``ContextRequiredToolError`` carrying
# ``additional_prompt_content``.
@tool
def context_required_error_tool() -> Annotated[str, "output"]:
    """Tool that raises a context required error"""
    clarification = "need the user to clarify something"
    raise ContextRequiredToolError("test", additional_prompt_content=clarification)
|
|
|
|
|
|
# Failure fixture: always raises ``UpstreamError`` with status code 400.
@tool
def upstream_error_tool() -> Annotated[str, "output"]:
    """Tool that raises an upstream error"""
    # TODO: or test raising a httpx error? Do these types of tests belong in adapter tests?
    raise UpstreamError(
        "test",
        status_code=400,
    )
|
|
|
|
|
|
# Failure fixture: always raises ``UpstreamRateLimitError``.
@tool
def upstream_ratelimit_error_tool() -> Annotated[str, "output"]:
    """Tool that raises an upstream error"""
    # TODO: or test raising a httpx error? Do these types of tests belong in adapter tests?
    raise UpstreamRateLimitError(
        "test",
        1000,
    )
|
|
|
|
|
|
# Failure fixture: always raises the base ``ToolRuntimeError``.
@tool
def tool_runtime_error_tool() -> Annotated[str, "output"]:
    """Tool that raises a tool runtime error"""
    raise ToolRuntimeError(
        "test",
        "test developer message",
    )
|
|
|
|
|
|
# Failure fixture: the annotation promises ``str`` but a dict is returned.
# The mismatch is deliberate; it backs the "bad_output_type" test case.
@tool
def bad_output_error_tool() -> Annotated[str, "output"]:
    """tool that returns a bad output type"""
    wrong_type_payload = {"output": "test"}
    return wrong_type_payload
|
|
|
|
|
|
# TypedDict output tools
|
|
class ResultDict(TypedDict):
    """Result dictionary."""

    # Three-key structured payload returned by the TypedDict output fixtures.
    status: str
    count: int
    items: list[str]
|
|
|
|
|
|
# Output fixture: returns a single ``ResultDict``.
@tool
def typeddict_output_tool() -> Annotated[ResultDict, "Returns a TypedDict"]:
    """Tool that returns a TypedDict."""
    payload = ResultDict(
        status="success",
        count=3,
        items=["a", "b", "c"],
    )
    return payload
|
|
|
|
|
|
# Output fixture: returns a list of two ``ResultDict`` entries.
@tool
def list_typeddict_output_tool() -> Annotated[list[ResultDict], "Returns list of TypedDict"]:
    """Tool that returns a list of TypedDict."""
    first_entry = ResultDict(status="first", count=1, items=["x"])
    second_entry = ResultDict(status="second", count=2, items=["y", "z"])
    return [first_entry, second_entry]
|
|
|
|
|
|
# Output fixture: returns an untyped dict that includes a nested dict.
@tool
def dict_output_tool() -> Annotated[dict, "Returns a plain dict"]:
    """Tool that returns a plain dict."""
    inner = {"inner": "data"}
    return {
        "key": "value",
        "number": 42,
        "nested": inner,
    }
|
|
|
|
|
|
# ---- Test Driver ----
|
|
# Every fixture tool exercised by ``test_tool_executor``, grouped by purpose.
tools = [
    # successful execution
    simple_tool,
    simple_deprecated_tool,
    # tools that raise
    retryable_error_tool,
    tool_execution_error_tool,
    unexpected_error_tool,
    context_required_error_tool,
    upstream_error_tool,
    upstream_ratelimit_error_tool,
    tool_runtime_error_tool,
    # output (de)serialization
    bad_output_error_tool,
    typeddict_output_tool,
    list_typeddict_output_tool,
    dict_output_tool,
]

# Module-level catalog shared by the tests below; all fixtures above are
# registered under the same toolkit name.
catalog = ToolCatalog()
for tool_func in tools:
    catalog.add_tool(tool_func, "simple_toolkit")
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "tool_func, inputs, expected_output",
    [
        pytest.param(
            simple_tool,
            {"inp": "test"},
            ToolCallOutput(value="test"),
            id="simple_tool",
        ),
        pytest.param(
            simple_deprecated_tool,
            {"inp": "test"},
            ToolCallOutput(
                value="test",
                logs=[
                    ToolCallLog(
                        message="Use simple_tool instead",
                        level="warning",
                        subtype="deprecation",
                    )
                ],
            ),
            id="simple_deprecated_tool",
        ),
        pytest.param(
            retryable_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_RETRY] RetryableToolError during execution of tool 'retryable_error_tool': test",
                    kind=ErrorKind.TOOL_RUNTIME_RETRY,
                    developer_message="[TOOL_RUNTIME_RETRY] RetryableToolError during execution of tool 'retryable_error_tool': test developer message",
                    additional_prompt_content="additional prompt content",
                    retry_after_ms=1000,
                    can_retry=True,
                )
            ),
            id="retryable_error_tool",
        ),
        pytest.param(
            tool_execution_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_FATAL] ToolExecutionError during execution of tool 'tool_execution_error_tool': test",
                    kind=ErrorKind.TOOL_RUNTIME_FATAL,
                    developer_message="[TOOL_RUNTIME_FATAL] ToolExecutionError during execution of tool 'tool_execution_error_tool': test developer message",
                    can_retry=False,
                )
            ),
            id="exec_error_tool",
        ),
        pytest.param(
            unexpected_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message=(
                        "[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
                        "'unexpected_error_tool': An unhandled RuntimeError was raised by the tool."
                    ),
                    kind=ErrorKind.TOOL_RUNTIME_FATAL,
                    developer_message=(
                        "[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
                        "'unexpected_error_tool': RuntimeError: test"
                    ),
                    can_retry=False,
                    status_code=500,
                )
            ),
            id="unexpected_error_tool",
        ),
        pytest.param(
            simple_tool,
            {"inp": {"test": "test"}},  # takes in a string not a dict
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_BAD_INPUT_VALUE] ToolInputError during execution of tool 'simple_tool': Invalid input: inp:",
                    kind=ErrorKind.TOOL_RUNTIME_BAD_INPUT_VALUE,
                    status_code=400,
                    developer_message=None,  # can't guarantee this will be the same
                )
            ),
            id="invalid_input_type",
        ),
        pytest.param(
            context_required_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_CONTEXT_REQUIRED] ContextRequiredToolError during execution of tool 'context_required_error_tool': test",
                    kind=ErrorKind.TOOL_RUNTIME_CONTEXT_REQUIRED,
                    developer_message=None,
                    additional_prompt_content="need the user to clarify something",
                )
            ),
            id="context_required_error_tool",
        ),
        pytest.param(
            upstream_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[UPSTREAM_RUNTIME_BAD_REQUEST] UpstreamError during execution of tool 'upstream_error_tool': test",
                    kind=ErrorKind.UPSTREAM_RUNTIME_BAD_REQUEST,
                    status_code=400,
                    developer_message=None,
                )
            ),
            id="upstream_error_tool",
        ),
        pytest.param(
            upstream_ratelimit_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[UPSTREAM_RUNTIME_RATE_LIMIT] UpstreamRateLimitError during execution of tool 'upstream_ratelimit_error_tool': test",
                    kind=ErrorKind.UPSTREAM_RUNTIME_RATE_LIMIT,
                    status_code=429,
                    developer_message=None,
                    retry_after_ms=1000,
                    can_retry=True,
                )
            ),
            id="upstream_ratelimit_error_tool",
        ),
        pytest.param(
            tool_runtime_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_FATAL] ToolRuntimeError during execution of tool 'tool_runtime_error_tool': test",
                    kind=ErrorKind.TOOL_RUNTIME_FATAL,
                    developer_message="[TOOL_RUNTIME_FATAL] ToolRuntimeError during execution of tool 'tool_runtime_error_tool': test developer message",
                    can_retry=False,
                )
            ),
            id="tool_runtime_error_tool",
        ),
        pytest.param(
            bad_output_error_tool,
            {},
            ToolCallOutput(
                error=ToolCallError(
                    message="[TOOL_RUNTIME_BAD_OUTPUT_VALUE] ToolOutputError during execution of tool 'bad_output_error_tool': Failed to serialize tool output",
                    kind=ErrorKind.TOOL_RUNTIME_BAD_OUTPUT_VALUE,
                    status_code=500,
                    developer_message=None,  # can't guarantee this will be the same
                )
            ),
            id="bad_output_type",
        ),
        pytest.param(
            typeddict_output_tool,
            {},
            ToolCallOutput(value={"status": "success", "count": 3, "items": ["a", "b", "c"]}),
            id="typeddict_output",
        ),
        pytest.param(
            list_typeddict_output_tool,
            {},
            ToolCallOutput(
                value=[
                    {"status": "first", "count": 1, "items": ["x"]},
                    {"status": "second", "count": 2, "items": ["y", "z"]},
                ]
            ),
            id="list_typeddict_output",
        ),
        pytest.param(
            dict_output_tool,
            {},
            ToolCallOutput(value={"key": "value", "number": 42, "nested": {"inner": "data"}}),
            id="dict_output",
        ),
    ],
)
async def test_tool_executor(tool_func, inputs, expected_output):
    """Run one fixture tool through ``ToolExecutor.run`` and compare the result.

    Each parametrized case supplies the tool function, the keyword inputs to
    call it with, and the ``ToolCallOutput`` the executor is expected to
    produce. The comparison itself is delegated to ``check_output``.
    """
    # Look the tool up in the shared module-level catalog to obtain its
    # definition and the input/output models the executor needs.
    definition = catalog.find_tool_by_func(tool_func)
    materialized = catalog.get_tool(definition.get_fully_qualified_name())

    result = await ToolExecutor.run(
        func=tool_func,
        definition=definition,
        input_model=materialized.input_model,
        output_model=materialized.output_model,
        context=ToolContext(),
        **inputs,
    )

    check_output(result, expected_output)
|
|
|
|
|
|
def check_output_error(output_error: ToolCallError, expected_error: ToolCallError):
    """Assert that ``output_error`` matches ``expected_error`` field by field.

    Comparison rules:
      * ``message`` must be equal, except when the expected message contains
        ``"Invalid input:"``; then it is treated as a required prefix.
      * ``developer_message`` and ``stacktrace`` are compared only when the
        expected error provides a truthy value for them.
      * ``kind``, ``can_retry``, ``additional_prompt_content``,
        ``retry_after_ms``, ``status_code`` and ``extra`` must always be equal.
    """
    actual_msg = output_error.message
    wanted_msg = expected_error.message

    if "Invalid input:" not in wanted_msg:
        assert actual_msg == wanted_msg, "message mismatch"
    else:
        # Only the prefix is pinned for input-validation errors.
        assert actual_msg.startswith(
            wanted_msg
        ), f"message mismatch: {actual_msg!r} does not start with {wanted_msg!r}"

    assert output_error.kind == expected_error.kind, "kind mismatch"

    wanted_dev_msg = expected_error.developer_message
    if wanted_dev_msg:
        assert output_error.developer_message == wanted_dev_msg, "developer message mismatch"

    assert output_error.can_retry == expected_error.can_retry, "can retry mismatch"
    assert (
        output_error.additional_prompt_content == expected_error.additional_prompt_content
    ), "additional prompt content mismatch"
    assert output_error.retry_after_ms == expected_error.retry_after_ms, "retry after ms mismatch"

    wanted_trace = expected_error.stacktrace
    if wanted_trace:
        assert output_error.stacktrace == wanted_trace, "stacktrace mismatch"

    assert output_error.status_code == expected_error.status_code, "status code mismatch"
    assert output_error.extra == expected_error.extra, "extra mismatch"
|
|
|
|
|
|
def check_output(output: ToolCallOutput, expected_output: ToolCallOutput):
|
|
# error in ToolCallOutput
|
|
if output.error:
|
|
check_output_error(output.error, expected_output.error)
|
|
|
|
# normal tool execution
|
|
else:
|
|
assert output.value == expected_output.value
|
|
|
|
# check logs
|
|
output_logs = output.logs or []
|
|
expected_logs = expected_output.logs or []
|
|
assert len(output_logs) == len(expected_logs)
|
|
for output_log, expected_log in zip(output_logs, expected_logs, strict=False):
|
|
assert output_log.message == expected_log.message
|
|
assert output_log.level == expected_log.level
|
|
assert output_log.subtype == expected_log.subtype
|
|
|
|
|
|
# Fixture with two required, differently-typed parameters, used by the
# input-validation tests below.
@tool
def multi_field_tool(
    name: Annotated[str, "a name"],
    age: Annotated[int, "an age"],
) -> Annotated[str, "output"]:
    """Tool with multiple required fields"""
    return f"{name} is {age}"


# Registered in the shared module-level catalog under its own toolkit name.
catalog.add_tool(multi_field_tool, "MultiFieldToolkit")
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_multiple_bad_fields_in_input_error():
    """An input error with two wrongly-typed fields must mention both fields."""
    definition = catalog.find_tool_by_func(multi_field_tool)
    materialized = catalog.get_tool(definition.get_fully_qualified_name())

    result = await ToolExecutor.run(
        func=multi_field_tool,
        definition=definition,
        input_model=materialized.input_model,
        output_model=materialized.output_model,
        context=ToolContext(),
        name=123,  # wrong type
        age="not_an_int",  # wrong type
    )

    assert result.error is not None
    agent_message = result.error.message
    assert "Invalid input:" in agent_message
    assert "name" in agent_message
    assert "age" in agent_message
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_input_validation_error_does_not_leak_input_values():
    """Rejected input values must never be echoed back in the error.

    Pydantic's ``str(e)`` and ``err["input"]`` repeat offending values
    verbatim, and those values may be user secrets (passwords, tokens, PII).
    Neither ``message`` (agent-facing) nor ``developer_message`` (Datadog log
    facet) may contain the rejected input values.
    """
    definition = catalog.find_tool_by_func(multi_field_tool)
    materialized = catalog.get_tool(definition.get_fully_qualified_name())

    secret = "SECRET_PASSWORD_DO_NOT_LEAK_42"
    result = await ToolExecutor.run(
        func=multi_field_tool,
        definition=definition,
        input_model=materialized.input_model,
        output_model=materialized.output_model,
        context=ToolContext(),
        name=12345,  # wrong type; its digits must not be echoed either
        age=secret,  # wrong type AND a "secret" value we want to ensure is not echoed
    )

    failure = result.error
    assert failure is not None

    # Field path + reason must be present (so agents can self-correct).
    assert "age" in failure.message

    # But the actual rejected input value must NOT be anywhere in either field.
    assert secret not in failure.message
    assert failure.developer_message is not None
    assert secret not in failure.developer_message

    # The integer wrong-type value also must not appear.
    assert "12345" not in failure.message
    assert "12345" not in failure.developer_message
|