arcade-mcp/libs/tests/core/test_executor.py
Francisco Or Something 1492c80fc5
TOO-627: Improve error messages for agents and Datadog (#814)
## Summary

- Improve tool call error messages across 4 libraries (arcade-core,
arcade-tdk, arcade-mcp-server, arcade-serve) so agents can self-correct
and Datadog can facet on structured fields
- Guard empty error messages, enrich input validation errors with
field-level detail, fix `@tool` decorator fallback formatting, surface
`additional_prompt_content` in MCP responses, and add structured log
extras for Datadog
- Addresses the 3 worst error patterns: generic "Error in tool input
deserialization", bare `KeyError` values, and empty `FatalToolError`
messages

**Linear:** TOO-627
**Plan:** `docs/plans/2026-04-08-improve-error-messages-handoff.md`

## Tasks

- [ ] Task 1: Guard empty error messages (arcade-core)
- [ ] Task 2: Enrich input validation error messages (arcade-core)
- [ ] Task 3: Improve `@tool` decorator error fallback (arcade-tdk)
- [ ] Task 4: Fix MCP agent-facing error response (arcade-mcp-server)
- [ ] Task 5: Add structured log extras in BaseWorker (arcade-serve)
- [ ] Task 6: Add structured log extras in MCP server
(arcade-mcp-server)

## Test plan

- [ ] Each task has dedicated unit tests verifying the new behavior
- [ ] `make test` passes after all tasks
- [ ] `make check` (ruff + mypy) passes
- [ ] Verify the 3 worst error patterns now produce actionable messages

🤖 Generated with [Claude Code](https://claude.com/claude-code)

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Touches cross-library error formatting and logging behavior used in
production tool execution paths; while mostly additive/guardrails, it
changes agent-visible messages and Datadog log facets, which could
impact client expectations and alerting.
> 
> **Overview**
> Improves tool-call error handling across core/runtime, MCP transport,
worker transport, and the TDK to make agent-visible failures more
actionable while *reducing sensitive-data leakage*.
> 
> In `arcade-core`, empty error messages now get placeholders,
`ToolOutputFactory.fail*` defaults blank messages, and input validation
errors are rewritten as field-level summaries that intentionally omit
rejected values (avoiding Pydantic echo of secrets). The `@tool`
fallback in `arcade-tdk` no longer surfaces `str(exception)` to agents;
it returns exception *type-only* in `message` while preserving full
detail in `developer_message`.
> 
> Adds a shared `build_tool_error_log_extra` helper and updates
`arcade-serve` + `arcade-mcp-server` to emit consistent structured
WARNING logs (`error_*`, `tool_name`, optional toolkit/version) for
Datadog, while MCP error responses now append
`additional_prompt_content` and force `structuredContent=None` on
failures per spec. Includes extensive new tests and bumps package
versions (`arcade-core` 4.6.2, `arcade-tdk` 3.6.1, `arcade-mcp-server`
1.19.3, `arcade-serve` 3.2.3).
> 
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit
e5c7ebcaf56176cfbd8e6d1f2b6295352abd0ec0. Bugbot is set up for automated
code reviews on this repo. Configure
[here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 20:10:51 -03:00

437 lines
15 KiB
Python

from typing import Annotated
import pytest
from arcade_core.catalog import ToolCatalog
from arcade_core.errors import (
ContextRequiredToolError,
ErrorKind,
ToolRuntimeError,
UpstreamError,
UpstreamRateLimitError,
)
from arcade_core.executor import ToolExecutor
from arcade_core.schema import ToolCallError, ToolCallLog, ToolCallOutput, ToolContext
from arcade_tdk import tool
from arcade_tdk.errors import (
RetryableToolError,
ToolExecutionError,
)
from typing_extensions import TypedDict
@tool
def simple_tool(inp: Annotated[str, "input"]) -> Annotated[str, "output"]:
"""Simple tool"""
return inp
@tool.deprecated("Use simple_tool instead")
@tool
def simple_deprecated_tool(inp: Annotated[str, "input"]) -> Annotated[str, "output"]:
"""Simple tool that is deprecated"""
return inp
@tool
def retryable_error_tool() -> Annotated[str, "output"]:
"""Tool that raises a retryable error"""
raise RetryableToolError("test", "test developer message", "additional prompt content", 1000)
@tool
def tool_execution_error_tool() -> Annotated[str, "output"]:
"""Tool that raises an error"""
raise ToolExecutionError("test", "test developer message")
@tool
def unexpected_error_tool() -> Annotated[str, "output"]:
"""Tool that raises an unexpected error"""
raise RuntimeError("test")
@tool
def context_required_error_tool() -> Annotated[str, "output"]:
"""Tool that raises a context required error"""
raise ContextRequiredToolError(
"test", additional_prompt_content="need the user to clarify something"
)
@tool
def upstream_error_tool() -> Annotated[str, "output"]:
"""Tool that raises an upstream error"""
# TODO: or test raising a httpx error? Do these types of tests belong in adapter tests?
raise UpstreamError("test", status_code=400)
@tool
def upstream_ratelimit_error_tool() -> Annotated[str, "output"]:
"""Tool that raises an upstream error"""
# TODO: or test raising a httpx error? Do these types of tests belong in adapter tests?
raise UpstreamRateLimitError("test", 1000)
@tool
def tool_runtime_error_tool() -> Annotated[str, "output"]:
"""Tool that raises a tool runtime error"""
raise ToolRuntimeError("test", "test developer message")
@tool
def bad_output_error_tool() -> Annotated[str, "output"]:
"""tool that returns a bad output type"""
return {"output": "test"}
# TypedDict output tools
class ResultDict(TypedDict):
"""Result dictionary."""
status: str
count: int
items: list[str]
@tool
def typeddict_output_tool() -> Annotated[ResultDict, "Returns a TypedDict"]:
"""Tool that returns a TypedDict."""
return ResultDict(status="success", count=3, items=["a", "b", "c"])
@tool
def list_typeddict_output_tool() -> Annotated[list[ResultDict], "Returns list of TypedDict"]:
"""Tool that returns a list of TypedDict."""
return [
ResultDict(status="first", count=1, items=["x"]),
ResultDict(status="second", count=2, items=["y", "z"]),
]
@tool
def dict_output_tool() -> Annotated[dict, "Returns a plain dict"]:
"""Tool that returns a plain dict."""
return {"key": "value", "number": 42, "nested": {"inner": "data"}}
# ---- Test Driver ----
tools = [
simple_tool,
simple_deprecated_tool,
retryable_error_tool,
tool_execution_error_tool,
unexpected_error_tool,
context_required_error_tool,
upstream_error_tool,
upstream_ratelimit_error_tool,
tool_runtime_error_tool,
bad_output_error_tool,
typeddict_output_tool,
list_typeddict_output_tool,
dict_output_tool,
]
catalog = ToolCatalog()
for tool_func in tools:
catalog.add_tool(tool_func, "simple_toolkit")
@pytest.mark.asyncio
@pytest.mark.parametrize(
"tool_func, inputs, expected_output",
[
(simple_tool, {"inp": "test"}, ToolCallOutput(value="test")),
(
simple_deprecated_tool,
{"inp": "test"},
ToolCallOutput(
value="test",
logs=[
ToolCallLog(
message="Use simple_tool instead",
level="warning",
subtype="deprecation",
)
],
),
),
(
retryable_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_RETRY] RetryableToolError during execution of tool 'retryable_error_tool': test",
kind=ErrorKind.TOOL_RUNTIME_RETRY,
developer_message="[TOOL_RUNTIME_RETRY] RetryableToolError during execution of tool 'retryable_error_tool': test developer message",
additional_prompt_content="additional prompt content",
retry_after_ms=1000,
can_retry=True,
)
),
),
(
tool_execution_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_FATAL] ToolExecutionError during execution of tool 'tool_execution_error_tool': test",
kind=ErrorKind.TOOL_RUNTIME_FATAL,
developer_message="[TOOL_RUNTIME_FATAL] ToolExecutionError during execution of tool 'tool_execution_error_tool': test developer message",
can_retry=False,
)
),
),
(
unexpected_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message=(
"[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
"'unexpected_error_tool': An unhandled RuntimeError was raised by the tool."
),
kind=ErrorKind.TOOL_RUNTIME_FATAL,
developer_message=(
"[TOOL_RUNTIME_FATAL] FatalToolError during execution of tool "
"'unexpected_error_tool': RuntimeError: test"
),
can_retry=False,
status_code=500,
)
),
),
(
simple_tool,
{"inp": {"test": "test"}}, # takes in a string not a dict
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_BAD_INPUT_VALUE] ToolInputError during execution of tool 'simple_tool': Invalid input: inp:",
kind=ErrorKind.TOOL_RUNTIME_BAD_INPUT_VALUE,
status_code=400,
developer_message=None, # can't guarantee this will be the same
)
),
),
(
context_required_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_CONTEXT_REQUIRED] ContextRequiredToolError during execution of tool 'context_required_error_tool': test",
kind=ErrorKind.TOOL_RUNTIME_CONTEXT_REQUIRED,
developer_message=None,
additional_prompt_content="need the user to clarify something",
)
),
),
(
upstream_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[UPSTREAM_RUNTIME_BAD_REQUEST] UpstreamError during execution of tool 'upstream_error_tool': test",
kind=ErrorKind.UPSTREAM_RUNTIME_BAD_REQUEST,
status_code=400,
developer_message=None,
)
),
),
(
upstream_ratelimit_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[UPSTREAM_RUNTIME_RATE_LIMIT] UpstreamRateLimitError during execution of tool 'upstream_ratelimit_error_tool': test",
kind=ErrorKind.UPSTREAM_RUNTIME_RATE_LIMIT,
status_code=429,
developer_message=None,
retry_after_ms=1000,
can_retry=True,
)
),
),
(
tool_runtime_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_FATAL] ToolRuntimeError during execution of tool 'tool_runtime_error_tool': test",
kind=ErrorKind.TOOL_RUNTIME_FATAL,
developer_message="[TOOL_RUNTIME_FATAL] ToolRuntimeError during execution of tool 'tool_runtime_error_tool': test developer message",
can_retry=False,
)
),
),
(
bad_output_error_tool,
{},
ToolCallOutput(
error=ToolCallError(
message="[TOOL_RUNTIME_BAD_OUTPUT_VALUE] ToolOutputError during execution of tool 'bad_output_error_tool': Failed to serialize tool output",
kind=ErrorKind.TOOL_RUNTIME_BAD_OUTPUT_VALUE,
status_code=500,
developer_message=None, # can't gaurantee this will be the same
)
),
),
(
typeddict_output_tool,
{},
ToolCallOutput(value={"status": "success", "count": 3, "items": ["a", "b", "c"]}),
),
(
list_typeddict_output_tool,
{},
ToolCallOutput(
value=[
{"status": "first", "count": 1, "items": ["x"]},
{"status": "second", "count": 2, "items": ["y", "z"]},
]
),
),
(
dict_output_tool,
{},
ToolCallOutput(value={"key": "value", "number": 42, "nested": {"inner": "data"}}),
),
],
ids=[
"simple_tool",
"simple_deprecated_tool",
"retryable_error_tool",
"exec_error_tool",
"unexpected_error_tool",
"invalid_input_type",
"context_required_error_tool",
"upstream_error_tool",
"upstream_ratelimit_error_tool",
"tool_runtime_error_tool",
"bad_output_type",
"typeddict_output",
"list_typeddict_output",
"dict_output",
],
)
async def test_tool_executor(tool_func, inputs, expected_output):
tool_definition = catalog.find_tool_by_func(tool_func)
dummy_context = ToolContext()
full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name())
output = await ToolExecutor.run(
func=tool_func,
definition=tool_definition,
input_model=full_tool.input_model,
output_model=full_tool.output_model,
context=dummy_context,
**inputs,
)
check_output(output, expected_output)
def check_output_error(output_error: ToolCallError, expected_error: ToolCallError):
if "Invalid input:" in expected_error.message:
assert output_error.message.startswith(
expected_error.message
), f"message mismatch: {output_error.message!r} does not start with {expected_error.message!r}"
else:
assert output_error.message == expected_error.message, "message mismatch"
assert output_error.kind == expected_error.kind, "kind mismatch"
if expected_error.developer_message:
assert (
output_error.developer_message == expected_error.developer_message
), "developer message mismatch"
assert output_error.can_retry == expected_error.can_retry, "can retry mismatch"
assert (
output_error.additional_prompt_content == expected_error.additional_prompt_content
), "additional prompt content mismatch"
assert output_error.retry_after_ms == expected_error.retry_after_ms, "retry after ms mismatch"
if expected_error.stacktrace:
assert output_error.stacktrace == expected_error.stacktrace, "stacktrace mismatch"
assert output_error.status_code == expected_error.status_code, "status code mismatch"
assert output_error.extra == expected_error.extra, "extra mismatch"
def check_output(output: ToolCallOutput, expected_output: ToolCallOutput):
# error in ToolCallOutput
if output.error:
check_output_error(output.error, expected_output.error)
# normal tool execution
else:
assert output.value == expected_output.value
# check logs
output_logs = output.logs or []
expected_logs = expected_output.logs or []
assert len(output_logs) == len(expected_logs)
for output_log, expected_log in zip(output_logs, expected_logs, strict=False):
assert output_log.message == expected_log.message
assert output_log.level == expected_log.level
assert output_log.subtype == expected_log.subtype
@tool
def multi_field_tool(
name: Annotated[str, "a name"],
age: Annotated[int, "an age"],
) -> Annotated[str, "output"]:
"""Tool with multiple required fields"""
return f"{name} is {age}"
catalog.add_tool(multi_field_tool, "MultiFieldToolkit")
@pytest.mark.asyncio
async def test_multiple_bad_fields_in_input_error():
tool_definition = catalog.find_tool_by_func(multi_field_tool)
full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name())
dummy_context = ToolContext()
output = await ToolExecutor.run(
func=multi_field_tool,
definition=tool_definition,
input_model=full_tool.input_model,
output_model=full_tool.output_model,
context=dummy_context,
name=123, # wrong type
age="not_an_int", # wrong type
)
assert output.error is not None
assert "Invalid input:" in output.error.message
assert "name" in output.error.message
assert "age" in output.error.message
@pytest.mark.asyncio
async def test_input_validation_error_does_not_leak_input_values():
"""Pydantic's ``str(e)`` and ``err["input"]`` echo offending values back
verbatim — which may contain user secrets (passwords, tokens, PII).
Neither ``message`` (agent-facing) nor ``developer_message`` (Datadog log
facet) must contain the rejected input values."""
tool_definition = catalog.find_tool_by_func(multi_field_tool)
full_tool = catalog.get_tool(tool_definition.get_fully_qualified_name())
dummy_context = ToolContext()
secret = "SECRET_PASSWORD_DO_NOT_LEAK_42"
output = await ToolExecutor.run(
func=multi_field_tool,
definition=tool_definition,
input_model=full_tool.input_model,
output_model=full_tool.output_model,
context=dummy_context,
name=12345, # wrong type, ignored for the leak check
age=secret, # wrong type AND a "secret" value we want to ensure is not echoed
)
assert output.error is not None
# Field path + reason must be present (so agents can self-correct).
assert "age" in output.error.message
# But the actual rejected input value must NOT be anywhere in either field.
assert secret not in output.error.message
assert output.error.developer_message is not None
assert secret not in output.error.developer_message
# The integer wrong-type value also must not appear.
assert "12345" not in output.error.message
assert "12345" not in output.error.developer_message