arcade-mcp/libs/tests/core/test_output.py
Francisco Or Something 1492c80fc5
TOO-627: Improve error messages for agents and Datadog (#814)
## Summary

- Improve tool call error messages across 4 libraries (arcade-core,
arcade-tdk, arcade-mcp-server, arcade-serve) so agents can self-correct
and Datadog can facet on structured fields
- Guard empty error messages, enrich input validation errors with
field-level detail, fix `@tool` decorator fallback formatting, surface
`additional_prompt_content` in MCP responses, and add structured log
extras for Datadog
- Addresses the 3 worst error patterns: generic "Error in tool input
deserialization", bare `KeyError` values, and empty `FatalToolError`
messages

**Linear:** TOO-627
**Plan:** `docs/plans/2026-04-08-improve-error-messages-handoff.md`

## Tasks

- [ ] Task 1: Guard empty error messages (arcade-core)
- [ ] Task 2: Enrich input validation error messages (arcade-core)
- [ ] Task 3: Improve `@tool` decorator error fallback (arcade-tdk)
- [ ] Task 4: Fix MCP agent-facing error response (arcade-mcp-server)
- [ ] Task 5: Add structured log extras in BaseWorker (arcade-serve)
- [ ] Task 6: Add structured log extras in MCP server
(arcade-mcp-server)

## Test plan

- [ ] Each task has dedicated unit tests verifying the new behavior
- [ ] `make test` passes after all tasks
- [ ] `make check` (ruff + mypy) passes
- [ ] Verify the 3 worst error patterns now produce actionable messages

🤖 Generated with [Claude Code](https://claude.com/claude-code)

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Touches cross-library error formatting and logging behavior used in
production tool execution paths; while mostly additive/guardrails, it
changes agent-visible messages and Datadog log facets, which could
impact client expectations and alerting.
> 
> **Overview**
> Improves tool-call error handling across core/runtime, MCP transport,
worker transport, and the TDK to make agent-visible failures more
actionable while *reducing sensitive-data leakage*.
> 
> In `arcade-core`, empty error messages now get placeholders,
`ToolOutputFactory.fail*` substitutes a default for blank messages, and input
validation errors are rewritten as field-level summaries that intentionally
omit rejected values (so Pydantic does not echo secrets). The `@tool`
fallback in `arcade-tdk` no longer surfaces `str(exception)` to agents;
it returns exception *type-only* in `message` while preserving full
detail in `developer_message`.
> 
> Adds a shared `build_tool_error_log_extra` helper and updates
`arcade-serve` + `arcade-mcp-server` to emit consistent structured
WARNING logs (`error_*`, `tool_name`, optional toolkit/version) for
Datadog, while MCP error responses now append
`additional_prompt_content` and force `structuredContent=None` on
failures per spec. Includes extensive new tests and bumps package
versions (`arcade-core` 4.6.2, `arcade-tdk` 3.6.1, `arcade-mcp-server`
1.19.3, `arcade-serve` 3.2.3).
> 
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit
e5c7ebcaf56176cfbd8e6d1f2b6295352abd0ec0. Bugbot is set up for automated
code reviews on this repo. Configure
[here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 20:10:51 -03:00

162 lines
5.2 KiB
Python

from typing import Any
import pytest
from arcade_core.output import ToolOutputFactory
from pydantic import BaseModel
@pytest.fixture
def output_factory():
    """Build the ``ToolOutputFactory`` instance that the tests exercise."""
    factory = ToolOutputFactory()
    return factory
class SampleOutputModel(BaseModel):
    """Single-field pydantic model used to wrap the payload handed to the factory."""

    # Typed as Any so one wrapper can carry scalars, dicts, and lists alike.
    result: Any
@pytest.mark.parametrize(
    "data, expected_value",
    [
        # No payload: the factory receives ``None`` instead of a wrapped model.
        (None, ""),
        # Strings, including the empty string.
        ("success", "success"),
        ("", ""),
        # Numbers, including the falsy zero.
        (123, 123),
        (0, 0),
        (123.45, 123.45),
        # Booleans, including the falsy ``False``.
        (True, True),
        (False, False),
    ],
)
def test_success(output_factory, data, expected_value):
    """Check that ``success`` exposes scalar payloads unchanged and reports no error.

    Every non-``None`` value is wrapped in ``SampleOutputModel`` before being handed
    to the factory; ``None`` is passed through unwrapped, and the resulting output
    value is expected to be the empty string.
    """
    data_obj = None if data is None else SampleOutputModel(result=data)
    output = output_factory.success(data=data_obj)
    assert output.value == expected_value
    assert output.error is None
@pytest.mark.parametrize(
    "data, expected_value",
    [
        # Dict payloads (what a TypedDict looks like at runtime)
        ({"name": "test", "value": 123}, {"name": "test", "value": 123}),
        ({}, {}),
        ({"nested": {"key": "value"}}, {"nested": {"key": "value"}}),
        # Homogeneous and empty lists
        (["a", "b", "c"], ["a", "b", "c"]),
        ([1, 2, 3], [1, 2, 3]),
        ([], []),
        # Lists of dicts (what list[TypedDict] looks like at runtime)
        (
            [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}],
            [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}],
        ),
        ([{}], [{}]),
        # Heterogeneous list
        ([1, "two", 3.0, True], [1, "two", 3.0, True]),
    ],
)
def test_success_complex_types(output_factory, data, expected_value):
    """Verify that dict and list payloads wrapped in a model come back unchanged."""
    wrapped = SampleOutputModel(result=data)
    result = output_factory.success(data=wrapped)
    assert result.value == expected_value
    assert result.error is None
def test_success_with_basemodel_direct(output_factory):
    """A multi-field BaseModel passed to ``success`` should surface as a dict of its fields."""

    class DirectModel(BaseModel):
        name: str
        value: int

    result = output_factory.success(data=DirectModel(name="test", value=42))
    assert result.value == {"name": "test", "value": 42}
    assert result.error is None
def test_success_raw_dict(output_factory):
    """A plain dict given to ``success`` without a model wrapper is returned as the value."""
    payload = {"key": "value", "number": 123}
    result = output_factory.success(data=payload)
    assert result.value == payload
    assert result.error is None
def test_success_raw_list(output_factory):
    """A plain list given to ``success`` without a model wrapper is returned as the value."""
    payload = [{"id": 1}, {"id": 2}, {"id": 3}]
    result = output_factory.success(data=payload)
    assert result.value == payload
    assert result.error is None
@pytest.mark.parametrize(
    "message, developer_message",
    [
        ("Error occurred", None),
        ("Error occurred", "Detailed error message"),
    ],
)
def test_fail(output_factory, message, developer_message):
    """``fail`` should record both messages and mark the error as non-retryable."""
    error = output_factory.fail(message=message, developer_message=developer_message).error
    assert error is not None
    assert error.message == message
    assert error.developer_message == developer_message
    assert error.can_retry is False
def test_fail_empty_message_gets_default(output_factory):
    """An empty ``message`` passed to ``fail`` is replaced by the generic default text."""
    error = output_factory.fail(message="").error
    assert error is not None
    assert error.message == "Unspecified error during tool execution"
def test_fail_whitespace_message_gets_default(output_factory):
    """A whitespace-only ``message`` passed to ``fail`` is replaced by the default text."""
    error = output_factory.fail(message=" ").error
    assert error is not None
    assert error.message == "Unspecified error during tool execution"
def test_fail_nonempty_message_unchanged(output_factory):
    """A real ``message`` passed to ``fail`` is kept verbatim, not swapped for the default."""
    error = output_factory.fail(message="real error").error
    assert error is not None
    assert error.message == "real error"
def test_fail_retry_empty_message_gets_default(output_factory):
    """An empty ``message`` passed to ``fail_retry`` is replaced by the generic default text."""
    error = output_factory.fail_retry(message="").error
    assert error is not None
    assert error.message == "Unspecified error during tool execution"
def test_fail_retry_whitespace_message_gets_default(output_factory):
    """A whitespace-only ``message`` passed to ``fail_retry`` is replaced by the default text."""
    error = output_factory.fail_retry(message=" ").error
    assert error is not None
    assert error.message == "Unspecified error during tool execution"
@pytest.mark.parametrize(
    "message, developer_message, additional_prompt_content, retry_after_ms",
    [
        ("Retry error", None, None, None),
        ("Retry error", "Retrying", "Please try again with this additional data: foobar", 1000),
    ],
)
def test_fail_retry(
    output_factory, message, developer_message, additional_prompt_content, retry_after_ms
):
    """``fail_retry`` should store every supplied field and mark the error as retryable."""
    retry_kwargs = {
        "message": message,
        "developer_message": developer_message,
        "additional_prompt_content": additional_prompt_content,
        "retry_after_ms": retry_after_ms,
    }
    error = output_factory.fail_retry(**retry_kwargs).error
    assert error is not None
    assert error.message == message
    assert error.developer_message == developer_message
    assert error.can_retry is True
    assert error.additional_prompt_content == additional_prompt_content
    assert error.retry_after_ms == retry_after_ms