openai-agents-python/tests/test_agent_runner.py
2025-03-18 21:55:12 -04:00

636 lines
18 KiB
Python

from __future__ import annotations
import json
from typing import Any
import pytest
from typing_extensions import TypedDict
from agents import (
Agent,
GuardrailFunctionOutput,
Handoff,
HandoffInputData,
InputGuardrail,
InputGuardrailTripwireTriggered,
ModelBehaviorError,
OutputGuardrail,
OutputGuardrailTripwireTriggered,
RunContextWrapper,
Runner,
UserError,
handoff,
)
from agents.agent import ToolsToFinalOutputResult
from agents.tool import FunctionToolResult, function_tool
from .fake_model import FakeModel
from .test_responses import (
get_final_output_message,
get_function_tool,
get_function_tool_call,
get_handoff_tool_call,
get_text_input_item,
get_text_message,
)
@pytest.mark.asyncio
async def test_simple_first_run():
    """Run one agent twice from scratch: once with a string input, once with input items."""
    fake_model = FakeModel()
    test_agent = Agent(name="test", model=fake_model)

    # Run 1: plain string input; the fake model replies with a single text message.
    fake_model.set_next_output([get_text_message("first")])
    first_run = await Runner.run(test_agent, input="test")

    assert first_run.input == "test"
    assert len(first_run.new_items) == 1, "exactly one item should be generated"
    assert first_run.final_output == "first"
    assert len(first_run.raw_responses) == 1, "exactly one model response should be generated"
    assert first_run.raw_responses[0].output == [get_text_message("first")]
    assert first_run.last_agent == test_agent
    assert len(first_run.to_input_list()) == 2, "should have original input and generated item"

    # Run 2: two explicit input items, so the resulting input list has 2 + 1 entries.
    fake_model.set_next_output([get_text_message("second")])
    two_items = [get_text_input_item("message"), get_text_input_item("another_message")]
    second_run = await Runner.run(test_agent, input=two_items)

    assert len(second_run.new_items) == 1, "exactly one item should be generated"
    assert second_run.final_output == "second"
    assert len(second_run.raw_responses) == 1, "exactly one model response should be generated"
    assert len(second_run.to_input_list()) == 3, "should have original input and generated item"
@pytest.mark.asyncio
async def test_subsequent_runs():
    """Feed the first run's ``to_input_list()`` back in as the input of a second run."""
    fake_model = FakeModel()
    test_agent = Agent(name="test", model=fake_model)

    # Initial run from a plain string.
    fake_model.set_next_output([get_text_message("third")])
    initial = await Runner.run(test_agent, input="test")

    assert initial.input == "test"
    assert len(initial.new_items) == 1, "exactly one item should be generated"
    assert len(initial.to_input_list()) == 2, "should have original input and generated item"

    # Follow-up run seeded with the complete input list of the initial run.
    fake_model.set_next_output([get_text_message("fourth")])
    follow_up = await Runner.run(test_agent, input=initial.to_input_list())

    assert len(follow_up.input) == 2, f"should have previous input but got {follow_up.input}"
    assert len(follow_up.new_items) == 1, "exactly one item should be generated"
    assert follow_up.final_output == "fourth"
    assert len(follow_up.raw_responses) == 1, "exactly one model response should be generated"
    assert follow_up.raw_responses[0].output == [get_text_message("fourth")]
    assert follow_up.last_agent == test_agent
    assert len(follow_up.to_input_list()) == 3, "should have original input and generated items"
@pytest.mark.asyncio
async def test_tool_call_runs():
    """A tool call on the first turn is followed by a second turn that yields the final text."""
    model = FakeModel()
    agent = Agent(
        name="test",
        model=model,
        tools=[get_function_tool("foo", "tool_result")],
    )

    model.add_multiple_turn_outputs(
        [
            # First turn: a message and tool call
            [get_text_message("a_message"), get_function_tool_call("foo", json.dumps({"a": "b"}))],
            # Second turn: text message
            [get_text_message("done")],
        ]
    )

    result = await Runner.run(agent, input="user_message")

    assert result.final_output == "done"
    # Adjacent string literals are joined verbatim, so a fragment that continues on the next
    # line has to end with a space.
    assert len(result.raw_responses) == 2, (
        "should have two responses: the first which produces a tool call, and the second which "
        "handles the tool result"
    )
    assert len(result.to_input_list()) == 5, (
        "should have five inputs: the original input, the message, the tool call, the tool result "
        "and the done message"
    )
@pytest.mark.asyncio
async def test_handoffs():
    """A tool call, then a handoff to agent_1, then a final text message from agent_1."""
    model = FakeModel()
    agent_1 = Agent(
        name="test",
        model=model,
    )
    agent_2 = Agent(
        name="test",
        model=model,
    )
    # agent_3 is the entry point; it can hand off to either of the other two agents.
    agent_3 = Agent(
        name="test",
        model=model,
        handoffs=[agent_1, agent_2],
        tools=[get_function_tool("some_function", "result")],
    )

    model.add_multiple_turn_outputs(
        [
            # First turn: a tool call
            [get_function_tool_call("some_function", json.dumps({"a": "b"}))],
            # Second turn: a message and a handoff
            [get_text_message("a_message"), get_handoff_tool_call(agent_1)],
            # Third turn: text message
            [get_text_message("done")],
        ]
    )

    result = await Runner.run(agent_3, input="user_message")

    assert result.final_output == "done"
    assert len(result.raw_responses) == 3, "should have three model responses"
    # Adjacent string literals are joined verbatim, so a fragment that continues on the next
    # line has to end with a space.
    assert len(result.to_input_list()) == 7, (
        "should have 7 inputs: orig input, tool call, tool result, message, handoff, handoff "
        "result, and done message"
    )
    assert result.last_agent == agent_1, "should have handed off to agent_1"
# Minimal typed payload shared by the tests in this module: used as an agent output_type,
# as a handoff input_type, and as the return value of test_tool_one.
# NOTE(review): documented with comments rather than a class docstring on purpose — a
# docstring may end up in a JSON schema generated from this type; confirm before adding one.
class Foo(TypedDict):
    bar: str
@pytest.mark.asyncio
async def test_structured_output():
    """Tool call, handoff, then a structured ``Foo`` final output from the receiving agent."""
    model = FakeModel()
    # agent_1 receives the handoff and is the only agent with a structured output type.
    agent_1 = Agent(
        name="test",
        model=model,
        tools=[get_function_tool("bar", "bar_result")],
        output_type=Foo,
    )
    agent_2 = Agent(
        name="test",
        model=model,
        tools=[get_function_tool("foo", "foo_result")],
        handoffs=[agent_1],
    )

    model.add_multiple_turn_outputs(
        [
            # First turn: a tool call
            [get_function_tool_call("foo", json.dumps({"bar": "baz"}))],
            # Second turn: a message and a handoff
            [get_text_message("a_message"), get_handoff_tool_call(agent_1)],
            # Third turn: tool call and structured output
            [
                get_function_tool_call("bar", json.dumps({"bar": "baz"})),
                get_final_output_message(json.dumps(Foo(bar="baz"))),
            ],
        ]
    )

    result = await Runner.run(
        agent_2,
        input=[
            get_text_input_item("user_message"),
            get_text_input_item("another_message"),
        ],
    )

    assert result.final_output == Foo(bar="baz"), "should have structured output"
    assert len(result.raw_responses) == 3, "should have three model responses"
    assert len(result.to_input_list()) == 10, (
        "should have input: 2 orig inputs, function call, function call result, message, handoff, "
        "handoff output, tool call, tool call result, final output message"
    )
    assert result.last_agent == agent_1, "should have handed off to agent_1"
def remove_new_items(handoff_input_data: HandoffInputData) -> HandoffInputData:
    """Handoff input filter: keep only ``input_history``, emptying the other two item tuples."""
    history = handoff_input_data.input_history
    return HandoffInputData(input_history=history, pre_handoff_items=(), new_items=())
@pytest.mark.asyncio
async def test_handoff_filters():
    """With remove_new_items as the handoff's input filter, only 2 inputs remain at the end."""
    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)
    filtered_handoff = handoff(agent=receiver, input_filter=remove_new_items)
    sender = Agent(name="test", model=fake_model, handoffs=[filtered_handoff])

    # Turn 1 emits two messages plus the handoff call; turn 2 is the receiver's reply.
    handoff_turn = [get_text_message("1"), get_text_message("2"), get_handoff_tool_call(receiver)]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    result = await Runner.run(sender, input="user_message")

    assert result.final_output == "last"
    assert len(result.raw_responses) == 2, "should have two model responses"
    assert len(result.to_input_list()) == 2, (
        "should only have 2 inputs: orig input and last message"
    )
@pytest.mark.asyncio
async def test_async_input_filter_fails():
    """An ``async def`` input filter on a Handoff makes the run fail with UserError."""
    # DO NOT rename this without updating pyproject.toml

    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)

    async def on_invoke_handoff(_ctx: RunContextWrapper[Any], _input: str) -> Agent[Any]:
        return receiver

    async def invalid_input_filter(data: HandoffInputData) -> HandoffInputData:
        return data  # pragma: no cover

    bad_handoff = Handoff(
        tool_name=Handoff.default_tool_name(receiver),
        tool_description=Handoff.default_tool_description(receiver),
        input_json_schema={},
        on_invoke_handoff=on_invoke_handoff,
        agent_name=receiver.name,
        # Purposely ignoring the type error here to simulate invalid input
        input_filter=invalid_input_filter,  # type: ignore
    )
    sender = Agent[None](name="test", model=fake_model, handoffs=[bad_handoff])

    handoff_turn = [get_text_message("1"), get_text_message("2"), get_handoff_tool_call(receiver)]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    with pytest.raises(UserError):
        await Runner.run(sender, input="user_message")
@pytest.mark.asyncio
async def test_invalid_input_filter_fails():
    """An input filter that returns a plain string makes the run fail with UserError."""
    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)

    async def on_invoke_handoff(_ctx: RunContextWrapper[Any], _input: str) -> Agent[Any]:
        return receiver

    def invalid_input_filter(data: HandoffInputData) -> HandoffInputData:
        # Purposely returning a string to simulate invalid output
        return "foo"  # type: ignore

    bad_handoff = Handoff(
        tool_name=Handoff.default_tool_name(receiver),
        tool_description=Handoff.default_tool_description(receiver),
        input_json_schema={},
        on_invoke_handoff=on_invoke_handoff,
        agent_name=receiver.name,
        input_filter=invalid_input_filter,
    )
    sender = Agent[None](name="test", model=fake_model, handoffs=[bad_handoff])

    handoff_turn = [get_text_message("1"), get_text_message("2"), get_handoff_tool_call(receiver)]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    with pytest.raises(UserError):
        await Runner.run(sender, input="user_message")
@pytest.mark.asyncio
async def test_non_callable_input_filter_causes_error():
    """A Handoff whose input_filter is a string, not a callable, fails the run with UserError."""
    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)

    async def on_invoke_handoff(_ctx: RunContextWrapper[Any], _input: str) -> Agent[Any]:
        return receiver

    bad_handoff = Handoff(
        tool_name=Handoff.default_tool_name(receiver),
        tool_description=Handoff.default_tool_description(receiver),
        input_json_schema={},
        on_invoke_handoff=on_invoke_handoff,
        agent_name=receiver.name,
        # Purposely ignoring the type error here to simulate invalid input
        input_filter="foo",  # type: ignore
    )
    sender = Agent[None](name="test", model=fake_model, handoffs=[bad_handoff])

    handoff_turn = [get_text_message("1"), get_text_message("2"), get_handoff_tool_call(receiver)]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    with pytest.raises(UserError):
        await Runner.run(sender, input="user_message")
@pytest.mark.asyncio
async def test_handoff_on_input():
    """A synchronous on_handoff callback is given the Foo arguments of the handoff call."""
    seen_bar: str | None = None

    def on_input(_ctx: RunContextWrapper[Any], data: Foo) -> None:
        # Record the payload so the test can inspect it after the run.
        nonlocal seen_bar
        seen_bar = data["bar"]

    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)
    typed_handoff = handoff(agent=receiver, on_handoff=on_input, input_type=Foo)
    sender = Agent(name="test", model=fake_model, handoffs=[typed_handoff])

    handoff_args = json.dumps(Foo(bar="test_input"))
    handoff_turn = [
        get_text_message("1"),
        get_text_message("2"),
        get_handoff_tool_call(receiver, args=handoff_args),
    ]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    result = await Runner.run(sender, input="user_message")

    assert result.final_output == "last"
    assert seen_bar == "test_input", "should have called the handoff with the correct input"
@pytest.mark.asyncio
async def test_async_handoff_on_input():
    """An ``async def`` on_handoff callback is given the Foo arguments of the handoff call."""
    seen_bar: str | None = None

    async def on_input(_ctx: RunContextWrapper[Any], data: Foo) -> None:
        # Record the payload so the test can inspect it after the run.
        nonlocal seen_bar
        seen_bar = data["bar"]

    fake_model = FakeModel()
    receiver = Agent(name="test", model=fake_model)
    typed_handoff = handoff(agent=receiver, on_handoff=on_input, input_type=Foo)
    sender = Agent(name="test", model=fake_model, handoffs=[typed_handoff])

    handoff_args = json.dumps(Foo(bar="test_input"))
    handoff_turn = [
        get_text_message("1"),
        get_text_message("2"),
        get_handoff_tool_call(receiver, args=handoff_args),
    ]
    reply_turn = [get_text_message("last")]
    fake_model.add_multiple_turn_outputs([handoff_turn, reply_turn])

    result = await Runner.run(sender, input="user_message")

    assert result.final_output == "last"
    assert seen_bar == "test_input", "should have called the handoff with the correct input"
@pytest.mark.asyncio
async def test_wrong_params_on_input_causes_error():
    """handoff() raises UserError when on_handoff takes three parameters or only one."""
    target = Agent(name="test")

    def _on_handoff_too_many_params(ctx: RunContextWrapper[Any], foo: Foo, bar: str) -> None:
        pass

    def _on_handoff_too_few_params(ctx: RunContextWrapper[Any]) -> None:
        pass

    # Checked in the same order as before: too many parameters first, then too few.
    for wrong_arity in (_on_handoff_too_many_params, _on_handoff_too_few_params):
        with pytest.raises(UserError):
            handoff(
                target,
                input_type=Foo,
                # Purposely ignoring the type error here to simulate invalid input
                on_handoff=wrong_arity,  # type: ignore
            )
@pytest.mark.asyncio
async def test_invalid_handoff_input_json_causes_error():
    """Invoking a typed handoff with ``None`` or with "invalid" raises ModelBehaviorError."""
    target = Agent(name="test")
    typed_handoff = handoff(target, input_type=Foo, on_handoff=lambda _ctx, _input: None)

    # A fresh context wrapper is built for each attempt, exactly one attempt per bad input.
    for bad_input in (None, "invalid"):
        with pytest.raises(ModelBehaviorError):
            await typed_handoff.on_invoke_handoff(
                RunContextWrapper(None),
                # Purposely ignoring the type error here to simulate invalid input
                bad_input,  # type: ignore
            )
@pytest.mark.asyncio
async def test_input_guardrail_tripwire_triggered_causes_exception():
    """An input guardrail that always trips makes Runner.run raise."""

    def guardrail_function(
        context: RunContextWrapper[Any], agent: Agent[Any], input: Any
    ) -> GuardrailFunctionOutput:
        return GuardrailFunctionOutput(
            output_info=None,
            tripwire_triggered=True,
        )

    model = FakeModel()
    # The fake model must be attached to the agent; otherwise the primed output below is never
    # used and the run would go through whatever default model is configured.
    agent = Agent(
        name="test",
        input_guardrails=[InputGuardrail(guardrail_function=guardrail_function)],
        model=model,
    )
    model.set_next_output([get_text_message("user_message")])

    with pytest.raises(InputGuardrailTripwireTriggered):
        await Runner.run(agent, input="user_message")
@pytest.mark.asyncio
async def test_output_guardrail_tripwire_triggered_causes_exception():
    """An output guardrail that always trips makes Runner.run raise."""

    def guardrail_function(
        context: RunContextWrapper[Any], agent: Agent[Any], agent_output: Any
    ) -> GuardrailFunctionOutput:
        return GuardrailFunctionOutput(output_info=None, tripwire_triggered=True)

    fake_model = FakeModel()
    always_trips = OutputGuardrail(guardrail_function=guardrail_function)
    guarded_agent = Agent(name="test", output_guardrails=[always_trips], model=fake_model)
    fake_model.set_next_output([get_text_message("user_message")])

    with pytest.raises(OutputGuardrailTripwireTriggered):
        await Runner.run(guarded_agent, input="user_message")
# Zero-argument function tool that returns a Foo payload; registered on the agents in the
# tool_use_behavior tests below.
# NOTE(review): documented with comments rather than a docstring on purpose — function_tool
# presumably derives the tool description from the docstring; confirm before adding one.
@function_tool
def test_tool_one():
    return Foo(bar="tool_one_result")
# Zero-argument function tool that returns a plain string; registered on the agents in the
# tool_use_behavior tests below.
# NOTE(review): documented with comments rather than a docstring on purpose — function_tool
# presumably derives the tool description from the docstring; confirm before adding one.
@function_tool
def test_tool_two():
    return "tool_two_result"
@pytest.mark.asyncio
async def test_tool_use_behavior_first_output():
    """With "stop_on_first_tool", the first tool call's result becomes the final output."""
    fake_model = FakeModel()
    available_tools = [get_function_tool("foo", "tool_result"), test_tool_one, test_tool_two]
    agent = Agent(
        name="test",
        model=fake_model,
        tools=available_tools,
        tool_use_behavior="stop_on_first_tool",
        output_type=Foo,
    )

    # Only one turn is scripted: a message followed by two tool calls.
    only_turn = [
        get_text_message("a_message"),
        get_function_tool_call("test_tool_one", None),
        get_function_tool_call("test_tool_two", None),
    ]
    fake_model.add_multiple_turn_outputs([only_turn])

    result = await Runner.run(agent, input="user_message")

    expected = Foo(bar="tool_one_result")
    assert result.final_output == expected, (
        "should have used the first tool result"
    )
def custom_tool_use_behavior(
    context: RunContextWrapper[Any], results: list[FunctionToolResult]
) -> ToolsToFinalOutputResult:
    """Decide whether a batch of tool results ends the run.

    Produces the final output "the_final_output" when any result came from the tool named
    "test_tool_one"; otherwise reports that there is no final output yet. ``context`` is unused.
    """
    ran_tool_one = any(item.tool.name == "test_tool_one" for item in results)
    if not ran_tool_one:
        return ToolsToFinalOutputResult(is_final_output=False, final_output=None)
    return ToolsToFinalOutputResult(is_final_output=True, final_output="the_final_output")
@pytest.mark.asyncio
async def test_tool_use_behavior_custom_function():
    """A callable tool_use_behavior decides, turn by turn, whether tool results end the run."""
    fake_model = FakeModel()
    available_tools = [get_function_tool("foo", "tool_result"), test_tool_one, test_tool_two]
    agent = Agent(
        name="test",
        model=fake_model,
        tools=available_tools,
        tool_use_behavior=custom_tool_use_behavior,
    )

    # Turn 1 calls only test_tool_two; turn 2 calls test_tool_one as well.
    turn_without_tool_one = [
        get_text_message("a_message"),
        get_function_tool_call("test_tool_two", None),
    ]
    turn_with_tool_one = [
        get_text_message("a_message"),
        get_function_tool_call("test_tool_one", None),
        get_function_tool_call("test_tool_two", None),
    ]
    fake_model.add_multiple_turn_outputs([turn_without_tool_one, turn_with_tool_one])

    result = await Runner.run(agent, input="user_message")

    assert len(result.raw_responses) == 2, "should have two model responses"
    assert result.final_output == "the_final_output", "should have used the custom function"