# PR Description
This PR adds ~~four~~ three improvements to evals.
~~## 1. Add parameterized eval cases~~
~~Adds a new method named `add_parameterized_case`. Just like pytest’s
parameterized tests, eval cases can be parameterized with multiple user
messages. Adds a case to the `EvalSuite` for each user message. All
cases have the same expected tool call(s), params, additional_messages.
This reduces duplicate code and makes it easy to observe how a model
performs based on increasingly more difficult prompts.~~
```python
""" NO LONGER IN THIS PR
user_messages = [
"Call the delete tweet by id tool with the tweet ID '148975632'.",
"Delete the tweet with ID '148975632'.",
"I don't want to have this tweet (148975632) on my account anymore.",
"do the opposite of post for https://x.com/x/status/148975632",
]
suite.add_parameterized_case(
name="Delete a tweet by ID",
user_messages=user_messages,
expected_tool_calls=[
ExpectedToolCall(
func=delete_tweet_by_id,
args={"tweet_id": "148975632"},
)
],
critics=[
BinaryCritic(
critic_field="tweet_id",
weight=1.0,
),
],
)
"""
```
~~PASSED Delete a tweet by ID (user_message 1 of 4) -- Score: 100.00%~~
~~PASSED Delete a tweet by ID (user_message 2 of 4) -- Score: 100.00%~~
~~PASSED Delete a tweet by ID (user_message 3 of 4) -- Score: 100.00%~~
~~FAILED Delete a tweet by ID (user_message 4 of 4) -- Score: 0.00%~~
~~Summary -- Total: 4 -- Passed: 3 -- Failed: 1~~
## 2. Parameters that are not explicitly criticized are assigned a `NoneCritic`.
A NoneCritic has no effect on the evaluation results and does not
actually evaluate. Parameters that have a NoneCritic will be displayed
as ‘un-criticized’ in the evaluation summary (if `-d` flag is used).

## 3. Add a hardcoded `seed` parameter for evals.
The seed parameter aids in receiving (mostly) consistent outputs,
which improves reproducibility for evaluations.
## 4. Disallow more than one critic for the same field.
Raises a `ValueError` if more than one critic is assigned to a field.
---------
Co-authored-by: Eric Gustin <eric@arcade-ai.com>
186 lines
8.2 KiB
Python
186 lines
8.2 KiB
Python
from arcade.sdk import ToolCatalog
|
|
from arcade.sdk.eval import (
|
|
BinaryCritic,
|
|
EvalRubric,
|
|
EvalSuite,
|
|
ExpectedToolCall,
|
|
SimilarityCritic,
|
|
tool_eval,
|
|
)
|
|
|
|
import arcade_google
|
|
from arcade_google.tools.docs import (
|
|
create_blank_document,
|
|
create_document_from_text,
|
|
get_document_by_id,
|
|
insert_text_at_end_of_document,
|
|
)
|
|
|
|
# Rubric thresholds handed to the evaluation suite defined in this module.
rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)

# Tool catalog for the suite, populated from the arcade_google toolkit module.
catalog = ToolCatalog()
catalog.add_module(arcade_google)
|
|
|
|
|
|
@tool_eval()
def docs_eval_suite() -> EvalSuite:
    """Build and return the evaluation suite for the Google Docs tools.

    The suite registers six cases: reading two documents, appending text to a
    document, a combined read-and-append request, creating a blank document,
    creating a document from text, and a request that expects no tool call.
    The first three cases are seeded with a prior conversation in which the
    assistant already listed the user's ten most recent documents, so the
    document names in the user message can be mapped to IDs.
    """
    # IDs of the documents referenced by name in the user messages; they match
    # the entries returned by the simulated Google_ListDocuments call below.
    tst4_id = "1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc"
    tst9_id = "1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts"
    tst10_id = "1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc"
    new_paragraph = "This is a new paragraph."

    # Earlier turns of the conversation: the user asks for a listing, the
    # assistant requests authorization, calls Google_ListDocuments, receives
    # the tool result, and summarizes it back to the user.
    list_docs_history = [
        {"role": "user", "content": "list my 10 most recently created docs"},
        {
            "role": "assistant",
            "content": "Please go to this URL and authorize the action: [Link](https://accounts.google.com/)",
        },
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_gegK723W2hXsORjBmq1Oexqk",
                    "type": "function",
                    "function": {
                        "name": "Google_ListDocuments",
                        "arguments": '{"limit":10,"order_by":"createdTime desc"}',
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": '{"documents":[{"id":"1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst10"},{"id":"1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst9"},{"id":"19Dqugn0rVi89K0C__lpg1HbhQOTenccyZOhPgivTHMs","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst8"},{"id":"1RCibzx14eqP3vS9yI4nD13OKf8Vee56RiszS53OkR7I","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst7"},{"id":"1imFb04JQuBn8SiSsRFf6fEuYCyXkbII4KX8fsmnT0jo","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst6"},{"id":"1ZC3oypdfLWFgBd-emeSykJf9tZOae6USsFboygRCr-w","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst5"},{"id":"1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst4"},{"id":"1eQ8UBO_PY3Lem4R8OVdIc9ODXt0MrSUAnEu994Qz8P8","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst3"},{"id":"1TOxB0MLry-JzntDWDT1LFywTLdr3XDWPT5L5UsHMs5c","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst2"},{"id":"1a1UQ7C90s8kGfnO8k6wfAZz_Cy5nGN2MkCoRB5y2j3w","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst1"}],"documents_count":10}',
            "tool_call_id": "call_gegK723W2hXsORjBmq1Oexqk",
            "name": "Google_ListDocuments",
        },
        {
            "role": "assistant",
            "content": "Here are your 10 most recently created Google Docs:\n\n1. [Tst10](https://docs.google.com/document/d/1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc)\n2. [Tst9](https://docs.google.com/document/d/1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts)\n3. [Tst8](https://docs.google.com/document/d/19Dqugn0rVi89K0C__lpg1HbhQOTenccyZOhPgivTHMs)\n4. [Tst7](https://docs.google.com/document/d/1RCibzx14eqP3vS9yI4nD13OKf8Vee56RiszS53OkR7I)\n5. [Tst6](https://docs.google.com/document/d/1imFb04JQuBn8SiSsRFf6fEuYCyXkbII4KX8fsmnT0jo)\n6. [Tst5](https://docs.google.com/document/d/1ZC3oypdfLWFgBd-emeSykJf9tZOae6USsFboygRCr-w)\n7. [Tst4](https://docs.google.com/document/d/1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc)\n8. [Tst3](https://docs.google.com/document/d/1eQ8UBO_PY3Lem4R8OVdIc9ODXt0MrSUAnEu994Qz8P8)\n9. [Tst2](https://docs.google.com/document/d/1TOxB0MLry-JzntDWDT1LFywTLdr3XDWPT5L5UsHMs5c)\n10. [Tst1](https://docs.google.com/document/d/1a1UQ7C90s8kGfnO8k6wfAZz_Cy5nGN2MkCoRB5y2j3w)\n\nYou can click the links to open each document.",
        },
    ]

    suite = EvalSuite(
        name="Google Docs Tools Evaluation",
        system_message="You are an AI assistant that can create and manage Google Docs using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Case 1: two reads, one per named document.
    suite.add_case(
        name="Get document content",
        user_message="Can you read me the contents of Tst9 doc and also Tst10 doc please",
        expected_tool_calls=[
            ExpectedToolCall(func=get_document_by_id, args={"document_id": tst9_id}),
            ExpectedToolCall(func=get_document_by_id, args={"document_id": tst10_id}),
        ],
        critics=[BinaryCritic(critic_field="document_id", weight=0.6)],
        additional_messages=list_docs_history,
    )

    # Case 2: a single append to Tst4.
    suite.add_case(
        name="Insert text at end of document",
        user_message="Please add the text 'This is a new paragraph.' to the end of Tst4.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=insert_text_at_end_of_document,
                args={"document_id": tst4_id, "text_content": new_paragraph},
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_id", weight=0.5),
            SimilarityCritic(critic_field="text_content", weight=0.5),
        ],
        additional_messages=list_docs_history,
    )

    # Case 3: the two reads and the append requested in one message.
    suite.add_case(
        name="Read the contents of two documents and then insert text at end of a different document.",
        user_message="Can you read me the contents of Tst9 doc and also Tst10 doc please. Also, please add the text 'This is a new paragraph.' to the end of Tst4.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=insert_text_at_end_of_document,
                args={"document_id": tst4_id, "text_content": new_paragraph},
            ),
            ExpectedToolCall(func=get_document_by_id, args={"document_id": tst9_id}),
            ExpectedToolCall(func=get_document_by_id, args={"document_id": tst10_id}),
        ],
        critics=[
            BinaryCritic(critic_field="document_id", weight=0.3),
            SimilarityCritic(critic_field="text_content", weight=0.3),
        ],
        additional_messages=list_docs_history,
    )

    # Case 4: create an empty document with a given title (no prior history).
    suite.add_case(
        name="Create blank document",
        user_message="Create a new Doc titled 'Meeting Notes'.",
        expected_tool_calls=[
            ExpectedToolCall(func=create_blank_document, args={"title": "Meeting Notes"})
        ],
        critics=[SimilarityCritic(critic_field="title", weight=1.0)],
    )

    # Case 5: create a document with both a title and initial text.
    suite.add_case(
        name="Create document from text",
        user_message="Create a new doc called To-Do List with the content 'Buy groceries, Call mom, Finish report'.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=create_document_from_text,
                args={
                    "title": "To-Do List",
                    "text_content": "Buy groceries\nCall mom\nFinish report",
                },
            )
        ],
        critics=[
            SimilarityCritic(critic_field="title", weight=0.5),
            SimilarityCritic(critic_field="text_content", weight=0.5),
        ],
    )

    # Case 6: a Microsoft Word request, for which no tool call is expected.
    suite.add_case(
        name="No tool call case",
        user_message="Create a new microsoft word document titled 'My Resume'.",
        expected_tool_calls=[],
        critics=[],
    )

    return suite
|