arcade-mcp/toolkits/google/evals/eval_google_gmail.py
Renato Byrro ac0f5aa10c
Search Google Drive documents and retrieve contents (#265)
This tool will be useful in scenarios akin to RAG, where someone wants
to ask questions or request the production of a summary, for instance,
about a bunch of documents related to a particular topic. Currently, to
fulfill such requests, the LLM needs to first `list_documents`, then
`get_document_by_id` for each document.

We also implement utility functions to return documents in Markdown
and HTML, since the Drive API JSON is verbose and would waste too many
tokens unnecessarily.

Limitations: the Markdown/HTML utilities do not handle tables of contents
(which I think aren't really useful here), headers, footers, or
footnotes.

---
This PR deprecates `list_documents` and implements `search_documents`,
in addition to `search_and_retrieve_documents`. This configuration makes it
easier for LLMs to understand when to call each tool.

Both tools had their interfaces refactored to remove Google API-specific
arguments that were confusing LLMs sometimes, such as "corpora" and
"support_all_drives". They now accept arguments that better relate to
expected user requests.

---------

Co-authored-by: Eric Gustin <eric@arcade.dev>
2025-03-07 18:42:12 -03:00

431 lines
16 KiB
Python

import json
from arcade.sdk import ToolCatalog
from arcade.sdk.eval import (
BinaryCritic,
EvalRubric,
EvalSuite,
ExpectedToolCall,
SimilarityCritic,
tool_eval,
)
import arcade_google
from arcade_google.models import GmailReplyToWhom
from arcade_google.tools.gmail import (
get_thread,
list_emails_by_header,
list_threads,
reply_to_email,
search_threads,
send_email,
write_draft_reply_email,
)
from arcade_google.utils import DateRange
# Scoring rubric shared by every suite defined in this module.
rubric = EvalRubric(
    warn_threshold=0.95,
    fail_threshold=0.9,
)

# Tool catalog handed to every suite; it is populated from the
# arcade_google package.
catalog = ToolCatalog()
catalog.add_module(arcade_google)
@tool_eval()
def gmail_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the core Gmail tools.

    The suite registers one case each for ``send_email``, ``search_threads``
    and ``get_thread``, plus two cases for ``list_threads`` (one of them
    continuing a prior conversation that returned a page token).

    Returns:
        The populated ``EvalSuite``.
    """
    suite = EvalSuite(
        name="Gmail Tools Evaluation",
        system_message="You are an AI assistant that can send and manage emails using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    suite.add_case(
        name="Send email to user with clear username",
        user_message="Send a email to johndoe@example.com saying 'Hello, can we meet at 3 PM?'. CC his boss janedoe@example.com",
        expected_tool_calls=[
            ExpectedToolCall(
                func=send_email,
                args={
                    "subject": "Meeting Request",
                    "body": "Hello, can we meet at 3 PM?",
                    "recipient": "johndoe@example.com",
                    "cc": ["janedoe@example.com"],
                    "bcc": None,
                },
            )
        ],
        critics=[
            SimilarityCritic(critic_field="subject", weight=0.125),
            SimilarityCritic(critic_field="body", weight=0.25),
            BinaryCritic(critic_field="recipient", weight=0.25),
            BinaryCritic(critic_field="cc", weight=0.25),
            BinaryCritic(critic_field="bcc", weight=0.125),
        ],
    )

    suite.add_case(
        name="Simple list threads",
        user_message="Get 42 threads like right now i even wanna see the ones in my trash",
        expected_tool_calls=[
            ExpectedToolCall(
                func=list_threads,
                args={"max_results": 42, "include_spam_trash": True},
            )
        ],
        critics=[
            BinaryCritic(critic_field="max_results", weight=0.5),
            BinaryCritic(critic_field="include_spam_trash", weight=0.5),
        ],
    )

    # Prior conversation in which one thread was listed; the tool response
    # carries the next_page_token that the follow-up case expects to be reused.
    history = [
        {"role": "user", "content": "list 1 thread"},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_X8V5Hw9iJ3wfB8WMZf8omAMi",
                    "type": "function",
                    "function": {"name": "Google_ListThreads", "arguments": '{"max_results":1}'},
                }
            ],
        },
        {
            "role": "tool",
            "content": '{"next_page_token":"10321400718999360131","num_threads":1,"threads":[{"historyId":"61691","id":"1934a8f8deccb749","snippet":"Hi Joe, I hope this email finds you well. Thank you for being a part of our community."}]}',
            "tool_call_id": "call_X8V5Hw9iJ3wfB8WMZf8omAMi",
            "name": "Google_ListThreads",
        },
        {
            "role": "assistant",
            "content": "Here is one email thread:\n\n- **Snippet:** Hi Joe, I hope this email finds you well. Thank you for being a part of our community.\n- **Thread ID:** 1934a8f8deccb749\n- **History ID:** 61691",
        },
    ]

    suite.add_case(
        name="List threads with history",
        user_message="Get the next 5 threads",
        additional_messages=history,
        expected_tool_calls=[
            ExpectedToolCall(
                func=list_threads,
                args={
                    "max_results": 5,
                    "page_token": "10321400718999360131",
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="max_results", weight=0.2),
            BinaryCritic(critic_field="page_token", weight=0.8),
        ],
    )

    suite.add_case(
        name="Search threads",
        user_message="Search for threads from johndoe@example.com to janedoe@example.com about that talk about 'Arcade AI' from yesterday",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_threads,
                args={
                    "sender": "johndoe@example.com",
                    "recipient": "janedoe@example.com",
                    "body": "Arcade AI",
                    # Use the enum's plain value, as the other suites in this
                    # file do for enum-valued expected arguments.
                    "date_range": DateRange.YESTERDAY.value,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="sender", weight=0.25),
            BinaryCritic(critic_field="recipient", weight=0.25),
            SimilarityCritic(critic_field="body", weight=0.25),
            BinaryCritic(critic_field="date_range", weight=0.25),
        ],
    )

    suite.add_case(
        name="Get a thread by ID",
        user_message="Get the thread r-124325435467568867667878874565464564563523424323524235242412",
        expected_tool_calls=[
            ExpectedToolCall(
                func=get_thread,
                args={
                    "thread_id": "r-124325435467568867667878874565464564563523424323524235242412",
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="thread_id", weight=1.0),
        ],
    )

    return suite
@tool_eval()
def gmail_reply_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the Gmail reply tools.

    Covers ``reply_to_email`` (three cases) and ``write_draft_reply_email``
    (two cases). Every case replays the same prior conversation, in which two
    emails were listed, and expects the reply to target the "test 2" message
    (id ``9475tvy24578yx``).

    Returns:
        The populated ``EvalSuite``.
    """

    def reply_critics() -> list:
        """Return a fresh list of the seven equally weighted reply critics.

        A new list is built on every call so that no critic instance is
        shared between cases, and every weight is the same ``1 / 7`` so the
        weights of a case always add up to one.
        """
        # NOTE(review): "subject", "recipient" and "cc" never appear in the
        # expected args of the cases below ("bcc" appears in one) — confirm
        # how the critics score fields that have no expected value.
        weight = 1 / 7
        return [
            SimilarityCritic(critic_field="subject", weight=weight),
            SimilarityCritic(critic_field="body", weight=weight),
            BinaryCritic(critic_field="recipient", weight=weight),
            BinaryCritic(critic_field="cc", weight=weight),
            BinaryCritic(critic_field="bcc", weight=weight),
            BinaryCritic(critic_field="reply_to_whom", weight=weight),
            BinaryCritic(critic_field="reply_to_message_id", weight=weight),
        ]

    suite = EvalSuite(
        name="Gmail Reply Tools Evaluation",
        system_message="You are an AI assistant that can send and manage emails using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Prior conversation shared by all cases: the user asked for the latest
    # emails from John Doe and the tool returned two of them.
    # NOTE(review): the history uses johndoe@gmail.com while the prompts below
    # say johndoe@example.com — confirm whether the mismatch is intentional.
    email_history = [
        {"role": "user", "content": "get the latest emails I received from johndoe@gmail.com"},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_jowMD7aB9sVPClOfvNof7Llu",
                    "type": "function",
                    "function": {
                        "name": "Google_ListEmailsByHeader",
                        "arguments": json.dumps({
                            "sender": "johndoe@gmail.com",
                            "max_results": 5,
                        }),
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": json.dumps({
                "emails": [
                    {
                        "body": "test 1",
                        "cc": "",
                        "date": "Tue, 11 Feb 2025 11:33:08 -0300",
                        "from": "John Doe <johndoe@gmail.com>",
                        "header_message_id": "<cierty475cty7245yxq@mail.gmail.com>",
                        "history_id": "123456",
                        "id": "q34759q435nv",
                        "in_reply_to": "",
                        "label_ids": ["INBOX"],
                        "references": "",
                        "reply_to": "",
                        "snippet": "test 1",
                        "subject": "test 1",
                        "thread_id": "345y6v3596",
                        "to": "myself@gmail.com",
                    },
                    {
                        "body": "test 2",
                        "cc": "",
                        "date": "Mon, 20 Jan 2025 13:04:42 -0800",
                        "from": "John Doe <johndoe@gmail.com>",
                        "header_message_id": "<28745ytvw8745ct4@mail.gmail.com>",
                        "history_id": "3456758",
                        "id": "9475tvy24578yx",
                        "in_reply_to": "",
                        "label_ids": [],
                        "references": "",
                        "reply_to": "",
                        "snippet": "test 2",
                        "subject": "test 2",
                        "thread_id": "249576v3496",
                        "to": "myself@gmail.com",
                    },
                ]
            }),
            "tool_call_id": "call_jowMD7aB9sVPClOfvNof7Llu",
            "name": "Google_ListEmailsByHeader",
        },
        {
            "role": "assistant",
            "content": "Here are the latest emails you received from johndoe@gmail.com:\n\n1. **Subject**: test 1\n - **Date**: Tue, 11 Feb 2025 11:33:08 -0300\n - **Snippet**: test 1\n\n2. **Subject**: test 2\n - **Date**: Mon, 20 Jan 2025 13:04:42 -0800\n - **Snippet**: test 2\n\nIf you need further details from any specific email, let me know!",
        },
    ]

    suite.add_case(
        name="Reply to an email",
        user_message="Reply to the email from johndoe@example.com about 'test 2' saying 'tested and working well'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=reply_to_email,
                args={
                    "reply_to_message_id": "9475tvy24578yx",
                    "body": "tested and working well",
                    "reply_to_whom": GmailReplyToWhom.ONLY_THE_SENDER.value,
                },
            )
        ],
        critics=reply_critics(),
        additional_messages=email_history,
    )

    suite.add_case(
        name="Reply to an email with every recipient",
        user_message="Reply to every recipient in the email from johndoe@example.com about 'test 2' saying 'tested and working well'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=reply_to_email,
                args={
                    "reply_to_message_id": "9475tvy24578yx",
                    "body": "tested and working well",
                    "reply_to_whom": GmailReplyToWhom.EVERY_RECIPIENT.value,
                },
            )
        ],
        critics=reply_critics(),
        additional_messages=email_history,
    )

    suite.add_case(
        name="Reply to an email with bcc",
        user_message="Reply to the email from johndoe@example.com about 'test 2' saying 'tested and working well' and send it to janedoe@example.com as bcc as well",
        expected_tool_calls=[
            ExpectedToolCall(
                func=reply_to_email,
                args={
                    "reply_to_message_id": "9475tvy24578yx",
                    "body": "tested and working well",
                    "bcc": ["janedoe@example.com"],
                    "reply_to_whom": GmailReplyToWhom.ONLY_THE_SENDER.value,
                },
            )
        ],
        critics=reply_critics(),
        additional_messages=email_history,
    )

    suite.add_case(
        name="Write draft reply",
        user_message="Write a draft reply to the email from johndoe@example.com about 'test 2' saying 'tested and working well'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=write_draft_reply_email,
                args={
                    "reply_to_message_id": "9475tvy24578yx",
                    "body": "tested and working well",
                    "reply_to_whom": GmailReplyToWhom.ONLY_THE_SENDER.value,
                },
            )
        ],
        critics=reply_critics(),
        additional_messages=email_history,
    )

    suite.add_case(
        name="Write draft reply to every recipient",
        user_message="Write a draft reply to every recipient in the email from johndoe@example.com about 'test 2' saying 'tested and working well'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=write_draft_reply_email,
                args={
                    "reply_to_message_id": "9475tvy24578yx",
                    "body": "tested and working well",
                    "reply_to_whom": GmailReplyToWhom.EVERY_RECIPIENT.value,
                },
            )
        ],
        critics=reply_critics(),
        additional_messages=email_history,
    )

    return suite
@tool_eval()
def gmail_list_emails_by_header_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the ``list_emails_by_header`` tool.

    Registers one case per date-range phrasing. Each case asks for emails
    between the same sender and recipient about the same subject and differs
    only in the date range the prompt mentions.

    Returns:
        The populated ``EvalSuite``.
    """
    suite = EvalSuite(
        name="Gmail list_emails_by_header tool evaluation",
        system_message="You are an AI assistant that can send and manage emails using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # (phrase used in the prompt, date range expected in the tool call)
    date_range_cases = [
        ("yesterday", DateRange.YESTERDAY),
        ("the last month", DateRange.LAST_MONTH),
    ]

    for phrase, date_range in date_range_cases:
        suite.add_case(
            # The phrase is part of the name so the cases can be told apart.
            name=f"List emails by header using date-range ({phrase})",
            user_message=(
                "List all emails from johndoe@example.com to janedoe@example.com "
                f"about 'Arcade AI' from {phrase}"
            ),
            expected_tool_calls=[
                ExpectedToolCall(
                    func=list_emails_by_header,
                    args={
                        "sender": "johndoe@example.com",
                        "recipient": "janedoe@example.com",
                        "subject": "Arcade AI",
                        "date_range": date_range.value,
                    },
                )
            ],
            critics=[
                BinaryCritic(critic_field="sender", weight=1 / 4),
                BinaryCritic(critic_field="recipient", weight=1 / 4),
                SimilarityCritic(critic_field="subject", weight=1 / 4),
                BinaryCritic(critic_field="date_range", weight=1 / 4),
            ],
        )

    return suite