# PR Description ## Split toolkits This PR splits the `Microsoft`, `Google`, and `Search` toolkits into multiple toolkits each. * `Microsoft` --> `OutlookCalendar`, `OutlookMail`. * `Google` -----> `GoogleCalendar`, `GoogleContacts`, `GoogleDocs`, `GoogleDrive`, `Gmail`, `GoogleSheets` * `Search` -----> `GoogleFinance`, `GoogleFlights`, `GoogleHotels`, `GoogleJobs`, `GoogleMaps`, `GoogleNews`, `GoogleSearch`, `GoogleShopping`, `Walmart`, `Youtube` > The original monolithic toolkits (`Microsoft`, `Google`, `Search`) are not removed in this PR. The plan is to keep those toolkits around while we > 1. Stop documenting the toolkits, > 2. Stop displaying the toolkits in the dashboard, and > 3. Help customers migrate over to the new split toolkits. ## Rename toolkits This PR renames the following toolkits * `Web` ------------> `Firecrawl` * `CodeSandbox` ---> `E2B` > The `Web` and `CodeSandbox` toolkits are not removed in this PR. The plan is to keep them around while we > 1. Stop documenting the toolkits, > 2. Stop displaying the toolkits in the dashboard, and > 3. Help customers migrate over to the new renamed toolkits. ## Rename tools Since toolkit names were changed, this called for some tools to be renamed as well. * `GoogleSearch.SearchGoogle` ----------------> `GoogleSearch.Search` * `GoogleShopping.SearchShoppingProducts` ---> `GoogleShopping.SearchProducts` * `Walmart.SearchWalmartProducts` ------------> `Walmart.SearchProducts` * `Walmart.GetWalmartProductDetails` ---------> `Walmart.GetProductDetails` * `Youtube.SearchYoutubeVideos` --------------> `Youtube.SearchForVideos` ## Google File Picker Improvements to the Google File Picker experience were also added in this PR. 
The following tools will ALWAYS provide llm_instructions in their response to "let the end-user know that they have the option to select more files via the file picker url if they want to": * `GoogleDocs.SearchDocuments` * `GoogleDocs.SearchAndRetrieveDocuments` * `GoogleDrive.GetFileTreeStructure` The following tools will only provide the file picker URL if a 404 or 403 is returned from the Google API: * `GoogleDocs.InsertTextAtEndOfDocument` * `GoogleDocs.GetDocumentById` * `GoogleSheets.GetSpreadsheet` * `GoogleSheets.WriteToCell` Also, a standalone `GoogleDrive.GenerateGoogleFilePickerUrl` tool exists. ## Other * The `SearchDocuments` and `SearchAndRetrieveDocuments` tools used to be organized within the Drive portion of the Google toolkit, but I moved these into the new GoogleDocs toolkit because they are specific to Docs. # Progress - [x] `OutlookCalendar` - [x] `OutlookMail` - [x] `GoogleFinance` - [x] `GoogleFlights` - [x] `GoogleHotels` - [x] `GoogleJobs` - [x] `GoogleMaps` - [x] `GoogleNews` - [x] `GoogleSearch` - [x] `GoogleShopping` - [x] `Walmart` - [x] `Youtube` - [x] `GoogleCalendar` - [x] `GoogleContacts` - [x] `GoogleDocs` - [x] `GoogleDrive` - [x] `Gmail` - [x] `GoogleSheets` - [x] `Firecrawl` - [x] `E2B` - [x] File picker # Discussion * Repeated code is a consequence of splitting toolkits that use the same provider. I am open to any ideas that would allow multiple toolkits to reference common code. Comment your ideas in this PR.
384 lines
15 KiB
Python
384 lines
15 KiB
Python
from arcade_evals import (
|
|
BinaryCritic,
|
|
EvalRubric,
|
|
EvalSuite,
|
|
ExpectedToolCall,
|
|
SimilarityCritic,
|
|
tool_eval,
|
|
)
|
|
from arcade_tdk import ToolCatalog
|
|
|
|
import arcade_google_docs
|
|
from arcade_google_docs.enum import DocumentFormat, OrderBy
|
|
from arcade_google_docs.tools import (
|
|
create_blank_document,
|
|
create_document_from_text,
|
|
get_document_by_id,
|
|
insert_text_at_end_of_document,
|
|
search_and_retrieve_documents,
|
|
search_documents,
|
|
)
|
|
|
|
# Rubric shared by every suite in this module (fail threshold 0.9, warn threshold 0.95).
rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)

# One catalog, populated from the arcade_google_docs package, reused by all suites below.
catalog = ToolCatalog()
catalog.add_module(arcade_google_docs)
|
|
|
|
|
|
@tool_eval()
def docs_eval_suite() -> EvalSuite:
    """Build the evaluation suite for the Google Docs document tools.

    Six cases are registered: reading two documents by id, appending text to a
    document, a combined read-and-append request, creating a blank document,
    creating a document from text, and a request for which no tool call is
    expected. The first three cases are preceded by a canned conversation in
    which the user's ten most recent documents were listed, so the model has
    to resolve document names ("Tst9", "Tst4", ...) to ids from that history.

    Returns:
        The populated ``EvalSuite``.
    """
    # Ids of the documents the cases refer to; each one also appears in the
    # canned tool response below under the matching "name".
    tst10_id = "1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc"
    tst9_id = "1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts"
    tst4_id = "1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc"
    appended_text = "This is a new paragraph."

    # NOTE(review): the recorded history still uses the tool name
    # "Google_ListDocuments"; confirm whether it should be updated to a tool
    # name that exists in the split Google Docs toolkit.
    listing_tool_name = "Google_ListDocuments"
    listing_call_id = "call_gegK723W2hXsORjBmq1Oexqk"

    user_request = {"role": "user", "content": "list my 10 most recently created docs"}
    authorization_prompt = {
        "role": "assistant",
        "content": "Please go to this URL and authorize the action: [Link](https://accounts.google.com/)",
    }
    listing_tool_call = {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "id": listing_call_id,
                "type": "function",
                "function": {
                    "name": listing_tool_name,
                    "arguments": '{"limit":10,"order_by":"createdTime desc"}',
                },
            }
        ],
    }
    listing_tool_result = {
        "role": "tool",
        "content": '{"documents":[{"id":"1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst10"},{"id":"1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst9"},{"id":"19Dqugn0rVi89K0C__lpg1HbhQOTenccyZOhPgivTHMs","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst8"},{"id":"1RCibzx14eqP3vS9yI4nD13OKf8Vee56RiszS53OkR7I","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst7"},{"id":"1imFb04JQuBn8SiSsRFf6fEuYCyXkbII4KX8fsmnT0jo","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst6"},{"id":"1ZC3oypdfLWFgBd-emeSykJf9tZOae6USsFboygRCr-w","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst5"},{"id":"1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst4"},{"id":"1eQ8UBO_PY3Lem4R8OVdIc9ODXt0MrSUAnEu994Qz8P8","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst3"},{"id":"1TOxB0MLry-JzntDWDT1LFywTLdr3XDWPT5L5UsHMs5c","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst2"},{"id":"1a1UQ7C90s8kGfnO8k6wfAZz_Cy5nGN2MkCoRB5y2j3w","kind":"drive#file","mimeType":"application/vnd.google-apps.document","name":"Tst1"}],"documents_count":10}',
        "tool_call_id": listing_call_id,
        "name": listing_tool_name,
    }
    listing_summary = {
        "role": "assistant",
        "content": "Here are your 10 most recently created Google Docs:\n\n1. [Tst10](https://docs.google.com/document/d/1e0rCoT1Yd14WuuEvd3hSUcN_-VD3df4T3Q08uLm3TWc)\n2. [Tst9](https://docs.google.com/document/d/1eTSWd-5zQds8K9OWYygwtCFMUyuuMize3bh3HaRsKts)\n3. [Tst8](https://docs.google.com/document/d/19Dqugn0rVi89K0C__lpg1HbhQOTenccyZOhPgivTHMs)\n4. [Tst7](https://docs.google.com/document/d/1RCibzx14eqP3vS9yI4nD13OKf8Vee56RiszS53OkR7I)\n5. [Tst6](https://docs.google.com/document/d/1imFb04JQuBn8SiSsRFf6fEuYCyXkbII4KX8fsmnT0jo)\n6. [Tst5](https://docs.google.com/document/d/1ZC3oypdfLWFgBd-emeSykJf9tZOae6USsFboygRCr-w)\n7. [Tst4](https://docs.google.com/document/d/1-gFGNWmwLxEiKa6NNixLNq3X-phXRMORVZfVTfBg8Sc)\n8. [Tst3](https://docs.google.com/document/d/1eQ8UBO_PY3Lem4R8OVdIc9ODXt0MrSUAnEu994Qz8P8)\n9. [Tst2](https://docs.google.com/document/d/1TOxB0MLry-JzntDWDT1LFywTLdr3XDWPT5L5UsHMs5c)\n10. [Tst1](https://docs.google.com/document/d/1a1UQ7C90s8kGfnO8k6wfAZz_Cy5nGN2MkCoRB5y2j3w)\n\nYou can click the links to open each document.",
    }

    # The same history list is handed to every case that refers to documents by name.
    prior_conversation = [
        user_request,
        authorization_prompt,
        listing_tool_call,
        listing_tool_result,
        listing_summary,
    ]

    def expect_read(document_id):
        """Expected ``get_document_by_id`` call for one document id."""
        return ExpectedToolCall(func=get_document_by_id, args={"document_id": document_id})

    def expect_append(document_id, text):
        """Expected ``insert_text_at_end_of_document`` call."""
        return ExpectedToolCall(
            func=insert_text_at_end_of_document,
            args={"document_id": document_id, "text_content": text},
        )

    suite = EvalSuite(
        name="Google Docs Tools Evaluation",
        system_message="You are an AI assistant that can create and manage Google Docs using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Two reads in a single request.
    # NOTE(review): the only critic here has weight 0.6 (does not sum to 1.0) -
    # kept as-is; confirm this is intended.
    suite.add_case(
        name="Get document content",
        user_message="Can you read me the contents of Tst9 doc and also Tst10 doc please",
        expected_tool_calls=[expect_read(tst9_id), expect_read(tst10_id)],
        critics=[BinaryCritic(critic_field="document_id", weight=0.6)],
        additional_messages=prior_conversation,
    )

    # Single append; the id must match exactly, the text only needs to be similar.
    suite.add_case(
        name="Insert text at end of document",
        user_message="Please add the text 'This is a new paragraph.' to the end of Tst4.",
        expected_tool_calls=[expect_append(tst4_id, appended_text)],
        critics=[
            BinaryCritic(critic_field="document_id", weight=0.5),
            SimilarityCritic(critic_field="text_content", weight=0.5),
        ],
        additional_messages=prior_conversation,
    )

    # Combined request: the append is listed first, followed by the two reads.
    # NOTE(review): critic weights total 0.6 here as well - kept as-is.
    suite.add_case(
        name="Read the contents of two documents and then insert text at end of a different document.",
        user_message="Can you read me the contents of Tst9 doc and also Tst10 doc please. Also, please add the text 'This is a new paragraph.' to the end of Tst4.",
        expected_tool_calls=[
            expect_append(tst4_id, appended_text),
            expect_read(tst9_id),
            expect_read(tst10_id),
        ],
        critics=[
            BinaryCritic(critic_field="document_id", weight=0.3),
            SimilarityCritic(critic_field="text_content", weight=0.3),
        ],
        additional_messages=prior_conversation,
    )

    # The creation cases below run without any conversation history.
    suite.add_case(
        name="Create blank document",
        user_message="Create a new Doc titled 'Meeting Notes'.",
        expected_tool_calls=[
            ExpectedToolCall(func=create_blank_document, args={"title": "Meeting Notes"})
        ],
        critics=[SimilarityCritic(critic_field="title", weight=1.0)],
    )

    # The expected body is newline-separated while the prompt is comma-separated;
    # the text is scored with a similarity critic rather than a binary one.
    suite.add_case(
        name="Create document from text",
        user_message="Create a new doc called To-Do List with the content 'Buy groceries, Call mom, Finish report'.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=create_document_from_text,
                args={
                    "title": "To-Do List",
                    "text_content": "Buy groceries\nCall mom\nFinish report",
                },
            )
        ],
        critics=[
            SimilarityCritic(critic_field="title", weight=0.5),
            SimilarityCritic(critic_field="text_content", weight=0.5),
        ],
    )

    # Negative case: a Microsoft Word request should not trigger any tool call.
    suite.add_case(
        name="No tool call case",
        user_message="Create a new microsoft word document titled 'My Resume'.",
        expected_tool_calls=[],
        critics=[],
    )

    return suite
|
|
|
|
|
|
@tool_eval()
def search_documents_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the Google Docs ``search_documents`` tool.

    The cases cover ordering and limiting results, keyword filters, restricting
    the search to a specific shared drive, including organization-domain
    documents, excluding shared drives, and a request for which no tool call
    is expected.

    Returns:
        The populated ``EvalSuite``.
    """
    suite = EvalSuite(
        # Distinct label so this suite's results are not confused with the
        # search-and-retrieve suite defined in the same module.
        name="Google Docs Search Documents Evaluation",
        system_message="You are an AI assistant that can manage Google Drive documents using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Ordering + limit.
    suite.add_case(
        name="Search documents in Google Drive",
        user_message="get my 49 most recently created documents, list the ones created most recently first.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_documents,
                args={
                    "order_by": [OrderBy.CREATED_TIME_DESC.value],
                    "limit": 49,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="order_by", weight=0.5),
            BinaryCritic(critic_field="limit", weight=0.5),
        ],
    )

    # Keyword filter: one single word and one multi-word phrase.
    suite.add_case(
        name="Search documents in Google Drive based on document keywords",
        user_message="Search the documents that contain the word 'greedy' and the phrase 'hello, world'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_documents,
                args={
                    "document_contains": ["greedy", "hello, world"],
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_contains", weight=1.0),
        ],
    )

    # Same keyword filter, restricted to one shared drive id.
    suite.add_case(
        name="Search documents in a specific Google Drive based on document keywords",
        user_message="Search the documents that contain the word 'greedy' and the phrase 'hello, world' in the drive with id 'abc123'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_documents,
                args={
                    "document_contains": ["greedy", "hello, world"],
                    "search_only_in_shared_drive_id": "abc123",
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="search_only_in_shared_drive_id", weight=0.5),
            BinaryCritic(critic_field="document_contains", weight=0.5),
        ],
    )

    # Keyword filter widened to the organization domain.
    suite.add_case(
        name="Search documents in a Google Drive Workspace organization domain based on document keywords",
        user_message="Search the documents that contain the phrase 'hello, world' in the organization domain",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_documents,
                args={
                    "document_contains": ["hello, world"],
                    "include_organization_domain_documents": True,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="include_organization_domain_documents", weight=0.5),
            BinaryCritic(critic_field="document_contains", weight=0.5),
        ],
    )

    # The prompt asks to EXCLUDE shared drives, so the case name says so.
    # NOTE(review): the prompt also hints at an ordering ("nobody has touched
    # in forever") but no order_by is expected or scored - confirm intended.
    suite.add_case(
        name="Search documents excluding shared drives",
        user_message="Search the 5 documents from all drives corpora that nobody has touched in forever, excluding shared drives.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_documents,
                args={
                    "limit": 5,
                    "include_shared_drives": False,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="include_shared_drives", weight=0.5),
            BinaryCritic(critic_field="limit", weight=0.5),
        ],
    )

    # Negative case: a Microsoft OneDrive request should not trigger any tool call.
    suite.add_case(
        name="No tool call case",
        user_message="List my 10 most recently modified documents that are stored in my Microsoft OneDrive.",
        expected_tool_calls=[],
        critics=[],
    )

    return suite
|
|
|
|
|
|
@tool_eval()
def search_and_retrieve_documents_eval_suite() -> EvalSuite:
    """Create an evaluation suite for the Google Docs ``search_and_retrieve_documents`` tool.

    Each case checks that the model passes the right keyword filters and the
    right ``return_format`` (Markdown, HTML or Google API JSON).

    Returns:
        The populated ``EvalSuite``.
    """
    suite = EvalSuite(
        # Distinct label so this suite's results are not confused with the
        # search_documents suite defined in the same module.
        name="Google Docs Search and Retrieve Documents Evaluation",
        system_message="You are an AI assistant that can manage Google Drive documents using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Expected enum arguments are given as ``.value`` so they are plain values
    # like the arguments the model emits, matching how the search_documents
    # suite in this module passes ``OrderBy``.
    markdown_format = DocumentFormat.MARKDOWN.value
    html_format = DocumentFormat.HTML.value
    api_json_format = DocumentFormat.GOOGLE_API_JSON.value

    # No format requested by the user; Markdown is expected.
    # NOTE(review): presumably Markdown is the tool's default - confirm.
    suite.add_case(
        name="Search and retrieve (write summary)",
        user_message="Write a summary of the documents in my Google Drive about 'MX Engineering'",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_and_retrieve_documents,
                args={
                    "document_contains": ["MX Engineering"],
                    "return_format": markdown_format,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_contains", weight=0.5),
            BinaryCritic(critic_field="return_format", weight=0.5),
        ],
    )

    # Explicit HTML request.
    suite.add_case(
        name="Search and retrieve (project proposal)",
        user_message="Display the document contents in HTML format from my Google Drive that contain the phrase 'project proposal'.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_and_retrieve_documents,
                args={
                    "document_contains": ["project proposal"],
                    "return_format": html_format,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_contains", weight=0.5),
            BinaryCritic(critic_field="return_format", weight=0.5),
        ],
    )

    # Explicit JSON request with two required phrases.
    suite.add_case(
        name="Search and retrieve (meeting notes)",
        user_message="Retrieve documents that contain both 'meeting notes' and 'budget' in JSON format.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_and_retrieve_documents,
                args={
                    "document_contains": ["meeting notes", "budget"],
                    "return_format": api_json_format,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_contains", weight=0.5),
            BinaryCritic(critic_field="return_format", weight=0.5),
        ],
    )

    # Inclusion and exclusion filters together; the three critics are weighted equally.
    suite.add_case(
        name="Search and retrieve (Q1 report)",
        user_message="Show me the content of the documents that mention 'Q1 report' but do not include the expression 'Project XYZ'.",
        expected_tool_calls=[
            ExpectedToolCall(
                func=search_and_retrieve_documents,
                args={
                    "document_contains": ["Q1 report"],
                    "document_not_contains": ["Project XYZ"],
                    "return_format": markdown_format,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="document_contains", weight=1 / 3),
            BinaryCritic(critic_field="document_not_contains", weight=1 / 3),
            BinaryCritic(critic_field="return_format", weight=1 / 3),
        ],
    )

    return suite
|