diff --git a/toolkits/web/arcade_web/__init__.py b/toolkits/web/arcade_web/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/toolkits/web/arcade_web/tools/__init__.py b/toolkits/web/arcade_web/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/toolkits/web/arcade_web/tools/firecrawl.py b/toolkits/web/arcade_web/tools/firecrawl.py
new file mode 100644
index 00000000..ada11335
--- /dev/null
+++ b/toolkits/web/arcade_web/tools/firecrawl.py
@@ -0,0 +1,182 @@
+from typing import Annotated, Any, Optional
+
+from firecrawl import FirecrawlApp
+
+from arcade.sdk import tool
+from arcade_web.tools.models import Formats
+from arcade_web.tools.utils import get_secret
+
+
+# TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
+# TODO: Support extract.
+# TODO: Support headers param?
+@tool
+async def scrape_url(
+    url: Annotated[str, "URL to scrape"],
+    formats: Annotated[
+        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
+    ] = None,
+    only_main_content: Annotated[
+        Optional[bool],
+        "Only return the main content of the page excluding headers, navs, footers, etc.",
+    ] = True,
+    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
+    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
+    wait_for: Annotated[
+        Optional[int],
+        "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
+    ] = 10,
+    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
+) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
+    """Scrape a URL using Firecrawl and return the data in specified formats."""
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    # Fall back to markdown when the caller did not request any formats.
+    formats = formats or [Formats.MARKDOWN]
+
+    app = FirecrawlApp(api_key=api_key)
+    # Map the snake_case tool arguments onto the camelCase option names passed to Firecrawl.
+    params = {
+        "formats": formats,
+        "onlyMainContent": only_main_content,
+        "includeTags": include_tags or [],
+        "excludeTags": exclude_tags or [],
+        "waitFor": wait_for,
+        "timeout": timeout,
+    }
+    response = app.scrape_url(url, params=params)
+
+    return response
+
+
+# TODO: Support scrapeOptions.
+@tool
+async def crawl_website(
+    url: Annotated[str, "URL to crawl"],
+    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
+    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
+    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
+    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
+    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
+    allow_backward_links: Annotated[
+        bool,
+        "Enable navigation to previously linked pages and enable crawling sublinks that are not children of the 'url' input parameter.",
+    ] = False,
+    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
+    webhook: Annotated[
+        Optional[str],
+        "The URL to send a POST request to when the crawl is started, updated and completed.",
+    ] = None,
+    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
+) -> Annotated[dict[str, Any], "Crawl status and data"]:
+    """
+    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
+    If the crawl is synchronous, then returns the crawl data.
+    """
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    app = FirecrawlApp(api_key=api_key)
+    params = {
+        "limit": limit,
+        "excludePaths": exclude_paths or [],
+        "includePaths": include_paths or [],
+        "maxDepth": max_depth,
+        "ignoreSitemap": ignore_sitemap,
+        "allowBackwardLinks": allow_backward_links,
+        "allowExternalLinks": allow_external_links,
+    }
+    if webhook:
+        params["webhook"] = webhook
+
+    if async_crawl:
+        response = app.async_crawl_url(url, params=params)
+        if (
+            "url" in response
+        ):  # Url isn't clickable, so removing it since only the ID is needed to check status
+            del response["url"]
+    else:
+        response = app.crawl_url(url, params=params)
+
+    return response
+
+
+@tool
+async def get_crawl_status(
+    crawl_id: Annotated[str, "The ID of the crawl job"],
+) -> Annotated[dict[str, Any], "Crawl status information"]:
+    """
+    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
+    """
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    app = FirecrawlApp(api_key=api_key)
+    crawl_status = app.check_crawl_status(crawl_id)
+
+    if "data" in crawl_status:
+        del crawl_status["data"]
+
+    return crawl_status
+
+
+# TODO: Support responses greater than 10 MB. If the response is greater than 10 MB, then the Firecrawl API response will have a next_url field.
+@tool
+async def get_crawl_data(
+    crawl_id: Annotated[str, "The ID of the crawl job"],
+) -> Annotated[dict[str, Any], "Crawl data information"]:
+    """
+    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
+    """
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    app = FirecrawlApp(api_key=api_key)
+    crawl_data = app.check_crawl_status(crawl_id)
+
+    return crawl_data
+
+
+@tool
+async def cancel_crawl(
+    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
+) -> Annotated[dict[str, Any], "Cancellation status information"]:
+    """
+    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
+    """
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    app = FirecrawlApp(api_key=api_key)
+    cancellation_status = app.cancel_crawl(crawl_id)
+
+    return cancellation_status
+
+
+@tool
+async def map_website(
+    url: Annotated[str, "The base URL to start crawling from"],
+    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
+    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
+    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
+    limit: Annotated[int, "Maximum number of links to return"] = 5000,
+) -> Annotated[dict[str, Any], "Website map data"]:
+    """
+    Map a website from a single URL to a map of the entire website.
+    """
+
+    api_key = get_secret("FIRECRAWL_API_KEY")
+
+    app = FirecrawlApp(api_key=api_key)
+    params = {
+        "ignoreSitemap": ignore_sitemap,
+        "includeSubdomains": include_subdomains,
+        "limit": limit,
+    }
+    if search:
+        params["search"] = search
+
+    map_result = app.map_url(url, params=params)
+
+    return map_result
diff --git a/toolkits/web/arcade_web/tools/models.py b/toolkits/web/arcade_web/tools/models.py
new file mode 100644
index 00000000..2e823940
--- /dev/null
+++ b/toolkits/web/arcade_web/tools/models.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+# Models and enums for firecrawl web tools
+class Formats(str, Enum):
+    MARKDOWN = "markdown"
+    HTML = "html"
+    RAW_HTML = "rawHtml"
+    LINKS = "links"
+    SCREENSHOT = "screenshot"
+    SCREENSHOT_AT_FULL_PAGE = "screenshot@fullPage"
diff --git a/toolkits/web/arcade_web/tools/utils.py b/toolkits/web/arcade_web/tools/utils.py
new file mode 100644
index 00000000..c03b10da
--- /dev/null
+++ b/toolkits/web/arcade_web/tools/utils.py
@@ -0,0 +1,7 @@
+import os
+from typing import Any, Optional
+
+
+def get_secret(name: str, default: Optional[Any] = None) -> Any:
+    """Return the environment variable ``name``, or ``default`` if it is unset."""
+    return os.getenv(name, default)
diff --git a/toolkits/web/evals/eval_firecrawl.py b/toolkits/web/evals/eval_firecrawl.py
new file mode 100644
index 00000000..6927d1cd
--- /dev/null
+++ b/toolkits/web/evals/eval_firecrawl.py
@@ -0,0 +1,243 @@
+import arcade_web
+from arcade_web.tools.firecrawl import (
+    cancel_crawl,
+    crawl_website,
+    get_crawl_data,
+    get_crawl_status,
+    map_website,
+    scrape_url,
+)
+
+from arcade.core.catalog import ToolCatalog
+from arcade.sdk.eval import (
+    BinaryCritic,
+    EvalRubric,
+    EvalSuite,
+    NumericCritic,
+    tool_eval,
+)
+from arcade.sdk.eval.critic import SimilarityCritic
+
+# Evaluation rubric
+rubric = EvalRubric(
+    fail_threshold=0.9,
+    warn_threshold=0.95,
+)
+
+catalog = ToolCatalog()
+# Register the Firecrawl tools
+catalog.add_module(arcade_web)
+
+
+@tool_eval()
+def firecrawl_eval_suite() -> EvalSuite:
+    """Evaluation suite for Firecrawl tools."""
+    suite = EvalSuite(
+        name="Firecrawl Tools Evaluation Suite",
+        system_message="You are an AI assistant that helps users interact with web scraping and crawling tools using the provided tools.",
+        catalog=catalog,
+        rubric=rubric,
+    )
+
+    # Scrape URL
+    suite.add_case(
+        name="Scrape a URL",
+        user_message="Scrape https://foobar.com/malicious/malware/that/will/harm/you in markdown format please. Wait for 10 seconds before fetching the content.",
+        expected_tool_calls=[
+            (
+                scrape_url,
+                {
+                    "url": "https://foobar.com/malicious/malware/that/will/harm/you",
+                    "formats": ["markdown"],
+                    "wait_for": 10000,
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="url", weight=0.4),
+            BinaryCritic(critic_field="formats", weight=0.4),
+            NumericCritic(critic_field="wait_for", weight=0.2, value_range=(9000, 11000)),
+        ],
+    )
+
+    # Crawl Website
+    suite.add_case(
+        name="Crawl a website",
+        user_message="Crawl the website at https://wikipedia.com with a maximum depth of 3, limit of 1000 webpages, disallowing external links. Updates should be sent to http://example.com/crawl-updates. Oh and do it in the background. THanks",
+        expected_tool_calls=[
+            (
+                crawl_website,
+                {
+                    "url": "https://wikipedia.com",
+                    "max_depth": 3,
+                    "limit": 1000,
+                    "allow_external_links": False,
+                    "webhook": "http://example.com/crawl-updates",
+                    "async_crawl": True,
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="url", weight=0.2),
+            BinaryCritic(critic_field="max_depth", weight=0.1),
+            BinaryCritic(critic_field="limit", weight=0.1),
+            BinaryCritic(critic_field="allow_external_links", weight=0.1),
+            BinaryCritic(critic_field="webhook", weight=0.2),
+            BinaryCritic(critic_field="async_crawl", weight=0.2),
+        ],
+    )
+
+    # Get Crawl Status
+    suite.add_case(
+        name="Get crawl status",
+        user_message="Check the status of my crawl",
+        expected_tool_calls=[
+            (
+                get_crawl_status,
+                {
+                    "crawl_id": "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b",
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="crawl_id", weight=1.0),
+        ],
+        additional_messages=[
+            {"role": "user", "content": "crawl asynchronously https://www.google.com"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                        "type": "function",
+                        "function": {
+                            "name": "Web_CrawlWebsite",
+                            "arguments": '{"url":"https://www.google.com","async_crawl":true}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "content": '{"id":"2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b","success":true,"url":"https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"}',
+                "tool_call_id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                "name": "Web_CrawlWebsite",
+            },
+            {
+                "role": "assistant",
+                "content": "The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link](https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b).",
+            },
+        ],
+    )
+
+    # Get Crawl Data
+    suite.add_case(
+        name="Get crawl data",
+        user_message="Ok looks like the crawl is done, can I get the result please?",
+        expected_tool_calls=[
+            (
+                get_crawl_data,
+                {
+                    "crawl_id": "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b",
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="crawl_id", weight=1.0),
+        ],
+        additional_messages=[
+            {"role": "user", "content": "crawl asynchronously https://www.google.com"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                        "type": "function",
+                        "function": {
+                            "name": "Web_CrawlWebsite",
+                            "arguments": '{"url":"https://www.google.com","async_crawl":true}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "content": '{"id":"2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b","success":true,"url":"https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"}',
+                "tool_call_id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                "name": "Web_CrawlWebsite",
+            },
+            {
+                "role": "assistant",
+                "content": "The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link](https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b).",
+            },
+        ],
+    )
+
+    # Cancel Crawl
+    suite.add_case(
+        name="Cancel crawl",
+        user_message="Actually cancel it.",
+        expected_tool_calls=[
+            (
+                cancel_crawl,
+                {
+                    "crawl_id": "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b",
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="crawl_id", weight=1.0),
+        ],
+        additional_messages=[
+            {"role": "user", "content": "crawl asynchronously https://www.google.com"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                        "type": "function",
+                        "function": {
+                            "name": "Web_CrawlWebsite",
+                            "arguments": '{"url":"https://www.google.com","async_crawl":true}',
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "content": '{"id":"2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b","success":true,"url":"https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"}',
+                "tool_call_id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
+                "name": "Web_CrawlWebsite",
+            },
+            {
+                "role": "assistant",
+                "content": "The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link](https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b).",
+            },
+        ],
+    )
+
+    # Map Website
+    suite.add_case(
+        name="Map a website",
+        user_message="Map the website at https://wikipedia.com with a limit of 100000 links. Only the links that are about the topic of AI",
+        expected_tool_calls=[
+            (
+                map_website,
+                {
+                    "url": "https://wikipedia.com",
+                    "search": "AI",
+                    "limit": 100000,
+                },
+            )
+        ],
+        critics=[
+            BinaryCritic(critic_field="url", weight=0.4),
+            SimilarityCritic(critic_field="search", weight=0.2),
+            NumericCritic(critic_field="limit", weight=0.4, value_range=(90000, 110000)),
+        ],
+    )
+
+    return suite
diff --git a/toolkits/web/pyproject.toml b/toolkits/web/pyproject.toml
new file mode 100644
index 00000000..872024ad
--- /dev/null
+++ b/toolkits/web/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "arcade_web"
+version = "0.1.0"
+description = "LLM tools for web-related tasks"
+authors = ["Arcade AI "]
+
+[tool.poetry.dependencies]
+python = "^3.10"
+arcade-ai = "^0.1.0"
+firecrawl-py = "^1.3.1"
+
+[tool.poetry.dev-dependencies]
+pytest = "^8.3.0"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/toolkits/web/tests/__init__.py b/toolkits/web/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/toolkits/web/tests/test_firecrawl.py b/toolkits/web/tests/test_firecrawl.py
new file mode 100644
index 00000000..98120153
--- /dev/null
+++ b/toolkits/web/tests/test_firecrawl.py
@@ -0,0 +1,97 @@
+from unittest.mock import AsyncMock, patch
+
+import pytest
+from arcade_web.tools.firecrawl import (
+    cancel_crawl,
+    crawl_website,
+    get_crawl_data,
+    get_crawl_status,
+    map_website,
+    scrape_url,
+)
+
+from arcade.sdk.error import ToolExecutionError
+
+
+@pytest.fixture
+def mock_context():
+    context = AsyncMock()
+    context.authorization.token = "mock_token"  # noqa: S105
+    return context
+
+
+@pytest.fixture
+def mock_firecrawl_app():
+    with patch("arcade_web.tools.firecrawl.FirecrawlApp") as app:
+        yield app.return_value
+
+
+@pytest.mark.asyncio
+async def test_scrape_url_success(mock_firecrawl_app):
+    mock_firecrawl_app.scrape_url.return_value = {"data": "scraped content"}
+
+    result = await scrape_url("http://example.com")
+    assert result == {"data": "scraped content"}
+
+
+@pytest.mark.asyncio
+async def test_crawl_website_success(mock_firecrawl_app):
+    mock_firecrawl_app.async_crawl_url.return_value = {"crawl_id": "12345"}
+
+    result = await crawl_website("http://example.com")
+    assert result == {"crawl_id": "12345"}
+
+
+@pytest.mark.asyncio
+async def test_get_crawl_status_success(mock_firecrawl_app):
+    mock_firecrawl_app.check_crawl_status.return_value = {"status": "completed"}
+
+    result = await get_crawl_status("12345")
+    assert result == {"status": "completed"}
+
+
+@pytest.mark.asyncio
+async def test_get_crawl_data_success(mock_firecrawl_app):
+    mock_firecrawl_app.check_crawl_status.return_value = {"data": "crawl data"}
+
+    result = await get_crawl_data("12345")
+    assert result == {"data": "crawl data"}
+
+
+@pytest.mark.asyncio
+async def test_cancel_crawl_success(mock_firecrawl_app):
+    mock_firecrawl_app.cancel_crawl.return_value = {"status": "cancelled"}
+
+    result = await cancel_crawl("12345")
+    assert result == {"status": "cancelled"}
+
+
+@pytest.mark.asyncio
+async def test_map_website_success(mock_firecrawl_app):
+    mock_firecrawl_app.map_url.return_value = {"map": "website map"}
+
+    result = await map_website("http://example.com")
+    assert result == {"map": "website map"}
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "method,params,error_message",
+    [
+        (scrape_url, ("http://example.com",), "Error scraping URL"),
+        (crawl_website, ("http://example.com",), "Error crawling website"),
+        (get_crawl_status, ("12345",), "Error getting crawl status"),
+        (get_crawl_data, ("12345",), "Error getting crawl data"),
+        (cancel_crawl, ("12345",), "Error cancelling crawl"),
+        (map_website, ("http://example.com",), "Error mapping website"),
+    ],
+)
+async def test_firecrawl_error(mock_firecrawl_app, method, params, error_message):
+    mock_firecrawl_app.scrape_url.side_effect = Exception(error_message)
+    mock_firecrawl_app.async_crawl_url.side_effect = Exception(error_message)
+    mock_firecrawl_app.check_crawl_status.side_effect = Exception(error_message)
+    mock_firecrawl_app.cancel_crawl.side_effect = Exception(error_message)
+    mock_firecrawl_app.map_url.side_effect = Exception(error_message)
+
+    with pytest.raises(ToolExecutionError):
+        await method(*params)