Add Firecrawl Tools For The New `arcade_web` Toolkit (#110)

# PR Description
This PR adds 6 new tools inside the new `arcade_web` toolkit. None of
these tools require auth. They do, however, require the
`FIRECRAWL_API_KEY` environment variable to be set.

The new tools implement the [Firecrawl](https://www.firecrawl.dev/) APIs
`/scrape (POST)`, `/crawl (POST)`, `/crawl/{id} (GET)`, `/crawl/{id}
(DELETE)`, and `/map (POST)`.

The six tools are:
* `Web.ScrapeUrl`: 
- In the future I would like this tool to support actions (clicking,
scrolling, screenshotting, etc) and extract (specify what you want to
scrape) parameters. Firecrawl supports both of these parameters.
* `Web.CrawlWebsite`:
- If `async_crawl` is true, then the tool just returns the id of the
crawl job, which you can later pass to the `Web.GetCrawlData` tool to
retrieve the crawl results. If `async_crawl` is false, then the entire
contents of the crawl are returned.
* `Web.GetCrawlStatus`
- Works for in progress or recently finished crawl jobs (Firecrawl's
limitation)
* `Web.GetCrawlData`
- Works for in progress or recently finished crawl jobs (Firecrawl's
limitation)
* `Web.CancelCrawl`
    - You can cancel an in progress async crawl job
* `Web.MapWebsite`
- This endpoint is in alpha, but it can give you all of the links of an
entire website, or optionally, you can specify in natural language what
type of links you want to map by using the `search` parameter. For
example: "only map webpages that are about AI".
This commit is contained in:
Eric Gustin 2024-10-17 16:10:53 -07:00 committed by GitHub
parent 1c6e3f4495
commit cc2a08ec34
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 557 additions and 0 deletions

View file

View file

@ -0,0 +1,180 @@
from typing import Annotated, Any, Optional
from firecrawl import FirecrawlApp
from arcade.sdk import tool
from arcade_web.tools.models import Formats
from arcade_web.tools.utils import get_secret
# TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
# TODO: Support extract.
# TODO: Support headers param?
@tool
async def scrape_url(
    url: Annotated[str, "URL to scrape"],
    formats: Annotated[
        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
    ] = None,
    only_main_content: Annotated[
        Optional[bool],
        "Only return the main content of the page excluding headers, navs, footers, etc.",
    ] = True,
    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
    wait_for: Annotated[
        Optional[int],
        "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
    ] = 10,
    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
    """Scrape a URL using Firecrawl and return the data in specified formats."""
    # The client is built per call from the FIRECRAWL_API_KEY secret.
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # A missing (None) or empty `formats` falls back to markdown only.
    requested_formats = formats or [Formats.MARKDOWN]

    # Firecrawl expects camelCase option names; absent tag filters are sent
    # as empty lists rather than None.
    return client.scrape_url(
        url,
        params={
            "formats": requested_formats,
            "onlyMainContent": only_main_content,
            "includeTags": include_tags or [],
            "excludeTags": exclude_tags or [],
            "waitFor": wait_for,
            "timeout": timeout,
        },
    )
# TODO: Support scrapeOptions.
@tool
async def crawl_website(
    url: Annotated[str, "URL to crawl"],
    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
    allow_backward_links: Annotated[
        bool,
        "Enable navigation to previously linked pages and enable crawling sublinks that are not children of the 'url' input parameter.",
    ] = False,
    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
    webhook: Annotated[
        Optional[str],
        "The URL to send a POST request to when the crawl is started, updated and completed.",
    ] = None,
    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
) -> Annotated[dict[str, Any], "Crawl status and data"]:
    """
    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
    If the crawl is synchronous, then returns the crawl data.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Firecrawl expects camelCase option names; absent path filters are sent
    # as empty lists rather than None.
    crawl_options = {
        "limit": limit,
        "excludePaths": exclude_paths or [],
        "includePaths": include_paths or [],
        "maxDepth": max_depth,
        "ignoreSitemap": ignore_sitemap,
        "allowBackwardLinks": allow_backward_links,
        "allowExternalLinks": allow_external_links,
    }
    # The webhook key is only sent when a non-empty URL was supplied.
    if webhook:
        crawl_options["webhook"] = webhook

    # Synchronous crawl: hand back whatever the SDK returns, untouched.
    if not async_crawl:
        return client.crawl_url(url, params=crawl_options)

    job = client.async_crawl_url(url, params=crawl_options)
    # Url isn't clickable, so removing it since only the ID is needed to check status
    job.pop("url", None)
    return job
@tool
async def get_crawl_status(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl status information"]:
    """
    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    status = client.check_crawl_status(crawl_id)
    # Drop the crawled content, if present, so only status fields are returned.
    status.pop("data", None)
    return status
# TODO: Support responses greater than 10 MB. If the response is greater than 10 MB, then the Firecrawl API response will have a next_url field.
@tool
async def get_crawl_data(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl data information"]:
    """
    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    # Same SDK call as the status tool, but the response is returned in full.
    return client.check_crawl_status(crawl_id)
@tool
async def cancel_crawl(
    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
) -> Annotated[dict[str, Any], "Cancellation status information"]:
    """
    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    # The SDK's cancellation response is passed through unchanged.
    return client.cancel_crawl(crawl_id)
@tool
async def map_website(
    url: Annotated[str, "The base URL to start crawling from"],
    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
    limit: Annotated[int, "Maximum number of links to return"] = 5000,
) -> Annotated[dict[str, Any], "Website map data"]:
    """
    Map a website from a single URL to a map of the entire website.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Firecrawl expects camelCase option names.
    map_options = {
        "ignoreSitemap": ignore_sitemap,
        "includeSubdomains": include_subdomains,
        "limit": limit,
    }
    # The search key is only sent when a non-empty query was supplied.
    if search:
        map_options["search"] = search

    return client.map_url(url, params=map_options)

View file

@ -0,0 +1,11 @@
from enum import Enum
# Models and enums for firecrawl web tools
class Formats(str, Enum):
    """Output formats that can be requested when scraping a page.

    Members are also plain ``str`` instances, so each one compares equal to
    its wire value (for example ``Formats.MARKDOWN == "markdown"``).
    """

    # Page content variants.
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"

    # Links reported for the page.
    LINKS = "links"

    # Screenshot variants.
    SCREENSHOT = "screenshot"
    SCREENSHOT_AT_FULL_PAGE = "screenshot@fullPage"

View file

@ -0,0 +1,9 @@
import os
from typing import Any, Optional
def get_secret(name: str, default: Optional[Any] = None) -> Any:
    """Look up a secret in the process environment.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not set.

    Returns:
        The variable's value when it is set (even if it is an empty string),
        otherwise ``default`` (``None`` unless the caller supplies one).
    """
    return os.environ.get(name, default)

View file

@ -0,0 +1,243 @@
import arcade_web
from arcade_web.tools.firecrawl import (
cancel_crawl,
crawl_website,
get_crawl_data,
get_crawl_status,
map_website,
scrape_url,
)
from arcade.core.catalog import ToolCatalog
from arcade.sdk.eval import (
BinaryCritic,
EvalRubric,
EvalSuite,
NumericCritic,
tool_eval,
)
from arcade.sdk.eval.critic import SimilarityCritic
# Rubric shared by every case in this module's suite: the fail and warn
# thresholds handed to EvalRubric.
rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)

# Tool catalog the suite evaluates against; the arcade_web module is
# registered with it so the Firecrawl tools are available.
catalog = ToolCatalog()
catalog.add_module(arcade_web)
# Crawl job ID used by the simulated conversation and by the follow-up cases
# that expect the model to reuse it.
_CRAWL_ID = "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"
_CRAWL_STATUS_URL = f"https://api.firecrawl.dev/v1/crawl/{_CRAWL_ID}"


def _async_crawl_history() -> list[dict]:
    """Build the prior conversation shared by the crawl follow-up cases.

    The history shows the user requesting an asynchronous crawl and the
    ``Web_CrawlWebsite`` tool answering with a crawl ID. A new list is built
    on every call so that no two cases share the same message objects.
    """
    tool_call_id = "call_QklpRSDmHdvM3ZZfzOqCKWRN"
    return [
        {"role": "user", "content": "crawl asynchronously https://www.google.com"},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": tool_call_id,
                    "type": "function",
                    "function": {
                        "name": "Web_CrawlWebsite",
                        "arguments": '{"url":"https://www.google.com","async_crawl":true}',
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": f'{{"id":"{_CRAWL_ID}","success":true,"url":"{_CRAWL_STATUS_URL}"}}',
            "tool_call_id": tool_call_id,
            "name": "Web_CrawlWebsite",
        },
        {
            "role": "assistant",
            "content": f"The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link]({_CRAWL_STATUS_URL}).",
        },
    ]


@tool_eval()
def firecrawl_eval_suite() -> EvalSuite:
    """Evaluation suite for Firecrawl tools.

    Adds one case per tool (scrape, crawl, crawl status, crawl data, cancel
    crawl, map). The three crawl follow-up cases rely on the conversation
    from ``_async_crawl_history`` to supply the crawl ID.
    """
    suite = EvalSuite(
        name="Firecrawl Tools Evaluation Suite",
        system_message="You are an AI assistant that helps users interact with web scraping and crawling tools using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Scrape URL
    # The expected URL must be the one the user asked for; otherwise the
    # BinaryCritic on "url" can never match.
    suite.add_case(
        name="Scrape a URL",
        user_message="Scrape https://foobar.com/malicious/malware/that/will/harm/you in markdown format please. Wait for 10 seconds before fetching the content.",
        expected_tool_calls=[
            (
                scrape_url,
                {
                    "url": "https://foobar.com/malicious/malware/that/will/harm/you",
                    "formats": ["markdown"],
                    "wait_for": 10000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            BinaryCritic(critic_field="formats", weight=0.4),
            NumericCritic(critic_field="wait_for", weight=0.2, value_range=(9000, 11000)),
        ],
    )

    # Crawl Website
    suite.add_case(
        name="Crawl a website",
        user_message="Crawl the website at https://wikipedia.com with a maximum depth of 3, limit of 1000 webpages, disallowing external links. Updates should be sent to http://example.com/crawl-updates. Oh and do it in the background. THanks",
        expected_tool_calls=[
            (
                crawl_website,
                {
                    "url": "https://wikipedia.com",
                    "max_depth": 3,
                    "limit": 1000,
                    "allow_external_links": False,
                    "webhook": "http://example.com/crawl-updates",
                    "async_crawl": True,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.2),
            BinaryCritic(critic_field="max_depth", weight=0.1),
            BinaryCritic(critic_field="limit", weight=0.1),
            BinaryCritic(critic_field="allow_external_links", weight=0.1),
            BinaryCritic(critic_field="webhook", weight=0.2),
            BinaryCritic(critic_field="async_crawl", weight=0.2),
        ],
    )

    # Get Crawl Status
    suite.add_case(
        name="Get crawl status",
        user_message="Check the status of my crawl",
        expected_tool_calls=[
            (
                get_crawl_status,
                {
                    "crawl_id": _CRAWL_ID,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Get Crawl Data
    suite.add_case(
        name="Get crawl data",
        user_message="Ok looks like the crawl is done, can I get the result please?",
        expected_tool_calls=[
            (
                get_crawl_data,
                {
                    "crawl_id": _CRAWL_ID,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Cancel Crawl
    suite.add_case(
        name="Cancel crawl",
        user_message="Actually cancel it.",
        expected_tool_calls=[
            (
                cancel_crawl,
                {
                    "crawl_id": _CRAWL_ID,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Map Website
    suite.add_case(
        name="Map a website",
        user_message="Map the website at https://wikipedia.com with a limit of 100000 links. Only the links that are about the topic of AI",
        expected_tool_calls=[
            (
                map_website,
                {
                    "url": "https://wikipedia.com",
                    "search": "AI",
                    "limit": 100000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            SimilarityCritic(critic_field="search", weight=0.2),
            NumericCritic(critic_field="limit", weight=0.4, value_range=(90000, 110000)),
        ],
    )

    return suite

View file

@ -0,0 +1,17 @@
[tool.poetry]
name = "arcade_web"
version = "0.1.0"
description = "LLM tools for web-related tasks"
authors = ["Arcade AI <dev@arcade-ai.com>"]
[tool.poetry.dependencies]
python = "^3.10"
arcade-ai = "^0.1.0"
firecrawl-py = "^1.3.1"
[tool.poetry.dev-dependencies]
pytest = "^8.3.0"
# The test suite marks its coroutine tests with @pytest.mark.asyncio, which
# is provided by the pytest-asyncio plugin.
pytest-asyncio = "^0.24.0"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View file

View file

@ -0,0 +1,97 @@
from unittest.mock import AsyncMock, patch
import pytest
from arcade_web.tools.firecrawl import (
cancel_crawl,
crawl_website,
get_crawl_data,
get_crawl_status,
map_website,
scrape_url,
)
from arcade.sdk.error import ToolExecutionError
@pytest.fixture
def mock_context():
    """Provide an ``AsyncMock`` context whose authorization token is preset."""
    fake_context = AsyncMock()
    fake_context.authorization.token = "mock_token"  # noqa: S105
    return fake_context
@pytest.fixture
def mock_firecrawl_app():
    """Patch ``FirecrawlApp`` in the tools module and yield the mock instance.

    The yielded object is what ``FirecrawlApp(...)`` returns inside the tools
    while the patch is active, so tests configure SDK methods on it directly.
    """
    with patch("arcade_web.tools.firecrawl.FirecrawlApp") as firecrawl_cls:
        yield firecrawl_cls.return_value
@pytest.mark.asyncio
async def test_scrape_url_success(mock_firecrawl_app):
    """scrape_url returns the SDK's scrape response unchanged."""
    expected = {"data": "scraped content"}
    mock_firecrawl_app.scrape_url.return_value = {"data": "scraped content"}

    outcome = await scrape_url("http://example.com")

    assert outcome == expected
@pytest.mark.asyncio
async def test_crawl_website_success(mock_firecrawl_app):
    """An async crawl (the default) returns the job info without its ``url``."""
    # Include a "url" key so the test covers the branch that strips it.
    mock_firecrawl_app.async_crawl_url.return_value = {
        "id": "12345",
        "success": True,
        "url": "https://api.firecrawl.dev/v1/crawl/12345",
    }

    result = await crawl_website("http://example.com")

    assert result == {"id": "12345", "success": True}
@pytest.mark.asyncio
async def test_get_crawl_status_success(mock_firecrawl_app):
    """get_crawl_status returns the status fields and drops the crawl data."""
    # Include a "data" key so the test covers the code that removes it,
    # which is what separates this tool from get_crawl_data.
    mock_firecrawl_app.check_crawl_status.return_value = {
        "status": "completed",
        "data": [{"markdown": "page content"}],
    }

    result = await get_crawl_status("12345")

    assert result == {"status": "completed"}
@pytest.mark.asyncio
async def test_get_crawl_data_success(mock_firecrawl_app):
    """get_crawl_data returns the SDK's crawl-status response unchanged."""
    expected = {"data": "crawl data"}
    mock_firecrawl_app.check_crawl_status.return_value = {"data": "crawl data"}

    outcome = await get_crawl_data("12345")

    assert outcome == expected
@pytest.mark.asyncio
async def test_cancel_crawl_success(mock_firecrawl_app):
    """cancel_crawl returns the SDK's cancellation response unchanged."""
    expected = {"status": "cancelled"}
    mock_firecrawl_app.cancel_crawl.return_value = {"status": "cancelled"}

    outcome = await cancel_crawl("12345")

    assert outcome == expected
@pytest.mark.asyncio
async def test_map_website_success(mock_firecrawl_app):
    """map_website returns the SDK's map response unchanged."""
    expected = {"map": "website map"}
    mock_firecrawl_app.map_url.return_value = {"map": "website map"}

    outcome = await map_website("http://example.com")

    assert outcome == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "method,params,error_message",
    [
        (scrape_url, ("http://example.com",), "Error scraping URL"),
        (crawl_website, ("http://example.com",), "Error crawling website"),
        (get_crawl_status, ("12345",), "Error getting crawl status"),
        (get_crawl_data, ("12345",), "Error getting crawl data"),
        (cancel_crawl, ("12345",), "Error cancelling crawl"),
        (map_website, ("http://example.com",), "Error mapping website"),
    ],
)
async def test_firecrawl_error(mock_firecrawl_app, method, params, error_message):
    """Each tool raises ToolExecutionError when the Firecrawl SDK call fails."""
    # Every SDK method is made to fail, so whichever one the tool under test
    # calls raises; each gets its own exception instance.
    failing_sdk_methods = (
        "scrape_url",
        "async_crawl_url",
        "check_crawl_status",
        "cancel_crawl",
        "map_url",
    )
    for sdk_method in failing_sdk_methods:
        getattr(mock_firecrawl_app, sdk_method).side_effect = Exception(error_message)

    with pytest.raises(ToolExecutionError):
        await method(*params)