arcade-mcp/toolkits/web/arcade_web/tools/firecrawl.py
Nate Barbettini e9ee3bba40
fix: Use tool secrets in toolkits (#271)
~~Note: Don't merge until the correct secrets have been added to Arcade
Cloud.~~

Ready to merge, the feature is already on its way to prod.

---------

Co-authored-by: Eric Gustin <eric@arcade.dev>
2025-03-04 13:35:36 -08:00

188 lines
6.5 KiB
Python

from typing import Annotated, Any, Optional
from arcade.sdk import ToolContext, tool
from firecrawl import FirecrawlApp
from arcade_web.tools.models import Formats
# TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
# TODO: Support extract.
# TODO: Support headers param?
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def scrape_url(
    context: ToolContext,
    url: Annotated[str, "URL to scrape"],
    formats: Annotated[
        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
    ] = None,
    only_main_content: Annotated[
        Optional[bool],
        "Only return the main content of the page excluding headers, navs, footers, etc.",
    ] = True,
    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
    wait_for: Annotated[
        Optional[int],
        "Specify a delay in milliseconds before fetching the content, allowing the page "
        "sufficient time to load.",
    ] = 10,
    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
    """Scrape a URL using Firecrawl and return the data in specified formats."""
    # An omitted (or empty) format list falls back to markdown only.
    requested_formats = formats if formats else [Formats.MARKDOWN]

    # Option keys use the camelCase names handed to the Firecrawl client.
    # Omitted tag filters are sent as empty lists rather than None.
    scrape_options = {
        "formats": requested_formats,
        "onlyMainContent": only_main_content,
        "includeTags": include_tags if include_tags else [],
        "excludeTags": exclude_tags if exclude_tags else [],
        "waitFor": wait_for,
        "timeout": timeout,
    }

    # The API key is read from the tool's secrets on every call.
    client = FirecrawlApp(api_key=context.get_secret("FIRECRAWL_API_KEY"))

    # NOTE(review): the client call is not awaited; if it blocks on network I/O
    # it will hold the event loop for the duration of the scrape -- confirm.
    return dict(client.scrape_url(url, params=scrape_options))
# TODO: Support scrapeOptions.
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def crawl_website(
    context: ToolContext,
    url: Annotated[str, "URL to crawl"],
    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
    allow_backward_links: Annotated[
        bool,
        "Enable navigation to previously linked pages and enable crawling "
        "sublinks that are not children of the 'url' input parameter.",
    ] = False,
    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
    webhook: Annotated[
        Optional[str],
        "The URL to send a POST request to when the crawl is started, updated and completed.",
    ] = None,
    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
) -> Annotated[dict[str, Any], "Crawl status and data"]:
    """
    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
    If the crawl is synchronous, then returns the crawl data.
    """
    client = FirecrawlApp(api_key=context.get_secret("FIRECRAWL_API_KEY"))

    # Option keys use the camelCase names handed to the Firecrawl client.
    # Omitted path filters are sent as empty lists rather than None.
    crawl_options: dict[str, Any] = {
        "limit": limit,
        "excludePaths": exclude_paths if exclude_paths else [],
        "includePaths": include_paths if include_paths else [],
        "maxDepth": max_depth,
        "ignoreSitemap": ignore_sitemap,
        "allowBackwardLinks": allow_backward_links,
        "allowExternalLinks": allow_external_links,
    }
    # The webhook key is only sent when a non-empty URL was supplied.
    if webhook:
        crawl_options["webhook"] = webhook

    if not async_crawl:
        # Synchronous crawl: hand back whatever the client returns.
        return dict(client.crawl_url(url, params=crawl_options))

    # Asynchronous crawl: the job has only been started at this point.
    job = dict(client.async_crawl_url(url, params=crawl_options))
    # Url isn't clickable, so removing it since only the ID is needed to check status
    job.pop("url", None)
    return job
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def get_crawl_status(
    context: ToolContext,
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl status information"]:
    """
    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    client = FirecrawlApp(api_key=context.get_secret("FIRECRAWL_API_KEY"))

    status = dict(client.check_crawl_status(crawl_id))
    # Only the status fields are returned; the "data" entry is dropped when present.
    status.pop("data", None)
    return status
# TODO: Support responses greater than 10 MB. If the response is greater than 10 MB,
# then the Firecrawl API response will have a next_url field.
# The secret must be declared here because the body reads it through
# context.get_secret; the other Firecrawl tools in this file declare it the
# same way.
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def get_crawl_data(
    context: ToolContext,
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl data information"]:
    """
    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    api_key = context.get_secret("FIRECRAWL_API_KEY")
    app = FirecrawlApp(api_key=api_key)

    # Uses the same status call as get_crawl_status, but returns the response
    # in full instead of dropping its "data" entry.
    crawl_data = app.check_crawl_status(crawl_id)
    return dict(crawl_data)
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def cancel_crawl(
    context: ToolContext,
    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
) -> Annotated[dict[str, Any], "Cancellation status information"]:
    """
    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
    """
    # The API key is read from the tool's secrets on every call.
    client = FirecrawlApp(api_key=context.get_secret("FIRECRAWL_API_KEY"))

    # The client's response is returned as a plain dict.
    return dict(client.cancel_crawl(crawl_id))
@tool(requires_secrets=["FIRECRAWL_API_KEY"])
async def map_website(
    context: ToolContext,
    url: Annotated[str, "The base URL to start crawling from"],
    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
    limit: Annotated[int, "Maximum number of links to return"] = 5000,
) -> Annotated[dict[str, Any], "Website map data"]:
    """
    Map a website from a single URL to a map of the entire website.
    """
    client = FirecrawlApp(api_key=context.get_secret("FIRECRAWL_API_KEY"))

    # The search key is only sent when a non-empty query was supplied.
    search_filter: dict[str, Any] = {"search": search} if search else {}

    # Option keys use the camelCase names handed to the Firecrawl client.
    map_options: dict[str, Any] = {
        "ignoreSitemap": ignore_sitemap,
        "includeSubdomains": include_subdomains,
        "limit": limit,
        **search_filter,
    }

    return dict(client.map_url(url, params=map_options))