Add Firecrawl Tools For The New `arcade_web` Toolkit (#110)

# PR Description

This PR adds 6 new tools inside the new `arcade_web` toolkit. None of these tools require auth. They do, however, require the `FIRECRAWL_API_KEY` environment variable to be set.

The new tools implement the [Firecrawl](https://www.firecrawl.dev/) APIs `/scrape (POST)`, `/crawl (POST)`, `/crawl/{id} (GET)`, `/crawl/{id} (DELETE)`, and `/map (POST)`.

The six tools are:

* `Web.ScrapeUrl`
  - In the future I would like this tool to support the `actions` (clicking, scrolling, screenshotting, etc.) and `extract` (specify what you want to scrape) parameters. Firecrawl supports both of these parameters.
* `Web.CrawlWebsite`
  - If `async_crawl` is true, the tool returns only the ID of the crawl job; the crawl results can be retrieved later with the `Web.GetCrawlData` tool. If `async_crawl` is false, the entire contents of the crawl are returned.
* `Web.GetCrawlStatus`
  - Works for in-progress or recently finished crawl jobs (Firecrawl's limitation).
* `Web.GetCrawlData`
  - Works for in-progress or recently finished crawl jobs (Firecrawl's limitation).
* `Web.CancelCrawl`
  - Cancels an in-progress async crawl job.
* `Web.MapWebsite`
  - This endpoint is in alpha, but it can give you all of the links of an entire website. Optionally, you can specify in natural language which type of links you want to map by using the `search` parameter, for example "only map webpages that are about AI".
2024-10-17 16:10:53 -07:00 · 2024-10-17 16:10:53 -07:00 · cc2a08ec34
commit cc2a08ec34
parent 1c6e3f4495
9 changed files with 557 additions and 0 deletions
--- a/toolkits/web/arcade_web/__init__.py
+++ b/toolkits/web/arcade_web/__init__.py
--- a/toolkits/web/arcade_web/tools/__init__.py
+++ b/toolkits/web/arcade_web/tools/__init__.py
--- a/toolkits/web/arcade_web/tools/firecrawl.py
+++ b/toolkits/web/arcade_web/tools/firecrawl.py
@ -0,0 +1,180 @@
 from typing import Annotated, Any, Optional
 from firecrawl import FirecrawlApp
 from arcade.sdk import tool
 from arcade_web.tools.models import Formats
 from arcade_web.tools.utils import get_secret
 # TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
 # TODO: Support extract.
 # TODO: Support headers param?
@tool
async def scrape_url(
    url: Annotated[str, "URL to scrape"],
    formats: Annotated[
        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
    ] = None,
    only_main_content: Annotated[
        Optional[bool],
        "Only return the main content of the page excluding headers, navs, footers, etc.",
    ] = True,
    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
    wait_for: Annotated[
        Optional[int],
        "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
    ] = 10,
    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
    """Scrape a URL using Firecrawl and return the data in specified formats."""
    # A missing or empty `formats` falls back to markdown only.
    requested_formats = formats if formats else [Formats.MARKDOWN]

    # Option keys are the camelCase names handed to FirecrawlApp.scrape_url;
    # absent tag filters are sent as empty lists rather than None.
    scrape_options = {
        "formats": requested_formats,
        "onlyMainContent": only_main_content,
        "includeTags": include_tags if include_tags else [],
        "excludeTags": exclude_tags if exclude_tags else [],
        "waitFor": wait_for,
        "timeout": timeout,
    }

    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    return client.scrape_url(url, params=scrape_options)
 # TODO: Support scrapeOptions.
@tool
async def crawl_website(
    url: Annotated[str, "URL to crawl"],
    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
    allow_backward_links: Annotated[
        bool,
        "Enable navigation to previously linked pages and enable crawling sublinks that are not children of the 'url' input parameter.",
    ] = False,
    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
    webhook: Annotated[
        Optional[str],
        "The URL to send a POST request to when the crawl is started, updated and completed.",
    ] = None,
    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
) -> Annotated[dict[str, Any], "Crawl status and data"]:
    """
    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
    If the crawl is synchronous, then returns the crawl data.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    crawl_options = {
        "limit": limit,
        "excludePaths": exclude_paths if exclude_paths else [],
        "includePaths": include_paths if include_paths else [],
        "maxDepth": max_depth,
        "ignoreSitemap": ignore_sitemap,
        "allowBackwardLinks": allow_backward_links,
        "allowExternalLinks": allow_external_links,
    }
    # The webhook key is only sent when a non-empty webhook URL was supplied.
    if webhook:
        crawl_options["webhook"] = webhook

    # Synchronous crawl: hand back whatever crawl_url returns, untouched.
    if not async_crawl:
        return client.crawl_url(url, params=crawl_options)

    job = client.async_crawl_url(url, params=crawl_options)
    # Url isn't clickable, so removing it since only the ID is needed to check status
    job.pop("url", None)
    return job
@tool
async def get_crawl_status(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl status information"]:
    """
    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    status = client.check_crawl_status(crawl_id)
    # Only the status fields are returned; any "data" entry is dropped from the response.
    status.pop("data", None)
    return status
 # TODO: Support responses greater than 10 MB. If the response is greater than 10 MB, then the Firecrawl API response will have a next_url field.
@tool
async def get_crawl_data(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl data information"]:
    """
    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    # Same SDK call as the status tool, but the full response is returned unmodified.
    return client.check_crawl_status(crawl_id)
@tool
async def cancel_crawl(
    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
) -> Annotated[dict[str, Any], "Cancellation status information"]:
    """
    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
    """
    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    # The SDK's cancellation response is passed straight back to the caller.
    return client.cancel_crawl(crawl_id)
@tool
async def map_website(
    url: Annotated[str, "The base URL to start crawling from"],
    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
    limit: Annotated[int, "Maximum number of links to return"] = 5000,
) -> Annotated[dict[str, Any], "Website map data"]:
    """
    Map a website from a single URL to a map of the entire website.
    """
    map_options = {
        "ignoreSitemap": ignore_sitemap,
        "includeSubdomains": include_subdomains,
        "limit": limit,
    }
    # The search key is only sent when a non-empty query was supplied.
    if search:
        map_options["search"] = search

    client = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))
    return client.map_url(url, params=map_options)
--- a/toolkits/web/arcade_web/tools/models.py
+++ b/toolkits/web/arcade_web/tools/models.py
@ -0,0 +1,11 @@
 from enum import Enum
 # Models and enums for firecrawl web tools
 class Formats(str, Enum):
    """Output formats that can be requested from the `scrape_url` tool.

    Members subclass `str`, so each one compares equal to its string value.
    The tool passes the selected members under the "formats" key of the
    scrape parameters.
    """

    MARKDOWN = "markdown"  # Used as the default when no formats are requested
    HTML = "html"
    RAW_HTML = "rawHtml"  # Value is camelCase, unlike the member name
    LINKS = "links"
    SCREENSHOT = "screenshot"
    # NOTE(review): presumably a full-page screenshot variant; confirm against the Firecrawl docs
    SCREENSHOT_AT_FULL_PAGE = "screenshot@fullPage"
--- a/toolkits/web/arcade_web/tools/utils.py
+++ b/toolkits/web/arcade_web/tools/utils.py
@ -0,0 +1,9 @@
 import os
 from typing import Any, Optional
 def get_secret(name: str, default: Optional[Any] = None) -> Any:
    """Return the value of the environment variable `name`.

    When the variable is unset, `default` is returned instead (which is
    `None` unless the caller supplies one). A variable that is set to an
    empty string is returned as-is.
    """
    return os.environ.get(name, default)
--- a/toolkits/web/evals/eval_firecrawl.py
+++ b/toolkits/web/evals/eval_firecrawl.py
@ -0,0 +1,243 @@
 import arcade_web
 from arcade_web.tools.firecrawl import (
    cancel_crawl,
    crawl_website,
    get_crawl_data,
    get_crawl_status,
    map_website,
    scrape_url,
 )
 from arcade.core.catalog import ToolCatalog
 from arcade.sdk.eval import (
    BinaryCritic,
    EvalRubric,
    EvalSuite,
    NumericCritic,
    tool_eval,
 )
 from arcade.sdk.eval.critic import SimilarityCritic
 # Evaluation rubric
 # Tool catalog exposing every tool in the arcade_web package to the eval suite
 catalog = ToolCatalog()
 catalog.add_module(arcade_web)
 # Evaluation rubric shared by all cases in this module
 rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)
# Crawl job ID reported by the canned conversation below; the follow-up cases
# (status, data, cancel) expect the model to reuse exactly this ID.
_CRAWL_JOB_ID = "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"


def _async_crawl_history() -> list[dict]:
    """Return a fresh copy of the prior conversation in which an async crawl was started.

    A new list is built on every call so that cases never share (and cannot
    accidentally mutate) the same message objects.
    """
    return [
        {"role": "user", "content": "crawl asynchronously https://www.google.com"},
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
                    "type": "function",
                    "function": {
                        "name": "Web_CrawlWebsite",
                        "arguments": '{"url":"https://www.google.com","async_crawl":true}',
                    },
                }
            ],
        },
        {
            "role": "tool",
            "content": '{"id":"2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b","success":true,"url":"https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"}',
            "tool_call_id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
            "name": "Web_CrawlWebsite",
        },
        {
            "role": "assistant",
            "content": "The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link](https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b).",
        },
    ]


@tool_eval()
def firecrawl_eval_suite() -> EvalSuite:
    """Evaluation suite for Firecrawl tools."""
    suite = EvalSuite(
        name="Firecrawl Tools Evaluation Suite",
        system_message="You are an AI assistant that helps users interact with web scraping and crawling tools using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # Scrape URL
    # The expected URL must be the one given in the user message, otherwise
    # the BinaryCritic on "url" can never pass. "10 seconds" maps to 10000 ms.
    suite.add_case(
        name="Scrape a URL",
        user_message="Scrape https://foobar.com/malicious/malware/that/will/harm/you in markdown format please. Wait for 10 seconds before fetching the content.",
        expected_tool_calls=[
            (
                scrape_url,
                {
                    "url": "https://foobar.com/malicious/malware/that/will/harm/you",
                    "formats": ["markdown"],
                    "wait_for": 10000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            BinaryCritic(critic_field="formats", weight=0.4),
            NumericCritic(critic_field="wait_for", weight=0.2, value_range=(9000, 11000)),
        ],
    )

    # Crawl Website
    suite.add_case(
        name="Crawl a website",
        user_message="Crawl the website at https://wikipedia.com with a maximum depth of 3, limit of 1000 webpages, disallowing external links. Updates should be sent to http://example.com/crawl-updates. Oh and do it in the background. THanks",
        expected_tool_calls=[
            (
                crawl_website,
                {
                    "url": "https://wikipedia.com",
                    "max_depth": 3,
                    "limit": 1000,
                    "allow_external_links": False,
                    "webhook": "http://example.com/crawl-updates",
                    "async_crawl": True,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.2),
            BinaryCritic(critic_field="max_depth", weight=0.1),
            BinaryCritic(critic_field="limit", weight=0.1),
            BinaryCritic(critic_field="allow_external_links", weight=0.1),
            BinaryCritic(critic_field="webhook", weight=0.2),
            BinaryCritic(critic_field="async_crawl", weight=0.2),
        ],
    )

    # Get Crawl Status
    suite.add_case(
        name="Get crawl status",
        user_message="Check the status of my crawl",
        expected_tool_calls=[(get_crawl_status, {"crawl_id": _CRAWL_JOB_ID})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Get Crawl Data
    suite.add_case(
        name="Get crawl data",
        user_message="Ok looks like the crawl is done, can I get the result please?",
        expected_tool_calls=[(get_crawl_data, {"crawl_id": _CRAWL_JOB_ID})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Cancel Crawl
    suite.add_case(
        name="Cancel crawl",
        user_message="Actually cancel it.",
        expected_tool_calls=[(cancel_crawl, {"crawl_id": _CRAWL_JOB_ID})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=_async_crawl_history(),
    )

    # Map Website
    suite.add_case(
        name="Map a website",
        user_message="Map the website at https://wikipedia.com with a limit of 100000 links. Only the links that are about the topic of AI",
        expected_tool_calls=[
            (
                map_website,
                {
                    "url": "https://wikipedia.com",
                    "search": "AI",
                    "limit": 100000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            SimilarityCritic(critic_field="search", weight=0.2),
            NumericCritic(critic_field="limit", weight=0.4, value_range=(90000, 110000)),
        ],
    )

    return suite
--- a/toolkits/web/pyproject.toml
+++ b/toolkits/web/pyproject.toml
@ -0,0 +1,17 @@
 [tool.poetry]
 name = "arcade_web"
 version = "0.1.0"
 description = "LLM tools for web-related tasks"
 authors = ["Arcade AI <dev@arcade-ai.com>"]
 [tool.poetry.dependencies]
 python = "^3.10"
 arcade-ai = "^0.1.0"
 firecrawl-py = "^1.3.1"
 [tool.poetry.dev-dependencies]
 pytest = "^8.3.0"
 # Needed by the @pytest.mark.asyncio tests in tests/test_firecrawl.py
 pytest-asyncio = "^0.24.0"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
--- a/toolkits/web/tests/__init__.py
+++ b/toolkits/web/tests/__init__.py
--- a/toolkits/web/tests/test_firecrawl.py
+++ b/toolkits/web/tests/test_firecrawl.py
@ -0,0 +1,97 @@
 from unittest.mock import AsyncMock, patch
 import pytest
 from arcade_web.tools.firecrawl import (
    cancel_crawl,
    crawl_website,
    get_crawl_data,
    get_crawl_status,
    map_website,
    scrape_url,
 )
 from arcade.sdk.error import ToolExecutionError
@pytest.fixture
def mock_context():
    """Provide an AsyncMock tool context whose authorization token is a dummy value."""
    ctx = AsyncMock()
    ctx.authorization.token = "mock_token"  # noqa: S105
    return ctx
@pytest.fixture
def mock_firecrawl_app():
    """Patch FirecrawlApp in the tools module and yield the mocked client instance.

    The yielded object is the mock class's `return_value`, i.e. what the tools
    get back when they construct `FirecrawlApp(...)`. The patch is undone when
    the test finishes.
    """
    with patch("arcade_web.tools.firecrawl.FirecrawlApp") as firecrawl_cls:
        yield firecrawl_cls.return_value
@pytest.mark.asyncio
async def test_scrape_url_success(mock_firecrawl_app):
    """scrape_url returns the mocked client's scrape response unchanged."""
    mock_firecrawl_app.scrape_url.return_value = {"data": "scraped content"}

    scraped = await scrape_url("http://example.com")

    assert scraped == {"data": "scraped content"}
@pytest.mark.asyncio
async def test_crawl_website_success(mock_firecrawl_app):
    """crawl_website defaults to an async crawl and strips the job URL from the response."""
    # Include a "url" entry so the test exercises the removal done in the async branch.
    mock_firecrawl_app.async_crawl_url.return_value = {
        "crawl_id": "12345",
        "url": "https://api.firecrawl.dev/v1/crawl/12345",
    }

    result = await crawl_website("http://example.com")

    assert result == {"crawl_id": "12345"}
    # The default (async_crawl=True) must not fall through to the synchronous crawl.
    mock_firecrawl_app.crawl_url.assert_not_called()
@pytest.mark.asyncio
async def test_get_crawl_status_success(mock_firecrawl_app):
    """get_crawl_status returns the status fields and drops the crawl payload."""
    # Include a "data" entry so the test exercises the stripping done by the tool.
    mock_firecrawl_app.check_crawl_status.return_value = {
        "status": "completed",
        "data": [{"markdown": "page content"}],
    }

    result = await get_crawl_status("12345")

    assert result == {"status": "completed"}
@pytest.mark.asyncio
async def test_get_crawl_data_success(mock_firecrawl_app):
    """get_crawl_data returns the mocked status response, including its data."""
    mock_firecrawl_app.check_crawl_status.return_value = {"data": "crawl data"}

    crawl_data = await get_crawl_data("12345")

    assert crawl_data == {"data": "crawl data"}
@pytest.mark.asyncio
async def test_cancel_crawl_success(mock_firecrawl_app):
    """cancel_crawl returns the mocked client's cancellation response unchanged."""
    mock_firecrawl_app.cancel_crawl.return_value = {"status": "cancelled"}

    cancellation = await cancel_crawl("12345")

    assert cancellation == {"status": "cancelled"}
@pytest.mark.asyncio
async def test_map_website_success(mock_firecrawl_app):
    """map_website returns the mocked client's map response unchanged."""
    mock_firecrawl_app.map_url.return_value = {"map": "website map"}

    site_map = await map_website("http://example.com")

    assert site_map == {"map": "website map"}
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "method,params,error_message",
    [
        (scrape_url, ("http://example.com",), "Error scraping URL"),
        (crawl_website, ("http://example.com",), "Error crawling website"),
        (get_crawl_status, ("12345",), "Error getting crawl status"),
        (get_crawl_data, ("12345",), "Error getting crawl data"),
        (cancel_crawl, ("12345",), "Error cancelling crawl"),
        (map_website, ("http://example.com",), "Error mapping website"),
    ],
)
async def test_firecrawl_error(mock_firecrawl_app, method, params, error_message):
    """Every tool surfaces a failure of the mocked client as a ToolExecutionError."""
    # Make each client method the tools call raise, so whichever tool is under
    # test hits a failure regardless of which method it uses.
    failing_client_methods = (
        "scrape_url",
        "async_crawl_url",
        "check_crawl_status",
        "cancel_crawl",
        "map_url",
    )
    for client_method in failing_client_methods:
        getattr(mock_firecrawl_app, client_method).side_effect = Exception(error_message)

    with pytest.raises(ToolExecutionError):
        await method(*params)