Add Firecrawl Tools For The New `arcade_web` Toolkit (#110)
# PR Description This PR adds 6 new tools inside the new `arcade_web` toolkit. None of these tools require auth. They do, however, require the `FIRECRAWL_API_KEY` API Key to be set. The new tools implement the [Firecrawl](https://www.firecrawl.dev/) APIs `/scrape (POST)`, `/crawl (POST)`, `/crawl/{id} (GET)`, `/crawl/{id} (DELETE)`, and `/map (POST)`. The six tools are: * `Web.ScrapeUrl`: - In the future I would like this tool to support actions (clicking, scrolling, screenshotting, etc) and extract (specify what you want to scrape) parameters. Firecrawl supports both of these parameters. * `Web.CrawlWebsite`: - If `async_crawl` is true, then the tool just returns the id of the crawl job, which you can retrieve later with the `Web.GetCrawlData` tool. If `async_crawl` is false, then the entire contents of the crawl are returned. * `Web.GetCrawlStatus` - Works for in progress or recently finished crawl jobs (Firecrawl's limitation) * `Web.GetCrawlData` - Works for in progress or recently finished crawl jobs (Firecrawl's limitation) * `Web.CancelCrawl` - You can cancel an in progress async crawl job * `Web.MapWebsite` - This endpoint is in alpha, but it can give you all of the links of an entire website, or optionally, you can specify in natural language what type of links you want to map by using the `search` parameter. For example "only map webpages that are about AI"
This commit is contained in:
parent
1c6e3f4495
commit
cc2a08ec34
9 changed files with 557 additions and 0 deletions
0
toolkits/web/arcade_web/__init__.py
Normal file
0
toolkits/web/arcade_web/__init__.py
Normal file
0
toolkits/web/arcade_web/tools/__init__.py
Normal file
0
toolkits/web/arcade_web/tools/__init__.py
Normal file
180
toolkits/web/arcade_web/tools/firecrawl.py
Normal file
180
toolkits/web/arcade_web/tools/firecrawl.py
Normal file
|
|
@ -0,0 +1,180 @@
|
||||||
|
import asyncio
from typing import Annotated, Any, Optional

from firecrawl import FirecrawlApp

from arcade.sdk import tool
from arcade_web.tools.models import Formats
from arcade_web.tools.utils import get_secret
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
|
||||||
|
# TODO: Support extract.
|
||||||
|
# TODO: Support headers param?
|
||||||
|
@tool
async def scrape_url(
    url: Annotated[str, "URL to scrape"],
    formats: Annotated[
        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
    ] = None,
    only_main_content: Annotated[
        Optional[bool],
        "Only return the main content of the page excluding headers, navs, footers, etc.",
    ] = True,
    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
    wait_for: Annotated[
        Optional[int],
        "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
    ] = 10,
    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
    """Scrape a URL using Firecrawl and return the data in specified formats."""
    # The API key is read from the environment on every call.
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Keys use the camelCase names expected by the Firecrawl scrape endpoint.
    params = {
        "formats": formats or [Formats.MARKDOWN],
        "onlyMainContent": only_main_content,
        "includeTags": include_tags or [],
        "excludeTags": exclude_tags or [],
        "waitFor": wait_for,
        "timeout": timeout,
    }

    # The Firecrawl client call is synchronous; run it in a worker thread so
    # this coroutine does not block the event loop while the request is in flight.
    return await asyncio.to_thread(app.scrape_url, url, params=params)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Support scrapeOptions.
|
||||||
|
@tool
async def crawl_website(
    url: Annotated[str, "URL to crawl"],
    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
    allow_backward_links: Annotated[
        bool,
        "Enable navigation to previously linked pages and enable crawling sublinks that are not children of the 'url' input parameter.",
    ] = False,
    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
    webhook: Annotated[
        Optional[str],
        "The URL to send a POST request to when the crawl is started, updated and completed.",
    ] = None,
    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
) -> Annotated[dict[str, Any], "Crawl status and data"]:
    """
    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
    If the crawl is synchronous, then returns the crawl data.
    """
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Keys use the camelCase names expected by the Firecrawl crawl endpoint.
    params = {
        "limit": limit,
        "excludePaths": exclude_paths or [],
        "includePaths": include_paths or [],
        "maxDepth": max_depth,
        "ignoreSitemap": ignore_sitemap,
        "allowBackwardLinks": allow_backward_links,
        "allowExternalLinks": allow_external_links,
    }
    # The webhook is only sent when the caller supplied one.
    if webhook:
        params["webhook"] = webhook

    # The Firecrawl client calls are synchronous. They run in a worker thread so
    # the event loop stays responsive, which matters most for the non-async
    # crawl because it does not return until the crawl data is available.
    if not async_crawl:
        return await asyncio.to_thread(app.crawl_url, url, params=params)

    response = await asyncio.to_thread(app.async_crawl_url, url, params=params)
    # Url isn't clickable, so removing it since only the ID is needed to check status
    response.pop("url", None)
    return response
|
||||||
|
|
||||||
|
|
||||||
|
@tool
async def get_crawl_status(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl status information"]:
    """
    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # The client call is synchronous; run it off the event loop.
    crawl_status = await asyncio.to_thread(app.check_crawl_status, crawl_id)

    # This tool reports status only, so the crawled page data (if present) is
    # dropped from the response; get_crawl_data returns it instead.
    crawl_status.pop("data", None)
    return crawl_status
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Support responses greater than 10 MB. If the response is greater than 10 MB, then the Firecrawl API response will have a next_url field.
|
||||||
|
@tool
async def get_crawl_data(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl data information"]:
    """
    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
    """
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Same client call as get_crawl_status, but the response is returned in
    # full. The call is synchronous, so it runs in a worker thread to avoid
    # blocking the event loop.
    return await asyncio.to_thread(app.check_crawl_status, crawl_id)
|
||||||
|
|
||||||
|
|
||||||
|
@tool
async def cancel_crawl(
    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
) -> Annotated[dict[str, Any], "Cancellation status information"]:
    """
    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
    """
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # The client call is synchronous; run it in a worker thread so the event
    # loop is not blocked while the request is in flight.
    return await asyncio.to_thread(app.cancel_crawl, crawl_id)
|
||||||
|
|
||||||
|
|
||||||
|
@tool
async def map_website(
    url: Annotated[str, "The base URL to start crawling from"],
    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
    limit: Annotated[int, "Maximum number of links to return"] = 5000,
) -> Annotated[dict[str, Any], "Website map data"]:
    """
    Map a website from a single URL to a map of the entire website.
    """
    app = FirecrawlApp(api_key=get_secret("FIRECRAWL_API_KEY"))

    # Keys use the camelCase names expected by the Firecrawl map endpoint.
    params = {
        "ignoreSitemap": ignore_sitemap,
        "includeSubdomains": include_subdomains,
        "limit": limit,
    }
    # The search filter is only sent when the caller supplied one.
    if search:
        params["search"] = search

    # The client call is synchronous; run it in a worker thread so the event
    # loop is not blocked while the request is in flight.
    return await asyncio.to_thread(app.map_url, url, params=params)
|
||||||
11
toolkits/web/arcade_web/tools/models.py
Normal file
11
toolkits/web/arcade_web/tools/models.py
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
# Models and enums for firecrawl web tools
|
||||||
|
class Formats(str, Enum):
    """Output formats that the scrape tool can request from Firecrawl.

    Members also subclass ``str``, so each one compares equal to its raw
    string value.
    """

    # Text renderings of the page
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"

    # Links found on the page
    LINKS = "links"

    # Screenshot variants
    SCREENSHOT = "screenshot"
    SCREENSHOT_AT_FULL_PAGE = "screenshot@fullPage"
|
||||||
9
toolkits/web/arcade_web/tools/utils.py
Normal file
9
toolkits/web/arcade_web/tools/utils.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
import os
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def get_secret(name: str, default: Optional[Any] = None) -> Any:
    """Look up a secret in the process environment.

    Args:
        name: Name of the environment variable holding the secret.
        default: Value to return when the variable is not set.

    Returns:
        The variable's value when it is set (even if empty), otherwise
        ``default`` (which is ``None`` unless the caller supplied one).
    """
    return os.getenv(name, default)
|
||||||
243
toolkits/web/evals/eval_firecrawl.py
Normal file
243
toolkits/web/evals/eval_firecrawl.py
Normal file
|
|
@ -0,0 +1,243 @@
|
||||||
|
import arcade_web
|
||||||
|
from arcade_web.tools.firecrawl import (
|
||||||
|
cancel_crawl,
|
||||||
|
crawl_website,
|
||||||
|
get_crawl_data,
|
||||||
|
get_crawl_status,
|
||||||
|
map_website,
|
||||||
|
scrape_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
from arcade.core.catalog import ToolCatalog
|
||||||
|
from arcade.sdk.eval import (
|
||||||
|
BinaryCritic,
|
||||||
|
EvalRubric,
|
||||||
|
EvalSuite,
|
||||||
|
NumericCritic,
|
||||||
|
tool_eval,
|
||||||
|
)
|
||||||
|
from arcade.sdk.eval.critic import SimilarityCritic
|
||||||
|
|
||||||
|
# Evaluation rubric
|
||||||
|
# Score thresholds shared by every case in the suite
rubric = EvalRubric(fail_threshold=0.9, warn_threshold=0.95)

# Catalog that exposes the arcade_web toolkit (the Firecrawl tools) to the evaluation
catalog = ToolCatalog()
catalog.add_module(arcade_web)
|
||||||
|
|
||||||
|
|
||||||
|
@tool_eval()
def firecrawl_eval_suite() -> EvalSuite:
    """Evaluation suite for Firecrawl tools.

    Builds one case per tool: scrape, crawl, crawl status, crawl data,
    cancel crawl, and map. The status, data, and cancel cases are follow-ups
    to a previously started asynchronous crawl, so they share the same prior
    conversation and expect the crawl ID from that conversation.

    Returns:
        The populated EvalSuite.
    """
    suite = EvalSuite(
        name="Firecrawl Tools Evaluation Suite",
        system_message="You are an AI assistant that helps users interact with web scraping and crawling tools using the provided tools.",
        catalog=catalog,
        rubric=rubric,
    )

    # ID of the crawl job started in the shared conversation history below.
    crawl_id = "2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"

    def async_crawl_history() -> list[dict]:
        """Return a fresh copy of the conversation in which an async crawl was started."""
        return [
            {"role": "user", "content": "crawl asynchronously https://www.google.com"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [
                    {
                        "id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
                        "type": "function",
                        "function": {
                            "name": "Web_CrawlWebsite",
                            "arguments": '{"url":"https://www.google.com","async_crawl":true}',
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "content": '{"id":"2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b","success":true,"url":"https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b"}',
                "tool_call_id": "call_QklpRSDmHdvM3ZZfzOqCKWRN",
                "name": "Web_CrawlWebsite",
            },
            {
                "role": "assistant",
                "content": "The asynchronous web crawl request for [Google](https://www.google.com) has been successfully initiated. You can track the status or fetch the results using the following [link](https://api.firecrawl.dev/v1/crawl/2ee7ba77-4ba0-4a45-9e2f-1c9e9a56f29b).",
            },
        ]

    # Scrape URL
    suite.add_case(
        name="Scrape a URL",
        user_message="Scrape https://foobar.com/malicious/malware/that/will/harm/you in markdown format please. Wait for 10 seconds before fetching the content.",
        expected_tool_calls=[
            (
                scrape_url,
                {
                    # Must match the URL in the user message, otherwise the
                    # BinaryCritic on "url" can never succeed.
                    "url": "https://foobar.com/malicious/malware/that/will/harm/you",
                    "formats": ["markdown"],
                    # The user asks for 10 seconds; wait_for is in milliseconds.
                    "wait_for": 10000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            BinaryCritic(critic_field="formats", weight=0.4),
            NumericCritic(critic_field="wait_for", weight=0.2, value_range=(9000, 11000)),
        ],
    )

    # Crawl Website
    suite.add_case(
        name="Crawl a website",
        user_message="Crawl the website at https://wikipedia.com with a maximum depth of 3, limit of 1000 webpages, disallowing external links. Updates should be sent to http://example.com/crawl-updates. Oh and do it in the background. THanks",
        expected_tool_calls=[
            (
                crawl_website,
                {
                    "url": "https://wikipedia.com",
                    "max_depth": 3,
                    "limit": 1000,
                    "allow_external_links": False,
                    "webhook": "http://example.com/crawl-updates",
                    "async_crawl": True,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.2),
            BinaryCritic(critic_field="max_depth", weight=0.1),
            BinaryCritic(critic_field="limit", weight=0.1),
            BinaryCritic(critic_field="allow_external_links", weight=0.1),
            BinaryCritic(critic_field="webhook", weight=0.2),
            BinaryCritic(critic_field="async_crawl", weight=0.2),
        ],
    )

    # Get Crawl Status
    suite.add_case(
        name="Get crawl status",
        user_message="Check the status of my crawl",
        expected_tool_calls=[(get_crawl_status, {"crawl_id": crawl_id})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=async_crawl_history(),
    )

    # Get Crawl Data
    suite.add_case(
        name="Get crawl data",
        user_message="Ok looks like the crawl is done, can I get the result please?",
        expected_tool_calls=[(get_crawl_data, {"crawl_id": crawl_id})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=async_crawl_history(),
    )

    # Cancel Crawl
    suite.add_case(
        name="Cancel crawl",
        user_message="Actually cancel it.",
        expected_tool_calls=[(cancel_crawl, {"crawl_id": crawl_id})],
        critics=[
            BinaryCritic(critic_field="crawl_id", weight=1.0),
        ],
        additional_messages=async_crawl_history(),
    )

    # Map Website
    suite.add_case(
        name="Map a website",
        user_message="Map the website at https://wikipedia.com with a limit of 100000 links. Only the links that are about the topic of AI",
        expected_tool_calls=[
            (
                map_website,
                {
                    "url": "https://wikipedia.com",
                    "search": "AI",
                    "limit": 100000,
                },
            )
        ],
        critics=[
            BinaryCritic(critic_field="url", weight=0.4),
            SimilarityCritic(critic_field="search", weight=0.2),
            NumericCritic(critic_field="limit", weight=0.4, value_range=(90000, 110000)),
        ],
    )

    return suite
|
||||||
17
toolkits/web/pyproject.toml
Normal file
17
toolkits/web/pyproject.toml
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
[tool.poetry]
name = "arcade_web"
version = "0.1.0"
description = "LLM tools for web-related tasks"
authors = ["Arcade AI <dev@arcade-ai.com>"]

[tool.poetry.dependencies]
python = "^3.10"
arcade-ai = "^0.1.0"
firecrawl-py = "^1.3.1"

[tool.poetry.dev-dependencies]
pytest = "^8.3.0"
# Needed for the @pytest.mark.asyncio tests in tests/test_firecrawl.py
pytest-asyncio = "^0.24.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
|
||||||
0
toolkits/web/tests/__init__.py
Normal file
0
toolkits/web/tests/__init__.py
Normal file
97
toolkits/web/tests/test_firecrawl.py
Normal file
97
toolkits/web/tests/test_firecrawl.py
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from arcade_web.tools.firecrawl import (
|
||||||
|
cancel_crawl,
|
||||||
|
crawl_website,
|
||||||
|
get_crawl_data,
|
||||||
|
get_crawl_status,
|
||||||
|
map_website,
|
||||||
|
scrape_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
from arcade.sdk.error import ToolExecutionError
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_context():
    """Provide an AsyncMock tool context carrying a placeholder authorization token."""
    ctx = AsyncMock()
    ctx.authorization.token = "mock_token"  # noqa: S105
    return ctx
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_firecrawl_app():
    """Patch FirecrawlApp in the tools module and yield the mocked client instance.

    The yielded object is what ``FirecrawlApp(...)`` returns inside the tools,
    so tests configure return values and side effects on it directly.
    """
    with patch("arcade_web.tools.firecrawl.FirecrawlApp") as firecrawl_cls:
        yield firecrawl_cls.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_scrape_url_success(mock_firecrawl_app):
    """scrape_url returns the client's scrape response."""
    mock_firecrawl_app.scrape_url.return_value = {"data": "scraped content"}

    scraped = await scrape_url("http://example.com")

    assert scraped == {"data": "scraped content"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_crawl_website_success(mock_firecrawl_app):
    """crawl_website returns the client's response for the default (async) crawl."""
    # async_crawl defaults to True, so async_crawl_url is the client method used.
    mock_firecrawl_app.async_crawl_url.return_value = {"crawl_id": "12345"}

    crawl_response = await crawl_website("http://example.com")

    assert crawl_response == {"crawl_id": "12345"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_get_crawl_status_success(mock_firecrawl_app):
    """get_crawl_status returns the client's status response."""
    mock_firecrawl_app.check_crawl_status.return_value = {"status": "completed"}

    status = await get_crawl_status("12345")

    assert status == {"status": "completed"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_get_crawl_data_success(mock_firecrawl_app):
    """get_crawl_data returns the client's response, including its data."""
    mock_firecrawl_app.check_crawl_status.return_value = {"data": "crawl data"}

    crawl_data = await get_crawl_data("12345")

    assert crawl_data == {"data": "crawl data"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cancel_crawl_success(mock_firecrawl_app):
    """cancel_crawl returns the client's cancellation response."""
    mock_firecrawl_app.cancel_crawl.return_value = {"status": "cancelled"}

    cancellation = await cancel_crawl("12345")

    assert cancellation == {"status": "cancelled"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_map_website_success(mock_firecrawl_app):
    """map_website returns the client's map response."""
    mock_firecrawl_app.map_url.return_value = {"map": "website map"}

    site_map = await map_website("http://example.com")

    assert site_map == {"map": "website map"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "method,params,error_message",
    [
        (scrape_url, ("http://example.com",), "Error scraping URL"),
        (crawl_website, ("http://example.com",), "Error crawling website"),
        (get_crawl_status, ("12345",), "Error getting crawl status"),
        (get_crawl_data, ("12345",), "Error getting crawl data"),
        (cancel_crawl, ("12345",), "Error cancelling crawl"),
        (map_website, ("http://example.com",), "Error mapping website"),
    ],
)
async def test_firecrawl_error(mock_firecrawl_app, method, params, error_message):
    """Each tool raises ToolExecutionError when the Firecrawl client call fails."""
    # Make every client method used by the tools raise, so whichever one the
    # parametrized tool calls will fail.
    failing_client_methods = (
        "scrape_url",
        "async_crawl_url",
        "check_crawl_status",
        "cancel_crawl",
        "map_url",
    )
    for client_method in failing_client_methods:
        getattr(mock_firecrawl_app, client_method).side_effect = Exception(error_message)

    with pytest.raises(ToolExecutionError):
        await method(*params)
|
||||||
Loading…
Reference in a new issue