Updating Brightdata community pkg (#628)

Updating BrightData 
- Updating pyproject.toml
- Fix linting issues (related to the repo configs)
- Rename package from brightdata -> arcade-brightdata (will also be
used by PyPI)
- Added to toolkits.txt so it can be deployed


Extra:
- Arcade new templates did not have the extra line at the end, so it has
been added.

---------

Co-authored-by: Francisco Liberal <francisco@arcade.dev>
This commit is contained in:
jottakka 2025-10-17 18:18:00 -03:00 committed by GitHub
parent c22f9e302b
commit 75fc298681
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 165 additions and 64 deletions

View file

@ -1,5 +1,6 @@
arcade-airtable-api
arcade-box-api
arcade-brightdata
arcade-calendly-api
arcade-cursor-agents-api
arcade-figma-api

View file

@ -37,4 +37,4 @@
## Development
Read the docs on how to create a toolkit [here](https://docs.arcade.dev/home/build-tools/create-a-toolkit)
Read the docs on how to create a toolkit [here](https://docs.arcade.dev/home/build-tools/create-a-toolkit)

View file

@ -48,4 +48,3 @@ def {{ toolkit_name }}_eval_suite() -> EvalSuite:
)
return suite

View file

@ -75,4 +75,3 @@ skip_empty = true
[tool.hatch.build.targets.wheel]
packages = [ "{{ package_name }}",]

View file

@ -11,4 +11,3 @@ def test_hello() -> None:
def test_hello_raises_error() -> None:
with pytest.raises(ToolExecutionError):
say_hello(1)

View file

@ -0,0 +1,18 @@
files: ^arcade_brightdata/.*
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: "v4.4.0"
hooks:
- id: check-case-conflict
- id: check-merge-conflict
- id: check-toml
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.7
hooks:
- id: ruff
args: [--fix]
- id: ruff-format

View file

@ -0,0 +1,44 @@
target-version = "py310"
line-length = 100
fix = true
[lint]
select = [
# flake8-2020
"YTT",
# flake8-bandit
"S",
# flake8-bugbear
"B",
# flake8-builtins
"A",
# flake8-comprehensions
"C4",
# flake8-debugger
"T10",
# flake8-simplify
"SIM",
# isort
"I",
# mccabe
"C90",
# pycodestyle
"E", "W",
# pyflakes
"F",
# pygrep-hooks
"PGH",
# pyupgrade
"UP",
# ruff
"RUF",
# tryceratops
"TRY",
]
[lint.per-file-ignores]
"**/tests/*" = ["S101"]
[format]
preview = true
skip-magic-trailing-comma = false

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025, Arcade AI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,3 @@
from arcade_brightdata.tools import scrape_as_markdown, search_engine, web_data_feed
__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

View file

@ -53,10 +53,9 @@ class BrightDataClient:
self.endpoint, headers=self.headers, data=json.dumps(payload), timeout=30
)
if response.status_code != 200:
raise Exception(f"Failed to scrape: {response.status_code} - {response.text}") # noqa: TRY002
return response.text
response.raise_for_status()
result: str = response.text
return result
@staticmethod
def encode_query(query: str) -> str:

View file

@ -0,0 +1,7 @@
from arcade_brightdata.tools.bright_data_tools import (
scrape_as_markdown,
search_engine,
web_data_feed,
)
__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

View file

@ -1,13 +1,13 @@
import json
import time
from enum import Enum
from typing import Annotated, Any, Optional, cast
from typing import Annotated, Any, cast
import requests
from arcade_core.errors import RetryableToolError
from arcade_tdk import ToolContext, tool
from ..bright_data_client import BrightDataClient
from arcade_brightdata.bright_data_client import BrightDataClient
class DeviceType(str, Enum):
@ -76,13 +76,13 @@ def search_engine( # noqa: C901
context: ToolContext,
query: Annotated[str, "Search query"],
engine: Annotated[SearchEngine, "Search engine to use"] = SearchEngine.GOOGLE,
language: Annotated[Optional[str], "Two-letter language code"] = None,
country_code: Annotated[Optional[str], "Two-letter country code"] = None,
search_type: Annotated[Optional[SearchType], "Type of search"] = None,
start: Annotated[Optional[int], "Results pagination offset"] = None,
language: Annotated[str | None, "Two-letter language code"] = None,
country_code: Annotated[str | None, "Two-letter country code"] = None,
search_type: Annotated[SearchType | None, "Type of search"] = None,
start: Annotated[int | None, "Results pagination offset"] = None,
num_results: Annotated[int, "Number of results to return. The default is 10"] = 10,
location: Annotated[Optional[str], "Location for search results"] = None,
device: Annotated[Optional[DeviceType], "Device type"] = None,
location: Annotated[str | None, "Location for search results"] = None,
device: Annotated[DeviceType | None, "Device type"] = None,
return_json: Annotated[bool, "Return JSON instead of Markdown"] = False,
) -> Annotated[str, "Search results as Markdown or JSON"]:
"""
@ -173,8 +173,11 @@ def web_data_feed(
source_type: Annotated[SourceType, "Type of data source"],
url: Annotated[str, "URL of the web resource to extract data from"],
num_of_reviews: Annotated[
Optional[int],
"Number of reviews to retrieve. Only applicable for facebook_company_reviews. Default is None",
int | None,
(
"Number of reviews to retrieve. Only applicable for "
"facebook_company_reviews. Default is None"
),
] = None,
timeout: Annotated[int, "Maximum time in seconds to wait for data retrieval"] = 600,
polling_interval: Annotated[int, "Time in seconds between polling attempts"] = 1,
@ -194,17 +197,26 @@ def web_data_feed(
- youtube_videos
Examples:
web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW") -> "{\"title\": \"Product Name\", ...}"
web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe") -> "{\"name\": \"John Doe\", ...}"
web_data_feed("facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50) -> "[{\"review\": \"...\", ...}]"
web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW")
-> "{\"title\": \"Product Name\", ...}"
web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe")
-> "{\"name\": \"John Doe\", ...}"
web_data_feed(
"facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50
) -> "[{\"review\": \"...\", ...}]"
"""
api_key = context.get_secret("BRIGHTDATA_API_KEY")
client = BrightDataClient.create_client(api_key=api_key)
if num_of_reviews is not None and source_type != SourceType.FACEBOOK_COMPANY_REVIEWS:
raise RetryableToolError(
f"num_of_reviews parameter is only applicable for facebook_company_reviews, not for {source_type.value}",
additional_prompt_content="The num_of_reviews parameter should only be used with facebook_company_reviews source type.",
msg = (
f"num_of_reviews parameter is only applicable for facebook_company_reviews, "
f"not for {source_type.value}"
)
prompt = (
"The num_of_reviews parameter should only be used with "
"facebook_company_reviews source type."
)
raise RetryableToolError(msg, additional_prompt_content=prompt)
data = _extract_structured_data(
client=client,
source_type=source_type,
@ -220,7 +232,7 @@ def _extract_structured_data(
client: BrightDataClient,
source_type: SourceType,
url: str,
num_of_reviews: Optional[int] = None,
num_of_reviews: int | None = None,
timeout: int = 600,
polling_interval: int = 1,
) -> dict[str, Any]:
@ -262,10 +274,9 @@ def _extract_structured_data(
trigger_data = trigger_response.json()
if not trigger_data.get("snapshot_id"):
raise RetryableToolError(
"No snapshot ID returned from trigger request",
additional_prompt_content="Invalid input provided, use search_engine to get the relevant data first ",
)
msg = "No snapshot ID returned from trigger request"
prompt = "Invalid input provided, use search_engine to get the relevant data first"
raise RetryableToolError(msg, additional_prompt_content=prompt)
snapshot_id = trigger_data["snapshot_id"]
@ -297,4 +308,5 @@ def _extract_structured_data(
attempts += 1
time.sleep(polling_interval)
raise TimeoutError(f"Timeout after {max_attempts} seconds waiting for {source_type.value} data")
msg = f"Timeout after {max_attempts} seconds waiting for {source_type.value} data"
raise TimeoutError(msg)

View file

@ -1,3 +0,0 @@
from brightdata.tools import scrape_as_markdown, search_engine, web_data_feed
__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

View file

@ -1,3 +0,0 @@
from brightdata.tools.bright_data_tools import scrape_as_markdown, search_engine, web_data_feed
__all__ = ["scrape_as_markdown", "search_engine", "web_data_feed"]

View file

@ -3,20 +3,18 @@ requires = [ "hatchling",]
build-backend = "hatchling.build"
[project]
name = "brightdata"
version = "0.1.1"
name = "arcade_brightdata"
version = "0.2.0"
description = "Search, Crawl and Scrape any site, at scale, without getting blocked"
requires-python = ">=3.10"
dependencies = [
"arcade-tdk>=3.0.0,<4.0.0",
"requests>=2.32.5",
]
[[project.authors]]
name = "meirk-brd"
email = "meirk@brightdata.com"
[project.optional-dependencies]
dev = [
"arcade-mcp[all]>=1.2.0,<2.0.0",
@ -25,19 +23,18 @@ dev = [
"pytest-cov>=4.0.0,<4.1.0",
"pytest-mock>=3.11.1,<3.12.0",
"pytest-asyncio>=0.24.0,<0.25.0",
"types-requests>=2.32.0",
"mypy>=1.5.1,<1.6.0",
"pre-commit>=3.4.0,<3.5.0",
"tox>=4.11.1,<4.12.0",
"ruff>=0.7.4,<0.8.0",
"types-requests>=2.32.0",
]
# Tell Arcade.dev that this package is a toolkit
[project.entry-points.arcade_toolkits]
toolkit_name = "brightdata"
toolkit_name = "arcade_brightdata"
[tool.mypy]
files = [ "brightdata/**/*.py",]
files = [ "arcade_brightdata/**/*.py",]
python_version = "3.10"
disallow_untyped_defs = "True"
disallow_any_unimported = "True"
@ -48,6 +45,11 @@ warn_unused_ignores = "True"
show_error_codes = "True"
ignore_missing_imports = "True"
[tool.uv.sources]
arcade-mcp = { path = "../../", editable = true }
arcade-serve = { path = "../../libs/arcade-serve/", editable = true }
arcade-tdk = { path = "../../libs/arcade-tdk/", editable = true }
[tool.pytest.ini_options]
testpaths = [ "tests",]
@ -55,4 +57,4 @@ testpaths = [ "tests",]
skip_empty = true
[tool.hatch.build.targets.wheel]
packages = [ "brightdata",]
packages = [ "arcade_brightdata",]

View file

@ -2,10 +2,12 @@ from os import environ
from unittest.mock import Mock, patch
import pytest
import requests
from arcade_tdk import ToolContext, ToolSecretItem
from arcade_tdk.errors import ToolExecutionError
from brightdata.bright_data_client import BrightDataClient
from brightdata.tools.bright_data_tools import (
from arcade_brightdata.bright_data_client import BrightDataClient
from arcade_brightdata.tools.bright_data_tools import (
DeviceType,
SourceType,
scrape_as_markdown,
@ -79,18 +81,19 @@ class TestBrightDataClient:
mock_response = Mock()
mock_response.status_code = 400
mock_response.text = "Bad Request"
mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
"400 Client Error"
)
mock_post.return_value = mock_response
client = BrightDataClient("test_key", "test_zone")
with pytest.raises(Exception) as exc_info:
with pytest.raises(requests.exceptions.HTTPError):
client.make_request({"url": "https://example.com"})
assert "Failed to scrape: 400 - Bad Request" in str(exc_info.value)
class TestScrapeAsMarkdown:
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_scrape_as_markdown_success(self, mock_engine_class, mock_context):
mock_client = Mock()
mock_client.make_request.return_value = "# Test Page\n\nContent here"
@ -111,7 +114,7 @@ class TestScrapeAsMarkdown:
class TestSearchEngine:
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_search_engine_google_basic(self, mock_engine_class, mock_context):
mock_client = Mock()
mock_client.make_request.return_value = "# Search Results\n\nResult 1\nResult 2"
@ -125,7 +128,7 @@ class TestSearchEngine:
api_key=BRIGHTDATA_API_KEY, zone=BRIGHTDATA_ZONE
)
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_search_engine_bing(self, mock_engine_class, mock_context):
mock_client = Mock()
mock_client.make_request.return_value = "# Bing Results"
@ -143,7 +146,7 @@ class TestSearchEngine:
}
mock_client.make_request.assert_called_once_with(expected_payload)
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_search_engine_google_with_parameters(self, mock_engine_class, mock_context):
mock_client = Mock()
mock_client.make_request.return_value = "# Google Results with params"
@ -179,7 +182,7 @@ class TestSearchEngine:
with pytest.raises(ToolExecutionError):
search_engine(mock_context, "test query", engine="invalid_engine")
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_search_engine_google_jobs(self, mock_engine_class, mock_context):
mock_client = Mock()
mock_client.make_request.return_value = "# Job Results"
@ -194,8 +197,8 @@ class TestSearchEngine:
class TestWebDataFeed:
@patch("brightdata.tools.bright_data_tools._extract_structured_data")
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools._extract_structured_data")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_web_data_feed_success(self, mock_engine_class, mock_extract, mock_context):
mock_client = Mock()
mock_engine_class.create_client.return_value = mock_client
@ -216,8 +219,8 @@ class TestWebDataFeed:
polling_interval=1,
)
@patch("brightdata.tools.bright_data_tools._extract_structured_data")
@patch("brightdata.tools.bright_data_tools.BrightDataClient")
@patch("arcade_brightdata.tools.bright_data_tools._extract_structured_data")
@patch("arcade_brightdata.tools.bright_data_tools.BrightDataClient")
def test_web_data_feed_with_reviews(self, mock_engine_class, mock_extract, mock_context):
mock_client = Mock()
mock_engine_class.create_client.return_value = mock_client
@ -249,7 +252,7 @@ class TestExtractStructuredData:
@patch("requests.get")
@patch("requests.post")
def test_extract_structured_data_success(self, mock_post, mock_get):
from brightdata.tools.bright_data_tools import _extract_structured_data
from arcade_brightdata.tools.bright_data_tools import _extract_structured_data
client = BrightDataClient("test_key", "test_zone")
@ -282,7 +285,7 @@ class TestExtractStructuredData:
@patch("requests.get")
@patch("requests.post")
def test_extract_structured_data_with_polling(self, mock_post, mock_get):
from brightdata.tools.bright_data_tools import _extract_structured_data
from arcade_brightdata.tools.bright_data_tools import _extract_structured_data
client = BrightDataClient("test_key", "test_zone")
@ -311,7 +314,7 @@ class TestExtractStructuredData:
@patch("requests.post")
def test_extract_structured_data_invalid_source_type(self, mock_post):
from brightdata.tools.bright_data_tools import _extract_structured_data
from arcade_brightdata.tools.bright_data_tools import _extract_structured_data
client = BrightDataClient("test_key", "test_zone")
@ -327,7 +330,7 @@ class TestExtractStructuredData:
@patch("requests.get")
@patch("requests.post")
def test_extract_structured_data_no_snapshot_id(self, mock_post, mock_get):
from brightdata.tools.bright_data_tools import _extract_structured_data
from arcade_brightdata.tools.bright_data_tools import _extract_structured_data
client = BrightDataClient("test_key", "test_zone")
@ -349,7 +352,7 @@ class TestExtractStructuredData:
@patch("requests.post")
@patch("time.sleep")
def test_extract_structured_data_timeout(self, mock_sleep, mock_post, mock_get):
from brightdata.tools.bright_data_tools import _extract_structured_data
from arcade_brightdata.tools.bright_data_tools import _extract_structured_data
client = BrightDataClient("test_key", "test_zone")