.PHONY: help

# Print every target that carries a "## description" suffix, sorted by name.
help:
	@echo "🛠️ brightdata Commands:\n"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

.PHONY: install
install: ## Install the uv environment and install all packages with dependencies
	@echo "🚀 Creating virtual environment and installing all packages using uv"
	@uv sync --active --all-extras --no-sources
	@if [ -f .pre-commit-config.yaml ]; then uv run --no-sources pre-commit install; fi
	@echo "✅ All packages and dependencies installed via uv"

.PHONY: install-local
install-local: ## Install the uv environment and install all packages with dependencies with local Arcade sources
	@echo "🚀 Creating virtual environment and installing all packages using uv"
	@uv sync --active --all-extras
	@if [ -f .pre-commit-config.yaml ]; then uv run pre-commit install; fi
	@echo "✅ All packages and dependencies installed via uv"

.PHONY: build
build: clean-build ## Build wheel file using uv
	@echo "🚀 Creating wheel file"
	uv build

.PHONY: clean-build
clean-build: ## clean build artifacts
	@echo "🗑️ Cleaning dist directory"
	rm -rf dist

.PHONY: test
test: ## Test the code with pytest
	@echo "🚀 Testing code: Running pytest"
	@uv run --no-sources pytest -W ignore -v --cov --cov-config=pyproject.toml --cov-report=xml

.PHONY: coverage
coverage: ## Generate coverage report
	@echo "coverage report"
	@uv run --no-sources coverage report
	@echo "Generating coverage report"
	@uv run --no-sources coverage html

.PHONY: bump-version
bump-version: ## Bump the version in the pyproject.toml file by a patch version
	@echo "🚀 Bumping version in pyproject.toml"
	uv version --no-sources --bump patch

.PHONY: check
check: ## Run code quality tools.
class BrightDataClient:
    """Small HTTP client for the Bright Data request API with a process-wide client cache.

    Clients are cached per ``(api_key, zone)`` pair. A cached instance is never
    mutated after construction, so a client handed out for one zone cannot have
    its ``zone`` changed by a later call that uses the same API key with a
    different zone.
    """

    # Cache of constructed clients, keyed by (api_key, zone).
    _clients: ClassVar[dict[tuple[str, str], "BrightDataClient"]] = {}

    def __init__(self, api_key: str, zone: str = "web_unlocker1") -> None:
        """
        Initialize with API token and default zone.

        Args:
            api_key (str): Your Bright Data API token
            zone (str): Bright Data zone name
        """
        self.api_key = api_key
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        self.zone = zone
        self.endpoint = "https://api.brightdata.com/request"

    @classmethod
    def create_client(cls, api_key: str, zone: str = "web_unlocker1") -> "BrightDataClient":
        """
        Create, or fetch from the cache, the client for ``api_key`` and ``zone``.

        Args:
            api_key (str): Your Bright Data API token
            zone (str): Bright Data zone name
        Returns:
            BrightDataClient: The same instance for repeated calls with the same
            ``api_key`` and ``zone``; a distinct instance for any other pair.
        """
        cache_key = (api_key, zone)
        client = cls._clients.get(cache_key)
        if client is None:
            client = cls(api_key, zone)
            cls._clients[cache_key] = client
        return client

    @classmethod
    def clear_cache(cls) -> None:
        """Clear the client cache."""
        cls._clients.clear()

    def make_request(self, payload: dict) -> str:
        """
        Make a request to Bright Data API.

        Args:
            payload (Dict): Request payload
        Returns:
            str: Response text
        Raises:
            Exception: If the API answers with any status code other than 200.
        """
        # The body is serialized by hand and sent through ``data=``; the
        # Content-Type header set in ``__init__`` already declares it as JSON.
        response = requests.post(
            self.endpoint, headers=self.headers, data=json.dumps(payload), timeout=30
        )

        if response.status_code != 200:
            raise Exception(f"Failed to scrape: {response.status_code} - {response.text}")  # noqa: TRY002

        return response.text

    @staticmethod
    def encode_query(query: str) -> str:
        """URL encode a search query."""
        return quote(query)
class DeviceType(str, Enum):
    """Device profiles accepted by the ``device`` argument of ``search_engine``."""

    # ``search_engine`` sends ``brd_mobile=1`` for this member; the others map
    # to a named ``brd_mobile`` value (ios, ios_tablet, android, android_tablet).
    MOBILE = "mobile"
    IOS = "ios"
    IPHONE = "iphone"
    IPAD = "ipad"
    ANDROID = "android"
    ANDROID_TABLET = "android_tablet"


class SearchEngine(str, Enum):
    """Search engines for which ``search_engine`` knows a base search URL."""

    GOOGLE = "google"
    BING = "bing"
    YANDEX = "yandex"


class SearchType(str, Enum):
    """Specialised result types that ``search_engine`` can request from Google."""

    IMAGES = "images"
    SHOPPING = "shopping"
    NEWS = "news"
    # Requested through ``ibp=htl;jobs`` rather than a ``tbm`` value.
    JOBS = "jobs"


class SourceType(str, Enum):
    """Data sources supported by ``web_data_feed``.

    ``_extract_structured_data`` maps each member to a Bright Data dataset ID.
    """

    AMAZON_PRODUCT = "amazon_product"
    AMAZON_PRODUCT_REVIEWS = "amazon_product_reviews"
    LINKEDIN_PERSON_PROFILE = "linkedin_person_profile"
    LINKEDIN_COMPANY_PROFILE = "linkedin_company_profile"
    ZOOMINFO_COMPANY_PROFILE = "zoominfo_company_profile"
    INSTAGRAM_PROFILES = "instagram_profiles"
    INSTAGRAM_POSTS = "instagram_posts"
    INSTAGRAM_REELS = "instagram_reels"
    INSTAGRAM_COMMENTS = "instagram_comments"
    FACEBOOK_POSTS = "facebook_posts"
    FACEBOOK_MARKETPLACE_LISTINGS = "facebook_marketplace_listings"
    # The only source for which ``web_data_feed`` accepts ``num_of_reviews``.
    FACEBOOK_COMPANY_REVIEWS = "facebook_company_reviews"
    X_POSTS = "x_posts"
    ZILLOW_PROPERTIES_LISTING = "zillow_properties_listing"
    BOOKING_HOTEL_LISTINGS = "booking_hotel_listings"
    YOUTUBE_VIDEOS = "youtube_videos"
# Google ``tbm`` values for the search types that are selected through ``tbm``.
_GOOGLE_TBM_VALUES = {
    SearchType.IMAGES: "isch",
    SearchType.SHOPPING: "shop",
    SearchType.NEWS: "nws",
}

# ``brd_mobile`` value per device; any device not listed here is sent as "1".
_BRD_MOBILE_VALUES = {
    DeviceType.IOS: "ios",
    DeviceType.IPHONE: "ios",
    DeviceType.IPAD: "ios_tablet",
    DeviceType.ANDROID: "android",
    DeviceType.ANDROID_TABLET: "android_tablet",
}


def _build_google_params(
    language: Optional[str],
    country_code: Optional[str],
    search_type: Optional[SearchType],
    start: Optional[int],
    num_results: int,
    location: Optional[str],
    device: Optional[DeviceType],
    return_json: bool,
) -> list[str]:
    """Return the extra ``key=value`` query parameters to append to a Google search URL."""
    params: list[str] = []

    # Free-text values are URL-encoded so that characters such as "&" or "#"
    # cannot inject or truncate query parameters.
    if language:
        params.append(f"hl={BrightDataClient.encode_query(language)}")

    if country_code:
        params.append(f"gl={BrightDataClient.encode_query(country_code)}")

    if search_type:
        if search_type == SearchType.JOBS:
            # Job search is selected with ``ibp`` instead of ``tbm``.
            params.append("ibp=htl;jobs")
        else:
            tbm_value = _GOOGLE_TBM_VALUES.get(search_type, search_type)
            params.append(f"tbm={tbm_value}")

    # ``start=0`` is a legitimate offset, hence the explicit ``None`` check.
    if start is not None:
        params.append(f"start={start}")

    if num_results:
        params.append(f"num={num_results}")

    if location:
        params.append(f"uule={BrightDataClient.encode_query(location)}")

    if device:
        params.append(f"brd_mobile={_BRD_MOBILE_VALUES.get(device, '1')}")

    if return_json:
        params.append("brd_json=1")

    return params


@tool(requires_secrets=["BRIGHTDATA_API_KEY", "BRIGHTDATA_ZONE"])
def search_engine(
    context: ToolContext,
    query: Annotated[str, "Search query"],
    engine: Annotated[SearchEngine, "Search engine to use"] = SearchEngine.GOOGLE,
    language: Annotated[Optional[str], "Two-letter language code"] = None,
    country_code: Annotated[Optional[str], "Two-letter country code"] = None,
    search_type: Annotated[Optional[SearchType], "Type of search"] = None,
    start: Annotated[Optional[int], "Results pagination offset"] = None,
    num_results: Annotated[int, "Number of results to return. The default is 10"] = 10,
    location: Annotated[Optional[str], "Location for search results"] = None,
    device: Annotated[Optional[DeviceType], "Device type"] = None,
    return_json: Annotated[bool, "Return JSON instead of Markdown"] = False,
) -> Annotated[str, "Search results as Markdown or JSON"]:
    """
    Search using Google, Bing, or Yandex with advanced parameters using Bright Data.

    The language, country, search type, pagination, result count, location and
    device parameters are only added to the URL when the engine is Google.

    Examples:
        search_engine("climate change") -> "# Search Results\n\n## Climate Change - Wikipedia\n..."
        search_engine("Python tutorials", engine="bing", num_results=5) -> "# Bing Results\n..."
        search_engine("cats", search_type="images", country_code="us") -> "# Image Results\n..."
    """
    api_key = context.get_secret("BRIGHTDATA_API_KEY")
    zone = context.get_secret("BRIGHTDATA_ZONE")
    client = BrightDataClient.create_client(api_key=api_key, zone=zone)

    encoded_query = BrightDataClient.encode_query(query)

    base_urls = {
        SearchEngine.GOOGLE: f"https://www.google.com/search?q={encoded_query}",
        SearchEngine.BING: f"https://www.bing.com/search?q={encoded_query}",
        SearchEngine.YANDEX: f"https://yandex.com/search/?text={encoded_query}",
    }

    # An unknown engine raises KeyError here.
    search_url = base_urls[engine]

    if engine == SearchEngine.GOOGLE:
        params = _build_google_params(
            language=language,
            country_code=country_code,
            search_type=search_type,
            start=start,
            num_results=num_results,
            location=location,
            device=device,
            return_json=return_json,
        )
        if params:
            search_url += "&" + "&".join(params)

    payload = {
        "url": search_url,
        "zone": zone,
        "format": "raw",
        "data_format": "markdown" if not return_json else "raw",
    }

    return client.make_request(payload)
resource to extract data from"], + num_of_reviews: Annotated[ + Optional[int], + "Number of reviews to retrieve. Only applicable for facebook_company_reviews. Default is None", + ] = None, + timeout: Annotated[int, "Maximum time in seconds to wait for data retrieval"] = 600, + polling_interval: Annotated[int, "Time in seconds between polling attempts"] = 1, +) -> Annotated[str, "Structured data from the requested source as JSON"]: + """ + Extract structured data from various websites like LinkedIn, Amazon, Instagram, etc. + NEVER MADE UP LINKS - IF LINKS ARE NEEDED, EXECUTE search_engine FIRST. + Supported source types: + - amazon_product, amazon_product_reviews + - linkedin_person_profile, linkedin_company_profile + - zoominfo_company_profile + - instagram_profiles, instagram_posts, instagram_reels, instagram_comments + - facebook_posts, facebook_marketplace_listings, facebook_company_reviews + - x_posts + - zillow_properties_listing + - booking_hotel_listings + - youtube_videos + + Examples: + web_data_feed("amazon_product", "https://amazon.com/dp/B08N5WRWNW") -> "{\"title\": \"Product Name\", ...}" + web_data_feed("linkedin_person_profile", "https://linkedin.com/in/johndoe") -> "{\"name\": \"John Doe\", ...}" + web_data_feed("facebook_company_reviews", "https://facebook.com/company", num_of_reviews=50) -> "[{\"review\": \"...\", ...}]" + """ + api_key = context.get_secret("BRIGHTDATA_API_KEY") + client = BrightDataClient.create_client(api_key=api_key) + if num_of_reviews is not None and source_type != SourceType.FACEBOOK_COMPANY_REVIEWS: + raise RetryableToolError( + f"num_of_reviews parameter is only applicable for facebook_company_reviews, not for {source_type.value}", + additional_prompt_content="The num_of_reviews parameter should only be used with facebook_company_reviews source type.", + ) + data = _extract_structured_data( + client=client, + source_type=source_type, + url=url, + num_of_reviews=num_of_reviews, + timeout=timeout, + 
def _extract_structured_data(
    client: BrightDataClient,
    source_type: SourceType,
    url: str,
    num_of_reviews: Optional[int] = None,
    timeout: int = 600,
    polling_interval: int = 1,
) -> dict[str, Any]:
    """
    Trigger a Bright Data dataset collection for ``url`` and poll until its snapshot is ready.

    Args:
        client: Client whose ``headers`` are sent with every request.
        source_type: Data source to collect from; selects the dataset ID.
        url: URL of the web resource to extract data from.
        num_of_reviews: Forwarded (as a string) only for ``facebook_company_reviews``.
        timeout: Maximum time in seconds to keep polling for the snapshot.
        polling_interval: Time in seconds to sleep between polling attempts.

    Returns:
        The decoded JSON body of the first snapshot response that does not
        report a ``running`` or ``building`` status.

    Raises:
        KeyError: If ``source_type`` has no dataset ID.
        RetryableToolError: If the trigger response carries no ``snapshot_id``.
        TimeoutError: If the snapshot is not ready within the polling budget.
    """
    import math  # local import: ``math`` is not imported at file level

    datasets = {
        SourceType.AMAZON_PRODUCT: "gd_l7q7dkf244hwjntr0",
        SourceType.AMAZON_PRODUCT_REVIEWS: "gd_le8e811kzy4ggddlq",
        SourceType.LINKEDIN_PERSON_PROFILE: "gd_l1viktl72bvl7bjuj0",
        SourceType.LINKEDIN_COMPANY_PROFILE: "gd_l1vikfnt1wgvvqz95w",
        SourceType.ZOOMINFO_COMPANY_PROFILE: "gd_m0ci4a4ivx3j5l6nx",
        SourceType.INSTAGRAM_PROFILES: "gd_l1vikfch901nx3by4",
        SourceType.INSTAGRAM_POSTS: "gd_lk5ns7kz21pck8jpis",
        SourceType.INSTAGRAM_REELS: "gd_lyclm20il4r5helnj",
        SourceType.INSTAGRAM_COMMENTS: "gd_ltppn085pokosxh13",
        SourceType.FACEBOOK_POSTS: "gd_lyclm1571iy3mv57zw",
        SourceType.FACEBOOK_MARKETPLACE_LISTINGS: "gd_lvt9iwuh6fbcwmx1a",
        SourceType.FACEBOOK_COMPANY_REVIEWS: "gd_m0dtqpiu1mbcyc2g86",
        SourceType.X_POSTS: "gd_lwxkxvnf1cynvib9co",
        SourceType.ZILLOW_PROPERTIES_LISTING: "gd_lfqkr8wm13ixtbd8f5",
        SourceType.BOOKING_HOTEL_LISTINGS: "gd_m5mbdl081229ln6t4a",
        # NOTE(review): identical to the BOOKING_HOTEL_LISTINGS dataset ID, which
        # looks like a copy-paste slip - confirm the YouTube videos dataset ID.
        SourceType.YOUTUBE_VIDEOS: "gd_m5mbdl081229ln6t4a",
    }

    dataset_id = datasets[source_type]

    request_data = {"url": url}
    if source_type == SourceType.FACEBOOK_COMPANY_REVIEWS and num_of_reviews is not None:
        request_data["num_of_reviews"] = str(num_of_reviews)

    trigger_response = requests.post(
        "https://api.brightdata.com/datasets/v3/trigger",
        params={"dataset_id": dataset_id, "include_errors": "true"},
        headers=client.headers,
        json=[request_data],
        timeout=30,
    )

    # A body that is not JSON, or not a JSON object, is treated like a missing
    # snapshot ID so the caller gets the retryable error rather than a parsing crash.
    try:
        trigger_data = trigger_response.json()
    except ValueError:
        trigger_data = None

    snapshot_id = trigger_data.get("snapshot_id") if isinstance(trigger_data, dict) else None
    if not snapshot_id:
        raise RetryableToolError(
            "No snapshot ID returned from trigger request",
            additional_prompt_content="Invalid input provided, use search_engine to get the relevant data first ",
        )

    # ``timeout`` is a budget in seconds. It is enforced twice: as a number of
    # attempts derived from the polling interval, and as a wall-clock deadline
    # so that slow snapshot requests cannot stretch the total wait.
    pause = max(polling_interval, 0)
    max_attempts = math.ceil(timeout / pause) if pause > 0 else timeout
    deadline = time.monotonic() + timeout

    attempts = 0
    while attempts < max_attempts and time.monotonic() < deadline:
        attempts += 1
        try:
            snapshot_response = requests.get(
                f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}",
                params={"format": "json"},
                headers=client.headers,
                timeout=30,
            )
            snapshot_data = cast(dict[str, Any], snapshot_response.json())
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not JSON: retry after the pause.
            # Any other exception is a real bug and is allowed to propagate.
            time.sleep(pause)
            continue

        still_processing = isinstance(snapshot_data, dict) and snapshot_data.get("status") in (
            "running",
            "building",
        )
        if not still_processing:
            return snapshot_data

        time.sleep(pause)

    raise TimeoutError(f"Timeout after {timeout} seconds waiting for {source_type.value} data")
class TestBrightDataClient:
    """Unit tests for the client cache, query encoding and ``make_request``."""

    def test_get_instance_creates_new_client(self):
        """Different API keys produce distinct clients that keep their own key and zone."""
        first = BrightDataClient.create_client("test_key_1", "zone1")
        second = BrightDataClient.create_client("test_key_2", "zone2")

        assert first != second
        assert (first.api_key, first.zone) == ("test_key_1", "zone1")
        assert (second.api_key, second.zone) == ("test_key_2", "zone2")

    def test_get_instance_returns_cached_client(self):
        """Asking twice for the same key and zone hands back the very same object."""
        first = BrightDataClient.create_client("test_key", "zone1")

        assert BrightDataClient.create_client("test_key", "zone1") is first

    def test_clear_cache(self):
        """After ``clear_cache`` the same key and zone yield a freshly built client."""
        before = BrightDataClient.create_client("test_key", "zone1")
        BrightDataClient.clear_cache()
        after = BrightDataClient.create_client("test_key", "zone1")

        assert before is not after

    def test_encode_query(self):
        """Spaces in a query are percent-encoded."""
        assert BrightDataClient.encode_query("hello world test") == "hello%20world%20test"

    @patch("requests.post")
    def test_make_request_success(self, mock_post):
        """A 200 response returns its text and issues exactly one POST."""
        mock_post.return_value = Mock(status_code=200, text="Success response")

        body = BrightDataClient("test_key", "test_zone").make_request({
            "url": "https://example.com"
        })

        assert body == "Success response"
        mock_post.assert_called_once()

    @patch("requests.post")
    def test_make_request_failure(self, mock_post):
        """A non-200 response raises with the status code and body in the message."""
        mock_post.return_value = Mock(status_code=400, text="Bad Request")
        client = BrightDataClient("test_key", "test_zone")

        with pytest.raises(Exception) as exc_info:
            client.make_request({"url": "https://example.com"})

        assert "Failed to scrape: 400 - Bad Request" in str(exc_info.value)
class TestSearchEngine:
    """Tests for ``search_engine`` with ``BrightDataClient`` replaced by a mock."""

    @patch("brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_basic(self, client_cls, mock_context):
        """The default engine returns the client's response and uses the configured secrets."""
        fake_client = Mock()
        fake_client.make_request.return_value = "# Search Results\n\nResult 1\nResult 2"
        client_cls.create_client.return_value = fake_client
        client_cls.encode_query.return_value = "test%20query"

        output = search_engine(mock_context, "test query")

        assert output == "# Search Results\n\nResult 1\nResult 2"
        client_cls.create_client.assert_called_once_with(
            api_key=BRIGHTDATA_API_KEY, zone=BRIGHTDATA_ZONE
        )

    @patch("brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_bing(self, client_cls, mock_context):
        """Bing searches send the bare Bing URL with Markdown output."""
        fake_client = Mock()
        fake_client.make_request.return_value = "# Bing Results"
        client_cls.create_client.return_value = fake_client
        client_cls.encode_query.return_value = "test%20query"

        output = search_engine(mock_context, "test query", engine="bing")

        assert output == "# Bing Results"
        fake_client.make_request.assert_called_once_with({
            "url": "https://www.bing.com/search?q=test%20query",
            "zone": BRIGHTDATA_ZONE,
            "format": "raw",
            "data_format": "markdown",
        })

    @patch("brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_with_parameters(self, client_cls, mock_context):
        """Every optional Google parameter shows up in the request URL."""
        fake_client = Mock()
        fake_client.make_request.return_value = "# Google Results with params"
        client_cls.create_client.return_value = fake_client
        client_cls.encode_query.side_effect = lambda x: x.replace(" ", "%20")

        output = search_engine(
            mock_context,
            "test query",
            language="en",
            country_code="us",
            search_type="images",
            start=10,
            num_results=20,
            location="New York",
            device=DeviceType.MOBILE,
            return_json=True,
        )

        assert output == "# Google Results with params"
        sent_payload = fake_client.make_request.call_args[0][0]
        sent_url = sent_payload["url"]

        for fragment in (
            "hl=en",
            "gl=us",
            "tbm=isch",
            "start=10",
            "num=20",
            "brd_mobile=1",
            "brd_json=1",
        ):
            assert fragment in sent_url
        assert sent_payload["data_format"] == "raw"

    def test_search_engine_invalid_engine(self, mock_context):
        """An unknown engine name surfaces as a tool execution error."""
        with pytest.raises(ToolExecutionError):
            search_engine(mock_context, "test query", engine="invalid_engine")

    @patch("brightdata.tools.bright_data_tools.BrightDataClient")
    def test_search_engine_google_jobs(self, client_cls, mock_context):
        """Job searches are requested through the ``ibp`` parameter."""
        fake_client = Mock()
        fake_client.make_request.return_value = "# Job Results"
        client_cls.create_client.return_value = fake_client
        client_cls.encode_query.return_value = "python%20developer"

        output = search_engine(mock_context, "python developer", search_type="jobs")

        assert output == "# Job Results"
        sent_payload = fake_client.make_request.call_args[0][0]
        assert "ibp=htl;jobs" in sent_payload["url"]
class TestExtractStructuredData:
    """Tests for ``_extract_structured_data`` with the ``requests`` calls patched out."""

    @staticmethod
    def _json_response(payload):
        """Build a response double whose ``json()`` returns ``payload``."""
        response = Mock()
        response.json.return_value = payload
        return response

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_success(self, mock_post, mock_get):
        """A snapshot that is ready on the first poll is returned after one trigger and one fetch."""
        from brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")
        mock_post.return_value = self._json_response({"snapshot_id": "snap_123"})
        mock_get.return_value = self._json_response({"data": "extracted_data"})

        extracted = _extract_structured_data(
            client=client,
            source_type=SourceType.AMAZON_PRODUCT,
            url="https://amazon.com/dp/TEST",
            timeout=10,
            polling_interval=0.1,
        )

        assert extracted == {"data": "extracted_data"}

        mock_post.assert_called_once()
        assert "gd_l7q7dkf244hwjntr0" in str(mock_post.call_args)  # Amazon product dataset ID

        mock_get.assert_called_once()
        assert "snap_123" in str(mock_get.call_args)

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_with_polling(self, mock_post, mock_get):
        """A ``running`` status triggers another poll; the next non-running body is returned."""
        from brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")
        mock_post.return_value = self._json_response({"snapshot_id": "snap_123"})
        mock_get.side_effect = [
            self._json_response({"status": "running"}),
            self._json_response({"data": "final_data"}),
        ]

        extracted = _extract_structured_data(
            client=client,
            source_type=SourceType.LINKEDIN_PERSON_PROFILE,
            url="https://linkedin.com/in/test",
            timeout=10,
            polling_interval=0.1,
        )

        assert extracted == {"data": "final_data"}
        assert mock_get.call_count == 2

    @patch("requests.post")
    def test_extract_structured_data_invalid_source_type(self, mock_post):
        """A source type without a dataset ID raises ``KeyError``."""
        from brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")

        # Stand-in object that is not a key of the datasets mapping.
        class InvalidSourceType:
            value = "invalid_source"

        with pytest.raises(KeyError):
            _extract_structured_data(
                client=client, source_type=InvalidSourceType(), url="https://example.com"
            )

    @patch("requests.get")
    @patch("requests.post")
    def test_extract_structured_data_no_snapshot_id(self, mock_post, mock_get):
        """A trigger response without ``snapshot_id`` raises with an explanatory message."""
        from brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")
        # Trigger response that carries no snapshot_id at all.
        mock_post.return_value = self._json_response({})

        with pytest.raises(Exception) as exc_info:
            _extract_structured_data(
                client=client,
                source_type=SourceType.AMAZON_PRODUCT,
                url="https://amazon.com/dp/TEST",
            )

        assert "No snapshot ID returned from trigger request" in str(exc_info.value)

    @patch("requests.get")
    @patch("requests.post")
    @patch("time.sleep")
    def test_extract_structured_data_timeout(self, mock_sleep, mock_post, mock_get):
        """A snapshot that never leaves ``running`` ends in ``TimeoutError``."""
        from brightdata.tools.bright_data_tools import _extract_structured_data

        client = BrightDataClient("test_key", "test_zone")
        mock_post.return_value = self._json_response({"snapshot_id": "snap_123"})
        # Every poll reports that the snapshot is still being produced.
        mock_get.return_value = self._json_response({"status": "running"})

        with pytest.raises(TimeoutError) as exc_info:
            _extract_structured_data(
                client=client,
                source_type=SourceType.AMAZON_PRODUCT,
                url="https://amazon.com/dp/TEST",
                timeout=2,
                polling_interval=0.1,
            )

        assert "Timeout after 2 seconds waiting for amazon_product data" in str(exc_info.value)
call_args[1]["headers"]["Authorization"] == f"Bearer {BRIGHTDATA_API_KEY}" + assert "https://api.brightdata.com/request" in str(call_args) + + @patch("requests.post") + def test_search_engine_integration(self, mock_post, mock_context): + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "# Search Results\n\n1. First result\n2. Second result" + mock_post.return_value = mock_response + + result = search_engine(mock_context, "test query", engine="google") + + assert result == "# Search Results\n\n1. First result\n2. Second result" + + call_args = mock_post.call_args + payload = call_args[1]["data"] + assert '"url": "https://www.google.com/search?q=test%20query' in payload + assert '"data_format": "markdown"' in payload