From 8077635f259de20a49bf903143be267767d9a2ca Mon Sep 17 00:00:00 2001 From: wangzhuc Date: Thu, 2 Apr 2026 22:22:18 +0800 Subject: [PATCH] feat: add Camoufox anti-detection browser and fix visibility:hidden bug Add Camoufox as Level 2 fetcher to bypass WeChat bot verification. Fix #js_content visibility:hidden style causing empty markdown output. Co-Authored-By: Claude Opus 4.6 (1M context) --- dist/openclaw/VERSION | 2 +- dist/openclaw/requirements.txt | 1 + dist/openclaw/scripts/fetch_article.py | 51 ++++++++++++++++++++++---- requirements.txt | 1 + scripts/fetch_article.py | 51 ++++++++++++++++++++++---- 5 files changed, 89 insertions(+), 17 deletions(-) diff --git a/dist/openclaw/VERSION b/dist/openclaw/VERSION index d0149fe..80e78df 100644 --- a/dist/openclaw/VERSION +++ b/dist/openclaw/VERSION @@ -1 +1 @@ -1.3.4 +1.3.5 diff --git a/dist/openclaw/requirements.txt b/dist/openclaw/requirements.txt index f4dedc8..2fbddbd 100644 --- a/dist/openclaw/requirements.txt +++ b/dist/openclaw/requirements.txt @@ -2,6 +2,7 @@ markdown>=3.5 beautifulsoup4>=4.12 cssutils>=2.9 requests>=2.31 +camoufox[geoip]>=0.4 pyyaml>=6.0 Pygments>=2.15 Pillow>=10.0 diff --git a/dist/openclaw/scripts/fetch_article.py b/dist/openclaw/scripts/fetch_article.py index 811cf3a..4d64722 100644 --- a/dist/openclaw/scripts/fetch_article.py +++ b/dist/openclaw/scripts/fetch_article.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 """fetch_article.py — extract WeChat article content as Markdown. -Three-level fetching strategy: +Four-level fetching strategy: Level 1: requests (fast, zero overhead, works for most articles) - Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks) - Level 3: Prompt user to save HTML manually and pass via --file + Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification) + Level 3: Playwright headless Chrome (fallback) + Level 4: Prompt user to save HTML manually and pass via --file Usage: python3 scripts/fetch_article.py # auto fetch @@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None: return None +def _fetch_camoufox(url: str) -> str | None: + """Level 2: Camoufox anti-detection browser. Returns HTML or None.""" + try: + from camoufox.sync_api import Camoufox + except ImportError: + return None + + try: + with Camoufox(headless=True) as browser: + page = browser.new_page() + page.goto(url, wait_until="domcontentloaded", timeout=30000) + try: + page.wait_for_selector("#js_content", timeout=10000) + except Exception: + pass # timeout — still try to parse + import time + time.sleep(2) # let JS finish rendering + html = page.content() + return html + except Exception: + return None + + def _fetch_playwright(url: str, timeout: int = 30000) -> str | None: - """Level 2: Playwright headless Chrome. Returns HTML or None.""" + """Level 3: Playwright headless Chrome. Returns HTML or None.""" try: from playwright.sync_api import sync_playwright except ImportError: @@ -70,18 +94,24 @@ def fetch_html(url: str) -> str: Returns HTML string. Exits with error if all levels fail. """ - # Level 1 + # Level 1: plain requests html = _fetch_requests(url) if html and _has_content(html): return html - # Level 2 - print("requests 未获取到正文,尝试 Playwright...", file=sys.stderr) + # Level 2: Camoufox anti-detection browser + print("requests 未获取到正文,尝试 Camoufox...", file=sys.stderr) + html = _fetch_camoufox(url) + if html and _has_content(html): + return html + + # Level 3: Playwright fallback + print("Camoufox 未获取到正文,尝试 Playwright...", file=sys.stderr) html = _fetch_playwright(url) if html and _has_content(html): return html - # Level 3 + # Level 4: manual print( "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。", file=sys.stderr, @@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str: if content is None: return "" + # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later. + # Strip the style so _elem_to_md doesn't skip the entire container. + if content.get("style"): + del content["style"] + raw = _elem_to_md(content) # Clean up excessive whitespace diff --git a/requirements.txt b/requirements.txt index f4dedc8..2fbddbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ markdown>=3.5 beautifulsoup4>=4.12 cssutils>=2.9 requests>=2.31 +camoufox[geoip]>=0.4 pyyaml>=6.0 Pygments>=2.15 Pillow>=10.0 diff --git a/scripts/fetch_article.py b/scripts/fetch_article.py index 811cf3a..4d64722 100644 --- a/scripts/fetch_article.py +++ b/scripts/fetch_article.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 """fetch_article.py — extract WeChat article content as Markdown. -Three-level fetching strategy: +Four-level fetching strategy: Level 1: requests (fast, zero overhead, works for most articles) - Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks) - Level 3: Prompt user to save HTML manually and pass via --file + Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification) + Level 3: Playwright headless Chrome (fallback) + Level 4: Prompt user to save HTML manually and pass via --file Usage: python3 scripts/fetch_article.py # auto fetch @@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None: return None +def _fetch_camoufox(url: str) -> str | None: + """Level 2: Camoufox anti-detection browser. Returns HTML or None.""" + try: + from camoufox.sync_api import Camoufox + except ImportError: + return None + + try: + with Camoufox(headless=True) as browser: + page = browser.new_page() + page.goto(url, wait_until="domcontentloaded", timeout=30000) + try: + page.wait_for_selector("#js_content", timeout=10000) + except Exception: + pass # timeout — still try to parse + import time + time.sleep(2) # let JS finish rendering + html = page.content() + return html + except Exception: + return None + + def _fetch_playwright(url: str, timeout: int = 30000) -> str | None: - """Level 2: Playwright headless Chrome. Returns HTML or None.""" + """Level 3: Playwright headless Chrome. Returns HTML or None.""" try: from playwright.sync_api import sync_playwright except ImportError: @@ -70,18 +94,24 @@ def fetch_html(url: str) -> str: Returns HTML string. Exits with error if all levels fail. """ - # Level 1 + # Level 1: plain requests html = _fetch_requests(url) if html and _has_content(html): return html - # Level 2 - print("requests 未获取到正文,尝试 Playwright...", file=sys.stderr) + # Level 2: Camoufox anti-detection browser + print("requests 未获取到正文,尝试 Camoufox...", file=sys.stderr) + html = _fetch_camoufox(url) + if html and _has_content(html): + return html + + # Level 3: Playwright fallback + print("Camoufox 未获取到正文,尝试 Playwright...", file=sys.stderr) html = _fetch_playwright(url) if html and _has_content(html): return html - # Level 3 + # Level 4: manual print( "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。", file=sys.stderr, @@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str: if content is None: return "" + # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later. + # Strip the style so _elem_to_md doesn't skip the entire container. + if content.get("style"): + del content["style"] + raw = _elem_to_md(content) # Clean up excessive whitespace