feat: add Camoufox anti-detection browser and fix visibility:hidden bug

Add Camoufox as the Level 2 fetcher to bypass WeChat bot verification. Fix the inline visibility:hidden style on #js_content that caused empty Markdown output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 22:22:18 +08:00 · 2026-04-02 22:22:18 +08:00 · 8077635f25
commit 8077635f25
parent 9c510724ba
5 changed files with 89 additions and 17 deletions
--- a/dist/openclaw/VERSION
+++ b/dist/openclaw/VERSION
@ -1 +1 @@
-1.3.4
+1.3.5
--- a/dist/openclaw/requirements.txt
+++ b/dist/openclaw/requirements.txt
@ -2,6 +2,7 @@ markdown>=3.5
 beautifulsoup4>=4.12
 cssutils>=2.9
 requests>=2.31
+camoufox[geoip]>=0.4
 pyyaml>=6.0
 Pygments>=2.15
 Pillow>=10.0
--- a/dist/openclaw/scripts/fetch_article.py
+++ b/dist/openclaw/scripts/fetch_article.py
@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """fetch_article.py — extract WeChat article content as Markdown.

-Three-level fetching strategy:
+Four-level fetching strategy:
  Level 1: requests (fast, zero overhead, works for most articles)
-  Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks)
-  Level 3: Prompt user to save HTML manually and pass via --file
+  Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
+  Level 3: Playwright headless Chrome (fallback)
+  Level 4: Prompt user to save HTML manually and pass via --file

 Usage:
    python3 scripts/fetch_article.py <url>                    # auto fetch
@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
        return None


+def _fetch_camoufox(url: str) -> str | None:
+    """Level 2: Camoufox anti-detection browser. Returns HTML or None."""
+    try:
+        from camoufox.sync_api import Camoufox
+    except ImportError:
+        return None
+
+    try:
+        with Camoufox(headless=True) as browser:
+            page = browser.new_page()
+            page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            try:
+                page.wait_for_selector("#js_content", timeout=10000)
+            except Exception:
+                pass  # timeout — still try to parse
+            import time
+            time.sleep(2)  # let JS finish rendering
+            html = page.content()
+            return html
+    except Exception:
+        return None
+
+
 def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
-    """Level 2: Playwright headless Chrome. Returns HTML or None."""
+    """Level 3: Playwright headless Chrome. Returns HTML or None."""
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:

    Returns HTML string. Exits with error if all levels fail.
    """
-    # Level 1
+    # Level 1: plain requests
    html = _fetch_requests(url)
    if html and _has_content(html):
        return html

-    # Level 2
-    print("requests 未获取到正文，尝试 Playwright...", file=sys.stderr)
+    # Level 2: Camoufox anti-detection browser
+    print("requests 未获取到正文，尝试 Camoufox...", file=sys.stderr)
+    html = _fetch_camoufox(url)
+    if html and _has_content(html):
+        return html
+
+    # Level 3: Playwright fallback
+    print("Camoufox 未获取到正文，尝试 Playwright...", file=sys.stderr)
    html = _fetch_playwright(url)
    if html and _has_content(html):
        return html

-    # Level 3
+    # Level 4: manual
    print(
        "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
        file=sys.stderr,
@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
    if content is None:
        return ""

+    # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
+    # Strip the style so _elem_to_md doesn't skip the entire container.
+    if content.get("style"):
+        del content["style"]
+
    raw = _elem_to_md(content)

    # Clean up excessive whitespace
--- a/requirements.txt
+++ b/requirements.txt
@ -2,6 +2,7 @@ markdown>=3.5
 beautifulsoup4>=4.12
 cssutils>=2.9
 requests>=2.31
+camoufox[geoip]>=0.4
 pyyaml>=6.0
 Pygments>=2.15
 Pillow>=10.0
--- a/scripts/fetch_article.py
+++ b/scripts/fetch_article.py
@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """fetch_article.py — extract WeChat article content as Markdown.

-Three-level fetching strategy:
+Four-level fetching strategy:
  Level 1: requests (fast, zero overhead, works for most articles)
-  Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks)
-  Level 3: Prompt user to save HTML manually and pass via --file
+  Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
+  Level 3: Playwright headless Chrome (fallback)
+  Level 4: Prompt user to save HTML manually and pass via --file

 Usage:
    python3 scripts/fetch_article.py <url>                    # auto fetch
@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
        return None


+def _fetch_camoufox(url: str) -> str | None:
+    """Level 2: Camoufox anti-detection browser. Returns HTML or None."""
+    try:
+        from camoufox.sync_api import Camoufox
+    except ImportError:
+        return None
+
+    try:
+        with Camoufox(headless=True) as browser:
+            page = browser.new_page()
+            page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            try:
+                page.wait_for_selector("#js_content", timeout=10000)
+            except Exception:
+                pass  # timeout — still try to parse
+            import time
+            time.sleep(2)  # let JS finish rendering
+            html = page.content()
+            return html
+    except Exception:
+        return None
+
+
 def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
-    """Level 2: Playwright headless Chrome. Returns HTML or None."""
+    """Level 3: Playwright headless Chrome. Returns HTML or None."""
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:

    Returns HTML string. Exits with error if all levels fail.
    """
-    # Level 1
+    # Level 1: plain requests
    html = _fetch_requests(url)
    if html and _has_content(html):
        return html

-    # Level 2
-    print("requests 未获取到正文，尝试 Playwright...", file=sys.stderr)
+    # Level 2: Camoufox anti-detection browser
+    print("requests 未获取到正文，尝试 Camoufox...", file=sys.stderr)
+    html = _fetch_camoufox(url)
+    if html and _has_content(html):
+        return html
+
+    # Level 3: Playwright fallback
+    print("Camoufox 未获取到正文，尝试 Playwright...", file=sys.stderr)
    html = _fetch_playwright(url)
    if html and _has_content(html):
        return html

-    # Level 3
+    # Level 4: manual
    print(
        "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
        file=sys.stderr,
@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
    if content is None:
        return ""

+    # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
+    # Strip the style so _elem_to_md doesn't skip the entire container.
+    if content.get("style"):
+        del content["style"]
+
    raw = _elem_to_md(content)

    # Clean up excessive whitespace
@ -1 +1 @@
-1.3.4
+1.3.5