From 8077635f259de20a49bf903143be267767d9a2ca Mon Sep 17 00:00:00 2001
From: wangzhuc <wangzhuc@outlook.com>
Date: Thu, 2 Apr 2026 22:22:18 +0800
Subject: [PATCH] feat: add Camoufox anti-detection browser and fix
 visibility:hidden bug

Add Camoufox as Level 2 fetcher to bypass WeChat bot verification.
Fix #js_content visibility:hidden style causing empty markdown output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 dist/openclaw/VERSION                  |  2 +-
 dist/openclaw/requirements.txt         |  1 +
 dist/openclaw/scripts/fetch_article.py | 51 ++++++++++++++++++++++----
 requirements.txt                       |  1 +
 scripts/fetch_article.py               | 51 ++++++++++++++++++++++----
 5 files changed, 89 insertions(+), 17 deletions(-)
diff --git a/dist/openclaw/VERSION b/dist/openclaw/VERSION
index d0149fe..80e78df 100644
--- a/dist/openclaw/VERSION
+++ b/dist/openclaw/VERSION
@@ -1 +1 @@
-1.3.4
+1.3.5
diff --git a/dist/openclaw/requirements.txt b/dist/openclaw/requirements.txt
index f4dedc8..2fbddbd 100644
--- a/dist/openclaw/requirements.txt
+++ b/dist/openclaw/requirements.txt
@@ -2,6 +2,7 @@ markdown>=3.5
 beautifulsoup4>=4.12
 cssutils>=2.9
 requests>=2.31
+camoufox[geoip]>=0.4
 pyyaml>=6.0
 Pygments>=2.15
 Pillow>=10.0
diff --git a/dist/openclaw/scripts/fetch_article.py b/dist/openclaw/scripts/fetch_article.py
index 811cf3a..4d64722 100644
--- a/dist/openclaw/scripts/fetch_article.py
+++ b/dist/openclaw/scripts/fetch_article.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """fetch_article.py — extract WeChat article content as Markdown.
 
-Three-level fetching strategy:
+Four-level fetching strategy:
   Level 1: requests (fast, zero overhead, works for most articles)
-  Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks)
-  Level 3: Prompt user to save HTML manually and pass via --file
+  Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
+  Level 3: Playwright headless Chrome (fallback)
+  Level 4: Prompt user to save HTML manually and pass via --file
 
 Usage:
     python3 scripts/fetch_article.py <url>                    # auto fetch
@@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
         return None
 
 
+def _fetch_camoufox(url: str) -> str | None:
+    """Level 2: Camoufox anti-detection browser. Returns HTML or None."""
+    try:
+        from camoufox.sync_api import Camoufox
+    except ImportError:
+        return None
+
+    try:
+        with Camoufox(headless=True) as browser:
+            page = browser.new_page()
+            page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            try:
+                page.wait_for_selector("#js_content", timeout=10000)
+            except Exception:
+                pass  # wait failed or timed out — still capture the page
+            import time
+            time.sleep(2)  # let JS finish rendering
+            html = page.content()
+            return html
+    except Exception:
+        return None
+
+
 def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
-    """Level 2: Playwright headless Chrome. Returns HTML or None."""
+    """Level 3: Playwright headless Chrome. Returns HTML or None."""
     try:
         from playwright.sync_api import sync_playwright
     except ImportError:
@@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:
 
     Returns HTML string. Exits with error if all levels fail.
     """
-    # Level 1
+    # Level 1: plain requests
     html = _fetch_requests(url)
     if html and _has_content(html):
         return html
 
-    # Level 2
-    print("requests 未获取到正文，尝试 Playwright...", file=sys.stderr)
+    # Level 2: Camoufox anti-detection browser
+    print("requests 未获取到正文，尝试 Camoufox...", file=sys.stderr)
+    html = _fetch_camoufox(url)
+    if html and _has_content(html):
+        return html
+
+    # Level 3: Playwright fallback
+    print("Camoufox 未获取到正文，尝试 Playwright...", file=sys.stderr)
     html = _fetch_playwright(url)
     if html and _has_content(html):
         return html
 
-    # Level 3
+    # Level 4: manual
     print(
         "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
         file=sys.stderr,
@@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
     if content is None:
         return ""
 
+    # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
+    # Drop its whole inline style (not just visibility) so _elem_to_md doesn't skip it.
+    if content.get("style"):
+        del content["style"]
+
     raw = _elem_to_md(content)
 
     # Clean up excessive whitespace
diff --git a/requirements.txt b/requirements.txt
index f4dedc8..2fbddbd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ markdown>=3.5
 beautifulsoup4>=4.12
 cssutils>=2.9
 requests>=2.31
+camoufox[geoip]>=0.4
 pyyaml>=6.0
 Pygments>=2.15
 Pillow>=10.0
diff --git a/scripts/fetch_article.py b/scripts/fetch_article.py
index 811cf3a..4d64722 100644
--- a/scripts/fetch_article.py
+++ b/scripts/fetch_article.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """fetch_article.py — extract WeChat article content as Markdown.
 
-Three-level fetching strategy:
+Four-level fetching strategy:
   Level 1: requests (fast, zero overhead, works for most articles)
-  Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks)
-  Level 3: Prompt user to save HTML manually and pass via --file
+  Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
+  Level 3: Playwright headless Chrome (fallback)
+  Level 4: Prompt user to save HTML manually and pass via --file
 
 Usage:
     python3 scripts/fetch_article.py <url>                    # auto fetch
@@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
         return None
 
 
+def _fetch_camoufox(url: str) -> str | None:
+    """Level 2: Camoufox anti-detection browser. Returns HTML or None."""
+    try:
+        from camoufox.sync_api import Camoufox
+    except ImportError:
+        return None
+
+    try:
+        with Camoufox(headless=True) as browser:
+            page = browser.new_page()
+            page.goto(url, wait_until="domcontentloaded", timeout=30000)
+            try:
+                page.wait_for_selector("#js_content", timeout=10000)
+            except Exception:
+                pass  # wait failed or timed out — still capture the page
+            import time
+            time.sleep(2)  # let JS finish rendering
+            html = page.content()
+            return html
+    except Exception:
+        return None
+
+
 def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
-    """Level 2: Playwright headless Chrome. Returns HTML or None."""
+    """Level 3: Playwright headless Chrome. Returns HTML or None."""
     try:
         from playwright.sync_api import sync_playwright
     except ImportError:
@@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:
 
     Returns HTML string. Exits with error if all levels fail.
     """
-    # Level 1
+    # Level 1: plain requests
     html = _fetch_requests(url)
     if html and _has_content(html):
         return html
 
-    # Level 2
-    print("requests 未获取到正文，尝试 Playwright...", file=sys.stderr)
+    # Level 2: Camoufox anti-detection browser
+    print("requests 未获取到正文，尝试 Camoufox...", file=sys.stderr)
+    html = _fetch_camoufox(url)
+    if html and _has_content(html):
+        return html
+
+    # Level 3: Playwright fallback
+    print("Camoufox 未获取到正文，尝试 Playwright...", file=sys.stderr)
     html = _fetch_playwright(url)
     if html and _has_content(html):
         return html
 
-    # Level 3
+    # Level 4: manual
     print(
         "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
         file=sys.stderr,
@@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
     if content is None:
         return ""
 
+    # WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
+    # Drop its whole inline style (not just visibility) so _elem_to_md doesn't skip it.
+    if content.get("style"):
+        del content["style"]
+
     raw = _elem_to_md(content)
 
     # Clean up excessive whitespace