feat: add Camoufox anti-detection browser and fix visibility:hidden bug

Add Camoufox as Level 2 fetcher to bypass WeChat bot verification.
Fix #js_content visibility:hidden style causing empty markdown output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
wangzhuc 2026-04-02 22:22:18 +08:00 committed by github-actions[bot]
parent 9c510724ba
commit 8077635f25
5 changed files with 89 additions and 17 deletions

View file

@ -1 +1 @@
1.3.4 1.3.5

View file

@ -2,6 +2,7 @@ markdown>=3.5
beautifulsoup4>=4.12 beautifulsoup4>=4.12
cssutils>=2.9 cssutils>=2.9
requests>=2.31 requests>=2.31
camoufox[geoip]>=0.4
pyyaml>=6.0 pyyaml>=6.0
Pygments>=2.15 Pygments>=2.15
Pillow>=10.0 Pillow>=10.0

View file

@ -1,10 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""fetch_article.py — extract WeChat article content as Markdown. """fetch_article.py — extract WeChat article content as Markdown.
Three-level fetching strategy: Four-level fetching strategy:
Level 1: requests (fast, zero overhead, works for most articles) Level 1: requests (fast, zero overhead, works for most articles)
Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks) Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
Level 3: Prompt user to save HTML manually and pass via --file Level 3: Playwright headless Chrome (fallback)
Level 4: Prompt user to save HTML manually and pass via --file
Usage: Usage:
python3 scripts/fetch_article.py <url> # auto fetch python3 scripts/fetch_article.py <url> # auto fetch
@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
return None return None
def _fetch_camoufox(url: str) -> str | None:
"""Level 2: Camoufox anti-detection browser. Returns HTML or None."""
try:
from camoufox.sync_api import Camoufox
except ImportError:
return None
try:
with Camoufox(headless=True) as browser:
page = browser.new_page()
page.goto(url, wait_until="domcontentloaded", timeout=30000)
try:
page.wait_for_selector("#js_content", timeout=10000)
except Exception:
pass # timeout — still try to parse
import time
time.sleep(2) # let JS finish rendering
html = page.content()
return html
except Exception:
return None
def _fetch_playwright(url: str, timeout: int = 30000) -> str | None: def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
"""Level 2: Playwright headless Chrome. Returns HTML or None.""" """Level 3: Playwright headless Chrome. Returns HTML or None."""
try: try:
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
except ImportError: except ImportError:
@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:
Returns HTML string. Exits with error if all levels fail. Returns HTML string. Exits with error if all levels fail.
""" """
# Level 1 # Level 1: plain requests
html = _fetch_requests(url) html = _fetch_requests(url)
if html and _has_content(html): if html and _has_content(html):
return html return html
# Level 2 # Level 2: Camoufox anti-detection browser
print("requests 未获取到正文,尝试 Playwright...", file=sys.stderr) print("requests 未获取到正文,尝试 Camoufox...", file=sys.stderr)
html = _fetch_camoufox(url)
if html and _has_content(html):
return html
# Level 3: Playwright fallback
print("Camoufox 未获取到正文,尝试 Playwright...", file=sys.stderr)
html = _fetch_playwright(url) html = _fetch_playwright(url)
if html and _has_content(html): if html and _has_content(html):
return html return html
# Level 3 # Level 4: manual
print( print(
"Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。", "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
file=sys.stderr, file=sys.stderr,
@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
if content is None: if content is None:
return "" return ""
# WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
# Strip the style so _elem_to_md doesn't skip the entire container.
if content.get("style"):
del content["style"]
raw = _elem_to_md(content) raw = _elem_to_md(content)
# Clean up excessive whitespace # Clean up excessive whitespace

View file

@ -2,6 +2,7 @@ markdown>=3.5
beautifulsoup4>=4.12 beautifulsoup4>=4.12
cssutils>=2.9 cssutils>=2.9
requests>=2.31 requests>=2.31
camoufox[geoip]>=0.4
pyyaml>=6.0 pyyaml>=6.0
Pygments>=2.15 Pygments>=2.15
Pillow>=10.0 Pillow>=10.0

View file

@ -1,10 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""fetch_article.py — extract WeChat article content as Markdown. """fetch_article.py — extract WeChat article content as Markdown.
Three-level fetching strategy: Four-level fetching strategy:
Level 1: requests (fast, zero overhead, works for most articles) Level 1: requests (fast, zero overhead, works for most articles)
Level 2: Playwright headless Chrome (bypasses anti-scraping JS checks) Level 2: Camoufox anti-detection browser (bypasses WeChat bot verification)
Level 3: Prompt user to save HTML manually and pass via --file Level 3: Playwright headless Chrome (fallback)
Level 4: Prompt user to save HTML manually and pass via --file
Usage: Usage:
python3 scripts/fetch_article.py <url> # auto fetch python3 scripts/fetch_article.py <url> # auto fetch
@ -44,8 +45,31 @@ def _fetch_requests(url: str, timeout: int = 20) -> str | None:
return None return None
def _fetch_camoufox(url: str) -> str | None:
"""Level 2: Camoufox anti-detection browser. Returns HTML or None."""
try:
from camoufox.sync_api import Camoufox
except ImportError:
return None
try:
with Camoufox(headless=True) as browser:
page = browser.new_page()
page.goto(url, wait_until="domcontentloaded", timeout=30000)
try:
page.wait_for_selector("#js_content", timeout=10000)
except Exception:
pass # timeout — still try to parse
import time
time.sleep(2) # let JS finish rendering
html = page.content()
return html
except Exception:
return None
def _fetch_playwright(url: str, timeout: int = 30000) -> str | None: def _fetch_playwright(url: str, timeout: int = 30000) -> str | None:
"""Level 2: Playwright headless Chrome. Returns HTML or None.""" """Level 3: Playwright headless Chrome. Returns HTML or None."""
try: try:
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
except ImportError: except ImportError:
@ -70,18 +94,24 @@ def fetch_html(url: str) -> str:
Returns HTML string. Exits with error if all levels fail. Returns HTML string. Exits with error if all levels fail.
""" """
# Level 1 # Level 1: plain requests
html = _fetch_requests(url) html = _fetch_requests(url)
if html and _has_content(html): if html and _has_content(html):
return html return html
# Level 2 # Level 2: Camoufox anti-detection browser
print("requests 未获取到正文,尝试 Playwright...", file=sys.stderr) print("requests 未获取到正文,尝试 Camoufox...", file=sys.stderr)
html = _fetch_camoufox(url)
if html and _has_content(html):
return html
# Level 3: Playwright fallback
print("Camoufox 未获取到正文,尝试 Playwright...", file=sys.stderr)
html = _fetch_playwright(url) html = _fetch_playwright(url)
if html and _has_content(html): if html and _has_content(html):
return html return html
# Level 3 # Level 4: manual
print( print(
"Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。", "Error: 无法获取文章内容。请在浏览器中打开文章 → 右键另存为 HTML → 使用 --file 参数传入。",
file=sys.stderr, file=sys.stderr,
@ -231,6 +261,11 @@ def html_to_markdown(soup: BeautifulSoup) -> str:
if content is None: if content is None:
return "" return ""
# WeChat lazy-loads #js_content with visibility:hidden; JS removes it later.
# Strip the style so _elem_to_md doesn't skip the entire container.
if content.get("style"):
del content["style"]
raw = _elem_to_md(content) raw = _elem_to_md(content)
# Clean up excessive whitespace # Clean up excessive whitespace