wewrite/scripts/learn_theme.py
wangzhuc 77e76077d8 fix(learn-theme): HTTP error handling, DRY title extraction, text_light fix
- fetch_article: catch RequestException, add raise_for_status()
- Extract _attach_title() shared by fetch_article and _load_from_file
- text_light: only search foreground colors, not background values

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 12:37:47 +08:00

490 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""learn_theme.py — extract a WeWrite-compatible theme from a WeChat article URL.
Usage:
python3 scripts/learn_theme.py <url> # fetch + analyse live article
python3 scripts/learn_theme.py --file <path> # analyse a saved HTML file
"""
import colorsys
import re
import sys
from collections import Counter
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# 1. Color utilities
# ---------------------------------------------------------------------------
def rgb_to_hex(rgb_str: str) -> str:
"""Convert ``rgb(r,g,b)`` or ``rgba(r,g,b,a)`` to ``#rrggbb``.
Pass-through for values that already look like hex (lowercased).
Return the original string unchanged if no pattern matches.
"""
if not isinstance(rgb_str, str):
return rgb_str
s = rgb_str.strip()
# Already hex
if re.match(r"^#[0-9a-fA-F]{3,8}$", s):
return s.lower()
m = re.match(
r"rgba?\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)(?:\s*,\s*[\d.]+)?\s*\)",
s,
re.IGNORECASE,
)
if m:
r, g, b = int(m.group(1)), int(m.group(2)), int(m.group(3))
return "#{:02x}{:02x}{:02x}".format(r, g, b)
return s
def lightness(hex_color: str) -> float:
"""Return HLS lightness (0.01.0) for a hex colour string.
Returns 0.5 for any invalid / non-hex input.
"""
s = hex_color.strip().lstrip("#")
if len(s) == 3:
s = "".join(c * 2 for c in s)
if len(s) != 6:
return 0.5
try:
r = int(s[0:2], 16) / 255.0
g = int(s[2:4], 16) / 255.0
b = int(s[4:6], 16) / 255.0
except ValueError:
return 0.5
_h, l, _s = colorsys.rgb_to_hls(r, g, b)
return l
def is_gray(hex_color: str, threshold: int = 30) -> bool:
"""Return True if R, G, B values are all within *threshold* of each other."""
s = hex_color.strip().lstrip("#")
if len(s) == 3:
s = "".join(c * 2 for c in s)
if len(s) != 6:
return False
try:
r = int(s[0:2], 16)
g = int(s[2:4], 16)
b = int(s[4:6], 16)
except ValueError:
return False
return max(r, g, b) - min(r, g, b) <= threshold
def adjust_lightness(hex_color: str, target_l: float) -> str:
"""Return a new hex colour with lightness set to *target_l* (0.01.0)."""
s = hex_color.strip().lstrip("#")
if len(s) == 3:
s = "".join(c * 2 for c in s)
if len(s) != 6:
return hex_color
try:
r = int(s[0:2], 16) / 255.0
g = int(s[2:4], 16) / 255.0
b = int(s[4:6], 16) / 255.0
except ValueError:
return hex_color
h, _l, sat = colorsys.rgb_to_hls(r, g, b)
nr, ng, nb = colorsys.hls_to_rgb(h, max(0.0, min(1.0, target_l)), sat)
return "#{:02x}{:02x}{:02x}".format(
int(nr * 255), int(ng * 255), int(nb * 255)
)
def derive_darkmode(colors: dict) -> dict:
"""Derive a dark-mode colour dict from a light-mode *colors* dict.
Rules
-----
background → #1e1e1e
text → lightness set to 0.80
text_light → lightness set to 0.60
primary → lightness + 0.15, capped at 0.85
code_bg → #2d2d2d
code_color → #d4d4d4
quote_bg → #252525
quote_border → dark-mode primary
"""
primary = colors.get("primary", "#2563eb")
primary_l = lightness(primary)
dm_primary = adjust_lightness(primary, min(primary_l + 0.15, 0.85))
dm = {
"background": "#1e1e1e",
"text": adjust_lightness(colors.get("text", "#333333"), 0.80),
"text_light": adjust_lightness(colors.get("text_light", "#666666"), 0.60),
"primary": dm_primary,
"code_bg": "#2d2d2d",
"code_color": "#d4d4d4",
"quote_bg": "#252525",
"quote_border": dm_primary,
}
return dm
# ---------------------------------------------------------------------------
# 2. HTML fetch and style extraction
# ---------------------------------------------------------------------------
def parse_inline_style(style_str: str) -> dict:
"""Parse ``"color: red; font-size: 16px"`` into ``{"color": "red", "font-size": "16px"}``."""
result = {}
if not style_str:
return result
for declaration in style_str.split(";"):
declaration = declaration.strip()
if ":" not in declaration:
continue
prop, _, val = declaration.partition(":")
result[prop.strip().lower()] = val.strip()
return result
_TARGET_TAGS = {
"p", "section", "span", "strong", "em",
"h1", "h2", "h3", "h4",
"blockquote", "code", "pre", "img", "a",
}
_BROWSER_UA = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
)
def _attach_title(soup, content) -> None:
"""Find the article title in *soup* and stash it on *content*."""
title_tag = soup.find("h1", class_="rich_media_title") or soup.find(
"h1", id="activity-name"
)
content._wewrite_title = title_tag.get_text(strip=True) if title_tag else ""
def fetch_article(url: str, timeout: int = 20) -> "BeautifulSoup tag":
"""Fetch a WeChat article, return the ``#js_content`` element.
The article title is attached as ``content._wewrite_title`` (empty string
if not found). Exits with code 1 on network errors or missing content.
Parameters
----------
url: WeChat article URL (mp.weixin.qq.com/…)
timeout: HTTP request timeout in seconds (default 20).
"""
try:
resp = requests.get(url, headers={"User-Agent": _BROWSER_UA}, timeout=timeout)
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
print(f"Error: failed to fetch URL: {exc}", file=sys.stderr)
sys.exit(1)
resp.encoding = "utf-8"
soup = BeautifulSoup(resp.text, "html.parser")
content = soup.find(id="js_content")
if content is None:
print("Error: #js_content not found — the page may require verification.", file=sys.stderr)
sys.exit(1)
_attach_title(soup, content)
return content
def extract_styles(content) -> dict:
"""Iterate all elements in *content*, group inline styles by tag name.
Returns ``{tag_name: [style_dict, ...], ...}`` for the target tags.
Only elements that have a non-empty ``style`` attribute are included.
"""
grouped: dict[str, list[dict]] = {tag: [] for tag in _TARGET_TAGS}
for elem in content.find_all(True):
tag = elem.name
if tag not in _TARGET_TAGS:
continue
raw_style = elem.get("style", "")
if not raw_style:
continue
parsed = parse_inline_style(raw_style)
if parsed:
grouped[tag].append(parsed)
return grouped
# ---------------------------------------------------------------------------
# 3. Style analysis
# ---------------------------------------------------------------------------
DEFAULTS = {
"primary": "#2563eb",
"secondary": "#3b82f6",
"text": "#333333",
"text_light": "#666666",
"background": "#ffffff",
"code_bg": "#1e293b",
"code_color": "#e2e8f0",
"quote_border": "#2563eb",
"quote_bg": "#eff6ff",
"border_radius": "8px",
"font_size": "16px",
"line_height": "1.8",
"letter_spacing": "0px",
"font_family": (
'-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, '
'"Helvetica Neue", Arial, "PingFang SC", "Hiragino Sans GB", '
'"Microsoft YaHei", sans-serif'
),
"p_margin": "0 0 16px 0",
}
def most_common_value(style_list: list, prop: str):
"""Return the most common value of CSS *prop* across *style_list*.
Returns ``None`` if the property does not appear in any dict.
"""
values = [d[prop] for d in style_list if prop in d and d[prop]]
if not values:
return None
return Counter(values).most_common(1)[0][0]
def _parse_px(value: str) -> float | None:
"""Parse a CSS pixel value like ``"16px"`` → 16.0, or return None."""
if not value:
return None
m = re.match(r"([\d.]+)\s*px", value.strip(), re.IGNORECASE)
return float(m.group(1)) if m else None
def analyze_styles(grouped: dict) -> dict:
"""Analyse the output of :func:`extract_styles` and return a flat theme dict.
Inferred properties (falling back to DEFAULTS when not found):
text, text_light, primary, secondary, background,
font_size, line_height, letter_spacing, font_family, p_margin,
quote_border, quote_bg, code_bg, code_color, border_radius.
"""
result = dict(DEFAULTS) # start with all defaults
# --- text ------------------------------------------------------------------
p_styles = grouped.get("p", [])
raw_text = most_common_value(p_styles, "color")
if raw_text:
result["text"] = rgb_to_hex(raw_text)
# --- text_light ------------------------------------------------------------
# Collect foreground colours only (not backgrounds) for text_light candidates
all_colors = []
for tag_styles in grouped.values():
for d in tag_styles:
val = d.get("color")
if val:
all_colors.append(rgb_to_hex(val))
text_light_candidates = [
c for c in all_colors
if is_gray(c) and 0.15 < lightness(c) < 0.85 and c != result["text"]
]
if text_light_candidates:
# Pick the one with the highest lightness
result["text_light"] = max(text_light_candidates, key=lightness)
# --- primary (accent color) ------------------------------------------------
# Collect non-gray colors from strong/section/h1-h3/span; boost colors from
# elements whose font-size is ≥ 20 px (weight × 5).
# Exclude the dominant text color so near-black body text never wins.
accent_tags = {"strong", "section", "h1", "h2", "h3", "span"}
accent_counter: Counter = Counter()
for tag in accent_tags:
for d in grouped.get(tag, []):
color_val = d.get("color")
if not color_val:
continue
hex_c = rgb_to_hex(color_val)
if is_gray(hex_c):
continue
if hex_c == result["text"]:
continue
# Check font-size for boost
fs = d.get("font-size")
fs_px = _parse_px(fs) if fs else None
weight = 5 if (fs_px is not None and fs_px >= 20) else 1
accent_counter[hex_c] += weight
if accent_counter:
sorted_accents = accent_counter.most_common()
result["primary"] = sorted_accents[0][0]
# --- secondary ---------------------------------------------------------
if len(sorted_accents) >= 2:
result["secondary"] = sorted_accents[1][0]
else:
# Derive: primary + 10% lightness, cap 0.90
primary_l = lightness(result["primary"])
result["secondary"] = adjust_lightness(
result["primary"], min(primary_l + 0.10, 0.90)
)
else:
# No accent found — derive secondary from default primary
primary_l = lightness(result["primary"])
result["secondary"] = adjust_lightness(
result["primary"], min(primary_l + 0.10, 0.90)
)
# --- background ------------------------------------------------------------
# Check background-color of the first few <section> elements for high lightness
for d in (grouped.get("section", []))[:10]:
bg = d.get("background-color") or d.get("background")
if bg:
hex_bg = rgb_to_hex(bg)
if lightness(hex_bg) > 0.85:
result["background"] = hex_bg
break
# --- typography (from <p>) -------------------------------------------------
if p_styles:
fs = most_common_value(p_styles, "font-size")
if fs:
result["font_size"] = fs
lh = most_common_value(p_styles, "line-height")
if lh:
result["line_height"] = lh
ls = most_common_value(p_styles, "letter-spacing")
if ls:
result["letter_spacing"] = ls
margin = most_common_value(p_styles, "margin")
if margin:
result["p_margin"] = margin
# font-family from <span>
span_styles = grouped.get("span", [])
ff = most_common_value(span_styles, "font-family")
if ff:
result["font_family"] = ff
# --- quote_border / quote_bg -----------------------------------------------
# Priority: actual <blockquote> elements first.
# For section/p: only use a background when a border-left is also present on
# that element (avoids picking up decorative divider colors).
bq_border = None
bq_bg = None
# Pass 1: blockquote (highest confidence)
for d in grouped.get("blockquote", []):
bl = d.get("border-left") or d.get("border-left-color")
if bl and not bq_border:
color_match = re.search(r"(rgb[a]?\([^)]+\)|#[0-9a-fA-F]{3,8})", bl)
if color_match:
bq_border = rgb_to_hex(color_match.group(1))
bg = d.get("background-color") or d.get("background")
if bg and not bq_bg:
hex_bg = rgb_to_hex(bg)
if hex_bg not in ("#ffffff", "#000000") and not is_gray(hex_bg, threshold=10):
bq_bg = hex_bg
# Pass 2: section/p — only trust backgrounds that co-occur with border-left
if not bq_border:
for tag in ("section", "p"):
for d in grouped.get(tag, []):
bl = d.get("border-left") or d.get("border-left-color")
if bl:
color_match = re.search(r"(rgb[a]?\([^)]+\)|#[0-9a-fA-F]{3,8})", bl)
if color_match and not bq_border:
bq_border = rgb_to_hex(color_match.group(1))
bg = d.get("background-color") or d.get("background")
if bg and not bq_bg:
hex_bg = rgb_to_hex(bg)
if hex_bg not in ("#ffffff", "#000000") and not is_gray(
hex_bg, threshold=10
):
bq_bg = hex_bg
if bq_border:
result["quote_border"] = bq_border
else:
result["quote_border"] = result["primary"]
if bq_bg:
result["quote_bg"] = bq_bg
else:
# Derive a light tint of primary
primary_l = lightness(result["primary"])
result["quote_bg"] = adjust_lightness(result["primary"], min(primary_l + 0.35, 0.95))
# --- code_bg / code_color --------------------------------------------------
for tag in ("pre", "code"):
tag_styles = grouped.get(tag, [])
bg = most_common_value(tag_styles, "background-color") or most_common_value(
tag_styles, "background"
)
if bg:
result["code_bg"] = rgb_to_hex(bg)
color = most_common_value(tag_styles, "color")
if color:
result["code_color"] = rgb_to_hex(color)
# --- border_radius ---------------------------------------------------------
all_radii = []
for tag_styles in grouped.values():
for d in tag_styles:
br = d.get("border-radius")
if br:
all_radii.append(br)
if all_radii:
result["border_radius"] = Counter(all_radii).most_common(1)[0][0]
return result
# ---------------------------------------------------------------------------
# CLI entry point / smoke test
# ---------------------------------------------------------------------------
def _load_from_file(path: str):
"""Load #js_content from a local HTML file (for smoke testing)."""
with open(path, encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
content = soup.find(id="js_content")
if content is None:
print(f"Error: #js_content not found in {path}", file=sys.stderr)
sys.exit(1)
_attach_title(soup, content)
return content
def main():
args = sys.argv[1:]
if not args:
print(__doc__)
sys.exit(0)
if args[0] == "--file" and len(args) >= 2:
content = _load_from_file(args[1])
else:
content = fetch_article(args[0])
print(f"Title: {content._wewrite_title}")
grouped = extract_styles(content)
print("Elements with styles:")
for tag, styles in grouped.items():
if styles:
print(f" <{tag}>: {len(styles)} elements")
theme = analyze_styles(grouped)
print("\nInferred theme:")
for key, val in theme.items():
print(f" {key}: {val}")
# Dark mode
dm = derive_darkmode(theme)
print("\nDerived dark mode:")
for key, val in dm.items():
print(f" {key}: {val}")
if __name__ == "__main__":
main()