wewrite/scripts/learn_edits.py
wangzhuc 73a67fffc7 fix: improve local file matching for WeChat draft sync
- learn_edits.py: prioritize output_file field from history.yaml,
  fall back to title slug matching, then largest file
- SKILL.md: add output_file field to history.yaml schema
- Fixes wrong file match when multiple articles share the same date

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 16:20:17 +08:00

515 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Learn from human edits by diffing AI draft vs published final.
Compares the original AI-generated article with the human-edited version,
computes structured diffs, and saves typed lessons to lessons/.
Each lesson has:
- type: word_sub / para_delete / para_add / structure / title / tone
- occurrences: how many times this pattern has been seen across all lessons
- first_seen / last_seen: timestamps for confidence decay
- confidence: auto-computed from occurrences + recency
When summarizing, outputs all patterns with aggregated confidence scores.
The Agent uses this to write structured playbook.md rules.
Usage:
python3 learn_edits.py --draft path/to/draft.md --final path/to/final.md
python3 learn_edits.py --from-wechat # auto-sync from WeChat draft box
python3 learn_edits.py --summarize # all lessons with confidence
python3 learn_edits.py --summarize --json # JSON output for agent
"""
import argparse
import difflib
import json
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path
import yaml
SKILL_DIR = Path(__file__).parent.parent
# Pattern types with descriptions
PATTERN_TYPES = {
"word_sub": "用词替换",
"para_delete": "段落删除",
"para_add": "段落新增",
"structure": "结构调整",
"title": "标题修改",
"tone": "语气调整",
"expression": "表达偏好",
}
def load_text(path: str) -> str:
return Path(path).read_text(encoding="utf-8")
def markdown_to_plaintext(md: str) -> str:
"""Strip markdown formatting to plain text for diff comparison."""
text = md
# Remove HTML comments (editing anchors etc.)
text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
# Remove markdown headers markers
text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
# Remove bold/italic markers
text = re.sub(r"\*{1,3}(.*?)\*{1,3}", r"\1", text)
# Remove inline code
text = re.sub(r"`([^`]+)`", r"\1", text)
# Remove link syntax [text](url) → text
text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
# Remove image syntax
text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
# Collapse whitespace
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()
def fetch_wechat_draft() -> tuple[str, str, str]:
"""
Fetch the latest draft from WeChat and find the corresponding local file.
Returns (draft_plaintext, final_plaintext, draft_path).
"""
# Load config
config_path = SKILL_DIR / "config.yaml"
if not config_path.exists():
raise FileNotFoundError("config.yaml not found — need WeChat API credentials")
with open(config_path) as f:
config = yaml.safe_load(f)
wechat = config.get("wechat", {})
appid = wechat.get("appid", "")
secret = wechat.get("secret", "")
if not appid or not secret:
raise ValueError("config.yaml missing wechat.appid or wechat.secret")
# Load history to find latest article with media_id
history_path = SKILL_DIR / "history.yaml"
if not history_path.exists():
raise FileNotFoundError("history.yaml not found — no articles to compare")
with open(history_path) as f:
history = yaml.safe_load(f) or []
# Find most recent article with media_id
latest = None
for article in reversed(history):
if article.get("media_id"):
latest = article
break
if not latest:
raise ValueError("No article with media_id found in history.yaml")
media_id = latest["media_id"]
title = latest.get("title", "")
# Find the local draft file
# Priority: output_file field → title slug match → largest file
date = latest.get("date", "")
output_dir = SKILL_DIR / "output"
draft_path = None
# First try: exact path from history
output_file = latest.get("output_file", "")
if output_file:
candidate = SKILL_DIR / output_file if not Path(output_file).is_absolute() else Path(output_file)
if candidate.exists():
draft_path = candidate
if not draft_path and date:
candidates = sorted(output_dir.glob(f"{date}-*.md"))
if len(candidates) == 1:
draft_path = candidates[0]
elif len(candidates) > 1:
# Multiple files on same date — try to match by title keywords
title_lower = title.lower()
for c in candidates:
slug = c.stem.replace(date + "-", "").replace("-", " ")
# Check if slug words appear in title
if any(w in title_lower for w in slug.split() if len(w) > 1):
draft_path = c
break
if not draft_path:
# Fallback: use the largest file (likely the final version)
draft_path = max(candidates, key=lambda p: p.stat().st_size)
if not draft_path or not draft_path.exists():
raise FileNotFoundError(
f"Cannot find local draft for '{title}' (date={date}) in output/"
)
# Get access token and fetch draft from WeChat
sys.path.insert(0, str(SKILL_DIR / "toolkit"))
from wechat_api import get_access_token
from publisher import get_draft, html_to_plaintext
token = get_access_token(appid, secret)
html = get_draft(token, media_id)
wechat_text = html_to_plaintext(html)
# Convert local draft to plaintext
local_md = load_text(str(draft_path))
local_text = markdown_to_plaintext(local_md)
print(f"本地文件: {draft_path}")
print(f"微信草稿: media_id={media_id}")
print(f"文章标题: {title}")
print(f"本地字数: {len(local_text)}, 微信字数: {len(wechat_text)}")
return local_text, wechat_text, str(draft_path)
def split_sections(text: str) -> list[dict]:
"""Split markdown into sections by H2 headers."""
sections = []
current = {"header": "(intro)", "lines": []}
for line in text.split("\n"):
if line.strip().startswith("## "):
if current["lines"] or current["header"] != "(intro)":
sections.append(current)
current = {"header": line.strip(), "lines": []}
else:
current["lines"].append(line)
sections.append(current)
return sections
def extract_title(text: str) -> str:
for line in text.split("\n"):
if line.strip().startswith("# ") and not line.strip().startswith("## "):
return line.strip()[2:].strip()
return ""
def compute_diff(draft: str, final: str) -> dict:
"""Compute structured diff between draft and final."""
draft_lines = draft.split("\n")
final_lines = final.split("\n")
differ = difflib.unified_diff(draft_lines, final_lines, lineterm="")
diff_lines = list(differ)
additions = [l[1:].strip() for l in diff_lines
if l.startswith("+") and not l.startswith("+++") and l[1:].strip()]
deletions = [l[1:].strip() for l in diff_lines
if l.startswith("-") and not l.startswith("---") and l[1:].strip()]
draft_title = extract_title(draft)
final_title = extract_title(final)
draft_sections = split_sections(draft)
final_sections = split_sections(final)
draft_h2s = [s["header"] for s in draft_sections if s["header"] != "(intro)"]
final_h2s = [s["header"] for s in final_sections if s["header"] != "(intro)"]
draft_chars = len(draft.replace("\n", "").replace(" ", ""))
final_chars = len(final.replace("\n", "").replace(" ", ""))
return {
"title_changed": draft_title != final_title,
"draft_title": draft_title,
"final_title": final_title,
"structure_changed": draft_h2s != final_h2s,
"draft_h2s": draft_h2s,
"final_h2s": final_h2s,
"lines_added": len(additions),
"lines_deleted": len(deletions),
"draft_chars": draft_chars,
"final_chars": final_chars,
"char_diff": final_chars - draft_chars,
"additions_sample": additions[:20],
"deletions_sample": deletions[:20],
}
def save_lesson(diff_result: dict, draft_path: str, final_path: str) -> Path:
"""Save structured lesson data for Agent to analyze."""
lessons_dir = SKILL_DIR / "lessons"
lessons_dir.mkdir(parents=True, exist_ok=True)
date_str = datetime.now().strftime("%Y-%m-%d")
lesson_file = lessons_dir / f"{date_str}-diff.yaml"
counter = 1
while lesson_file.exists():
lesson_file = lessons_dir / f"{date_str}-diff-{counter}.yaml"
counter += 1
data = {
"date": date_str,
"timestamp": datetime.now().isoformat(),
"draft_file": str(draft_path),
"final_file": str(final_path),
"diff_summary": {
"title_changed": diff_result["title_changed"],
"draft_title": diff_result["draft_title"],
"final_title": diff_result["final_title"],
"structure_changed": diff_result["structure_changed"],
"lines_added": diff_result["lines_added"],
"lines_deleted": diff_result["lines_deleted"],
"char_diff": diff_result["char_diff"],
},
# Agent fills these after analyzing the draft and final:
"patterns": [],
# Pattern format (Agent writes):
# - type: "word_sub" # one of PATTERN_TYPES keys
# key: "avoid_jiangzhen" # short unique identifier
# description: "把'讲真'替换为'坦白说'"
# rule: "不要使用'讲真',用'坦白说'代替" # imperative, executable
}
with open(lesson_file, "w", encoding="utf-8") as f:
yaml.dump(data, f, allow_unicode=True, default_flow_style=False)
return lesson_file
def load_all_lessons() -> list[dict]:
"""Load all lesson files."""
lessons_dir = SKILL_DIR / "lessons"
if not lessons_dir.exists():
return []
lessons = []
for f in sorted(lessons_dir.glob("*-diff*.yaml")):
with open(f, "r", encoding="utf-8") as fh:
data = yaml.safe_load(fh)
if data:
lessons.append(data)
return lessons
def compute_confidence(occurrences: int, first_seen: str, last_seen: str) -> float:
"""Compute confidence score from frequency and recency.
Confidence = base_from_occurrences + recency_bonus - age_decay.
- 1 occurrence = 3 (low, might be one-off)
- 2 occurrences = 5 (moderate, likely a preference)
- 3+ occurrences = 7+ (high, confirmed preference)
- Recency bonus: +1 if last_seen within 7 days
- Age decay: -1 per 30 days since last_seen (user style evolves)
- Clamped to 1-10
"""
base = min(8, 2 + occurrences * 2)
try:
last = datetime.fromisoformat(last_seen)
days_since = (datetime.now() - last).days
except (ValueError, TypeError):
days_since = 0
recency_bonus = 1.0 if days_since <= 7 else 0.0
age_decay = max(0, days_since // 30)
return max(1.0, min(10.0, base + recency_bonus - age_decay))
def aggregate_patterns(lessons: list[dict]) -> list[dict]:
"""Aggregate patterns across all lessons. Returns sorted by confidence."""
pattern_map = {} # key → aggregated data
for lesson in lessons:
date = lesson.get("date", "")
timestamp = lesson.get("timestamp", date)
for p in lesson.get("patterns", []):
key = p.get("key", "")
if not key:
continue
if key not in pattern_map:
pattern_map[key] = {
"key": key,
"type": p.get("type", "expression"),
"description": p.get("description", ""),
"rule": p.get("rule", ""),
"occurrences": 0,
"first_seen": timestamp,
"last_seen": timestamp,
}
entry = pattern_map[key]
entry["occurrences"] += 1
# Keep the most recent description/rule (may evolve)
if p.get("description"):
entry["description"] = p["description"]
if p.get("rule"):
entry["rule"] = p["rule"]
# Update timestamps
if timestamp < entry["first_seen"]:
entry["first_seen"] = timestamp
if timestamp > entry["last_seen"]:
entry["last_seen"] = timestamp
# Compute confidence for each
results = []
for entry in pattern_map.values():
entry["confidence"] = round(compute_confidence(
entry["occurrences"], entry["first_seen"], entry["last_seen"]
), 1)
results.append(entry)
# Sort by confidence descending
results.sort(key=lambda x: x["confidence"], reverse=True)
return results
def summarize_lessons(as_json: bool = False):
"""Load all lessons, aggregate patterns, output with confidence scores."""
lessons = load_all_lessons()
if not lessons:
print("No lessons found.")
return
patterns = aggregate_patterns(lessons)
if as_json:
print(json.dumps({
"total_lessons": len(lessons),
"total_patterns": len(patterns),
"patterns": patterns,
}, ensure_ascii=False, indent=2))
return
print(f"Total lessons: {len(lessons)}")
print(f"Unique patterns: {len(patterns)}")
print()
for p in patterns:
type_label = PATTERN_TYPES.get(p["type"], p["type"])
conf_bar = "" * int(p["confidence"]) + "" * (10 - int(p["confidence"]))
print(f" {conf_bar} {p['confidence']:4.1f} [{type_label}] {p['key']}")
print(f" {p['description']}")
if p["rule"]:
print(f"{p['rule']}")
print(f" seen {p['occurrences']}x, first {p['first_seen'][:10]}, last {p['last_seen'][:10]}")
print()
def main():
parser = argparse.ArgumentParser(description="Learn from human edits")
parser.add_argument("--draft", help="Path to AI draft")
parser.add_argument("--final", help="Path to human-edited final")
parser.add_argument("--from-wechat", action="store_true",
help="Auto-fetch edited version from WeChat draft box")
parser.add_argument("--summarize", action="store_true", help="Summarize all lessons")
parser.add_argument("--json", action="store_true", help="JSON output (with --summarize)")
args = parser.parse_args()
if args.summarize:
summarize_lessons(as_json=args.json)
return
if args.from_wechat:
local_text, wechat_text, draft_path = fetch_wechat_draft()
if local_text == wechat_text:
print("\n微信草稿与本地文件内容一致,没有修改。")
return
diff_result = compute_diff(local_text, wechat_text)
# Save with special marker for wechat source
lesson_file = save_lesson(diff_result, draft_path, f"wechat:{draft_path}")
print(f"\nLesson saved to: {lesson_file}")
print(f"\n检测到 {diff_result['lines_added']} 处新增, {diff_result['lines_deleted']} 处删除")
print(f"字数变化: {diff_result['char_diff']:+d}")
print(f"\nAgent 接下来读取 {draft_path} 和微信草稿内容,分析修改模式并写入 {lesson_file}")
return
if not args.draft or not args.final:
print("Error: --draft and --final required (or use --from-wechat)", file=sys.stderr)
sys.exit(1)
draft = load_text(args.draft)
final = load_text(args.final)
diff_result = compute_diff(draft, final)
# Print summary
print("=" * 60)
print("EDIT ANALYSIS")
print("=" * 60)
if diff_result["title_changed"]:
print(f"\n标题修改:")
print(f" AI: {diff_result['draft_title']}")
print(f" 人工: {diff_result['final_title']}")
if diff_result["structure_changed"]:
print(f"\n结构修改:")
print(f" AI H2: {diff_result['draft_h2s']}")
print(f" 人工 H2: {diff_result['final_h2s']}")
print(f"\n数量变化:")
print(f" 新增 {diff_result['lines_added']} 行, 删除 {diff_result['lines_deleted']}")
print(f" 字数变化: {diff_result['char_diff']:+d} ({diff_result['draft_chars']}{diff_result['final_chars']})")
if diff_result["deletions_sample"]:
print(f"\n被删除的内容(采样):")
for line in diff_result["deletions_sample"][:10]:
print(f" - {line[:80]}")
if diff_result["additions_sample"]:
print(f"\n新增的内容(采样):")
for line in diff_result["additions_sample"][:10]:
print(f" + {line[:80]}")
# Save lesson
lesson_file = save_lesson(diff_result, args.draft, args.final)
print(f"\nLesson saved to: {lesson_file}")
# Auto-grow exemplar library from edited finals
final_title = extract_title(final)
try:
import extract_exemplar
exemplar = extract_exemplar.extract_exemplar(final, source=final_title or "user-edited")
if exemplar["humanness_score"] <= 50:
exemplar_path = extract_exemplar.save_exemplar(exemplar)
print(f"\n✓ 终稿已加入范文库: {exemplar_path}")
print(f" Score: {exemplar['humanness_score']:.1f}/100, Category: {exemplar['category']}")
else:
print(f"\n⚠ 终稿 humanness_score={exemplar['humanness_score']:.1f} > 50未加入范文库")
except Exception as e:
print(f"\n⚠ 范文提取跳过: {e}")
lesson_count = len(load_all_lessons())
print(f"Total lessons: {lesson_count}")
if lesson_count >= 5 and lesson_count % 5 == 0:
print(f"\n{'=' * 60}")
print("PLAYBOOK UPDATE TRIGGERED")
print(f"{'=' * 60}")
print(f"{lesson_count} lessons. Agent should run:")
print(f" python3 scripts/learn_edits.py --summarize --json")
print(f"Then update playbook.md with high-confidence patterns.")
# Instructions for Agent
print(f"""
{'=' * 60}
INSTRUCTIONS FOR AGENT
{'=' * 60}
Read the draft and final versions, then for each meaningful edit:
1. Read: {args.draft}
2. Read: {args.final}
3. For each edit, add a pattern entry to {lesson_file}:
patterns:
- type: "word_sub" # {' / '.join(PATTERN_TYPES.keys())}
key: "short_unique_id" # e.g. "avoid_jiangzhen", "shorter_paragraphs"
description: "'讲真'替换为'坦白说'"
rule: "不要使用'讲真',用'坦白说'代替" # imperative, executable
4. Rules must be imperative (可执行的指令), not descriptive.
BAD: "用户偏好简短段落"
GOOD: "段落不超过 80 字,长段必须在 3 句内换行"
5. If pattern already exists in previous lessons (same key),
confidence will auto-increase on next --summarize.
""")
if __name__ == "__main__":
main()