#!/usr/bin/env python3
"""
Learn from human edits by diffing AI draft vs published final.

Compares the original AI-generated article with the human-edited version,
computes structured diffs, and saves typed lessons to lessons/.

Each lesson has:
  - type: word_sub / para_delete / para_add / structure / title / tone
  - occurrences: how many times this pattern has been seen across all lessons
  - first_seen / last_seen: timestamps for confidence decay
  - confidence: auto-computed from occurrences + recency

When summarizing, outputs all patterns with aggregated confidence scores.
The Agent uses this to write structured playbook.md rules.

Usage:
    python3 learn_edits.py --draft path/to/draft.md --final path/to/final.md
    python3 learn_edits.py --from-wechat          # auto-sync from WeChat draft box
    python3 learn_edits.py --summarize            # all lessons with confidence
    python3 learn_edits.py --summarize --json     # JSON output for agent
"""

import argparse
import difflib
import json
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path

try:
    import yaml
except ImportError:
    # Keep the module importable (e.g. for --help or reuse of the pure diff
    # helpers); functions that really need PyYAML go through _require_yaml().
    yaml = None

SKILL_DIR = Path(__file__).parent.parent

# Pattern types with descriptions
PATTERN_TYPES = {
    "word_sub": "用词替换",
    "para_delete": "段落删除",
    "para_add": "段落新增",
    "structure": "结构调整",
    "title": "标题修改",
    "tone": "语气调整",
    "expression": "表达偏好",
}


def _require_yaml():
    """Return the PyYAML module, or raise ImportError with an install hint."""
    if yaml is None:
        raise ImportError("PyYAML is required: pip install pyyaml")
    return yaml


def load_text(path: str) -> str:
    """Read the file at *path* as UTF-8 text."""
    return Path(path).read_text(encoding="utf-8")


def markdown_to_plaintext(md: str) -> str:
    """Strip markdown formatting to plain text for diff comparison."""
    # Remove HTML comments (editing anchors etc.), including multi-line ones.
    text = re.sub(r"<!--.*?-->", "", md, flags=re.DOTALL)
    # Remove markdown header markers
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Remove bold/italic markers
    text = re.sub(r"\*{1,3}(.*?)\*{1,3}", r"\1", text)
    # Remove inline code
    text = re.sub(r"`([^`]+)`", r"\1", text)
    # Image syntax ![alt](url) → alt.  This must run before the link rule,
    # otherwise the link rule consumes "[alt](url)" and leaves a stray "!".
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Link syntax [text](url) → text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Collapse whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _locate_local_draft(latest: dict) -> "Path | None":
    """Find the local markdown file for a history entry.

    Priority: ``output_file`` field → single file for the date → title slug
    match → largest file of that date. Returns None when nothing matches.
    """
    output_file = latest.get("output_file", "")
    if output_file:
        candidate = Path(output_file)
        if not candidate.is_absolute():
            candidate = SKILL_DIR / output_file
        if candidate.exists():
            return candidate

    # YAML parses an unquoted 2024-05-01 into a date object; normalise to str
    # so both the glob pattern and the slug stripping below work.
    date = str(latest.get("date", "") or "")
    if not date:
        return None

    candidates = sorted((SKILL_DIR / "output").glob(f"{date}-*.md"))
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]

    # Multiple files on same date — try to match by title keywords
    title_lower = (latest.get("title", "") or "").lower()
    for path in candidates:
        slug_words = path.stem.replace(date + "-", "").replace("-", " ").split()
        if any(word in title_lower for word in slug_words if len(word) > 1):
            return path

    # Fallback: use the largest file (likely the final version)
    return max(candidates, key=lambda p: p.stat().st_size)


def fetch_wechat_draft() -> tuple[str, str, str]:
    """
    Fetch the latest draft from WeChat and find the corresponding local file.

    Returns (draft_plaintext, final_plaintext, draft_path).

    Raises:
        FileNotFoundError: config.yaml, history.yaml or the local draft is missing.
        ValueError: credentials are missing or no history entry has a media_id.
        ImportError: PyYAML is not installed.
    """
    yaml_mod = _require_yaml()

    # Load config
    config_path = SKILL_DIR / "config.yaml"
    if not config_path.exists():
        raise FileNotFoundError("config.yaml not found — need WeChat API credentials")
    with open(config_path, encoding="utf-8") as f:
        config = yaml_mod.safe_load(f) or {}

    wechat = config.get("wechat", {}) or {}
    appid = wechat.get("appid", "")
    secret = wechat.get("secret", "")
    if not appid or not secret:
        raise ValueError("config.yaml missing wechat.appid or wechat.secret")

    # Load history to find latest article with media_id
    history_path = SKILL_DIR / "history.yaml"
    if not history_path.exists():
        raise FileNotFoundError("history.yaml not found — no articles to compare")
    with open(history_path, encoding="utf-8") as f:
        history = yaml_mod.safe_load(f) or []

    # Most recent article with media_id (history is scanned newest-last)
    latest = next(
        (article for article in reversed(history) if article.get("media_id")),
        None,
    )
    if not latest:
        raise ValueError("No article with media_id found in history.yaml")

    media_id = latest["media_id"]
    title = latest.get("title", "")
    date = latest.get("date", "")

    draft_path = _locate_local_draft(latest)
    if not draft_path or not draft_path.exists():
        raise FileNotFoundError(
            f"Cannot find local draft for '{title}' (date={date}) in output/"
        )

    # Get access token and fetch draft from WeChat.  The toolkit modules live
    # outside the import path, so it is extended right before importing them.
    sys.path.insert(0, str(SKILL_DIR / "toolkit"))
    from wechat_api import get_access_token
    from publisher import get_draft, html_to_plaintext

    token = get_access_token(appid, secret)
    html = get_draft(token, media_id)
    wechat_text = html_to_plaintext(html)

    # Convert local draft to plaintext
    local_text = markdown_to_plaintext(load_text(str(draft_path)))

    print(f"本地文件: {draft_path}")
    print(f"微信草稿: media_id={media_id}")
    print(f"文章标题: {title}")
    print(f"本地字数: {len(local_text)}, 微信字数: {len(wechat_text)}")

    return local_text, wechat_text, str(draft_path)


def split_sections(text: str) -> list[dict]:
    """Split markdown into sections by H2 headers.

    Each section is ``{"header": str, "lines": list[str]}``. Text before the
    first H2 goes into a section whose header is ``"(intro)"``; that section
    is dropped when it is empty and an H2 follows.
    """
    sections = []
    current = {"header": "(intro)", "lines": []}
    for line in text.split("\n"):
        stripped = line.strip()
        if stripped.startswith("## "):
            if current["lines"] or current["header"] != "(intro)":
                sections.append(current)
            current = {"header": stripped, "lines": []}
        else:
            current["lines"].append(line)
    sections.append(current)
    return sections


def extract_title(text: str) -> str:
    """Return the text of the first H1 (``# ...``) line, or "" if none."""
    for line in text.split("\n"):
        stripped = line.strip()
        # "## " can never start with "# ", so a single check is sufficient.
        if stripped.startswith("# "):
            return stripped[2:].strip()
    return ""


def compute_diff(draft: str, final: str) -> dict:
    """Compute structured diff between draft and final.

    Returns a dict with title/H2 changes, counts of added and deleted
    non-blank lines, character counts (whitespace excluded) and up to 20
    sample lines of each kind.
    """
    draft_lines = draft.split("\n")
    final_lines = final.split("\n")

    diff_lines = list(difflib.unified_diff(draft_lines, final_lines, lineterm=""))
    # The first two lines are the "---"/"+++" file headers.  Skip them by
    # position: filtering on the prefix would also drop real content such as
    # a deleted markdown rule "---" (which appears as "----" in the diff).
    body = diff_lines[2:]

    additions = [
        line[1:].strip()
        for line in body
        if line.startswith("+") and line[1:].strip()
    ]
    deletions = [
        line[1:].strip()
        for line in body
        if line.startswith("-") and line[1:].strip()
    ]

    draft_title = extract_title(draft)
    final_title = extract_title(final)

    draft_h2s = [s["header"] for s in split_sections(draft) if s["header"] != "(intro)"]
    final_h2s = [s["header"] for s in split_sections(final) if s["header"] != "(intro)"]

    draft_chars = len(draft.replace("\n", "").replace(" ", ""))
    final_chars = len(final.replace("\n", "").replace(" ", ""))

    return {
        "title_changed": draft_title != final_title,
        "draft_title": draft_title,
        "final_title": final_title,
        "structure_changed": draft_h2s != final_h2s,
        "draft_h2s": draft_h2s,
        "final_h2s": final_h2s,
        "lines_added": len(additions),
        "lines_deleted": len(deletions),
        "draft_chars": draft_chars,
        "final_chars": final_chars,
        "char_diff": final_chars - draft_chars,
        "additions_sample": additions[:20],
        "deletions_sample": deletions[:20],
    }


def save_lesson(diff_result: dict, draft_path: str, final_path: str) -> Path:
    """Save structured lesson data for Agent to analyze.

    Writes ``lessons/<date>-diff[-N].yaml`` (N is appended when the name is
    already taken) and returns the path of the new file.
    """
    yaml_mod = _require_yaml()

    lessons_dir = SKILL_DIR / "lessons"
    lessons_dir.mkdir(parents=True, exist_ok=True)

    # One clock reading so "date" and "timestamp" can never disagree.
    now = datetime.now()
    date_str = now.strftime("%Y-%m-%d")

    lesson_file = lessons_dir / f"{date_str}-diff.yaml"
    counter = 1
    while lesson_file.exists():
        lesson_file = lessons_dir / f"{date_str}-diff-{counter}.yaml"
        counter += 1

    data = {
        "date": date_str,
        "timestamp": now.isoformat(),
        "draft_file": str(draft_path),
        "final_file": str(final_path),
        "diff_summary": {
            "title_changed": diff_result["title_changed"],
            "draft_title": diff_result["draft_title"],
            "final_title": diff_result["final_title"],
            "structure_changed": diff_result["structure_changed"],
            "lines_added": diff_result["lines_added"],
            "lines_deleted": diff_result["lines_deleted"],
            "char_diff": diff_result["char_diff"],
        },
        # Agent fills these after analyzing the draft and final:
        "patterns": [],
        # Pattern format (Agent writes):
        # - type: "word_sub"           # one of PATTERN_TYPES keys
        #   key: "avoid_jiangzhen"     # short unique identifier
        #   description: "把'讲真'替换为'坦白说'"
        #   rule: "不要使用'讲真',用'坦白说'代替"  # imperative, executable
    }

    with open(lesson_file, "w", encoding="utf-8") as f:
        yaml_mod.dump(data, f, allow_unicode=True, default_flow_style=False)

    return lesson_file


def load_all_lessons() -> list[dict]:
    """Load all lesson files (``lessons/*-diff*.yaml``) in filename order.

    Empty files and documents that are not mappings are skipped.
    """
    lessons_dir = SKILL_DIR / "lessons"
    if not lessons_dir.exists():
        return []

    yaml_mod = _require_yaml()
    lessons = []
    for path in sorted(lessons_dir.glob("*-diff*.yaml")):
        with open(path, "r", encoding="utf-8") as fh:
            data = yaml_mod.safe_load(fh)
        if data and isinstance(data, dict):
            lessons.append(data)
    return lessons


def compute_confidence(occurrences: int, first_seen: str, last_seen: str) -> float:
    """Compute confidence score from frequency and recency.

    Confidence = base_from_occurrences + recency_bonus - age_decay.

    - Base is ``min(8, 2 + 2 * occurrences)``:
      1 occurrence = 4 (low, might be one-off), 2 = 6 (likely a preference),
      3+ = 8 (confirmed preference)
    - Recency bonus: +1 if last_seen within 7 days
    - Age decay: -1 per 30 days since last_seen (user style evolves)
    - Clamped to 1-10

    ``first_seen`` is accepted for interface symmetry but does not affect
    the score. An unparsable ``last_seen`` is treated as "today".
    """
    base = min(8, 2 + occurrences * 2)

    try:
        last = datetime.fromisoformat(last_seen)
        # Use a "now" with the same awareness as last_seen; subtracting a
        # timezone-aware value from a naive one raises TypeError.
        days_since = (datetime.now(last.tzinfo) - last).days
    except (ValueError, TypeError):
        days_since = 0

    recency_bonus = 1.0 if days_since <= 7 else 0.0
    age_decay = max(0, days_since // 30)

    return max(1.0, min(10.0, base + recency_bonus - age_decay))


def _iso_stamp(value) -> str:
    """Normalise a lesson date/timestamp to an ISO string.

    Hand-edited YAML may contain unquoted dates, which PyYAML loads as
    ``date``/``datetime`` objects rather than strings.
    """
    if value is None:
        return ""
    if hasattr(value, "isoformat"):
        return value.isoformat()
    return str(value)


def aggregate_patterns(lessons: list[dict]) -> list[dict]:
    """Aggregate patterns across all lessons. Returns sorted by confidence.

    Patterns are grouped by ``key``. For each key the result carries the
    occurrence count, first/last seen timestamps (ISO strings), the
    description/rule from the most recent lesson that provides one, and the
    computed confidence rounded to one decimal.
    """
    pattern_map = {}  # key → aggregated data

    for lesson in lessons:
        stamp = _iso_stamp(lesson.get("timestamp") or lesson.get("date"))

        # "patterns:" with no value loads as None, hence the "or []".
        for pattern in lesson.get("patterns") or []:
            if not isinstance(pattern, dict):
                continue
            key = pattern.get("key", "")
            if not key:
                continue

            entry = pattern_map.get(key)
            if entry is None:
                entry = pattern_map[key] = {
                    "key": key,
                    "type": pattern.get("type", "expression"),
                    "description": pattern.get("description", ""),
                    "rule": pattern.get("rule", ""),
                    "occurrences": 0,
                    "first_seen": stamp,
                    "last_seen": stamp,
                }

            entry["occurrences"] += 1

            # Keep the most recent description/rule (may evolve).  Lessons
            # are not guaranteed to arrive in chronological order, so decide
            # by timestamp; an older lesson may still fill an empty field.
            is_newest = stamp >= entry["last_seen"]
            for field in ("description", "rule"):
                if pattern.get(field) and (is_newest or not entry[field]):
                    entry[field] = pattern[field]

            # Update timestamps
            if stamp < entry["first_seen"]:
                entry["first_seen"] = stamp
            if stamp > entry["last_seen"]:
                entry["last_seen"] = stamp

    # Compute confidence for each
    results = []
    for entry in pattern_map.values():
        score = compute_confidence(
            entry["occurrences"], entry["first_seen"], entry["last_seen"]
        )
        entry["confidence"] = round(score, 1)
        results.append(entry)

    # Sort by confidence descending
    results.sort(key=lambda item: item["confidence"], reverse=True)
    return results


def summarize_lessons(as_json: bool = False):
    """Load all lessons, aggregate patterns, output with confidence scores."""
    lessons = load_all_lessons()
    if not lessons:
        print("No lessons found.")
        return

    patterns = aggregate_patterns(lessons)

    if as_json:
        payload = {
            "total_lessons": len(lessons),
            "total_patterns": len(patterns),
            "patterns": patterns,
        }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    print(f"Total lessons: {len(lessons)}")
    print(f"Unique patterns: {len(patterns)}")
    print()

    for p in patterns:
        type_label = PATTERN_TYPES.get(p["type"], p["type"])
        filled = int(p["confidence"])
        conf_bar = "█" * filled + "░" * (10 - filled)
        print(f"  {conf_bar} {p['confidence']:4.1f}  [{type_label}] {p['key']}")
        print(f"    {p['description']}")
        if p["rule"]:
            print(f"    → {p['rule']}")
        print(f"    seen {p['occurrences']}x, first {p['first_seen'][:10]}, last {p['last_seen'][:10]}")
        print()


def _run_wechat_sync():
    """Diff the local draft against the WeChat draft box and save a lesson."""
    local_text, wechat_text, draft_path = fetch_wechat_draft()

    if local_text == wechat_text:
        print("\n微信草稿与本地文件内容一致,没有修改。")
        return

    diff_result = compute_diff(local_text, wechat_text)

    # Save with special marker for wechat source
    lesson_file = save_lesson(diff_result, draft_path, f"wechat:{draft_path}")
    print(f"\nLesson saved to: {lesson_file}")
    print(f"\n检测到 {diff_result['lines_added']} 处新增, {diff_result['lines_deleted']} 处删除")
    print(f"字数变化: {diff_result['char_diff']:+d}")
    print(f"\nAgent 接下来读取 {draft_path} 和微信草稿内容,分析修改模式并写入 {lesson_file}")


def _print_edit_analysis(diff_result: dict):
    """Print the human-readable summary of a compute_diff() result."""
    print("=" * 60)
    print("EDIT ANALYSIS")
    print("=" * 60)

    if diff_result["title_changed"]:
        print("\n标题修改:")
        print(f"  AI: {diff_result['draft_title']}")
        print(f"  人工: {diff_result['final_title']}")

    if diff_result["structure_changed"]:
        print("\n结构修改:")
        print(f"  AI H2: {diff_result['draft_h2s']}")
        print(f"  人工 H2: {diff_result['final_h2s']}")

    print("\n数量变化:")
    print(f"  新增 {diff_result['lines_added']} 行, 删除 {diff_result['lines_deleted']} 行")
    print(f"  字数变化: {diff_result['char_diff']:+d} ({diff_result['draft_chars']} → {diff_result['final_chars']})")

    if diff_result["deletions_sample"]:
        print("\n被删除的内容(采样):")
        for line in diff_result["deletions_sample"][:10]:
            print(f"  - {line[:80]}")

    if diff_result["additions_sample"]:
        print("\n新增的内容(采样):")
        for line in diff_result["additions_sample"][:10]:
            print(f"  + {line[:80]}")


def _grow_exemplar_library(final: str):
    """Best-effort: add the edited final to the exemplar library.

    Any failure (module missing, extraction error) is reported and skipped
    so that it never blocks saving the lesson.
    """
    final_title = extract_title(final)
    try:
        import extract_exemplar

        exemplar = extract_exemplar.extract_exemplar(final, source=final_title or "user-edited")
        if exemplar["humanness_score"] <= 50:
            exemplar_path = extract_exemplar.save_exemplar(exemplar)
            print(f"\n✓ 终稿已加入范文库: {exemplar_path}")
            print(f"  Score: {exemplar['humanness_score']:.1f}/100, Category: {exemplar['category']}")
        else:
            print(f"\n⚠ 终稿 humanness_score={exemplar['humanness_score']:.1f} > 50,未加入范文库")
    except Exception as e:
        print(f"\n⚠ 范文提取跳过: {e}")


def _print_agent_instructions(draft_path, final_path, lesson_file):
    """Print the follow-up instructions the Agent should carry out."""
    print(f"""
{'=' * 60}
INSTRUCTIONS FOR AGENT
{'=' * 60}

Read the draft and final versions, then for each meaningful edit:

1. Read: {draft_path}
2. Read: {final_path}
3. For each edit, add a pattern entry to {lesson_file}:

   patterns:
     - type: "word_sub"  # {' / '.join(PATTERN_TYPES.keys())}
       key: "short_unique_id"  # e.g. "avoid_jiangzhen", "shorter_paragraphs"
       description: "把'讲真'替换为'坦白说'"
       rule: "不要使用'讲真',用'坦白说'代替"  # imperative, executable

4. Rules must be imperative (可执行的指令), not descriptive.
   BAD: "用户偏好简短段落"
   GOOD: "段落不超过 80 字,长段必须在 3 句内换行"

5. If pattern already exists in previous lessons (same key),
   confidence will auto-increase on next --summarize.
""")


def main():
    """CLI entry point: summarize lessons, sync from WeChat, or diff two files."""
    parser = argparse.ArgumentParser(description="Learn from human edits")
    parser.add_argument("--draft", help="Path to AI draft")
    parser.add_argument("--final", help="Path to human-edited final")
    parser.add_argument("--from-wechat", action="store_true",
                        help="Auto-fetch edited version from WeChat draft box")
    parser.add_argument("--summarize", action="store_true", help="Summarize all lessons")
    parser.add_argument("--json", action="store_true", help="JSON output (with --summarize)")
    args = parser.parse_args()

    if args.summarize:
        summarize_lessons(as_json=args.json)
        return

    if args.from_wechat:
        _run_wechat_sync()
        return

    if not args.draft or not args.final:
        print("Error: --draft and --final required (or use --from-wechat)", file=sys.stderr)
        sys.exit(1)

    draft = load_text(args.draft)
    final = load_text(args.final)
    diff_result = compute_diff(draft, final)

    # Print summary
    _print_edit_analysis(diff_result)

    # Save lesson
    lesson_file = save_lesson(diff_result, args.draft, args.final)
    print(f"\nLesson saved to: {lesson_file}")

    # Auto-grow exemplar library from edited finals
    _grow_exemplar_library(final)

    lesson_count = len(load_all_lessons())
    print(f"Total lessons: {lesson_count}")

    if lesson_count >= 5 and lesson_count % 5 == 0:
        print(f"\n{'=' * 60}")
        print("PLAYBOOK UPDATE TRIGGERED")
        print(f"{'=' * 60}")
        print(f"{lesson_count} lessons. Agent should run:")
        print("  python3 scripts/learn_edits.py --summarize --json")
        print("Then update playbook.md with high-confidence patterns.")

    # Instructions for Agent
    _print_agent_instructions(args.draft, args.final, lesson_file)


if __name__ == "__main__":
    main()