- learn_edits.py: prioritize output_file field from history.yaml, fall back to title slug matching, then largest file - SKILL.md: add output_file field to history.yaml schema - Fixes wrong file match when multiple articles share the same date Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
515 lines
18 KiB
Python
515 lines
18 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Learn from human edits by diffing AI draft vs published final.
|
||
|
||
Compares the original AI-generated article with the human-edited version,
|
||
computes structured diffs, and saves typed lessons to lessons/.
|
||
|
||
Each lesson has:
|
||
- type: word_sub / para_delete / para_add / structure / title / tone / expression
|
||
- occurrences: how many times this pattern has been seen across all lessons
|
||
- first_seen / last_seen: timestamps for confidence decay
|
||
- confidence: auto-computed from occurrences + recency
|
||
|
||
When summarizing, outputs all patterns with aggregated confidence scores.
|
||
The Agent uses this to write structured playbook.md rules.
|
||
|
||
Usage:
|
||
python3 learn_edits.py --draft path/to/draft.md --final path/to/final.md
|
||
python3 learn_edits.py --from-wechat # auto-sync from WeChat draft box
|
||
python3 learn_edits.py --summarize # all lessons with confidence
|
||
python3 learn_edits.py --summarize --json # JSON output for agent
|
||
"""
|
||
|
||
import argparse
|
||
import difflib
|
||
import json
|
||
import re
|
||
import sys
|
||
from datetime import datetime, timedelta
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
# Skill root directory: two levels above this file (the parent of the
# directory that contains this script).
SKILL_DIR = Path(__file__).parent.parent

# Pattern types with descriptions.
# Keys are the values allowed in a lesson pattern's "type" field; values are
# the Chinese labels printed by the human-readable summary.
PATTERN_TYPES = {
    "word_sub": "用词替换",  # word substitution
    "para_delete": "段落删除",  # paragraph deleted
    "para_add": "段落新增",  # paragraph added
    "structure": "结构调整",  # structure adjustment
    "title": "标题修改",  # title change
    "tone": "语气调整",  # tone adjustment
    "expression": "表达偏好",  # expression preference
}
|
||
|
||
|
||
def load_text(path: str) -> str:
    """Return the full contents of the file at *path*, decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()
|
||
|
||
|
||
def markdown_to_plaintext(md: str) -> str:
    """Strip markdown formatting to plain text for diff comparison.

    Removes HTML comments, header markers, bold/italic markers, inline-code
    backticks, image and link syntax (keeping the alt/link text), then
    collapses runs of spaces/tabs and of three or more newlines.

    Args:
        md: Markdown source text.

    Returns:
        The plain-text rendering with leading/trailing whitespace stripped.
    """
    text = md
    # Remove HTML comments (editing anchors etc.)
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Remove markdown header markers
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Remove bold/italic markers
    text = re.sub(r"\*{1,3}(.*?)\*{1,3}", r"\1", text)
    # Remove inline code
    text = re.sub(r"`([^`]+)`", r"\1", text)
    # Remove image syntax ![alt](url) → alt.
    # This must run BEFORE the link rule: the link pattern also matches the
    # "[alt](url)" tail of an image and would leave a stray "!" behind.
    text = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"\1", text)
    # Remove link syntax [text](url) → text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Collapse whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
||
|
||
|
||
def _find_local_draft(latest: dict) -> "Path | None":
    """Locate the local markdown draft for a history entry, or return None.

    Lookup priority: the entry's ``output_file`` path → same-date file in
    ``output/`` whose slug overlaps the title the most → largest same-date file.
    """
    # First try: exact path recorded in history (relative paths are resolved
    # against the skill directory).
    output_file = latest.get("output_file", "")
    if output_file:
        candidate = Path(output_file)
        if not candidate.is_absolute():
            candidate = SKILL_DIR / candidate
        if candidate.exists():
            return candidate

    # PyYAML turns an unquoted ISO date into a datetime.date object; normalise
    # to str so the prefix handling below cannot raise TypeError.
    date = str(latest.get("date") or "")
    if not date:
        return None

    candidates = sorted((SKILL_DIR / "output").glob(f"{date}-*.md"))
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]

    # Multiple files on the same date — rank them by how many slug words
    # (longer than one character) appear in the title.
    title_lower = str(latest.get("title") or "").lower()

    def overlap(path: Path) -> int:
        # Strip only the leading "<date>-" prefix, not every occurrence.
        slug_words = path.stem.removeprefix(f"{date}-").replace("-", " ").split()
        return sum(1 for w in slug_words if len(w) > 1 and w in title_lower)

    best = max(candidates, key=overlap)  # ties resolve to the first (sorted) file
    if overlap(best) > 0:
        return best

    # Fallback: use the largest file (likely the final version)
    return max(candidates, key=lambda p: p.stat().st_size)


def fetch_wechat_draft() -> tuple[str, str, str]:
    """Fetch the latest draft from WeChat and find the corresponding local file.

    Reads WeChat credentials from ``config.yaml`` and the most recent history
    entry carrying a ``media_id`` from ``history.yaml`` (both under the skill
    directory), locates the matching local markdown draft, downloads the draft
    via the toolkit helpers and converts both sides to plain text.

    Returns:
        ``(local_plaintext, wechat_plaintext, local_draft_path)``.

    Raises:
        FileNotFoundError: ``config.yaml`` / ``history.yaml`` is missing, or no
            local draft file can be found for the article.
        ValueError: credentials are missing, or no history entry has a
            ``media_id``.
    """
    # Load config
    config_path = SKILL_DIR / "config.yaml"
    if not config_path.exists():
        raise FileNotFoundError("config.yaml not found — need WeChat API credentials")

    # Explicit UTF-8: the platform default encoding may not decode these files.
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}  # an empty file loads as None

    wechat = config.get("wechat") or {}
    appid = wechat.get("appid", "")
    secret = wechat.get("secret", "")
    if not appid or not secret:
        raise ValueError("config.yaml missing wechat.appid or wechat.secret")

    # Load history to find latest article with media_id
    history_path = SKILL_DIR / "history.yaml"
    if not history_path.exists():
        raise FileNotFoundError("history.yaml not found — no articles to compare")

    with open(history_path, encoding="utf-8") as f:
        history = yaml.safe_load(f) or []

    # Most recent article with media_id (entries are scanned last-to-first)
    latest = next((a for a in reversed(history) if a.get("media_id")), None)
    if not latest:
        raise ValueError("No article with media_id found in history.yaml")

    media_id = latest["media_id"]
    title = latest.get("title", "")
    date = latest.get("date", "")

    draft_path = _find_local_draft(latest)
    if draft_path is None or not draft_path.exists():
        raise FileNotFoundError(
            f"Cannot find local draft for '{title}' (date={date}) in output/"
        )

    # Get access token and fetch draft from WeChat. The toolkit modules are
    # imported lazily from <skill>/toolkit; avoid growing sys.path on repeat calls.
    toolkit_dir = str(SKILL_DIR / "toolkit")
    if toolkit_dir not in sys.path:
        sys.path.insert(0, toolkit_dir)
    from wechat_api import get_access_token
    from publisher import get_draft, html_to_plaintext

    token = get_access_token(appid, secret)
    html = get_draft(token, media_id)
    wechat_text = html_to_plaintext(html)

    # Convert local draft to plaintext
    local_md = load_text(str(draft_path))
    local_text = markdown_to_plaintext(local_md)

    print(f"本地文件: {draft_path}")
    print(f"微信草稿: media_id={media_id}")
    print(f"文章标题: {title}")
    print(f"本地字数: {len(local_text)}, 微信字数: {len(wechat_text)}")

    return local_text, wechat_text, str(draft_path)
|
||
|
||
|
||
def split_sections(text: str) -> list[dict]:
    """Split markdown into sections by H2 headers.

    Each section is ``{"header": <stripped "## ..." line>, "lines": [...]}``.
    Text before the first H2 goes into a section whose header is "(intro)";
    that intro section is omitted when an H2 appears before any other line.
    """
    intro_label = "(intro)"
    heading = intro_label
    body = []
    result = []

    for raw in text.split("\n"):
        stripped = raw.strip()
        if not stripped.startswith("## "):
            body.append(raw)
            continue
        # A new H2 closes the section collected so far, unless that section
        # is the still-empty intro.
        if body or heading != intro_label:
            result.append({"header": heading, "lines": body})
        heading, body = stripped, []

    result.append({"header": heading, "lines": body})
    return result
|
||
|
||
|
||
def extract_title(text: str) -> str:
    """Return the text of the first H1 ("# ...") line, or "" if there is none."""
    h1_titles = (
        stripped[2:].strip()
        for stripped in map(str.strip, text.split("\n"))
        if stripped.startswith("# ") and not stripped.startswith("## ")
    )
    return next(h1_titles, "")
|
||
|
||
|
||
def compute_diff(draft: str, final: str) -> dict:
    """Compute structured diff between draft and final.

    Args:
        draft: Text of the AI-generated draft.
        final: Text of the human-edited version.

    Returns:
        A dict with title and H2-structure comparison, counts of non-blank
        added/deleted lines, character counts (newlines and spaces excluded),
        and up to 20 sample added/deleted lines.
    """
    draft_lines = draft.split("\n")
    final_lines = final.split("\n")

    # unified_diff yields the "---" / "+++" file headers as its first two
    # lines whenever the inputs differ. Drop them by position: filtering on
    # the "---" / "+++" prefix would also discard real content lines that
    # begin with "--" or "++" (e.g. a markdown horizontal rule "---").
    diff_lines = list(
        difflib.unified_diff(draft_lines, final_lines, lineterm="")
    )[2:]

    additions = [
        line[1:].strip()
        for line in diff_lines
        if line.startswith("+") and line[1:].strip()
    ]
    deletions = [
        line[1:].strip()
        for line in diff_lines
        if line.startswith("-") and line[1:].strip()
    ]

    draft_title = extract_title(draft)
    final_title = extract_title(final)

    draft_sections = split_sections(draft)
    final_sections = split_sections(final)
    draft_h2s = [s["header"] for s in draft_sections if s["header"] != "(intro)"]
    final_h2s = [s["header"] for s in final_sections if s["header"] != "(intro)"]

    draft_chars = len(draft.replace("\n", "").replace(" ", ""))
    final_chars = len(final.replace("\n", "").replace(" ", ""))

    return {
        "title_changed": draft_title != final_title,
        "draft_title": draft_title,
        "final_title": final_title,
        "structure_changed": draft_h2s != final_h2s,
        "draft_h2s": draft_h2s,
        "final_h2s": final_h2s,
        "lines_added": len(additions),
        "lines_deleted": len(deletions),
        "draft_chars": draft_chars,
        "final_chars": final_chars,
        "char_diff": final_chars - draft_chars,
        "additions_sample": additions[:20],
        "deletions_sample": deletions[:20],
    }
|
||
|
||
|
||
def save_lesson(diff_result: dict, draft_path: str, final_path: str) -> Path:
    """Write a lesson YAML file under ``lessons/`` for the Agent to analyze.

    The file is named ``<YYYY-MM-DD>-diff.yaml``; if that name is taken, a
    numeric suffix (``-diff-1``, ``-diff-2``, ...) is appended. The record
    holds the diff summary and an empty ``patterns`` list that the Agent fills
    in afterwards. Each pattern entry the Agent writes has:

        type:        one of the PATTERN_TYPES keys, e.g. "word_sub"
        key:         short unique identifier, e.g. "avoid_jiangzhen"
        description: what was changed
        rule:        an imperative, executable instruction

    Returns:
        Path of the lesson file that was written.
    """
    target_dir = SKILL_DIR / "lessons"
    target_dir.mkdir(parents=True, exist_ok=True)

    today = datetime.now().strftime("%Y-%m-%d")
    out_path = target_dir / f"{today}-diff.yaml"
    suffix = 0
    while out_path.exists():
        suffix += 1
        out_path = target_dir / f"{today}-diff-{suffix}.yaml"

    summary_fields = (
        "title_changed",
        "draft_title",
        "final_title",
        "structure_changed",
        "lines_added",
        "lines_deleted",
        "char_diff",
    )
    record = {
        "date": today,
        "timestamp": datetime.now().isoformat(),
        "draft_file": str(draft_path),
        "final_file": str(final_path),
        "diff_summary": {name: diff_result[name] for name in summary_fields},
        # Filled in by the Agent after it analyzes the draft and final.
        "patterns": [],
    }

    with open(out_path, "w", encoding="utf-8") as handle:
        yaml.dump(record, handle, allow_unicode=True, default_flow_style=False)

    return out_path
|
||
|
||
|
||
def load_all_lessons() -> list[dict]:
    """Load every ``*-diff*.yaml`` file under ``lessons/``, in sorted filename order.

    Returns an empty list when the directory does not exist; files that parse
    to an empty document are skipped.
    """
    folder = SKILL_DIR / "lessons"
    if not folder.exists():
        return []

    loaded = []
    for lesson_path in sorted(folder.glob("*-diff*.yaml")):
        with open(lesson_path, "r", encoding="utf-8") as handle:
            content = yaml.safe_load(handle)
        if content:
            loaded.append(content)
    return loaded
|
||
|
||
|
||
def compute_confidence(occurrences: int, first_seen: str, last_seen: str) -> float:
    """Compute a confidence score from frequency and recency.

    score = frequency_base + recency_bonus - age_decay, clamped to [1.0, 10.0]

    - frequency_base: ``min(8, 2 + 2 * occurrences)`` — 4 for one occurrence,
      6 for two, 8 for three or more.
    - recency_bonus: +1 when ``last_seen`` is at most 7 days old.
    - age_decay: -1 per full 30 days since ``last_seen``.

    ``first_seen`` is accepted but not used in the calculation. A ``last_seen``
    value that cannot be parsed as an ISO timestamp is treated as 0 days old.

    NOTE(review): an earlier docstring listed the per-occurrence values as
    3 / 5 / 7+, which does not match this formula — confirm which is intended.
    """
    frequency_base = min(8, 2 + 2 * occurrences)

    try:
        age_days = (datetime.now() - datetime.fromisoformat(last_seen)).days
    except (ValueError, TypeError):
        age_days = 0

    bonus = 1.0 if age_days <= 7 else 0.0
    penalty = max(0, age_days // 30)

    score = frequency_base + bonus - penalty
    return max(1.0, min(10.0, score))
|
||
|
||
|
||
def aggregate_patterns(lessons: list[dict]) -> list[dict]:
    """Aggregate patterns across all lessons. Returns sorted by confidence.

    Patterns are grouped by their ``key``. For each key the result holds the
    type from the first pattern processed, the description/rule from the
    newest lesson, the occurrence count, the earliest and latest lesson
    timestamps, and a ``confidence`` score rounded to one decimal.

    Args:
        lessons: Lesson dicts as returned by ``load_all_lessons``.

    Returns:
        One dict per unique pattern key, highest confidence first.
    """
    pattern_map = {}  # key → aggregated data

    for lesson in lessons:
        date = lesson.get("date", "")
        # Coerce to str: a hand-edited lesson may hold a YAML-parsed date or
        # datetime (or null), which would break the string comparisons below.
        timestamp = str(lesson.get("timestamp") or date or "")

        # `patterns:` left empty in the YAML loads as None, not [].
        for p in lesson.get("patterns") or []:
            if not isinstance(p, dict):
                continue
            key = p.get("key", "")
            if not key:
                continue

            entry = pattern_map.setdefault(key, {
                "key": key,
                "type": p.get("type", "expression"),
                "description": p.get("description", ""),
                "rule": p.get("rule", ""),
                "occurrences": 0,
                "first_seen": timestamp,
                "last_seen": timestamp,
            })
            entry["occurrences"] += 1

            # Keep the most recent description/rule (may evolve). Decide by
            # timestamp rather than iteration order: lessons arrive sorted by
            # filename, which is not chronological for same-day files.
            if timestamp >= entry["last_seen"]:
                if p.get("description"):
                    entry["description"] = p["description"]
                if p.get("rule"):
                    entry["rule"] = p["rule"]

            # Update timestamps
            if timestamp < entry["first_seen"]:
                entry["first_seen"] = timestamp
            if timestamp > entry["last_seen"]:
                entry["last_seen"] = timestamp

    # Compute confidence for each
    results = []
    for entry in pattern_map.values():
        entry["confidence"] = round(compute_confidence(
            entry["occurrences"], entry["first_seen"], entry["last_seen"]
        ), 1)
        results.append(entry)

    # Sort by confidence descending
    results.sort(key=lambda x: x["confidence"], reverse=True)
    return results
|
||
|
||
|
||
def summarize_lessons(as_json: bool = False):
    """Print all aggregated patterns with their confidence scores.

    With ``as_json`` the output is a single JSON document (lesson count,
    pattern count and the pattern list); otherwise a text listing with a
    10-cell confidence bar per pattern. Prints a notice and returns when no
    lessons exist.
    """
    lessons = load_all_lessons()
    if not lessons:
        print("No lessons found.")
        return

    patterns = aggregate_patterns(lessons)

    if as_json:
        payload = {
            "total_lessons": len(lessons),
            "total_patterns": len(patterns),
            "patterns": patterns,
        }
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    print(f"Total lessons: {len(lessons)}")
    print(f"Unique patterns: {len(patterns)}")
    print()

    for p in patterns:
        type_label = PATTERN_TYPES.get(p["type"], p["type"])
        filled = int(p["confidence"])
        conf_bar = "█" * filled + "░" * (10 - filled)
        print(f" {conf_bar} {p['confidence']:4.1f} [{type_label}] {p['key']}")
        print(f" {p['description']}")
        if p["rule"]:
            print(f" → {p['rule']}")
        print(f" seen {p['occurrences']}x, first {p['first_seen'][:10]}, last {p['last_seen'][:10]}")
        print()
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Three modes, checked in this order:

    1. ``--summarize`` (optionally with ``--json``): print aggregated patterns.
    2. ``--from-wechat``: diff the local draft against the WeChat draft box
       version and save a lesson file.
    3. ``--draft`` + ``--final``: diff two local files, print an edit report,
       save a lesson, try to add the final text to the exemplar library, and
       print instructions for the Agent. Exits with status 1 when either
       path is missing.
    """
    parser = argparse.ArgumentParser(description="Learn from human edits")
    parser.add_argument("--draft", help="Path to AI draft")
    parser.add_argument("--final", help="Path to human-edited final")
    parser.add_argument("--from-wechat", action="store_true",
                        help="Auto-fetch edited version from WeChat draft box")
    parser.add_argument("--summarize", action="store_true", help="Summarize all lessons")
    parser.add_argument("--json", action="store_true", help="JSON output (with --summarize)")
    args = parser.parse_args()

    # Mode 1: summary only.
    if args.summarize:
        summarize_lessons(as_json=args.json)
        return

    # Mode 2: compare the local draft with the version edited in WeChat.
    if args.from_wechat:
        local_text, wechat_text, draft_path = fetch_wechat_draft()
        if local_text == wechat_text:
            print("\n微信草稿与本地文件内容一致,没有修改。")
            return
        diff_result = compute_diff(local_text, wechat_text)
        # Save with special marker for wechat source
        lesson_file = save_lesson(diff_result, draft_path, f"wechat:{draft_path}")
        print(f"\nLesson saved to: {lesson_file}")
        print(f"\n检测到 {diff_result['lines_added']} 处新增, {diff_result['lines_deleted']} 处删除")
        print(f"字数变化: {diff_result['char_diff']:+d}")
        print(f"\nAgent 接下来读取 {draft_path} 和微信草稿内容,分析修改模式并写入 {lesson_file}")
        return

    # Mode 3: explicit draft/final files — both paths are required.
    if not args.draft or not args.final:
        print("Error: --draft and --final required (or use --from-wechat)", file=sys.stderr)
        sys.exit(1)

    draft = load_text(args.draft)
    final = load_text(args.final)
    diff_result = compute_diff(draft, final)

    # Print summary
    print("=" * 60)
    print("EDIT ANALYSIS")
    print("=" * 60)

    if diff_result["title_changed"]:
        print(f"\n标题修改:")
        print(f" AI: {diff_result['draft_title']}")
        print(f" 人工: {diff_result['final_title']}")

    if diff_result["structure_changed"]:
        print(f"\n结构修改:")
        print(f" AI H2: {diff_result['draft_h2s']}")
        print(f" 人工 H2: {diff_result['final_h2s']}")

    print(f"\n数量变化:")
    print(f" 新增 {diff_result['lines_added']} 行, 删除 {diff_result['lines_deleted']} 行")
    print(f" 字数变化: {diff_result['char_diff']:+d} ({diff_result['draft_chars']} → {diff_result['final_chars']})")

    # Show at most 10 sample lines per side, each truncated to 80 characters.
    if diff_result["deletions_sample"]:
        print(f"\n被删除的内容(采样):")
        for line in diff_result["deletions_sample"][:10]:
            print(f" - {line[:80]}")

    if diff_result["additions_sample"]:
        print(f"\n新增的内容(采样):")
        for line in diff_result["additions_sample"][:10]:
            print(f" + {line[:80]}")

    # Save lesson
    lesson_file = save_lesson(diff_result, args.draft, args.final)
    print(f"\nLesson saved to: {lesson_file}")

    # Auto-grow exemplar library from edited finals
    final_title = extract_title(final)
    try:
        # Imported lazily; any failure here (including a missing module) is
        # reported below and does not abort the run.
        import extract_exemplar
        exemplar = extract_exemplar.extract_exemplar(final, source=final_title or "user-edited")
        # Only finals scoring 50 or below are stored as exemplars.
        if exemplar["humanness_score"] <= 50:
            exemplar_path = extract_exemplar.save_exemplar(exemplar)
            print(f"\n✓ 终稿已加入范文库: {exemplar_path}")
            print(f" Score: {exemplar['humanness_score']:.1f}/100, Category: {exemplar['category']}")
        else:
            print(f"\n⚠ 终稿 humanness_score={exemplar['humanness_score']:.1f} > 50,未加入范文库")
    except Exception as e:
        print(f"\n⚠ 范文提取跳过: {e}")

    lesson_count = len(load_all_lessons())
    print(f"Total lessons: {lesson_count}")

    # Every 5th lesson, remind the Agent to refresh the playbook.
    if lesson_count >= 5 and lesson_count % 5 == 0:
        print(f"\n{'=' * 60}")
        print("PLAYBOOK UPDATE TRIGGERED")
        print(f"{'=' * 60}")
        print(f"{lesson_count} lessons. Agent should run:")
        print(f" python3 scripts/learn_edits.py --summarize --json")
        print(f"Then update playbook.md with high-confidence patterns.")

    # Instructions for Agent
    print(f"""
{'=' * 60}
INSTRUCTIONS FOR AGENT
{'=' * 60}

Read the draft and final versions, then for each meaningful edit:

1. Read: {args.draft}
2. Read: {args.final}
3. For each edit, add a pattern entry to {lesson_file}:

patterns:
- type: "word_sub" # {' / '.join(PATTERN_TYPES.keys())}
key: "short_unique_id" # e.g. "avoid_jiangzhen", "shorter_paragraphs"
description: "把'讲真'替换为'坦白说'"
rule: "不要使用'讲真',用'坦白说'代替" # imperative, executable

4. Rules must be imperative (可执行的指令), not descriptive.
BAD: "用户偏好简短段落"
GOOD: "段落不超过 80 字,长段必须在 3 句内换行"

5. If pattern already exists in previous lessons (same key),
confidence will auto-increase on next --summarize.
""")
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|