From 885cae8e7de671afd7d0b51c3e5d4fb350e6326e Mon Sep 17 00:00:00 2001 From: wangzhuc Date: Mon, 30 Mar 2026 22:32:02 +0800 Subject: [PATCH] feat: add SICO-style exemplar extraction system for few-shot writing - New script: scripts/extract_exemplar.py Extracts style fingerprints from human-written articles (opening hook, emotional peak, transition/self-correction, closing) with statistical analysis (sentence stddev, vocab temperature, negative ratio, paragraph CV). Auto-detects category, supports batch import. - SKILL.md: Add Step 4.4 exemplar injection Loads matching exemplars by category before writing, injects segments as few-shot style examples in the prompt. - learn_edits.py: Auto-grow exemplar library After user edits, auto-extracts the final version into the exemplar library if humanness_score <= 50. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 4 + SKILL.md | 36 +++- references/exemplars/.gitkeep | 0 scripts/extract_exemplar.py | 373 ++++++++++++++++++++++++++++++++++ scripts/learn_edits.py | 14 ++ 5 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 references/exemplars/.gitkeep create mode 100644 scripts/extract_exemplar.py diff --git a/.gitignore b/.gitignore index d65d17a..c4c474b 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,10 @@ optimization-results.tsv output/ !output/.gitkeep +# Exemplar library (user-specific content) +references/exemplars/*.md +references/exemplars/index.yaml + # Legacy client directories clients/ diff --git a/SKILL.md b/SKILL.md index bafddf6..70d2219 100644 --- a/SKILL.md +++ b/SKILL.md @@ -198,6 +198,7 @@ WebSearch: "{选题关键词} 数据 报告 2025 2026" 读取: {skill_dir}/playbook.md(如果存在,按 confidence 分级执行) 读取: {skill_dir}/writing-config.yaml(如果存在,作为写作参数) 读取: {skill_dir}/history.yaml(最近 3 篇的 dimensions 字段) +读取: {skill_dir}/references/exemplars/index.yaml(如果存在) ``` **4.1 历史最佳参数参考**(有 history.yaml 且包含 composite_score 时执行): @@ -219,7 +220,38 @@ WebSearch: "{选题关键词} 数据 报告 2025 2026" **优先级**:playbook.md(confidence ≥ 5 的规则)> persona > writing-guide.md。writing-guide 是底线(禁用词等),persona 在此基础上特化风格参数,playbook 中高置信度规则是用户个性化的最终覆盖。playbook 中 confidence < 5 的规则作为软性参考。 -**4.4 写文章**: +**4.4 范文风格注入**(有 `references/exemplars/index.yaml` 时执行): + +从 index.yaml 筛选 category 匹配当前框架类型的范文,按 humanness_score 升序(越低越人类)取 top 3。读取对应 .md 文件的片段内容。 + +在写作 prompt 中注入: + +> 以下是该公众号风格的真实段落示例,模仿其句长节奏、情绪强度和口语化程度: +> +> 【开头风格】 +> {exemplar_1 的开头钩子段} +> +> 【情绪段风格】 +> {exemplar_2 的情绪高峰段} +> +> 【收尾风格】 +> {exemplar_3 的收尾段} + +Category 映射规则: + +| 框架类型 | exemplar category | +|----------|-------------------| +| 痛点型/深度解读 | tech-opinion | +| 故事型 | story-emotional | +| 清单型/对比型 | list-practical | +| 热点解读型 | hot-take | +| 其他 | general | + +如果匹配到的范文不足 3 篇,用 general category 补足。如果范文库为空,跳过此步。 + +建库命令:`python3 {skill_dir}/scripts/extract_exemplar.py article.md` + +**4.5 写文章**: - H1 标题(20-28 字) + H2 结构,1500-2500 字 - 真实素材锚定:Step 3.2 的素材分散嵌入各 H2 段落 - **写作人格**:按 4.3 加载的人格参数写作(数据呈现方式、个人声音浓度、不确定性表达等) @@ -375,6 +407,8 @@ python3 {skill_dir}/toolkit/cli.py preview {markdown} --theme {theme} --no-open | 做一个小绿书/图片帖 | `python3 {skill_dir}/toolkit/cli.py image-post img1.jpg img2.jpg -t "标题"` | | 诊断配置 / 检查反AI / 为什么AI检测没过 | `python3 {skill_dir}/scripts/diagnose.py --json` + LLM 交叉分析 | | 优化写作参数 / 优化参数 | 迭代循环:写测试短文 → 打分 → 调参(见辅助功能) | +| 导入范文 / 建范文库 | `python3 {skill_dir}/scripts/extract_exemplar.py article.md` | +| 查看范文库 | `python3 {skill_dir}/scripts/extract_exemplar.py --list` | --- diff --git a/references/exemplars/.gitkeep b/references/exemplars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/scripts/extract_exemplar.py b/scripts/extract_exemplar.py new file mode 100644 index 0000000..e05097d --- /dev/null +++ b/scripts/extract_exemplar.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 +""" +Extract style exemplars from human-written articles for SICO-style few-shot injection. + +Takes a markdown article, analyzes it for style fingerprints, extracts key +segments (opening hook, emotional peak, transition/self-correction, closing), +and saves structured exemplar files to references/exemplars/. + +Usage: + python3 scripts/extract_exemplar.py article.md + python3 scripts/extract_exemplar.py article.md --category tech-opinion --source "公众号名" + python3 scripts/extract_exemplar.py article1.md article2.md article3.md # batch + python3 scripts/extract_exemplar.py --list # list all exemplars +""" + +import argparse +import json +import re +import sys +from datetime import datetime +from pathlib import Path + +import yaml + +# Reuse analysis functions from humanness_score +sys.path.insert(0, str(Path(__file__).parent)) +import humanness_score as hs + +SKILL_DIR = Path(__file__).parent.parent +EXEMPLARS_DIR = SKILL_DIR / "references" / "exemplars" +INDEX_FILE = EXEMPLARS_DIR / "index.yaml" + +CATEGORIES = ["tech-opinion", "story-emotional", "list-practical", "hot-take", "general"] + +# Category detection markers +STORY_MARKERS = [ + "我", "我们", "那天", "那年", "记得", "后来", "当时", + "第一次", "最后", "突然", "终于", +] + + +# ============================================================ +# Segment Extraction +# ============================================================ + +def extract_headings(text): + """Extract H2 headings from markdown.""" + return re.findall(r'^##\s+(.+)$', text, re.MULTILINE) + + +def extract_title(text): + """Extract H1 title from markdown.""" + m = re.search(r'^#\s+(.+)$', text, re.MULTILINE) + return m.group(1).strip() if m else "" + + +def extract_opening(paragraphs, max_chars=250): + """Extract opening hook — first non-empty paragraph(s) up to max_chars.""" + result = [] + total = 0 + for p in paragraphs: + if total + len(p) > max_chars and result: + break + result.append(p) + total += len(p) + return "\n\n".join(result) + + +def extract_emotional_peak(paragraphs): + """Find paragraph with highest negative emotion density.""" + best_para, best_density = "", -1.0 + for p in paragraphs: + if len(p) < 20: + continue + count = sum(1 for m in hs.NEGATIVE_MARKERS if m in p) + density = count / len(p) * 100 + if density > best_density: + best_density = density + best_para = p + return best_para if best_density > 0 else "" + + +def extract_transition(paragraphs): + """Find paragraph with most self-correction / transition patterns.""" + transition_words = [ + "但是", "不过", "然而", "话说回来", "换个角度", + "说回来", "但话又说回来", "不对", "算了", + ] + best_para, best_count = "", 0 + for p in paragraphs: + if len(p) < 20: + continue + count = sum(len(re.findall(pat, p)) for pat in hs.SELF_CORRECTION_PATTERNS) + count += sum(p.count(w) for w in transition_words) + if count > best_count: + best_count = count + best_para = p + return best_para if best_count > 0 else "" + + +def extract_closing(paragraphs, max_chars=250): + """Extract closing paragraph(s), reading backwards.""" + result = [] + total = 0 + for p in reversed(paragraphs): + if total + len(p) > max_chars and result: + break + result.insert(0, p) + total += len(p) + return "\n\n".join(result) + + +# ============================================================ +# Category Detection +# ============================================================ + +def detect_category(text, paragraphs, headings): + """Auto-detect article category from content features.""" + data_count = sum(len(re.findall(p, text)) for p in hs.REAL_SOURCE_PATTERNS) + story_count = sum(text.count(m) for m in STORY_MARKERS) + h2_count = len(headings) + neg_count = sum(1 for m in hs.NEGATIVE_MARKERS if m in text) + + scores = { + "tech-opinion": data_count * 2, + "story-emotional": story_count * 1.5, + "list-practical": h2_count * 3 if h2_count >= 5 else 0, + "hot-take": neg_count * 2 + data_count if len(text) < 2000 else 0, + "general": 5, + } + return max(scores, key=scores.get) + + +# ============================================================ +# Statistical Fingerprint +# ============================================================ + +def compute_vocab_temperature(text): + """Compute vocabulary temperature band distribution.""" + counts = { + "cold": sum(text.count(w) for w in hs.COLD_WORDS), + "warm": sum(text.count(w) for w in hs.WARM_WORDS), + "hot": sum(text.count(w) for w in hs.HOT_WORDS), + "wild": sum(text.count(w) for w in hs.WILD_WORDS), + } + total = sum(counts.values()) + if total == 0: + return {k: 0.25 for k in counts} + return {k: round(v / total, 2) for k, v in counts.items()} + + +def compute_paragraph_cv(paragraphs): + """Coefficient of variation for paragraph lengths.""" + if len(paragraphs) < 3: + return 0.0 + lengths = [len(p) for p in paragraphs] + mean = sum(lengths) / len(lengths) + if mean == 0: + return 0.0 + variance = sum((l - mean) ** 2 for l in lengths) / len(lengths) + return round((variance ** 0.5) / mean, 2) + + +def count_short_paragraphs(text): + """Count single-sentence short paragraphs (1-10 chars, non-heading).""" + return sum(1 for l in text.split('\n') + if l.strip() and 1 <= len(l.strip()) <= 10 + and not l.strip().startswith('#')) + + +# ============================================================ +# Main Extraction +# ============================================================ + +def extract_exemplar(text, category=None, source=None): + """Analyze article and return structured exemplar dict.""" + clean = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip() + paragraphs = hs._split_paragraphs(text) + sentences = hs._split_sentences(clean) + headings = extract_headings(text) + title = extract_title(text) or source or "untitled" + + if not category: + category = detect_category(clean, paragraphs, headings) + + score_result = hs.score_article(text) + + # Sentence length stats + lengths = [len(s) for s in sentences] + if len(lengths) >= 2: + mean = sum(lengths) / len(lengths) + variance = sum((l - mean) ** 2 for l in lengths) / len(lengths) + sentence_stddev = round(variance ** 0.5, 1) + else: + sentence_stddev = 0.0 + + neg_count = sum(1 for s in sentences if any(m in s for m in hs.NEGATIVE_MARKERS)) + negative_ratio = round(neg_count / len(sentences), 2) if sentences else 0.0 + + return { + "title": title, + "source": source or title, + "category": category, + "humanness_score": score_result["composite_score"], + "fingerprint": { + "sentence_stddev": sentence_stddev, + "vocab_temperature": compute_vocab_temperature(clean), + "negative_ratio": negative_ratio, + "paragraph_cv": compute_paragraph_cv(paragraphs), + "short_paragraphs": count_short_paragraphs(text), + }, + "segments": { + "opening": extract_opening(paragraphs), + "emotional_peak": extract_emotional_peak(paragraphs), + "transition": extract_transition(paragraphs), + "closing": extract_closing(paragraphs), + }, + "extracted_at": datetime.now().strftime("%Y-%m-%d"), + "char_count": len(clean), + } + + +# ============================================================ +# Persistence +# ============================================================ + +def save_exemplar(exemplar): + """Save exemplar to markdown file and update index.yaml. Returns filepath.""" + EXEMPLARS_DIR.mkdir(parents=True, exist_ok=True) + + category = exemplar["category"] + num = 1 + while (EXEMPLARS_DIR / f"{category}-{num:03d}.md").exists(): + num += 1 + filename = f"{category}-{num:03d}.md" + filepath = EXEMPLARS_DIR / filename + + fp = exemplar["fingerprint"] + seg = exemplar["segments"] + + frontmatter = { + "source": exemplar["source"], + "category": category, + "humanness_score": exemplar["humanness_score"], + "sentence_stddev": fp["sentence_stddev"], + "vocab_temperature": fp["vocab_temperature"], + "negative_ratio": fp["negative_ratio"], + "paragraph_cv": fp["paragraph_cv"], + "short_paragraphs": fp["short_paragraphs"], + "extracted_at": exemplar["extracted_at"], + } + + content = "---\n" + content += yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False) + content += "---\n\n" + + section_map = [ + ("opening", "开头钩子"), + ("emotional_peak", "情绪高峰"), + ("transition", "转折/自纠"), + ("closing", "收尾"), + ] + for key, label in section_map: + if seg.get(key): + content += f"## {label}\n\n{seg[key]}\n\n" + + filepath.write_text(content, encoding="utf-8") + _update_index(filename, exemplar) + return filepath + + +def _update_index(filename, exemplar): + """Add or update entry in index.yaml.""" + index = [] + if INDEX_FILE.exists(): + with open(INDEX_FILE, "r", encoding="utf-8") as f: + index = yaml.safe_load(f) or [] + + entry = { + "file": filename, + "source": exemplar["source"], + "category": exemplar["category"], + "humanness_score": exemplar["humanness_score"], + "extracted_at": exemplar["extracted_at"], + } + index = [e for e in index if e.get("file") != filename] + index.append(entry) + index.sort(key=lambda x: (x["category"], x["humanness_score"])) + + with open(INDEX_FILE, "w", encoding="utf-8") as f: + yaml.dump(index, f, allow_unicode=True, default_flow_style=False) + + +# ============================================================ +# List / CLI +# ============================================================ + +def list_exemplars(): + """Print all exemplars in the library.""" + if not INDEX_FILE.exists(): + print("范文库为空。用法: python3 scripts/extract_exemplar.py article.md") + return + + with open(INDEX_FILE, "r", encoding="utf-8") as f: + index = yaml.safe_load(f) or [] + + if not index: + print("范文库为空。") + return + + print(f"\n{'=' * 60}") + print(f"范文库 ({len(index)} 篇)") + print(f"{'=' * 60}") + + by_cat = {} + for e in index: + by_cat.setdefault(e["category"], []).append(e) + + for cat, entries in sorted(by_cat.items()): + print(f"\n [{cat}] ({len(entries)} 篇)") + for e in entries: + score = e["humanness_score"] + bar = "█" * int((100 - score) / 10) + "░" * (10 - int((100 - score) / 10)) + print(f" {bar} {score:5.1f} {e['source'][:40]}") + + +def main(): + parser = argparse.ArgumentParser(description="Extract style exemplars from articles") + parser.add_argument("inputs", nargs="*", help="Markdown article file(s)") + parser.add_argument("--category", "-c", choices=CATEGORIES, + help="Article category (auto-detected if omitted)") + parser.add_argument("--source", "-s", help="Source name (e.g. account name)") + parser.add_argument("--list", "-l", action="store_true", help="List all exemplars") + parser.add_argument("--json", action="store_true", help="JSON output") + args = parser.parse_args() + + if args.list: + list_exemplars() + return + + if not args.inputs: + parser.print_help() + sys.exit(1) + + for input_path in args.inputs: + path = Path(input_path) + if not path.exists(): + print(f"Error: {input_path} not found", file=sys.stderr) + continue + + text = path.read_text(encoding="utf-8") + exemplar = extract_exemplar(text, category=args.category, source=args.source) + filepath = save_exemplar(exemplar) + + if args.json: + print(json.dumps(exemplar, ensure_ascii=False, indent=2)) + else: + print(f"✓ {path.name}") + print(f" Category: {exemplar['category']}") + print(f" Score: {exemplar['humanness_score']:.1f}/100") + print(f" Segments: {sum(1 for v in exemplar['segments'].values() if v)}/4") + fp = exemplar["fingerprint"] + print(f" Stddev: {fp['sentence_stddev']}") + print(f" Neg ratio: {fp['negative_ratio']:.0%}") + print(f" Para CV: {fp['paragraph_cv']}") + temp = fp["vocab_temperature"] + print(f" Temp: cold={temp['cold']} warm={temp['warm']} hot={temp['hot']} wild={temp['wild']}") + print(f" Saved to: {filepath}") + print() + + +if __name__ == "__main__": + main() diff --git a/scripts/learn_edits.py b/scripts/learn_edits.py index 175db49..1b6ac67 100644 --- a/scripts/learn_edits.py +++ b/scripts/learn_edits.py @@ -325,6 +325,20 @@ def main(): lesson_file = save_lesson(diff_result, args.draft, args.final) print(f"\nLesson saved to: {lesson_file}") + # Auto-grow exemplar library from edited finals + final_title = extract_title(final) + try: + import extract_exemplar + exemplar = extract_exemplar.extract_exemplar(final, source=final_title or "user-edited") + if exemplar["humanness_score"] <= 50: + exemplar_path = extract_exemplar.save_exemplar(exemplar) + print(f"\n✓ 终稿已加入范文库: {exemplar_path}") + print(f" Score: {exemplar['humanness_score']:.1f}/100, Category: {exemplar['category']}") + else: + print(f"\n⚠ 终稿 humanness_score={exemplar['humanness_score']:.1f} > 50,未加入范文库") + except Exception as e: + print(f"\n⚠ 范文提取跳过: {e}") + lesson_count = len(load_all_lessons()) print(f"Total lessons: {lesson_count}")