新增优化循环框架：humanness_score.py + optimize_loop.py

借鉴 Karpathy autoresearch 的 change→score→keep/rollback 模式： - humanness_score.py: 固定打分器，两层评分（客观checklist + 主观读者感）；6项客观检查：禁用词/真实引用/破句/句长方差/段长方差/词汇温度；1项主观LLM判官（stub，需配置API）；复合分 0-100（越低越像人） - optimize_loop.py: 迭代框架，通过修改 writing-config.yaml 参数自动生成文章→打分→保留或回滚→记录到 optimization-results.tsv Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 23:18:55 +08:00 · 2026-03-28 23:18:55 +08:00 · 8e16c70ead
commit 8e16c70ead
parent 59aa215f12
2 changed files with 448 additions and 0 deletions
--- a/scripts/humanness_score.py
+++ b/scripts/humanness_score.py
@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Fixed humanness scoring pipeline for WeWrite optimization loop.
+
+Two-layer scoring inspired by autoresearch + the "objective checklist + subjective feel" pattern:
+
+Layer 1: Objective checklist (yes/no, deterministic, won't drift)
+Layer 2: Subjective reader-feel (LLM judge, 1-10)
+
+Humanness = Layer1 pass_rate * 0.6 + Layer2 normalized * 0.4
+Composite = (1 - Humanness) * 100   (0 = perfect human, 100 = obvious AI; lower is better)
+
+DO NOT MODIFY this file during optimization. It is the fixed evaluation function.
+
+Usage:
+    python3 humanness_score.py article.md
+    python3 humanness_score.py article.md --verbose
+    python3 humanness_score.py article.md --json
+"""
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+# ============================================================
+# Layer 1: Objective Checklist (deterministic yes/no)
+# ============================================================
+
+# Connective and filler phrases the checklist treats as a giveaway.
+# check_no_banned_words fails an article if any one of these occurs anywhere
+# in the text (plain substring match, no tokenization).
+BANNED_WORDS = [
+    "首先", "其次", "再者", "最后", "总之", "综上所述", "总而言之",
+    "此外", "另外", "与此同时", "不仅如此", "更重要的是", "在此基础上",
+    "作为一个", "让我们", "值得注意的是", "需要指出的是", "不可否认",
+    "毋庸置疑", "众所周知", "事实上", "显而易见", "可以说", "从某种意义上说",
+    "非常重要", "至关重要", "不言而喻", "具有重要意义", "发挥着重要作用",
+    "意义深远", "影响深远", "引发了广泛关注", "引起了热烈讨论",
+    "总的来说", "综合来看", "由此可见", "不难发现", "通过以上分析",
+    "正如我们所看到的",
+]
+
+# Regexes whose matches check_real_sources counts as "real source" indicators.
+# They are surface heuristics: each match of any pattern adds one to the count.
+REAL_SOURCE_PATTERNS = [
+    r'[A-Z][a-z]+\s+[A-Z][a-z]+',  # two capitalized ASCII words in a row (e.g. an English full name)
+    r'[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)',  # 2-4 CJK characters followed by a reporting verb
+    r'(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)',  # "according to <CJK text> report/data/study/survey"
+    r'20[12]\d\s*年',  # a year from 2010 to 2029 followed by 年
+    r'\d+(?:\.\d+)?%',  # an integer or decimal number followed by %
+    r'(?:亿|万)\s*(?:美元|元|人民币)',  # 亿 or 万 followed by a currency unit (the digits themselves are not matched)
+]
+
+
+def check_no_banned_words(text: str) -> tuple[bool, str]:
+    """Pass only when no phrase from BANNED_WORDS occurs in *text*.
+
+    Returns:
+        (passed, detail). On failure the detail gives the number of distinct
+        banned phrases present and lists at most the first five of them.
+    """
+    hits = []
+    for phrase in BANNED_WORDS:
+        if phrase in text:
+            hits.append(phrase)
+    if not hits:
+        return True, "0 banned words"
+    return False, f"Found {len(hits)} banned words: {hits[:5]}"
+
+
+def check_real_sources(text: str) -> tuple[bool, str]:
+    """Pass when *text* contains at least three real-source indicators.
+
+    The count is the total number of matches across every regex in
+    REAL_SOURCE_PATTERNS; one pattern matching three times is enough.
+
+    Returns:
+        (passed, detail) where the detail states the number of matches found.
+    """
+    hits = sum(len(re.findall(rx, text)) for rx in REAL_SOURCE_PATTERNS)
+    if hits < 3:
+        return False, f"Only {hits} real-source indicators (need ≥3)"
+    return True, f"{hits} real-source indicators found"
+
+
+def check_broken_sentences(text: str) -> tuple[bool, str]:
+    """Pass when *text* shows at least three broken/incomplete structures.
+
+    The text is examined line by line (blank lines skipped, surrounding
+    whitespace removed). Each regex match below adds one, and every line of
+    at most 10 characters that does not start with '#' adds one more.
+    A short line ending in 。！？ is therefore counted by both the
+    ultra-short-sentence pattern and the short-line rule.
+
+    Returns:
+        (passed, detail) where the detail states the total count.
+    """
+    markers = (
+        r'——(?!.*[，。！？])',  # double em-dash with no ，。！？ later on the same line
+        r'\.{3,}|…',  # three or more dots, or each single ellipsis character
+        r'不对[，,]',  # self-correction "不对，"
+        r'算了',  # abandonment "算了"
+        r'^.{1,6}[。！？]$',  # whole line is 1-6 characters plus closing punctuation
+    )
+    total = 0
+    for raw in text.split('\n'):
+        stripped = raw.strip()
+        if stripped:
+            total += sum(len(re.findall(rx, stripped)) for rx in markers)
+            # A non-empty line is at least one character long, so only the upper bound matters.
+            if len(stripped) <= 10 and not stripped.startswith('#'):
+                total += 1
+    if total < 3:
+        return False, f"Only {total} broken structures (need ≥3)"
+    return True, f"{total} broken/incomplete structures"
+
+
+def check_sentence_length_variance(text: str) -> tuple[bool, str]:
+    """Pass when sentence lengths vary enough (population stddev above 15).
+
+    Sentences are the pieces obtained by splitting on 。！？ and newlines;
+    pieces of one character or less after trimming are discarded. With fewer
+    than five sentences the check fails because the spread cannot be judged.
+
+    Returns:
+        (passed, detail) where the detail reports the stddev to one decimal.
+    """
+    fragments = (piece.strip() for piece in re.split(r'[。！？\n]', text))
+    sizes = [len(fragment) for fragment in fragments if len(fragment) > 1]
+
+    n = len(sizes)
+    if n < 5:
+        return False, "Too few sentences to measure"
+
+    # Population standard deviation (divide by n, not n - 1).
+    mean = sum(sizes) / n
+    variance = sum((size - mean) ** 2 for size in sizes) / n
+    stddev = variance ** 0.5
+
+    # The 15-character cut-off is the file's fixed threshold for "human-like" spread.
+    if stddev <= 15:
+        return False, f"Sentence length stddev = {stddev:.1f} (too uniform, need >15)"
+    return True, f"Sentence length stddev = {stddev:.1f} (good variance)"
+
+
+def check_paragraph_length_variance(text: str) -> tuple[bool, str]:
+    """Pass when at most one pair of adjacent paragraphs has similar length.
+
+    Paragraphs are the blank-line-separated chunks of *text*, trimmed, with
+    empty chunks and chunks starting with '#' left out. Two neighbours are
+    "similar" when their lengths differ by 20 characters or less. Fewer than
+    three paragraphs passes automatically.
+
+    Returns:
+        (passed, detail) where the detail states the number of similar pairs.
+    """
+    blocks = []
+    for chunk in text.split('\n\n'):
+        trimmed = chunk.strip()
+        if trimmed and not trimmed.startswith('#'):
+            blocks.append(trimmed)
+
+    if len(blocks) < 3:
+        return True, "Too few paragraphs to check"
+
+    similar_pairs = sum(
+        1 for left, right in zip(blocks, blocks[1:])
+        if abs(len(left) - len(right)) <= 20
+    )
+
+    if similar_pairs > 1:
+        return False, f"{similar_pairs} consecutive similar-length pairs (too uniform)"
+    return True, f"{similar_pairs} consecutive similar-length pairs (OK)"
+
+
+def check_word_temperature_mix(text: str) -> tuple[bool, str]:
+    """Pass when *text* draws on at least three of four vocabulary registers.
+
+    A register counts as present if any one of its marker words occurs in
+    the text as a substring; how often it occurs does not matter.
+
+    Returns:
+        (passed, detail) where the detail states how many registers were hit.
+    """
+    registers = (
+        # cold: formal / analytical terms
+        ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"],
+        # warm: conversational asides
+        ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"],
+        # hot: internet slang
+        ["DNA动了", "格局打开", "遥遥领先", "卷", "内卷", "炸了", "杀疯了", "吃灰"],
+        # wild: colloquial / dialect-flavoured words
+        ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯"],
+    )
+
+    present = sum(
+        1 for markers in registers
+        if any(marker in text for marker in markers)
+    )
+
+    if present < 3:
+        return False, f"Only {present}/4 temperature types (need ≥3)"
+    return True, f"{present}/4 temperature types found"
+
+
+def run_layer1(text: str) -> dict:
+    """Run the six objective checks on *text* and collect their outcomes.
+
+    Returns:
+        A dict with one entry per check, ``{"passed": bool, "detail": str}``,
+        in the order the checks run, followed by a ``"_summary"`` entry that
+        holds ``passed``, ``total`` and ``pass_rate`` (rounded to 4 decimals).
+    """
+    registry = (
+        ("no_banned_words", check_no_banned_words),
+        ("real_sources", check_real_sources),
+        ("broken_sentences", check_broken_sentences),
+        ("sentence_length_variance", check_sentence_length_variance),
+        ("paragraph_length_variance", check_paragraph_length_variance),
+        ("word_temperature_mix", check_word_temperature_mix),
+    )
+
+    outcome = {}
+    for label, check in registry:
+        ok, detail = check(text)
+        outcome[label] = {"passed": ok, "detail": detail}
+
+    # Tally before the summary is added so that only real checks are counted.
+    n_passed = sum(1 for entry in outcome.values() if entry["passed"])
+    n_total = len(registry)
+
+    outcome["_summary"] = {
+        "passed": n_passed,
+        "total": n_total,
+        "pass_rate": round(n_passed / n_total, 4),
+    }
+    return outcome
+
+
+# ============================================================
+# Layer 2: Subjective Reader-Feel (LLM judge)
+# ============================================================
+
+# Prompt template for the Layer 2 LLM judge; `{article}` marks where the
+# article text is meant to be inserted. Not referenced by the code shown in
+# this file (run_layer2_stub returns a fixed placeholder instead).
+# NOTE(review): the template also contains literal braces in its JSON example
+# ({"score": ...}), so JUDGE_PROMPT.format(article=...) would raise KeyError.
+# Fill it with JUDGE_PROMPT.replace("{article}", text), or escape those braces
+# first — confirm the intended substitution method before wiring in a real judge.
+JUDGE_PROMPT = """你是一个经验丰富的公众号读者。你对 AI 生成的内容非常敏感——你能凭直觉分辨出人写的和 AI 写的。
+
+现在请你阅读下面这篇文章，然后回答一个问题：
+
+**这篇文章读起来像人写的吗？**
+
+评分标准（1-10）：
+- 1-3：明显是 AI 写的。语言过于规整，没有个人色彩，像教科书。
+- 4-5：有一些 AI 痕迹。整体流畅但缺乏个人经历、情感起伏或出人意料的表达。
+- 6-7：大部分像人写的，偶尔有几句感觉"太完美了"。
+- 8-9：很像人写的。有个人风格、情感波动、不完美感，像一个真人编辑的作品。
+- 10：完全像人写的。如果不告诉我，我不会怀疑这是 AI 参与的。
+
+请只输出一个 JSON：{"score": 数字, "reason": "一句话理由"}
+
+---
+
+文章内容：
+
+{article}
+"""
+
+
+def run_layer2_stub(text: str) -> dict:
+    """Placeholder for the Layer 2 LLM judge.
+
+    No model is called and *text* is not inspected: the function always
+    returns a neutral 5.0 score flagged with ``is_stub``. Swap in a real
+    API call here when running the optimization loop for real.
+
+    Returns:
+        Dict with ``score`` (float), ``reason`` (str) and ``is_stub`` (True).
+    """
+    placeholder = dict(
+        score=5.0,
+        reason="(stub) LLM judge not configured — using default score",
+        is_stub=True,
+    )
+    return placeholder
+
+
+# ============================================================
+# Composite Score
+# ============================================================
+
+def compute_composite(layer1: dict, layer2: dict) -> float:
+    """Fold both layers into a single score where lower means more human.
+
+    The checklist pass rate (weight 0.6) and the reader score scaled from
+    0-10 down to 0-1 (weight 0.4) are combined into a humanness fraction,
+    which is then flipped and scaled to 0-100.
+
+    Args:
+        layer1: result of run_layer1; ``layer1["_summary"]["pass_rate"]`` is read.
+        layer2: judge result; ``layer2["score"]`` is read.
+
+    Returns:
+        The composite rounded to two decimals (0 = perfect human, 100 = obvious AI).
+    """
+    checklist_fraction = layer1["_summary"]["pass_rate"]
+    reader_fraction = layer2["score"] / 10.0
+
+    humanness = checklist_fraction * 0.6 + reader_fraction * 0.4
+
+    # Flip the scale so that the optimization loop minimizes the metric.
+    ai_likeness = (1 - humanness) * 100
+    return round(ai_likeness, 2)
+
+
+# ============================================================
+# Main
+# ============================================================
+
+def score_article(text: str, verbose: bool = False) -> dict:
+    """Score one article and return the complete result.
+
+    Markdown heading lines are blanked out first, so neither layer sees them.
+
+    Args:
+        text: raw Markdown of the article.
+        verbose: when true, also print a human-readable report to stdout.
+
+    Returns:
+        Dict with ``composite_score``, ``layer1``, ``layer2`` and
+        ``char_count`` (length of the text after heading removal).
+    """
+    body = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip()
+
+    checklist = run_layer1(body)
+    reader_feel = run_layer2_stub(body)
+    composite = compute_composite(checklist, reader_feel)
+
+    if verbose:
+        rule = '=' * 60
+        summary = checklist['_summary']
+        print(f"\n{rule}")
+        print(f"HUMANNESS SCORE: {composite:.1f}/100 (lower = more human)")
+        print(rule)
+        print(f"\nLayer 1 — Objective Checklist ({summary['passed']}/{summary['total']})")
+        for check_name, outcome in checklist.items():
+            # Keys starting with '_' are bookkeeping entries, not checks.
+            if not check_name.startswith('_'):
+                mark = "✓" if outcome["passed"] else "✗"
+                print(f"  {mark} {check_name}: {outcome['detail']}")
+        print(f"\nLayer 2 — Reader Feel: {reader_feel['score']}/10")
+        print(f"  {reader_feel['reason']}")
+        print(f"\nComposite: {composite:.1f} (0=完美人类, 100=明显AI)")
+
+    return {
+        "composite_score": composite,
+        "layer1": checklist,
+        "layer2": reader_feel,
+        "char_count": len(body),
+    }
+
+
+def main():
+    """CLI entry point: score one Markdown file and print the result.
+
+    Output modes:
+      * default    -> the composite score alone, one decimal place
+      * --verbose  -> the human-readable report printed by score_article
+      * --json     -> the full result dict as JSON on stdout
+
+    When --json and --verbose are combined, the report is sent to stderr so
+    that stdout holds nothing but the JSON document and stays parseable by
+    callers that pipe it into a JSON parser. An input file that cannot be
+    read ends the program with a usage-style error (exit status 2) instead
+    of a traceback.
+    """
+    import contextlib
+
+    parser = argparse.ArgumentParser(description="Score article humanness")
+    parser.add_argument("input", help="Markdown article file")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Detailed output")
+    parser.add_argument("--json", action="store_true", help="JSON output")
+    args = parser.parse_args()
+
+    try:
+        text = Path(args.input).read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        # parser.error prints the message to stderr and exits with status 2.
+        parser.error(f"cannot read {args.input}: {exc}")
+
+    if args.json and args.verbose:
+        # Keep stdout pure JSON: divert the verbose report to stderr.
+        with contextlib.redirect_stdout(sys.stderr):
+            result = score_article(text, verbose=True)
+    else:
+        result = score_article(text, verbose=args.verbose)
+
+    if args.json:
+        print(json.dumps(result, ensure_ascii=False, indent=2))
+    elif not args.verbose:
+        print(f"{result['composite_score']:.1f}")
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/scripts/optimize_loop.py
+++ b/scripts/optimize_loop.py
@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+WeWrite Optimization Loop — autoresearch-style iterative improvement.
+
+Inspired by Karpathy's autoresearch: change → score → keep/rollback → repeat.
+But instead of optimizing ML training code, we optimize WRITING RULES to
+produce articles that pass AI detection while maintaining quality.
+
+The mutable surface: writing-config.yaml (style parameters + prompt rules)
+The fixed evaluation: humanness_score.py (objective checklist + subjective feel)
+The metric: composite_score (lower = more human, like val_bpb)
+
+Usage:
+    python3 optimize_loop.py --topic "AI Agent" --iterations 10
+    python3 optimize_loop.py --topic "AI Agent" --iterations 5 --verbose
+
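+Architecture (steps 2-9 are carried out by the agent driving the loop; running
+this script only ensures the config exists and prints the procedure):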
+    1. Load current writing-config.yaml
+    2. Generate article with current config
+    3. Score with humanness_score.py
+    4. LLM proposes a change to writing-config.yaml
+    5. Generate article with new config
+    6. Score again
+    7. If improved → keep (commit). If not → rollback.
+    8. Log to results.tsv
+    9. Repeat.
+
+Requirements:
+    - ANTHROPIC_API_KEY in environment (for article generation + LLM judge)
+    - writing-config.yaml in skill root (created on first run with defaults)
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import yaml
+
+# Directory one level above the folder that contains this script.
+SKILL_DIR = Path(__file__).parent.parent
+# Config file that ensure_config creates (if missing) and loads.
+CONFIG_PATH = SKILL_DIR / "writing-config.yaml"
+# TSV log that log_result appends to.
+RESULTS_PATH = SKILL_DIR / "optimization-results.tsv"
+
+# Values written to writing-config.yaml when that file does not exist yet.
+# Where an entry has a trailing comment, it lists the values the entry may take.
+# NOTE(review): "tangent_frequency" is set to "every_800_chars", which is not one
+# of the spellings in its own option list ("every_800") — confirm which is intended.
+DEFAULT_CONFIG = {
+    "persona": "科技媒体资深编辑，写了八年公众号，对AI行业有深度认知",
+    "sentence_variance": 0.7,
+    "broken_sentence_rate": 0.04,
+    "idiom_density": 0.15,
+    "filler_style": "mixed",  # literary / casual / mixed / minimal
+    "paragraph_rhythm": "chaotic",  # structured / chaotic / wave
+    "self_correction_rate": 0.02,
+    "tangent_frequency": "every_800_chars",  # never / every_500 / every_800 / every_1200
+    "real_data_density": "high",  # low / medium / high
+    "word_temperature_bias": "warm",  # cold / warm / hot / balanced
+    "emotional_arc": "restrained_to_burst",  # flat / gradual / restrained_to_burst / volatile
+    "opening_style": "scene",  # scene / data / question / anecdote / cold_open
+    "closing_style": "open_question",  # summary / open_question / image / abrupt
+    "structure_linearity": 0.3,  # 0=fully non-linear, 1=fully linear
+}
+
+
+def ensure_config():
+    """Load writing-config.yaml, creating it from DEFAULT_CONFIG on first run.
+
+    Returns:
+        The parsed contents of the config file. A file that is empty (or
+        holds only comments) parses to None in YAML; in that case a copy of
+        DEFAULT_CONFIG is returned instead, so callers never receive None.
+    """
+    if not CONFIG_PATH.exists():
+        with open(CONFIG_PATH, "w", encoding="utf-8") as f:
+            yaml.dump(DEFAULT_CONFIG, f, allow_unicode=True, default_flow_style=False)
+        print(f"Created default config: {CONFIG_PATH}")
+
+    loaded = yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
+    if loaded is None:
+        # The file on disk is left untouched; only the in-memory result falls back.
+        print(f"Config {CONFIG_PATH} is empty; using built-in defaults", file=sys.stderr)
+        return dict(DEFAULT_CONFIG)
+    return loaded
+
+
+def score_article(article_path: str) -> dict:
+    """Run humanness_score.py on an article and return its parsed JSON result.
+
+    The scorer is started with the interpreter that is running this script
+    (falling back to "python3" if that is unknown), so it does not depend on
+    a "python3" executable being on PATH. Its output contains non-ASCII
+    text, so the child is told to write UTF-8 and the parent decodes UTF-8
+    rather than relying on the locale encoding.
+
+    Args:
+        article_path: path of the Markdown file to score.
+
+    Returns:
+        The scorer's result dict. If the scorer exits non-zero or prints
+        something that is not valid JSON, the worst score is returned:
+        ``{"composite_score": 100.0, "error": <message>}``.
+    """
+    scorer = SKILL_DIR / "scripts" / "humanness_score.py"
+    result = subprocess.run(
+        [sys.executable or "python3", str(scorer), article_path, "--json"],
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
+        env={**os.environ, "PYTHONIOENCODING": "utf-8"},
+    )
+    if result.returncode != 0:
+        print(f"Scoring failed: {result.stderr}", file=sys.stderr)
+        return {"composite_score": 100.0, "error": result.stderr}
+    try:
+        return json.loads(result.stdout)
+    except json.JSONDecodeError as exc:
+        # Same failure shape as a non-zero exit, so callers need one code path.
+        message = f"invalid JSON from scorer: {exc}"
+        print(f"Scoring failed: {message}", file=sys.stderr)
+        return {"composite_score": 100.0, "error": message}
+
+
+def log_result(iteration: int, composite: float, config_summary: str, status: str, description: str):
+    """Append one row to the TSV results log.
+
+    Columns: iteration, timestamp, composite, status, description,
+    config_change. The header row is written when the log file does not
+    exist yet or exists but is empty. Tabs, carriage returns and newlines
+    inside the text fields are replaced by spaces so that one call always
+    produces exactly one six-column row.
+
+    Args:
+        iteration: iteration number for the first column.
+        composite: composite score, written with two decimals.
+        config_summary: text for the ``config_change`` column.
+        status: text for the ``status`` column.
+        description: text for the ``description`` column.
+    """
+    unsafe = str.maketrans({"\t": " ", "\r": " ", "\n": " "})
+
+    def _cell(value) -> str:
+        """Return *value* as text that cannot break the row/column layout."""
+        return str(value).translate(unsafe)
+
+    header_needed = not RESULTS_PATH.exists() or RESULTS_PATH.stat().st_size == 0
+    with open(RESULTS_PATH, "a", encoding="utf-8") as f:
+        if header_needed:
+            f.write("iteration\ttimestamp\tcomposite\tstatus\tdescription\tconfig_change\n")
+        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        f.write(
+            f"{iteration}\t{ts}\t{composite:.2f}\t{_cell(status)}"
+            f"\t{_cell(description)}\t{_cell(config_summary)}\n"
+        )
+
+
+def print_banner(iteration: int, total: int):
+    """Print a banner announcing iteration *iteration* out of *total*.
+
+    Output is a blank line, a 60-character rule, the title line, and a
+    second rule.
+    """
+    rule = '=' * 60
+    print(f"\n{rule}")
+    print(f"  OPTIMIZATION LOOP — Iteration {iteration}/{total}")
+    print(rule)
+
+
+def main():
+    """CLI entry point: show the run banner, ensure the config, print the procedure.
+
+    The function does not generate or score anything itself. It parses
+    --topic / --iterations / --verbose, makes sure writing-config.yaml
+    exists, prints the current config, and prints the steps an agent should
+    follow to run the optimization loop.
+    """
+    parser = argparse.ArgumentParser(description="WeWrite optimization loop")
+    parser.add_argument("--topic", required=True, help="Article topic for testing")
+    parser.add_argument("--iterations", type=int, default=10, help="Number of iterations")
+    parser.add_argument("--verbose", "-v", action="store_true")
+    args = parser.parse_args()
+
+    print(f"""
+╔══════════════════════════════════════════════════════╗
+║  WeWrite Optimization Loop                          ║
+║  Topic: {args.topic:<44s}║
+║  Iterations: {args.iterations:<39d}║
+║                                                      ║
+║  Pattern: change config → generate → score →         ║
+║           keep if better, rollback if worse           ║
+╚══════════════════════════════════════════════════════╝
+""")
+
+    # Loaded before the explanatory text so a "Created default config" notice comes first.
+    config = ensure_config()
+
+    preamble = (
+        "This script provides the FRAMEWORK for optimization.",
+        "To run the full loop, you need:",
+        "  1. An article generation function (Claude API)",
+        "  2. A scoring function (humanness_score.py — included)",
+        "  3. An LLM to propose config changes (Claude API)",
+        "",
+        "Current config:",
+    )
+    for line in preamble:
+        print(line)
+
+    print(yaml.dump(config, allow_unicode=True, default_flow_style=False))
+
+    procedure = (
+        "",
+        "Run this loop via Claude Code / OpenClaw agent:",
+        "",
+        "  Agent reads writing-config.yaml",
+        "  → generates article with those rules",
+        "  → scores with: python3 scripts/humanness_score.py article.md --json",
+        "  → proposes a config change",
+        "  → generates new article",
+        "  → scores again",
+        "  → if composite_score decreased → commit config change",
+        "  → if composite_score same/worse → rollback",
+        "  → logs to optimization-results.tsv",
+        "  → repeats",
+        "",
+        "To test scoring on an existing article:",
+        "  python3 scripts/humanness_score.py <article.md> --verbose",
+    )
+    for line in procedure:
+        print(line)
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()