新增优化循环框架:humanness_score.py + optimize_loop.py
借鉴 Karpathy autoresearch 的 change→score→keep/rollback 模式:
- humanness_score.py:固定打分器,两层评分(客观 checklist + 主观读者感)。6 项客观检查:禁用词/真实引用/破句/句长方差/段长方差/词汇温度;1 项主观 LLM 判官(stub,需配置 API);复合分 0-100(越低越像人)。
- optimize_loop.py:迭代框架,通过修改 writing-config.yaml 参数,自动生成文章→打分→保留或回滚→记录到 optimization-results.tsv。
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
59aa215f12
commit
8e16c70ead
2 changed files with 448 additions and 0 deletions
299
scripts/humanness_score.py
Normal file
299
scripts/humanness_score.py
Normal file
|
|
@ -0,0 +1,299 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fixed humanness scoring pipeline for WeWrite optimization loop.
|
||||
|
||||
Two-layer scoring inspired by autoresearch + the "objective checklist + subjective feel" pattern:
|
||||
|
||||
Layer 1: Objective checklist (yes/no, deterministic, won't drift)
|
||||
Layer 2: Subjective reader-feel (LLM judge, 1-10)
|
||||
|
||||
Composite = (1 - (Layer1 pass_rate * 0.6 + Layer2 normalized * 0.4)) * 100  (0 = perfect human, 100 = obvious AI)
|
||||
|
||||
DO NOT MODIFY this file during optimization. It is the fixed evaluation function.
|
||||
|
||||
Usage:
|
||||
python3 humanness_score.py article.md
|
||||
python3 humanness_score.py article.md --verbose
|
||||
python3 humanness_score.py article.md --json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Layer 1: Objective Checklist (deterministic yes/no)
|
||||
# ============================================================
|
||||
|
||||
# Phrases treated as AI tells by the banned-word check. Order matters only
# for which phrases are shown first in that check's failure detail.
BANNED_WORDS = [
    # ordering / wrap-up connectives
    "首先", "其次", "再者", "最后",
    "总之", "综上所述", "总而言之",
    # additive transitions
    "此外", "另外", "与此同时",
    "不仅如此", "更重要的是", "在此基础上",
    # assistant-voice framing
    "作为一个", "让我们", "值得注意的是", "需要指出的是", "不可否认",
    # appeals to the obvious
    "毋庸置疑", "众所周知", "事实上",
    "显而易见", "可以说", "从某种意义上说",
    # inflated importance
    "非常重要", "至关重要", "不言而喻", "具有重要意义", "发挥着重要作用",
    "意义深远", "影响深远", "引发了广泛关注", "引起了热烈讨论",
    # summary boilerplate
    "总的来说", "综合来看", "由此可见", "不难发现", "通过以上分析",
    "正如我们所看到的",
]
|
||||
|
||||
# Regexes whose matches are tallied as "real source" evidence: named people,
# attributed statements, cited reports, years, percentages, money amounts.
REAL_SOURCE_PATTERNS = [
    # Two capitalized Latin words in a row, e.g. an English personal name
    r"[A-Z][a-z]+\s+[A-Z][a-z]+",
    # 2-4 CJK characters directly followed by a reporting verb
    r"[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)",
    # "according to <X> report/data/study/survey"
    r"(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)",
    # A year 2010-2029 followed by 年
    r"20[12]\d\s*年",
    # A percentage, integer or decimal
    r"\d+(?:\.\d+)?%",
    # A 亿/万 magnitude followed by a currency word
    r"(?:亿|万)\s*(?:美元|元|人民币)",
]
|
||||
|
||||
|
||||
def check_no_banned_words(text: str) -> tuple[bool, str]:
    """Pass only when no phrase from BANNED_WORDS occurs in *text*.

    Returns a ``(passed, detail)`` pair. On failure the detail gives the
    number of distinct banned phrases present and up to five of them.
    """
    hits = []
    for phrase in BANNED_WORDS:
        if phrase in text:
            hits.append(phrase)
    if not hits:
        return True, "0 banned words"
    return False, f"Found {len(hits)} banned words: {hits[:5]}"
|
||||
|
||||
|
||||
def check_real_sources(text: str) -> tuple[bool, str]:
    """Pass when *text* has at least three real-source indicators.

    Every match of every regex in REAL_SOURCE_PATTERNS counts once; the
    matches are summed across patterns. Returns ``(passed, detail)``.
    """
    hits = sum(len(re.findall(rx, text)) for rx in REAL_SOURCE_PATTERNS)
    if hits < 3:
        return False, f"Only {hits} real-source indicators (need ≥3)"
    return True, f"{hits} real-source indicators found"
|
||||
|
||||
|
||||
def check_broken_sentences(text: str) -> tuple[bool, str]:
    """Check: ≥3 broken/incomplete structures (dashes, ellipsis, self-corrections).

    Each non-empty line is scanned for interruption markers, and a very
    short line counts as one more broken structure. Both ASCII and
    fullwidth punctuation are recognised; fullwidth characters are written
    as ``\\uXXXX`` escapes so the patterns survive width normalisation.

    Returns ``(passed, detail)``.
    """
    inline_patterns = [
        r'——(?!.*[,\uff0c。!\uff01?\uff1f])',  # em-dash with no punctuation after it on the line
        r'\.{3,}|…',                            # ellipsis
        r'不对[,\uff0c]',                        # self-correction "不对,"
        r'算了',                                 # abandonment "算了"
    ]
    # Standalone sentence of at most 6 chars plus closing punctuation.
    short_sentence = r'^.{1,6}[。!\uff01?\uff1f]$'

    count = 0
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        count += sum(len(re.findall(p, line)) for p in inline_patterns)
        # A short line is one broken structure; do not count it once for the
        # regex and again for the length rule.
        if re.search(short_sentence, line):
            count += 1
        elif len(line) <= 10 and not line.startswith('#'):
            count += 1

    if count >= 3:
        return True, f"{count} broken/incomplete structures"
    return False, f"Only {count} broken structures (need ≥3)"
|
||||
|
||||
|
||||
def check_sentence_length_variance(text: str) -> tuple[bool, str]:
    """Check: sentence length standard deviation > 15 characters.

    AI text has suspiciously uniform sentence lengths.
    Human text varies wildly (3-char to 80-char sentences in the same paragraph).

    Sentences are split on 。, on ASCII and fullwidth ! and ?, and on
    newlines; fragments of one character or less are ignored. Fewer than
    five sentences fails the check. Returns ``(passed, detail)``.
    """
    # Fullwidth terminators are escaped so width normalisation of this file
    # cannot silently turn them into their ASCII forms.
    fragments = (s.strip() for s in re.split(r'[。!\uff01?\uff1f\n]', text))
    lengths = [len(s) for s in fragments if len(s) > 1]

    if len(lengths) < 5:
        return False, "Too few sentences to measure"

    mean = sum(lengths) / len(lengths)
    # Population (not sample) standard deviation.
    variance = sum((n - mean) ** 2 for n in lengths) / len(lengths)
    stddev = variance ** 0.5

    # Threshold: human text typically has stddev > 15 chars
    # AI text tends to be 8-12
    if stddev > 15:
        return True, f"Sentence length stddev = {stddev:.1f} (good variance)"
    return False, f"Sentence length stddev = {stddev:.1f} (too uniform, need >15)"
|
||||
|
||||
|
||||
def check_paragraph_length_variance(text: str) -> tuple[bool, str]:
    """Pass when at most one pair of adjacent paragraphs is similar in length.

    Paragraphs are the blank-line-separated chunks of *text*; empty chunks
    and chunks starting with ``#`` are skipped. Two neighbours are similar
    when their lengths differ by 20 characters or less. With fewer than
    three paragraphs the check passes. Returns ``(passed, detail)``.
    """
    stripped = (chunk.strip() for chunk in text.split('\n\n'))
    sizes = [len(p) for p in stripped if p and not p.startswith('#')]
    if len(sizes) < 3:
        return True, "Too few paragraphs to check"

    similar_pairs = sum(
        1 for left, right in zip(sizes, sizes[1:]) if abs(left - right) <= 20
    )

    if similar_pairs > 1:
        return False, f"{similar_pairs} consecutive similar-length pairs (too uniform)"
    return True, f"{similar_pairs} consecutive similar-length pairs (OK)"
|
||||
|
||||
|
||||
def check_word_temperature_mix(text: str) -> tuple[bool, str]:
    """Pass when *text* has vocabulary from at least three of four registers.

    The registers are cold, warm, hot and wild; a register counts as
    present when any of its marker words is a substring of *text*.
    Returns ``(passed, detail)``.
    """
    registers = (
        # cold
        ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"],
        # warm
        ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"],
        # hot
        ["DNA动了", "格局打开", "遥遥领先", "卷", "内卷", "炸了", "杀疯了", "吃灰"],
        # wild
        ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯"],
    )

    present = sum(
        1 for markers in registers if any(word in text for word in markers)
    )

    if present < 3:
        return False, f"Only {present}/4 temperature types (need ≥3)"
    return True, f"{present}/4 temperature types found"
|
||||
|
||||
|
||||
def run_layer1(text: str) -> dict:
    """Run every Layer 1 check on *text* and collect the outcomes.

    The result maps each check name to ``{"passed": ..., "detail": ...}``
    and adds a ``"_summary"`` entry with the number of checks passed, the
    number of checks, and their ratio rounded to four decimals.
    """
    checks = (
        ("no_banned_words", check_no_banned_words),
        ("real_sources", check_real_sources),
        ("broken_sentences", check_broken_sentences),
        ("sentence_length_variance", check_sentence_length_variance),
        ("paragraph_length_variance", check_paragraph_length_variance),
        ("word_temperature_mix", check_word_temperature_mix),
    )

    report = {}
    for label, check in checks:
        verdict, note = check(text)
        report[label] = {"passed": verdict, "detail": note}

    n_checks = len(checks)
    n_passed = sum(1 for entry in report.values() if entry["passed"])

    report["_summary"] = {
        "passed": n_passed,
        "total": n_checks,
        "pass_rate": round(n_passed / n_checks, 4),
    }
    return report
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Layer 2: Subjective Reader-Feel (LLM judge)
|
||||
# ============================================================
|
||||
|
||||
JUDGE_PROMPT = """你是一个经验丰富的公众号读者。你对 AI 生成的内容非常敏感——你能凭直觉分辨出人写的和 AI 写的。
|
||||
|
||||
现在请你阅读下面这篇文章,然后回答一个问题:
|
||||
|
||||
**这篇文章读起来像人写的吗?**
|
||||
|
||||
评分标准(1-10):
|
||||
- 1-3:明显是 AI 写的。语言过于规整,没有个人色彩,像教科书。
|
||||
- 4-5:有一些 AI 痕迹。整体流畅但缺乏个人经历、情感起伏或出人意料的表达。
|
||||
- 6-7:大部分像人写的,偶尔有几句感觉"太完美了"。
|
||||
- 8-9:很像人写的。有个人风格、情感波动、不完美感,像一个真人编辑的作品。
|
||||
- 10:完全像人写的。如果不告诉我,我不会怀疑这是 AI 参与的。
|
||||
|
||||
请只输出一个 JSON:{"score": 数字, "reason": "一句话理由"}
|
||||
|
||||
---
|
||||
|
||||
文章内容:
|
||||
|
||||
{article}
|
||||
"""
|
||||
|
||||
|
||||
def run_layer2_stub(text: str) -> dict:
    """Stand-in for the Layer 2 LLM judge.

    Ignores *text* and always returns a mid-scale score of 5.0, flagged
    with ``"is_stub": True``. Replace with a real judge call to get a
    meaningful reader-feel score.
    """
    placeholder = dict(
        score=5.0,
        reason="(stub) LLM judge not configured — using default score",
        is_stub=True,
    )
    return placeholder
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Composite Score
|
||||
# ============================================================
|
||||
|
||||
def compute_composite(layer1: dict, layer2: dict) -> float:
    """Composite score: lower is better (like val_bpb in autoresearch).

    Inverted so that 0 = perfect human, 100 = obvious AI.

    Args:
        layer1: Layer 1 results; ``layer1["_summary"]["pass_rate"]`` is read.
        layer2: Layer 2 results; ``layer2["score"]`` is read as a 0-10 rating.

    Returns:
        ``(1 - (pass_rate * 0.6 + score / 10 * 0.4)) * 100`` rounded to two
        decimals.
    """
    l1_pass_rate = layer1["_summary"]["pass_rate"]

    # The judge score comes from an LLM and may stray off the scale; clamp it
    # so the composite stays within the documented 0-100 range.
    l2_raw = float(layer2["score"])
    l2_score = max(0.0, min(10.0, l2_raw)) / 10.0  # normalize to 0-1

    # Composite: higher pass_rate and higher reader score = more human
    humanness = l1_pass_rate * 0.6 + l2_score * 0.4

    # Invert: 0 = perfect human, 100 = obvious AI
    return round((1 - humanness) * 100, 2)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Main
|
||||
# ============================================================
|
||||
|
||||
def score_article(text: str, verbose: bool = False) -> dict:
    """Score one article and return the full result dict.

    Markdown header lines are blanked out first, then both layers run on
    the remaining text and are combined into the composite score. With
    *verbose* set, a human-readable report is printed to stdout as well.

    The result has the keys ``composite_score``, ``layer1``, ``layer2``
    and ``char_count`` (length of the header-stripped text).
    """
    body = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip()

    objective = run_layer1(body)
    subjective = run_layer2_stub(body)
    score = compute_composite(objective, subjective)

    if verbose:
        rule = '=' * 60
        tally = objective['_summary']
        report = [
            f"\n{rule}",
            f"HUMANNESS SCORE: {score:.1f}/100 (lower = more human)",
            f"{rule}",
            f"\nLayer 1 — Objective Checklist ({tally['passed']}/{tally['total']})",
        ]
        for label, entry in objective.items():
            # Keys starting with "_" hold aggregates, not individual checks.
            if label.startswith('_'):
                continue
            mark = "✓" if entry["passed"] else "✗"
            report.append(f" {mark} {label}: {entry['detail']}")
        report.append(f"\nLayer 2 — Reader Feel: {subjective['score']}/10")
        report.append(f" {subjective['reason']}")
        report.append(f"\nComposite: {score:.1f} (0=完美人类, 100=明显AI)")
        print("\n".join(report))

    return {
        "composite_score": score,
        "layer1": objective,
        "layer2": subjective,
        "char_count": len(body),
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: score one Markdown article.

    Prints the bare composite score by default, the detailed report with
    ``--verbose``, or the full result as JSON with ``--json``. When both
    flags are given the report goes to stderr so stdout stays parseable
    JSON. An unreadable input file ends the program with a usage error.
    """
    parser = argparse.ArgumentParser(description="Score article humanness")
    parser.add_argument("input", help="Markdown article file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Detailed output")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    try:
        text = Path(args.input).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # parser.error() prints the message to stderr and exits with status 2.
        parser.error(f"cannot read {args.input}: {exc}")

    if args.json and args.verbose:
        # Local import: only this combination needs it.
        import contextlib

        with contextlib.redirect_stdout(sys.stderr):
            result = score_article(text, verbose=True)
    else:
        result = score_article(text, verbose=args.verbose)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif not args.verbose:
        print(f"{result['composite_score']:.1f}")


if __name__ == "__main__":
    main()
|
||||
149
scripts/optimize_loop.py
Normal file
149
scripts/optimize_loop.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
WeWrite Optimization Loop — autoresearch-style iterative improvement.
|
||||
|
||||
Inspired by Karpathy's autoresearch: change → score → keep/rollback → repeat.
|
||||
But instead of optimizing ML training code, we optimize WRITING RULES to
|
||||
produce articles that pass AI detection while maintaining quality.
|
||||
|
||||
The mutable surface: writing-config.yaml (style parameters + prompt rules)
|
||||
The fixed evaluation: humanness_score.py (objective checklist + subjective feel)
|
||||
The metric: composite_score (lower = more human, like val_bpb)
|
||||
|
||||
Usage:
|
||||
python3 optimize_loop.py --topic "AI Agent" --iterations 10
|
||||
python3 optimize_loop.py --topic "AI Agent" --iterations 5 --verbose
|
||||
|
||||
Architecture:
|
||||
1. Load current writing-config.yaml
|
||||
2. Generate article with current config
|
||||
3. Score with humanness_score.py
|
||||
4. LLM proposes a change to writing-config.yaml
|
||||
5. Generate article with new config
|
||||
6. Score again
|
||||
7. If improved → keep (commit). If not → rollback.
|
||||
8. Log to optimization-results.tsv
|
||||
9. Repeat.
|
||||
|
||||
Requirements:
|
||||
- ANTHROPIC_API_KEY in environment (for article generation + LLM judge)
|
||||
- writing-config.yaml in skill root (created on first run with defaults)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Skill root is the parent of the directory holding this script; the mutable
# config and the results log both live there.
SKILL_DIR = Path(__file__).parent.parent
CONFIG_PATH = SKILL_DIR.joinpath("writing-config.yaml")
RESULTS_PATH = SKILL_DIR.joinpath("optimization-results.tsv")

# Writing parameters written to writing-config.yaml when it does not exist.
DEFAULT_CONFIG = dict(
    persona="科技媒体资深编辑,写了八年公众号,对AI行业有深度认知",
    sentence_variance=0.7,
    broken_sentence_rate=0.04,
    idiom_density=0.15,
    # literary / casual / mixed / minimal
    filler_style="mixed",
    # structured / chaotic / wave
    paragraph_rhythm="chaotic",
    self_correction_rate=0.02,
    # never / every_500 / every_800 / every_1200
    # NOTE(review): the value carries a "_chars" suffix the option list lacks.
    tangent_frequency="every_800_chars",
    # low / medium / high
    real_data_density="high",
    # cold / warm / hot / balanced
    word_temperature_bias="warm",
    # flat / gradual / restrained_to_burst / volatile
    emotional_arc="restrained_to_burst",
    # scene / data / question / anecdote / cold_open
    opening_style="scene",
    # summary / open_question / image / abrupt
    closing_style="open_question",
    # 0=fully non-linear, 1=fully linear
    structure_linearity=0.3,
)
|
||||
|
||||
|
||||
def ensure_config():
    """Return the parsed writing-config.yaml, creating it first if absent.

    When CONFIG_PATH does not exist, DEFAULT_CONFIG is dumped to it as
    block-style YAML and a notice is printed. The file is then read back
    and parsed with ``yaml.safe_load``.
    """
    missing = not CONFIG_PATH.exists()
    if missing:
        with open(CONFIG_PATH, "w", encoding="utf-8") as handle:
            yaml.dump(
                DEFAULT_CONFIG,
                handle,
                allow_unicode=True,
                default_flow_style=False,
            )
        print(f"Created default config: {CONFIG_PATH}")

    raw = CONFIG_PATH.read_text(encoding="utf-8")
    return yaml.safe_load(raw)
|
||||
|
||||
|
||||
def score_article(article_path: str) -> dict:
    """Run humanness_score.py on an article and return its parsed result.

    The scorer runs as a child process under the interpreter executing
    this script, so it sees the same environment. If the scorer exits
    non-zero, or prints something that is not valid JSON, the problem is
    reported on stderr and a worst-case result is returned
    (``composite_score`` 100.0 plus an ``error`` message).
    """
    scorer = SKILL_DIR / "scripts" / "humanness_score.py"
    result = subprocess.run(
        # sys.executable can be empty in unusual embeddings; fall back then.
        [sys.executable or "python3", str(scorer), article_path, "--json"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"Scoring failed: {result.stderr}", file=sys.stderr)
        return {"composite_score": 100.0, "error": result.stderr}

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        # Treat unparseable output like a failed run rather than crashing.
        message = f"scorer produced invalid JSON: {exc}"
        print(f"Scoring failed: {message}", file=sys.stderr)
        return {"composite_score": 100.0, "error": message}
|
||||
|
||||
|
||||
def log_result(iteration: int, composite: float, config_summary: str, status: str, description: str):
    """Append one iteration's outcome to the TSV log at RESULTS_PATH.

    A header row is written first when the log is missing or empty. Tabs
    and line breaks inside the free-text fields are replaced by spaces so
    each call produces exactly one row with six columns.

    Args:
        iteration: Iteration number.
        composite: Composite score, written with two decimals.
        config_summary: Description of the config change (last column).
        status: Outcome label for the iteration.
        description: Free-text note on the iteration.
    """
    def _cell(value) -> str:
        # Keep the row on one line with a fixed column count.
        return str(value).replace("\t", " ").replace("\r", " ").replace("\n", " ")

    header_needed = not RESULTS_PATH.exists() or RESULTS_PATH.stat().st_size == 0
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    row = "\t".join([
        str(iteration),
        ts,
        f"{composite:.2f}",
        _cell(status),
        _cell(description),
        _cell(config_summary),
    ])

    with open(RESULTS_PATH, "a", encoding="utf-8") as f:
        if header_needed:
            f.write("iteration\ttimestamp\tcomposite\tstatus\tdescription\tconfig_change\n")
        f.write(row + "\n")
|
||||
|
||||
|
||||
def print_banner(iteration: int, total: int):
    """Print the per-iteration header: a blank line, a rule, the title, a rule."""
    rule = "=" * 60
    title = f" OPTIMIZATION LOOP — Iteration {iteration}/{total}"
    print("\n".join(["", rule, title, rule]))
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the optimization-loop framework.

    Parses ``--topic``, ``--iterations`` and ``--verbose``, prints a
    banner, makes sure writing-config.yaml exists, then prints the current
    config and how to drive the loop through an agent. This function does
    not generate or score articles itself.
    """
    parser = argparse.ArgumentParser(description="WeWrite optimization loop")
    parser.add_argument("--topic", required=True, help="Article topic for testing")
    parser.add_argument("--iterations", type=int, default=10, help="Number of iterations")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if args.iterations < 1:
        parser.error("--iterations must be at least 1")

    # Pad every banner row to one width so the right border lines up.
    # (Double-width characters in the topic can still shift it.)
    inner_width = 54
    rows = [
        "  WeWrite Optimization Loop",
        f"  Topic: {args.topic}",
        f"  Iterations: {args.iterations}",
        "",
        "  Pattern: change config → generate → score →",
        "           keep if better, rollback if worse",
    ]
    print()
    print("╔" + "═" * inner_width + "╗")
    for row in rows:
        print(f"║{row:<{inner_width}}║")
    print("╚" + "═" * inner_width + "╝")
    print()

    config = ensure_config()

    print("This script provides the FRAMEWORK for optimization.")
    print("To run the full loop, you need:")
    print(" 1. An article generation function (Claude API)")
    print(" 2. A scoring function (humanness_score.py — included)")
    print(" 3. An LLM to propose config changes (Claude API)")
    print()
    print("Current config:")
    print(yaml.dump(config, allow_unicode=True, default_flow_style=False))
    print()
    print("Run this loop via Claude Code / OpenClaw agent:")
    print()
    for step in (
        " Agent reads writing-config.yaml",
        " → generates article with those rules",
        " → scores with: python3 scripts/humanness_score.py article.md --json",
        " → proposes a config change",
        " → generates new article",
        " → scores again",
        " → if composite_score decreased → commit config change",
        " → if composite_score same/worse → rollback",
        " → logs to optimization-results.tsv",
        " → repeats",
    ):
        print(step)
    print()
    print("To test scoring on an existing article:")
    print(" python3 scripts/humanness_score.py <article.md> --verbose")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue