wewrite/dist/openclaw/scripts/humanness_score.py
wangzhuc e1a0d6ef47 新增 OpenClaw 兼容:build 脚本 + CI + 首次产物
- scripts/build_openclaw.py:SKILL.md 转换({skill_dir}→{baseDir}、WebSearch→web_search、移除 allowed-tools)
- .github/workflows/build-openclaw.yml:push to main 时自动构建 dist/openclaw/
- dist/openclaw/:首次构建产物入库,OpenClaw 用户可直接使用

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 13:00:07 +08:00

299 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fixed humanness scoring pipeline for WeWrite optimization loop.
Two-layer scoring inspired by autoresearch + the "objective checklist + subjective feel" pattern:
Layer 1: Objective checklist (yes/no, deterministic, won't drift)
Layer 2: Subjective reader-feel (LLM judge, 1-10)
Humanness = Layer1 pass_rate * 0.6 + Layer2 normalized * 0.4; Composite = (1 - Humanness) * 100 (0 = most human, 100 = obvious AI)
DO NOT MODIFY this file during optimization. It is the fixed evaluation function.
Usage:
python3 humanness_score.py article.md
python3 humanness_score.py article.md --verbose
python3 humanness_score.py article.md --json
"""
import argparse
import json
import re
import sys
from pathlib import Path
# ============================================================
# Layer 1: Objective Checklist (deterministic yes/no)
# ============================================================
# Phrases whose presence fails the banned-word check. Matching is plain
# substring containment, so each entry is matched anywhere in the article.
BANNED_WORDS = [
    "首先",
    "其次",
    "再者",
    "最后",
    "总之",
    "综上所述",
    "总而言之",
    "此外",
    "另外",
    "与此同时",
    "不仅如此",
    "更重要的是",
    "在此基础上",
    "作为一个",
    "让我们",
    "值得注意的是",
    "需要指出的是",
    "不可否认",
    "毋庸置疑",
    "众所周知",
    "事实上",
    "显而易见",
    "可以说",
    "从某种意义上说",
    "非常重要",
    "至关重要",
    "不言而喻",
    "具有重要意义",
    "发挥着重要作用",
    "意义深远",
    "影响深远",
    "引发了广泛关注",
    "引起了热烈讨论",
    "总的来说",
    "综合来看",
    "由此可见",
    "不难发现",
    "通过以上分析",
    "正如我们所看到的",
]
# Regexes whose matches are counted as evidence that the article cites
# concrete, real-world material (people, reports, dates, figures).
REAL_SOURCE_PATTERNS = [
    # Two consecutive capitalised Latin words, e.g. an English personal name
    r'[A-Z][a-z]+\s+[A-Z][a-z]+',
    # 2-4 CJK characters followed by a reporting verb ("X said/pointed out/...")
    r'[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)',
    # "According to <X> report/data/research/survey"
    r'(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)',
    # A specific year in the range 2010-2029 followed by 年
    r'20[12]\d\s*年',
    # A specific percentage, integer or decimal
    r'\d+(?:\.\d+)?%',
    # A monetary amount unit (亿/万 followed by a currency)
    r'(?:亿|万)\s*(?:美元|元|人民币)',
]
def check_no_banned_words(text: str) -> tuple[bool, str]:
    """Pass only when no entry of BANNED_WORDS occurs in the text.

    Args:
        text: Article text to scan (substring matching).

    Returns:
        A ``(passed, detail)`` pair. On failure the detail reports how many
        distinct banned entries were found and lists at most five of them.
    """
    hits = []
    for word in BANNED_WORDS:
        if word in text:
            hits.append(word)
    if not hits:
        return True, "0 banned words"
    return False, f"Found {len(hits)} banned words: {hits[:5]}"
def check_real_sources(text: str) -> tuple[bool, str]:
    """Pass when the text contains at least three real-source indicators.

    Every match of every regex in REAL_SOURCE_PATTERNS counts once; matches
    are summed across patterns, not de-duplicated.

    Args:
        text: Article text to scan.

    Returns:
        A ``(passed, detail)`` pair where the detail states the match count.
    """
    total = sum(len(re.findall(rx, text)) for rx in REAL_SOURCE_PATTERNS)
    if total < 3:
        return False, f"Only {total} real-source indicators (need ≥3)"
    return True, f"{total} real-source indicators found"
def check_broken_sentences(text: str) -> tuple[bool, str]:
    """Check: ≥3 broken/incomplete structures (dashes, ellipsis, self-corrections).

    The text is scanned line by line. Each non-empty line contributes one
    point per regex match below, plus one extra point when the stripped line
    is at most 10 characters long and is not a markdown header.

    Both ASCII and full-width punctuation are recognised; Chinese prose
    normally uses the full-width forms (，！？), which the ASCII-only classes
    silently missed.

    Args:
        text: Article text to scan.

    Returns:
        A ``(passed, detail)`` pair where the detail states the count.
    """
    patterns = [
        re.compile(r'——(?!.*[,，。!！?？])'),  # em-dash with no punctuation later on the line
        re.compile(r'\.{3,}|…'),               # ellipsis
        re.compile(r'不对[,，]'),               # self-correction "不对,"
        re.compile(r'算了'),                    # abandonment "算了"
        re.compile(r'^.{1,6}[。!！?？]$'),      # ultra-short sentence as a standalone line
    ]
    count = 0
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        count += sum(len(rx.findall(line)) for rx in patterns)
        # Ultra-short standalone paragraphs (1-10 chars) also count; a short
        # sentence line can therefore score twice, as in the original design.
        if 1 <= len(line) <= 10 and not line.startswith('#'):
            count += 1
    if count >= 3:
        return True, f"{count} broken/incomplete structures"
    return False, f"Only {count} broken structures (need ≥3)"
def check_sentence_length_variance(text: str) -> tuple[bool, str]:
    """Check: sentence length standard deviation > threshold.

    AI text has suspiciously uniform sentence lengths.
    Human text varies wildly (3-char to 80-char sentences in the same paragraph).

    Sentences are split on 。, newline, and both the ASCII and full-width
    forms of ! and ?. Fragments of one character or less are discarded. The
    population standard deviation of the remaining lengths must exceed 15.

    Args:
        text: Article text to measure.

    Returns:
        A ``(passed, detail)`` pair. Fewer than five sentences is a failure
        with the detail "Too few sentences to measure".
    """
    # Chinese prose ends sentences with full-width ！ and ？; without them in
    # the class, those sentences were merged into their neighbours.
    fragments = (part.strip() for part in re.split(r'[。!！?？\n]', text))
    sentences = [s for s in fragments if len(s) > 1]
    if len(sentences) < 5:
        return False, "Too few sentences to measure"
    lengths = [len(s) for s in sentences]
    mean = sum(lengths) / len(lengths)
    variance = sum((length - mean) ** 2 for length in lengths) / len(lengths)
    stddev = variance ** 0.5
    # Threshold: human text typically has stddev > 15 chars
    # AI text tends to be 8-12
    if stddev > 15:
        return True, f"Sentence length stddev = {stddev:.1f} (good variance)"
    return False, f"Sentence length stddev = {stddev:.1f} (too uniform, need >15)"
def check_paragraph_length_variance(text: str) -> tuple[bool, str]:
    """Pass when at most one adjacent paragraph pair has similar length.

    Paragraphs are the blank-line-separated chunks of the text, stripped,
    with empty chunks and chunks starting with ``#`` dropped. Two neighbours
    are "similar" when their character counts differ by 20 or less.

    Args:
        text: Article text to inspect.

    Returns:
        A ``(passed, detail)`` pair. Texts with fewer than three paragraphs
        pass automatically.
    """
    paragraphs = []
    for chunk in text.split('\n\n'):
        body = chunk.strip()
        if body and not body.startswith('#'):
            paragraphs.append(body)
    if len(paragraphs) < 3:
        return True, "Too few paragraphs to check"
    sizes = [len(p) for p in paragraphs]
    similar_pairs = sum(
        1 for left, right in zip(sizes, sizes[1:]) if abs(left - right) <= 20
    )
    if similar_pairs > 1:
        return False, f"{similar_pairs} consecutive similar-length pairs (too uniform)"
    return True, f"{similar_pairs} consecutive similar-length pairs (OK)"
def check_word_temperature_mix(text: str) -> tuple[bool, str]:
    """Check: mix of formal/colloquial/slang/wild vocabulary.

    The vocabulary is grouped into four "temperature" tiers. A tier counts as
    present when at least one of its words occurs in the text; the check
    passes when three or more tiers are present.

    Args:
        text: Article text to scan (substring matching).

    Returns:
        A ``(passed, detail)`` pair where the detail states how many of the
        four tiers were found.
    """
    # Empty entries are deliberately absent: `"" in text` is always True, so
    # an empty string in a tier made that tier count for every article.
    tiers = (
        ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"],  # cold
        ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"],      # warm
        ["DNA动了", "格局打开", "遥遥领先", "内卷", "炸了", "杀疯了", "吃灰"],          # hot
        ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄"],                              # wild
    )
    # The `w and` guard keeps the check safe if a blank entry is ever added.
    found_temps = sum(
        1 for words in tiers if any(w and w in text for w in words)
    )
    if found_temps >= 3:
        return True, f"{found_temps}/4 temperature types found"
    return False, f"Only {found_temps}/4 temperature types (need ≥3)"
def run_layer1(text: str) -> dict:
    """Run every Layer 1 check against the text and collect the outcomes.

    Args:
        text: Article text passed unchanged to each check function.

    Returns:
        A dict mapping each check name to ``{"passed": ..., "detail": ...}``,
        plus a ``"_summary"`` entry holding the number of passed checks, the
        total number of checks, and the pass rate rounded to four decimals.
    """
    checks = (
        ("no_banned_words", check_no_banned_words),
        ("real_sources", check_real_sources),
        ("broken_sentences", check_broken_sentences),
        ("sentence_length_variance", check_sentence_length_variance),
        ("paragraph_length_variance", check_paragraph_length_variance),
        ("word_temperature_mix", check_word_temperature_mix),
    )
    results = {}
    for label, check in checks:
        outcome, explanation = check(text)
        results[label] = {"passed": outcome, "detail": explanation}
    # Tally before the summary entry is added so only real checks are counted.
    passed = sum(1 for entry in results.values() if entry["passed"])
    total = len(checks)
    results["_summary"] = {
        "passed": passed,
        "total": total,
        "pass_rate": round(passed / total, 4),
    }
    return results
# ============================================================
# Layer 2: Subjective Reader-Feel (LLM judge)
# ============================================================
# Prompt template for the Layer 2 LLM judge. It asks the model to rate, on a
# 1-10 scale, how human-written the article reads and to answer with a single
# JSON object; `{article}` marks where the article text is to be inserted.
# NOTE(review): nothing in the visible code fills this template. The JSON
# example contains literal braces, so str.format() would raise on it;
# presumably the caller should substitute with str.replace("{article}", ...)
# — confirm against the real judge implementation.
JUDGE_PROMPT = """你是一个经验丰富的公众号读者。你对 AI 生成的内容非常敏感——你能凭直觉分辨出人写的和 AI 写的。
现在请你阅读下面这篇文章,然后回答一个问题:
**这篇文章读起来像人写的吗?**
评分标准1-10
- 1-3明显是 AI 写的。语言过于规整,没有个人色彩,像教科书。
- 4-5有一些 AI 痕迹。整体流畅但缺乏个人经历、情感起伏或出人意料的表达。
- 6-7大部分像人写的偶尔有几句感觉"太完美了"
- 8-9很像人写的。有个人风格、情感波动、不完美感像一个真人编辑的作品。
- 10完全像人写的。如果不告诉我我不会怀疑这是 AI 参与的。
请只输出一个 JSON{"score": 数字, "reason": "一句话理由"}
---
文章内容:
{article}
"""
def run_layer2_stub(text: str) -> dict:
    """Return a fixed placeholder verdict in place of a real LLM judgement.

    The argument is accepted for interface compatibility but not inspected.
    The returned score is the constant 5.0 and ``is_stub`` is True so callers
    can tell that no judge was actually consulted.

    Args:
        text: Article text (unused by the stub).

    Returns:
        A dict with ``score``, ``reason`` and ``is_stub`` keys.
    """
    placeholder = dict(
        score=5.0,
        reason="(stub) LLM judge not configured — using default score",
        is_stub=True,
    )
    return placeholder
# ============================================================
# Composite Score
# ============================================================
def compute_composite(layer1: dict, layer2: dict) -> float:
    """Combine both layers into one score where lower means more human.

    The Layer 1 pass rate (weight 0.6) and the Layer 2 score scaled from
    0-10 down to 0-1 (weight 0.4) form a humanness value; the result is its
    complement scaled to 0-100, so 0 = perfect human and 100 = obvious AI.

    Args:
        layer1: Layer 1 results; ``layer1["_summary"]["pass_rate"]`` is read.
        layer2: Layer 2 results; ``layer2["score"]`` is read.

    Returns:
        The composite score rounded to two decimals.
    """
    pass_rate = layer1["_summary"]["pass_rate"]
    reader_feel = layer2["score"] / 10.0  # bring the 0-10 rating onto 0-1
    humanness = pass_rate * 0.6 + reader_feel * 0.4
    ai_likeness = (1 - humanness) * 100  # flip so that lower is better
    return round(ai_likeness, 2)
# ============================================================
# Main
# ============================================================
def score_article(text: str, verbose: bool = False) -> dict:
    """Score an article and return the full results dict.

    Markdown header lines are blanked out first, then both layers are run on
    the cleaned text and combined into the composite score.

    Args:
        text: Raw markdown article.
        verbose: When True, also print a human-readable report to stdout.

    Returns:
        A dict with ``composite_score``, ``layer1``, ``layer2`` and
        ``char_count`` (length of the cleaned text).
    """
    # Strip markdown headers for scoring
    clean = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip()
    layer1 = run_layer1(clean)
    layer2 = run_layer2_stub(clean)
    composite = compute_composite(layer1, layer2)
    result = {
        "composite_score": composite,
        "layer1": layer1,
        "layer2": layer2,
        "char_count": len(clean),
    }
    if verbose:
        print(f"\n{'='*60}")
        print(f"HUMANNESS SCORE: {composite:.1f}/100 (lower = more human)")
        print(f"{'='*60}")
        print(f"\nLayer 1 — Objective Checklist ({layer1['_summary']['passed']}/{layer1['_summary']['total']})")
        for name, data in layer1.items():
            # Entries such as "_summary" are aggregates, not individual checks.
            if name.startswith('_'):
                continue
            # Both branches used to be the empty string, so the report could
            # not tell a passed check from a failed one.
            status = "[PASS]" if data["passed"] else "[FAIL]"
            print(f"  {status} {name}: {data['detail']}")
        print(f"\nLayer 2 — Reader Feel: {layer2['score']}/10")
        print(f"  {layer2['reason']}")
        print(f"\nComposite: {composite:.1f} (0=完美人类, 100=明显AI)")
    return result
def main():
    """Command-line entry point: score the given markdown file.

    With ``--json`` the full result dict is printed as JSON. Without
    ``--json`` and without ``--verbose`` only the composite score is printed;
    ``--verbose`` makes ``score_article`` print its detailed report.
    """
    cli = argparse.ArgumentParser(description="Score article humanness")
    cli.add_argument("input", help="Markdown article file")
    cli.add_argument("--verbose", "-v", action="store_true", help="Detailed output")
    cli.add_argument("--json", action="store_true", help="JSON output")
    options = cli.parse_args()

    article = Path(options.input).read_text(encoding="utf-8")
    result = score_article(article, verbose=options.verbose)

    if options.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return
    if not options.verbose:
        print(f"{result['composite_score']:.1f}")


if __name__ == "__main__":
    main()