wewrite/scripts/humanness_score.py
wangzhuc df72e51ea1 feat: rewrite humanness_score.py with continuous scoring and param mapping
- 11 checks across 2 tiers (6 statistical + 5 pattern), up from 6
- Continuous 0-1 scores instead of pass/fail booleans
- Each check maps to a writing-config parameter via param field
- New checks: negative emotion ratio, adverb density, vocabulary richness,
  sentence length range, self-correction patterns
- New --tier3 flag for agent to pass LLM structural analysis score
- param_scores in JSON output: flat param→score map for optimization
- Standalone mode redistributes weights (T1=62.5%, T2=37.5%)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 19:54:11 +08:00

449 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Humanness scoring for WeWrite articles.
Three-tier evaluation aligned with writing-guide.md's anti-AI checklist:
Tier 1 (Statistical, 50%): 6 checks measuring statistical properties
that AI detectors analyze (burstiness, distribution, variance).
Tier 2 (Pattern, 30%): 5 checks for specific linguistic patterns
(banned words, broken sentences, real sources).
Tier 3 (LLM, 20%): Semantic analysis done by the agent in SKILL.md
(style drift, density waves, coherence). Passed via --tier3 flag.
Each check outputs a continuous 0-1 score and maps to a writing-config
parameter, so the optimization loop knows which knob to turn.
Standalone mode (no --tier3): weights redistribute to T1=62.5%, T2=37.5%.
Usage:
python3 humanness_score.py article.md # single score
python3 humanness_score.py article.md --verbose # detailed report
python3 humanness_score.py article.md --json # full JSON
python3 humanness_score.py article.md --json --tier3 0.7 # with agent score
"""
import argparse
import json
import re
import sys
from pathlib import Path
# ============================================================
# Constants
# ============================================================
# Connective and filler phrases treated as AI tells. score_banned_words()
# does a plain substring test for each entry and deducts 0.2 from the score
# per distinct phrase found in the article.
BANNED_WORDS = [
    "首先", "其次", "再者", "最后", "总之", "综上所述", "总而言之",
    "此外", "另外", "与此同时", "不仅如此", "更重要的是", "在此基础上",
    "作为一个", "让我们", "值得注意的是", "需要指出的是", "不可否认",
    "毋庸置疑", "众所周知", "事实上", "显而易见", "可以说", "从某种意义上说",
    "非常重要", "至关重要", "不言而喻", "具有重要意义", "发挥着重要作用",
    "意义深远", "影响深远", "引发了广泛关注", "引起了热烈讨论",
    "总的来说", "综合来看", "由此可见", "不难发现", "通过以上分析",
    "正如我们所看到的",
]
# Regexes whose matches score_real_sources() counts as evidence of concrete,
# externally sourced material. Every match of every pattern adds one.
REAL_SOURCE_PATTERNS = [
    # Two consecutive capitalised Latin words (presumably a Western name).
    r'[A-Z][a-z]+\s+[A-Z][a-z]+',
    # 2-4 CJK characters directly followed by a reporting verb.
    r'[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)',
    # "according to ... report/data/study/survey" attribution.
    r'(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)',
    # A year in the range 2010-2029 followed by 年.
    r'20[12]\d\s*年',
    # A percentage figure, optionally with decimals.
    r'\d+(?:\.\d+)?%',
    # A magnitude unit (亿/万) followed by a currency word.
    r'(?:亿|万)\s*(?:美元|元|人民币)',
]
# Words and phrases that mark a sentence as carrying negative emotion.
# score_negative_emotion_ratio() flags a sentence when ANY entry is a
# substring of it, so an empty string must never appear here: "" is a
# substring of every sentence and would pin the ratio at 100%.
# The blank entries previously in this list (characters lost in transit)
# have been removed for that reason.
NEGATIVE_MARKERS = [
    "失望", "糟糕", "差劲", "崩溃", "吐槽",
    "焦虑", "担忧", "不满", "恶心", "可怕", "可悲", "可笑",
    "离谱", "尴尬", "无语",
    "太扯了", "说实话我很失望", "搞什么", "不靠谱", "受不了",
]
# Adverbs tracked by score_adverb_density(): every occurrence counts towards
# the per-100-character density, and the same list is used to detect
# consecutive sentences that both open with an adverb.
COMMON_ADVERBS = [
    "非常", "十分", "极其", "特别", "相当", "尤其", "格外",
    "更加", "越来越", "逐渐", "不断", "始终", "一直",
    "已经", "正在", "将要", "可能", "大概", "或许",
    "似乎", "显然", "明显", "确实", "果然", "居然",
    "竟然", "简直", "几乎", "完全", "绝对", "必然",
]
# Two of the four "word temperature" bands. A band counts as present when any
# of its entries occurs in the article (substring test); see
# score_vocabulary_richness() and score_word_temperature_mix().
COLD_WORDS = ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"]
WARM_WORDS = ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"]
# The "hot" and "wild" word-temperature bands. A band counts as present when
# any entry is a substring of the article, so empty strings are forbidden
# here: "" is contained in every text and would mark the band as always
# present. The blank entries that used to be in these lists were removed, and
# the entry damaged into U+FFFD residue was restored to the phrase 遥遥领先.
HOT_WORDS = ["DNA动了", "格局打开", "遥遥领先", "内卷", "炸了", "杀疯了", "吃灰"]
WILD_WORDS = ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄"]
# Regexes counted by score_self_correction(); every match adds one.
# Full-width punctuation is written as \uXXXX escapes (the `re` module
# resolves them) so the patterns survive tools that strip or normalise
# "ambiguous" Unicode characters:
#   \uff0c = full-width comma, \uff08 / \uff09 = full-width parentheses.
SELF_CORRECTION_PATTERNS = [
    r'不对[,\uff0c]', r'准确说', r'算了', r'说错了',
    r'其实不是', r'我记混了', r'应该说', r'更准确地说',
    # Chinese parenthetical insertion (>= 4 chars between the parentheses).
    # The previous form, '[^]{4,}', was an unterminated character set and
    # raised re.error as soon as it was used.
    r'\uff08[^\uff08\uff09]{4,}\uff09',
]
# Regexes counted per line by score_broken_sentences(); every match adds one.
# Both ASCII and full-width punctuation are recognised. The full-width forms
# are written as \uXXXX escapes (\uff0c comma, \uff01 exclamation mark,
# \uff1f question mark) so they survive Unicode-stripping tools.
BROKEN_SENTENCE_PATTERNS = [
    # A double em dash with no sentence punctuation after it on the line,
    # i.e. a thought that trails off.
    r'——(?!.*[,。!?\uff0c\uff01\uff1f])',
    # An ellipsis: three or more dots, or the single ellipsis character.
    r'\.{3,}|…',
    # Abrupt self-interruptions.
    r'不对[,\uff0c]',
    r'算了',
]
# ============================================================
# Helpers
# ============================================================
def _split_sentences(text):
"""Split text by Chinese sentence-ending punctuation."""
sentences = re.split(r'[。!?\n]', text)
return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 1]
def _split_paragraphs(text):
"""Split text into paragraphs, excluding headings."""
return [p.strip() for p in text.split('\n\n')
if p.strip() and not p.strip().startswith('#')]
def _make_result(score, detail, param=None):
"""Create a check result dict."""
r = {"score": round(max(0.0, min(1.0, score)), 4), "detail": detail}
if param is not None:
r["param"] = param
else:
r["param"] = None
return r
# ============================================================
# Tier 1: Statistical Checks (weight 50%)
# ============================================================
def score_sentence_length_stddev(text):
    """[1.1] Spread of sentence lengths, as a standard deviation. → sentence_variance

    Uses the population standard deviation of sentence lengths (in
    characters). The score grows linearly with it and saturates at 1.0 once
    it reaches 25. Texts with fewer than five sentences get a neutral 0.5.
    """
    param = "sentence_variance"
    sizes = [len(sentence) for sentence in _split_sentences(text)]
    n = len(sizes)
    if n < 5:
        return _make_result(0.5, "too few sentences to measure", param)

    avg = sum(sizes) / n
    stddev = (sum((size - avg) ** 2 for size in sizes) / n) ** 0.5
    return _make_result(
        min(1.0, stddev / 25.0),
        f"stddev={stddev:.1f} (target ≥15)",
        param,
    )
def score_sentence_length_range(text):
    """[1.1] Gap between the longest and shortest sentence. → sentence_variance

    Blends two sub-scores: 60% from the length range (saturating at a range
    of 40 characters) and 40% from how many very short lines (1-5 characters
    after stripping, headings excluded) the text has relative to one per 500
    characters. Texts with fewer than five sentences get a neutral 0.5.
    """
    param = "sentence_variance"
    sizes = [len(sentence) for sentence in _split_sentences(text)]
    if len(sizes) < 5:
        return _make_result(0.5, "too few sentences", param)

    rng = max(sizes) - min(sizes)
    range_score = min(1.0, rng / 40.0)

    # Count stand-alone short lines; a non-empty stripped line has length >= 1.
    short_paras = 0
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if stripped and len(stripped) <= 5 and not stripped.startswith('#'):
            short_paras += 1

    expected = max(1, len(text) / 500)
    short_score = min(1.0, short_paras / expected)

    score = range_score * 0.6 + short_score * 0.4
    detail = f"range={rng} (target ≥30), short_paras={short_paras}"
    return _make_result(score, detail, param)
def score_paragraph_length_variance(text):
    """[1.3] How often neighbouring paragraphs differ in length. → paragraph_rhythm

    Two consecutive paragraphs are "similar" when their lengths differ by at
    most 20 characters. The score is the share of neighbouring pairs that are
    NOT similar. Texts with fewer than three paragraphs get a neutral 0.5.
    """
    param = "paragraph_rhythm"
    sizes = [len(paragraph) for paragraph in _split_paragraphs(text)]
    if len(sizes) < 3:
        return _make_result(0.5, "too few paragraphs", param)

    # At least three paragraphs, so there are always two or more pairs.
    total_pairs = len(sizes) - 1
    similar = sum(1 for left, right in zip(sizes, sizes[1:]) if abs(left - right) <= 20)
    score = 1.0 - (similar / total_pairs)
    detail = f"{similar}/{total_pairs} consecutive similar-length pairs"
    return _make_result(score, detail, param)
def score_vocabulary_richness(text):
    """[1.2] Bigram variety plus a word-temperature bonus. → word_temperature_bias

    The type-token ratio is computed over bigrams formed from the sequence of
    CJK characters in the text (all other characters are skipped before
    pairing). It is normalised against 0.7 and contributes up to 0.7 of the
    score; each of the four temperature bands present adds a further 0.075.
    Texts with fewer than 20 CJK characters get a neutral 0.5.
    """
    param = "word_temperature_bias"
    cjk_chars = re.findall(r'[\u4e00-\u9fff]', text)
    if len(cjk_chars) < 20:
        return _make_result(0.5, "too few CJK characters", param)

    # With >= 20 characters the bigram list is never empty.
    bigrams = [first + second for first, second in zip(cjk_chars, cjk_chars[1:])]
    ttr = len(set(bigrams)) / len(bigrams)
    ttr_score = min(1.0, ttr / 0.7)

    # Temperature mix bonus: one point per band with at least one hit.
    bands = (COLD_WORDS, WARM_WORDS, HOT_WORDS, WILD_WORDS)
    found_temps = sum(1 for band in bands if any(word in text for word in band))
    temp_bonus = found_temps / 4.0 * 0.3

    score = min(1.0, ttr_score * 0.7 + temp_bonus)
    detail = f"bigram_ttr={ttr:.3f}, temps={found_temps}/4"
    return _make_result(score, detail, param)
def score_negative_emotion_ratio(text):
    """[1.4] Share of sentences carrying a negative marker. → emotional_arc

    A sentence counts as negative when any NEGATIVE_MARKERS entry occurs in
    it. The score rises linearly with the negative share and saturates at
    1.0 when a quarter of the sentences are negative. A text with no
    sentences gets a neutral 0.5.
    """
    param = "emotional_arc"
    sentences = _split_sentences(text)
    if not sentences:
        return _make_result(0.5, "no sentences", param)

    negative_count = 0
    for sentence in sentences:
        if any(marker in sentence for marker in NEGATIVE_MARKERS):
            negative_count += 1

    total = len(sentences)
    ratio = negative_count / total
    detail = f"negative={negative_count}/{total} ({ratio:.0%}, target ≥20%)"
    return _make_result(min(1.0, ratio / 0.25), detail, param)
def score_adverb_density(text):
    """[1.5] Penalise heavy adverb use. → adverb_max_per_100

    Starts from a perfect 1.0 and subtracts:
      * up to 0.5 when COMMON_ADVERBS occurrences exceed 3 per 100
        characters (0.1 per extra unit of density), and
      * 0.3 for every pair of adjacent sentences that both begin with one
        of those adverbs.
    Texts shorter than 50 characters get a neutral 0.5.
    """
    param = "adverb_max_per_100"
    char_count = len(text)
    if char_count < 50:
        return _make_result(0.5, "text too short", param)

    # Occurrences of every tracked adverb, relative to text length.
    total_adverbs = 0
    for adverb in COMMON_ADVERBS:
        total_adverbs += text.count(adverb)
    density = total_adverbs / char_count * 100

    # Flag each sentence once, then count adjacent flagged pairs.
    prefixes = tuple(COMMON_ADVERBS)
    opens_with_adverb = [sentence.startswith(prefixes) for sentence in _split_sentences(text)]
    consecutive_adverb_starts = sum(
        1
        for current, following in zip(opens_with_adverb, opens_with_adverb[1:])
        if current and following
    )

    score = 1.0
    if density > 3.0:
        score -= min(0.5, (density - 3.0) * 0.1)
    score -= consecutive_adverb_starts * 0.3

    detail = f"density={density:.1f}/100chars, consecutive_starts={consecutive_adverb_starts}"
    return _make_result(score, detail, param)
# ============================================================
# Tier 2: Pattern Checks (weight 30%)
# ============================================================
def score_banned_words(text):
    """[2.1] Penalise banned phrases. → null (hard rule, no config param)

    Each distinct BANNED_WORDS entry present in the text costs 0.2, so five
    or more distinct hits give a score of 0. The detail lists at most the
    first five hits.
    """
    hits = list(filter(lambda word: word in text, BANNED_WORDS))
    if hits:
        detail = f"{len(hits)} found: {hits[:5]}"
    else:
        detail = "0 banned words"
    return _make_result(max(0.0, 1.0 - len(hits) * 0.2), detail, None)
def score_broken_sentences(text):
    """[2.2] Reward broken or incomplete sentence structures. → broken_sentence_rate

    For every non-empty line, counts all BROKEN_SENTENCE_PATTERNS matches
    and adds one more when the line itself is at most 10 characters long and
    is not a heading. The total is compared with an expectation of 3 per 500
    characters (never below 3).
    """
    count = 0
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        count += sum(len(re.findall(pattern, stripped)) for pattern in BROKEN_SENTENCE_PATTERNS)
        # A very short stand-alone line is itself treated as a broken structure.
        if len(stripped) <= 10 and not stripped.startswith('#'):
            count += 1

    expected = max(3, len(text) / 500 * 3)
    detail = f"{count} broken structures (expected ≥{expected:.0f})"
    return _make_result(min(1.0, count / expected), detail, "broken_sentence_rate")
def score_real_sources(text):
    """[3.1] Count indicators of real external sources. → real_data_density

    Sums the matches of every REAL_SOURCE_PATTERNS regex over the whole text;
    five or more matches give a full score.
    """
    count = sum(len(re.findall(pattern, text)) for pattern in REAL_SOURCE_PATTERNS)
    detail = f"{count} real-source indicators (target ≥5)"
    return _make_result(min(1.0, count / 5.0), detail, "real_data_density")
def score_word_temperature_mix(text):
    """[1.2] Coverage of the four word-temperature bands. → word_temperature_bias

    A band (cold, warm, hot, wild) is covered when any of its words occurs in
    the text. Covering one band or none scores 0; each additional band adds a
    third, so all four bands score 1.
    """
    bands = (COLD_WORDS, WARM_WORDS, HOT_WORDS, WILD_WORDS)
    found_temps = 0
    for band in bands:
        if any(word in text for word in band):
            found_temps += 1
    score = max(0.0, (found_temps - 1) / 3.0)
    return _make_result(score, f"{found_temps}/4 temperature bands", "word_temperature_bias")
def score_self_correction(text):
    """[2.2] Count self-corrections and parenthetical asides. → self_correction_rate

    Sums the matches of every SELF_CORRECTION_PATTERNS regex over the whole
    text; three or more matches give a full score.
    """
    count = sum(len(re.findall(pattern, text)) for pattern in SELF_CORRECTION_PATTERNS)
    detail = f"{count} self-corrections/insertions (target ≥3)"
    return _make_result(min(1.0, count / 3.0), detail, "self_correction_rate")
# ============================================================
# Tier Runners
# ============================================================
# Tier 1 registry: (result key, check function) pairs. score_article() hands
# this list to run_tier(), which calls each function with the article text
# and stores its result under the given key, in this order.
TIER1_CHECKS = [
    ("sentence_length_stddev", score_sentence_length_stddev),
    ("sentence_length_range", score_sentence_length_range),
    ("paragraph_length_variance", score_paragraph_length_variance),
    ("vocabulary_richness", score_vocabulary_richness),
    ("negative_emotion_ratio", score_negative_emotion_ratio),
    ("adverb_density", score_adverb_density),
]
# Tier 2 registry: (result key, check function) pairs. score_article() hands
# this list to run_tier(), which calls each function with the article text
# and stores its result under the given key, in this order.
TIER2_CHECKS = [
    ("banned_words", score_banned_words),
    ("broken_sentences", score_broken_sentences),
    ("real_sources", score_real_sources),
    ("word_temperature_mix", score_word_temperature_mix),
    ("self_correction", score_self_correction),
]
def run_tier(checks, text):
    """Run every check of one tier against *text*.

    Args:
        checks: Sequence of (name, function) pairs; each function takes the
            text and returns a dict that has at least a "score" key.
        text: The article text handed to every check.

    Returns:
        A dict mapping each check name to its result, plus a "_summary"
        entry holding the number of checks, the mean score (0 when there are
        no checks) and the list of individual scores, rounded to 4 places.
    """
    outcomes = [(name, check(text)) for name, check in checks]
    scores = [outcome["score"] for _, outcome in outcomes]

    mean_score = round(sum(scores) / len(scores), 4) if scores else 0

    results = dict(outcomes)
    results["_summary"] = {
        "count": len(checks),
        "mean_score": mean_score,
        "scores": [round(value, 4) for value in scores],
    }
    return results
# ============================================================
# Composite Score
# ============================================================
def compute_composite(tier1, tier2, tier3_score=None):
    """Compute the composite score (0=human, 100=AI).

    With tier3: T1=50%, T2=30%, T3=20%
    Without: T1=62.5%, T2=37.5%

    Args:
        tier1: Tier 1 results; only ``tier1["_summary"]["mean_score"]`` is read.
        tier2: Tier 2 results; only ``tier2["_summary"]["mean_score"]`` is read.
        tier3_score: Optional tier 3 score. It is supplied from outside this
            module, so it is clamped to [0.0, 1.0] to keep the composite
            inside 0-100, the same way check scores are clamped.

    Returns:
        A ``(composite, weights)`` tuple: the composite rounded to two
        decimals, and the dict of tier weights that were applied.
    """
    t1_mean = tier1["_summary"]["mean_score"]
    t2_mean = tier2["_summary"]["mean_score"]

    if tier3_score is None:
        # Standalone mode: the tier 3 share is redistributed proportionally.
        weights = {"tier1": 0.625, "tier2": 0.375}
        humanness = t1_mean * 0.625 + t2_mean * 0.375
    else:
        bounded_t3 = max(0.0, min(1.0, tier3_score))
        weights = {"tier1": 0.50, "tier2": 0.30, "tier3": 0.20}
        humanness = t1_mean * 0.50 + t2_mean * 0.30 + bounded_t3 * 0.20

    # Checks measure humanness; the reported scale is inverted (high = AI).
    composite = round((1 - humanness) * 100, 2)
    return composite, weights
def build_param_scores(tier1, tier2):
    """Flatten both tiers into a param → score map for the optimizer.

    Entries whose key starts with "_" (such as "_summary") and checks whose
    "param" is missing or None are skipped. When several checks map to the
    same parameter, their scores are averaged. Values are rounded to four
    decimal places.
    """
    grouped = {}
    for tier in (tier1, tier2):
        for check_name, check in tier.items():
            if check_name.startswith("_"):
                continue
            param = check.get("param")
            if param is not None:
                grouped.setdefault(param, []).append(check["score"])

    averaged = {}
    for param, values in grouped.items():
        averaged[param] = round(sum(values) / len(values), 4)
    return averaged
# ============================================================
# Main API
# ============================================================
def score_article(text, verbose=False, tier3_score=None):
    """Score an article and return the full results dict.

    Args:
        text: Raw Markdown article text.
        verbose: When true, also print the human-readable report.
        tier3_score: Optional tier 3 score supplied by the caller; None
            selects standalone weighting.

    Returns:
        A dict with the keys "composite_score", "tier1", "tier2", "tier3",
        "weights", "param_scores" and "char_count" (length of the text after
        heading removal).
    """
    # Blank out Markdown heading lines. The separator after the '#' run is
    # restricted to spaces/tabs: a bare '\s' can match the newline, which
    # made an empty heading ("#" or "# " alone on a line) swallow the content
    # line that follows it.
    clean = re.sub(r'^#+(?:[ \t].*)?$', '', text, flags=re.MULTILINE).strip()

    tier1 = run_tier(TIER1_CHECKS, clean)
    tier2 = run_tier(TIER2_CHECKS, clean)
    composite, weights = compute_composite(tier1, tier2, tier3_score)
    param_scores = build_param_scores(tier1, tier2)

    result = {
        "composite_score": composite,
        "tier1": tier1,
        "tier2": tier2,
        "tier3": {
            "score": tier3_score,
            "source": "agent" if tier3_score is not None else "not_available",
        },
        "weights": weights,
        "param_scores": param_scores,
        "char_count": len(clean),
    }
    if verbose:
        _print_verbose(result)
    return result
def _print_verbose(result):
"""Print a human-readable report."""
composite = result["composite_score"]
print(f"\n{'=' * 60}")
print(f"HUMANNESS SCORE: {composite:.1f}/100 (lower = more human)")
print(f"{'=' * 60}")
for tier_name, tier_label, weight in [
("tier1", "Tier 1 — Statistical", result["weights"].get("tier1", 0)),
("tier2", "Tier 2 — Pattern", result["weights"].get("tier2", 0)),
]:
tier = result[tier_name]
summary = tier["_summary"]
print(f"\n{tier_label} (weight {weight:.0%}, mean {summary['mean_score']:.2f})")
for name, data in tier.items():
if name.startswith("_"):
continue
bar = "" * int(data["score"] * 10) + "" * (10 - int(data["score"] * 10))
param_tag = f" [{data['param']}]" if data.get("param") else ""
print(f" {bar} {data['score']:.2f} {name}{param_tag}")
print(f" {data['detail']}")
t3 = result["tier3"]
if t3["score"] is not None:
t3_weight = result["weights"].get("tier3", 0)
print(f"\nTier 3 — LLM (weight {t3_weight:.0%})")
print(f" Score: {t3['score']:.2f} (source: {t3['source']})")
else:
print(f"\nTier 3 — LLM: not available (standalone mode)")
print(f"\nComposite: {composite:.1f} (0=完美人类, 100=明显AI)")
print(f"Weights: {result['weights']}")
param_scores = result["param_scores"]
if param_scores:
sorted_params = sorted(param_scores.items(), key=lambda x: x[1])
print(f"\nLowest-scoring parameters (optimize these first):")
for param, score in sorted_params[:3]:
print(f" {param}: {score:.2f}")
def main():
    """Command-line entry point.

    Reads the Markdown file named on the command line (UTF-8), scores it and
    prints the full JSON result (--json), the detailed report (--verbose),
    or just the composite score when neither flag is given.
    """
    parser = argparse.ArgumentParser(description="Score article humanness (0=human, 100=AI)")
    parser.add_argument("input", help="Markdown article file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Detailed report")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--tier3", type=float, default=None,
                        help="Tier 3 LLM score (0-1), passed by agent from SKILL.md")
    args = parser.parse_args()

    # --tier3 is documented as a 0-1 score; reject anything else (NaN fails
    # the chained comparison too) instead of producing a composite outside
    # the 0-100 scale.
    if args.tier3 is not None and not 0.0 <= args.tier3 <= 1.0:
        parser.error("--tier3 must be a number between 0 and 1")

    text = Path(args.input).read_text(encoding="utf-8")
    result = score_article(text, verbose=args.verbose, tier3_score=args.tier3)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif not args.verbose:
        print(f"{result['composite_score']:.1f}")


if __name__ == "__main__":
    main()