chore: rebuild dist/openclaw from source

2026-03-30 11:59:47 +00:00 · 2026-03-30 11:59:47 +00:00 · ed73813d03
commit ed73813d03
parent bd85fc5c6b
6 changed files with 378 additions and 350 deletions
--- a/dist/openclaw/SKILL.md
+++ b/dist/openclaw/SKILL.md
@ -44,6 +44,20 @@ description: |
     - 读取 `writing-config.yaml`（如存在），检查是否有 AI 特征参数（emotional_arc: flat、paragraph_rhythm: structured、closing_style: summary）
     - 读取 `history.yaml` 最近 5 篇，检查 persona 使用和 web_search 降级情况
  4. 综合输出自然语言报告 + 按优先级排序的改进建议
 - 用户说"优化写作参数"/"优化参数"/"跑优化" → 执行以下流程：
  1. 读取 `{baseDir}/writing-config.yaml`（不存在则从 `writing-config.example.yaml` 复制）
  2. 用户可指定迭代次数（默认 3），如"优化参数跑 5 轮"
  3. **迭代循环**（每轮）：
     a. 用当前 writing-config.yaml 参数写一篇 500 字测试短文（主题：用户指定或"AI Agent 行业观察"）
     b. 保存到 `{baseDir}/output/optimize-test.md`
     c. `python3 {baseDir}/scripts/humanness_score.py {baseDir}/output/optimize-test.md --json --tier3 {agent_tier3_score}`
     d. Agent 做 Tier 3 分析（读测试短文，评估风格漂移/密度波浪/连贯性打破/整体人感，输出 0-1 分数传入 --tier3）
     e. 解析 JSON 中 `param_scores`，找到得分最低的 1-2 个参数
     f. 调整 writing-config.yaml 中对应参数（方向：让该维度更"人类"）
     g. 记录本轮：迭代编号、composite_score、调整的参数、旧值→新值
  4. 循环结束后，保留 composite_score 最低（最人类）的 writing-config.yaml
  5. 输出优化报告：起始分 → 最终分，每轮调整，最终参数
  6. 提示："参数已优化。下次写文章时自动使用新参数。"
 - 用户说"更新"/"更新 WeWrite"/"升级" → 在 `{baseDir}` 执行 `git pull origin main`，完成后告知版本变化
 ---
@ -211,6 +225,19 @@ web_search: "{选题关键词} 数据 报告 2025 2026"
 不通过 → 定向重写该段落。3 次仍不过 → 标注跳过。
 **5b-2. 脚本验证**（补充逐项检查）：
 Agent 在 5b 逐项检查时同步完成 Tier 3 评估（风格漂移、密度波浪、连贯性打破、整体人感），产出 0-1 分数。
 ```bash
 python3 {baseDir}/scripts/humanness_score.py {article_path} --json --tier3 {agent_tier3_score}
 ```
 解读 JSON 中 `composite_score`：
 - < 30 → 通过，继续 Step 6
 - 30-50 → 查看 `param_scores` 中最低分项，定向重写对应段落
 - \> 50 → 重大问题，逐个低分项修复，最多 3 轮
 ---
 ### Step 6: 视觉 AI
@ -302,6 +329,7 @@ python3 {baseDir}/toolkit/cli.py preview {markdown} --theme {theme} --no-open -o
 | 学习我的修改 | `读取: {baseDir}/references/learn-edits.md` |
 | 做一个小绿书/图片帖 | `python3 {baseDir}/toolkit/cli.py image-post img1.jpg img2.jpg -t "标题"` |
 | 诊断配置 / 检查反AI / 为什么AI检测没过 | `python3 {baseDir}/scripts/diagnose.py --json` + LLM 交叉分析 |
 | 优化写作参数 / 优化参数 | 迭代循环：写测试短文 → 打分 → 调参（见辅助功能） |
 ---
--- a/dist/openclaw/VERSION
+++ b/dist/openclaw/VERSION
@ -1 +1 @@
-1.2.0
+1.3.0
--- a/dist/openclaw/scripts/diagnose.py
+++ b/dist/openclaw/scripts/diagnose.py
@ -157,7 +157,7 @@ def check_enhancements():
    else:
        checks.append(make_check(
            "enhancement", "writing_config", "warn",
-            "not found → using defaults (run optimize_loop.py to tune)",
+            "not found → using defaults (say '优化参数' to tune)",
        ))
    # playbook.md
@ -240,7 +240,7 @@ def compute_summary(checks):
        elif name == "playbook":
            recs.append('Edit a generated article, then say "学习我的修改" to build playbook.md')
        elif name == "writing_config":
-            recs.append('Run: python3 scripts/optimize_loop.py --topic "your topic" --iterations 10')
+            recs.append('Say "优化参数" to run the optimization loop')
        elif name == "history_articles":
            recs.append("Generate your first article to start building history")
        elif name == "dimension_variance":
--- a/dist/openclaw/scripts/humanness_score.py
+++ b/dist/openclaw/scripts/humanness_score.py
@ -1,20 +1,26 @@
 #!/usr/bin/env python3
 """
-Fixed humanness scoring pipeline for WeWrite optimization loop.
+Humanness scoring for WeWrite articles.
-Two-layer scoring inspired by autoresearch + the "objective checklist + subjective feel" pattern:
+Three-tier evaluation aligned with writing-guide.md's anti-AI checklist:
-Layer 1: Objective checklist (yes/no, deterministic, won't drift)
+  Tier 1 (Statistical, 50%): 6 checks measuring statistical properties
-Layer 2: Subjective reader-feel (LLM judge, 1-10)
+         that AI detectors analyze (burstiness, distribution, variance).
  Tier 2 (Pattern, 30%):     5 checks for specific linguistic patterns
         (banned words, broken sentences, real sources).
  Tier 3 (LLM, 20%):        Semantic analysis done by the agent in SKILL.md
         (style drift, density waves, coherence). Passed via --tier3 flag.
-Composite = Layer1 pass_rate * 0.6 + Layer2 normalized * 0.4
+Each check outputs a continuous 0-1 score and maps to a writing-config
 parameter, so the optimization loop knows which knob to turn.
-DO NOT MODIFY this file during optimization. It is the fixed evaluation function.
+Standalone mode (no --tier3): weights redistribute to T1=62.5%, T2=37.5%.
 Usage:
-    python3 humanness_score.py article.md
+    python3 humanness_score.py article.md                    # single score
-    python3 humanness_score.py article.md --verbose
+    python3 humanness_score.py article.md --verbose          # detailed report
-    python3 humanness_score.py article.md --json
+    python3 humanness_score.py article.md --json             # full JSON
    python3 humanness_score.py article.md --json --tier3 0.7 # with agent score
 """
 import argparse
@ -25,7 +31,7 @@ from pathlib import Path
 # ============================================================
-# Layer 1: Objective Checklist (deterministic yes/no)
+# Constants
 # ============================================================
 BANNED_WORDS = [
@ -39,255 +45,399 @@ BANNED_WORDS = [
    "正如我们所看到的",
 ]
 # Real-source indicators: named people, organizations, specific publications
 REAL_SOURCE_PATTERNS = [
-    r'[A-Z][a-z]+\s+[A-Z][a-z]+',  # Named person (English)
+    r'[A-Z][a-z]+\s+[A-Z][a-z]+',
-    r'[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)',  # Chinese name + said
+    r'[\u4e00-\u9fff]{2,4}(?:表示|指出|认为|写道|提到|说过)',
-    r'(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)',  # "according to X report"
+    r'(?:据|根据|来自)\s*[\u4e00-\u9fff]+(?:报告|数据|研究|调查)',
-    r'20[12]\d\s*年',  # Specific year reference
+    r'20[12]\d\s*年',
-    r'\d+(?:\.\d+)?%',  # Specific percentage
+    r'\d+(?:\.\d+)?%',
-    r'(?:亿|万)\s*(?:美元|元|人民币)',  # Specific monetary amount
+    r'(?:亿|万)\s*(?:美元|元|人民币)',
 ]
 NEGATIVE_MARKERS = [
    "失望", "糟糕", "扯", "坑", "烂", "差劲", "崩溃", "吐槽", "骂",
    "怒", "烦", "焦虑", "担忧", "不满", "恶心", "可怕", "可悲", "可笑",
    "离谱", "尴尬", "无语", "蠢", "惨", "亏", "危",
    "太扯了", "说实话我很失望", "搞什么", "不靠谱", "受不了",
 ]
 COMMON_ADVERBS = [
    "非常", "十分", "极其", "特别", "相当", "尤其", "格外",
    "更加", "越来越", "逐渐", "不断", "始终", "一直",
    "已经", "正在", "将要", "可能", "大概", "或许",
    "似乎", "显然", "明显", "确实", "果然", "居然",
    "竟然", "简直", "几乎", "完全", "绝对", "必然",
 ]
 COLD_WORDS = ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"]
 WARM_WORDS = ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"]
 HOT_WORDS = ["DNA动了", "格局打开", "遥遥<EFBFBD><EFBFBD>先", "卷", "内卷", "炸了", "杀疯了", "吃灰"]
 WILD_WORDS = ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯"]
 SELF_CORRECTION_PATTERNS = [
    r'不对[，,]', r'准确说', r'算了', r'说错了',
    r'其实不是', r'我记混了', r'应该说', r'更准确地说',
    r'（[^）]{4,}）',  # Chinese parenthetical insertion (≥4 chars)
 ]
 BROKEN_SENTENCE_PATTERNS = [
    r'——(?!.*[，。！？])',
    r'\.{3,}|…',
    r'不对[，,]',
    r'算了',
 ]
-def check_no_banned_words(text: str) -> tuple[bool, str]:
+# ============================================================
-    """Check: zero banned words."""
+# Helpers
 # ============================================================
 def _split_sentences(text):
    """Split text by Chinese sentence-ending punctuation."""
    sentences = re.split(r'[。！？\n]', text)
    return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 1]
 def _split_paragraphs(text):
    """Split text into paragraphs, excluding headings."""
    return [p.strip() for p in text.split('\n\n')
            if p.strip() and not p.strip().startswith('#')]
 def _make_result(score, detail, param=None):
    """Create a check result dict."""
    r = {"score": round(max(0.0, min(1.0, score)), 4), "detail": detail}
    if param is not None:
        r["param"] = param
    else:
        r["param"] = None
    return r
 # ============================================================
 # Tier 1: Statistical Checks (weight 50%)
 # ============================================================
 def score_sentence_length_stddev(text):
    """[1.1] Sentence length standard deviation. → sentence_variance"""
    sentences = _split_sentences(text)
    if len(sentences) < 5:
        return _make_result(0.5, "too few sentences to measure", "sentence_variance")
    lengths = [len(s) for s in sentences]
    mean = sum(lengths) / len(lengths)
    variance = sum((l - mean) ** 2 for l in lengths) / len(lengths)
    stddev = variance ** 0.5
    score = min(1.0, stddev / 25.0)
    return _make_result(score, f"stddev={stddev:.1f} (target ≥15)", "sentence_variance")
 def score_sentence_length_range(text):
    """[1.1] Sentence length range (max - min). → sentence_variance"""
    sentences = _split_sentences(text)
    if len(sentences) < 5:
        return _make_result(0.5, "too few sentences", "sentence_variance")
    lengths = [len(s) for s in sentences]
    rng = max(lengths) - min(lengths)
    range_score = min(1.0, rng / 40.0)
    # Check for single-sentence short paragraphs
    lines = text.split('\n')
    short_paras = sum(1 for l in lines if l.strip() and 1 <= len(l.strip()) <= 5
                      and not l.strip().startswith('#'))
    expected = max(1, len(text) / 500)
    short_score = min(1.0, short_paras / expected)
    score = range_score * 0.6 + short_score * 0.4
    return _make_result(score, f"range={rng} (target ≥30), short_paras={short_paras}", "sentence_variance")
 def score_paragraph_length_variance(text):
    """[1.3] Paragraph length variance. → paragraph_rhythm"""
    paragraphs = _split_paragraphs(text)
    if len(paragraphs) < 3:
        return _make_result(0.5, "too few paragraphs", "paragraph_rhythm")
    total_pairs = len(paragraphs) - 1
    similar = sum(1 for i in range(total_pairs)
                  if abs(len(paragraphs[i]) - len(paragraphs[i + 1])) <= 20)
    score = 1.0 - (similar / total_pairs) if total_pairs > 0 else 0.5
    return _make_result(score, f"{similar}/{total_pairs} consecutive similar-length pairs", "paragraph_rhythm")
 def score_vocabulary_richness(text):
    """[1.2] CJK bigram type-token ratio + temperature mix. → word_temperature_bias"""
    cjk_chars = re.findall(r'[\u4e00-\u9fff]', text)
    if len(cjk_chars) < 20:
        return _make_result(0.5, "too few CJK characters", "word_temperature_bias")
    bigrams = [cjk_chars[i] + cjk_chars[i + 1] for i in range(len(cjk_chars) - 1)]
    ttr = len(set(bigrams)) / len(bigrams) if bigrams else 0
    ttr_score = min(1.0, ttr / 0.7)
    # Temperature mix bonus
    found_temps = sum([
        any(w in text for w in COLD_WORDS),
        any(w in text for w in WARM_WORDS),
        any(w in text for w in HOT_WORDS),
        any(w in text for w in WILD_WORDS),
    ])
    temp_bonus = found_temps / 4.0 * 0.3
    score = min(1.0, ttr_score * 0.7 + temp_bonus)
    return _make_result(score, f"bigram_ttr={ttr:.3f}, temps={found_temps}/4", "word_temperature_bias")
 def score_negative_emotion_ratio(text):
    """[1.4] Negative emotion ratio. → emotional_arc"""
    sentences = _split_sentences(text)
    if not sentences:
        return _make_result(0.5, "no sentences", "emotional_arc")
    negative_count = sum(1 for s in sentences
                         if any(m in s for m in NEGATIVE_MARKERS))
    ratio = negative_count / len(sentences)
    score = min(1.0, ratio / 0.25)
    return _make_result(score, f"negative={negative_count}/{len(sentences)} ({ratio:.0%}, target ≥20%)", "emotional_arc")
 def score_adverb_density(text):
    """[1.5] Adverb density control. → adverb_max_per_100"""
    char_count = len(text)
    if char_count < 50:
        return _make_result(0.5, "text too short", "adverb_max_per_100")
    # Count adverb occurrences
    total_adverbs = sum(text.count(adv) for adv in COMMON_ADVERBS)
    density = total_adverbs / char_count * 100
    # Check consecutive sentences starting with adverbs
    sentences = _split_sentences(text)
    consecutive_adverb_starts = 0
    for i in range(len(sentences) - 1):
        a_starts = any(sentences[i].startswith(adv) for adv in COMMON_ADVERBS)
        b_starts = any(sentences[i + 1].startswith(adv) for adv in COMMON_ADVERBS)
        if a_starts and b_starts:
            consecutive_adverb_starts += 1
    score = 1.0
    if density > 3.0:
        score -= min(0.5, (density - 3.0) * 0.1)
    score -= consecutive_adverb_starts * 0.3
    return _make_result(score, f"density={density:.1f}/100chars, consecutive_starts={consecutive_adverb_starts}", "adverb_max_per_100")
 # ============================================================
 # Tier 2: Pattern Checks (weight 30%)
 # ============================================================
 def score_banned_words(text):
    """[2.1] Banned word check. → null (hard rule, no config param)"""
    found = [w for w in BANNED_WORDS if w in text]
-    if found:
+    score = max(0.0, 1.0 - len(found) * 0.2)
-        return False, f"Found {len(found)} banned words: {found[:5]}"
+    detail = "0 banned words" if not found else f"{len(found)} found: {found[:5]}"
-    return True, "0 banned words"
+    return _make_result(score, detail, None)
-def check_real_sources(text: str) -> tuple[bool, str]:
+def score_broken_sentences(text):
-    """Check: article references real external sources (≥3 instances)."""
+    """[2.2] Broken/incomplete sentence patterns. → broken_sentence_rate"""
    count = 0
    for pattern in REAL_SOURCE_PATTERNS:
        count += len(re.findall(pattern, text))
    if count >= 3:
        return True, f"{count} real-source indicators found"
    return False, f"Only {count} real-source indicators (need ≥3)"
 def check_broken_sentences(text: str) -> tuple[bool, str]:
    """Check: ≥3 broken/incomplete sentences (dashes, ellipsis, self-corrections)."""
    patterns = [
        r'——(?!.*[，。！？])',  # em-dash interruption without ending punct
        r'\.{3,}|…',  # ellipsis
        r'不对[，,]',  # self-correction "不对，"
        r'算了',  # abandonment "算了"
        r'^.{1,6}[。！？]$',  # ultra-short sentence (≤6 chars + punct) as standalone line
    ]
    count = 0
    lines = text.split('\n')
    for line in lines:
        line = line.strip()
        if not line:
            continue
-        for p in patterns:
+        for p in BROKEN_SENTENCE_PATTERNS:
            count += len(re.findall(p, line))
        # Check for ultra-short standalone paragraphs (1-10 chars)
        if 1 <= len(line) <= 10 and not line.startswith('#'):
            count += 1
-    if count >= 3:
+    char_count = len(text)
-        return True, f"{count} broken/incomplete structures"
+    expected = max(3, char_count / 500 * 3)
-    return False, f"Only {count} broken structures (need ≥3)"
+    score = min(1.0, count / expected)
    return _make_result(score, f"{count} broken structures (expected ≥{expected:.0f})", "broken_sentence_rate")
-def check_sentence_length_variance(text: str) -> tuple[bool, str]:
+def score_real_sources(text):
-    """Check: sentence length standard deviation > threshold.
+    """[3.1] Real external source indicators. → real_data_density"""
-
+    count = 0
-    AI text has suspiciously uniform sentence lengths.
+    for pattern in REAL_SOURCE_PATTERNS:
-    Human text varies wildly (3-char to 80-char sentences in the same paragraph).
+        count += len(re.findall(pattern, text))
-    """
+    score = min(1.0, count / 5.0)
-    # Split by Chinese sentence-ending punctuation
+    return _make_result(score, f"{count} real-source indicators (target ≥5)", "real_data_density")
    sentences = re.split(r'[。！？\n]', text)
    sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 1]
    if len(sentences) < 5:
        return False, "Too few sentences to measure"
    lengths = [len(s) for s in sentences]
    mean = sum(lengths) / len(lengths)
    variance = sum((l - mean) ** 2 for l in lengths) / len(lengths)
    stddev = variance ** 0.5
    # Threshold: human text typically has stddev > 15 chars
    # AI text tends to be 8-12
    if stddev > 15:
        return True, f"Sentence length stddev = {stddev:.1f} (good variance)"
    return False, f"Sentence length stddev = {stddev:.1f} (too uniform, need >15)"
-def check_paragraph_length_variance(text: str) -> tuple[bool, str]:
+def score_word_temperature_mix(text):
-    """Check: no consecutive paragraphs of similar length."""
+    """[1.2] Word temperature band coverage. → word_temperature_bias"""
-    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip() and not p.strip().startswith('#')]
+    found_temps = sum([
-    if len(paragraphs) < 3:
+        any(w in text for w in COLD_WORDS),
-        return True, "Too few paragraphs to check"
+        any(w in text for w in WARM_WORDS),
-
+        any(w in text for w in HOT_WORDS),
-    consecutive_similar = 0
+        any(w in text for w in WILD_WORDS),
-    for i in range(len(paragraphs) - 1):
+    ])
-        len_a = len(paragraphs[i])
+    score = max(0.0, (found_temps - 1) / 3.0)
-        len_b = len(paragraphs[i + 1])
+    return _make_result(score, f"{found_temps}/4 temperature bands", "word_temperature_bias")
        if abs(len_a - len_b) <= 20:
            consecutive_similar += 1
    if consecutive_similar <= 1:
        return True, f"{consecutive_similar} consecutive similar-length pairs (OK)"
    return False, f"{consecutive_similar} consecutive similar-length pairs (too uniform)"
-def check_word_temperature_mix(text: str) -> tuple[bool, str]:
+def score_self_correction(text):
-    """Check: mix of formal/colloquial/slang/wild vocabulary."""
+    """[2.2] Self-correction and parenthetical patterns. → self_correction_rate"""
-    cold = ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"]
+    count = 0
-    warm = ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"]
+    for pattern in SELF_CORRECTION_PATTERNS:
-    hot = ["DNA动了", "格局打开", "遥遥领先", "卷", "内卷", "炸了", "杀疯了", "吃灰"]
+        count += len(re.findall(pattern, text))
-    wild = ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯"]
+    score = min(1.0, count / 3.0)
-
+    return _make_result(score, f"{count} self-corrections/insertions (target ≥3)", "self_correction_rate")
    found_temps = 0
    if any(w in text for w in cold): found_temps += 1
    if any(w in text for w in warm): found_temps += 1
    if any(w in text for w in hot): found_temps += 1
    if any(w in text for w in wild): found_temps += 1
    if found_temps >= 3:
        return True, f"{found_temps}/4 temperature types found"
    return False, f"Only {found_temps}/4 temperature types (need ≥3)"
-def run_layer1(text: str) -> dict:
+# ============================================================
-    """Run all Layer 1 checks. Returns dict with results."""
+# Tier Runners
-    checks = [
+# ============================================================
        ("no_banned_words", check_no_banned_words),
        ("real_sources", check_real_sources),
        ("broken_sentences", check_broken_sentences),
        ("sentence_length_variance", check_sentence_length_variance),
        ("paragraph_length_variance", check_paragraph_length_variance),
        ("word_temperature_mix", check_word_temperature_mix),
    ]
 TIER1_CHECKS = [
    ("sentence_length_stddev", score_sentence_length_stddev),
    ("sentence_length_range", score_sentence_length_range),
    ("paragraph_length_variance", score_paragraph_length_variance),
    ("vocabulary_richness", score_vocabulary_richness),
    ("negative_emotion_ratio", score_negative_emotion_ratio),
    ("adverb_density", score_adverb_density),
 ]
 TIER2_CHECKS = [
    ("banned_words", score_banned_words),
    ("broken_sentences", score_broken_sentences),
    ("real_sources", score_real_sources),
    ("word_temperature_mix", score_word_temperature_mix),
    ("self_correction", score_self_correction),
 ]
 def run_tier(checks, text):
    """Run a tier of checks. Returns dict keyed by check name + _summary."""
    results = {}
-    passed = 0
+    scores = []
    total = len(checks)
    for name, fn in checks:
-        ok, detail = fn(text)
+        r = fn(text)
-        results[name] = {"passed": ok, "detail": detail}
+        results[name] = r
-        if ok:
+        scores.append(r["score"])
            passed += 1
    results["_summary"] = {
-        "passed": passed,
+        "count": len(checks),
-        "total": total,
+        "mean_score": round(sum(scores) / len(scores), 4) if scores else 0,
-        "pass_rate": round(passed / total, 4),
+        "scores": [round(s, 4) for s in scores],
    }
    return results
 # ============================================================
 # Layer 2: Subjective Reader-Feel (LLM judge)
 # ============================================================
 JUDGE_PROMPT = """你是一个经验丰富的公众号读者。你对 AI 生成的内容非常敏感——你能凭直觉分辨出人写的和 AI 写的。
 现在请你阅读下面这篇文章，然后回答一个问题：
 **这篇文章读起来像人写的吗？**
 评分标准（1-10）：
 - 1-3：明显是 AI 写的。语言过于规整，没有个人色彩，像教科书。
 - 4-5：有一些 AI 痕迹。整体流畅但缺乏个人经历、情感起伏或出人意料的表达。
 - 6-7：大部分像人写的，偶尔有几句感觉"太完美了"。
 - 8-9：很像人写的。有个人风格、情感波动、不完美感，像一个真人编辑的作品。
 - 10：完全像人写的。如果不告诉我，我不会怀疑这是 AI 参与的。
 请只输出一个 JSON：{"score": 数字, "reason": "一句话理由"}
 ---
 文章内容：
 {article}
 """
 def run_layer2_stub(text: str) -> dict:
    """Layer 2 stub — returns placeholder when no LLM API available.
    In production, this calls Claude/GPT to judge the article.
    For the optimization loop, replace this with actual API call.
    """
    return {
        "score": 5.0,
        "reason": "(stub) LLM judge not configured — using default score",
        "is_stub": True,
    }
 # ============================================================
 # Composite Score
 # ============================================================
-def compute_composite(layer1: dict, layer2: dict) -> float:
+def compute_composite(tier1, tier2, tier3_score=None):
-    """Composite score: lower is better (like val_bpb in autoresearch).
+    """Compute composite score (0=human, 100=AI).
-    Inverted so that 0 = perfect human, 100 = obvious AI.
+    With tier3: T1=50%, T2=30%, T3=20%
    Without:    T1=62.5%, T2=37.5%
    """
-    l1_pass_rate = layer1["_summary"]["pass_rate"]
+    t1_mean = tier1["_summary"]["mean_score"]
-    l2_score = layer2["score"] / 10.0  # normalize to 0-1
+    t2_mean = tier2["_summary"]["mean_score"]
-    # Composite: higher pass_rate and higher reader score = more human
+    if tier3_score is not None:
-    humanness = l1_pass_rate * 0.6 + l2_score * 0.4
+        humanness = t1_mean * 0.50 + t2_mean * 0.30 + tier3_score * 0.20
        weights = {"tier1": 0.50, "tier2": 0.30, "tier3": 0.20}
    else:
        humanness = t1_mean * 0.625 + t2_mean * 0.375
        weights = {"tier1": 0.625, "tier2": 0.375}
-    # Invert: 0 = perfect human, 100 = obvious AI
+    composite = round((1 - humanness) * 100, 2)
-    return round((1 - humanness) * 100, 2)
+    return composite, weights
 def build_param_scores(tier1, tier2):
    """Build flat param→score map for optimization. Averages if multiple checks map to same param."""
    param_map = {}
    for tier in [tier1, tier2]:
        for name, data in tier.items():
            if name.startswith("_"):
                continue
            param = data.get("param")
            if param is None:
                continue
            if param not in param_map:
                param_map[param] = []
            param_map[param].append(data["score"])
    return {p: round(sum(scores) / len(scores), 4) for p, scores in param_map.items()}
 # ============================================================
-# Main
+# Main API
 # ============================================================
-def score_article(text: str, verbose: bool = False) -> dict:
+def score_article(text, verbose=False, tier3_score=None):
    """Score an article. Returns full results dict."""
    # Strip markdown headers for scoring
    clean = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip()
-    layer1 = run_layer1(clean)
+    tier1 = run_tier(TIER1_CHECKS, clean)
-    layer2 = run_layer2_stub(clean)
+    tier2 = run_tier(TIER2_CHECKS, clean)
-    composite = compute_composite(layer1, layer2)
+    composite, weights = compute_composite(tier1, tier2, tier3_score)
    param_scores = build_param_scores(tier1, tier2)
    result = {
        "composite_score": composite,
-        "layer1": layer1,
+        "tier1": tier1,
-        "layer2": layer2,
+        "tier2": tier2,
        "tier3": {
            "score": tier3_score,
            "source": "agent" if tier3_score is not None else "not_available",
        },
        "weights": weights,
        "param_scores": param_scores,
        "char_count": len(clean),
    }
    if verbose:
-        print(f"\n{'='*60}")
+        _print_verbose(result)
        print(f"HUMANNESS SCORE: {composite:.1f}/100 (lower = more human)")
        print(f"{'='*60}")
        print(f"\nLayer 1 — Objective Checklist ({layer1['_summary']['passed']}/{layer1['_summary']['total']})")
        for name, data in layer1.items():
            if name.startswith('_'):
                continue
            status = "✓" if data["passed"] else "✗"
            print(f"  {status} {name}: {data['detail']}")
        print(f"\nLayer 2 — Reader Feel: {layer2['score']}/10")
        print(f"  {layer2['reason']}")
        print(f"\nComposite: {composite:.1f} (0=完美人类, 100=明显AI)")
    return result
 def _print_verbose(result):
    """Print a human-readable report."""
    composite = result["composite_score"]
    print(f"\n{'=' * 60}")
    print(f"HUMANNESS SCORE: {composite:.1f}/100 (lower = more human)")
    print(f"{'=' * 60}")
    for tier_name, tier_label, weight in [
        ("tier1", "Tier 1 — Statistical", result["weights"].get("tier1", 0)),
        ("tier2", "Tier 2 — Pattern", result["weights"].get("tier2", 0)),
    ]:
        tier = result[tier_name]
        summary = tier["_summary"]
        print(f"\n{tier_label} (weight {weight:.0%}, mean {summary['mean_score']:.2f})")
        for name, data in tier.items():
            if name.startswith("_"):
                continue
            bar = "█" * int(data["score"] * 10) + "░" * (10 - int(data["score"] * 10))
            param_tag = f" [{data['param']}]" if data.get("param") else ""
            print(f"  {bar} {data['score']:.2f}  {name}{param_tag}")
            print(f"         {data['detail']}")
    t3 = result["tier3"]
    if t3["score"] is not None:
        t3_weight = result["weights"].get("tier3", 0)
        print(f"\nTier 3 — LLM (weight {t3_weight:.0%})")
        print(f"  Score: {t3['score']:.2f} (source: {t3['source']})")
    else:
        print(f"\nTier 3 — LLM: not available (standalone mode)")
    print(f"\nComposite: {composite:.1f} (0=完美人类, 100=明显AI)")
    print(f"Weights: {result['weights']}")
    param_scores = result["param_scores"]
    if param_scores:
        sorted_params = sorted(param_scores.items(), key=lambda x: x[1])
        print(f"\nLowest-scoring parameters (optimize these first):")
        for param, score in sorted_params[:3]:
            print(f"  {param}: {score:.2f}")
 def main():
-    parser = argparse.ArgumentParser(description="Score article humanness")
+    parser = argparse.ArgumentParser(description="Score article humanness (0=human, 100=AI)")
    parser.add_argument("input", help="Markdown article file")
-    parser.add_argument("--verbose", "-v", action="store_true", help="Detailed output")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Detailed report")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--tier3", type=float, default=None,
                        help="Tier 3 LLM score (0-1), passed by agent from SKILL.md")
    args = parser.parse_args()
    text = Path(args.input).read_text(encoding="utf-8")
-    result = score_article(text, verbose=args.verbose)
+    result = score_article(text, verbose=args.verbose, tier3_score=args.tier3)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
--- a/dist/openclaw/scripts/optimize_loop.py
+++ b/dist/openclaw/scripts/optimize_loop.py
@ -1,149 +0,0 @@
 #!/usr/bin/env python3
 """
 WeWrite Optimization Loop — autoresearch-style iterative improvement.
 Inspired by Karpathy's autoresearch: change → score → keep/rollback → repeat.
 But instead of optimizing ML training code, we optimize WRITING RULES to
 produce articles that pass AI detection while maintaining quality.
 The mutable surface: writing-config.yaml (style parameters + prompt rules)
 The fixed evaluation: humanness_score.py (objective checklist + subjective feel)
 The metric: composite_score (lower = more human, like val_bpb)
 Usage:
    python3 optimize_loop.py --topic "AI Agent" --iterations 10
    python3 optimize_loop.py --topic "AI Agent" --iterations 5 --verbose
 Architecture:
    1. Load current writing-config.yaml
    2. Generate article with current config
    3. Score with humanness_score.py
    4. LLM proposes a change to writing-config.yaml
    5. Generate article with new config
    6. Score again
    7. If improved → keep (commit). If not → rollback.
    8. Log to results.tsv
    9. Repeat.
 Requirements:
    - ANTHROPIC_API_KEY in environment (for article generation + LLM judge)
    - writing-config.yaml in skill root (created on first run with defaults)
 """
 import argparse
 import json
 import os
 import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path
 import yaml
 SKILL_DIR = Path(__file__).parent.parent
 CONFIG_PATH = SKILL_DIR / "writing-config.yaml"
 RESULTS_PATH = SKILL_DIR / "optimization-results.tsv"
 DEFAULT_CONFIG = {
    "persona": "科技媒体资深编辑，写了八年公众号，对AI行业有深度认知",
    "sentence_variance": 0.7,
    "broken_sentence_rate": 0.04,
    "idiom_density": 0.15,
    "filler_style": "mixed",  # literary / casual / mixed / minimal
    "paragraph_rhythm": "chaotic",  # structured / chaotic / wave
    "self_correction_rate": 0.02,
    "tangent_frequency": "every_800_chars",  # never / every_500 / every_800 / every_1200
    "real_data_density": "high",  # low / medium / high
    "word_temperature_bias": "warm",  # cold / warm / hot / balanced
    "emotional_arc": "restrained_to_burst",  # flat / gradual / restrained_to_burst / volatile
    "opening_style": "scene",  # scene / data / question / anecdote / cold_open
    "closing_style": "open_question",  # summary / open_question / image / abrupt
    "structure_linearity": 0.3,  # 0=fully non-linear, 1=fully linear
 }
 def ensure_config():
    """Create default writing-config.yaml if it doesn't exist."""
    if not CONFIG_PATH.exists():
        with open(CONFIG_PATH, "w", encoding="utf-8") as f:
            yaml.dump(DEFAULT_CONFIG, f, allow_unicode=True, default_flow_style=False)
        print(f"Created default config: {CONFIG_PATH}")
    return yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
 def score_article(article_path: str) -> dict:
    """Run humanness_score.py on an article. Returns parsed result."""
    result = subprocess.run(
        ["python3", str(SKILL_DIR / "scripts" / "humanness_score.py"), article_path, "--json"],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"Scoring failed: {result.stderr}", file=sys.stderr)
        return {"composite_score": 100.0, "error": result.stderr}
    return json.loads(result.stdout)
 def log_result(iteration: int, composite: float, config_summary: str, status: str, description: str):
    """Append result to TSV log."""
    header_needed = not RESULTS_PATH.exists()
    with open(RESULTS_PATH, "a", encoding="utf-8") as f:
        if header_needed:
            f.write("iteration\ttimestamp\tcomposite\tstatus\tdescription\tconfig_change\n")
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        f.write(f"{iteration}\t{ts}\t{composite:.2f}\t{status}\t{description}\t{config_summary}\n")
 def print_banner(iteration: int, total: int):
    print(f"\n{'='*60}")
    print(f"  OPTIMIZATION LOOP — Iteration {iteration}/{total}")
    print(f"{'='*60}")
 def main():
    parser = argparse.ArgumentParser(description="WeWrite optimization loop")
    parser.add_argument("--topic", required=True, help="Article topic for testing")
    parser.add_argument("--iterations", type=int, default=10, help="Number of iterations")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()
    print(f"""
 ╔══════════════════════════════════════════════════════╗
 ║  WeWrite Optimization Loop                          ║
 ║  Topic: {args.topic:<44s}║
 ║  Iterations: {args.iterations:<39d}║
 ║                                                      ║
 ║  Pattern: change config → generate → score →         ║
 ║           keep if better, rollback if worse           ║
 ╚══════════════════════════════════════════════════════╝
 """)
    config = ensure_config()
    print("This script provides the FRAMEWORK for optimization.")
    print("To run the full loop, you need:")
    print("  1. An article generation function (Claude API)")
    print("  2. A scoring function (humanness_score.py — included)")
    print("  3. An LLM to propose config changes (Claude API)")
    print()
    print("Current config:")
    print(yaml.dump(config, allow_unicode=True, default_flow_style=False))
    print()
    print("Run this loop via Claude Code / OpenClaw agent:")
    print()
    print("  Agent reads writing-config.yaml")
    print("  → generates article with those rules")
    print("  → scores with: python3 scripts/humanness_score.py article.md --json")
    print("  → proposes a config change")
    print("  → generates new article")
    print("  → scores again")
    print("  → if composite_score decreased → commit config change")
    print("  → if composite_score same/worse → rollback")
    print("  → logs to optimization-results.tsv")
    print("  → repeats")
    print()
    print("To test scoring on an existing article:")
    print(f"  python3 scripts/humanness_score.py <article.md> --verbose")
 if __name__ == "__main__":
    main()
--- a/dist/openclaw/writing-config.example.yaml
+++ b/dist/openclaw/writing-config.example.yaml
@ -1,10 +1,9 @@
 # WeWrite 写作参数（可优化）
-# 复制为 writing-config.yaml，然后用 optimize loop 迭代调优
+# 复制为 writing-config.yaml，在对话中说"优化参数"让 Agent 迭代调优
-# 或手动调整后观察朱雀检测结果
+# 或手动调整后用 humanness_score.py 评估
 #
 # 这个文件是起点，不是最优解。
-# 运行: python3 scripts/optimize_loop.py --topic "你的主题" --iterations 10
+# 在对话中说"优化参数"即可自动调优，每轮调整得分最低的参数。
 # 每次迭代会修改 writing-config.yaml 中的参数，保留得分更好的版本。
 #
 # 参数分三层，对应 writing-guide.md 的反检测结构。