From f9722fb93bcc7ada3c83f1d01abaae910d06f5e1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:55:01 +0000 Subject: [PATCH] chore: rebuild dist/openclaw from source --- dist/openclaw/SKILL.md | 77 +++- dist/openclaw/VERSION | 2 +- dist/openclaw/personas/cold-analyst.yaml | 2 +- dist/openclaw/personas/industry-observer.yaml | 2 +- dist/openclaw/personas/midnight-friend.yaml | 2 +- dist/openclaw/personas/sharp-journalist.yaml | 2 +- dist/openclaw/personas/warm-editor.yaml | 2 +- dist/openclaw/references/exemplar-seeds.yaml | 96 +++++ dist/openclaw/references/exemplars/.gitkeep | 0 dist/openclaw/references/writing-guide.md | 12 + dist/openclaw/scripts/extract_exemplar.py | 374 ++++++++++++++++++ dist/openclaw/scripts/humanness_score.py | 115 +++++- dist/openclaw/scripts/learn_edits.py | 14 + 13 files changed, 683 insertions(+), 17 deletions(-) create mode 100644 dist/openclaw/references/exemplar-seeds.yaml create mode 100644 dist/openclaw/references/exemplars/.gitkeep create mode 100644 dist/openclaw/scripts/extract_exemplar.py diff --git a/dist/openclaw/SKILL.md b/dist/openclaw/SKILL.md index bce7b76..92b7e48 100644 --- a/dist/openclaw/SKILL.md +++ b/dist/openclaw/SKILL.md @@ -43,7 +43,7 @@ description: | 2. 如果有 fail 项 → 直接报告,建议修复 3. 如果全 pass 或仅 warn → 继续 LLM 深度分析: - 读取 `style.yaml` 的 tone/voice 与 writing_persona,判断是否矛盾 - - 读取 `writing-config.yaml`(如存在),检查是否有 AI 特征参数(emotional_arc: flat、paragraph_rhythm: structured、closing_style: summary) + - 读取 `writing-config.yaml`(如存在),检查是否有 AI 特征参数(emotional_arc: flat、paragraph_rhythm: structured、closing_tendency: summary) - 读取 `history.yaml` 最近 5 篇,检查 persona 使用和 web_search 降级情况 4. 综合输出自然语言报告 + 按优先级排序的改进建议 - 用户说"优化写作参数"/"优化参数"/"跑优化" → 执行以下流程: @@ -97,6 +97,7 @@ python3 -c "import markdown, bs4, cssutils, requests, yaml, pygments, PIL" 2>&1 | Python 依赖 | 静默 | 提供 `pip install -r requirements.txt` | | `wechat.appid` + `secret` | 静默 | 设 `skip_publish = true` | | `image.api_key` | 静默 | 设 `skip_image_gen = true` | +| `references/exemplars/index.yaml` | 静默 | 提示:"范文库为空。如果你有已发布的文章(markdown),可以说**'导入范文'**建立风格库,写出来的文章会更像你。没有也不影响使用。" | **1.2 版本检查**(静默通过或提醒): @@ -189,6 +190,7 @@ web_search: "{选题关键词} 数据 报告 2025 2026" 读取: {baseDir}/playbook.md(如果存在,按 confidence 分级执行) 读取: {baseDir}/writing-config.yaml(如果存在,作为写作参数) 读取: {baseDir}/history.yaml(最近 3 篇的 dimensions 字段) +读取: {baseDir}/references/exemplars/index.yaml(如果存在) ``` **4.1 历史最佳参数参考**(有 history.yaml 且包含 composite_score 时执行): @@ -208,18 +210,75 @@ web_search: "{选题关键词} 数据 报告 2025 2026" 人格文件定义了:语气浓度、数据呈现方式、情绪弧线、段落节奏、不确定性表达模板等。作为 4.4 的硬性约束执行。 -**优先级**:playbook.md(confidence ≥ 5 的规则)> persona > writing-guide.md。writing-guide 是底线(禁用词等),persona 在此基础上特化风格参数,playbook 中高置信度规则是用户个性化的最终覆盖。playbook 中 confidence < 5 的规则作为软性参考。 +**优先级**:playbook.md(confidence ≥ 5 的规则)> persona > 范文风格 > writing-guide.md。writing-guide 是底线(禁用词等),范文提供风格示范(句长节奏、情绪表达方式),persona 在此基础上特化风格参数(语气浓度、数据呈现),playbook 中高置信度规则是用户个性化的最终覆盖。playbook 中 confidence < 5 的规则作为软性参考。 -**4.4 写文章**: +**4.4 范文风格注入**(有 `references/exemplars/index.yaml` 时执行): + +从 index.yaml 筛选 category 匹配当前框架类型的范文,按 humanness_score 升序(越低越人类)取 top 3。读取对应 .md 文件的片段内容。 + +在写作 prompt 中注入: + +> 以下是该公众号风格的真实段落示例,模仿其句长节奏、情绪强度和口语化程度: +> +> 【开头风格】 +> {exemplar_1 的开头钩子段} +> +> 【情绪段风格】 +> {exemplar_2 的情绪高峰段} +> +> 【转折风格】 +> {exemplar_2 或 exemplar_3 的转折/自纠段(如有)} +> +> 【收尾风格】 +> {exemplar_3 的收尾段} + +Category 映射规则: + +| 框架类型 | exemplar category | +|----------|-------------------| +| 痛点型/深度解读 | tech-opinion | +| 故事型 | story-emotional | +| 清单型/对比型 | list-practical | +| 热点解读型 | hot-take | +| 其他 | general | + +如果匹配到的范文不足 3 篇,用 general category 补足。 + +**Fallback(范文库为空时)**:读取 `{baseDir}/references/exemplar-seeds.yaml`,从每个段落类型中随机选 1 个注入 prompt。种子段落只示范人类写作的结构模式(句长方差、情绪锐度、自我纠正、非总结式收尾),不携带特定风格。注入时使用: + +> 以下是人类写作的结构模式示例,注意模仿其句长节奏和情绪表达方式(不要模仿具体内容或风格): +> +> 【开头模式】{seeds.opening_hooks 随机 1 个} +> +> 【情绪段模式】{seeds.emotional_peaks 随机 1 个} +> +> 【转折模式】{seeds.transitions 随机 1 个} +> +> 【收尾模式】{seeds.closings 随机 1 个} + +建库命令:`python3 {baseDir}/scripts/extract_exemplar.py article.md` + +**4.5 写文章**: - H1 标题(20-28 字) + H2 结构,1500-2500 字 - 真实素材锚定:Step 3.2 的素材分散嵌入各 H2 段落 - **写作人格**:按 4.3 加载的人格参数写作(数据呈现方式、个人声音浓度、不确定性表达等) +- **收尾方式**:persona 的 `closing_tendency` 仅作为倾向参考。根据文章内容和情绪弧线自行判断最自然的收尾方式(参见 writing-guide.md 收尾多样性表)。如果 history.yaml 中最近 3 篇有 `closing_type` 字段,避免使用相同的收尾类型 - 3 层反检测规则(统计/语言/内容)在初稿阶段全部生效 - 2-3 个编辑锚点:`` - 可选容器语法:`:::dialogue`、`:::timeline`、`:::callout`、`:::quote` 保存到 `{baseDir}/output/{date}-{slug}.md` +**4.6 快速自检**(写完后立即执行,减少 Step 5 重写概率): + +对初稿做 3 项最易不达标的快速扫描,**当场修复**,不留到 Step 5: + +1. **禁用词扫描**:检查 writing-guide.md 2.1 的禁用词列表,命中的直接替换(最常见的问题,修复成本最低) +2. **句长方差检查**:粗略扫描是否有连续 3 句以上长度接近的段落,有则拆句或加短句 +3. **负面情绪检查**:全文是否有 ≥ 2 处真实负面表达,不够则在编辑锚点附近补充 + +这 3 项检查不需要调用脚本,LLM 自行完成即可。目标是让初稿在进入 Step 5 前已经消除最明显的问题。 + --- ### Step 5: SEO + 验证 @@ -249,7 +308,7 @@ web_search: "{选题关键词} 数据 报告 2025 2026" | 内容 | 密度波浪 | 高密度段后跟低密度段 | 3.3 | | 内容 | 维度贯穿 | 激活维度全文可见 | 3.4 | -不通过 → 定向重写该段落。3 次仍不过 → 标注跳过。 +不通过 → **定向修复**:只替换不达标的具体句子/段落,不动已通过的部分。每轮最多改 3 处,改完立即重新检查该项。2 轮仍不过 → 标注跳过,继续下一项。 **5.3 脚本验证**(补充逐项检查): @@ -261,8 +320,8 @@ python3 {baseDir}/scripts/humanness_score.py {article_path} --json --tier3 {agen 解读 JSON 中 `composite_score`: - < 30 → 通过,继续 Step 6 -- 30-50 → 查看 `param_scores` 中最低分项,定向重写对应段落 -- \> 50 → 重大问题,逐个低分项修复,最多 3 轮 +- 30-50 → 查看 `param_scores` 中最低分的 1-2 项,只修复对应的具体句子(不重写整段),改完重新打分。1 轮即可 +- \> 50 → 取 `param_scores` 最低的 2-3 项,逐项定向修复(每项只改最相关的 1-2 处),最多 2 轮。仍 > 50 则标记 DONE_WITH_CONCERNS 继续 --- @@ -332,6 +391,7 @@ python3 {baseDir}/toolkit/cli.py preview {markdown} --theme {theme} --no-open -o writing_persona: "{人格名}" dimensions: - "{维度}: {选项}" + closing_type: "{收尾类型}" # trailing_off/unanswered/scene_revert/abrupt_stop/anti_conclusion/image composite_score: {Step 5.3 的 composite_score} # 0=人类, 100=AI writing_config_snapshot: # 本次使用的关键参数(从 writing-config.yaml 提取) sentence_variance: {值} @@ -366,6 +426,8 @@ python3 {baseDir}/toolkit/cli.py preview {markdown} --theme {theme} --no-open -o | 做一个小绿书/图片帖 | `python3 {baseDir}/toolkit/cli.py image-post img1.jpg img2.jpg -t "标题"` | | 诊断配置 / 检查反AI / 为什么AI检测没过 | `python3 {baseDir}/scripts/diagnose.py --json` + LLM 交叉分析 | | 优化写作参数 / 优化参数 | 迭代循环:写测试短文 → 打分 → 调参(见辅助功能) | +| 导入范文 / 建范文库 | `python3 {baseDir}/scripts/extract_exemplar.py article.md` | +| 查看范文库 | `python3 {baseDir}/scripts/extract_exemplar.py --list` | --- @@ -380,7 +442,8 @@ python3 {baseDir}/toolkit/cli.py preview {markdown} --theme {theme} --no-open -o | 素材采集(web_search) | LLM 训练数据中可验证的公开信息 | | 维度随机化 | history 空时跳过去重 | | Persona 文件不存在 | 回退到 midnight-friend(默认) | -| 去 AI 验证 | 3 次重写不过则跳过该项 | +| 范文库为空 | Fallback 到 exemplar-seeds.yaml(通用模式) | +| 去 AI 验证 | 2 轮定向修复不过则跳过该项 | | 生图失败 | 输出提示词 | | 推送失败 | 本地 HTML | | 历史写入 | 警告不阻断 | diff --git a/dist/openclaw/VERSION b/dist/openclaw/VERSION index f0bb29e..31e5c84 100644 --- a/dist/openclaw/VERSION +++ b/dist/openclaw/VERSION @@ -1 +1 @@ -1.3.0 +1.3.3 diff --git a/dist/openclaw/personas/cold-analyst.yaml b/dist/openclaw/personas/cold-analyst.yaml index 33ee473..82f2b07 100644 --- a/dist/openclaw/personas/cold-analyst.yaml +++ b/dist/openclaw/personas/cold-analyst.yaml @@ -17,7 +17,7 @@ single_sentence_paragraph_rate: 0.08 # 少用单句段落,保持专业感 emotional_arc: "flat_with_insight" # 整体平稳,在关键洞察处提升强度 opening_style: "thesis" # 开头直接亮核心论点 -closing_style: "implications" # 以"这意味着什么"收束 +closing_tendency: "implications" # 倾向于以"这意味着什么"收束,但根据文章内容自行判断最合适的收尾方式 data_intro_pattern: "framework → data → implication → caveat" # 示例: diff --git a/dist/openclaw/personas/industry-observer.yaml b/dist/openclaw/personas/industry-observer.yaml index 19f6651..a1e07da 100644 --- a/dist/openclaw/personas/industry-observer.yaml +++ b/dist/openclaw/personas/industry-observer.yaml @@ -16,7 +16,7 @@ single_sentence_paragraph_rate: 0.10 emotional_arc: "steady_with_spikes" # 整体平稳,1-2 处锐利判断 opening_style: "news_hook" # 以一个行业事件/数据切入 -closing_style: "open_question" # 留一个没答案的问题 +closing_tendency: "open_question" # 倾向于留一个没答案的问题,但根据文章内容自行判断最合适的收尾方式 data_intro_pattern: "context → data → contrast → judgment" # 示例: diff --git a/dist/openclaw/personas/midnight-friend.yaml b/dist/openclaw/personas/midnight-friend.yaml index 2291f62..678541d 100644 --- a/dist/openclaw/personas/midnight-friend.yaml +++ b/dist/openclaw/personas/midnight-friend.yaml @@ -18,7 +18,7 @@ single_sentence_paragraph_rate: 0.25 # 25% 的段落只有 1 句 # 情绪 emotional_arc: "restrained_to_burst" opening_style: "personal_moment" # 以一个私人时刻开头("凌晨一点多…") -closing_style: "trailing_off" # 不收束,像聊天自然结尾("我先睡了"/"真的看不清楚") +closing_tendency: "trailing_off" # 倾向于不收束、像聊天自然结尾,但根据文章内容自行判断最合适的收尾方式 # 数据呈现 data_intro_pattern: "scene → reaction → data → interpretation" diff --git a/dist/openclaw/personas/sharp-journalist.yaml b/dist/openclaw/personas/sharp-journalist.yaml index e5839f8..c1ce32f 100644 --- a/dist/openclaw/personas/sharp-journalist.yaml +++ b/dist/openclaw/personas/sharp-journalist.yaml @@ -17,7 +17,7 @@ single_sentence_paragraph_rate: 0.20 # 多用短句成段制造节奏 emotional_arc: "cold_open_to_sharp_close" opening_style: "cold_open" # 直接切入核心矛盾,不铺垫 -closing_style: "sharp_statement" # 一句定性收束 +closing_tendency: "sharp_statement" # 倾向于一句定性收束,但根据文章内容自行判断最合适的收尾方式 data_intro_pattern: "claim → evidence → twist" # 示例: diff --git a/dist/openclaw/personas/warm-editor.yaml b/dist/openclaw/personas/warm-editor.yaml index e25c12e..40246f4 100644 --- a/dist/openclaw/personas/warm-editor.yaml +++ b/dist/openclaw/personas/warm-editor.yaml @@ -16,7 +16,7 @@ single_sentence_paragraph_rate: 0.15 emotional_arc: "gentle_build" # 缓慢升温,情绪在中后段到达高点 opening_style: "scene" # 以一个温暖的场景开头 -closing_style: "image" # 用一个画面收束 +closing_tendency: "image" # 倾向于用一个画面收束,但根据文章内容自行判断最合适的收尾方式 data_intro_pattern: "story → embed data → feeling" # 示例: diff --git a/dist/openclaw/references/exemplar-seeds.yaml b/dist/openclaw/references/exemplar-seeds.yaml new file mode 100644 index 0000000..c896755 --- /dev/null +++ b/dist/openclaw/references/exemplar-seeds.yaml @@ -0,0 +1,96 @@ +# 通用人类写作模式种子 +# +# 用途:没有范文库的用户,Step 4.4 用这些段落作为 few-shot 注入, +# 教 LLM "人类写作的结构模式长什么样"。 +# +# 设计原则: +# - 只示范结构模式(句长方差、情绪锐度、自我纠正、非总结式收尾) +# - 不携带特定风格/人格(任何 persona 都能兼容) +# - 每个段落标注了它示范的反AI模式 +# +# 有用户自己的范文库时,这个文件不会被使用。 + +opening_hooks: + - text: | + 好多年没有坐公交了,上次去太子湾,由于景区限行,只能把车停在外面,坐景区免费接驳车过去。 + 前面座位看到一个小女孩一直在刷那种 AI 生成的短视频,画面非常粗糙,内容也很假,滑到下一个居然还是差不多的东西,看得津津有味。 + 当时看到这一幕我甚至有点伤心。 + pattern: "日常观察切入 → 意外情绪反应。不总结、不预告、不铺垫。" + + - text: | + **本硕八年毕业,单程通勤两个半小时,**月薪2690。 这是市场给我贴的标签。 + **裸辞。创业,年收超7位数。** 这是我自己撕掉那个标签之后,重新定义的自己。 + 一路走来,中间发生了什么?我讲给你听。 + pattern: "标签→撕裂对比开头。两组加粗短句制造视觉和语义落差。句长标准差 45.7(数据最高)。" + + - text: | + 29号,我和小伙伴在深圳搞活动。 + 活动结束之后,我想顺道拜访一个多年没见的老朋友,发消息过去。 + 他回:在三亚。 + 我问:度假? + 他说:带孩子。 + 我盯着手机屏幕,愣了整整三秒。 + pattern: "对话碎片制造节奏。2-4字短句紧邻20+字长句。物理反应替代心理描写。" + + - text: | + 我信了这套话很多年。 + "要有长期主义。要相信复利。时间是最好的朋友。" + 最惨的一次,在一个方向扎进去3年,回头一看,什么都没留下来。 + pattern: "先认同再推翻。引用常见正确的话→用个人惨痛经历否定。开头即高潮。" + +emotional_peaks: + - text: | + 我信了这套话很多年。 + 最惨的一次,在一个方向扎进去3年,回头一看,什么都没留下来。 + 这不是失败——失败还有个明确的结果。 + 是你信错了一件事。 + pattern: "用'最惨'而非'有挑战'。否定委婉说法('这不是失败'),给出更痛的定义。" + + - text: | + 很多人在温水煮青蛙的过程中得过且过,过着看似满意、实则内心有很多不满的生活,然后说一句,算了吧,现在这样也还行,但这样反而错失了挖掘自己最大潜力的机会。 + pattern: "用'温水煮青蛙'具象化停滞感。'算了吧'是内心独白式引用。来自得分最低(32.8)的文章。" + + - text: | + 讲真,我每次看到这种争论,都觉得……怎么说呢……挺无语的。 + 不是说这些人蠢。 + 是他们在纠结一个根本不存在的问题。 + 什么叫"AI味道"?你能定义吗?你能量化吗?你能验证吗? + 不能。 + 那你在纠结什么? + pattern: "填充词('怎么说呢')+ 连续反问不给答案 + 单字段落('不能。')。" + +transitions: + - text: | + 我第一反应是"孩子这时候不应该在学校吗",第二反应是想把这话发过去,第三反应是我把那句话吞回去了——因为我在那三秒里想清楚了一件事。 + pattern: "思维过程外化(三个反应)。破折号打断 → 时间锚点('三秒')→ 悬念。" + + - text: | + 不过,到了之后我发现,什么作息啊,学习强度啊,都不是最难熬的,人才是。 + pattern: "列举预期困难再一句否定。转折词 + 真实困难揭示。" + + - text: | + 不过话又说回来。知道自己在局里,这件事本身,就已经是出局的开始了。 + pattern: "'不过话又说回来'——自我推翻后重新定位。制造思维的非线性感。" + +closings: + - text: | + 时间是你唯一不可再生的资源。 + 把它投进一个真实存在的锚点,才叫复利。 + 投进一个"我相信它会好"的希望,叫做漫长的等死。 + pattern: "重新定义核心概念收尾。'等死'替代励志结论。来自得分 36.7 的文章。" + + - text: | + 有了 AI 之后,很多事都更容易了,但也正因为更容易了,什么东西真的值得做、值得花很多年去换,反而变得更难想清楚。要做什么可能比怎么更快做出一个东西更加重要了。 + pattern: "结尾是未完成的思考,不是结论。'可能'留有余地。没有升华。" + + - text: | + 我苦哈哈的在电脑前,写这篇文章,想着我的女儿。 + 差距是真实的。 + 机会也是真实的。 + 时钟在走,窗口在收窄。 + pattern: "回到写作现场。重复句式('是真实的')制造执念感。来自得分 33.0 的文章。" + + - text: | + 不要在那个愣住的感觉里待太久。 + 那个感觉,待久了,就成了借口。 + pattern: "回扣开头意象。两句话收束,不解释。草率感本身就是风格。" diff --git a/dist/openclaw/references/exemplars/.gitkeep b/dist/openclaw/references/exemplars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/dist/openclaw/references/writing-guide.md b/dist/openclaw/references/writing-guide.md index c0a2e35..3e4d75d 100644 --- a/dist/openclaw/references/writing-guide.md +++ b/dist/openclaw/references/writing-guide.md @@ -101,6 +101,18 @@ AI 段落长度趋于均匀。人类段落忽长忽短。 - 每段末尾都用反问句(变成另一种模式化) - 口语词匀速分布(不要每 200 字准时出现一个"讲真") - 总结性收尾("让我们拭目以待"/"未来可期") +- 连续文章使用相同收尾结构(收尾方式应由文章内容决定,不是由人格模板决定) + +**收尾多样性**:persona 的 `closing_tendency` 是倾向而非硬规则。根据文章走到结尾时的内容和情绪自行判断最自然的收尾方式。以下是常见的人类收尾模式,每篇文章选最贴合内容的一种: + +| 模式 | 特征 | 适合场景 | +|------|------|---------| +| 自然断流 | 像聊天说到一半停了("我先睡了"/"就这样吧") | 深夜风格、随笔 | +| 未答之问 | 以问题结尾,不给答案 | 争议话题、引发思考 | +| 场景回扣 | 回到开头的意象/场景 | 叙事类、故事驱动 | +| 硬切 | 最后一个论点说完直接结束,无收束语 | 评论、观点类 | +| 反结论 | 明确拒绝给结论("我也不知道"/"答案可能不存在") | 复杂议题、探索性 | +| 画面定格 | 用一个视觉画面收束 | 情感类、人物类 | **writing-config 参数**:`emotional_arc`(flat/gradual/restrained_to_burst/volatile) diff --git a/dist/openclaw/scripts/extract_exemplar.py b/dist/openclaw/scripts/extract_exemplar.py new file mode 100644 index 0000000..710d1fa --- /dev/null +++ b/dist/openclaw/scripts/extract_exemplar.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 +""" +Extract style exemplars from human-written articles for SICO-style few-shot injection. + +Takes a markdown article, analyzes it for style fingerprints, extracts key +segments (opening hook, emotional peak, transition/self-correction, closing), +and saves structured exemplar files to references/exemplars/. + +Usage: + python3 scripts/extract_exemplar.py article.md + python3 scripts/extract_exemplar.py article.md --category tech-opinion --source "公众号名" + python3 scripts/extract_exemplar.py article1.md article2.md article3.md # batch + python3 scripts/extract_exemplar.py --list # list all exemplars +""" + +import argparse +import json +import re +import sys +from datetime import datetime +from pathlib import Path + +import yaml + +# Reuse analysis functions from humanness_score +sys.path.insert(0, str(Path(__file__).parent)) +import humanness_score as hs + +SKILL_DIR = Path(__file__).parent.parent +EXEMPLARS_DIR = SKILL_DIR / "references" / "exemplars" +INDEX_FILE = EXEMPLARS_DIR / "index.yaml" + +CATEGORIES = ["tech-opinion", "story-emotional", "list-practical", "hot-take", "general"] + +# Category detection markers +STORY_MARKERS = [ + "我", "我们", "那天", "那年", "记得", "后来", "当时", + "第一次", "最后", "突然", "终于", +] + + +# ============================================================ +# Segment Extraction +# ============================================================ + +def extract_headings(text): + """Extract H2 headings from markdown.""" + return re.findall(r'^##\s+(.+)$', text, re.MULTILINE) + + +def extract_title(text): + """Extract H1 title from markdown.""" + m = re.search(r'^#\s+(.+)$', text, re.MULTILINE) + return m.group(1).strip() if m else "" + + +def extract_opening(paragraphs, max_chars=250): + """Extract opening hook — first non-empty paragraph(s) up to max_chars.""" + result = [] + total = 0 + for p in paragraphs: + if total + len(p) > max_chars and result: + break + result.append(p) + total += len(p) + return "\n\n".join(result) + + +def extract_emotional_peak(paragraphs): + """Find paragraph with highest negative emotion density.""" + best_para, best_density = "", -1.0 + for p in paragraphs: + if len(p) < 20: + continue + count = sum(1 for m in hs.NEGATIVE_MARKERS if m in p) + density = count / len(p) * 100 + if density > best_density: + best_density = density + best_para = p + return best_para if best_density > 0 else "" + + +def extract_transition(paragraphs): + """Find paragraph with most self-correction / transition patterns.""" + transition_words = [ + "但是", "不过", "然而", "话说回来", "换个角度", + "说回来", "但话又说回来", "不对", "算了", + ] + best_para, best_count = "", 0 + for p in paragraphs: + if len(p) < 20: + continue + count = sum(len(re.findall(pat, p)) for pat in hs.SELF_CORRECTION_PATTERNS) + count += sum(p.count(w) for w in transition_words) + if count > best_count: + best_count = count + best_para = p + return best_para if best_count > 0 else "" + + +def extract_closing(paragraphs, max_chars=250): + """Extract closing paragraph(s), reading backwards.""" + result = [] + total = 0 + for p in reversed(paragraphs): + if total + len(p) > max_chars and result: + break + result.insert(0, p) + total += len(p) + return "\n\n".join(result) + + +# ============================================================ +# Category Detection +# ============================================================ + +def detect_category(text, paragraphs, headings): + """Auto-detect article category from content features.""" + data_count = sum(len(re.findall(p, text)) for p in hs.REAL_SOURCE_PATTERNS) + story_count = sum(text.count(m) for m in STORY_MARKERS) + h2_count = len(headings) + neg_count = sum(1 for m in hs.NEGATIVE_MARKERS if m in text) + + scores = { + "tech-opinion": data_count * 2, + "story-emotional": story_count * 1.5, + "list-practical": h2_count * 3 if h2_count >= 5 else 0, + "hot-take": neg_count * 2 + data_count if len(text) < 2000 else 0, + "general": 5, + } + return max(scores, key=scores.get) + + +# ============================================================ +# Statistical Fingerprint +# ============================================================ + +def compute_vocab_temperature(text): + """Compute vocabulary temperature band distribution.""" + counts = { + "cold": sum(text.count(w) for w in hs.COLD_WORDS), + "warm": sum(text.count(w) for w in hs.WARM_WORDS), + "hot": sum(text.count(w) for w in hs.HOT_WORDS), + "wild": sum(text.count(w) for w in hs.WILD_WORDS), + } + total = sum(counts.values()) + if total == 0: + return {k: 0.25 for k in counts} + return {k: round(v / total, 2) for k, v in counts.items()} + + +def compute_paragraph_cv(paragraphs): + """Coefficient of variation for paragraph lengths.""" + if len(paragraphs) < 3: + return 0.0 + lengths = [len(p) for p in paragraphs] + mean = sum(lengths) / len(lengths) + if mean == 0: + return 0.0 + variance = sum((l - mean) ** 2 for l in lengths) / len(lengths) + return round((variance ** 0.5) / mean, 2) + + +def count_short_paragraphs(text): + """Count single-sentence short paragraphs (1-10 chars, non-heading).""" + return sum(1 for l in text.split('\n') + if l.strip() and 1 <= len(l.strip()) <= 10 + and not l.strip().startswith('#')) + + +# ============================================================ +# Main Extraction +# ============================================================ + +def extract_exemplar(text, category=None, source=None): + """Analyze article and return structured exemplar dict.""" + clean = re.sub(r'^#+\s+.*$', '', text, flags=re.MULTILINE).strip() + paragraphs = hs._split_paragraphs(text) + sentences = hs._split_sentences(clean) + headings = extract_headings(text) + title = extract_title(text) or source or "" + + if not category: + category = detect_category(clean, paragraphs, headings) + + score_result = hs.score_article(text) + + # Sentence length stats + lengths = [len(s) for s in sentences] + if len(lengths) >= 2: + mean = sum(lengths) / len(lengths) + variance = sum((l - mean) ** 2 for l in lengths) / len(lengths) + sentence_stddev = round(variance ** 0.5, 1) + else: + sentence_stddev = 0.0 + + neg_count = sum(1 for s in sentences if any(m in s for m in hs.NEGATIVE_MARKERS)) + negative_ratio = round(neg_count / len(sentences), 2) if sentences else 0.0 + + return { + "title": title, + "source": source or title, + "category": category, + "humanness_score": score_result["composite_score"], + "fingerprint": { + "sentence_stddev": sentence_stddev, + "vocab_temperature": compute_vocab_temperature(clean), + "negative_ratio": negative_ratio, + "paragraph_cv": compute_paragraph_cv(paragraphs), + "short_paragraphs": count_short_paragraphs(text), + }, + "segments": { + "opening": extract_opening(paragraphs), + "emotional_peak": extract_emotional_peak(paragraphs), + "transition": extract_transition(paragraphs), + "closing": extract_closing(paragraphs), + }, + "extracted_at": datetime.now().strftime("%Y-%m-%d"), + "char_count": len(clean), + } + + +# ============================================================ +# Persistence +# ============================================================ + +def save_exemplar(exemplar): + """Save exemplar to markdown file and update index.yaml. Returns filepath.""" + EXEMPLARS_DIR.mkdir(parents=True, exist_ok=True) + + category = exemplar["category"] + num = 1 + while (EXEMPLARS_DIR / f"{category}-{num:03d}.md").exists(): + num += 1 + filename = f"{category}-{num:03d}.md" + filepath = EXEMPLARS_DIR / filename + + fp = exemplar["fingerprint"] + seg = exemplar["segments"] + + frontmatter = { + "source": exemplar["source"], + "category": category, + "humanness_score": exemplar["humanness_score"], + "sentence_stddev": fp["sentence_stddev"], + "vocab_temperature": fp["vocab_temperature"], + "negative_ratio": fp["negative_ratio"], + "paragraph_cv": fp["paragraph_cv"], + "short_paragraphs": fp["short_paragraphs"], + "extracted_at": exemplar["extracted_at"], + } + + content = "---\n" + content += yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False) + content += "---\n\n" + + section_map = [ + ("opening", "开头钩子"), + ("emotional_peak", "情绪高峰"), + ("transition", "转折/自纠"), + ("closing", "收尾"), + ] + for key, label in section_map: + if seg.get(key): + content += f"## {label}\n\n{seg[key]}\n\n" + + filepath.write_text(content, encoding="utf-8") + _update_index(filename, exemplar) + return filepath + + +def _update_index(filename, exemplar): + """Add or update entry in index.yaml.""" + index = [] + if INDEX_FILE.exists(): + with open(INDEX_FILE, "r", encoding="utf-8") as f: + index = yaml.safe_load(f) or [] + + entry = { + "file": filename, + "source": exemplar["source"], + "category": exemplar["category"], + "humanness_score": exemplar["humanness_score"], + "extracted_at": exemplar["extracted_at"], + } + index = [e for e in index if e.get("file") != filename] + index.append(entry) + index.sort(key=lambda x: (x["category"], x["humanness_score"])) + + with open(INDEX_FILE, "w", encoding="utf-8") as f: + yaml.dump(index, f, allow_unicode=True, default_flow_style=False) + + +# ============================================================ +# List / CLI +# ============================================================ + +def list_exemplars(): + """Print all exemplars in the library.""" + if not INDEX_FILE.exists(): + print("范文库为空。用法: python3 scripts/extract_exemplar.py article.md") + return + + with open(INDEX_FILE, "r", encoding="utf-8") as f: + index = yaml.safe_load(f) or [] + + if not index: + print("范文库为空。") + return + + print(f"\n{'=' * 60}") + print(f"范文库 ({len(index)} 篇)") + print(f"{'=' * 60}") + + by_cat = {} + for e in index: + by_cat.setdefault(e["category"], []).append(e) + + for cat, entries in sorted(by_cat.items()): + print(f"\n [{cat}] ({len(entries)} 篇)") + for e in entries: + score = e["humanness_score"] + bar = "█" * int((100 - score) / 10) + "░" * (10 - int((100 - score) / 10)) + print(f" {bar} {score:5.1f} {e['source'][:40]}") + + +def main(): + parser = argparse.ArgumentParser(description="Extract style exemplars from articles") + parser.add_argument("inputs", nargs="*", help="Markdown article file(s)") + parser.add_argument("--category", "-c", choices=CATEGORIES, + help="Article category (auto-detected if omitted)") + parser.add_argument("--source", "-s", help="Source name (e.g. account name)") + parser.add_argument("--list", "-l", action="store_true", help="List all exemplars") + parser.add_argument("--json", action="store_true", help="JSON output") + args = parser.parse_args() + + if args.list: + list_exemplars() + return + + if not args.inputs: + parser.print_help() + sys.exit(1) + + for input_path in args.inputs: + path = Path(input_path) + if not path.exists(): + print(f"Error: {input_path} not found", file=sys.stderr) + continue + + text = path.read_text(encoding="utf-8") + source = args.source or path.stem # fallback to filename without extension + exemplar = extract_exemplar(text, category=args.category, source=source) + filepath = save_exemplar(exemplar) + + if args.json: + print(json.dumps(exemplar, ensure_ascii=False, indent=2)) + else: + print(f"✓ {path.name}") + print(f" Category: {exemplar['category']}") + print(f" Score: {exemplar['humanness_score']:.1f}/100") + print(f" Segments: {sum(1 for v in exemplar['segments'].values() if v)}/4") + fp = exemplar["fingerprint"] + print(f" Stddev: {fp['sentence_stddev']}") + print(f" Neg ratio: {fp['negative_ratio']:.0%}") + print(f" Para CV: {fp['paragraph_cv']}") + temp = fp["vocab_temperature"] + print(f" Temp: cold={temp['cold']} warm={temp['warm']} hot={temp['hot']} wild={temp['wild']}") + print(f" Saved to: {filepath}") + print() + + +if __name__ == "__main__": + main() diff --git a/dist/openclaw/scripts/humanness_score.py b/dist/openclaw/scripts/humanness_score.py index 6aabc32..3f3a079 100644 --- a/dist/openclaw/scripts/humanness_score.py +++ b/dist/openclaw/scripts/humanness_score.py @@ -55,10 +55,23 @@ REAL_SOURCE_PATTERNS = [ ] NEGATIVE_MARKERS = [ + # 直接负面情绪 "失望", "糟糕", "扯", "坑", "烂", "差劲", "崩溃", "吐槽", "骂", "怒", "烦", "焦虑", "担忧", "不满", "恶心", "可怕", "可悲", "可笑", "离谱", "尴尬", "无语", "蠢", "惨", "亏", "危", + # 绝望/迷茫 + "绝望", "迷茫", "心累", "丧", "后悔", "后怕", "心寒", + # 欺骗/操控(隐性负面) + "骗", "忽悠", "割韭菜", "套路", "画大饼", "洗脑", + # 失败/徒劳 + "白费", "白搭", "没戏", "黄了", "凉了", "废了", + # 自嘲/自贬 + "傻", "天真", "吃亏", "自嗨", "打脸", + # 讽刺/反语 + "呵呵", "好吧", "行吧", "真服了", + # 短语 "太扯了", "说实话我很失望", "搞什么", "不靠谱", "受不了", + "受够了", "想哭", "伤心", "苦哈哈", "得过且过", ] COMMON_ADVERBS = [ @@ -69,10 +82,27 @@ COMMON_ADVERBS = [ "竟然", "简直", "几乎", "完全", "绝对", "必然", ] -COLD_WORDS = ["边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量"] -WARM_WORDS = ["说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢"] -HOT_WORDS = ["DNA动了", "格局打开", "遥遥��先", "卷", "内卷", "炸了", "杀疯了", "吃灰"] -WILD_WORDS = ["整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯"] +COLD_WORDS = [ + "边际", "认知负荷", "信息不对称", "路径依赖", "商业模式", "生态系统", "增量", + "技术栈", "标准化", "结构性", "规模化", "护城河", "飞轮", "闭环", + "赛道", "壁垒", "方法论", "底层逻辑", "第一性原理", "杠杆", "复利", + "ROI", "PMF", "代运营", "供给侧", "需求侧", +] +WARM_WORDS = [ + "说白了", "其实吧", "讲真", "说实话", "坦白讲", "懂的都懂", "怎么说呢", + "老实说", "这么说吧", "你想啊", "别急", "慢慢来", + "有意思的是", "好玩的是", "巧的是", "说来话长", "话说回来", +] +HOT_WORDS = [ + "DNA动了", "格局打开", "遥遥领先", "卷", "内卷", "炸了", "杀疯了", "吃灰", + "凡尔赛", "标题党", "躺平", "摆烂", "破防", "上头", "内耗", + "蒸发", "出圈", "降维打击", "弯道超车", +] +WILD_WORDS = [ + "整挺好", "不靠谱", "瞎折腾", "搁这儿", "糊弄", "扯", "嗯", + "苦哈哈", "傻乎乎", "稀里糊涂", "得了吧", "算了吧", + "摔了跤", "交学费", "踩坑", "翻车", "栽了", +] SELF_CORRECTION_PATTERNS = [ r'不对[,,]', r'准确说', r'算了', r'说错了', @@ -314,6 +344,81 @@ def run_tier(checks, text): return results +# ============================================================ +# Calibration (bell-curve + over-optimization penalty) +# ============================================================ + +# Human article baselines (from 15 example articles, 2026-03-30) +# Dimensions where AI over-optimizes: bell-curve scoring penalizes +# both "too low" AND "too high" relative to human average. +_BELL_CURVE_CHECKS = { + "broken_sentences": 0.39, + "self_correction": 0.20, + "sentence_length_range": 0.71, + "paragraph_length_variance": 0.52, + "banned_words": 0.73, +} + + +def _bell_curve(raw_score, center): + """Score peaks at center (human avg), penalizes over-optimization. + + Below center: linear rise (as before). + Above center: quadratic penalty — too much is suspicious. + """ + if center <= 0: + return raw_score + if raw_score <= center: + return raw_score / center + else: + overshoot = (raw_score - center) / (1.0 - center) if center < 1 else 0 + return max(0.0, 1.0 - overshoot * overshoot) + + +def calibrate_tiers(tier1, tier2): + """Apply bell-curve calibration and over-optimization penalty in-place.""" + # 1. Bell-curve adjustment for over-optimizable dimensions + for tier in [tier1, tier2]: + for name, data in tier.items(): + if name.startswith("_"): + continue + if name in _BELL_CURVE_CHECKS: + raw = data["score"] + center = _BELL_CURVE_CHECKS[name] + calibrated = round(max(0.0, min(1.0, _bell_curve(raw, center))), 4) + data["raw_score"] = raw + data["score"] = calibrated + data["detail"] += f" [calibrated from {raw:.2f}, center={center}]" + + # 2. Over-optimization penalty: if 60%+ of checks score > 0.8, + # the article is suspiciously "perfect" — apply global penalty. + all_scores = [] + for tier in [tier1, tier2]: + for name, data in tier.items(): + if not name.startswith("_"): + all_scores.append(data["score"]) + + high_count = sum(1 for s in all_scores if s > 0.8) + over_opt_ratio = high_count / len(all_scores) if all_scores else 0 + penalty = 1.0 + if over_opt_ratio >= 0.6: + penalty = 0.85 # 15% penalty for suspiciously perfect articles + + if penalty < 1.0: + for tier in [tier1, tier2]: + for name, data in tier.items(): + if not name.startswith("_"): + data["score"] = round(data["score"] * penalty, 4) + + # 3. Recalculate tier summaries + for tier in [tier1, tier2]: + scores = [data["score"] for name, data in tier.items() if not name.startswith("_")] + tier["_summary"]["mean_score"] = round(sum(scores) / len(scores), 4) if scores else 0 + tier["_summary"]["scores"] = [round(s, 4) for s in scores] + + return penalty + + # ============================================================ # Composite Score # ============================================================ @@ -364,6 +469,7 @@ def score_article(text, verbose=False, tier3_score=None): tier1 = run_tier(TIER1_CHECKS, clean) tier2 = run_tier(TIER2_CHECKS, clean) + over_opt_penalty = calibrate_tiers(tier1, tier2) composite, weights = compute_composite(tier1, tier2, tier3_score) param_scores = build_param_scores(tier1, tier2) @@ -377,6 +483,7 @@ def score_article(text, verbose=False, tier3_score=None): }, "weights": weights, "param_scores": param_scores, + "over_optimization_penalty": over_opt_penalty, "char_count": len(clean), } diff --git a/dist/openclaw/scripts/learn_edits.py b/dist/openclaw/scripts/learn_edits.py index 175db49..1b6ac67 100644 --- a/dist/openclaw/scripts/learn_edits.py +++ b/dist/openclaw/scripts/learn_edits.py @@ -325,6 +325,20 @@ def main(): lesson_file = save_lesson(diff_result, args.draft, args.final) print(f"\nLesson saved to: {lesson_file}") + # Auto-grow exemplar library from edited finals + final_title = extract_title(final) + try: + import extract_exemplar + exemplar = extract_exemplar.extract_exemplar(final, source=final_title or "user-edited") + if exemplar["humanness_score"] <= 50: + exemplar_path = extract_exemplar.save_exemplar(exemplar) + print(f"\n✓ 终稿已加入范文库: {exemplar_path}") + print(f" Score: {exemplar['humanness_score']:.1f}/100, Category: {exemplar['category']}") + else: + print(f"\n⚠ 终稿 humanness_score={exemplar['humanness_score']:.1f} > 50,未加入范文库") + except Exception as e: + print(f"\n⚠ 范文提取跳过: {e}") + lesson_count = len(load_all_lessons()) print(f"Total lessons: {lesson_count}")