From 039a6caa9dbcac09fc642ca609445c3020a00bd0 Mon Sep 17 00:00:00 2001
From: ystyleb <ystyleb@foxmail.com>
Date: Sun, 29 Mar 2026 00:13:35 +0800
Subject: [PATCH] fix: normalize hotspot scores across platforms for fair
 sorting

Previously, hotspots were sorted by raw hot values directly, but different
platforms use vastly different scales (Toutiao ~10M, Weibo ~1M, Baidu ~100K),
causing Toutiao to dominate all results while Weibo and Baidu entries were
always truncated.

Now assigns each item a rank-based score within its source (100 for the top
item, decreasing linearly to 100/n for the last of n items) before merging,
so cross-platform sorting gives equal weight to each platform's top stories.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/fetch_hotspots.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/scripts/fetch_hotspots.py b/scripts/fetch_hotspots.py
index 0ae81ba..ec8847a 100644
--- a/scripts/fetch_hotspots.py
+++ b/scripts/fetch_hotspots.py
@@ -144,8 +144,21 @@ def main():
             sources_fail.append(name)
 
     all_items = deduplicate(all_items)
-    # Normalize hot values for sorting (different scales across sources)
-    all_items.sort(key=lambda x: int(x.get("hot", 0) or 0), reverse=True)
+
+    # Normalize hot values across platforms (different scales: toutiao ~10M, weibo ~1M, baidu ~100K)
+    # Strategy: within each source, rank-based score from 100 (top) down to 100/n (last), so cross-platform sorting is fair
+    by_source: dict[str, list[dict]] = {}
+    for item in all_items:
+        by_source.setdefault(item["source"], []).append(item)
+
+    for source, items in by_source.items():
+        items.sort(key=lambda x: int(x.get("hot", 0) or 0), reverse=True)
+        n = len(items)
+        for rank, item in enumerate(items):
+            # Top item = 100, linear decay to 100/n for the last item (n = items in this source)
+            item["hot_normalized"] = round(100 * (n - rank) / n, 1) if n > 0 else 0
+
+    all_items.sort(key=lambda x: x.get("hot_normalized", 0), reverse=True)
     all_items = all_items[:args.limit]
 
     tz = timezone(timedelta(hours=8))