Merge branch 'benchmarks-h100'

2026-03-15 12:00:00 +01:00 · 2026-03-15 12:00:00 +01:00 · e14b913807
commit e14b913807
parent 3b7a2fcc87 47d4cbeecc
15 changed files with 572 additions and 0 deletions
--- a/benchmark_bars_h100.png
+++ b/benchmark_bars_h100.png
--- a/benchmark_latency_h100.png
+++ b/benchmark_latency_h100.png
--- a/benchmark_robustness_h100.png
+++ b/benchmark_robustness_h100.png
--- a/benchmark_scatter_acl6060_h100.png
+++ b/benchmark_scatter_acl6060_h100.png
--- a/benchmark_scatter_en_h100.png
+++ b/benchmark_scatter_en_h100.png
--- a/benchmark_scatter_h100.png
+++ b/benchmark_scatter_h100.png
--- a/benchmarks/h100/acl6060_per_talk.png
+++ b/benchmarks/h100/acl6060_per_talk.png
--- a/benchmarks/h100/bars_wer_rtf_latency.png
+++ b/benchmarks/h100/bars_wer_rtf_latency.png
--- a/benchmarks/h100/bench_voxtral_hf_batch.py
+++ b/benchmarks/h100/bench_voxtral_hf_batch.py
@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""Standalone Voxtral benchmark — no whisperlivekit imports."""
+import json, logging, re, time, wave, queue, threading
+import numpy as np
+import torch
+
+logging.basicConfig(level=logging.WARNING)
+for n in ["transformers","torch","httpx"]:
+    logging.getLogger(n).setLevel(logging.ERROR)
+
+from jiwer import wer as compute_wer
+from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration, TextIteratorStreamer
+
+def norm(t):
+    return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', t.lower())).strip()
+
+def load_audio(path):
+    with wave.open(path, 'r') as wf:
+        return np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0
+
+# Load model
+# Runs at import time: `processor` and `model` are module-level globals that
+# transcribe_batch() below relies on.
+print("Loading Voxtral-Mini-4B...", flush=True)
+MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602"
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+# Weights are requested in bfloat16 and mapped to the first CUDA device.
+model = VoxtralRealtimeForConditionalGeneration.from_pretrained(
+    MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda:0",
+)
+# Memory currently allocated by PyTorch on the GPU, shown in GB (bytes / 1e9).
+print(f"Loaded, GPU: {torch.cuda.memory_allocated()/1e9:.1f} GB", flush=True)
+
+def transcribe_batch(audio_np):
+    """Simple batch transcription (not streaming)."""
+    # Voxtral expects audio as input_features from processor
+    inputs = processor(
+        audio=audio_np, sampling_rate=16000, return_tensors="pt",
+    ).to("cuda:0").to(torch.bfloat16)
+
+    t0 = time.perf_counter()
+    with torch.inference_mode():
+        generated = model.generate(**inputs, max_new_tokens=1024)
+    t1 = time.perf_counter()
+
+    text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()
+    return text, t1 - t0
+
+# 1. LibriSpeech test-clean
+print("\n=== Voxtral / LibriSpeech test-clean ===", flush=True)
+clean = json.load(open("/home/cloud/benchmark_data/metadata.json"))
+wers = []; ta = tp = 0
+for i, s in enumerate(clean):
+    audio = load_audio(s['path'])
+    hyp, pt = transcribe_batch(audio)
+    w = compute_wer(norm(s['reference']), norm(hyp))
+    wers.append(w); ta += s['duration']; tp += pt
+    if i < 3 or i % 20 == 0:
+        print(f"  [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%} | {hyp[:60]}", flush=True)
+clean_wer = np.mean(wers); clean_rtf = tp/ta
+print(f"  CLEAN: WER {clean_wer:.2%}, RTF {clean_rtf:.3f} ({len(clean)} samples, {ta:.0f}s)")
+
+# 2. LibriSpeech test-other
+print("\n=== Voxtral / LibriSpeech test-other ===", flush=True)
+other = json.load(open("/home/cloud/benchmark_data/metadata_other.json"))
+wers2 = []; ta2 = tp2 = 0
+for i, s in enumerate(other):
+    audio = load_audio(s['path'])
+    hyp, pt = transcribe_batch(audio)
+    w = compute_wer(norm(s['reference']), norm(hyp))
+    wers2.append(w); ta2 += s['duration']; tp2 += pt
+    if i < 3 or i % 20 == 0:
+        print(f"  [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%}", flush=True)
+other_wer = np.mean(wers2); other_rtf = tp2/ta2
+print(f"  OTHER: WER {other_wer:.2%}, RTF {other_rtf:.3f} ({len(other)} samples, {ta2:.0f}s)")
+
+# 3. ACL6060
+print("\n=== Voxtral / ACL6060 ===", flush=True)
+acl_results = []
+for talk in ["110", "117", "268", "367", "590"]:
+    audio = load_audio(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
+    dur = len(audio) / 16000
+    gw = []
+    with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
+        for line in f:
+            gw.append(json.loads(line)["text"].strip())
+    gold = " ".join(gw)
+
+    # For long audio, process in 30s chunks
+    all_hyp = []
+    t0 = time.perf_counter()
+    chunk_size = 30 * 16000
+    for start in range(0, len(audio), chunk_size):
+        chunk = audio[start:start + chunk_size]
+        if len(chunk) < 1600:  # skip very short tail
+            continue
+        hyp, _ = transcribe_batch(chunk)
+        all_hyp.append(hyp)
+    t1 = time.perf_counter()
+
+    full_hyp = " ".join(all_hyp)
+    w = compute_wer(norm(gold), norm(full_hyp))
+    rtf = (t1 - t0) / dur
+    acl_results.append({"talk": talk, "wer": w, "rtf": rtf, "dur": dur})
+    print(f"  Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}", flush=True)
+
+acl_wer = np.mean([r["wer"] for r in acl_results])
+acl_rtf = np.mean([r["rtf"] for r in acl_results])
+print(f"  ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
+
+# Summary
+print(f"\n{'='*60}")
+print(f"  VOXTRAL BENCHMARK SUMMARY (H100 80GB)")
+print(f"{'='*60}")
+print(f"  {'Dataset':>25} {'WER':>7} {'RTF':>7}")
+print(f"  {'-'*42}")
+print(f"  {'LibriSpeech clean':>25} {clean_wer:>6.2%} {clean_rtf:>7.3f}")
+print(f"  {'LibriSpeech other':>25} {other_wer:>6.2%} {other_rtf:>7.3f}")
+print(f"  {'ACL6060 (5 talks)':>25} {acl_wer:>6.2%} {acl_rtf:>7.3f}")
+
+results = {
+    "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3)},
+    "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3)},
+    "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3),
+                "talks": [{k: (round(float(v), 4) if isinstance(v, (float, np.floating)) else v) for k, v in r.items()} for r in acl_results]},
+}
+json.dump(results, open("/home/cloud/bench_voxtral_results.json", "w"), indent=2)
+print(f"\nSaved to /home/cloud/bench_voxtral_results.json")
--- a/benchmarks/h100/bench_voxtral_vllm_realtime.py
+++ b/benchmarks/h100/bench_voxtral_vllm_realtime.py
@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""Benchmark Voxtral via vLLM WebSocket /v1/realtime — proper streaming."""
+import asyncio, json, base64, time, wave, re, os
+import numpy as np
+import websockets
+import librosa
+from jiwer import wer as compute_wer
+
+MODEL = "mistralai/Voxtral-Mini-4B-Realtime-2602"
+WS_URI = "ws://localhost:8000/v1/realtime"
+
+def norm(t):
+    return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', t.lower())).strip()
+
+async def transcribe(audio_path, max_tokens=4096):
+    audio, _ = librosa.load(audio_path, sr=16000, mono=True)
+    pcm16 = (audio * 32767).astype(np.int16).tobytes()
+    dur = len(audio) / 16000
+
+    t0 = time.time()
+    transcript = ""
+    first_token_time = None
+
+    async with websockets.connect(WS_URI, max_size=2**24) as ws:
+        await ws.recv()  # session.created
+        await ws.send(json.dumps({"type": "session.update", "model": MODEL}))
+        await ws.send(json.dumps({"type": "input_audio_buffer.commit"}))  # signal ready
+
+        # Send audio in 4KB chunks
+        for i in range(0, len(pcm16), 4096):
+            await ws.send(json.dumps({
+                "type": "input_audio_buffer.append",
+                "audio": base64.b64encode(pcm16[i:i+4096]).decode(),
+            }))
+
+        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
+
+        while True:
+            try:
+                msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=120))
+                if msg["type"] == "transcription.delta":
+                    d = msg.get("delta", "")
+                    if d.strip() and first_token_time is None:
+                        first_token_time = time.time() - t0
+                    transcript += d
+                elif msg["type"] == "transcription.done":
+                    transcript = msg.get("text", transcript)
+                    break
+                elif msg["type"] == "error":
+                    break
+            except asyncio.TimeoutError:
+                break
+
+    elapsed = time.time() - t0
+    return transcript.strip(), dur, elapsed / dur, first_token_time or elapsed
+
+async def main():
+    # Warmup
+    print("Warmup...", flush=True)
+    await transcribe("/home/cloud/benchmark_data/librispeech_clean_0000.wav")
+
+    # LibriSpeech clean (full 91 samples)
+    print("\n=== Voxtral vLLM Realtime / LibriSpeech clean ===", flush=True)
+    clean = json.load(open("/home/cloud/benchmark_data/metadata.json"))
+    wers = []; ta = tp = 0
+    for i, s in enumerate(clean):
+        hyp, dur, rtf, fwl = await transcribe(s['path'])
+        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
+        wers.append(w); ta += dur; tp += dur * rtf
+        if i < 3 or i % 20 == 0:
+            print(f"  [{i}] {dur:.1f}s RTF={rtf:.3f} FWL={fwl:.2f}s WER={w:.1%} | {hyp[:60]}", flush=True)
+    clean_wer = np.mean(wers); clean_rtf = tp / ta
+    print(f"  CLEAN ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}\n", flush=True)
+
+    # LibriSpeech other (full 133 samples)
+    print("=== Voxtral vLLM Realtime / LibriSpeech other ===", flush=True)
+    other = json.load(open("/home/cloud/benchmark_data/metadata_other.json"))
+    wers2 = []; ta2 = tp2 = 0
+    for i, s in enumerate(other):
+        hyp, dur, rtf, fwl = await transcribe(s['path'])
+        w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0
+        wers2.append(w); ta2 += dur; tp2 += dur * rtf
+        if i < 3 or i % 20 == 0:
+            print(f"  [{i}] {dur:.1f}s RTF={rtf:.3f} WER={w:.1%}", flush=True)
+    other_wer = np.mean(wers2); other_rtf = tp2 / ta2
+    print(f"  OTHER ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}\n", flush=True)
+
+    # ACL6060 talks
+    print("=== Voxtral vLLM Realtime / ACL6060 ===", flush=True)
+    acl = []
+    for talk in ["110", "117", "268", "367", "590"]:
+        gw = []
+        with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f:
+            for line in f: gw.append(json.loads(line)["text"].strip())
+        gold = " ".join(gw)
+
+        hyp, dur, rtf, fwl = await transcribe(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav")
+        w = compute_wer(norm(gold), norm(hyp)) if hyp else 1.0
+        acl.append({"talk": talk, "wer": round(float(w),4), "rtf": round(float(rtf),3), "dur": round(dur,1)})
+        print(f"  Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}, FWL {fwl:.2f}s", flush=True)
+
+    acl_wer = np.mean([r["wer"] for r in acl])
+    acl_rtf = np.mean([r["rtf"] for r in acl])
+    print(f"  ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}\n", flush=True)
+
+    # Summary
+    print(f"{'='*55}")
+    print(f"  VOXTRAL vLLM REALTIME BENCHMARK (H100)")
+    print(f"{'='*55}")
+    print(f"  LS clean ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}")
+    print(f"  LS other ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}")
+    print(f"  ACL6060 (5):     WER {acl_wer:.2%}, RTF {acl_rtf:.3f}")
+
+    results = {
+        "clean": {"avg_wer": round(float(clean_wer),4), "rtf": round(float(clean_rtf),3), "n": len(clean)},
+        "other": {"avg_wer": round(float(other_wer),4), "rtf": round(float(other_rtf),3), "n": len(other)},
+        "acl6060": {"avg_wer": round(float(acl_wer),4), "avg_rtf": round(float(acl_rtf),3), "talks": acl},
+    }
+    json.dump(results, open("/home/cloud/bench_voxtral_realtime_results.json", "w"), indent=2)
+    print(f"\n  Saved to /home/cloud/bench_voxtral_realtime_results.json")
+
+asyncio.run(main())
--- a/benchmarks/h100/generate_figures.py
+++ b/benchmarks/h100/generate_figures.py
@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+"""
+Generate polished benchmark figures for WhisperLiveKit H100 results.
+
+Reads data from results.json, outputs PNGs to this directory.
+Run: python3 benchmarks/h100/generate_figures.py
+"""
+import json
+import os
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+
+DIR = os.path.dirname(os.path.abspath(__file__))
+DATA = json.load(open(os.path.join(DIR, "results.json")))
+
+# ── Style constants ──
+# Hex colors keyed by short system-family names; the figure functions look
+# them up by key.
+COLORS = {
+    "whisper":  "#d63031",
+    "qwen_b":   "#6c5ce7",
+    "qwen_s":   "#00b894",
+    "voxtral":  "#fdcb6e",
+    "fw_m5":    "#74b9ff",
+    "mlx_m5":   "#55efc4",
+    "vox_m5":   "#ffeaa7",
+}
+# Global matplotlib defaults: sans-serif 11 pt text, top/right spines hidden.
+plt.rcParams.update({
+    "font.family": "sans-serif",
+    "font.size": 11,
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+})
+
+
+def _save(fig, name):
+    path = os.path.join(DIR, name)
+    fig.savefig(path, dpi=180, bbox_inches="tight", facecolor="white")
+    plt.close(fig)
+    print(f"  {name}")
+
+
+# ──────────────────────────────────────────────────────────
+# Figure 1: WER vs RTF scatter — H100 (LibriSpeech clean)
+# ──────────────────────────────────────────────────────────
+def fig_scatter_clean():
+    ls = DATA["librispeech_clean"]["systems"]
+    m5 = DATA["m5_reference"]["systems"]
+
+    fig, ax = plt.subplots(figsize=(9, 7.5))
+
+    ax.axhspan(0, 10, color="#f0fff0", alpha=0.5, zorder=0)
+
+    # M5 (ghost dots)
+    for k, v in m5.items():
+        ax.scatter(v["rtf"], v["wer"], s=50, c="silver", marker="o",
+                   alpha=0.22, zorder=2, linewidths=0.4, edgecolors="gray")
+
+    # H100 systems — (name, data, color, marker, size, label_x_off, label_y_off)
+    pts = [
+        ("Whisper large-v3",            ls["whisper_large_v3_batch"],     COLORS["whisper"], "h", 240, -8, -16),
+        ("Qwen3-ASR 0.6B (batch)",     ls["qwen3_0.6b_batch"],           COLORS["qwen_b"],  "h", 170,  8,   6),
+        ("Qwen3-ASR 1.7B (batch)",     ls["qwen3_1.7b_batch"],           COLORS["qwen_b"],  "h", 240,  8, -16),
+        ("Voxtral 4B (vLLM)",          ls["voxtral_4b_vllm_realtime"],   COLORS["voxtral"], "D", 260,  8,   6),
+        ("Qwen3 0.6B SimulStream+KV",  ls["qwen3_0.6b_simulstream_kv"], COLORS["qwen_s"],  "s", 220,  8,   6),
+        ("Qwen3 1.7B SimulStream+KV",  ls["qwen3_1.7b_simulstream_kv"], COLORS["qwen_s"],  "s", 280,  8,  -16),
+    ]
+
+    for name, d, color, marker, sz, lx, ly in pts:
+        ax.scatter(d["rtf"], d["wer"], s=sz, c=color, marker=marker,
+                   edgecolors="white", linewidths=1.5, zorder=5)
+        ax.annotate(name, (d["rtf"], d["wer"]), fontsize=8.5, fontweight="bold",
+                    xytext=(lx, ly), textcoords="offset points",
+                    arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.5))
+
+    ax.set_xlabel("RTF  (lower = faster)")
+    ax.set_ylabel("WER %  (lower = better)")
+    ax.set_title("Speed vs Accuracy  —  LibriSpeech test-clean  (H100 80 GB)",
+                 fontsize=13, fontweight="bold", pad=12)
+    ax.set_xlim(-0.005, 0.20)
+    ax.set_ylim(-0.3, 10)
+    ax.grid(True, alpha=0.12)
+
+    legend = [
+        mpatches.Patch(color=COLORS["whisper"], label="Whisper large-v3"),
+        mpatches.Patch(color=COLORS["qwen_b"],  label="Qwen3-ASR (batch)"),
+        mpatches.Patch(color=COLORS["qwen_s"],  label="Qwen3 SimulStream+KV"),
+        mpatches.Patch(color=COLORS["voxtral"], label="Voxtral 4B (vLLM)"),
+        plt.Line2D([0],[0], marker="h", color="w", mfc="gray", ms=8, label="Batch"),
+        plt.Line2D([0],[0], marker="s", color="w", mfc="gray", ms=8, label="Streaming"),
+    ]
+    ax.legend(handles=legend, fontsize=8.5, loc="upper right", framealpha=0.85, ncol=2)
+    _save(fig, "wer_vs_rtf_clean.png")
+
+
+# ──────────────────────────────────────────────────────────
+# Figure 2: ACL6060 conference talks — the realistic test
+# ──────────────────────────────────────────────────────────
+def fig_scatter_acl6060():
+    acl = DATA["acl6060"]["systems"]
+
+    fig, ax = plt.subplots(figsize=(10, 6.5))
+    ax.axhspan(0, 15, color="#f0fff0", alpha=0.4, zorder=0)
+
+    pts = [
+        ("Voxtral 4B\n(vLLM Realtime)",    acl["voxtral_4b_vllm_realtime"],  COLORS["voxtral"], "D", 380),
+        ("Qwen3 1.7B\nSimulStream+KV",     acl["qwen3_1.7b_simulstream_kv"], COLORS["qwen_s"],  "s", 380),
+        ("Qwen3 0.6B\nSimulStream+KV",     acl["qwen3_0.6b_simulstream_kv"], COLORS["qwen_s"],  "s", 260),
+        ("Whisper large-v3\n(batch)",       acl["whisper_large_v3_batch"],    COLORS["whisper"], "h", 320),
+    ]
+    label_off = [(10, -12), (10, 6), (10, 6), (10, 6)]
+
+    for (name, d, color, marker, sz), (lx, ly) in zip(pts, label_off):
+        wer = d["avg_wer"]; rtf = d["avg_rtf"]
+        ax.scatter(rtf, wer, s=sz, c=color, marker=marker,
+                   edgecolors="white", linewidths=1.5, zorder=5)
+        ax.annotate(name, (rtf, wer), fontsize=9.5, fontweight="bold",
+                    xytext=(lx, ly), textcoords="offset points",
+                    arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.6))
+
+    # Cascade annotation
+    ax.annotate("Full STT+MT cascade\nRTF 0.15 (real-time)",
+                xy=(0.151, 1), xytext=(0.25, 4),
+                fontsize=9, fontstyle="italic", color="#1565c0",
+                arrowprops=dict(arrowstyle="->", color="#1565c0", lw=1.5),
+                bbox=dict(boxstyle="round,pad=0.3", fc="#e3f2fd", ec="#90caf9", alpha=0.9))
+
+    ax.set_xlabel("RTF  (lower = faster)")
+    ax.set_ylabel("WER %  (lower = better)")
+    ax.set_title("ACL6060 Conference Talks  —  5 talks, 58 min  (H100 80 GB)",
+                 fontsize=13, fontweight="bold", pad=12)
+    ax.set_xlim(-0.005, 0.30)
+    ax.set_ylim(-1, 26)
+    ax.grid(True, alpha=0.12)
+    _save(fig, "wer_vs_rtf_acl6060.png")
+
+
+# ──────────────────────────────────────────────────────────
+# Figure 3: Bar chart — WER + RTF side-by-side
+# ──────────────────────────────────────────────────────────
+def fig_bars():
+    names = [
+        "Whisper\nlarge-v3", "Voxtral 4B\n(vLLM)", "Qwen3 0.6B\n(batch)",
+        "Qwen3 1.7B\n(batch)", "Qwen3 0.6B\nSimulStream", "Qwen3 1.7B\nSimulStream",
+    ]
+    wer_c = [2.02, 2.71, 2.30, 2.46, 6.44, 8.09]
+    wer_o = [7.79, 9.26, 6.12, 5.34, 9.27, 9.56]
+    rtf_c = [0.071, 0.137, 0.065, 0.069, 0.109, 0.117]
+    fwl   = [472, 137, 432, 457, 91, 94]  # ms
+    cols  = [COLORS["whisper"], COLORS["voxtral"], COLORS["qwen_b"],
+             COLORS["qwen_b"], COLORS["qwen_s"], COLORS["qwen_s"]]
+    cols_l = ["#ff7675", "#ffeaa7", "#a29bfe", "#a29bfe", "#55efc4", "#55efc4"]
+
+    x = np.arange(len(names))
+    fig, axes = plt.subplots(1, 3, figsize=(16, 6))
+
+    # WER
+    ax = axes[0]; w = 0.36
+    ax.bar(x - w/2, wer_c, w, color=cols, alpha=0.9, edgecolor="white", label="test-clean")
+    ax.bar(x + w/2, wer_o, w, color=cols_l, alpha=0.65, edgecolor="white", label="test-other")
+    ax.set_ylabel("WER %"); ax.set_title("Word Error Rate", fontweight="bold")
+    ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right")
+    ax.legend(fontsize=8); ax.grid(axis="y", alpha=0.15)
+    for i, v in enumerate(wer_c):
+        ax.text(i - w/2, v + 0.2, f"{v:.1f}", ha="center", fontsize=7, fontweight="bold")
+
+    # RTF
+    ax = axes[1]
+    ax.bar(x, rtf_c, 0.55, color=cols, alpha=0.9, edgecolor="white")
+    ax.set_ylabel("RTF  (lower = faster)"); ax.set_title("Real-Time Factor (test-clean)", fontweight="bold")
+    ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right")
+    ax.grid(axis="y", alpha=0.15)
+    for i, v in enumerate(rtf_c):
+        ax.text(i, v + 0.003, f"{v:.3f}", ha="center", fontsize=8, fontweight="bold")
+
+    # First-word latency
+    ax = axes[2]
+    ax.bar(x, fwl, 0.55, color=cols, alpha=0.9, edgecolor="white")
+    ax.set_ylabel("ms"); ax.set_title("First Word Latency", fontweight="bold")
+    ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right")
+    ax.grid(axis="y", alpha=0.15)
+    for i, v in enumerate(fwl):
+        ax.text(i, v + 8, f"{v}", ha="center", fontsize=8, fontweight="bold")
+
+    fig.suptitle("LibriSpeech Benchmark  —  H100 80 GB", fontsize=14, fontweight="bold")
+    plt.tight_layout()
+    _save(fig, "bars_wer_rtf_latency.png")
+
+
+# ──────────────────────────────────────────────────────────
+# Figure 4: Clean vs Other robustness
+# ──────────────────────────────────────────────────────────
+def fig_robustness():
+    models = [
+        ("Whisper large-v3",          2.02, 7.79, COLORS["whisper"], "h", 280),
+        ("Qwen3 0.6B (batch)",       2.30, 6.12, COLORS["qwen_b"],  "h", 180),
+        ("Qwen3 1.7B (batch)",       2.46, 5.34, COLORS["qwen_b"],  "h", 280),
+        ("Voxtral 4B (vLLM)",        2.71, 9.26, COLORS["voxtral"], "D", 280),
+        ("Qwen3 0.6B\nSimulStream",  6.44, 9.27, COLORS["qwen_s"],  "s", 240),
+        ("Qwen3 1.7B\nSimulStream",  8.09, 9.56, COLORS["qwen_s"],  "s", 300),
+    ]
+    # Manual label offsets — carefully placed to avoid overlap
+    offsets = [(-55, 10), (8, 10), (8, -18), (-55, -18), (-10, 12), (10, -18)]
+
+    fig, ax = plt.subplots(figsize=(8.5, 7))
+    ax.plot([0, 13], [0, 13], "--", color="#ccc", lw=1, zorder=1)
+    ax.fill_between([0, 13], [0, 13], [13, 13], color="#fff5f5", alpha=0.5, zorder=0)
+    ax.text(4, 11, "degrades more\non noisy audio", fontsize=9, color="#bbb", fontstyle="italic")
+
+    for (name, wc, wo, color, marker, sz), (lx, ly) in zip(models, offsets):
+        ax.scatter(wc, wo, s=sz, c=color, marker=marker,
+                   edgecolors="white", linewidths=1.5, zorder=5)
+        ax.annotate(name, (wc, wo), fontsize=8.5, fontweight="bold",
+                    xytext=(lx, ly), textcoords="offset points",
+                    arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.6))
+        deg = wo - wc
+        ax.annotate(f"+{deg:.1f}%", (wc, wo), fontsize=7, color="#999",
+                    xytext=(-6, -13), textcoords="offset points")
+
+    ax.set_xlabel("WER % on test-clean")
+    ax.set_ylabel("WER % on test-other")
+    ax.set_title("Clean vs Noisy Robustness  (H100 80 GB)", fontsize=13, fontweight="bold", pad=12)
+    ax.set_xlim(-0.3, 12); ax.set_ylim(-0.3, 12)
+    ax.set_aspect("equal"); ax.grid(True, alpha=0.12)
+    _save(fig, "robustness_clean_vs_other.png")
+
+
+# ──────────────────────────────────────────────────────────
+# Figure 5: ACL6060 per-talk breakdown (Qwen3 vs Voxtral)
+# ──────────────────────────────────────────────────────────
+def fig_per_talk():
+    q = DATA["acl6060"]["systems"]["qwen3_1.7b_simulstream_kv"]["per_talk"]
+    v = DATA["acl6060"]["systems"]["voxtral_4b_vllm_realtime"]["per_talk"]
+    talks = DATA["acl6060"]["talks"]
+
+    fig, ax = plt.subplots(figsize=(9, 5))
+    x = np.arange(len(talks)); w = 0.35
+
+    bars_v = ax.bar(x - w/2, [v[t] for t in talks], w, color=COLORS["voxtral"],
+                    edgecolor="white", label="Voxtral 4B (vLLM)")
+    bars_q = ax.bar(x + w/2, [q[t] for t in talks], w, color=COLORS["qwen_s"],
+                    edgecolor="white", label="Qwen3 1.7B SimulStream+KV")
+
+    for bar in bars_v:
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
+                f"{bar.get_height():.1f}", ha="center", fontsize=8)
+    for bar in bars_q:
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
+                f"{bar.get_height():.1f}", ha="center", fontsize=8)
+
+    ax.set_xlabel("ACL6060 Talk ID")
+    ax.set_ylabel("WER %")
+    ax.set_title("Per-Talk WER  —  ACL6060 Conference Talks  (H100 80 GB)",
+                 fontsize=13, fontweight="bold", pad=12)
+    ax.set_xticks(x); ax.set_xticklabels([f"Talk {t}" for t in talks])
+    ax.legend(fontsize=9); ax.grid(axis="y", alpha=0.15)
+    ax.set_ylim(0, 18)
+    _save(fig, "acl6060_per_talk.png")
+
+
+if __name__ == "__main__":
+    print("Generating H100 benchmark figures...")
+    fig_scatter_clean()
+    fig_scatter_acl6060()
+    fig_bars()
+    fig_robustness()
+    fig_per_talk()
+    print("Done!")
--- a/benchmarks/h100/results.json
+++ b/benchmarks/h100/results.json
@ -0,0 +1,56 @@
+{
+  "hardware": "NVIDIA H100 80GB HBM3, CUDA 12.4, Driver 550.163",
+  "date": "2026-03-15",
+
+  "librispeech_clean": {
+    "n_samples": 91,
+    "total_audio_s": 602,
+    "systems": {
+      "whisper_large_v3_batch":     {"wer": 2.02, "rtf": 0.071, "first_word_latency_s": 0.472},
+      "qwen3_0.6b_batch":          {"wer": 2.30, "rtf": 0.065, "first_word_latency_s": 0.432},
+      "qwen3_1.7b_batch":          {"wer": 2.46, "rtf": 0.069, "first_word_latency_s": 0.457},
+      "voxtral_4b_vllm_realtime":  {"wer": 2.71, "rtf": 0.137, "first_word_latency_s": 0.137},
+      "qwen3_0.6b_simulstream_kv": {"wer": 6.44, "rtf": 0.109, "first_word_latency_s": 0.091},
+      "qwen3_1.7b_simulstream_kv": {"wer": 8.09, "rtf": 0.117, "first_word_latency_s": 0.094}
+    }
+  },
+
+  "librispeech_other": {
+    "n_samples": 133,
+    "total_audio_s": 600,
+    "systems": {
+      "qwen3_1.7b_batch":          {"wer": 5.34, "rtf": 0.088},
+      "qwen3_0.6b_batch":          {"wer": 6.12, "rtf": 0.086},
+      "whisper_large_v3_batch":     {"wer": 7.79, "rtf": 0.092},
+      "qwen3_0.6b_simulstream_kv": {"wer": 9.27, "rtf": 0.127},
+      "voxtral_4b_vllm_realtime":  {"wer": 9.26, "rtf": 0.144},
+      "qwen3_1.7b_simulstream_kv": {"wer": 9.56, "rtf": 0.140}
+    }
+  },
+
+  "acl6060": {
+    "description": "5 ACL 2022 conference talks, 58 min total",
+    "talks": ["110", "117", "268", "367", "590"],
+    "systems": {
+      "voxtral_4b_vllm_realtime":  {"avg_wer": 7.83, "avg_rtf": 0.203, "per_talk": {"110": 5.18, "117": 2.24, "268": 14.88, "367": 9.40, "590": 7.45}},
+      "qwen3_1.7b_simulstream_kv": {"avg_wer": 9.20, "avg_rtf": 0.074, "per_talk": {"110": 5.59, "117": 8.12, "268": 12.25, "367": 12.29, "590": 7.77}},
+      "qwen3_0.6b_simulstream_kv": {"avg_wer": 13.21, "avg_rtf": 0.098},
+      "whisper_large_v3_batch":     {"avg_wer": 22.53, "avg_rtf": 0.125}
+    }
+  },
+
+  "m5_reference": {
+    "description": "MacBook M5 results (from WLK scatter benchmarks)",
+    "systems": {
+      "fw_la_base":    {"wer": 17.0, "rtf": 0.82},
+      "fw_la_small":   {"wer":  8.6, "rtf": 0.76},
+      "fw_ss_base":    {"wer":  7.8, "rtf": 0.46},
+      "fw_ss_small":   {"wer":  7.0, "rtf": 0.90},
+      "mlx_ss_base":   {"wer":  7.7, "rtf": 0.34},
+      "mlx_ss_small":  {"wer":  6.5, "rtf": 0.68},
+      "voxtral_mlx":   {"wer":  7.0, "rtf": 0.26},
+      "qwen3_mlx_0.6b":{"wer":  5.5, "rtf": 0.55},
+      "qwen3_0.6b_batch":{"wer":24.0, "rtf": 1.42}
+    }
+  }
+}
--- a/benchmarks/h100/robustness_clean_vs_other.png
+++ b/benchmarks/h100/robustness_clean_vs_other.png
--- a/benchmarks/h100/wer_vs_rtf_acl6060.png
+++ b/benchmarks/h100/wer_vs_rtf_acl6060.png
--- a/benchmarks/h100/wer_vs_rtf_clean.png
+++ b/benchmarks/h100/wer_vs_rtf_clean.png