diff --git a/benchmark_bars_h100.png b/benchmark_bars_h100.png deleted file mode 100644 index 31a8c9e..0000000 Binary files a/benchmark_bars_h100.png and /dev/null differ diff --git a/benchmark_latency_h100.png b/benchmark_latency_h100.png deleted file mode 100644 index 1923665..0000000 Binary files a/benchmark_latency_h100.png and /dev/null differ diff --git a/benchmark_robustness_h100.png b/benchmark_robustness_h100.png deleted file mode 100644 index 1d7b60b..0000000 Binary files a/benchmark_robustness_h100.png and /dev/null differ diff --git a/benchmark_scatter_acl6060_h100.png b/benchmark_scatter_acl6060_h100.png deleted file mode 100644 index 33795dd..0000000 Binary files a/benchmark_scatter_acl6060_h100.png and /dev/null differ diff --git a/benchmark_scatter_en_h100.png b/benchmark_scatter_en_h100.png deleted file mode 100644 index b1b0b55..0000000 Binary files a/benchmark_scatter_en_h100.png and /dev/null differ diff --git a/benchmark_scatter_h100.png b/benchmark_scatter_h100.png deleted file mode 100644 index b7f9efd..0000000 Binary files a/benchmark_scatter_h100.png and /dev/null differ diff --git a/benchmarks/h100/acl6060_per_talk.png b/benchmarks/h100/acl6060_per_talk.png new file mode 100644 index 0000000..fde5ae7 Binary files /dev/null and b/benchmarks/h100/acl6060_per_talk.png differ diff --git a/benchmarks/h100/bars_wer_rtf_latency.png b/benchmarks/h100/bars_wer_rtf_latency.png new file mode 100644 index 0000000..2bd6919 Binary files /dev/null and b/benchmarks/h100/bars_wer_rtf_latency.png differ diff --git a/benchmarks/h100/bench_voxtral_hf_batch.py b/benchmarks/h100/bench_voxtral_hf_batch.py new file mode 100644 index 0000000..75cd512 --- /dev/null +++ b/benchmarks/h100/bench_voxtral_hf_batch.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +"""Standalone Voxtral benchmark — no whisperlivekit imports.""" +import json, logging, re, time, wave, queue, threading +import numpy as np +import torch + +logging.basicConfig(level=logging.WARNING) +for n in ["transformers","torch","httpx"]: + logging.getLogger(n).setLevel(logging.ERROR) + +from jiwer import wer as compute_wer +from transformers import AutoProcessor, VoxtralRealtimeForConditionalGeneration, TextIteratorStreamer + +def norm(t): + return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', t.lower())).strip() + +def load_audio(path): + with wave.open(path, 'r') as wf: + return np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0 + +# Load model +print("Loading Voxtral-Mini-4B...", flush=True) +MODEL_ID = "mistralai/Voxtral-Mini-4B-Realtime-2602" +processor = AutoProcessor.from_pretrained(MODEL_ID) +model = VoxtralRealtimeForConditionalGeneration.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda:0", +) +print(f"Loaded, GPU: {torch.cuda.memory_allocated()/1e9:.1f} GB", flush=True) + +def transcribe_batch(audio_np): + """Simple batch transcription (not streaming).""" + # Voxtral expects audio as input_features from processor + inputs = processor( + audio=audio_np, sampling_rate=16000, return_tensors="pt", + ).to("cuda:0").to(torch.bfloat16) + + t0 = time.perf_counter() + with torch.inference_mode(): + generated = model.generate(**inputs, max_new_tokens=1024) + t1 = time.perf_counter() + + text = processor.batch_decode(generated, skip_special_tokens=True)[0].strip() + return text, t1 - t0 + +# 1. LibriSpeech test-clean +print("\n=== Voxtral / LibriSpeech test-clean ===", flush=True) +clean = json.load(open("/home/cloud/benchmark_data/metadata.json")) +wers = []; ta = tp = 0 +for i, s in enumerate(clean): + audio = load_audio(s['path']) + hyp, pt = transcribe_batch(audio) + w = compute_wer(norm(s['reference']), norm(hyp)) + wers.append(w); ta += s['duration']; tp += pt + if i < 3 or i % 20 == 0: + print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%} | {hyp[:60]}", flush=True) +clean_wer = np.mean(wers); clean_rtf = tp/ta +print(f" CLEAN: WER {clean_wer:.2%}, RTF {clean_rtf:.3f} ({len(clean)} samples, {ta:.0f}s)") + +# 2. LibriSpeech test-other +print("\n=== Voxtral / LibriSpeech test-other ===", flush=True) +other = json.load(open("/home/cloud/benchmark_data/metadata_other.json")) +wers2 = []; ta2 = tp2 = 0 +for i, s in enumerate(other): + audio = load_audio(s['path']) + hyp, pt = transcribe_batch(audio) + w = compute_wer(norm(s['reference']), norm(hyp)) + wers2.append(w); ta2 += s['duration']; tp2 += pt + if i < 3 or i % 20 == 0: + print(f" [{i}] {s['duration']:.1f}s RTF={pt/s['duration']:.2f} WER={w:.1%}", flush=True) +other_wer = np.mean(wers2); other_rtf = tp2/ta2 +print(f" OTHER: WER {other_wer:.2%}, RTF {other_rtf:.3f} ({len(other)} samples, {ta2:.0f}s)") + +# 3. ACL6060 +print("\n=== Voxtral / ACL6060 ===", flush=True) +acl_results = [] +for talk in ["110", "117", "268", "367", "590"]: + audio = load_audio(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav") + dur = len(audio) / 16000 + gw = [] + with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f: + for line in f: + gw.append(json.loads(line)["text"].strip()) + gold = " ".join(gw) + + # For long audio, process in 30s chunks + all_hyp = [] + t0 = time.perf_counter() + chunk_size = 30 * 16000 + for start in range(0, len(audio), chunk_size): + chunk = audio[start:start + chunk_size] + if len(chunk) < 1600: # skip very short tail + continue + hyp, _ = transcribe_batch(chunk) + all_hyp.append(hyp) + t1 = time.perf_counter() + + full_hyp = " ".join(all_hyp) + w = compute_wer(norm(gold), norm(full_hyp)) + rtf = (t1 - t0) / dur + acl_results.append({"talk": talk, "wer": w, "rtf": rtf, "dur": dur}) + print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}", flush=True) + +acl_wer = np.mean([r["wer"] for r in acl_results]) +acl_rtf = np.mean([r["rtf"] for r in acl_results]) +print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}") + +# Summary +print(f"\n{'='*60}") +print(f" VOXTRAL BENCHMARK SUMMARY (H100 80GB)") +print(f"{'='*60}") +print(f" {'Dataset':>25} {'WER':>7} {'RTF':>7}") +print(f" {'-'*42}") +print(f" {'LibriSpeech clean':>25} {clean_wer:>6.2%} {clean_rtf:>7.3f}") +print(f" {'LibriSpeech other':>25} {other_wer:>6.2%} {other_rtf:>7.3f}") +print(f" {'ACL6060 (5 talks)':>25} {acl_wer:>6.2%} {acl_rtf:>7.3f}") + +results = { + "clean": {"avg_wer": round(float(clean_wer), 4), "rtf": round(float(clean_rtf), 3)}, + "other": {"avg_wer": round(float(other_wer), 4), "rtf": round(float(other_rtf), 3)}, + "acl6060": {"avg_wer": round(float(acl_wer), 4), "avg_rtf": round(float(acl_rtf), 3), + "talks": [{k: (round(float(v), 4) if isinstance(v, (float, np.floating)) else v) for k, v in r.items()} for r in acl_results]}, +} +json.dump(results, open("/home/cloud/bench_voxtral_results.json", "w"), indent=2) +print(f"\nSaved to /home/cloud/bench_voxtral_results.json") diff --git a/benchmarks/h100/bench_voxtral_vllm_realtime.py b/benchmarks/h100/bench_voxtral_vllm_realtime.py new file mode 100644 index 0000000..fc87bac --- /dev/null +++ b/benchmarks/h100/bench_voxtral_vllm_realtime.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Benchmark Voxtral via vLLM WebSocket /v1/realtime — proper streaming.""" +import asyncio, json, base64, time, wave, re, os +import numpy as np +import websockets +import librosa +from jiwer import wer as compute_wer + +MODEL = "mistralai/Voxtral-Mini-4B-Realtime-2602" +WS_URI = "ws://localhost:8000/v1/realtime" + +def norm(t): + return re.sub(r' +', ' ', re.sub(r'[^a-z0-9 ]', ' ', t.lower())).strip() + +async def transcribe(audio_path, max_tokens=4096): + audio, _ = librosa.load(audio_path, sr=16000, mono=True) + pcm16 = (audio * 32767).astype(np.int16).tobytes() + dur = len(audio) / 16000 + + t0 = time.time() + transcript = "" + first_token_time = None + + async with websockets.connect(WS_URI, max_size=2**24) as ws: + await ws.recv() # session.created + await ws.send(json.dumps({"type": "session.update", "model": MODEL})) + await ws.send(json.dumps({"type": "input_audio_buffer.commit"})) # signal ready + + # Send audio in 4KB chunks + for i in range(0, len(pcm16), 4096): + await ws.send(json.dumps({ + "type": "input_audio_buffer.append", + "audio": base64.b64encode(pcm16[i:i+4096]).decode(), + })) + + await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True})) + + while True: + try: + msg = json.loads(await asyncio.wait_for(ws.recv(), timeout=120)) + if msg["type"] == "transcription.delta": + d = msg.get("delta", "") + if d.strip() and first_token_time is None: + first_token_time = time.time() - t0 + transcript += d + elif msg["type"] == "transcription.done": + transcript = msg.get("text", transcript) + break + elif msg["type"] == "error": + break + except asyncio.TimeoutError: + break + + elapsed = time.time() - t0 + return transcript.strip(), dur, elapsed / dur, first_token_time or elapsed + +async def main(): + # Warmup + print("Warmup...", flush=True) + await transcribe("/home/cloud/benchmark_data/librispeech_clean_0000.wav") + + # LibriSpeech clean (full 91 samples) + print("\n=== Voxtral vLLM Realtime / LibriSpeech clean ===", flush=True) + clean = json.load(open("/home/cloud/benchmark_data/metadata.json")) + wers = []; ta = tp = 0 + for i, s in enumerate(clean): + hyp, dur, rtf, fwl = await transcribe(s['path']) + w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0 + wers.append(w); ta += dur; tp += dur * rtf + if i < 3 or i % 20 == 0: + print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} FWL={fwl:.2f}s WER={w:.1%} | {hyp[:60]}", flush=True) + clean_wer = np.mean(wers); clean_rtf = tp / ta + print(f" CLEAN ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}\n", flush=True) + + # LibriSpeech other (full 133 samples) + print("=== Voxtral vLLM Realtime / LibriSpeech other ===", flush=True) + other = json.load(open("/home/cloud/benchmark_data/metadata_other.json")) + wers2 = []; ta2 = tp2 = 0 + for i, s in enumerate(other): + hyp, dur, rtf, fwl = await transcribe(s['path']) + w = compute_wer(norm(s['reference']), norm(hyp)) if hyp else 1.0 + wers2.append(w); ta2 += dur; tp2 += dur * rtf + if i < 3 or i % 20 == 0: + print(f" [{i}] {dur:.1f}s RTF={rtf:.3f} WER={w:.1%}", flush=True) + other_wer = np.mean(wers2); other_rtf = tp2 / ta2 + print(f" OTHER ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}\n", flush=True) + + # ACL6060 talks + print("=== Voxtral vLLM Realtime / ACL6060 ===", flush=True) + acl = [] + for talk in ["110", "117", "268", "367", "590"]: + gw = [] + with open(f"/home/cloud/iwslt26-sst/inputs/en/acl6060.ts/gold-jsonl/2022.acl-long.{talk}.jsonl") as f: + for line in f: gw.append(json.loads(line)["text"].strip()) + gold = " ".join(gw) + + hyp, dur, rtf, fwl = await transcribe(f"/home/cloud/acl6060_audio/2022.acl-long.{talk}.wav") + w = compute_wer(norm(gold), norm(hyp)) if hyp else 1.0 + acl.append({"talk": talk, "wer": round(float(w),4), "rtf": round(float(rtf),3), "dur": round(dur,1)}) + print(f" Talk {talk}: {dur:.0f}s, WER {w:.2%}, RTF {rtf:.3f}, FWL {fwl:.2f}s", flush=True) + + acl_wer = np.mean([r["wer"] for r in acl]) + acl_rtf = np.mean([r["rtf"] for r in acl]) + print(f" ACL6060 AVERAGE: WER {acl_wer:.2%}, RTF {acl_rtf:.3f}\n", flush=True) + + # Summary + print(f"{'='*55}") + print(f" VOXTRAL vLLM REALTIME BENCHMARK (H100)") + print(f"{'='*55}") + print(f" LS clean ({len(clean)}): WER {clean_wer:.2%}, RTF {clean_rtf:.3f}") + print(f" LS other ({len(other)}): WER {other_wer:.2%}, RTF {other_rtf:.3f}") + print(f" ACL6060 (5): WER {acl_wer:.2%}, RTF {acl_rtf:.3f}") + + results = { + "clean": {"avg_wer": round(float(clean_wer),4), "rtf": round(float(clean_rtf),3), "n": len(clean)}, + "other": {"avg_wer": round(float(other_wer),4), "rtf": round(float(other_rtf),3), "n": len(other)}, + "acl6060": {"avg_wer": round(float(acl_wer),4), "avg_rtf": round(float(acl_rtf),3), "talks": acl}, + } + json.dump(results, open("/home/cloud/bench_voxtral_realtime_results.json", "w"), indent=2) + print(f"\n Saved to /home/cloud/bench_voxtral_realtime_results.json") + +asyncio.run(main()) diff --git a/benchmarks/h100/generate_figures.py b/benchmarks/h100/generate_figures.py new file mode 100644 index 0000000..fe28c95 --- /dev/null +++ b/benchmarks/h100/generate_figures.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +""" +Generate polished benchmark figures for WhisperLiveKit H100 results. + +Reads data from results.json, outputs PNGs to this directory. +Run: python3 benchmarks/h100/generate_figures.py +""" +import json +import os + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np + +DIR = os.path.dirname(os.path.abspath(__file__)) +DATA = json.load(open(os.path.join(DIR, "results.json"))) + +# ── Style constants ── +COLORS = { + "whisper": "#d63031", + "qwen_b": "#6c5ce7", + "qwen_s": "#00b894", + "voxtral": "#fdcb6e", + "fw_m5": "#74b9ff", + "mlx_m5": "#55efc4", + "vox_m5": "#ffeaa7", +} +plt.rcParams.update({ + "font.family": "sans-serif", + "font.size": 11, + "axes.spines.top": False, + "axes.spines.right": False, +}) + + +def _save(fig, name): + path = os.path.join(DIR, name) + fig.savefig(path, dpi=180, bbox_inches="tight", facecolor="white") + plt.close(fig) + print(f" {name}") + + +# ────────────────────────────────────────────────────────── +# Figure 1: WER vs RTF scatter — H100 (LibriSpeech clean) +# ────────────────────────────────────────────────────────── +def fig_scatter_clean(): + ls = DATA["librispeech_clean"]["systems"] + m5 = DATA["m5_reference"]["systems"] + + fig, ax = plt.subplots(figsize=(9, 7.5)) + + ax.axhspan(0, 10, color="#f0fff0", alpha=0.5, zorder=0) + + # M5 (ghost dots) + for k, v in m5.items(): + ax.scatter(v["rtf"], v["wer"], s=50, c="silver", marker="o", + alpha=0.22, zorder=2, linewidths=0.4, edgecolors="gray") + + # H100 systems — (name, data, color, marker, size, label_x_off, label_y_off) + pts = [ + ("Whisper large-v3", ls["whisper_large_v3_batch"], COLORS["whisper"], "h", 240, -8, -16), + ("Qwen3-ASR 0.6B (batch)", ls["qwen3_0.6b_batch"], COLORS["qwen_b"], "h", 170, 8, 6), + ("Qwen3-ASR 1.7B (batch)", ls["qwen3_1.7b_batch"], COLORS["qwen_b"], "h", 240, 8, -16), + ("Voxtral 4B (vLLM)", ls["voxtral_4b_vllm_realtime"], COLORS["voxtral"], "D", 260, 8, 6), + ("Qwen3 0.6B SimulStream+KV", ls["qwen3_0.6b_simulstream_kv"], COLORS["qwen_s"], "s", 220, 8, 6), + ("Qwen3 1.7B SimulStream+KV", ls["qwen3_1.7b_simulstream_kv"], COLORS["qwen_s"], "s", 280, 8, -16), + ] + + for name, d, color, marker, sz, lx, ly in pts: + ax.scatter(d["rtf"], d["wer"], s=sz, c=color, marker=marker, + edgecolors="white", linewidths=1.5, zorder=5) + ax.annotate(name, (d["rtf"], d["wer"]), fontsize=8.5, fontweight="bold", + xytext=(lx, ly), textcoords="offset points", + arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.5)) + + ax.set_xlabel("RTF (lower = faster)") + ax.set_ylabel("WER % (lower = better)") + ax.set_title("Speed vs Accuracy — LibriSpeech test-clean (H100 80 GB)", + fontsize=13, fontweight="bold", pad=12) + ax.set_xlim(-0.005, 0.20) + ax.set_ylim(-0.3, 10) + ax.grid(True, alpha=0.12) + + legend = [ + mpatches.Patch(color=COLORS["whisper"], label="Whisper large-v3"), + mpatches.Patch(color=COLORS["qwen_b"], label="Qwen3-ASR (batch)"), + mpatches.Patch(color=COLORS["qwen_s"], label="Qwen3 SimulStream+KV"), + mpatches.Patch(color=COLORS["voxtral"], label="Voxtral 4B (vLLM)"), + plt.Line2D([0],[0], marker="h", color="w", mfc="gray", ms=8, label="Batch"), + plt.Line2D([0],[0], marker="s", color="w", mfc="gray", ms=8, label="Streaming"), + ] + ax.legend(handles=legend, fontsize=8.5, loc="upper right", framealpha=0.85, ncol=2) + _save(fig, "wer_vs_rtf_clean.png") + + +# ────────────────────────────────────────────────────────── +# Figure 2: ACL6060 conference talks — the realistic test +# ────────────────────────────────────────────────────────── +def fig_scatter_acl6060(): + acl = DATA["acl6060"]["systems"] + + fig, ax = plt.subplots(figsize=(10, 6.5)) + ax.axhspan(0, 15, color="#f0fff0", alpha=0.4, zorder=0) + + pts = [ + ("Voxtral 4B\n(vLLM Realtime)", acl["voxtral_4b_vllm_realtime"], COLORS["voxtral"], "D", 380), + ("Qwen3 1.7B\nSimulStream+KV", acl["qwen3_1.7b_simulstream_kv"], COLORS["qwen_s"], "s", 380), + ("Qwen3 0.6B\nSimulStream+KV", acl["qwen3_0.6b_simulstream_kv"], COLORS["qwen_s"], "s", 260), + ("Whisper large-v3\n(batch)", acl["whisper_large_v3_batch"], COLORS["whisper"], "h", 320), + ] + label_off = [(10, -12), (10, 6), (10, 6), (10, 6)] + + for (name, d, color, marker, sz), (lx, ly) in zip(pts, label_off): + wer = d["avg_wer"]; rtf = d["avg_rtf"] + ax.scatter(rtf, wer, s=sz, c=color, marker=marker, + edgecolors="white", linewidths=1.5, zorder=5) + ax.annotate(name, (rtf, wer), fontsize=9.5, fontweight="bold", + xytext=(lx, ly), textcoords="offset points", + arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.6)) + + # Cascade annotation + ax.annotate("Full STT+MT cascade\nRTF 0.15 (real-time)", + xy=(0.151, 1), xytext=(0.25, 4), + fontsize=9, fontstyle="italic", color="#1565c0", + arrowprops=dict(arrowstyle="->", color="#1565c0", lw=1.5), + bbox=dict(boxstyle="round,pad=0.3", fc="#e3f2fd", ec="#90caf9", alpha=0.9)) + + ax.set_xlabel("RTF (lower = faster)") + ax.set_ylabel("WER % (lower = better)") + ax.set_title("ACL6060 Conference Talks — 5 talks, 58 min (H100 80 GB)", + fontsize=13, fontweight="bold", pad=12) + ax.set_xlim(-0.005, 0.30) + ax.set_ylim(-1, 26) + ax.grid(True, alpha=0.12) + _save(fig, "wer_vs_rtf_acl6060.png") + + +# ────────────────────────────────────────────────────────── +# Figure 3: Bar chart — WER + RTF side-by-side +# ────────────────────────────────────────────────────────── +def fig_bars(): + names = [ + "Whisper\nlarge-v3", "Voxtral 4B\n(vLLM)", "Qwen3 0.6B\n(batch)", + "Qwen3 1.7B\n(batch)", "Qwen3 0.6B\nSimulStream", "Qwen3 1.7B\nSimulStream", + ] + wer_c = [2.02, 2.71, 2.30, 2.46, 6.44, 8.09] + wer_o = [7.79, 9.26, 6.12, 5.34, 9.27, 9.56] + rtf_c = [0.071, 0.137, 0.065, 0.069, 0.109, 0.117] + fwl = [472, 137, 432, 457, 91, 94] # ms + cols = [COLORS["whisper"], COLORS["voxtral"], COLORS["qwen_b"], + COLORS["qwen_b"], COLORS["qwen_s"], COLORS["qwen_s"]] + cols_l = ["#ff7675", "#ffeaa7", "#a29bfe", "#a29bfe", "#55efc4", "#55efc4"] + + x = np.arange(len(names)) + fig, axes = plt.subplots(1, 3, figsize=(16, 6)) + + # WER + ax = axes[0]; w = 0.36 + ax.bar(x - w/2, wer_c, w, color=cols, alpha=0.9, edgecolor="white", label="test-clean") + ax.bar(x + w/2, wer_o, w, color=cols_l, alpha=0.65, edgecolor="white", label="test-other") + ax.set_ylabel("WER %"); ax.set_title("Word Error Rate", fontweight="bold") + ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right") + ax.legend(fontsize=8); ax.grid(axis="y", alpha=0.15) + for i, v in enumerate(wer_c): + ax.text(i - w/2, v + 0.2, f"{v:.1f}", ha="center", fontsize=7, fontweight="bold") + + # RTF + ax = axes[1] + ax.bar(x, rtf_c, 0.55, color=cols, alpha=0.9, edgecolor="white") + ax.set_ylabel("RTF (lower = faster)"); ax.set_title("Real-Time Factor (test-clean)", fontweight="bold") + ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right") + ax.grid(axis="y", alpha=0.15) + for i, v in enumerate(rtf_c): + ax.text(i, v + 0.003, f"{v:.3f}", ha="center", fontsize=8, fontweight="bold") + + # First-word latency + ax = axes[2] + ax.bar(x, fwl, 0.55, color=cols, alpha=0.9, edgecolor="white") + ax.set_ylabel("ms"); ax.set_title("First Word Latency", fontweight="bold") + ax.set_xticks(x); ax.set_xticklabels(names, fontsize=7.5, rotation=25, ha="right") + ax.grid(axis="y", alpha=0.15) + for i, v in enumerate(fwl): + ax.text(i, v + 8, f"{v}", ha="center", fontsize=8, fontweight="bold") + + fig.suptitle("LibriSpeech Benchmark — H100 80 GB", fontsize=14, fontweight="bold") + plt.tight_layout() + _save(fig, "bars_wer_rtf_latency.png") + + +# ────────────────────────────────────────────────────────── +# Figure 4: Clean vs Other robustness +# ────────────────────────────────────────────────────────── +def fig_robustness(): + models = [ + ("Whisper large-v3", 2.02, 7.79, COLORS["whisper"], "h", 280), + ("Qwen3 0.6B (batch)", 2.30, 6.12, COLORS["qwen_b"], "h", 180), + ("Qwen3 1.7B (batch)", 2.46, 5.34, COLORS["qwen_b"], "h", 280), + ("Voxtral 4B (vLLM)", 2.71, 9.26, COLORS["voxtral"], "D", 280), + ("Qwen3 0.6B\nSimulStream", 6.44, 9.27, COLORS["qwen_s"], "s", 240), + ("Qwen3 1.7B\nSimulStream", 8.09, 9.56, COLORS["qwen_s"], "s", 300), + ] + # Manual label offsets — carefully placed to avoid overlap + offsets = [(-55, 10), (8, 10), (8, -18), (-55, -18), (-10, 12), (10, -18)] + + fig, ax = plt.subplots(figsize=(8.5, 7)) + ax.plot([0, 13], [0, 13], "--", color="#ccc", lw=1, zorder=1) + ax.fill_between([0, 13], [0, 13], [13, 13], color="#fff5f5", alpha=0.5, zorder=0) + ax.text(4, 11, "degrades more\non noisy audio", fontsize=9, color="#bbb", fontstyle="italic") + + for (name, wc, wo, color, marker, sz), (lx, ly) in zip(models, offsets): + ax.scatter(wc, wo, s=sz, c=color, marker=marker, + edgecolors="white", linewidths=1.5, zorder=5) + ax.annotate(name, (wc, wo), fontsize=8.5, fontweight="bold", + xytext=(lx, ly), textcoords="offset points", + arrowprops=dict(arrowstyle="-", color="#aaa", lw=0.6)) + deg = wo - wc + ax.annotate(f"+{deg:.1f}%", (wc, wo), fontsize=7, color="#999", + xytext=(-6, -13), textcoords="offset points") + + ax.set_xlabel("WER % on test-clean") + ax.set_ylabel("WER % on test-other") + ax.set_title("Clean vs Noisy Robustness (H100 80 GB)", fontsize=13, fontweight="bold", pad=12) + ax.set_xlim(-0.3, 12); ax.set_ylim(-0.3, 12) + ax.set_aspect("equal"); ax.grid(True, alpha=0.12) + _save(fig, "robustness_clean_vs_other.png") + + +# ────────────────────────────────────────────────────────── +# Figure 5: ACL6060 per-talk breakdown (Qwen3 vs Voxtral) +# ────────────────────────────────────────────────────────── +def fig_per_talk(): + q = DATA["acl6060"]["systems"]["qwen3_1.7b_simulstream_kv"]["per_talk"] + v = DATA["acl6060"]["systems"]["voxtral_4b_vllm_realtime"]["per_talk"] + talks = DATA["acl6060"]["talks"] + + fig, ax = plt.subplots(figsize=(9, 5)) + x = np.arange(len(talks)); w = 0.35 + + bars_v = ax.bar(x - w/2, [v[t] for t in talks], w, color=COLORS["voxtral"], + edgecolor="white", label="Voxtral 4B (vLLM)") + bars_q = ax.bar(x + w/2, [q[t] for t in talks], w, color=COLORS["qwen_s"], + edgecolor="white", label="Qwen3 1.7B SimulStream+KV") + + for bar in bars_v: + ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3, + f"{bar.get_height():.1f}", ha="center", fontsize=8) + for bar in bars_q: + ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3, + f"{bar.get_height():.1f}", ha="center", fontsize=8) + + ax.set_xlabel("ACL6060 Talk ID") + ax.set_ylabel("WER %") + ax.set_title("Per-Talk WER — ACL6060 Conference Talks (H100 80 GB)", + fontsize=13, fontweight="bold", pad=12) + ax.set_xticks(x); ax.set_xticklabels([f"Talk {t}" for t in talks]) + ax.legend(fontsize=9); ax.grid(axis="y", alpha=0.15) + ax.set_ylim(0, 18) + _save(fig, "acl6060_per_talk.png") + + +if __name__ == "__main__": + print("Generating H100 benchmark figures...") + fig_scatter_clean() + fig_scatter_acl6060() + fig_bars() + fig_robustness() + fig_per_talk() + print("Done!") diff --git a/benchmarks/h100/results.json b/benchmarks/h100/results.json new file mode 100644 index 0000000..1fb2582 --- /dev/null +++ b/benchmarks/h100/results.json @@ -0,0 +1,56 @@ +{ + "hardware": "NVIDIA H100 80GB HBM3, CUDA 12.4, Driver 550.163", + "date": "2026-03-15", + + "librispeech_clean": { + "n_samples": 91, + "total_audio_s": 602, + "systems": { + "whisper_large_v3_batch": {"wer": 2.02, "rtf": 0.071, "first_word_latency_s": 0.472}, + "qwen3_0.6b_batch": {"wer": 2.30, "rtf": 0.065, "first_word_latency_s": 0.432}, + "qwen3_1.7b_batch": {"wer": 2.46, "rtf": 0.069, "first_word_latency_s": 0.457}, + "voxtral_4b_vllm_realtime": {"wer": 2.71, "rtf": 0.137, "first_word_latency_s": 0.137}, + "qwen3_0.6b_simulstream_kv": {"wer": 6.44, "rtf": 0.109, "first_word_latency_s": 0.091}, + "qwen3_1.7b_simulstream_kv": {"wer": 8.09, "rtf": 0.117, "first_word_latency_s": 0.094} + } + }, + + "librispeech_other": { + "n_samples": 133, + "total_audio_s": 600, + "systems": { + "qwen3_1.7b_batch": {"wer": 5.34, "rtf": 0.088}, + "qwen3_0.6b_batch": {"wer": 6.12, "rtf": 0.086}, + "whisper_large_v3_batch": {"wer": 7.79, "rtf": 0.092}, + "qwen3_0.6b_simulstream_kv": {"wer": 9.27, "rtf": 0.127}, + "voxtral_4b_vllm_realtime": {"wer": 9.26, "rtf": 0.144}, + "qwen3_1.7b_simulstream_kv": {"wer": 9.56, "rtf": 0.140} + } + }, + + "acl6060": { + "description": "5 ACL 2022 conference talks, 58 min total", + "talks": ["110", "117", "268", "367", "590"], + "systems": { + "voxtral_4b_vllm_realtime": {"avg_wer": 7.83, "avg_rtf": 0.203, "per_talk": {"110": 5.18, "117": 2.24, "268": 14.88, "367": 9.40, "590": 7.45}}, + "qwen3_1.7b_simulstream_kv": {"avg_wer": 9.20, "avg_rtf": 0.074, "per_talk": {"110": 5.59, "117": 8.12, "268": 12.25, "367": 12.29, "590": 7.77}}, + "qwen3_0.6b_simulstream_kv": {"avg_wer": 13.21, "avg_rtf": 0.098}, + "whisper_large_v3_batch": {"avg_wer": 22.53, "avg_rtf": 0.125} + } + }, + + "m5_reference": { + "description": "MacBook M5 results (from WLK scatter benchmarks)", + "systems": { + "fw_la_base": {"wer": 17.0, "rtf": 0.82}, + "fw_la_small": {"wer": 8.6, "rtf": 0.76}, + "fw_ss_base": {"wer": 7.8, "rtf": 0.46}, + "fw_ss_small": {"wer": 7.0, "rtf": 0.90}, + "mlx_ss_base": {"wer": 7.7, "rtf": 0.34}, + "mlx_ss_small": {"wer": 6.5, "rtf": 0.68}, + "voxtral_mlx": {"wer": 7.0, "rtf": 0.26}, + "qwen3_mlx_0.6b":{"wer": 5.5, "rtf": 0.55}, + "qwen3_0.6b_batch":{"wer":24.0, "rtf": 1.42} + } + } +} diff --git a/benchmarks/h100/robustness_clean_vs_other.png b/benchmarks/h100/robustness_clean_vs_other.png new file mode 100644 index 0000000..b4a95a3 Binary files /dev/null and b/benchmarks/h100/robustness_clean_vs_other.png differ diff --git a/benchmarks/h100/wer_vs_rtf_acl6060.png b/benchmarks/h100/wer_vs_rtf_acl6060.png new file mode 100644 index 0000000..42a3a92 Binary files /dev/null and b/benchmarks/h100/wer_vs_rtf_acl6060.png differ diff --git a/benchmarks/h100/wer_vs_rtf_clean.png b/benchmarks/h100/wer_vs_rtf_clean.png new file mode 100644 index 0000000..eb5e677 Binary files /dev/null and b/benchmarks/h100/wer_vs_rtf_clean.png differ