#!/usr/bin/env python3
"""Render the WhisperLiveKit architecture diagram.

Running this script draws every panel, box, arrow and caption on a single
matplotlib axes (20 x 12 data units, non-interactive "Agg" backend) and
writes the result to ``architecture.png`` at 200 dpi.  The output path is
relative, so the file lands in the current working directory.
"""

import matplotlib

# The backend has to be selected before pyplot is imported.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch

# --- Colour palette --------------------------------------------------------
C_BG = "#1a1a2e"
C_PANEL = "#16213e"
C_PANEL2 = "#0f3460"
C_ACCENT = "#e94560"
C_GREEN = "#4ecca3"
C_ORANGE = "#f5a623"
C_BLUE = "#4a9eff"
C_PURPLE = "#b06af2"
C_PINK = "#ff6b9d"
C_YELLOW = "#f0e68c"
C_TEXT = "#e8e8e8"
C_TEXTDIM = "#a0a0b0"
C_BOX_BG = "#1e2d4a"
C_BOX_BG2 = "#2a1a3a"
C_BOX_BG3 = "#1a3a2a"
C_BORDER = "#3a4a6a"

# --- Canvas ----------------------------------------------------------------
# One axes covering x in [0, 20] and y in [0, 12]; all coordinates used
# below are expressed in these data units.
fig, ax = plt.subplots(1, 1, figsize=(20, 12), facecolor=C_BG)
ax.set_xlim(0, 20)
ax.set_ylim(0, 12)
ax.set_aspect("equal")
ax.axis("off")
fig.subplots_adjust(left=0.01, right=0.99, top=0.97, bottom=0.01)


def box(x, y, w, h, label, color=C_BORDER, bg=C_BOX_BG, fontsize=8,
        bold=False, text_color=C_TEXT, radius=0.15):
    """Draw a rounded rectangle with a centred monospace label.

    Args:
        x, y: Lower-left corner passed to ``FancyBboxPatch``.
        w, h: Width and height passed to ``FancyBboxPatch``.
        label: Text placed at the centre ``(x + w/2, y + h/2)``.
        color: Edge colour of the rectangle.
        bg: Fill colour of the rectangle.
        fontsize: Font size of the label.
        bold: Render the label in bold when true, normal weight otherwise.
        text_color: Colour of the label.
        radius: ``rounding_size`` of the box style.

    Returns:
        The ``FancyBboxPatch`` that was added to the axes.
    """
    patch = FancyBboxPatch(
        (x, y),
        w,
        h,
        boxstyle=f"round,pad=0.05,rounding_size={radius}",
        facecolor=bg,
        edgecolor=color,
        linewidth=1.2,
    )
    ax.add_patch(patch)
    ax.text(
        x + w / 2,
        y + h / 2,
        label,
        ha="center",
        va="center",
        fontsize=fontsize,
        color=text_color,
        fontweight="bold" if bold else "normal",
        family="monospace",
    )
    return patch


def arrow(x1, y1, x2, y2, color=C_TEXTDIM, style="->", lw=1.2):
    """Draw an arrow from ``(x1, y1)`` to ``(x2, y2)``.

    The arrow is an empty-text annotation, so only the arrow itself is
    visible.  ``style`` is forwarded as the ``arrowstyle`` and ``lw`` as the
    line width.
    """
    arrow_spec = {"arrowstyle": style, "color": color, "lw": lw}
    ax.annotate("", xy=(x2, y2), xytext=(x1, y1), arrowprops=arrow_spec)


def section_box(x, y, w, h, title, bg=C_PANEL, border=C_BORDER,
                title_color=C_ACCENT):
    """Draw a titled panel that groups related boxes.

    Args:
        x, y: Lower-left corner of the panel.
        w, h: Width and height of the panel.
        title: Bold heading anchored near the panel's top-left corner.
        bg: Fill colour of the panel.
        border: Edge colour of the panel.
        title_color: Colour of the heading.
    """
    panel = FancyBboxPatch(
        (x, y),
        w,
        h,
        boxstyle="round,pad=0.05,rounding_size=0.2",
        facecolor=bg,
        edgecolor=border,
        linewidth=1.5,
    )
    ax.add_patch(panel)
    ax.text(
        x + 0.15,
        y + h - 0.25,
        title,
        ha="left",
        va="top",
        fontsize=9,
        color=title_color,
        fontweight="bold",
        family="monospace",
    )


def _note(x, y, text, size, color, **extra):
    """Place a free-standing monospace caption; ``extra`` goes to ``ax.text``."""
    ax.text(x, y, text, fontsize=size, color=color, family="monospace", **extra)


# Artists are created in a fixed sequence throughout: panels come before the
# boxes they contain so that the boxes are painted on top of the panel fill.

# --- Heading ---------------------------------------------------------------
_note(10, 11.7, "WhisperLiveKit Architecture", 16, C_TEXT,
      ha="center", va="center", fontweight="bold")
_note(
    10,
    11.35,
    "CLI commands: serve | listen | run | transcribe | bench | diagnose | models | pull | rm | check",
    7,
    C_TEXTDIM,
    ha="center",
    va="center",
)

# --- Left column: server endpoints and clients -----------------------------
section_box(0.1, 7.0, 3.5, 4.0, "FastAPI Server", border=C_GREEN)
for _x, _y, _w, _h, _text, _bold in (
    (0.3, 10.0, 1.5, 0.5, "Web UI\nHTML + JS", False),
    (2.0, 10.0, 1.4, 0.5, "Frontend\n(optional)", False),
    (0.3, 9.1, 3.1, 0.6, "WebSocket /asr • /v1/listen", True),
    (0.3, 8.3, 3.1, 0.5, "REST /v1/audio/transcriptions", False),
    (0.3, 7.4, 3.1, 0.5, "Health • /v1/models", False),
):
    box(_x, _y, _w, _h, _text, color=C_GREEN, fontsize=7, bold=_bold)

_note(0.2, 6.5, "Clients:", 7, C_TEXTDIM)
_client_names = ("Browser", "OpenAI SDK", "Deepgram SDK", "TestHarness")
for _slot, _client in enumerate(_client_names):
    # Clients sit side by side, 0.9 units apart.
    box(0.3 + _slot * 0.9, 5.8, 0.8, 0.5, _client,
        fontsize=5.5, bg="#1a2a1a", color="#3a6a3a")

# --- Centre column: per-session audio processor ----------------------------
_STAGE_BG = "#1a2a4a"
_POLICY_BG = "#2a2a1a"

section_box(4.0, 5.5, 5.5, 5.5, "Audio Processor (per session)", border=C_BLUE)
box(4.3, 10.0, 2.0, 0.6, "FFmpeg\nDecoding", color=C_BLUE, bg=_STAGE_BG, bold=True)
arrow(3.6, 9.4, 4.3, 10.2, color=C_GREEN)
box(6.6, 10.0, 2.6, 0.6, "Silero VAD\nspeech / silence", color=C_BLUE, bg=_STAGE_BG)
arrow(6.3, 10.3, 6.6, 10.3, color=C_BLUE)
box(4.3, 8.8, 4.9, 0.8,
    "SessionASRProxy\nthread-safe per-session language override",
    color=C_BLUE, fontsize=7)
arrow(6.0, 10.0, 6.0, 9.6, color=C_BLUE)
box(4.3, 7.6, 2.3, 0.8, "DiffTracker\n(opt-in ?mode=diff)", color="#5a5a7a", fontsize=7)
box(6.9, 7.6, 2.3, 0.8, "Result Formatter\n→ FrontData.to_dict()", color=C_BLUE, fontsize=7)

_note(4.3, 7.1, "Streaming policies:", 7, C_ORANGE, fontweight="bold")
for _x, _text in (
    (4.3, "LocalAgreement\nHypothesisBuffer"),
    (6.9, "SimulStreaming\nAlignAtt (Whisper)"),
):
    box(_x, 6.2, 2.3, 0.7, _text, color=C_ORANGE, bg=_POLICY_BG, fontsize=7)

# --- Right column: shared transcription engine -----------------------------
_MUTED = "#5a6a7a"
_VOXTRAL_BG = "#2a1520"

section_box(10.0, 0.3, 9.8, 10.7,
            "TranscriptionEngine (singleton — shared across sessions)",
            border=C_ACCENT, bg="#1e1520")
_note(10.2, 10.5, "6 ASR Backends", 9, C_ACCENT, fontweight="bold")

# Whisper family panel
section_box(10.2, 7.3, 4.5, 3.0, "Whisper Family (chunk-based)",
            border=C_PURPLE, bg=C_BOX_BG2)
for _x, _w, _text, _bold in (
    (10.4, 1.3, "Faster\nWhisper", True),
    (11.9, 1.3, "MLX\nWhisper", True),
    (13.4, 1.1, "OpenAI\nWhisper", False),
):
    box(_x, 9.2, _w, 0.6, _text, color=C_PURPLE, bg="#2a1a3a", fontsize=7, bold=_bold)
for _y, _text, _size, _tint in (
    (8.7, "PCM → Encoder → Decoder → Tokens", 6.5, C_TEXTDIM),
    (8.3, "Uses LocalAgreement or SimulStreaming (AlignAtt)", 6, C_PURPLE),
    (7.9, "Language detection • Buffer trimming", 6, C_TEXTDIM),
    (7.5, "CPU / CUDA / MLX", 6, C_TEXTDIM),
):
    _note(10.4, _y, _text, _size, _tint)

# Voxtral panel
section_box(10.2, 3.8, 4.5, 3.2, "Voxtral (native streaming)",
            border=C_PINK, bg=_VOXTRAL_BG)
for _x, _w, _text in (
    (10.4, 1.8, "Voxtral MLX\n(Apple Silicon)"),
    (12.5, 2.0, "Voxtral HF\n(CUDA/MPS/CPU)"),
):
    box(_x, 5.9, _w, 0.6, _text, color=C_PINK, bg=_VOXTRAL_BG, fontsize=7, bold=True)
for _y, _text, _size, _tint in (
    (5.4, "Incremental encoder → Autoregressive decoder", 6.5, C_TEXTDIM),
    (5.0, "Sliding KV cache • Token-by-token output", 6, C_PINK),
    (4.6, "No chunking needed — truly streams audio", 6, C_TEXTDIM),
    (4.2, "4B params • 15 languages • 6-bit quant (MLX)", 6, C_TEXTDIM),
):
    _note(10.4, _y, _text, _size, _tint)

# Qwen3 panel
section_box(15.0, 3.8, 4.6, 3.2, "Qwen3 ASR (batch + aligner)",
            border=C_GREEN, bg=C_BOX_BG3)
for _x, _w, _text, _size, _bold in (
    (15.2, 1.5, "Qwen3 ASR\n1.7B / 0.6B", 7, True),
    (16.9, 1.5, "Qwen3\nSimul", 7, True),
    (18.6, 1.0, "Forced\nAligner", 6.5, False),
):
    box(_x, 5.9, _w, 0.6, _text, color=C_GREEN, bg="#1a3a2a", fontsize=_size, bold=_bold)
for _y, _text, _size, _tint in (
    (5.4, "Batch + SimulStreaming (AlignAtt)", 6.5, C_TEXTDIM),
    (5.0, "ForcedAligner provides word timestamps", 6, C_GREEN),
    (4.6, "LocalAgreement or border-distance policy", 6, C_TEXTDIM),
    (4.2, "29 languages • CUDA/MPS/CPU", 6, C_TEXTDIM),
):
    _note(15.2, _y, _text, _size, _tint)

# Cloud backend
box(15.2, 7.7, 4.2, 0.6, "OpenAI API (cloud)", color=_MUTED, fontsize=7)
_note(15.2, 7.4, "Remote transcription • API key required", 6, C_TEXTDIM)

# Shared components panel
section_box(10.2, 0.5, 9.4, 3.0, "Shared Components", border=_MUTED, bg="#151520")
for _x, _y, _w, _text, _edge, _bold in (
    (10.4, 2.2, 2.5, "Mel Spectrogram\ncached DFT + filterbank", _MUTED, False),
    (13.2, 2.2, 2.5, "Diarization\nSortformer / pyannote", _MUTED, False),
    (16.0, 2.2, 3.4, "Translation\nNLLB • CTranslate2", _MUTED, False),
    (10.4, 0.8, 4.0, "WhisperLiveKitConfig\n(single source of truth)", C_ACCENT, True),
    (14.8, 0.8, 2.3, "TestHarness\npipeline testing", _MUTED, False),
    (17.3, 0.8, 2.3, "Benchmark\n8 langs • 13 samples", C_ORANGE, True),
):
    box(_x, _y, _w, 0.8, _text, color=_edge, fontsize=7, bold=_bold)

# --- Data-flow arrows ------------------------------------------------------
# Audio processor -> engine
arrow(9.5, 8.5, 10.2, 8.5, color=C_ACCENT, lw=2)
_note(9.6, 8.8, "PCM audio", 6, C_ACCENT)

# Engine -> audio processor
arrow(10.2, 7.0, 9.5, 7.0, color=C_GREEN, lw=2)
_note(9.6, 7.3, "ASRTokens", 6, C_GREEN)

# Each streaming policy points down to the backends it is labelled with.
for _x in (5.5, 8.1):
    arrow(_x, 6.2, _x, 5.5, color=C_ORANGE, style="->")
_note(4.3, 5.6, "Whisper + Qwen3", 5.5, C_ORANGE)
_note(6.9, 5.6, "Whisper + Qwen3-simul", 5.5, C_ORANGE)

_note(10.2, 3.5, "Voxtral: own streaming processor (no external policy)",
      6, C_PINK, style="italic")

# --- Legend ----------------------------------------------------------------
legend_y = 5.0
_note(0.3, legend_y, "Streaming modes:", 7, C_TEXT, fontweight="bold")
_legend_entries = (
    ("Native streaming (Voxtral)", C_PINK),
    ("Chunk-based (Whisper)", C_PURPLE),
    ("Batch + aligner (Qwen3)", C_GREEN),
)
for _row, (_caption, _swatch) in enumerate(_legend_entries):
    # One square marker plus caption per row, rows 0.35 units apart.
    ax.plot([0.3], [legend_y - 0.4 - _row * 0.35], "s", color=_swatch, markersize=6)
    _note(0.6, legend_y - 0.4 - _row * 0.35, _caption, 6.5, _swatch, va="center")

# --- Output ----------------------------------------------------------------
plt.savefig("architecture.png", dpi=200, facecolor=C_BG,
            bbox_inches="tight", pad_inches=0.1)
print("Saved architecture.png")