From 4b2377c243d63b582ef89b9171310b59208394e7 Mon Sep 17 00:00:00 2001
From: Quentin Fuxa <quentin.fuxa@gmail.com>
Date: Sun, 22 Feb 2026 23:38:04 +0100
Subject: [PATCH] fix: correct false auto-detect claim, median bug, RTF
 inflation

- BENCHMARK.md: whisper also supports --language auto, so voxtral is
  not the only backend with auto-detection. Fixed the mlx-whisper speed
  comparison (LocalAgreement is actually faster than SimulStreaming for
  mlx-whisper, not merely comparable).
- metrics.py: median calculation was wrong for even-length lists
  (took upper middle instead of averaging the two middle values).
- metrics_collector.py: RTF was inflated because log_summary() used
  wall-clock elapsed time instead of the sum of actual ASR call durations.
- README.md: clarified that whisper also supports auto language
  detection; voxtral's detection is just more reliable.
- Added 2 new median tests (even + odd length).
---
 BENCHMARK.md                        | 21 +++++++++--------
 README.md                           |  3 ++-
 tests/test_metrics.py               | 36 +++++++++++++++++++++++++++++
 whisperlivekit/metrics.py           |  7 +++++-
 whisperlivekit/metrics_collector.py |  4 ++--
 5 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/BENCHMARK.md b/BENCHMARK.md
index 9d27ab2..df1293f 100644
--- a/BENCHMARK.md
+++ b/BENCHMARK.md
@@ -77,24 +77,25 @@ should be run with `--lan fr` or `--lan auto`. The Voxtral backends auto-detect
 ### Speed (RTF = processing time / audio duration, lower is better)
 
 1. **mlx-whisper + LocalAgreement** is the fastest combo on Apple Silicon, reaching 0.05-0.06x RTF
-   on English audio. This means 30 seconds of audio is processed in under 2 seconds.
-2. **SimulStreaming** is consistently faster than LocalAgreement for faster-whisper, but comparable
-   for mlx-whisper.
+   on English audio. 30 seconds of audio is processed in under 2 seconds.
+2. For **faster-whisper**, SimulStreaming is consistently faster than LocalAgreement.
+   For **mlx-whisper**, it is the opposite: LocalAgreement (0.05-0.06x) is faster than SimulStreaming (0.11-0.14x).
 3. **voxtral-mlx** runs at 0.18-0.32x RTF, roughly 3-5x slower than mlx-whisper but well within
    real-time requirements.
-4. **voxtral (HF transformers)** is the slowest, hitting 1.0-1.3x RTF. On longer audio, it risks
+4. **voxtral (HF transformers)** is the slowest at 1.0-1.3x RTF. On longer audio it risks
    falling behind real-time. On Apple Silicon, the MLX variant is strongly preferred.
 
 ### Accuracy (WER = Word Error Rate, lower is better)
 
 1. **SimulStreaming** produces significantly better WER than LocalAgreement for whisper backends.
    On the 30s English file: 5.3% vs 23.7-44.7%.
-2. **voxtral-mlx** achieves strong accuracy (0% on short English, 9.2% on multi-speaker) and is
-   the only backend that auto-detects language, making it the best choice for multilingual use.
+2. **voxtral-mlx** has good accuracy (0% on short English, 9.2% on multi-speaker).
+   Whisper also supports `--language auto`, but Voxtral's language detection is more
+   reliable and does not bias towards English the way Whisper's auto mode tends to.
 3. **LocalAgreement** tends to duplicate the last sentence, inflating WER. This is a known
    artifact of the LCP (Longest Common Prefix) commit strategy at end-of-stream.
 4. **Voxtral** backends handle French natively with 28-37% WER, while whisper backends
-   attempted English transcription of French audio (not a fair comparison for French).
+   were run with `--lan en` here (not a fair comparison for French).
 
 ### Timestamp Accuracy (MAE = Mean Absolute Error on word start times, lower is better)
 
@@ -102,10 +103,10 @@ should be run with `--lan fr` or `--lan auto`. The Voxtral backends auto-detect
    processes overlapping audio windows and validates via prefix matching.
 2. **SimulStreaming** timestamps are slightly less precise (0.24-0.40s MAE) but still usable
    for most applications.
-3. **voxtral-mlx** achieves excellent timestamps on English (0.18-0.25s MAE) but can drift on
+3. **voxtral-mlx** has good timestamp accuracy on English (0.18-0.25s MAE) but drifts on
    audio with long silence gaps (3.4s MAE on the French file with 4-second pauses).
-4. **voxtral (HF)** has the worst timestamp accuracy (1.0-4.0s MAE), likely due to the
-   additional overhead of the transformers pipeline.
+4. **voxtral (HF)** has the worst timestamp accuracy (1.0-4.0s MAE). This is likely related to
+   differences in the transformers-based decoding pipeline rather than model quality.
 
 ### VAC (Voice Activity Classification) Impact
 
diff --git a/README.md b/README.md
index d434a25..d35d569 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,8 @@ See **[BENCHMARK.md](BENCHMARK.md)** for detailed performance comparisons across
 
 WhisperLiveKit supports [Voxtral Mini](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602),
 a 4B-parameter speech model from Mistral AI that natively handles 100+ languages with automatic
-language detection. Unlike whisper-based backends, Voxtral does not require specifying `--language`.
+language detection. Whisper also supports auto-detection (`--language auto`), but Voxtral's per-chunk
+detection is more reliable and does not bias towards English.
 
 ```bash
 # Apple Silicon (native MLX, recommended)
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 365e168..4412b32 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -145,3 +145,39 @@ class TestComputeTimestampAccuracy:
         result = compute_timestamp_accuracy(pred, ref)
         assert result["n_matched"] == 1
         assert result["mae_start"] == pytest.approx(0.1)
+
+    def test_median_even_count(self):
+        """Median with even number of matched words should average the two middle values."""
+        ref = [
+            {"word": "a", "start": 0.0, "end": 0.2},
+            {"word": "b", "start": 0.5, "end": 0.7},
+            {"word": "c", "start": 1.0, "end": 1.2},
+            {"word": "d", "start": 1.5, "end": 1.7},
+        ]
+        pred = [
+            {"word": "a", "start": 0.1, "end": 0.3},   # delta 0.1
+            {"word": "b", "start": 0.7, "end": 0.9},   # delta 0.2
+            {"word": "c", "start": 1.3, "end": 1.5},   # delta 0.3
+            {"word": "d", "start": 1.9, "end": 2.1},   # delta 0.4
+        ]
+        result = compute_timestamp_accuracy(pred, ref)
+        assert result["n_matched"] == 4
+        # sorted abs deltas: [0.1, 0.2, 0.3, 0.4] -> median = (0.2 + 0.3) / 2 = 0.25
+        assert result["median_delta_start"] == pytest.approx(0.25)
+
+    def test_median_odd_count(self):
+        """Median with odd number of matched words takes the middle value."""
+        ref = [
+            {"word": "a", "start": 0.0, "end": 0.2},
+            {"word": "b", "start": 0.5, "end": 0.7},
+            {"word": "c", "start": 1.0, "end": 1.2},
+        ]
+        pred = [
+            {"word": "a", "start": 0.1, "end": 0.3},   # delta 0.1
+            {"word": "b", "start": 0.8, "end": 1.0},   # delta 0.3
+            {"word": "c", "start": 1.2, "end": 1.4},   # delta 0.2
+        ]
+        result = compute_timestamp_accuracy(pred, ref)
+        assert result["n_matched"] == 3
+        # sorted abs deltas: [0.1, 0.2, 0.3] -> median = 0.2
+        assert result["median_delta_start"] == pytest.approx(0.2)
diff --git a/whisperlivekit/metrics.py b/whisperlivekit/metrics.py
index 09e9c12..8bbd9af 100644
--- a/whisperlivekit/metrics.py
+++ b/whisperlivekit/metrics.py
@@ -140,11 +140,16 @@ def compute_timestamp_accuracy(
 
     abs_deltas = [abs(d) for d in deltas_start]
     sorted_abs = sorted(abs_deltas)
+    n = len(sorted_abs)
+    if n % 2 == 1:
+        median = sorted_abs[n // 2]
+    else:
+        median = (sorted_abs[n // 2 - 1] + sorted_abs[n // 2]) / 2
 
     return {
         "mae_start": sum(abs_deltas) / len(abs_deltas),
         "max_delta_start": max(abs_deltas),
-        "median_delta_start": sorted_abs[len(sorted_abs) // 2],
+        "median_delta_start": median,
         "n_matched": len(deltas_start),
         "n_ref": len(reference),
         "n_pred": len(predicted),
diff --git a/whisperlivekit/metrics_collector.py b/whisperlivekit/metrics_collector.py
index 365f07a..03db5dc 100644
--- a/whisperlivekit/metrics_collector.py
+++ b/whisperlivekit/metrics_collector.py
@@ -78,7 +78,7 @@ class SessionMetrics:
 
     def log_summary(self) -> None:
         """Emit a structured log line summarising the session."""
-        elapsed = time.time() - self.session_start if self.session_start else 0
-        self.total_processing_time_s = elapsed
+        self.total_processing_time_s = sum(self.transcription_durations)
         d = self.to_dict()
+        d["session_elapsed_s"] = round(time.time() - self.session_start, 3) if self.session_start else 0
         logger.info(f"SESSION_METRICS {d}")