From 4b2377c243d63b582ef89b9171310b59208394e7 Mon Sep 17 00:00:00 2001 From: Quentin Fuxa Date: Sun, 22 Feb 2026 23:38:04 +0100 Subject: [PATCH] fix: correct false auto-detect claim, median bug, RTF inflation - BENCHMARK.md: whisper also supports --language auto, voxtral is not the only one. Fixed mlx-whisper speed comparison (LocalAgreement is actually faster than SimulStreaming for mlx-whisper, not merely comparable as previously stated). - metrics.py: median calculation was wrong for even-length lists (took upper middle instead of averaging the two middle values). - metrics_collector.py: RTF was inflated because log_summary() used wall-clock elapsed time instead of sum of actual ASR call durations. - README.md: clarified that whisper also supports auto language detection, voxtral just does it better. - Added 2 new median tests (even + odd length). --- BENCHMARK.md | 21 +++++++++-------- README.md | 3 ++- tests/test_metrics.py | 36 +++++++++++++++++++++++++++++ whisperlivekit/metrics.py | 7 +++++- whisperlivekit/metrics_collector.py | 4 ++-- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/BENCHMARK.md b/BENCHMARK.md index 9d27ab2..df1293f 100644 --- a/BENCHMARK.md +++ b/BENCHMARK.md @@ -77,24 +77,25 @@ should be run with `--lan fr` or `--lan auto`. The Voxtral backends auto-detect ### Speed (RTF = processing time / audio duration, lower is better) 1. **mlx-whisper + LocalAgreement** is the fastest combo on Apple Silicon, reaching 0.05-0.06x RTF - on English audio. This means 30 seconds of audio is processed in under 2 seconds. -2. **SimulStreaming** is consistently faster than LocalAgreement for faster-whisper, but comparable - for mlx-whisper. + on English audio. 30 seconds of audio processed in under 2 seconds. +2. For **faster-whisper**, SimulStreaming is consistently faster than LocalAgreement. + For **mlx-whisper**, it is the opposite: LocalAgreement (0.05-0.06x) is faster than SimulStreaming (0.11-0.14x). 3. 
**voxtral-mlx** runs at 0.18-0.32x RTF, roughly 3-5x slower than mlx-whisper but well within real-time requirements. -4. **voxtral (HF transformers)** is the slowest, hitting 1.0-1.3x RTF. On longer audio, it risks +4. **voxtral (HF transformers)** is the slowest at 1.0-1.3x RTF. On longer audio it risks falling behind real-time. On Apple Silicon, the MLX variant is strongly preferred. ### Accuracy (WER = Word Error Rate, lower is better) 1. **SimulStreaming** produces significantly better WER than LocalAgreement for whisper backends. On the 30s English file: 5.3% vs 23.7-44.7%. -2. **voxtral-mlx** achieves strong accuracy (0% on short English, 9.2% on multi-speaker) and is - the only backend that auto-detects language, making it the best choice for multilingual use. +2. **voxtral-mlx** has good accuracy (0% on short English, 9.2% on multi-speaker). + Whisper also supports `--language auto`, but Voxtral's language detection is more + reliable and does not bias towards English the way Whisper's auto mode tends to. 3. **LocalAgreement** tends to duplicate the last sentence, inflating WER. This is a known artifact of the LCP (Longest Common Prefix) commit strategy at end-of-stream. 4. **Voxtral** backends handle French natively with 28-37% WER, while whisper backends - attempted English transcription of French audio (not a fair comparison for French). + were run with `--lan en` here (not a fair comparison for French). ### Timestamp Accuracy (MAE = Mean Absolute Error on word start times, lower is better) @@ -102,10 +103,10 @@ should be run with `--lan fr` or `--lan auto`. The Voxtral backends auto-detect processes overlapping audio windows and validates via prefix matching. 2. **SimulStreaming** timestamps are slightly less precise (0.24-0.40s MAE) but still usable for most applications. -3. **voxtral-mlx** achieves excellent timestamps on English (0.18-0.25s MAE) but can drift on +3. 
**voxtral-mlx** has good timestamp accuracy on English (0.18-0.25s MAE) but drifts on audio with long silence gaps (3.4s MAE on the French file with 4-second pauses). -4. **voxtral (HF)** has the worst timestamp accuracy (1.0-4.0s MAE), likely due to the - additional overhead of the transformers pipeline. +4. **voxtral (HF)** has the worst timestamp accuracy (1.0-4.0s MAE). This is likely related to + differences in the transformers-based decoding pipeline rather than model quality. ### VAC (Voice Activity Classification) Impact diff --git a/README.md b/README.md index d434a25..d35d569 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,8 @@ See **[BENCHMARK.md](BENCHMARK.md)** for detailed performance comparisons across WhisperLiveKit supports [Voxtral Mini](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602), a 4B-parameter speech model from Mistral AI that natively handles 100+ languages with automatic -language detection. Unlike whisper-based backends, Voxtral does not require specifying `--language`. +language detection. Whisper also supports auto-detection (`--language auto`), but Voxtral's per-chunk +detection is more reliable and does not bias towards English. 
```bash # Apple Silicon (native MLX, recommended) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 365e168..4412b32 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -145,3 +145,39 @@ class TestComputeTimestampAccuracy: result = compute_timestamp_accuracy(pred, ref) assert result["n_matched"] == 1 assert result["mae_start"] == pytest.approx(0.1) + + def test_median_even_count(self): + """Median with even number of matched words should average the two middle values.""" + ref = [ + {"word": "a", "start": 0.0, "end": 0.2}, + {"word": "b", "start": 0.5, "end": 0.7}, + {"word": "c", "start": 1.0, "end": 1.2}, + {"word": "d", "start": 1.5, "end": 1.7}, + ] + pred = [ + {"word": "a", "start": 0.1, "end": 0.3}, # delta 0.1 + {"word": "b", "start": 0.7, "end": 0.9}, # delta 0.2 + {"word": "c", "start": 1.3, "end": 1.5}, # delta 0.3 + {"word": "d", "start": 1.9, "end": 2.1}, # delta 0.4 + ] + result = compute_timestamp_accuracy(pred, ref) + assert result["n_matched"] == 4 + # sorted abs deltas: [0.1, 0.2, 0.3, 0.4] -> median = (0.2 + 0.3) / 2 = 0.25 + assert result["median_delta_start"] == pytest.approx(0.25) + + def test_median_odd_count(self): + """Median with odd number of matched words takes the middle value.""" + ref = [ + {"word": "a", "start": 0.0, "end": 0.2}, + {"word": "b", "start": 0.5, "end": 0.7}, + {"word": "c", "start": 1.0, "end": 1.2}, + ] + pred = [ + {"word": "a", "start": 0.1, "end": 0.3}, # delta 0.1 + {"word": "b", "start": 0.8, "end": 1.0}, # delta 0.3 + {"word": "c", "start": 1.2, "end": 1.4}, # delta 0.2 + ] + result = compute_timestamp_accuracy(pred, ref) + assert result["n_matched"] == 3 + # sorted abs deltas: [0.1, 0.2, 0.3] -> median = 0.2 + assert result["median_delta_start"] == pytest.approx(0.2) diff --git a/whisperlivekit/metrics.py b/whisperlivekit/metrics.py index 09e9c12..8bbd9af 100644 --- a/whisperlivekit/metrics.py +++ b/whisperlivekit/metrics.py @@ -140,11 +140,16 @@ def compute_timestamp_accuracy( 
abs_deltas = [abs(d) for d in deltas_start] sorted_abs = sorted(abs_deltas) + n = len(sorted_abs) + if n % 2 == 1: + median = sorted_abs[n // 2] + else: + median = (sorted_abs[n // 2 - 1] + sorted_abs[n // 2]) / 2 return { "mae_start": sum(abs_deltas) / len(abs_deltas), "max_delta_start": max(abs_deltas), - "median_delta_start": sorted_abs[len(sorted_abs) // 2], + "median_delta_start": median, "n_matched": len(deltas_start), "n_ref": len(reference), "n_pred": len(predicted), diff --git a/whisperlivekit/metrics_collector.py b/whisperlivekit/metrics_collector.py index 365f07a..03db5dc 100644 --- a/whisperlivekit/metrics_collector.py +++ b/whisperlivekit/metrics_collector.py @@ -78,7 +78,7 @@ class SessionMetrics: def log_summary(self) -> None: """Emit a structured log line summarising the session.""" - elapsed = time.time() - self.session_start if self.session_start else 0 - self.total_processing_time_s = elapsed + self.total_processing_time_s = sum(self.transcription_durations) d = self.to_dict() + d["session_elapsed_s"] = round(time.time() - self.session_start, 3) if self.session_start else 0 logger.info(f"SESSION_METRICS {d}")