Improve tokens alignment and silence handling

This commit is contained in:
Quentin Fuxa 2026-01-24 10:55:00 +01:00
parent 6e85c16614
commit a282cbe75f

View file

@ -1,9 +1,15 @@
from time import time
from typing import Any, List, Optional, Tuple, Union
from whisperlivekit.timed_objects import (ASRToken, Segment, PuncSegment, Silence,
SilentSegment, SpeakerSegment,
TimedText)
from whisperlivekit.timed_objects import (
ASRToken,
PuncSegment,
Segment,
Silence,
SilentSegment,
SpeakerSegment,
TimedText,
)
_DEFAULT_RETENTION_SECONDS: float = 300.0
@ -212,9 +218,19 @@ class TokensAlignment:
self,
diarization: bool = False,
translation: bool = False,
current_silence: Optional[Silence] = None
current_silence: Optional[Silence] = None,
audio_time: Optional[float] = None,
) -> Tuple[List[Segment], str, Union[str, TimedText]]:
"""Return the formatted segments plus buffers, optionally with diarization/translation."""
"""Return the formatted segments plus buffers, optionally with diarization/translation.
Args:
audio_time: Current audio stream position in seconds. Used as fallback
for ongoing silence end time instead of wall-clock (which breaks
when audio is fed faster or slower than real-time).
"""
# Fallback for ongoing silence: prefer audio stream time over wall-clock
_silence_now = audio_time if audio_time is not None else (time() - self.beg_loop)
if diarization:
segments, diarization_buffer = self.get_lines_diarization()
else:
@ -225,7 +241,7 @@ class TokensAlignment:
self.validated_segments.append(Segment.from_tokens(self.current_line_tokens))
self.current_line_tokens = []
end_silence = token.end if token.has_ended else time() - self.beg_loop
end_silence = token.end if token.has_ended else _silence_now
if self.validated_segments and self.validated_segments[-1].is_silence():
self.validated_segments[-1].end = end_silence
else:
@ -241,7 +257,7 @@ class TokensAlignment:
segments.append(Segment.from_tokens(self.current_line_tokens))
if current_silence:
end_silence = current_silence.end if current_silence.has_ended else time() - self.beg_loop
end_silence = current_silence.end if current_silence.has_ended else _silence_now
if segments and segments[-1].is_silence():
segments[-1] = SilentSegment(start=segments[-1].start, end=end_silence)
else: