Improve tokens alignment and silence handling
This commit is contained in:
parent
6e85c16614
commit
a282cbe75f
1 changed file with 26 additions and 10 deletions
|
|
@@ -1,9 +1,15 @@
|
|||
from time import time
|
||||
from typing import Any, List, Optional, Tuple, Union
|
||||
|
||||
from whisperlivekit.timed_objects import (ASRToken, Segment, PuncSegment, Silence,
|
||||
SilentSegment, SpeakerSegment,
|
||||
TimedText)
|
||||
from whisperlivekit.timed_objects import (
|
||||
ASRToken,
|
||||
PuncSegment,
|
||||
Segment,
|
||||
Silence,
|
||||
SilentSegment,
|
||||
SpeakerSegment,
|
||||
TimedText,
|
||||
)
|
||||
|
||||
_DEFAULT_RETENTION_SECONDS: float = 300.0
|
||||
|
||||
|
|
@@ -212,9 +218,19 @@ class TokensAlignment:
|
|||
self,
|
||||
diarization: bool = False,
|
||||
translation: bool = False,
|
||||
current_silence: Optional[Silence] = None
|
||||
current_silence: Optional[Silence] = None,
|
||||
audio_time: Optional[float] = None,
|
||||
) -> Tuple[List[Segment], str, Union[str, TimedText]]:
|
||||
"""Return the formatted segments plus buffers, optionally with diarization/translation."""
|
||||
"""Return the formatted segments plus buffers, optionally with diarization/translation.
|
||||
|
||||
Args:
|
||||
audio_time: Current audio stream position in seconds. Used as fallback
|
||||
for ongoing silence end time instead of wall-clock (which breaks
|
||||
when audio is fed faster or slower than real-time).
|
||||
"""
|
||||
# Fallback for ongoing silence: prefer audio stream time over wall-clock
|
||||
_silence_now = audio_time if audio_time is not None else (time() - self.beg_loop)
|
||||
|
||||
if diarization:
|
||||
segments, diarization_buffer = self.get_lines_diarization()
|
||||
else:
|
||||
|
|
@@ -225,7 +241,7 @@ class TokensAlignment:
|
|||
self.validated_segments.append(Segment.from_tokens(self.current_line_tokens))
|
||||
self.current_line_tokens = []
|
||||
|
||||
end_silence = token.end if token.has_ended else time() - self.beg_loop
|
||||
end_silence = token.end if token.has_ended else _silence_now
|
||||
if self.validated_segments and self.validated_segments[-1].is_silence():
|
||||
self.validated_segments[-1].end = end_silence
|
||||
else:
|
||||
|
|
@@ -241,7 +257,7 @@ class TokensAlignment:
|
|||
segments.append(Segment.from_tokens(self.current_line_tokens))
|
||||
|
||||
if current_silence:
|
||||
end_silence = current_silence.end if current_silence.has_ended else time() - self.beg_loop
|
||||
end_silence = current_silence.end if current_silence.has_ended else _silence_now
|
||||
if segments and segments[-1].is_silence():
|
||||
segments[-1] = SilentSegment(start=segments[-1].start, end=end_silence)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in a new issue