Refactor timed objects and data structures

2026-01-11 16:08:00 +01:00 · 2026-01-11 16:08:00 +01:00 · e144abbbc7
commit e144abbbc7
parent 83362c89c4
4 changed files with 25 additions and 20 deletions
--- a/whisperlivekit/metrics.py
+++ b/whisperlivekit/metrics.py
@ -6,7 +6,7 @@ text normalization, and word-level timestamp accuracy metrics with greedy alignm

 import re
 import unicodedata
-from typing import Dict, List, Optional
+from typing import Dict, List


 def normalize_text(text: str) -> str:
--- a/whisperlivekit/metrics_collector.py
+++ b/whisperlivekit/metrics_collector.py
@ -78,7 +78,6 @@ class SessionMetrics:

    def log_summary(self) -> None:
        """Emit a structured log line summarising the session."""
-        self.total_processing_time_s = sum(self.transcription_durations)
        d = self.to_dict()
        d["session_elapsed_s"] = round(time.time() - self.session_start, 3) if self.session_start else 0
        logger.info(f"SESSION_METRICS {d}")
--- a/whisperlivekit/thread_safety.py
+++ b/whisperlivekit/thread_safety.py
@ -20,8 +20,8 @@ Usage:
    export WHISPERLIVEKIT_LOCK_TIMEOUT=60
 """

-import os
 import logging
+import os
 import threading

 logger = logging.getLogger(__name__)
--- a/whisperlivekit/timed_objects.py
+++ b/whisperlivekit/timed_objects.py
@ -1,12 +1,18 @@
 from dataclasses import dataclass, field
-from datetime import timedelta
 from typing import Any, Dict, List, Optional, Union

 PUNCTUATION_MARKS = {'.', '!', '?', '。', '！', '？'}

 def format_time(seconds: float) -> str:
-    """Format seconds as HH:MM:SS."""
-    return str(timedelta(seconds=int(seconds)))
+    """Format seconds as H:MM:SS.cc (centisecond precision)."""
+    total_cs = int(round(seconds * 100))
+    cs = total_cs % 100
+    total_s = total_cs // 100
+    s = total_s % 60
+    total_m = total_s // 60
+    m = total_m % 60
+    h = total_m // 60
+    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

@dataclass
 class Timed:
@ -18,10 +24,10 @@ class TimedText(Timed):
    text: Optional[str] = ''
    speaker: Optional[int] = -1
    detected_language: Optional[str] = None
-    
+
    def has_punctuation(self) -> bool:
        return any(char in PUNCTUATION_MARKS for char in self.text.strip())
-    
+
    def is_within(self, other: 'TimedText') -> bool:
        return other.contains_timespan(self)

@ -30,10 +36,10 @@ class TimedText(Timed):

    def contains_timespan(self, other: 'TimedText') -> bool:
        return self.start <= other.start and self.end >= other.end
-    
+
    def __bool__(self) -> bool:
        return bool(self.text)
-    
+
    def __str__(self) -> str:
        return str(self.text)

@ -103,7 +109,7 @@ class Silence():
            return None
        self.duration = self.end - self.start
        return self.duration
-    
+
    def is_silence(self) -> bool:
        return True

@ -127,9 +133,9 @@ class Segment(TimedText):
        """Return a normalized segment representing the provided tokens."""
        if not tokens:
            return None
-        
+
        start_token = tokens[0]
-        end_token = tokens[-1]        
+        end_token = tokens[-1]
        if is_silence:
            return cls(
                start=start_token.start,
@ -176,7 +182,7 @@ class SilentSegment(Segment):
        self.text = ''


-@dataclass  
+@dataclass
 class FrontData():
    status: str = ''
    error: str = ''
@ -186,7 +192,7 @@ class FrontData():
    buffer_translation: str = ''
    remaining_time_transcription: float = 0.
    remaining_time_diarization: float = 0.
-    
+
    def to_dict(self) -> Dict[str, Any]:
        """Serialize the front-end data payload."""
        _dict: Dict[str, Any] = {
@ -202,15 +208,15 @@ class FrontData():
            _dict['error'] = self.error
        return _dict

-@dataclass  
+@dataclass
 class ChangeSpeaker:
    speaker: int
    start: int

-@dataclass  
+@dataclass
 class State():
    """Unified state class for audio processing.
-    
+
    Contains both persistent state (tokens, buffers) and temporary update buffers
    (new_* fields) that are consumed by TokensAlignment.
    """
@ -221,10 +227,10 @@ class State():
    end_attributed_speaker: float = 0.0
    remaining_time_transcription: float = 0.0
    remaining_time_diarization: float = 0.0
-    
+
    # Temporary update buffers (consumed by TokensAlignment.update())
    new_tokens: List[Union[ASRToken, Silence]] = field(default_factory=list)
    new_translation: List[Any] = field(default_factory=list)
    new_diarization: List[Any] = field(default_factory=list)
    new_tokens_buffer: List[Any] = field(default_factory=list)  # only when local agreement
-    new_translation_buffer= TimedText()
+    new_translation_buffer: TimedText = field(default_factory=TimedText)