From 0b43035701fc06ebe75665bb7a6189dbaab0864a Mon Sep 17 00:00:00 2001
From: Quentin Fuxa <quentin.fuxa@gmail.com>
Date: Wed, 9 Apr 2025 11:34:59 +0200
Subject: [PATCH] Force chunking when the audio buffer exceeds buffer_trimming_sec

---
 .../whisper_streaming_custom/online_asr.py    | 50 +++++++++++++++----
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/whisperlivekit/whisper_streaming_custom/online_asr.py b/whisperlivekit/whisper_streaming_custom/online_asr.py
index 2fd9de0..158287f 100644
--- a/whisperlivekit/whisper_streaming_custom/online_asr.py
+++ b/whisperlivekit/whisper_streaming_custom/online_asr.py
@@ -216,31 +216,54 @@ class OnlineASRProcessor:
         """
         If the committed tokens form at least two sentences, chunk the audio
         buffer at the end time of the penultimate sentence.
+        Also ensures chunking happens if audio buffer exceeds a time limit.
         """
+        buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE        
         if not self.committed:
+            if buffer_duration > self.buffer_trimming_sec:
+                chunk_time = self.buffer_time_offset + (buffer_duration / 2)
+                logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
+                self.chunk_at(chunk_time)
             return
+        
         logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
         sentences = self.words_to_sentences(self.committed)
         for sentence in sentences:
             logger.debug(f"\tSentence: {sentence.text}")
-        if len(sentences) < 2:
-            return
-        # Keep the last two sentences.
-        while len(sentences) > 2:
-            sentences.pop(0)
-        chunk_time = sentences[-2].end
-        logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
-        self.chunk_at(chunk_time)
+        
+        chunk_done = False
+        if len(sentences) >= 2:
+            while len(sentences) > 2:
+                sentences.pop(0)
+            chunk_time = sentences[-2].end
+            logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
+            self.chunk_at(chunk_time)
+            chunk_done = True
+        
+        if not chunk_done and buffer_duration > self.buffer_trimming_sec:
+            last_committed_time = self.committed[-1].end
+            logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}")
+            self.chunk_at(last_committed_time)
 
     def chunk_completed_segment(self, res):
         """
         Chunk the audio buffer based on segment-end timestamps reported by the ASR.
+        Also ensures chunking happens if audio buffer exceeds a time limit.
         """
+        buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE        
         if not self.committed:
+            if buffer_duration > self.buffer_trimming_sec:
+                chunk_time = self.buffer_time_offset + (buffer_duration / 2)
+                logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
+                self.chunk_at(chunk_time)
             return
+        
+        logger.debug("Processing committed tokens for segmenting")
         ends = self.asr.segments_end_ts(res)
-        last_committed_time = self.committed[-1].end
+        last_committed_time = self.committed[-1].end        
+        chunk_done = False
         if len(ends) > 1:
+            logger.debug("Multiple segments available for chunking")
             e = ends[-2] + self.buffer_time_offset
             while len(ends) > 2 and e > last_committed_time:
                 ends.pop(-1)
@@ -248,11 +271,18 @@ class OnlineASRProcessor:
             if e <= last_committed_time:
                 logger.debug(f"--- Segment chunked at {e:.2f}")
                 self.chunk_at(e)
+                chunk_done = True
             else:
                 logger.debug("--- Last segment not within committed area")
         else:
             logger.debug("--- Not enough segments to chunk")
-
+        
+        if not chunk_done and buffer_duration > self.buffer_trimming_sec:
+            logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}")
+            self.chunk_at(last_committed_time)
+        
+        logger.debug("Segment chunking complete")
+        
     def chunk_at(self, time: float):
         """
         Trim both the hypothesis and audio buffer at the given time.