From baddf0284b3fd1f2b21cb8231070a566d5862570 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Mon, 27 Jan 2025 15:36:19 +0100
Subject: [PATCH 1/4] buffer length in sentence segmentation is now also a max,
 as in segment.

---
 src/whisper_streaming/online_asr.py | 60 ++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index 26f6611..1d7ceb4 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -110,6 +110,15 @@ class OnlineASRProcessor:
 
         self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
 
+        if self.buffer_trimming_way not in ["sentence", "segment"]:
+            raise ValueError("buffer_trimming must be either 'sentence' or 'segment'")
+        if self.buffer_trimming_sec <= 0:
+            raise ValueError("buffer_trimming_sec must be positive")
+        elif self.buffer_trimming_sec > 30:
+            logger.warning(
+                f"buffer_trimming_sec is set to {self.buffer_trimming_sec}, which is very long. It may cause OOM."
+            )
+
     def init(self, offset=None):
         """run this when starting or restarting processing"""
         self.audio_buffer = np.array([], dtype=np.float32)
@@ -171,35 +180,40 @@ class OnlineASRProcessor:
 
         # there is a newly confirmed text
 
-        if o and self.buffer_trimming_way == "sentence":  # trim the completed sentences
-            if (
-                len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec
-            ):  # longer than this
-                
-                logger.debug("chunking sentence")
-                self.chunk_completed_sentence()
+        if self.buffer_trimming_way == "sentence":
+
+            self.chunk_completed_sentence()
                 
 
-            else:
-                logger.debug("not enough audio to trim as a sentence")
 
-        if self.buffer_trimming_way == "segment":
-            s = self.buffer_trimming_sec  # trim the completed segments longer than s,
-        else:
-            s = 30  # if the audio buffer is longer than 30s, trim it
+            
 
-        if len(self.audio_buffer) / self.SAMPLING_RATE > s:
+        if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
+                
+            if self.buffer_trimming_way == "sentence":
+                logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!"
+                                " Even if no sentence was found!"
+                             )
+            
+            
             self.chunk_completed_segment(res)
+       
+
+                # alternative: on any word
+                # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
+                # let's find commited word that is less
+                # k = len(self.commited)-1
+                # while k>0 and self.commited[k][1] > l:
+                #    k -= 1
+                # t = self.commited[k][1]
+                # self.chunk_at(t)
+
+        
+
+
+
+
 
-            # alternative: on any word
-            # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
-            # let's find commited word that is less
-            # k = len(self.commited)-1
-            # while k>0 and self.commited[k][1] > l:
-            #    k -= 1
-            # t = self.commited[k][1]
-            logger.debug("chunking segment")
-            # self.chunk_at(t)
 
         logger.debug(
             f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}"

From 04170153e0c08a9d5f1f92ace6fcd08b4564ec80 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Mon, 27 Jan 2025 16:12:30 +0100
Subject: [PATCH 2/4] improve logging

---
 src/whisper_streaming/online_asr.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index 1d7ceb4..bc45f87 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -69,6 +69,7 @@ class HypothesisBuffer:
         return commit
 
     def pop_commited(self, time):
+        "Remove from the beginning of commited_in_buffer all words that end at or before `time`."
         while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time:
             self.commited_in_buffer.pop(0)
 
@@ -183,7 +184,8 @@ class OnlineASRProcessor:
         if self.buffer_trimming_way == "sentence":
 
             self.chunk_completed_sentence()
-                
+
+            
 
 
             
@@ -197,6 +199,7 @@ class OnlineASRProcessor:
             
             
             self.chunk_completed_segment(res)
+    
        
 
                 # alternative: on any word
@@ -215,9 +218,7 @@ class OnlineASRProcessor:
 
 
 
-        logger.debug(
-            f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}"
-        )
+
         return self.to_flush(o)
 
     def chunk_completed_sentence(self):
@@ -252,7 +253,9 @@ class OnlineASRProcessor:
 
         t = self.commited[-1][1]
 
-        if len(ends) > 1:
+        if len(ends) <= 1:
+            logger.debug(f"--- not enough segments to chunk (<=1 words)")
+        else:
 
             e = ends[-2] + self.buffer_time_offset
             while len(ends) > 2 and e > t:
@@ -263,16 +266,21 @@ class OnlineASRProcessor:
                 self.chunk_at(e)
             else:
                 logger.debug(f"--- last segment not within commited area")
-        else:
-            logger.debug(f"--- not enough segments to chunk")
+
 
     def chunk_at(self, time):
         """trims the hypothesis and audio buffer at "time" """
+        logger.debug(f"chunking at {time:2.2f}s")
+
         self.transcript_buffer.pop_commited(time)
         cut_seconds = time - self.buffer_time_offset
         self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
         self.buffer_time_offset = time
 
+        logger.debug(
+            f"len of audio buffer is now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s"
+            )
+
     def words_to_sentences(self, words):
         """Uses self.tokenize for sentence segmentation of words.
         Returns: [(beg,end,"sentence 1"),...]

From 77d43885a3cb3dd203f3e59ce06de6264ea21bb1 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Mon, 27 Jan 2025 16:29:06 +0100
Subject: [PATCH 3/4] chunk_completed_sentence now takes an argument (=self.commited)

---
 src/whisper_streaming/online_asr.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index bc45f87..e81b395 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -173,9 +173,11 @@ class OnlineASRProcessor:
 
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
+        # Completed words
         self.commited.extend(o)
         completed = self.to_flush(o)
         logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
+        ## The rest is incomplete
         the_rest = self.to_flush(self.transcript_buffer.complete())
         logger.debug(f"INCOMPLETE: {the_rest[2]}")
 
@@ -183,11 +185,12 @@ class OnlineASRProcessor:
 
         if self.buffer_trimming_way == "sentence":
 
-            self.chunk_completed_sentence()
+            self.chunk_completed_sentence(self.commited)
 
             
 
-
+        # TODO: new words in `completed` should not be returned unless they form a sentence
+        # TODO: only complete sentences should go to completed
             
 
         if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec :
@@ -219,13 +222,13 @@ class OnlineASRProcessor:
 
 
 
-        return self.to_flush(o)
+        return completed
 
-    def chunk_completed_sentence(self):
-        if self.commited == []:
+    def chunk_completed_sentence(self, commited_text):
+        if commited_text == []:
             return
 
-        sents = self.words_to_sentences(self.commited)
+        sents = self.words_to_sentences(commited_text)
 
 
 
@@ -436,7 +439,7 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
             ret = self.online.process_iter()
             return ret
         else:
-            print("no online update, only VAD", self.status, file=self.logfile)
+            logger.debug("no online update, only VAD")
             return (None, None, "")
 
     def finish(self):

From 8ee1488c0871f2ba87f11e20ba6e20c2aa94f057 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Mon, 27 Jan 2025 16:49:22 +0100
Subject: [PATCH 4/4] rename to_flush to concatenate_tsw

---
 src/whisper_streaming/online_asr.py | 28 +++++++++++++++++-----------
 whisper_fastapi_online_server.py    |  4 ++--
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py
index e81b395..c4e9612 100644
--- a/src/whisper_streaming/online_asr.py
+++ b/src/whisper_streaming/online_asr.py
@@ -170,15 +170,15 @@ class OnlineASRProcessor:
 
         # transform to [(beg,end,"word1"), ...]
         tsw = self.asr.ts_words(res)
-
+        # insert into HypothesisBuffer
         self.transcript_buffer.insert(tsw, self.buffer_time_offset)
         o = self.transcript_buffer.flush()
         # Completed words
         self.commited.extend(o)
-        completed = self.to_flush(o)
+        completed = self.concatenate_tsw(o) # This will be returned at the end of the function
         logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
         ## The rest is incomplete
-        the_rest = self.to_flush(self.transcript_buffer.complete())
+        the_rest = self.concatenate_tsw(self.transcript_buffer.complete())
         logger.debug(f"INCOMPLETE: {the_rest[2]}")
 
         # there is a newly confirmed text
@@ -245,7 +245,7 @@ class OnlineASRProcessor:
         # we will continue with audio processing at this timestamp
         chunk_at = sents[-2][1]
 
-        logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}")
+
         self.chunk_at(chunk_at)
 
     def chunk_completed_segment(self, res):
@@ -275,6 +275,11 @@ class OnlineASRProcessor:
         """trims the hypothesis and audio buffer at "time" """
         logger.debug(f"chunking at {time:2.2f}s")
 
+        logger.debug(
+            f"len of audio buffer before chunking is: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s"
+            )
+
+
         self.transcript_buffer.pop_commited(time)
         cut_seconds = time - self.buffer_time_offset
         self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
@@ -316,14 +321,14 @@ class OnlineASRProcessor:
         Returns: the same format as self.process_iter()
         """
         o = self.transcript_buffer.complete()
-        f = self.to_flush(o)
+        f = self.concatenate_tsw(o)
         logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}")
         self.buffer_time_offset += len(self.audio_buffer) / 16000
         return f
 
-    def to_flush(
+    def concatenate_tsw(
         self,
-        sents,
+        tsw,
         sep=None,
         offset=0,
     ):
@@ -332,13 +337,14 @@ class OnlineASRProcessor:
         # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
         if sep is None:
             sep = self.asr.sep
-        t = sep.join(s[2] for s in sents)
-        if len(sents) == 0:
+            
+        t = sep.join(s[2] for s in tsw)
+        if len(tsw) == 0:
             b = None
             e = None
         else:
-            b = offset + sents[0][0]
-            e = offset + sents[-1][1]
+            b = offset + tsw[0][0]
+            e = offset + tsw[-1][1]
         return (b, e, t)
 
 
diff --git a/whisper_fastapi_online_server.py b/whisper_fastapi_online_server.py
index 6dd09e3..51f5fe2 100644
--- a/whisper_fastapi_online_server.py
+++ b/whisper_fastapi_online_server.py
@@ -125,13 +125,13 @@ async def websocket_endpoint(websocket: WebSocket):
                     transcription = online.process_iter()[2]
                     full_transcription += transcription
                     if args.vac:
-                        buffer = online.online.to_flush(
+                        buffer = online.online.concatenate_tsw(
                             online.online.transcript_buffer.buffer
                         )[
                             2
                         ]  # We need to access the underlying online object to get the buffer
                     else:
-                        buffer = online.to_flush(online.transcript_buffer.buffer)[2]
+                        buffer = online.concatenate_tsw(online.transcript_buffer.buffer)[2]
                     if (
                         buffer in full_transcription
                     ):  # With VAC, the buffer is not updated until the next chunk is processed