From baddf0284b3fd1f2b21cb8231070a566d5862570 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Mon, 27 Jan 2025 15:36:19 +0100 Subject: [PATCH 1/4] buffer length in sentence segmentation is now also max as in segment. --- src/whisper_streaming/online_asr.py | 60 ++++++++++++++++++----------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index 26f6611..1d7ceb4 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -110,6 +110,15 @@ class OnlineASRProcessor: self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming + if self.buffer_trimming_way not in ["sentence", "segment"]: + raise ValueError("buffer_trimming must be either 'sentence' or 'segment'") + if self.buffer_trimming_sec <= 0: + raise ValueError("buffer_trimming_sec must be positive") + elif self.buffer_trimming_sec > 30: + logger.warning( + f"buffer_trimming_sec is set to {self.buffer_trimming_sec}, which is very long. It may cause OOM." 
+ ) + def init(self, offset=None): """run this when starting or restarting processing""" self.audio_buffer = np.array([], dtype=np.float32) @@ -171,35 +180,40 @@ class OnlineASRProcessor: # there is a newly confirmed text - if o and self.buffer_trimming_way == "sentence": # trim the completed sentences - if ( - len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec - ): # longer than this - - logger.debug("chunking sentence") - self.chunk_completed_sentence() + if self.buffer_trimming_way == "sentence": + + self.chunk_completed_sentence() - else: - logger.debug("not enough audio to trim as a sentence") - if self.buffer_trimming_way == "segment": - s = self.buffer_trimming_sec # trim the completed segments longer than s, - else: - s = 30 # if the audio buffer is longer than 30s, trim it + - if len(self.audio_buffer) / self.SAMPLING_RATE > s: + if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec : + + if self.buffer_trimming_way == "sentence": + logger.warning(f"Chunck segment after {self.buffer_trimming_sec} seconds!" + " Even if no sentence was found!" 
+ ) + + self.chunk_completed_segment(res) + + + # alternative: on any word + # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10 + # let's find commited word that is less + # k = len(self.commited)-1 + # while k>0 and self.commited[k][1] > l: + # k -= 1 + # t = self.commited[k][1] + # self.chunk_at(t) + + + + + + - # alternative: on any word - # l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10 - # let's find commited word that is less - # k = len(self.commited)-1 - # while k>0 and self.commited[k][1] > l: - # k -= 1 - # t = self.commited[k][1] - logger.debug("chunking segment") - # self.chunk_at(t) logger.debug( f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}" From 04170153e0c08a9d5f1f92ace6fcd08b4564ec80 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Mon, 27 Jan 2025 16:12:30 +0100 Subject: [PATCH 2/4] improve logging --- src/whisper_streaming/online_asr.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index 1d7ceb4..bc45f87 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -69,6 +69,7 @@ class HypothesisBuffer: return commit def pop_commited(self, time): + "Remove (from the beginning) of commited_in_buffer all the words that are finished before `time`" while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time: self.commited_in_buffer.pop(0) @@ -183,7 +184,8 @@ class OnlineASRProcessor: if self.buffer_trimming_way == "sentence": self.chunk_completed_sentence() - + + @@ -197,6 +199,7 @@ class OnlineASRProcessor: self.chunk_completed_segment(res) + # alternative: on any word @@ -215,9 +218,7 @@ class OnlineASRProcessor: - logger.debug( - f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}" - ) + return self.to_flush(o) def chunk_completed_sentence(self): @@ -252,7 +253,9 @@ class 
OnlineASRProcessor: t = self.commited[-1][1] - if len(ends) > 1: + if len(ends) <= 1: + logger.debug(f"--- not enough segments to chunk (<=1 words)") + else: e = ends[-2] + self.buffer_time_offset while len(ends) > 2 and e > t: @@ -263,16 +266,21 @@ class OnlineASRProcessor: self.chunk_at(e) else: logger.debug(f"--- last segment not within commited area") - else: - logger.debug(f"--- not enough segments to chunk") + def chunk_at(self, time): """trims the hypothesis and audio buffer at "time" """ + logger.debug(f"chunking at {time:2.2f}s") + self.transcript_buffer.pop_commited(time) cut_seconds = time - self.buffer_time_offset self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :] self.buffer_time_offset = time + logger.debug( + f"len of audio buffer is now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s" + ) + def words_to_sentences(self, words): """Uses self.tokenize for sentence segmentation of words. Returns: [(beg,end,"sentence 1"),...] From 77d43885a3cb3dd203f3e59ce06de6264ea21bb1 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Mon, 27 Jan 2025 16:29:06 +0100 Subject: [PATCH 3/4] chunk at sentence takes now an argument =self.comited --- src/whisper_streaming/online_asr.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index bc45f87..e81b395 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -173,9 +173,11 @@ class OnlineASRProcessor: self.transcript_buffer.insert(tsw, self.buffer_time_offset) o = self.transcript_buffer.flush() + # Completed words self.commited.extend(o) completed = self.to_flush(o) logger.debug(f">>>>COMPLETE NOW: {completed[2]}") + ## The rest is incomplete the_rest = self.to_flush(self.transcript_buffer.complete()) logger.debug(f"INCOMPLETE: {the_rest[2]}") @@ -183,11 +185,12 @@ class OnlineASRProcessor: if self.buffer_trimming_way == "sentence": - 
self.chunk_completed_sentence() + self.chunk_completed_sentence(self.commited) - + # TODO: new words in `completed` should not be reterned unless they form a sentence + # TODO: only complete sentences should go to completed if len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec : @@ -219,13 +222,13 @@ class OnlineASRProcessor: - return self.to_flush(o) + return completed - def chunk_completed_sentence(self): - if self.commited == []: + def chunk_completed_sentence(self, commited_text): + if commited_text == []: return - sents = self.words_to_sentences(self.commited) + sents = self.words_to_sentences(commited_text) @@ -436,7 +439,7 @@ class VACOnlineASRProcessor(OnlineASRProcessor): ret = self.online.process_iter() return ret else: - print("no online update, only VAD", self.status, file=self.logfile) + logger.debug("no online update, only VAD") return (None, None, "") def finish(self): From 8ee1488c0871f2ba87f11e20ba6e20c2aa94f057 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Mon, 27 Jan 2025 16:49:22 +0100 Subject: [PATCH 4/4] rename to_flush to concatenate_tsw --- src/whisper_streaming/online_asr.py | 28 +++++++++++++++++----------- whisper_fastapi_online_server.py | 4 ++-- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index e81b395..c4e9612 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -170,15 +170,15 @@ class OnlineASRProcessor: # transform to [(beg,end,"word1"), ...] 
tsw = self.asr.ts_words(res) - + # insert into HypothesisBuffer self.transcript_buffer.insert(tsw, self.buffer_time_offset) o = self.transcript_buffer.flush() # Completed words self.commited.extend(o) - completed = self.to_flush(o) + completed = self.concatenate_tsw(o) # This will be returned at the end of the function logger.debug(f">>>>COMPLETE NOW: {completed[2]}") ## The rest is incomplete - the_rest = self.to_flush(self.transcript_buffer.complete()) + the_rest = self.concatenate_tsw(self.transcript_buffer.complete()) logger.debug(f"INCOMPLETE: {the_rest[2]}") # there is a newly confirmed text @@ -245,7 +245,7 @@ class OnlineASRProcessor: # we will continue with audio processing at this timestamp chunk_at = sents[-2][1] - logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}") + self.chunk_at(chunk_at) def chunk_completed_segment(self, res): @@ -275,6 +275,11 @@ class OnlineASRProcessor: """trims the hypothesis and audio buffer at "time" """ logger.debug(f"chunking at {time:2.2f}s") + logger.debug( + f"len of audio buffer before chunking is: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}s" + ) + + self.transcript_buffer.pop_commited(time) cut_seconds = time - self.buffer_time_offset self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :] @@ -316,14 +321,14 @@ class OnlineASRProcessor: Returns: the same format as self.process_iter() """ o = self.transcript_buffer.complete() - f = self.to_flush(o) + f = self.concatenate_tsw(o) logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}") self.buffer_time_offset += len(self.audio_buffer) / 16000 return f - def to_flush( + def concatenate_tsw( self, - sents, + tsw, sep=None, offset=0, ): @@ -332,13 +337,14 @@ class OnlineASRProcessor: # return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty if sep is None: sep = self.asr.sep - t = sep.join(s[2] for s in sents) - if len(sents) == 0: + + t = sep.join(s[2] for s 
in tsw) + if len(tsw) == 0: b = None e = None else: - b = offset + sents[0][0] - e = offset + sents[-1][1] + b = offset + tsw[0][0] + e = offset + tsw[-1][1] return (b, e, t) diff --git a/whisper_fastapi_online_server.py b/whisper_fastapi_online_server.py index 6dd09e3..51f5fe2 100644 --- a/whisper_fastapi_online_server.py +++ b/whisper_fastapi_online_server.py @@ -125,13 +125,13 @@ async def websocket_endpoint(websocket: WebSocket): transcription = online.process_iter()[2] full_transcription += transcription if args.vac: - buffer = online.online.to_flush( + buffer = online.online.concatenate_tsw( online.online.transcript_buffer.buffer )[ 2 ] # We need to access the underlying online object to get the buffer else: - buffer = online.to_flush(online.transcript_buffer.buffer)[2] + buffer = online.concatenate_tsw(online.transcript_buffer.buffer)[2] if ( buffer in full_transcription ): # With VAC, the buffer is not updated until the next chunk is processed