From 42d2784c20e9ef776ee41ea4c690fc63c8fbb2df Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 11:18:54 +0100 Subject: [PATCH 1/6] clearer log messages for sentence segmentation --- src/whisper_streaming/online_asr.py | 12 +++++++++--- whisper_online.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index dc34fd8..4b4f1ad 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -194,10 +194,16 @@ class OnlineASRProcessor: def chunk_completed_sentence(self): if self.commited == []: return - logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited]) + + raw_text = self.asr.sep.join([s[2] for s in self.commited]) + logger.debug(f"[Sentence-segmentation] Raw Text: {raw_text}") + sents = self.words_to_sentences(self.commited) + + + for s in sents: - logger.debug(f"\t\tSENT: {s}") + logger.debug(f"[Sentence-segmentation] completed sentence: {s}") if len(sents) < 2: return while len(sents) > 2: @@ -205,7 +211,7 @@ class OnlineASRProcessor: # we will continue with audio processing at this timestamp chunk_at = sents[-2][1] - logger.debug(f"--- sentence chunked at {chunk_at:2.2f}") + logger.debug(f"[Sentence-segmentation]: sentence chunked at {chunk_at:2.2f}") self.chunk_at(chunk_at) def chunk_completed_segment(self, res): diff --git a/whisper_online.py b/whisper_online.py index 077e660..55a1183 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -58,7 +58,7 @@ def create_tokenizer(lan): lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split() ): - logger.debug( + logger.warning( f"{lan} code is not supported by wtpsplit. Going to use None lang_code option." ) lan = None From 42935805817602ef4c10a569eec0134590df2d4d Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 12:06:03 +0100 Subject: [PATCH 2/6] use moses sentence segmenter instead of tokenizer --- src/whisper_streaming/online_asr.py | 41 ++++++++++++++++++----------- whisper_online.py | 7 ++--- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index 4b4f1ad..522a243 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -87,11 +87,20 @@ class OnlineASRProcessor: buffer_trimming=("segment", 15), logfile=sys.stderr, ): - """asr: WhisperASR object - tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all. - ("segment", 15) - buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option. - logfile: where to store the log. + """ + Initialize OnlineASRProcessor. + + Args: + asr: WhisperASR object + tokenize_method: Sentence tokenizer function for the target language. + Must be a function that takes a list of text as input like MosesSentenceSplitter. + Can be None if using "segment" buffer trimming option. + buffer_trimming: Tuple of (option, seconds) where: + - option: Either "sentence" or "segment" + - seconds: Number of seconds threshold for buffer trimming + Default is ("segment", 15) + logfile: File to store logs + """ self.asr = asr self.tokenize = tokenize_method @@ -194,24 +203,25 @@ class OnlineASRProcessor: def chunk_completed_sentence(self): if self.commited == []: return - - raw_text = self.asr.sep.join([s[2] for s in self.commited]) - logger.debug(f"[Sentence-segmentation] Raw Text: {raw_text}") sents = self.words_to_sentences(self.commited) - for s in sents: - logger.debug(f"[Sentence-segmentation] completed sentence: {s}") if len(sents) < 2: + logger.debug(f"[Sentence-segmentation] no sentence segmented.") return - while len(sents) > 2: - sents.pop(0) + + + + identified_sentence= "\n - ".join([f"{s[0]*1000:.0f}-{s[1]*1000:.0f} {s[2]}" for s in sents]) + logger.debug(f"[Sentence-segmentation] identified sentences:\n - {identified_sentence}") + + # we will continue with audio processing at this timestamp chunk_at = sents[-2][1] - logger.debug(f"[Sentence-segmentation]: sentence chunked at {chunk_at:2.2f}") + logger.debug(f"[Sentence-segmentation]: sentence will be chunked at {chunk_at:2.2f}") self.chunk_at(chunk_at) def chunk_completed_segment(self, res): @@ -249,8 +259,9 @@ class OnlineASRProcessor: """ cwords = [w for w in words] - t = " ".join(o[2] for o in cwords) - s = self.tokenize(t) + t = self.asr.sep.join(o[2] for o in cwords) + logger.debug(f"[Sentence-segmentation] Raw Text: {t}") + s = self.tokenize([t]) out = [] while s: beg = None diff --git a/whisper_online.py b/whisper_online.py index 55a1183..f553d45 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -49,16 +49,16 @@ def create_tokenizer(lan): lan in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split() ): - from mosestokenizer import MosesTokenizer + from mosestokenizer import MosesSentenceSplitter - return MosesTokenizer(lan) + return MosesSentenceSplitter(lan) # the following languages are in Whisper, but not in wtpsplit: if ( lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split() ): - logger.warning( + logger.debug( f"{lan} code is not supported by wtpsplit. Going to use None lang_code option." ) lan = None @@ -204,6 +204,7 @@ def backend_factory(args): # Create the tokenizer if args.buffer_trimming == "sentence": + tokenizer = create_tokenizer(tgt_language) else: tokenizer = None From 9f262813ecda75703aa0e30235b2e9692e29ca48 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 10:39:38 +0100 Subject: [PATCH 3/6] sep for mlx is also "" --- src/whisper_streaming/backends.py | 2 +- src/whisper_streaming/online_asr.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/whisper_streaming/backends.py b/src/whisper_streaming/backends.py index 682cfc5..99ba762 100644 --- a/src/whisper_streaming/backends.py +++ b/src/whisper_streaming/backends.py @@ -164,7 +164,7 @@ class MLXWhisper(ASRBase): Significantly faster than faster-whisper (without CUDA) on Apple M1. """ - sep = " " + sep = "" # In my experience in french it should also be no space. def load_model(self, modelsize=None, cache_dir=None, model_dir=None): """ diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index 522a243..207da01 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -175,7 +175,13 @@ class OnlineASRProcessor: if ( len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec ): # longer than this + + logger.debug("chunking sentence") self.chunk_completed_sentence() + + + else: + logger.debug("not enough audio to trim as a sentence") if self.buffer_trimming_way == "segment": s = self.buffer_trimming_sec # trim the completed segments longer than s, @@ -286,7 +292,7 @@ class OnlineASRProcessor: """ o = self.transcript_buffer.complete() f = self.to_flush(o) - logger.debug(f"last, noncommited: {f}") + logger.debug(f"last, noncommited: {f[0]*1000:.0f}-{f[1]*1000:.0f}: {f[2]}") self.buffer_time_offset += len(self.audio_buffer) / 16000 return f From 25eb276794171f163e3fe035f209550e5740305e Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 14:08:41 +0100 Subject: [PATCH 4/6] ignore wav and scripts --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b6e4761..88eef4b 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +*.wav +run_*.sh \ No newline at end of file From 69a2ed6bfbd42eb07b1168ca03f417f93376dd16 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 14:45:45 +0100 Subject: [PATCH 5/6] add logger for online asr --- whisper_online.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/whisper_online.py b/whisper_online.py index f553d45..9a61c2b 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -236,10 +236,12 @@ def asr_factory(args, logfile=sys.stderr): online = online_factory(args, asr, tokenizer, logfile=logfile) return asr, online -def set_logging(args, logger, other="_server"): +def set_logging(args, logger, others=[]): logging.basicConfig(format="%(levelname)s\t%(message)s") # format='%(name)s logger.setLevel(args.log_level) - logging.getLogger("whisper_online" + other).setLevel(args.log_level) + + for other in others: + logging.getLogger(other).setLevel(args.log_level) # logging.getLogger("whisper_online_server").setLevel(args.log_level) @@ -276,7 +278,7 @@ if __name__ == "__main__": args = parser.parse_args() # reset to store stderr to different file stream, e.g. open(os.devnull,"w") - logfile = sys.stderr + logfile = None # sys.stderr if args.offline and args.comp_unaware: logger.error( @@ -288,7 +290,7 @@ if __name__ == "__main__": # logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s', # level=getattr(logging, args.log_level)) - set_logging(args, logger) + set_logging(args, logger,others=["src.whisper_streaming.online_asr"]) audio_path = args.audio_path From f0eaffacd3e174646d59918be5cc7966bb1d8211 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Tue, 21 Jan 2025 14:59:36 +0100 Subject: [PATCH 6/6] improve logging in whisper_online.py --- src/whisper_streaming/online_asr.py | 2 +- whisper_online.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/whisper_streaming/online_asr.py b/src/whisper_streaming/online_asr.py index 207da01..26f6611 100644 --- a/src/whisper_streaming/online_asr.py +++ b/src/whisper_streaming/online_asr.py @@ -151,7 +151,7 @@ class OnlineASRProcessor: """ prompt, non_prompt = self.prompt() - logger.debug(f"PROMPT: {prompt}") + logger.debug(f"PROMPT(previous): {prompt}") logger.debug(f"CONTEXT: {non_prompt}") logger.debug( f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}" diff --git a/whisper_online.py b/whisper_online.py index 9a61c2b..cd5d005 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -323,15 +323,18 @@ if __name__ == "__main__": if now is None: now = time.time() - start if o[0] is not None: - print( - "%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]), - file=logfile, - flush=True, - ) - print( - "%1.4f %1.0f %1.0f %s" % (now * 1000, o[0] * 1000, o[1] * 1000, o[2]), - flush=True, + log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}" + + logger.debug( + log_string ) + + if logfile is not None: + print( + log_string, + file=logfile, + flush=True, + ) else: # No text, so no output pass