keep a test script in base directory

2025-01-28 17:13:03 +01:00 · 2025-01-28 17:13:03 +01:00 · 611d33cba5
commit 611d33cba5
parent ab7c22d3e3
3 changed files with 183 additions and 173 deletions
--- a/src/init.py
+++ b/src/init.py
--- a/src/whisper_streaming/whisper_online.py
+++ b/src/whisper_streaming/whisper_online.py
@ -5,23 +5,12 @@ import librosa
 from functools import lru_cache
 import time
 import logging
-from backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
-from online_asr import OnlineASRProcessor, VACOnlineASRProcessor
+from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
+from .online_asr import OnlineASRProcessor, VACOnlineASRProcessor

 logger = logging.getLogger(__name__)


-@lru_cache(10**6)
-def load_audio(fname):
-    a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
-    return a
-
-
-def load_audio_chunk(fname, beg, end):
-    audio = load_audio(fname)
-    beg_s = int(beg * 16000)
-    end_s = int(end * 16000)
-    return audio[beg_s:end_s]

 WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
    ","
@ -244,163 +233,3 @@ def set_logging(args, logger, others=[]):
        logging.getLogger(other).setLevel(args.log_level)


-#    logging.getLogger("whisper_online_server").setLevel(args.log_level)
-
-
-if __name__ == "__main__":
-
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--audio_path",
-        type=str,
-        default='samples_jfk.wav',
-        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
-    )
-    add_shared_args(parser)
-    parser.add_argument(
-        "--start_at",
-        type=float,
-        default=0.0,
-        help="Start processing audio at this time.",
-    )
-    parser.add_argument(
-        "--offline", action="store_true", default=False, help="Offline mode."
-    )
-    parser.add_argument(
-        "--comp_unaware",
-        action="store_true",
-        default=False,
-        help="Computationally unaware simulation.",
-    )
-
-    args = parser.parse_args()
-
-    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
-    logfile = None # sys.stderr
-
-    if args.offline and args.comp_unaware:
-        logger.error(
-            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
-        )
-        sys.exit(1)
-
-    #    if args.log_level:
-    #        logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
-    #                            level=getattr(logging, args.log_level))
-
-    set_logging(args, logger,others=["online_asr"])
-
-    audio_path = args.audio_path
-
-    SAMPLING_RATE = 16000
-    duration = len(load_audio(audio_path)) / SAMPLING_RATE
-    logger.info("Audio duration is: %2.2f seconds" % duration)
-
-    asr, online = asr_factory(args, logfile=logfile)
-    if args.vac:
-        min_chunk = args.vac_chunk_size
-    else:
-        min_chunk = args.min_chunk_size
-
-    # load the audio into the LRU cache before we start the timer
-    a = load_audio_chunk(audio_path, 0, 1)
-
-    # warm up the ASR because the very first transcribe takes much more time than the other
-    asr.transcribe(a)
-
-    beg = args.start_at
-    start = time.time() - beg
-
-    def output_transcript(o, now=None):
-        # output format in stdout is like:
-        # 4186.3606 0 1720 Takhle to je
-        # - the first three words are:
-        #    - emission time from beginning of processing, in milliseconds
-        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
-        # - the next words: segment transcript
-        if now is None:
-            now = time.time() - start
-        if o[0] is not None:
-            log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
-
-            logger.debug(
-                log_string
-            )
-
-            if logfile is not None:
-                print(
-                    log_string,
-                    file=logfile,
-                    flush=True,
-                )
-        else:
-            # No text, so no output
-            pass
-
-    if args.offline:  ## offline mode processing (for testing/debugging)
-        a = load_audio(audio_path)
-        online.insert_audio_chunk(a)
-        try:
-            o = online.process_iter()
-        except AssertionError as e:
-            logger.error(f"assertion error: {repr(e)}")
-        else:
-            output_transcript(o)
-        now = None
-    elif args.comp_unaware:  # computational unaware mode
-        end = beg + min_chunk
-        while True:
-            a = load_audio_chunk(audio_path, beg, end)
-            online.insert_audio_chunk(a)
-            try:
-                o = online.process_iter()
-            except AssertionError as e:
-                logger.error(f"assertion error: {repr(e)}")
-                pass
-            else:
-                output_transcript(o, now=end)
-
-            logger.debug(f"## last processed {end:.2f}s")
-
-            if end >= duration:
-                break
-
-            beg = end
-
-            if end + min_chunk > duration:
-                end = duration
-            else:
-                end += min_chunk
-        now = duration
-
-    else:  # online = simultaneous mode
-        end = 0
-        while True:
-            now = time.time() - start
-            if now < end + min_chunk:
-                time.sleep(min_chunk + end - now)
-            end = time.time() - start
-            a = load_audio_chunk(audio_path, beg, end)
-            beg = end
-            online.insert_audio_chunk(a)
-
-            try:
-                o = online.process_iter()
-            except AssertionError as e:
-                logger.error(f"assertion error: {e}")
-                pass
-            else:
-                output_transcript(o)
-            now = time.time() - start
-            logger.debug(
-                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
-            )
-
-            if end >= duration:
-                break
-        now = None
-
-    o = online.finish()
-    output_transcript(o, now=now)
--- a/whisper_noserver_test.py
+++ b/whisper_noserver_test.py
@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+import sys
+import numpy as np
+import librosa
+from functools import lru_cache
+import time
+import logging
+
+logger = logging.getLogger(__name__)
+
+from src.whisper_streaming.whisper_online import *
+
+@lru_cache(10**6)
+def load_audio(fname):
+    a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
+    return a
+
+
+def load_audio_chunk(fname, beg, end):
+    audio = load_audio(fname)
+    beg_s = int(beg * 16000)
+    end_s = int(end * 16000)
+    return audio[beg_s:end_s]
+
+if __name__ == "__main__":
+
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--audio_path",
+        type=str,
+        default='samples_jfk.wav',
+        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
+    )
+    add_shared_args(parser)
+    parser.add_argument(
+        "--start_at",
+        type=float,
+        default=0.0,
+        help="Start processing audio at this time.",
+    )
+    parser.add_argument(
+        "--offline", action="store_true", default=False, help="Offline mode."
+    )
+    parser.add_argument(
+        "--comp_unaware",
+        action="store_true",
+        default=False,
+        help="Computationally unaware simulation.",
+    )
+
+    args = parser.parse_args()
+
+    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
+    logfile = None # sys.stderr
+
+    if args.offline and args.comp_unaware:
+        logger.error(
+            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
+        )
+        sys.exit(1)
+
+    #    if args.log_level:
+    #        logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
+    #                            level=getattr(logging, args.log_level))
+
+    set_logging(args, logger,others=["src.whisper_streaming.online_asr"])
+
+    audio_path = args.audio_path
+
+    SAMPLING_RATE = 16000
+    duration = len(load_audio(audio_path)) / SAMPLING_RATE
+    logger.info("Audio duration is: %2.2f seconds" % duration)
+
+    asr, online = asr_factory(args, logfile=logfile)
+    if args.vac:
+        min_chunk = args.vac_chunk_size
+    else:
+        min_chunk = args.min_chunk_size
+
+    # load the audio into the LRU cache before we start the timer
+    a = load_audio_chunk(audio_path, 0, 1)
+
+    # warm up the ASR because the very first transcribe takes much more time than the other
+    asr.transcribe(a)
+
+    beg = args.start_at
+    start = time.time() - beg
+
+    def output_transcript(o, now=None):
+        # output format in stdout is like:
+        # 4186.3606 0 1720 Takhle to je
+        # - the first three words are:
+        #    - emission time from beginning of processing, in milliseconds
+        #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+        # - the next words: segment transcript
+        if now is None:
+            now = time.time() - start
+        if o[0] is not None:
+            log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
+
+            logger.debug(
+                log_string
+            )
+
+            if logfile is not None:
+                print(
+                    log_string,
+                    file=logfile,
+                    flush=True,
+                )
+        else:
+            # No text, so no output
+            pass
+
+    if args.offline:  ## offline mode processing (for testing/debugging)
+        a = load_audio(audio_path)
+        online.insert_audio_chunk(a)
+        try:
+            o = online.process_iter()
+        except AssertionError as e:
+            logger.error(f"assertion error: {repr(e)}")
+        else:
+            output_transcript(o)
+        now = None
+    elif args.comp_unaware:  # computational unaware mode
+        end = beg + min_chunk
+        while True:
+            a = load_audio_chunk(audio_path, beg, end)
+            online.insert_audio_chunk(a)
+            try:
+                o = online.process_iter()
+            except AssertionError as e:
+                logger.error(f"assertion error: {repr(e)}")
+                pass
+            else:
+                output_transcript(o, now=end)
+
+            logger.debug(f"## last processed {end:.2f}s")
+
+            if end >= duration:
+                break
+
+            beg = end
+
+            if end + min_chunk > duration:
+                end = duration
+            else:
+                end += min_chunk
+        now = duration
+
+    else:  # online = simultaneous mode
+        end = 0
+        while True:
+            now = time.time() - start
+            if now < end + min_chunk:
+                time.sleep(min_chunk + end - now)
+            end = time.time() - start
+            a = load_audio_chunk(audio_path, beg, end)
+            beg = end
+            online.insert_audio_chunk(a)
+
+            try:
+                o = online.process_iter()
+            except AssertionError as e:
+                logger.error(f"assertion error: {e}")
+                pass
+            else:
+                output_transcript(o)
+            now = time.time() - start
+            logger.debug(
+                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
+            )
+
+            if end >= duration:
+                break
+        now = None
+
+    o = online.finish()
+    output_transcript(o, now=now)