keep a test script in base directory
This commit is contained in:
parent
ab7c22d3e3
commit
611d33cba5
3 changed files with 183 additions and 173 deletions
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
|
|
@ -5,23 +5,12 @@ import librosa
|
|||
from functools import lru_cache
|
||||
import time
|
||||
import logging
|
||||
from backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
|
||||
from online_asr import OnlineASRProcessor, VACOnlineASRProcessor
|
||||
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
|
||||
from .online_asr import OnlineASRProcessor, VACOnlineASRProcessor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@lru_cache(maxsize=10**6)
def load_audio(fname):
    """Load ``fname`` through librosa as float32 samples at 16000 Hz.

    The result is memoised per ``fname`` by ``lru_cache``, so a repeated call
    for the same file returns the cached array object instead of reading the
    file again.
    """
    samples, _rate = librosa.load(fname, sr=16000, dtype=np.float32)
    return samples
|
||||
|
||||
|
||||
def load_audio_chunk(fname, beg, end):
    """Return the part of ``fname`` between ``beg`` and ``end`` seconds.

    The full recording is obtained from ``load_audio`` and then sliced; the
    offsets are converted to sample indices at 16000 samples per second and
    truncated to integers.
    """
    samples = load_audio(fname)
    window = slice(int(beg * 16000), int(end * 16000))
    return samples[window]
|
||||
|
||||
WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
|
||||
","
|
||||
|
|
@ -244,163 +233,3 @@ def set_logging(args, logger, others=[]):
|
|||
logging.getLogger(other).setLevel(args.log_level)
|
||||
|
||||
|
||||
# logging.getLogger("whisper_online_server").setLevel(args.log_level)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Simulates live streaming of an audio file through the ASR processor.
    # Three modes are supported:
    #   --offline       the whole file is inserted and processed in one step
    #   --comp_unaware  fixed-size chunks, with the emission time taken from the
    #                   audio position instead of the wall clock
    #   (default)       simultaneous mode, paced by the wall clock

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--audio_path",
        type=str,
        default='samples_jfk.wav',
        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
    )
    add_shared_args(parser)
    parser.add_argument(
        "--start_at",
        type=float,
        default=0.0,
        help="Start processing audio at this time.",
    )
    parser.add_argument(
        "--offline", action="store_true", default=False, help="Offline mode."
    )
    parser.add_argument(
        "--comp_unaware",
        action="store_true",
        default=False,
        help="Computationally unaware simulation.",
    )

    args = parser.parse_args()

    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
    logfile = None  # sys.stderr

    # The two simulation flags are mutually exclusive.
    if args.offline and args.comp_unaware:
        logger.error(
            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
        )
        sys.exit(1)

    set_logging(args, logger, others=["online_asr"])

    audio_path = args.audio_path

    SAMPLING_RATE = 16000
    duration = len(load_audio(audio_path)) / SAMPLING_RATE
    logger.info("Audio duration is: %2.2f seconds", duration)

    asr, online = asr_factory(args, logfile=logfile)
    min_chunk = args.vac_chunk_size if args.vac else args.min_chunk_size

    # load the audio into the LRU cache before we start the timer
    a = load_audio_chunk(audio_path, 0, 1)

    # warm up the ASR because the very first transcribe takes much more time than the other
    asr.transcribe(a)

    beg = args.start_at
    # Back-date the clock so that "time.time() - start" reads `beg` right now,
    # i.e. elapsed time and audio position share the same origin.
    start = time.time() - beg

    def output_transcript(o, now=None):
        """Log one transcript segment and mirror it to ``logfile`` if set.

        ``o`` is indexed as (begin, end, text); nothing is emitted when
        ``o[0]`` is None. ``now`` is the emission time in seconds and defaults
        to the wall-clock time elapsed since ``start``.

        Emitted line, e.g.:
            4186, 0-1720 (+2s): Takhle to je
        - emission time from the beginning of processing, in milliseconds
        - beg-end timestamps of the segment in milliseconds, as estimated by
          the Whisper model (not accurate, but useful anyway)
        - lag between the emission time and the segment end, in seconds
        - the segment transcript
        """
        if now is None:
            now = time.time() - start
        if o[0] is None:
            # No text, so no output
            return

        log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
        logger.debug(log_string)
        if logfile is not None:
            print(log_string, file=logfile, flush=True)

    if args.offline:  ## offline mode processing (for testing/debugging)
        a = load_audio(audio_path)
        online.insert_audio_chunk(a)
        try:
            o = online.process_iter()
        except AssertionError as e:
            logger.error(f"assertion error: {repr(e)}")
        else:
            output_transcript(o)
        now = None

    elif args.comp_unaware:  # computational unaware mode
        end = beg + min_chunk
        while True:
            a = load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {repr(e)}")
            else:
                # Emission time is the audio position, not the wall clock.
                output_transcript(o, now=end)

            logger.debug(f"## last processed {end:.2f}s")

            if end >= duration:
                break

            beg = end
            # Advance by one chunk, but never past the end of the audio.
            if end + min_chunk > duration:
                end = duration
            else:
                end += min_chunk
        now = duration

    else:  # online = simultaneous mode
        # Start the window at the requested offset. The clock already reads
        # `beg`, so starting from 0 would skip the first wait and push a
        # near-empty chunk whenever --start_at >= min_chunk.
        end = beg
        while True:
            now = time.time() - start
            # Wait until at least min_chunk seconds of new audio "arrived".
            if now < end + min_chunk:
                time.sleep(min_chunk + end - now)
            end = time.time() - start
            a = load_audio_chunk(audio_path, beg, end)
            beg = end
            online.insert_audio_chunk(a)

            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {e}")
            else:
                output_transcript(o)
            now = time.time() - start
            logger.debug(
                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
            )

            if end >= duration:
                break
        now = None

    # Flush whatever the processor still holds after the last chunk.
    o = online.finish()
    output_transcript(o, now=now)
|
||||
|
|
|
|||
181
whisper_noserver_test.py
Normal file
181
whisper_noserver_test.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import numpy as np
|
||||
import librosa
|
||||
from functools import lru_cache
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from src.whisper_streaming.whisper_online import *
|
||||
|
||||
@lru_cache(maxsize=10**6)
def load_audio(fname):
    """Load ``fname`` through librosa as float32 samples at 16000 Hz.

    The result is memoised per ``fname`` by ``lru_cache``, so a repeated call
    for the same file returns the cached array object instead of reading the
    file again.
    """
    samples, _rate = librosa.load(fname, sr=16000, dtype=np.float32)
    return samples
|
||||
|
||||
|
||||
def load_audio_chunk(fname, beg, end):
    """Return the part of ``fname`` between ``beg`` and ``end`` seconds.

    The full recording is obtained from ``load_audio`` and then sliced; the
    offsets are converted to sample indices at 16000 samples per second and
    truncated to integers.
    """
    samples = load_audio(fname)
    window = slice(int(beg * 16000), int(end * 16000))
    return samples[window]
|
||||
|
||||
if __name__ == "__main__":
    # Simulates live streaming of an audio file through the ASR processor.
    # Three modes are supported:
    #   --offline       the whole file is inserted and processed in one step
    #   --comp_unaware  fixed-size chunks, with the emission time taken from the
    #                   audio position instead of the wall clock
    #   (default)       simultaneous mode, paced by the wall clock

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--audio_path",
        type=str,
        default='samples_jfk.wav',
        help="Filename of 16kHz mono channel wav, on which live streaming is simulated.",
    )
    add_shared_args(parser)
    parser.add_argument(
        "--start_at",
        type=float,
        default=0.0,
        help="Start processing audio at this time.",
    )
    parser.add_argument(
        "--offline", action="store_true", default=False, help="Offline mode."
    )
    parser.add_argument(
        "--comp_unaware",
        action="store_true",
        default=False,
        help="Computationally unaware simulation.",
    )

    args = parser.parse_args()

    # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
    logfile = None  # sys.stderr

    # The two simulation flags are mutually exclusive.
    if args.offline and args.comp_unaware:
        logger.error(
            "No or one option from --offline and --comp_unaware are available, not both. Exiting."
        )
        sys.exit(1)

    set_logging(args, logger, others=["src.whisper_streaming.online_asr"])

    audio_path = args.audio_path

    SAMPLING_RATE = 16000
    duration = len(load_audio(audio_path)) / SAMPLING_RATE
    logger.info("Audio duration is: %2.2f seconds", duration)

    asr, online = asr_factory(args, logfile=logfile)
    min_chunk = args.vac_chunk_size if args.vac else args.min_chunk_size

    # load the audio into the LRU cache before we start the timer
    a = load_audio_chunk(audio_path, 0, 1)

    # warm up the ASR because the very first transcribe takes much more time than the other
    asr.transcribe(a)

    beg = args.start_at
    # Back-date the clock so that "time.time() - start" reads `beg` right now,
    # i.e. elapsed time and audio position share the same origin.
    start = time.time() - beg

    def output_transcript(o, now=None):
        """Log one transcript segment and mirror it to ``logfile`` if set.

        ``o`` is indexed as (begin, end, text); nothing is emitted when
        ``o[0]`` is None. ``now`` is the emission time in seconds and defaults
        to the wall-clock time elapsed since ``start``.

        Emitted line, e.g.:
            4186, 0-1720 (+2s): Takhle to je
        - emission time from the beginning of processing, in milliseconds
        - beg-end timestamps of the segment in milliseconds, as estimated by
          the Whisper model (not accurate, but useful anyway)
        - lag between the emission time and the segment end, in seconds
        - the segment transcript
        """
        if now is None:
            now = time.time() - start
        if o[0] is None:
            # No text, so no output
            return

        log_string = f"{now*1000:1.0f}, {o[0]*1000:1.0f}-{o[1]*1000:1.0f} ({(now-o[1]):+1.0f}s): {o[2]}"
        logger.debug(log_string)
        if logfile is not None:
            print(log_string, file=logfile, flush=True)

    if args.offline:  ## offline mode processing (for testing/debugging)
        a = load_audio(audio_path)
        online.insert_audio_chunk(a)
        try:
            o = online.process_iter()
        except AssertionError as e:
            logger.error(f"assertion error: {repr(e)}")
        else:
            output_transcript(o)
        now = None

    elif args.comp_unaware:  # computational unaware mode
        end = beg + min_chunk
        while True:
            a = load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {repr(e)}")
            else:
                # Emission time is the audio position, not the wall clock.
                output_transcript(o, now=end)

            logger.debug(f"## last processed {end:.2f}s")

            if end >= duration:
                break

            beg = end
            # Advance by one chunk, but never past the end of the audio.
            if end + min_chunk > duration:
                end = duration
            else:
                end += min_chunk
        now = duration

    else:  # online = simultaneous mode
        # Start the window at the requested offset. The clock already reads
        # `beg`, so starting from 0 would skip the first wait and push a
        # near-empty chunk whenever --start_at >= min_chunk.
        end = beg
        while True:
            now = time.time() - start
            # Wait until at least min_chunk seconds of new audio "arrived".
            if now < end + min_chunk:
                time.sleep(min_chunk + end - now)
            end = time.time() - start
            a = load_audio_chunk(audio_path, beg, end)
            beg = end
            online.insert_audio_chunk(a)

            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {e}")
            else:
                output_transcript(o)
            now = time.time() - start
            logger.debug(
                f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}"
            )

            if end >= duration:
                break
        now = None

    # Flush whatever the processor still holds after the last chunk.
    o = online.finish()
    output_transcript(o, now=now)
|
||||
Loading…
Reference in a new issue