Add handling of partial silence within audio chunks. Bump to 0.2.14.post3

This commit is contained in:
Quentin Fuxa 2025-11-17 22:52:00 +01:00
parent 437641fb43
commit e9b4ceeee5
4 changed files with 13 additions and 71 deletions

View file

@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "whisperlivekit" name = "whisperlivekit"
version = "0.2.14.post2" version = "0.2.14.post3"
description = "Real-time speech-to-text with speaker diarization using Whisper" description = "Real-time speech-to-text with speaker diarization using Whisper"
readme = "README.md" readme = "README.md"
authors = [ authors = [

View file

@ -18,7 +18,7 @@ from whisperlivekit.backend_support import (
import torch import torch
from whisperlivekit.simul_whisper.config import AlignAttConfig from whisperlivekit.simul_whisper.config import AlignAttConfig
from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper from whisperlivekit.simul_whisper.simul_whisper import AlignAtt
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -34,6 +34,8 @@ if HAS_FASTER_WHISPER:
else: else:
WhisperModel = None WhisperModel = None
MIN_DURATION_REAL_SILENCE = 5
class SimulStreamingOnlineProcessor: class SimulStreamingOnlineProcessor:
SAMPLING_RATE = 16000 SAMPLING_RATE = 16000
@ -56,7 +58,7 @@ class SimulStreamingOnlineProcessor:
def load_new_backend(self): def load_new_backend(self):
model = self.asr.get_new_model_instance() model = self.asr.get_new_model_instance()
self.model = PaddedAlignAttWhisper( self.model = AlignAtt(
cfg=self.asr.cfg, cfg=self.asr.cfg,
loaded_model=model, loaded_model=model,
mlx_encoder=self.asr.mlx_encoder, mlx_encoder=self.asr.mlx_encoder,
@ -69,10 +71,10 @@ class SimulStreamingOnlineProcessor:
def end_silence(self, silence_duration, offset): def end_silence(self, silence_duration, offset):
""" """
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame If silences are > MIN_DURATION_REAL_SILENCE, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
""" """
self.end += silence_duration self.end += silence_duration
long_silence = silence_duration >= 5 long_silence = silence_duration >= MIN_DURATION_REAL_SILENCE
if not long_silence: if not long_silence:
gap_len = int(16000 * silence_duration) gap_len = int(16000 * silence_duration)
if gap_len > 0: if gap_len > 0:
@ -306,7 +308,7 @@ class SimulStreamingASR():
if warmup_audio is not None: if warmup_audio is not None:
warmup_audio = torch.from_numpy(warmup_audio).float() warmup_audio = torch.from_numpy(warmup_audio).float()
if self.fast_encoder: if self.fast_encoder:
temp_model = PaddedAlignAttWhisper( temp_model = AlignAtt(
cfg=self.cfg, cfg=self.cfg,
loaded_model=whisper_model, loaded_model=whisper_model,
mlx_encoder=self.mlx_encoder, mlx_encoder=self.mlx_encoder,

View file

@ -1,43 +0,0 @@
class Tokens:
    """Thin wrapper around a token container.

    The string representation relies on the wrapped object providing a
    ``tolist()`` method.
    """

    def __init__(self, tokens):
        self.tokens = tokens

    # def clone(self):
    #     return Tokens(self.tokens.clone())

    def __str__(self):
        """Return the wrapped tokens rendered as a plain Python list."""
        return str(self.tokens.tolist())

    def __repr__(self):
        # Same rendering as str() so containers of Tokens print readably.
        return self.__str__()
class BeamTokens(Tokens):
    """Token container that also records a beam size.

    The wrapped ``tokens`` object must provide ``tolist()`` (used for the
    string representation) and ``clone()`` (used by :meth:`clone`).
    """

    def __init__(self, tokens, beam_size):
        super().__init__(tokens)
        self.beam_size = beam_size

    def clone(self):
        """Return a new BeamTokens wrapping a clone of the tokens."""
        # beam_size has no default in __init__, so it must be forwarded;
        # omitting it raised TypeError on every call.
        return BeamTokens(self.tokens.clone(), self.beam_size)

    def __str__(self):
        return f"BeamTokens({self.tokens.tolist()}, beam_size={self.beam_size})"

    def __repr__(self):
        return self.__str__()

    def as_text(self, tokenizer):
        """Decode the wrapped tokens with ``tokenizer.decode``."""
        return tokenizer.decode(self.tokens)
class Logits(Tokens):
    """Wrapper around a logits container, stored in ``self.tokens``.

    The string representation shows only the ``shape`` attribute of the
    wrapped object rather than its contents.
    """

    def __init__(self, logits):
        super().__init__(logits)

    # NOTE(review): the disabled clone below refers to self.beam_size, which
    # this class never sets; it would need fixing before being re-enabled.
    # def clone(self):
    #     return Logits(self.tokens.clone(), self.beam_size)

    def __str__(self):
        return f"Logits({self.tokens.shape})"

    def __repr__(self):
        return self.__str__()

View file

@ -1,17 +1,16 @@
# This code was originally in simul_whisper/transcriber/simul_whisper.py. It has been heavily adapted for SimulStreaming.
import os import os
import logging import logging
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import numpy as np
from whisperlivekit.whisper import load_model, DecodingOptions, tokenizer from whisperlivekit.whisper import DecodingOptions, tokenizer
from .config import AlignAttConfig from .config import AlignAttConfig
from whisperlivekit.timed_objects import ASRToken from whisperlivekit.timed_objects import ASRToken
from whisperlivekit.whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES from whisperlivekit.whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES
from whisperlivekit.whisper.timing import median_filter from whisperlivekit.whisper.timing import median_filter
from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens, detect_language from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens
from .beam import BeamPyTorchInference from .beam import BeamPyTorchInference
from .eow_detection import fire_at_boundary, load_cif from .eow_detection import fire_at_boundary, load_cif
import os import os
@ -22,26 +21,18 @@ from whisperlivekit.backend_support import (
faster_backend_available, faster_backend_available,
) )
import numpy as np
from ..timed_objects import PUNCTUATION_MARKS from ..timed_objects import PUNCTUATION_MARKS
from .generation_progress import *
DEC_PAD = 50257 DEC_PAD = 50257
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
HAS_MLX_WHISPER = False
HAS_FASTER_WHISPER = False
if mlx_backend_available(): if mlx_backend_available():
from mlx_whisper.audio import log_mel_spectrogram as mlx_log_mel_spectrogram from mlx_whisper.audio import log_mel_spectrogram as mlx_log_mel_spectrogram
from mlx_whisper.transcribe import pad_or_trim as mlx_pad_or_trim from mlx_whisper.transcribe import pad_or_trim as mlx_pad_or_trim
HAS_MLX_WHISPER = True
if faster_backend_available(): if faster_backend_available():
from faster_whisper.audio import pad_or_trim as fw_pad_or_trim from faster_whisper.audio import pad_or_trim as fw_pad_or_trim
from faster_whisper.feature_extractor import FeatureExtractor from faster_whisper.feature_extractor import FeatureExtractor
HAS_FASTER_WHISPER = True
USE_MLCORE = False USE_MLCORE = False
@ -60,7 +51,7 @@ def load_coreml_encoder():
return _coreml_encoder, _coreml_input_name, _coreml_output_name return _coreml_encoder, _coreml_input_name, _coreml_output_name
class PaddedAlignAttWhisper: class AlignAtt:
def __init__( def __init__(
self, self,
cfg: AlignAttConfig, cfg: AlignAttConfig,
@ -72,7 +63,7 @@ class PaddedAlignAttWhisper:
self.model = loaded_model self.model = loaded_model
self.mlx_encoder = mlx_encoder self.mlx_encoder = mlx_encoder
self.fw_encoder = fw_encoder self.fw_encoder = fw_encoder
if fw_encoder: if fw_encoder:
self.fw_feature_extractor = FeatureExtractor(feature_size=self.model.dims.n_mels) self.fw_feature_extractor = FeatureExtractor(feature_size=self.model.dims.n_mels)
self.coreml_encoder_tuple = None self.coreml_encoder_tuple = None
@ -414,14 +405,6 @@ class PaddedAlignAttWhisper:
else: else:
input_segments = self.segments[0] input_segments = self.segments[0]
# if self.cfg.language == "auto" and self.reset_tokenizer_to_auto_next_call:
# logger.debug("Resetting tokenizer to auto for new sentence.")
# self.create_tokenizer(None)
# self.detected_language = None
# self.init_tokens()
# self.reset_tokenizer_to_auto_next_call = False
# NEW : we can use a different encoder, before using standart whisper for cross attention with the hooks on the decoder
beg_encode = time() beg_encode = time()
if self.use_mlcore: if self.use_mlcore:
coreml_encoder, coreml_input_name, coreml_output_name = self.coreml_encoder_tuple coreml_encoder, coreml_input_name, coreml_output_name = self.coreml_encoder_tuple