Add handling of partial silence within audio chunks. Bump to 0.2.14.post3
This commit is contained in:
parent
437641fb43
commit
e9b4ceeee5
4 changed files with 13 additions and 71 deletions
|
|
@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "whisperlivekit"
|
name = "whisperlivekit"
|
||||||
version = "0.2.14.post2"
|
version = "0.2.14.post3"
|
||||||
description = "Real-time speech-to-text with speaker diarization using Whisper"
|
description = "Real-time speech-to-text with speaker diarization using Whisper"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
authors = [
|
authors = [
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ from whisperlivekit.backend_support import (
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from whisperlivekit.simul_whisper.config import AlignAttConfig
|
from whisperlivekit.simul_whisper.config import AlignAttConfig
|
||||||
from whisperlivekit.simul_whisper.simul_whisper import PaddedAlignAttWhisper
|
from whisperlivekit.simul_whisper.simul_whisper import AlignAtt
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -34,6 +34,8 @@ if HAS_FASTER_WHISPER:
|
||||||
else:
|
else:
|
||||||
WhisperModel = None
|
WhisperModel = None
|
||||||
|
|
||||||
|
MIN_DURATION_REAL_SILENCE = 5
|
||||||
|
|
||||||
class SimulStreamingOnlineProcessor:
|
class SimulStreamingOnlineProcessor:
|
||||||
SAMPLING_RATE = 16000
|
SAMPLING_RATE = 16000
|
||||||
|
|
||||||
|
|
@ -56,7 +58,7 @@ class SimulStreamingOnlineProcessor:
|
||||||
|
|
||||||
def load_new_backend(self):
|
def load_new_backend(self):
|
||||||
model = self.asr.get_new_model_instance()
|
model = self.asr.get_new_model_instance()
|
||||||
self.model = PaddedAlignAttWhisper(
|
self.model = AlignAtt(
|
||||||
cfg=self.asr.cfg,
|
cfg=self.asr.cfg,
|
||||||
loaded_model=model,
|
loaded_model=model,
|
||||||
mlx_encoder=self.asr.mlx_encoder,
|
mlx_encoder=self.asr.mlx_encoder,
|
||||||
|
|
@ -69,10 +71,10 @@ class SimulStreamingOnlineProcessor:
|
||||||
|
|
||||||
def end_silence(self, silence_duration, offset):
|
def end_silence(self, silence_duration, offset):
|
||||||
"""
|
"""
|
||||||
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
|
If the silence lasts at least MIN_DURATION_REAL_SILENCE seconds, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame.
|
||||||
"""
|
"""
|
||||||
self.end += silence_duration
|
self.end += silence_duration
|
||||||
long_silence = silence_duration >= 5
|
long_silence = silence_duration >= MIN_DURATION_REAL_SILENCE
|
||||||
if not long_silence:
|
if not long_silence:
|
||||||
gap_len = int(16000 * silence_duration)
|
gap_len = int(16000 * silence_duration)
|
||||||
if gap_len > 0:
|
if gap_len > 0:
|
||||||
|
|
@ -306,7 +308,7 @@ class SimulStreamingASR():
|
||||||
if warmup_audio is not None:
|
if warmup_audio is not None:
|
||||||
warmup_audio = torch.from_numpy(warmup_audio).float()
|
warmup_audio = torch.from_numpy(warmup_audio).float()
|
||||||
if self.fast_encoder:
|
if self.fast_encoder:
|
||||||
temp_model = PaddedAlignAttWhisper(
|
temp_model = AlignAtt(
|
||||||
cfg=self.cfg,
|
cfg=self.cfg,
|
||||||
loaded_model=whisper_model,
|
loaded_model=whisper_model,
|
||||||
mlx_encoder=self.mlx_encoder,
|
mlx_encoder=self.mlx_encoder,
|
||||||
|
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
||||||
class Tokens:
    """Thin wrapper around a token container.

    ``tokens`` is stored as-is. The string form calls ``tokens.tolist()``,
    so the container is presumably a ``torch.Tensor`` or another array-like
    exposing that method -- confirm against callers.
    """

    def __init__(self, tokens):
        self.tokens = tokens

    def __str__(self):
        """Return the tokens rendered as a plain Python list."""
        return str(self.tokens.tolist())

    def __repr__(self):
        return self.__str__()


class BeamTokens(Tokens):
    """Tokens of a beam-search hypothesis set, tagged with the beam size.

    Args:
        tokens: container exposing ``tolist()`` and ``clone()``
            (presumably a ``torch.Tensor`` -- confirm against callers).
        beam_size: number of beams the tokens belong to.
    """

    def __init__(self, tokens, beam_size):
        super().__init__(tokens)
        self.beam_size = beam_size

    def clone(self):
        """Return a new ``BeamTokens`` holding a clone of the tokens.

        The beam size is carried over. The constructor requires it, so
        omitting it (as the previous implementation did) raised
        ``TypeError`` on every call.
        """
        return BeamTokens(self.tokens.clone(), self.beam_size)

    def __str__(self):
        return f"BeamTokens({self.tokens.tolist()}, beam_size={self.beam_size})"

    def __repr__(self):
        return self.__str__()

    def as_text(self, tokenizer):
        """Decode the tokens to text via ``tokenizer.decode``."""
        return tokenizer.decode(self.tokens)


class Logits(Tokens):
    """Wrapper around a logits container; only its shape is printed.

    The container must expose a ``shape`` attribute (presumably a
    ``torch.Tensor`` -- confirm against callers).
    """

    def __init__(self, logits):
        super().__init__(logits)

    def __str__(self):
        # Print only the shape: the full contents would be too large to log.
        return f"Logits({self.tokens.shape})"

    def __repr__(self):
        return self.__str__()
|
|
||||||
|
|
@ -1,17 +1,16 @@
|
||||||
# This code was originally in simul_whisper/transcriber/simul_whisper.py . It is adapted a lot for SimulStreaming.
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from whisperlivekit.whisper import load_model, DecodingOptions, tokenizer
|
from whisperlivekit.whisper import DecodingOptions, tokenizer
|
||||||
from .config import AlignAttConfig
|
from .config import AlignAttConfig
|
||||||
from whisperlivekit.timed_objects import ASRToken
|
from whisperlivekit.timed_objects import ASRToken
|
||||||
from whisperlivekit.whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES
|
from whisperlivekit.whisper.audio import log_mel_spectrogram, TOKENS_PER_SECOND, pad_or_trim, N_SAMPLES, N_FRAMES
|
||||||
from whisperlivekit.whisper.timing import median_filter
|
from whisperlivekit.whisper.timing import median_filter
|
||||||
from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens, detect_language
|
from whisperlivekit.whisper.decoding import GreedyDecoder, BeamSearchDecoder, SuppressTokens
|
||||||
from .beam import BeamPyTorchInference
|
from .beam import BeamPyTorchInference
|
||||||
from .eow_detection import fire_at_boundary, load_cif
|
from .eow_detection import fire_at_boundary, load_cif
|
||||||
import os
|
import os
|
||||||
|
|
@ -22,26 +21,18 @@ from whisperlivekit.backend_support import (
|
||||||
faster_backend_available,
|
faster_backend_available,
|
||||||
)
|
)
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from ..timed_objects import PUNCTUATION_MARKS
|
from ..timed_objects import PUNCTUATION_MARKS
|
||||||
from .generation_progress import *
|
|
||||||
|
|
||||||
DEC_PAD = 50257
|
DEC_PAD = 50257
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
HAS_MLX_WHISPER = False
|
|
||||||
HAS_FASTER_WHISPER = False
|
|
||||||
|
|
||||||
if mlx_backend_available():
|
if mlx_backend_available():
|
||||||
from mlx_whisper.audio import log_mel_spectrogram as mlx_log_mel_spectrogram
|
from mlx_whisper.audio import log_mel_spectrogram as mlx_log_mel_spectrogram
|
||||||
from mlx_whisper.transcribe import pad_or_trim as mlx_pad_or_trim
|
from mlx_whisper.transcribe import pad_or_trim as mlx_pad_or_trim
|
||||||
HAS_MLX_WHISPER = True
|
|
||||||
|
|
||||||
if faster_backend_available():
|
if faster_backend_available():
|
||||||
from faster_whisper.audio import pad_or_trim as fw_pad_or_trim
|
from faster_whisper.audio import pad_or_trim as fw_pad_or_trim
|
||||||
from faster_whisper.feature_extractor import FeatureExtractor
|
from faster_whisper.feature_extractor import FeatureExtractor
|
||||||
HAS_FASTER_WHISPER = True
|
|
||||||
|
|
||||||
USE_MLCORE = False
|
USE_MLCORE = False
|
||||||
|
|
||||||
|
|
@ -60,7 +51,7 @@ def load_coreml_encoder():
|
||||||
return _coreml_encoder, _coreml_input_name, _coreml_output_name
|
return _coreml_encoder, _coreml_input_name, _coreml_output_name
|
||||||
|
|
||||||
|
|
||||||
class PaddedAlignAttWhisper:
|
class AlignAtt:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
cfg: AlignAttConfig,
|
cfg: AlignAttConfig,
|
||||||
|
|
@ -72,7 +63,7 @@ class PaddedAlignAttWhisper:
|
||||||
|
|
||||||
self.model = loaded_model
|
self.model = loaded_model
|
||||||
self.mlx_encoder = mlx_encoder
|
self.mlx_encoder = mlx_encoder
|
||||||
self.fw_encoder = fw_encoder
|
self.fw_encoder = fw_encoder
|
||||||
if fw_encoder:
|
if fw_encoder:
|
||||||
self.fw_feature_extractor = FeatureExtractor(feature_size=self.model.dims.n_mels)
|
self.fw_feature_extractor = FeatureExtractor(feature_size=self.model.dims.n_mels)
|
||||||
self.coreml_encoder_tuple = None
|
self.coreml_encoder_tuple = None
|
||||||
|
|
@ -414,14 +405,6 @@ class PaddedAlignAttWhisper:
|
||||||
else:
|
else:
|
||||||
input_segments = self.segments[0]
|
input_segments = self.segments[0]
|
||||||
|
|
||||||
# if self.cfg.language == "auto" and self.reset_tokenizer_to_auto_next_call:
|
|
||||||
# logger.debug("Resetting tokenizer to auto for new sentence.")
|
|
||||||
# self.create_tokenizer(None)
|
|
||||||
# self.detected_language = None
|
|
||||||
# self.init_tokens()
|
|
||||||
# self.reset_tokenizer_to_auto_next_call = False
|
|
||||||
|
|
||||||
# NEW: we can use a different encoder before using standard Whisper for cross-attention with the hooks on the decoder
|
|
||||||
beg_encode = time()
|
beg_encode = time()
|
||||||
if self.use_mlcore:
|
if self.use_mlcore:
|
||||||
coreml_encoder, coreml_input_name, coreml_output_name = self.coreml_encoder_tuple
|
coreml_encoder, coreml_input_name, coreml_output_name = self.coreml_encoder_tuple
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue