cutom alignment heads parameter for custom models

This commit is contained in:
Quentin Fuxa 2025-09-27 11:04:00 +02:00
parent 2fe3ca0188
commit 8cbaeecc75
10 changed files with 179 additions and 177 deletions

View file

@ -140,6 +140,7 @@ async def websocket_endpoint(websocket: WebSocket):
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/available_models.md) | `small` |
| `--model-dir` | Directory containing Whisper model.bin and other files. Overrides `--model`. | `None` |
| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
| `--target-language` | If sets, activates translation using NLLB. Ex: `fr`. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` |
| `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` |
@ -169,6 +170,7 @@ async def websocket_endpoint(websocket: WebSocket):
| SimulStreaming backend options | Description | Default |
|-----------|-------------|---------|
| `--disable-fast-encoder` | Disable Faster Whisper or MLX Whisper backends for the encoder (if installed). Inference can be slower but helpful when GPU memory is limited | `False` |
| `--custom-alignment-heads` | Use your own alignment heads, useful when `--model-dir` is used | `None` |
| `--frame-threshold` | AlignAtt frame threshold (lower = faster, higher = more accurate) | `25` |
| `--beams` | Number of beams for beam search (1 = greedy decoding) | `1` |
| `--decoder` | Force decoder type (`beam` or `greedy`) | `auto` |

View file

@ -72,7 +72,6 @@ class AudioProcessor:
# Models and processing
self.asr = models.asr
self.tokenizer = models.tokenizer
self.vac_model = models.vac_model
if self.args.vac:
self.vac = FixedVADIterator(models.vac_model)
@ -109,7 +108,7 @@ class AudioProcessor:
self.diarization = None
if self.args.transcription:
self.transcription = online_factory(self.args, models.asr, models.tokenizer)
self.transcription = online_factory(self.args, models.asr)
self.sep = self.transcription.asr.sep
if self.args.diarization:
self.diarization = online_diarization_factory(self.args, models.diarization_model)

View file

@ -4,10 +4,15 @@ try:
except ImportError:
from .whisper_streaming_custom.whisper_online import backend_factory
from .whisper_streaming_custom.online_asr import OnlineASRProcessor
from whisperlivekit.warmup import warmup_asr
from argparse import Namespace
import sys
def update_with_kwargs(_dict, kwargs):
_dict.update({
k: v for k, v in kwargs.items() if k in _dict
})
return _dict
class TranscriptionEngine:
_instance = None
_initialized = False
@ -21,20 +26,12 @@ class TranscriptionEngine:
if TranscriptionEngine._initialized:
return
defaults = {
global_params = {
"host": "localhost",
"port": 8000,
"warmup_file": None,
"diarization": False,
"punctuation_split": False,
"min_chunk_size": 0.5,
"model": "tiny",
"model_cache_dir": None,
"model_dir": None,
"lan": "auto",
"task": "transcribe",
"target_language": "",
"backend": "faster-whisper",
"vac": True,
"vac_chunk_size": 0.04,
"log_level": "DEBUG",
@ -43,54 +40,31 @@ class TranscriptionEngine:
"transcription": True,
"vad": True,
"pcm_input": False,
# whisperstreaming params:
"buffer_trimming": "segment",
"confidence_validation": False,
"buffer_trimming_sec": 15,
# simulstreaming params:
"disable_fast_encoder": False,
"frame_threshold": 25,
"beams": 1,
"decoder_type": None,
"audio_max_len": 20.0,
"audio_min_len": 0.0,
"cif_ckpt_path": None,
"never_fire": False,
"init_prompt": None,
"static_init_prompt": None,
"max_context_tokens": None,
"model_path": './base.pt',
"diarization_backend": "sortformer",
# diarization params:
"disable_punctuation_split" : False,
"segmentation_model": "pyannote/segmentation-3.0",
"embedding_model": "pyannote/embedding",
# translation params:
"nllb_backend": "ctranslate2",
"nllb_size": "600M"
"diarization_backend": "sortformer",
}
global_params = update_with_kwargs(global_params, kwargs)
config_dict = {**defaults, **kwargs}
transcription_common_params = {
"backend": "simulstreaming",
"warmup_file": None,
"min_chunk_size": 0.5,
"model_size": "tiny",
"model_cache_dir": None,
"model_dir": None,
"lan": "auto",
"task": "transcribe",
}
transcription_common_params = update_with_kwargs(transcription_common_params, kwargs)
if 'no_transcription' in kwargs:
config_dict['transcription'] = not kwargs['no_transcription']
global_params['transcription'] = not global_params['no_transcription']
if 'no_vad' in kwargs:
config_dict['vad'] = not kwargs['no_vad']
global_params['vad'] = not kwargs['no_vad']
if 'no_vac' in kwargs:
config_dict['vac'] = not kwargs['no_vac']
config_dict.pop('no_transcription', None)
config_dict.pop('no_vad', None)
global_params['vac'] = not kwargs['no_vac']
if 'language' in kwargs:
config_dict['lan'] = kwargs['language']
config_dict.pop('language', None)
self.args = Namespace(**config_dict)
self.args = Namespace(**{**global_params, **transcription_common_params})
self.asr = None
self.tokenizer = None
@ -104,44 +78,57 @@ class TranscriptionEngine:
if self.args.transcription:
if self.args.backend == "simulstreaming":
from whisperlivekit.simul_whisper import SimulStreamingASR
self.tokenizer = None
simulstreaming_kwargs = {}
for attr in ['frame_threshold', 'beams', 'decoder_type', 'audio_max_len', 'audio_min_len',
'cif_ckpt_path', 'never_fire', 'init_prompt', 'static_init_prompt',
'max_context_tokens', 'model_path', 'warmup_file', 'preload_model_count', 'disable_fast_encoder']:
if hasattr(self.args, attr):
simulstreaming_kwargs[attr] = getattr(self.args, attr)
# Add segment_length from min_chunk_size
simulstreaming_kwargs['segment_length'] = getattr(self.args, 'min_chunk_size', 0.5)
simulstreaming_kwargs['task'] = self.args.task
size = self.args.model
simulstreaming_params = {
"disable_fast_encoder": False,
"custom_alignment_heads": None,
"frame_threshold": 25,
"beams": 1,
"decoder_type": None,
"audio_max_len": 20.0,
"audio_min_len": 0.0,
"cif_ckpt_path": None,
"never_fire": False,
"init_prompt": None,
"static_init_prompt": None,
"max_context_tokens": None,
"model_path": './base.pt',
"preload_model_count": 1,
}
simulstreaming_params = update_with_kwargs(simulstreaming_params, kwargs)
self.tokenizer = None
self.asr = SimulStreamingASR(
modelsize=size,
lan=self.args.lan,
cache_dir=getattr(self.args, 'model_cache_dir', None),
model_dir=getattr(self.args, 'model_dir', None),
**simulstreaming_kwargs
**transcription_common_params, **simulstreaming_params
)
else:
self.asr, self.tokenizer = backend_factory(self.args)
warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
whisperstreaming_params = {
"buffer_trimming": "segment",
"confidence_validation": False,
"buffer_trimming_sec": 15,
}
whisperstreaming_params = update_with_kwargs(whisperstreaming_params, kwargs)
self.asr = backend_factory(
**transcription_common_params, **whisperstreaming_params
)
if self.args.diarization:
if self.args.diarization_backend == "diart":
from whisperlivekit.diarization.diart_backend import DiartDiarization
diart_params = {
"segmentation_model": "pyannote/segmentation-3.0",
"embedding_model": "pyannote/embedding",
}
diart_params = update_with_kwargs(diart_params, kwargs)
self.diarization_model = DiartDiarization(
block_duration=self.args.min_chunk_size,
segmentation_model_name=self.args.segmentation_model,
embedding_model_name=self.args.embedding_model
**diart_params
)
elif self.args.diarization_backend == "sortformer":
from whisperlivekit.diarization.sortformer_backend import SortformerDiarization
self.diarization_model = SortformerDiarization()
else:
raise ValueError(f"Unknown diarization backend: {self.args.diarization_backend}")
self.translation_model = None
if self.args.target_language:
@ -149,26 +136,21 @@ class TranscriptionEngine:
raise Exception('Translation cannot be set with language auto when transcription backend is not simulstreaming')
else:
from whisperlivekit.translation.translation import load_model
self.translation_model = load_model([self.args.lan], backend=self.args.nllb_backend, model_size=self.args.nllb_size) #in the future we want to handle different languages for different speakers
translation_params = {
"nllb_backend": "ctranslate2",
"nllb_size": "600M"
}
translation_params = update_with_kwargs(translation_params, kwargs)
self.translation_model = load_model([self.args.lan], **translation_params) #in the future we want to handle different languages for different speakers
TranscriptionEngine._initialized = True
def online_factory(args, asr, tokenizer, logfile=sys.stderr):
def online_factory(args, asr):
if args.backend == "simulstreaming":
from whisperlivekit.simul_whisper import SimulStreamingOnlineProcessor
online = SimulStreamingOnlineProcessor(
asr,
logfile=logfile,
)
online = SimulStreamingOnlineProcessor(asr)
else:
online = OnlineASRProcessor(
asr,
tokenizer,
logfile=logfile,
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
confidence_validation = args.confidence_validation
)
online = OnlineASRProcessor(asr)
return online

View file

@ -89,6 +89,7 @@ def parse_args():
"--model",
type=str,
default="small",
dest='model_size',
help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.",
)
@ -109,6 +110,7 @@ def parse_args():
"--language",
type=str,
default="auto",
dest='lan',
help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
)
parser.add_argument(
@ -189,6 +191,13 @@ def parse_args():
dest="disable_fast_encoder",
help="Disable Faster Whisper or MLX Whisper backends for encoding (if installed). Slower but helpful when GPU memory is limited",
)
simulstreaming_group.add_argument(
"--custom-alignment-heads",
type=str,
default=None,
help="Use your own alignment heads, useful when `--model-dir` is used",
)
simulstreaming_group.add_argument(
"--frame-threshold",

View file

@ -47,7 +47,6 @@ class SimulStreamingOnlineProcessor:
self,
asr,
logfile=sys.stderr,
warmup_file=None
):
self.asr = asr
self.logfile = logfile
@ -146,31 +145,20 @@ class SimulStreamingASR():
"""SimulStreaming backend with AlignAtt policy."""
sep = ""
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr, **kwargs):
def __init__(self, logfile=sys.stderr, **kwargs):
self.logfile = logfile
self.transcribe_kargs = {}
self.original_language = lan
self.model_path = kwargs.get('model_path', './large-v3.pt')
self.frame_threshold = kwargs.get('frame_threshold', 25)
self.audio_max_len = kwargs.get('audio_max_len', 20.0)
self.audio_min_len = kwargs.get('audio_min_len', 0.0)
self.segment_length = kwargs.get('segment_length', 0.5)
self.beams = kwargs.get('beams', 1)
self.decoder_type = kwargs.get('decoder_type', 'greedy' if self.beams == 1 else 'beam')
self.task = kwargs.get('task', 'transcribe')
self.cif_ckpt_path = kwargs.get('cif_ckpt_path', None)
self.never_fire = kwargs.get('never_fire', False)
self.init_prompt = kwargs.get('init_prompt', None)
self.static_init_prompt = kwargs.get('static_init_prompt', None)
self.max_context_tokens = kwargs.get('max_context_tokens', None)
self.warmup_file = kwargs.get('warmup_file', None)
self.preload_model_count = kwargs.get('preload_model_count', 1)
self.disable_fast_encoder = kwargs.get('disable_fast_encoder', False)
for key, value in kwargs.items():
setattr(self, key, value)
if self.decoder_type is None:
self.decoder_type = 'greedy' if self.beams == 1 else 'beam'
self.fast_encoder = False
if model_dir is not None:
self.model_path = model_dir
elif modelsize is not None:
if self.model_dir is not None:
self.model_path = self.model_dir
elif self.model_size is not None:
model_mapping = {
'tiny': './tiny.pt',
'base': './base.pt',
@ -185,13 +173,13 @@ class SimulStreamingASR():
'large-v3': './large-v3.pt',
'large': './large-v3.pt'
}
self.model_path = model_mapping.get(modelsize, f'./{modelsize}.pt')
self.model_path = model_mapping.get(self.model_size, f'./{self.model_size}.pt')
self.cfg = AlignAttConfig(
model_path=self.model_path,
segment_length=self.segment_length,
segment_length=self.min_chunk_size,
frame_threshold=self.frame_threshold,
language=self.original_language,
language=self.lan,
audio_max_len=self.audio_max_len,
audio_min_len=self.audio_min_len,
cif_ckpt_path=self.cif_ckpt_path,
@ -210,11 +198,15 @@ class SimulStreamingASR():
else:
self.tokenizer = None
self.model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
self.model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
if self.model_dir:
self.model_name = self.model_dir
self.model_path = None
else:
self.model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
self.model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
self.mlx_encoder, self.fw_encoder = None, None
if not self.disable_fast_encoder:
if not self.disable_fast_encoder and not self.model_dir:
if HAS_MLX_WHISPER:
print('Simulstreaming will use MLX whisper for a faster encoder.')
mlx_model_name = mlx_model_mapping[self.model_name]
@ -233,7 +225,12 @@ class SimulStreamingASR():
def load_model(self):
whisper_model = load_model(name=self.model_name, download_root=self.model_path, decoder_only=self.fast_encoder)
whisper_model = load_model(
name=self.model_name,
download_root=self.model_path,
decoder_only=self.fast_encoder,
custom_alignment_heads=self.custom_alignment_heads
)
warmup_audio = load_file(self.warmup_file)
if warmup_audio is not None:
warmup_audio = torch.from_numpy(warmup_audio).float()
@ -249,7 +246,7 @@ class SimulStreamingASR():
else:
# For standard encoder, use the original transcribe warmup
warmup_audio = load_file(self.warmup_file)
whisper_model.transcribe(warmup_audio, language=self.original_language if self.original_language != 'auto' else None)
whisper_model.transcribe(warmup_audio, language=self.lan if self.lan != 'auto' else None)
return whisper_model
def get_new_model_instance(self):

View file

@ -105,7 +105,8 @@ def load_model(
device: Optional[Union[str, torch.device]] = None,
download_root: str = None,
in_memory: bool = False,
decoder_only=False
decoder_only=False,
custom_alignment_heads=None
) -> Whisper:
"""
Load a Whisper ASR model
@ -135,15 +136,17 @@ def load_model(
download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
if name in _MODELS:
checkpoint_file = _download(_MODELS[name], download_root, in_memory)
alignment_heads = _ALIGNMENT_HEADS[name]
checkpoint_file = _download(_MODELS[name], download_root, in_memory)
elif os.path.isfile(name):
checkpoint_file = open(name, "rb").read() if in_memory else name
alignment_heads = None
else:
raise RuntimeError(
f"Model {name} not found; available models = {available_models()}"
)
alignment_heads = _ALIGNMENT_HEADS.get(name, None)
if custom_alignment_heads:
alignment_heads = custom_alignment_heads.encode()
with (
io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")

View file

@ -21,27 +21,27 @@ class TranslationModel():
device: str
tokenizer: dict = field(default_factory=dict)
backend_type: str = 'ctranslate2'
model_size: str = '600M'
nllb_size: str = '600M'
def get_tokenizer(self, input_lang):
if not self.tokenizer.get(input_lang, False):
self.tokenizer[input_lang] = transformers.AutoTokenizer.from_pretrained(
f"facebook/nllb-200-distilled-{self.model_size}",
f"facebook/nllb-200-distilled-{self.nllb_size}",
src_lang=input_lang,
clean_up_tokenization_spaces=True
)
return self.tokenizer[input_lang]
def load_model(src_langs, backend='ctranslate2', model_size='600M'):
def load_model(src_langs, nllb_backend='ctranslate2', nllb_size='600M'):
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = f'nllb-200-distilled-{model_size}-ctranslate2'
if backend=='ctranslate2':
MODEL = f'nllb-200-distilled-{nllb_size}-ctranslate2'
if nllb_backend=='ctranslate2':
MODEL_GUY = 'entai2965'
huggingface_hub.snapshot_download(MODEL_GUY + '/' + MODEL,local_dir=MODEL)
translator = ctranslate2.Translator(MODEL,device=device)
elif backend=='transformers':
translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(f"facebook/nllb-200-distilled-{model_size}")
elif nllb_backend=='transformers':
translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(f"facebook/nllb-200-distilled-{nllb_size}")
tokenizer = dict()
for src_lang in src_langs:
if src_lang != 'auto':
@ -50,9 +50,9 @@ def load_model(src_langs, backend='ctranslate2', model_size='600M'):
translation_model = TranslationModel(
translator=translator,
tokenizer=tokenizer,
backend_type=backend,
backend_type=nllb_backend,
device = device,
model_size = model_size
nllb_size = nllb_size
)
for src_lang in src_langs:
if src_lang != 'auto':
@ -157,7 +157,7 @@ if __name__ == '__main__':
test = test_string.split(' ')
step = len(test) // 3
shared_model = load_model([input_lang], backend='ctranslate2')
shared_model = load_model([input_lang], nllb_backend='ctranslate2')
online_translation = OnlineTranslation(shared_model, input_languages=[input_lang], output_languages=[output_lang])
beg_inference = time.time()

View file

@ -11,14 +11,14 @@ class ASRBase:
sep = " " # join transcribe words with this character (" " for whisper_timestamped,
# "" for faster-whisper because it emits the spaces when needed)
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
def __init__(self, lan, model_size=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
self.logfile = logfile
self.transcribe_kargs = {}
if lan == "auto":
self.original_language = None
else:
self.original_language = lan
self.model = self.load_model(modelsize, cache_dir, model_dir)
self.model = self.load_model(model_size, cache_dir, model_dir)
def with_offset(self, offset: float) -> ASRToken:
# This method is kept for compatibility (typically you will use ASRToken.with_offset)
@ -27,7 +27,7 @@ class ASRBase:
def __repr__(self):
return f"ASRToken(start={self.start:.2f}, end={self.end:.2f}, text={self.text!r})"
def load_model(self, modelsize, cache_dir, model_dir):
def load_model(self, model_size, cache_dir, model_dir):
raise NotImplementedError("must be implemented in the child class")
def transcribe(self, audio, init_prompt=""):
@ -41,7 +41,7 @@ class WhisperTimestampedASR(ASRBase):
"""Uses whisper_timestamped as the backend."""
sep = " "
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
import whisper
import whisper_timestamped
from whisper_timestamped import transcribe_timestamped
@ -49,7 +49,7 @@ class WhisperTimestampedASR(ASRBase):
self.transcribe_timestamped = transcribe_timestamped
if model_dir is not None:
logger.debug("ignoring model_dir, not implemented")
return whisper.load_model(modelsize, download_root=cache_dir)
return whisper.load_model(model_size, download_root=cache_dir)
def transcribe(self, audio, init_prompt=""):
result = self.transcribe_timestamped(
@ -88,17 +88,17 @@ class FasterWhisperASR(ASRBase):
"""Uses faster-whisper as the backend."""
sep = ""
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
from faster_whisper import WhisperModel
if model_dir is not None:
logger.debug(f"Loading whisper model from model_dir {model_dir}. "
f"modelsize and cache_dir parameters are not used.")
f"model_size and cache_dir parameters are not used.")
model_size_or_path = model_dir
elif modelsize is not None:
model_size_or_path = modelsize
elif model_size is not None:
model_size_or_path = model_size
else:
raise ValueError("Either modelsize or model_dir must be set")
raise ValueError("Either model_size or model_dir must be set")
device = "auto" # Allow CTranslate2 to decide available device
compute_type = "auto" # Allow CTranslate2 to decide faster compute type
@ -149,18 +149,18 @@ class MLXWhisper(ASRBase):
"""
sep = ""
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
from mlx_whisper.transcribe import ModelHolder, transcribe
import mlx.core as mx
if model_dir is not None:
logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used.")
logger.debug(f"Loading whisper model from model_dir {model_dir}. model_size parameter is not used.")
model_size_or_path = model_dir
elif modelsize is not None:
model_size_or_path = self.translate_model_name(modelsize)
logger.debug(f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used.")
elif model_size is not None:
model_size_or_path = self.translate_model_name(model_size)
logger.debug(f"Loading whisper model {model_size}. You use mlx whisper, so {model_size_or_path} will be used.")
else:
raise ValueError("Either modelsize or model_dir must be set")
raise ValueError("Either model_size or model_dir must be set")
self.model_size_or_path = model_size_or_path
dtype = mx.float16

View file

@ -106,9 +106,6 @@ class OnlineASRProcessor:
def __init__(
self,
asr,
tokenize_method: Optional[callable] = None,
buffer_trimming: Tuple[str, float] = ("segment", 15),
confidence_validation = False,
logfile=sys.stderr,
):
"""
@ -119,13 +116,14 @@ class OnlineASRProcessor:
buffer_trimming: A tuple (option, seconds), where option is either "sentence" or "segment".
"""
self.asr = asr
self.tokenize = tokenize_method
self.tokenize = asr.tokenizer
self.logfile = logfile
self.confidence_validation = confidence_validation
self.confidence_validation = asr.confidence_validation
self.global_time_offset = 0.0
self.init()
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
self.buffer_trimming_way = asr.buffer_trimming
self.buffer_trimming_sec = asr.buffer_trimming_sec
if self.buffer_trimming_way not in ["sentence", "segment"]:
raise ValueError("buffer_trimming must be either 'sentence' or 'segment'")

View file

@ -6,6 +6,7 @@ from functools import lru_cache
import time
import logging
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
from whisperlivekit.warmup import warmup_asr
logger = logging.getLogger(__name__)
@ -63,11 +64,23 @@ def create_tokenizer(lan):
return WtPtok()
def backend_factory(args):
backend = args.backend
def backend_factory(
backend,
lan,
model_size,
model_cache_dir,
model_dir,
task,
buffer_trimming,
buffer_trimming_sec,
confidence_validation,
warmup_file=None,
min_chunk_size=None,
):
backend = backend
if backend == "openai-api":
logger.debug("Using OpenAI API.")
asr = OpenaiApiASR(lan=args.lan)
asr = OpenaiApiASR(lan=lan)
else:
if backend == "faster-whisper":
asr_cls = FasterWhisperASR
@ -77,34 +90,33 @@ def backend_factory(args):
asr_cls = WhisperTimestampedASR
# Only for FasterWhisperASR and WhisperTimestampedASR
size = args.model
t = time.time()
logger.info(f"Loading Whisper {size} model for language {args.lan}...")
logger.info(f"Loading Whisper {model_size} model for language {lan}...")
asr = asr_cls(
modelsize=size,
lan=args.lan,
cache_dir=getattr(args, 'model_cache_dir', None),
model_dir=getattr(args, 'model_dir', None),
model_size=model_size,
lan=lan,
cache_dir=model_cache_dir,
model_dir=model_dir,
)
e = time.time()
logger.info(f"done. It took {round(e-t,2)} seconds.")
# Apply common configurations
if getattr(args, "vad", False): # Checks if VAD argument is present and True
logger.info("Setting VAD filter")
asr.use_vad()
language = args.lan
if args.task == "translate":
if backend != "simulstreaming":
asr.set_translate_task()
if task == "translate":
tgt_language = "en" # Whisper translates into English
else:
tgt_language = language # Whisper transcribes in this language
tgt_language = lan # Whisper transcribes in this language
# Create the tokenizer
if args.buffer_trimming == "sentence":
if buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
return asr, tokenizer
warmup_asr(asr, warmup_file)
asr.confidence_validation = confidence_validation
asr.tokenizer = tokenizer
asr.buffer_trimming = buffer_trimming
asr.buffer_trimming_sec = buffer_trimming_sec
return asr