cutom alignment heads parameter for custom models
This commit is contained in:
parent
2fe3ca0188
commit
8cbaeecc75
10 changed files with 179 additions and 177 deletions
|
|
@ -140,6 +140,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--model` | Whisper model size. List and recommandations [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/available_models.md) | `small` |
|
||||
| `--model-dir` | Directory containing Whisper model.bin and other files. Overrides `--model`. | `None` |
|
||||
| `--language` | List [here](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/simul_whisper/whisper/tokenizer.py). If you use `auto`, the model attempts to detect the language automatically, but it tends to bias towards English. | `auto` |
|
||||
| `--target-language` | If sets, activates translation using NLLB. Ex: `fr`. [118 languages available](https://github.com/QuentinFuxa/WhisperLiveKit/blob/main/whisperlivekit/translation/mapping_languages.py). If you want to translate to english, you should rather use `--task translate`, since Whisper can do it directly. | `None` |
|
||||
| `--task` | Set to `translate` to translate *only* to english, using Whisper translation. | `transcribe` |
|
||||
|
|
@ -169,6 +170,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|||
| SimulStreaming backend options | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--disable-fast-encoder` | Disable Faster Whisper or MLX Whisper backends for the encoder (if installed). Inference can be slower but helpful when GPU memory is limited | `False` |
|
||||
| `--custom-alignment-heads` | Use your own alignment heads, useful when `--model-dir` is used | `None` |
|
||||
| `--frame-threshold` | AlignAtt frame threshold (lower = faster, higher = more accurate) | `25` |
|
||||
| `--beams` | Number of beams for beam search (1 = greedy decoding) | `1` |
|
||||
| `--decoder` | Force decoder type (`beam` or `greedy`) | `auto` |
|
||||
|
|
|
|||
|
|
@ -72,7 +72,6 @@ class AudioProcessor:
|
|||
|
||||
# Models and processing
|
||||
self.asr = models.asr
|
||||
self.tokenizer = models.tokenizer
|
||||
self.vac_model = models.vac_model
|
||||
if self.args.vac:
|
||||
self.vac = FixedVADIterator(models.vac_model)
|
||||
|
|
@ -109,7 +108,7 @@ class AudioProcessor:
|
|||
self.diarization = None
|
||||
|
||||
if self.args.transcription:
|
||||
self.transcription = online_factory(self.args, models.asr, models.tokenizer)
|
||||
self.transcription = online_factory(self.args, models.asr)
|
||||
self.sep = self.transcription.asr.sep
|
||||
if self.args.diarization:
|
||||
self.diarization = online_diarization_factory(self.args, models.diarization_model)
|
||||
|
|
|
|||
|
|
@ -4,10 +4,15 @@ try:
|
|||
except ImportError:
|
||||
from .whisper_streaming_custom.whisper_online import backend_factory
|
||||
from .whisper_streaming_custom.online_asr import OnlineASRProcessor
|
||||
from whisperlivekit.warmup import warmup_asr
|
||||
from argparse import Namespace
|
||||
import sys
|
||||
|
||||
def update_with_kwargs(_dict, kwargs):
|
||||
_dict.update({
|
||||
k: v for k, v in kwargs.items() if k in _dict
|
||||
})
|
||||
return _dict
|
||||
|
||||
class TranscriptionEngine:
|
||||
_instance = None
|
||||
_initialized = False
|
||||
|
|
@ -21,20 +26,12 @@ class TranscriptionEngine:
|
|||
if TranscriptionEngine._initialized:
|
||||
return
|
||||
|
||||
defaults = {
|
||||
global_params = {
|
||||
"host": "localhost",
|
||||
"port": 8000,
|
||||
"warmup_file": None,
|
||||
"diarization": False,
|
||||
"punctuation_split": False,
|
||||
"min_chunk_size": 0.5,
|
||||
"model": "tiny",
|
||||
"model_cache_dir": None,
|
||||
"model_dir": None,
|
||||
"lan": "auto",
|
||||
"task": "transcribe",
|
||||
"target_language": "",
|
||||
"backend": "faster-whisper",
|
||||
"vac": True,
|
||||
"vac_chunk_size": 0.04,
|
||||
"log_level": "DEBUG",
|
||||
|
|
@ -43,54 +40,31 @@ class TranscriptionEngine:
|
|||
"transcription": True,
|
||||
"vad": True,
|
||||
"pcm_input": False,
|
||||
|
||||
# whisperstreaming params:
|
||||
"buffer_trimming": "segment",
|
||||
"confidence_validation": False,
|
||||
"buffer_trimming_sec": 15,
|
||||
|
||||
# simulstreaming params:
|
||||
"disable_fast_encoder": False,
|
||||
"frame_threshold": 25,
|
||||
"beams": 1,
|
||||
"decoder_type": None,
|
||||
"audio_max_len": 20.0,
|
||||
"audio_min_len": 0.0,
|
||||
"cif_ckpt_path": None,
|
||||
"never_fire": False,
|
||||
"init_prompt": None,
|
||||
"static_init_prompt": None,
|
||||
"max_context_tokens": None,
|
||||
"model_path": './base.pt',
|
||||
"diarization_backend": "sortformer",
|
||||
|
||||
# diarization params:
|
||||
"disable_punctuation_split" : False,
|
||||
"segmentation_model": "pyannote/segmentation-3.0",
|
||||
"embedding_model": "pyannote/embedding",
|
||||
|
||||
# translation params:
|
||||
"nllb_backend": "ctranslate2",
|
||||
"nllb_size": "600M"
|
||||
"diarization_backend": "sortformer",
|
||||
}
|
||||
global_params = update_with_kwargs(global_params, kwargs)
|
||||
|
||||
config_dict = {**defaults, **kwargs}
|
||||
transcription_common_params = {
|
||||
"backend": "simulstreaming",
|
||||
"warmup_file": None,
|
||||
"min_chunk_size": 0.5,
|
||||
"model_size": "tiny",
|
||||
"model_cache_dir": None,
|
||||
"model_dir": None,
|
||||
"lan": "auto",
|
||||
"task": "transcribe",
|
||||
}
|
||||
transcription_common_params = update_with_kwargs(transcription_common_params, kwargs)
|
||||
|
||||
if 'no_transcription' in kwargs:
|
||||
config_dict['transcription'] = not kwargs['no_transcription']
|
||||
global_params['transcription'] = not global_params['no_transcription']
|
||||
if 'no_vad' in kwargs:
|
||||
config_dict['vad'] = not kwargs['no_vad']
|
||||
global_params['vad'] = not kwargs['no_vad']
|
||||
if 'no_vac' in kwargs:
|
||||
config_dict['vac'] = not kwargs['no_vac']
|
||||
|
||||
config_dict.pop('no_transcription', None)
|
||||
config_dict.pop('no_vad', None)
|
||||
global_params['vac'] = not kwargs['no_vac']
|
||||
|
||||
if 'language' in kwargs:
|
||||
config_dict['lan'] = kwargs['language']
|
||||
config_dict.pop('language', None)
|
||||
|
||||
self.args = Namespace(**config_dict)
|
||||
self.args = Namespace(**{**global_params, **transcription_common_params})
|
||||
|
||||
self.asr = None
|
||||
self.tokenizer = None
|
||||
|
|
@ -104,44 +78,57 @@ class TranscriptionEngine:
|
|||
if self.args.transcription:
|
||||
if self.args.backend == "simulstreaming":
|
||||
from whisperlivekit.simul_whisper import SimulStreamingASR
|
||||
self.tokenizer = None
|
||||
simulstreaming_kwargs = {}
|
||||
for attr in ['frame_threshold', 'beams', 'decoder_type', 'audio_max_len', 'audio_min_len',
|
||||
'cif_ckpt_path', 'never_fire', 'init_prompt', 'static_init_prompt',
|
||||
'max_context_tokens', 'model_path', 'warmup_file', 'preload_model_count', 'disable_fast_encoder']:
|
||||
if hasattr(self.args, attr):
|
||||
simulstreaming_kwargs[attr] = getattr(self.args, attr)
|
||||
|
||||
# Add segment_length from min_chunk_size
|
||||
simulstreaming_kwargs['segment_length'] = getattr(self.args, 'min_chunk_size', 0.5)
|
||||
simulstreaming_kwargs['task'] = self.args.task
|
||||
|
||||
size = self.args.model
|
||||
simulstreaming_params = {
|
||||
"disable_fast_encoder": False,
|
||||
"custom_alignment_heads": None,
|
||||
"frame_threshold": 25,
|
||||
"beams": 1,
|
||||
"decoder_type": None,
|
||||
"audio_max_len": 20.0,
|
||||
"audio_min_len": 0.0,
|
||||
"cif_ckpt_path": None,
|
||||
"never_fire": False,
|
||||
"init_prompt": None,
|
||||
"static_init_prompt": None,
|
||||
"max_context_tokens": None,
|
||||
"model_path": './base.pt',
|
||||
"preload_model_count": 1,
|
||||
}
|
||||
simulstreaming_params = update_with_kwargs(simulstreaming_params, kwargs)
|
||||
|
||||
self.tokenizer = None
|
||||
self.asr = SimulStreamingASR(
|
||||
modelsize=size,
|
||||
lan=self.args.lan,
|
||||
cache_dir=getattr(self.args, 'model_cache_dir', None),
|
||||
model_dir=getattr(self.args, 'model_dir', None),
|
||||
**simulstreaming_kwargs
|
||||
**transcription_common_params, **simulstreaming_params
|
||||
)
|
||||
|
||||
else:
|
||||
self.asr, self.tokenizer = backend_factory(self.args)
|
||||
warmup_asr(self.asr, self.args.warmup_file) #for simulstreaming, warmup should be done in the online class not here
|
||||
|
||||
whisperstreaming_params = {
|
||||
"buffer_trimming": "segment",
|
||||
"confidence_validation": False,
|
||||
"buffer_trimming_sec": 15,
|
||||
}
|
||||
whisperstreaming_params = update_with_kwargs(whisperstreaming_params, kwargs)
|
||||
|
||||
self.asr = backend_factory(
|
||||
**transcription_common_params, **whisperstreaming_params
|
||||
)
|
||||
|
||||
if self.args.diarization:
|
||||
if self.args.diarization_backend == "diart":
|
||||
from whisperlivekit.diarization.diart_backend import DiartDiarization
|
||||
diart_params = {
|
||||
"segmentation_model": "pyannote/segmentation-3.0",
|
||||
"embedding_model": "pyannote/embedding",
|
||||
}
|
||||
diart_params = update_with_kwargs(diart_params, kwargs)
|
||||
self.diarization_model = DiartDiarization(
|
||||
block_duration=self.args.min_chunk_size,
|
||||
segmentation_model_name=self.args.segmentation_model,
|
||||
embedding_model_name=self.args.embedding_model
|
||||
**diart_params
|
||||
)
|
||||
elif self.args.diarization_backend == "sortformer":
|
||||
from whisperlivekit.diarization.sortformer_backend import SortformerDiarization
|
||||
self.diarization_model = SortformerDiarization()
|
||||
else:
|
||||
raise ValueError(f"Unknown diarization backend: {self.args.diarization_backend}")
|
||||
|
||||
self.translation_model = None
|
||||
if self.args.target_language:
|
||||
|
|
@ -149,26 +136,21 @@ class TranscriptionEngine:
|
|||
raise Exception('Translation cannot be set with language auto when transcription backend is not simulstreaming')
|
||||
else:
|
||||
from whisperlivekit.translation.translation import load_model
|
||||
self.translation_model = load_model([self.args.lan], backend=self.args.nllb_backend, model_size=self.args.nllb_size) #in the future we want to handle different languages for different speakers
|
||||
translation_params = {
|
||||
"nllb_backend": "ctranslate2",
|
||||
"nllb_size": "600M"
|
||||
}
|
||||
translation_params = update_with_kwargs(translation_params, kwargs)
|
||||
self.translation_model = load_model([self.args.lan], **translation_params) #in the future we want to handle different languages for different speakers
|
||||
TranscriptionEngine._initialized = True
|
||||
|
||||
|
||||
|
||||
def online_factory(args, asr, tokenizer, logfile=sys.stderr):
|
||||
def online_factory(args, asr):
|
||||
if args.backend == "simulstreaming":
|
||||
from whisperlivekit.simul_whisper import SimulStreamingOnlineProcessor
|
||||
online = SimulStreamingOnlineProcessor(
|
||||
asr,
|
||||
logfile=logfile,
|
||||
)
|
||||
online = SimulStreamingOnlineProcessor(asr)
|
||||
else:
|
||||
online = OnlineASRProcessor(
|
||||
asr,
|
||||
tokenizer,
|
||||
logfile=logfile,
|
||||
buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec),
|
||||
confidence_validation = args.confidence_validation
|
||||
)
|
||||
online = OnlineASRProcessor(asr)
|
||||
return online
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ def parse_args():
|
|||
"--model",
|
||||
type=str,
|
||||
default="small",
|
||||
dest='model_size',
|
||||
help="Name size of the Whisper model to use (default: tiny). Suggested values: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo. The model is automatically downloaded from the model hub if not present in model cache dir.",
|
||||
)
|
||||
|
||||
|
|
@ -109,6 +110,7 @@ def parse_args():
|
|||
"--language",
|
||||
type=str,
|
||||
default="auto",
|
||||
dest='lan',
|
||||
help="Source language code, e.g. en,de,cs, or 'auto' for language detection.",
|
||||
)
|
||||
parser.add_argument(
|
||||
|
|
@ -189,6 +191,13 @@ def parse_args():
|
|||
dest="disable_fast_encoder",
|
||||
help="Disable Faster Whisper or MLX Whisper backends for encoding (if installed). Slower but helpful when GPU memory is limited",
|
||||
)
|
||||
|
||||
simulstreaming_group.add_argument(
|
||||
"--custom-alignment-heads",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Use your own alignment heads, useful when `--model-dir` is used",
|
||||
)
|
||||
|
||||
simulstreaming_group.add_argument(
|
||||
"--frame-threshold",
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@ class SimulStreamingOnlineProcessor:
|
|||
self,
|
||||
asr,
|
||||
logfile=sys.stderr,
|
||||
warmup_file=None
|
||||
):
|
||||
self.asr = asr
|
||||
self.logfile = logfile
|
||||
|
|
@ -146,31 +145,20 @@ class SimulStreamingASR():
|
|||
"""SimulStreaming backend with AlignAtt policy."""
|
||||
sep = ""
|
||||
|
||||
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr, **kwargs):
|
||||
def __init__(self, logfile=sys.stderr, **kwargs):
|
||||
self.logfile = logfile
|
||||
self.transcribe_kargs = {}
|
||||
self.original_language = lan
|
||||
|
||||
self.model_path = kwargs.get('model_path', './large-v3.pt')
|
||||
self.frame_threshold = kwargs.get('frame_threshold', 25)
|
||||
self.audio_max_len = kwargs.get('audio_max_len', 20.0)
|
||||
self.audio_min_len = kwargs.get('audio_min_len', 0.0)
|
||||
self.segment_length = kwargs.get('segment_length', 0.5)
|
||||
self.beams = kwargs.get('beams', 1)
|
||||
self.decoder_type = kwargs.get('decoder_type', 'greedy' if self.beams == 1 else 'beam')
|
||||
self.task = kwargs.get('task', 'transcribe')
|
||||
self.cif_ckpt_path = kwargs.get('cif_ckpt_path', None)
|
||||
self.never_fire = kwargs.get('never_fire', False)
|
||||
self.init_prompt = kwargs.get('init_prompt', None)
|
||||
self.static_init_prompt = kwargs.get('static_init_prompt', None)
|
||||
self.max_context_tokens = kwargs.get('max_context_tokens', None)
|
||||
self.warmup_file = kwargs.get('warmup_file', None)
|
||||
self.preload_model_count = kwargs.get('preload_model_count', 1)
|
||||
self.disable_fast_encoder = kwargs.get('disable_fast_encoder', False)
|
||||
for key, value in kwargs.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
if self.decoder_type is None:
|
||||
self.decoder_type = 'greedy' if self.beams == 1 else 'beam'
|
||||
|
||||
self.fast_encoder = False
|
||||
if model_dir is not None:
|
||||
self.model_path = model_dir
|
||||
elif modelsize is not None:
|
||||
if self.model_dir is not None:
|
||||
self.model_path = self.model_dir
|
||||
elif self.model_size is not None:
|
||||
model_mapping = {
|
||||
'tiny': './tiny.pt',
|
||||
'base': './base.pt',
|
||||
|
|
@ -185,13 +173,13 @@ class SimulStreamingASR():
|
|||
'large-v3': './large-v3.pt',
|
||||
'large': './large-v3.pt'
|
||||
}
|
||||
self.model_path = model_mapping.get(modelsize, f'./{modelsize}.pt')
|
||||
self.model_path = model_mapping.get(self.model_size, f'./{self.model_size}.pt')
|
||||
|
||||
self.cfg = AlignAttConfig(
|
||||
model_path=self.model_path,
|
||||
segment_length=self.segment_length,
|
||||
segment_length=self.min_chunk_size,
|
||||
frame_threshold=self.frame_threshold,
|
||||
language=self.original_language,
|
||||
language=self.lan,
|
||||
audio_max_len=self.audio_max_len,
|
||||
audio_min_len=self.audio_min_len,
|
||||
cif_ckpt_path=self.cif_ckpt_path,
|
||||
|
|
@ -210,11 +198,15 @@ class SimulStreamingASR():
|
|||
else:
|
||||
self.tokenizer = None
|
||||
|
||||
self.model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
|
||||
self.model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
|
||||
if self.model_dir:
|
||||
self.model_name = self.model_dir
|
||||
self.model_path = None
|
||||
else:
|
||||
self.model_name = os.path.basename(self.cfg.model_path).replace(".pt", "")
|
||||
self.model_path = os.path.dirname(os.path.abspath(self.cfg.model_path))
|
||||
|
||||
self.mlx_encoder, self.fw_encoder = None, None
|
||||
if not self.disable_fast_encoder:
|
||||
if not self.disable_fast_encoder and not self.model_dir:
|
||||
if HAS_MLX_WHISPER:
|
||||
print('Simulstreaming will use MLX whisper for a faster encoder.')
|
||||
mlx_model_name = mlx_model_mapping[self.model_name]
|
||||
|
|
@ -233,7 +225,12 @@ class SimulStreamingASR():
|
|||
|
||||
|
||||
def load_model(self):
|
||||
whisper_model = load_model(name=self.model_name, download_root=self.model_path, decoder_only=self.fast_encoder)
|
||||
whisper_model = load_model(
|
||||
name=self.model_name,
|
||||
download_root=self.model_path,
|
||||
decoder_only=self.fast_encoder,
|
||||
custom_alignment_heads=self.custom_alignment_heads
|
||||
)
|
||||
warmup_audio = load_file(self.warmup_file)
|
||||
if warmup_audio is not None:
|
||||
warmup_audio = torch.from_numpy(warmup_audio).float()
|
||||
|
|
@ -249,7 +246,7 @@ class SimulStreamingASR():
|
|||
else:
|
||||
# For standard encoder, use the original transcribe warmup
|
||||
warmup_audio = load_file(self.warmup_file)
|
||||
whisper_model.transcribe(warmup_audio, language=self.original_language if self.original_language != 'auto' else None)
|
||||
whisper_model.transcribe(warmup_audio, language=self.lan if self.lan != 'auto' else None)
|
||||
return whisper_model
|
||||
|
||||
def get_new_model_instance(self):
|
||||
|
|
|
|||
|
|
@ -105,7 +105,8 @@ def load_model(
|
|||
device: Optional[Union[str, torch.device]] = None,
|
||||
download_root: str = None,
|
||||
in_memory: bool = False,
|
||||
decoder_only=False
|
||||
decoder_only=False,
|
||||
custom_alignment_heads=None
|
||||
) -> Whisper:
|
||||
"""
|
||||
Load a Whisper ASR model
|
||||
|
|
@ -135,15 +136,17 @@ def load_model(
|
|||
download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
|
||||
|
||||
if name in _MODELS:
|
||||
checkpoint_file = _download(_MODELS[name], download_root, in_memory)
|
||||
alignment_heads = _ALIGNMENT_HEADS[name]
|
||||
checkpoint_file = _download(_MODELS[name], download_root, in_memory)
|
||||
elif os.path.isfile(name):
|
||||
checkpoint_file = open(name, "rb").read() if in_memory else name
|
||||
alignment_heads = None
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Model {name} not found; available models = {available_models()}"
|
||||
)
|
||||
|
||||
alignment_heads = _ALIGNMENT_HEADS.get(name, None)
|
||||
if custom_alignment_heads:
|
||||
alignment_heads = custom_alignment_heads.encode()
|
||||
|
||||
with (
|
||||
io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
|
||||
|
|
|
|||
|
|
@ -21,27 +21,27 @@ class TranslationModel():
|
|||
device: str
|
||||
tokenizer: dict = field(default_factory=dict)
|
||||
backend_type: str = 'ctranslate2'
|
||||
model_size: str = '600M'
|
||||
nllb_size: str = '600M'
|
||||
|
||||
def get_tokenizer(self, input_lang):
|
||||
if not self.tokenizer.get(input_lang, False):
|
||||
self.tokenizer[input_lang] = transformers.AutoTokenizer.from_pretrained(
|
||||
f"facebook/nllb-200-distilled-{self.model_size}",
|
||||
f"facebook/nllb-200-distilled-{self.nllb_size}",
|
||||
src_lang=input_lang,
|
||||
clean_up_tokenization_spaces=True
|
||||
)
|
||||
return self.tokenizer[input_lang]
|
||||
|
||||
|
||||
def load_model(src_langs, backend='ctranslate2', model_size='600M'):
|
||||
def load_model(src_langs, nllb_backend='ctranslate2', nllb_size='600M'):
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
MODEL = f'nllb-200-distilled-{model_size}-ctranslate2'
|
||||
if backend=='ctranslate2':
|
||||
MODEL = f'nllb-200-distilled-{nllb_size}-ctranslate2'
|
||||
if nllb_backend=='ctranslate2':
|
||||
MODEL_GUY = 'entai2965'
|
||||
huggingface_hub.snapshot_download(MODEL_GUY + '/' + MODEL,local_dir=MODEL)
|
||||
translator = ctranslate2.Translator(MODEL,device=device)
|
||||
elif backend=='transformers':
|
||||
translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(f"facebook/nllb-200-distilled-{model_size}")
|
||||
elif nllb_backend=='transformers':
|
||||
translator = transformers.AutoModelForSeq2SeqLM.from_pretrained(f"facebook/nllb-200-distilled-{nllb_size}")
|
||||
tokenizer = dict()
|
||||
for src_lang in src_langs:
|
||||
if src_lang != 'auto':
|
||||
|
|
@ -50,9 +50,9 @@ def load_model(src_langs, backend='ctranslate2', model_size='600M'):
|
|||
translation_model = TranslationModel(
|
||||
translator=translator,
|
||||
tokenizer=tokenizer,
|
||||
backend_type=backend,
|
||||
backend_type=nllb_backend,
|
||||
device = device,
|
||||
model_size = model_size
|
||||
nllb_size = nllb_size
|
||||
)
|
||||
for src_lang in src_langs:
|
||||
if src_lang != 'auto':
|
||||
|
|
@ -157,7 +157,7 @@ if __name__ == '__main__':
|
|||
test = test_string.split(' ')
|
||||
step = len(test) // 3
|
||||
|
||||
shared_model = load_model([input_lang], backend='ctranslate2')
|
||||
shared_model = load_model([input_lang], nllb_backend='ctranslate2')
|
||||
online_translation = OnlineTranslation(shared_model, input_languages=[input_lang], output_languages=[output_lang])
|
||||
|
||||
beg_inference = time.time()
|
||||
|
|
|
|||
|
|
@ -11,14 +11,14 @@ class ASRBase:
|
|||
sep = " " # join transcribe words with this character (" " for whisper_timestamped,
|
||||
# "" for faster-whisper because it emits the spaces when needed)
|
||||
|
||||
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
|
||||
def __init__(self, lan, model_size=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
|
||||
self.logfile = logfile
|
||||
self.transcribe_kargs = {}
|
||||
if lan == "auto":
|
||||
self.original_language = None
|
||||
else:
|
||||
self.original_language = lan
|
||||
self.model = self.load_model(modelsize, cache_dir, model_dir)
|
||||
self.model = self.load_model(model_size, cache_dir, model_dir)
|
||||
|
||||
def with_offset(self, offset: float) -> ASRToken:
|
||||
# This method is kept for compatibility (typically you will use ASRToken.with_offset)
|
||||
|
|
@ -27,7 +27,7 @@ class ASRBase:
|
|||
def __repr__(self):
|
||||
return f"ASRToken(start={self.start:.2f}, end={self.end:.2f}, text={self.text!r})"
|
||||
|
||||
def load_model(self, modelsize, cache_dir, model_dir):
|
||||
def load_model(self, model_size, cache_dir, model_dir):
|
||||
raise NotImplementedError("must be implemented in the child class")
|
||||
|
||||
def transcribe(self, audio, init_prompt=""):
|
||||
|
|
@ -41,7 +41,7 @@ class WhisperTimestampedASR(ASRBase):
|
|||
"""Uses whisper_timestamped as the backend."""
|
||||
sep = " "
|
||||
|
||||
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
|
||||
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
|
||||
import whisper
|
||||
import whisper_timestamped
|
||||
from whisper_timestamped import transcribe_timestamped
|
||||
|
|
@ -49,7 +49,7 @@ class WhisperTimestampedASR(ASRBase):
|
|||
self.transcribe_timestamped = transcribe_timestamped
|
||||
if model_dir is not None:
|
||||
logger.debug("ignoring model_dir, not implemented")
|
||||
return whisper.load_model(modelsize, download_root=cache_dir)
|
||||
return whisper.load_model(model_size, download_root=cache_dir)
|
||||
|
||||
def transcribe(self, audio, init_prompt=""):
|
||||
result = self.transcribe_timestamped(
|
||||
|
|
@ -88,17 +88,17 @@ class FasterWhisperASR(ASRBase):
|
|||
"""Uses faster-whisper as the backend."""
|
||||
sep = ""
|
||||
|
||||
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
|
||||
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
if model_dir is not None:
|
||||
logger.debug(f"Loading whisper model from model_dir {model_dir}. "
|
||||
f"modelsize and cache_dir parameters are not used.")
|
||||
f"model_size and cache_dir parameters are not used.")
|
||||
model_size_or_path = model_dir
|
||||
elif modelsize is not None:
|
||||
model_size_or_path = modelsize
|
||||
elif model_size is not None:
|
||||
model_size_or_path = model_size
|
||||
else:
|
||||
raise ValueError("Either modelsize or model_dir must be set")
|
||||
raise ValueError("Either model_size or model_dir must be set")
|
||||
device = "auto" # Allow CTranslate2 to decide available device
|
||||
compute_type = "auto" # Allow CTranslate2 to decide faster compute type
|
||||
|
||||
|
|
@ -149,18 +149,18 @@ class MLXWhisper(ASRBase):
|
|||
"""
|
||||
sep = ""
|
||||
|
||||
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
|
||||
def load_model(self, model_size=None, cache_dir=None, model_dir=None):
|
||||
from mlx_whisper.transcribe import ModelHolder, transcribe
|
||||
import mlx.core as mx
|
||||
|
||||
if model_dir is not None:
|
||||
logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used.")
|
||||
logger.debug(f"Loading whisper model from model_dir {model_dir}. model_size parameter is not used.")
|
||||
model_size_or_path = model_dir
|
||||
elif modelsize is not None:
|
||||
model_size_or_path = self.translate_model_name(modelsize)
|
||||
logger.debug(f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used.")
|
||||
elif model_size is not None:
|
||||
model_size_or_path = self.translate_model_name(model_size)
|
||||
logger.debug(f"Loading whisper model {model_size}. You use mlx whisper, so {model_size_or_path} will be used.")
|
||||
else:
|
||||
raise ValueError("Either modelsize or model_dir must be set")
|
||||
raise ValueError("Either model_size or model_dir must be set")
|
||||
|
||||
self.model_size_or_path = model_size_or_path
|
||||
dtype = mx.float16
|
||||
|
|
|
|||
|
|
@ -106,9 +106,6 @@ class OnlineASRProcessor:
|
|||
def __init__(
|
||||
self,
|
||||
asr,
|
||||
tokenize_method: Optional[callable] = None,
|
||||
buffer_trimming: Tuple[str, float] = ("segment", 15),
|
||||
confidence_validation = False,
|
||||
logfile=sys.stderr,
|
||||
):
|
||||
"""
|
||||
|
|
@ -119,13 +116,14 @@ class OnlineASRProcessor:
|
|||
buffer_trimming: A tuple (option, seconds), where option is either "sentence" or "segment".
|
||||
"""
|
||||
self.asr = asr
|
||||
self.tokenize = tokenize_method
|
||||
self.tokenize = asr.tokenizer
|
||||
self.logfile = logfile
|
||||
self.confidence_validation = confidence_validation
|
||||
self.confidence_validation = asr.confidence_validation
|
||||
self.global_time_offset = 0.0
|
||||
self.init()
|
||||
|
||||
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
|
||||
self.buffer_trimming_way = asr.buffer_trimming
|
||||
self.buffer_trimming_sec = asr.buffer_trimming_sec
|
||||
|
||||
if self.buffer_trimming_way not in ["sentence", "segment"]:
|
||||
raise ValueError("buffer_trimming must be either 'sentence' or 'segment'")
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from functools import lru_cache
|
|||
import time
|
||||
import logging
|
||||
from .backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
|
||||
from whisperlivekit.warmup import warmup_asr
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -63,11 +64,23 @@ def create_tokenizer(lan):
|
|||
return WtPtok()
|
||||
|
||||
|
||||
def backend_factory(args):
|
||||
backend = args.backend
|
||||
def backend_factory(
|
||||
backend,
|
||||
lan,
|
||||
model_size,
|
||||
model_cache_dir,
|
||||
model_dir,
|
||||
task,
|
||||
buffer_trimming,
|
||||
buffer_trimming_sec,
|
||||
confidence_validation,
|
||||
warmup_file=None,
|
||||
min_chunk_size=None,
|
||||
):
|
||||
backend = backend
|
||||
if backend == "openai-api":
|
||||
logger.debug("Using OpenAI API.")
|
||||
asr = OpenaiApiASR(lan=args.lan)
|
||||
asr = OpenaiApiASR(lan=lan)
|
||||
else:
|
||||
if backend == "faster-whisper":
|
||||
asr_cls = FasterWhisperASR
|
||||
|
|
@ -77,34 +90,33 @@ def backend_factory(args):
|
|||
asr_cls = WhisperTimestampedASR
|
||||
|
||||
# Only for FasterWhisperASR and WhisperTimestampedASR
|
||||
size = args.model
|
||||
|
||||
t = time.time()
|
||||
logger.info(f"Loading Whisper {size} model for language {args.lan}...")
|
||||
logger.info(f"Loading Whisper {model_size} model for language {lan}...")
|
||||
asr = asr_cls(
|
||||
modelsize=size,
|
||||
lan=args.lan,
|
||||
cache_dir=getattr(args, 'model_cache_dir', None),
|
||||
model_dir=getattr(args, 'model_dir', None),
|
||||
model_size=model_size,
|
||||
lan=lan,
|
||||
cache_dir=model_cache_dir,
|
||||
model_dir=model_dir,
|
||||
)
|
||||
e = time.time()
|
||||
logger.info(f"done. It took {round(e-t,2)} seconds.")
|
||||
|
||||
# Apply common configurations
|
||||
if getattr(args, "vad", False): # Checks if VAD argument is present and True
|
||||
logger.info("Setting VAD filter")
|
||||
asr.use_vad()
|
||||
|
||||
language = args.lan
|
||||
if args.task == "translate":
|
||||
if backend != "simulstreaming":
|
||||
asr.set_translate_task()
|
||||
if task == "translate":
|
||||
tgt_language = "en" # Whisper translates into English
|
||||
else:
|
||||
tgt_language = language # Whisper transcribes in this language
|
||||
tgt_language = lan # Whisper transcribes in this language
|
||||
|
||||
# Create the tokenizer
|
||||
if args.buffer_trimming == "sentence":
|
||||
if buffer_trimming == "sentence":
|
||||
tokenizer = create_tokenizer(tgt_language)
|
||||
else:
|
||||
tokenizer = None
|
||||
return asr, tokenizer
|
||||
|
||||
warmup_asr(asr, warmup_file)
|
||||
|
||||
asr.confidence_validation = confidence_validation
|
||||
asr.tokenizer = tokenizer
|
||||
asr.buffer_trimming = buffer_trimming
|
||||
asr.buffer_trimming_sec = buffer_trimming_sec
|
||||
return asr
|
||||
Loading…
Reference in a new issue