0.2.10
This commit is contained in:
parent
777ec63a71
commit
1833e7c921
5 changed files with 48 additions and 12 deletions
|
|
@ -1,4 +1,4 @@
|
||||||
# Available model sizes:
|
# Available Whisper model sizes:
|
||||||
|
|
||||||
- tiny.en (English only)
|
- tiny.en (English only)
|
||||||
- tiny
|
- tiny
|
||||||
|
|
@ -71,3 +71,39 @@
|
||||||
3. Good hardware and want best quality? → `large-v3`
|
3. Good hardware and want best quality? → `large-v3`
|
||||||
4. Need fast, high-quality transcription without translation? → `large-v3-turbo`
|
4. Need fast, high-quality transcription without translation? → `large-v3-turbo`
|
||||||
5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo)
|
5. Need translation capabilities? → `large-v2` or `large-v3` (avoid turbo)
|
||||||
|
|
||||||
|
|
||||||
|
_______________________
|
||||||
|
|
||||||
|
# Translation Models and Backend
|
||||||
|
|
||||||
|
**Language Support**: ~200 languages
|
||||||
|
|
||||||
|
## Distilled Model Sizes Available
|
||||||
|
|
||||||
|
| Model | Size | Parameters | VRAM (FP16) | VRAM (INT8) | Quality |
|
||||||
|
|-------|------|------------|-------------|-------------|---------|
|
||||||
|
| 600M | 2.46 GB | 600M | ~1.5GB | ~800MB | Good, understandable |
|
||||||
|
| 1.3B | 5.48 GB | 1.3B | ~3GB | ~1.5GB | Better accuracy, context |
|
||||||
|
|
||||||
|
**Quality Impact**: The 1.3B model achieves ~15-25% higher BLEU scores than the 600M model across language pairs.
|
||||||
|
|
||||||
|
## Backend Performance
|
||||||
|
|
||||||
|
| Backend | Speed vs Base | Memory Usage | Quality Loss |
|
||||||
|
|---------|---------------|--------------|--------------|
|
||||||
|
| CTranslate2 | 6-10x faster | 40-60% less | ~5% BLEU drop |
|
||||||
|
| Transformers | Baseline | High | None |
|
||||||
|
| Transformers + MPS (on Apple Silicon) | 2x faster | Medium | None |
|
||||||
|
|
||||||
|
**Metrics**:
|
||||||
|
- CTranslate2: 50-100+ tokens/sec
|
||||||
|
- Transformers: 10-30 tokens/sec
|
||||||
|
- Apple Silicon with MPS: Up to 2x faster than CTranslate2
|
||||||
|
|
||||||
|
## Quick Decision Matrix
|
||||||
|
|
||||||
|
**Choose 600M**: Limited resources, or when you need close to zero lag
|
||||||
|
**Choose 1.3B**: Quality matters
|
||||||
|
**Choose Transformers**: On Apple Silicon
|
||||||
|
|
||||||
|
|
|
||||||
BIN
demo.png
BIN
demo.png
Binary file not shown.
|
Before Width: | Height: | Size: 449 KiB After Width: | Height: | Size: 1.2 MiB |
|
|
@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "whisperlivekit"
|
name = "whisperlivekit"
|
||||||
version = "0.2.9"
|
version = "0.2.10"
|
||||||
description = "Real-time speech-to-text with speaker diarization using Whisper"
|
description = "Real-time speech-to-text with speaker diarization using Whisper"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
authors = [
|
authors = [
|
||||||
|
|
|
||||||
|
|
@ -445,8 +445,8 @@ class AudioProcessor:
|
||||||
elif not lines:
|
elif not lines:
|
||||||
lines = [Line(
|
lines = [Line(
|
||||||
speaker=1,
|
speaker=1,
|
||||||
start=state.get("end_buffer", 0),
|
start=state.end_buffer,
|
||||||
end=state.get("end_buffer", 0)
|
end=state.end_buffer
|
||||||
)]
|
)]
|
||||||
|
|
||||||
response = FrontData(
|
response = FrontData(
|
||||||
|
|
|
||||||
|
|
@ -346,13 +346,6 @@ function renderLinesWithBuffer(
|
||||||
|
|
||||||
let currentLineText = item.text || "";
|
let currentLineText = item.text || "";
|
||||||
|
|
||||||
if (item.translation) {
|
|
||||||
currentLineText += `<div class="label_translation">
|
|
||||||
<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
|
|
||||||
<span>${item.translation}</span>
|
|
||||||
</div>`;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (idx === lines.length - 1) {
|
if (idx === lines.length - 1) {
|
||||||
if (!isFinalizing && item.speaker !== -2) {
|
if (!isFinalizing && item.speaker !== -2) {
|
||||||
if (remaining_time_transcription > 0) {
|
if (remaining_time_transcription > 0) {
|
||||||
|
|
@ -386,6 +379,13 @@ function renderLinesWithBuffer(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (item.translation) {
|
||||||
|
currentLineText += `<div class="label_translation">
|
||||||
|
<img src="/web/src/translate.svg" alt="Translation" width="12" height="12" />
|
||||||
|
<span>${item.translation}</span>
|
||||||
|
</div>`;
|
||||||
|
}
|
||||||
|
|
||||||
return currentLineText.trim().length > 0 || speakerLabel.length > 0
|
return currentLineText.trim().length > 0 || speakerLabel.length > 0
|
||||||
? `<p>${speakerLabel}<br/><div class='textcontent'>${currentLineText}</div></p>`
|
? `<p>${speakerLabel}<br/><div class='textcontent'>${currentLineText}</div></p>`
|
||||||
: `<p>${speakerLabel}<br/></p>`;
|
: `<p>${speakerLabel}<br/></p>`;
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue