diff --git a/src/web/demo.png b/src/web/demo.png index 53522be..74d0fe6 100644 Binary files a/src/web/demo.png and b/src/web/demo.png differ diff --git a/src/web/live_transcription.html b/src/web/live_transcription.html index af48481..b9d4ea6 100644 --- a/src/web/live_transcription.html +++ b/src/web/live_transcription.html @@ -78,12 +78,59 @@ #linesTranscript strong { color: #333; } - /* Grey buffer styling */ + #speaker { + background-color: #dcefff; + border-radius: 30px; + padding: 2px 10px; + font-size: 14px; + } + #timeInfo { + color: #666; + margin-left: 10px; + } + .textcontent { + font-size: 16px; + margin-left: 10px; + padding-left: 10px; + border-left: 2px solid #dcefff; + margin-bottom: 10px; + } .buffer { color: rgb(180, 180, 180); font-style: italic; margin-left: 4px; } + .spinner { + display: inline-block; + width: 8px; + height: 8px; + border: 2px solid rgba(0, 0, 0, 0.2); + border-top: 2px solid #333; + border-radius: 50%; + animation: spin 0.6s linear infinite; + vertical-align: middle; + margin-bottom: 2px; + } + + @keyframes spin { + to { + transform: rotate(360deg); + } + } + .silence { + color: #666; + background-color: #f3f3f3; + font-size: 13px; + border-radius: 30px; + padding: 2px 10px; + } + .loading { + color: #666; + background-color: #eff9ff; + font-size: 14px; + border-radius: 30px; + padding: 2px 10px; + } @@ -188,7 +235,7 @@ } */ const { lines = [], buffer = "" } = data; - renderLinesWithBuffer(lines, buffer); + renderLinesWithBuffer( lines, buffer); }; }); } @@ -199,27 +246,36 @@ linesTranscriptDiv.innerHTML = ""; return; } + + + const linesHtml = lines.map((item, idx) => { - let speakerLabel = ""; - if (item.speaker === -2) { - speakerLabel = "No speaker"; - } else if (item.speaker !== -1) { - speakerLabel = `Speaker ${item.speaker}`; - } - let timeInfo = ""; if (item.beg !== undefined && item.end !== undefined) { - timeInfo = ` [${item.beg}, ${item.end}]`; + timeInfo = ` ${item.beg} - ${item.end}`; } + + let speakerLabel = ""; + if (item.speaker === -2) { + speakerLabel = `Silence${timeInfo}`; + } else if (item.speaker == -1) { + speakerLabel = ` ${item.diff} second(s) of audio are undergoing diarization`; + } else if (item.speaker == -3) { + speakerLabel = `${timeInfo}`; + } else if (item.speaker !== -1) { + speakerLabel = `Speaker ${item.speaker}${timeInfo}`; + } + + let textContent = item.text; if (idx === lines.length - 1 && buffer) { textContent += `${buffer}`; } - return speakerLabel - ? `

${speakerLabel}${timeInfo} ${textContent}

` - : `

${textContent}

`; + return textContent + ? `

${speakerLabel}

${textContent}

` + : `

${speakerLabel}

`; }).join(""); linesTranscriptDiv.innerHTML = linesHtml; diff --git a/whisper_fastapi_online_server.py b/whisper_fastapi_online_server.py index e4a1571..1c00751 100644 --- a/whisper_fastapi_online_server.py +++ b/whisper_fastapi_online_server.py @@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket): else: chunk_history.append({ "beg": time() - beg_loop, - "end": time() - beg_loop + 0.1, + "end": time() - beg_loop + 1, "text": '', }) - sleep(0.1) + sleep(1) buffer = '' if args.diarization: @@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket): diarization.assign_speakers_to_chunks(chunk_history) - current_speaker = -1 - lines = [{ - "beg": 0, - "end": 0, - "speaker": current_speaker, - "text": "" - }] - for ch in chunk_history: - if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker: - new_speaker = ch["speaker"] + current_speaker = 0 + lines = [] + last_end_diarized = 0 + for ind, ch in enumerate(chunk_history): + speaker = ch.get("speaker", -3) + if speaker == -1 and ind < len(chunk_history) - 1: + continue + elif speaker != current_speaker: lines.append( { - "speaker": new_speaker, + "speaker": speaker, "text": ch['text'], "beg": format_time(ch['beg']), "end": format_time(ch['end']), + "diff": round(ch['end'] - last_end_diarized, 2) } ) - current_speaker = new_speaker - else: + current_speaker = speaker + elif speaker != -1: lines[-1]["text"] += ch['text'] lines[-1]["end"] = format_time(ch['end']) + if speaker != -1: + last_end_diarized = max(ch['end'], last_end_diarized) response = {"lines": lines, "buffer": buffer} await websocket.send_json(response)