Merge pull request #52 from QuentinFuxa/diart_integration_improvements

Diart integration improvements
2025-02-19 14:43:39 +01:00 · 2025-02-19 14:43:39 +01:00 · 97c0ae6154
commit 97c0ae6154
parent 450c93fef8 09d40a7de8
3 changed files with 85 additions and 28 deletions
--- a/src/web/demo.png
+++ b/src/web/demo.png
--- a/src/web/live_transcription.html
+++ b/src/web/live_transcription.html
@ -78,12 +78,59 @@
    #linesTranscript strong {
      color: #333;
    }
-    /* Grey buffer styling */
+    #speaker {
+      background-color: #dcefff;
+      border-radius: 30px;
+      padding: 2px 10px;
+      font-size: 14px;
+    }
+    #timeInfo {
+      color: #666;
+      margin-left: 10px;
+    }
+    .textcontent {
+      font-size: 16px;
+      margin-left: 10px;
+      padding-left: 10px;
+      border-left: 2px  solid #dcefff;
+      margin-bottom: 10px;
+    }
    .buffer {
      color: rgb(180, 180, 180);
      font-style: italic;
      margin-left: 4px;
    }
+    .spinner {
+        display: inline-block;
+        width: 8px;
+        height: 8px;
+        border: 2px solid rgba(0, 0, 0, 0.2);
+        border-top: 2px solid #333;
+        border-radius: 50%;
+        animation: spin 0.6s linear infinite;
+        vertical-align: middle;
+        margin-bottom: 2px;
+        }
+
+        @keyframes spin {
+        to {
+            transform: rotate(360deg);
+        }
+        }
+    .silence {
+      color: #666;
+      background-color: #f3f3f3;
+      font-size: 13px;
+      border-radius: 30px;
+      padding: 2px 10px;
+    }
+    .loading {
+      color: #666;
+      background-color: #eff9ff;
+      font-size: 14px;
+      border-radius: 30px;
+      padding: 2px 10px;
+    }
  </style>
 </head>
 <body>
@ -188,7 +235,7 @@
            }
          */
          const { lines = [], buffer = "" } = data;
-          renderLinesWithBuffer(lines, buffer);
+          renderLinesWithBuffer( lines, buffer);
        };
      });
    }
@ -199,27 +246,36 @@
        linesTranscriptDiv.innerHTML = "";
        return;
      }
+
+
+
      const linesHtml = lines.map((item, idx) => {
-        let speakerLabel = "";
-        if (item.speaker === -2) {
-          speakerLabel = "No speaker";
-        } else if (item.speaker !== -1) {
-          speakerLabel = `Speaker ${item.speaker}`;
-        }
-        
        let timeInfo = "";
        if (item.beg !== undefined && item.end !== undefined) {
-          timeInfo = ` [${item.beg}, ${item.end}]`;
+          timeInfo = ` ${item.beg} - ${item.end}`;
        }
+
+        let speakerLabel = "";
+        if (item.speaker === -2) {
+          speakerLabel = `<span class="silence">Silence<span id='timeInfo'>${timeInfo}</span></span>`;
+        } else if (item.speaker == -1) {
+          speakerLabel = `<span class='loading'> <span class="spinner"></span><span id='timeInfo'>${item.diff} second(s) of audio are undergoing diarization</span></span>`;
+        } else if (item.speaker == -3) {
+          speakerLabel = `<span id="speaker"><span id='timeInfo'>${timeInfo}</span>`;
+        } else if (item.speaker !== -1) {
+          speakerLabel = `<span id="speaker">Speaker ${item.speaker}<span id='timeInfo'>${timeInfo}</span></span>`;
+        } 
+        
+
        
        let textContent = item.text;
        if (idx === lines.length - 1 && buffer) {
          textContent += `<span class="buffer">${buffer}</span>`;
        }
        
-        return speakerLabel
-          ? `<p><strong>${speakerLabel}${timeInfo}</strong> ${textContent}</p>`
-          : `<p>${textContent}</p>`;
+        return textContent
+          ? `<p>${speakerLabel}<br/><div class='textcontent'>${textContent}</div></p>`
+          : `<p >${speakerLabel}<br/></p>`;
      }).join("");

      linesTranscriptDiv.innerHTML = linesHtml;
--- a/whisper_fastapi_online_server.py
+++ b/whisper_fastapi_online_server.py
@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
                    else:
                        chunk_history.append({
                                "beg": time() - beg_loop,
-                                "end": time() - beg_loop + 0.1,
+                                "end": time() - beg_loop + 1,
                                "text": '',
                        })
-                        sleep(0.1)
+                        sleep(1)
                        buffer = ''

                    if args.diarization:
@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
                        diarization.assign_speakers_to_chunks(chunk_history)

                    
-                    current_speaker = -1
-                    lines = [{
-                        "beg": 0,
-                        "end": 0,
-                        "speaker": current_speaker,
-                        "text": ""
-                        }]
-                    for ch in chunk_history:
-                        if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
-                            new_speaker = ch["speaker"]
+                    current_speaker = 0
+                    lines = []
+                    last_end_diarized = 0
+                    for ind, ch in enumerate(chunk_history):
+                        speaker = ch.get("speaker", -3)
+                        if speaker == -1 and ind < len(chunk_history) - 1:
+                            continue
+                        elif speaker != current_speaker:
                            lines.append(
                                {
-                                    "speaker": new_speaker,
+                                    "speaker": speaker,
                                    "text": ch['text'],
                                    "beg": format_time(ch['beg']),
                                    "end": format_time(ch['end']),
+                                    "diff": round(ch['end'] - last_end_diarized, 2)
                                }
                            )
-                            current_speaker = new_speaker
-                        else:
+                            current_speaker = speaker
+                        elif speaker != -1:
                            lines[-1]["text"] += ch['text']
                            lines[-1]["end"] = format_time(ch['end'])
+                        if speaker != -1:
+                            last_end_diarized = max(ch['end'], last_end_diarized)

                    response = {"lines": lines, "buffer": buffer}
                    await websocket.send_json(response)