import logging
from datetime import timedelta

from whisperlivekit.remove_silences import handle_silences

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Sentence-ending marks: the ASCII forms, the CJK full stop, and the
# full-width exclamation / question marks. The full-width forms are written
# as escapes so they cannot be confused with their ASCII look-alikes.
PUNCTUATION_MARKS = {'.', '!', '?', '。', '\uff01', '\uff1f'}

# Maximum number of neighbouring tokens inspected when searching for a
# punctuation mark or a speaker change around the current token.
CHECK_AROUND = 4

# Speaker ids that format_output treats as "not attributed by diarization".
_UNDIARIZED_SPEAKERS = (-1, 0)
# Speaker id that format_output treats as a silence.
_SILENCE_SPEAKER = -2


def format_time(seconds: float) -> str:
    """Format a number of seconds as ``H:MM:SS``.

    The fractional part is truncated. The string is whatever
    ``str(timedelta(...))`` produces, so the hour field is not zero-padded.
    """
    return str(timedelta(seconds=int(seconds)))


def is_punctuation(token) -> bool:
    """Return True when the token's stripped text is one of PUNCTUATION_MARKS."""
    return token.text.strip() in PUNCTUATION_MARKS


def next_punctuation_change(i, tokens):
    """Find the next punctuation token after position ``i``.

    Only the ``CHECK_AROUND`` tokens following ``i`` are inspected.

    Returns:
        The index of the first punctuation token found, or None.
    """
    stop = min(len(tokens), i + CHECK_AROUND + 1)
    for ind in range(i + 1, stop):
        if is_punctuation(tokens[ind]):
            return ind
    return None


def next_speaker_change(i, tokens, speaker):
    """Look for a token near ``i`` whose speaker differs from ``speaker``.

    The scan walks backwards from ``i - 1`` over at most ``CHECK_AROUND``
    tokens and stops as soon as it meets a punctuation token.

    Returns:
        ``(index, other_speaker)`` for the first differing token, or
        ``(None, speaker)`` when none is found.
    """
    # NOTE(review): despite the name, this scans *backwards*. format_output
    # only calls it when tokens[i - 1] is punctuation, so from that call site
    # the loop breaks on its first iteration. Confirm whether a forward scan
    # was intended before relying on this.
    first_index = max(0, i - CHECK_AROUND)
    for ind in range(i - 1, first_index - 1, -1):
        candidate = tokens[ind]
        if is_punctuation(candidate):
            break
        if candidate.speaker != speaker:
            return ind, candidate.speaker
    return None, speaker


def new_line(
    token,
    speaker,
    last_end_diarized,
    debug_info="",
) -> dict:
    """Build the dict describing a new output line that starts with ``token``.

    Args:
        token: Object exposing ``text``, ``start`` and ``end``.
        speaker: Speaker id for the line; stored as ``int(speaker)``.
        last_end_diarized: Reference time subtracted from ``token.end`` to
            compute the ``"diff"`` field.
        debug_info: Text appended verbatim after the token text.
    """
    return {
        "speaker": int(speaker),
        "text": token.text + debug_info,
        "beg": format_time(token.start),
        "end": format_time(token.end),
        "diff": round(token.end - last_end_diarized, 2),
    }


def append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized) -> None:
    """Append ``token`` to the last entry of ``lines``, in place.

    Tokens with empty text are ignored entirely: the line's ``"end"`` and
    ``"diff"`` fields are left untouched in that case. ``lines`` must not be
    empty.
    """
    if not token.text:
        return
    current = lines[-1]
    current["text"] += sep + token.text + debug_info
    current["end"] = format_time(token.end)
    current["diff"] = round(token.end - last_end_diarized, 2)


def format_output(state, silence, current_time, diarization, debug):
    """Group tokens into speaker-attributed lines.

    Speakers are attributed only from the speaker id carried by each token.

    Args:
        state: Mapping providing ``"tokens"``, ``"buffer_transcription"``,
            ``"buffer_diarization"``, ``"end_attributed_speaker"`` and
            ``"sep"`` (the separator inserted between appended tokens).
        silence: Forwarded unchanged to ``handle_silences``.
        current_time: Forwarded unchanged to ``handle_silences``.
        diarization: Truthy when diarization is in use. When falsy, tokens
            with speaker ``-1`` are shown under speaker ``1``.
        debug: When truthy, a ``[start : end]`` tag is added after each
            token's text.

    Returns:
        A 4-tuple ``(lines, undiarized_text, buffer_transcription, '')``:
        ``lines`` is a list of dicts as built by ``new_line``;
        ``undiarized_text`` is the list of token texts held back because
        they have no speaker yet; ``buffer_transcription`` is the value
        returned by ``handle_silences``.
    """
    tokens = state["tokens"]
    buffer_transcription = state["buffer_transcription"]
    buffer_diarization = state["buffer_diarization"]
    end_attributed_speaker = state["end_attributed_speaker"]
    sep = state["sep"]

    # handle_silences is defined in another module; only its first two
    # results are used here.
    tokens, buffer_transcription, _ = handle_silences(
        tokens, buffer_transcription, buffer_diarization, current_time, silence
    )

    previous_speaker = -1
    lines = []
    last_end_diarized = 0
    undiarized_text = []
    last_punctuation = None

    # Loop-invariant: unattributed tokens are only held back / re-attributed
    # when diarization is on and the final token is not a silence.
    resolve_undiarized = bool(
        diarization and tokens and tokens[-1].speaker != _SILENCE_SPEAKER
    )

    for i, token in enumerate(tokens):
        speaker = token.speaker

        # Speaker -1 means "not attributed by diarization". Without
        # diarization, the frontend should show it under 'Speaker 1'.
        if not diarization and speaker == -1:
            speaker = 1

        if resolve_undiarized:
            if speaker in _UNDIARIZED_SPEAKERS:
                if token.end >= end_attributed_speaker:
                    # Not covered by an attributed segment yet: hold it back.
                    undiarized_text.append(token.text)
                    continue
                elif token.end < end_attributed_speaker:
                    speaker = previous_speaker
            if speaker not in _UNDIARIZED_SPEAKERS:
                last_end_diarized = max(token.end, last_end_diarized)

        debug_info = ""
        if debug:
            debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]"

        if not lines:
            lines.append(new_line(token, speaker, last_end_diarized, debug_info))
            continue
        previous_speaker = lines[-1]["speaker"]

        if is_punctuation(token):
            last_punctuation = i

        if last_punctuation == i - 1:
            # The previous token closed a sentence.
            if speaker != previous_speaker:
                # Perfect: diarization is aligned with the sentence boundary.
                lines.append(new_line(token, speaker, last_end_diarized, debug_info))
                last_punctuation = None
                continue

            speaker_change_pos, new_speaker = next_speaker_change(i, tokens, speaker)
            if speaker_change_pos is not None:
                # Corrects delay:
                #   That was the idea. Okay haha |SPLIT SPEAKER| that's a good one
                # should become:
                #   That was the idea. |SPLIT SPEAKER| Okay haha that's a good one
                lines.append(new_line(token, new_speaker, last_end_diarized, debug_info))
            else:
                # No speaker change found.
                append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
            continue

        if speaker != previous_speaker:
            if speaker == _SILENCE_SPEAKER or previous_speaker == _SILENCE_SPEAKER:
                # Silences can happen anytime: always start a new line.
                lines.append(new_line(token, speaker, last_end_diarized, debug_info))
                continue
            if next_punctuation_change(i, tokens) is not None:
                # Corrects advance:
                #   Are you |SPLIT SPEAKER| okay? yeah, sure. Absolutely
                # should become:
                #   Are you okay? |SPLIT SPEAKER| yeah, sure. Absolutely
                append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)
                continue
            # No punctuation nearby: the split position is uncertain, so
            # rather than opening a line for the new speaker we fall through
            # and append to the previous line.

        append_token_to_last_line(lines, sep, token, debug_info, last_end_diarized)

    return lines, undiarized_text, buffer_transcription, ''