diff --git a/whisperlivekit/audio_processor.py b/whisperlivekit/audio_processor.py index 7b7b3fd..2227590 100644 --- a/whisperlivekit/audio_processor.py +++ b/whisperlivekit/audio_processor.py @@ -62,13 +62,13 @@ class AudioProcessor: self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample self.max_bytes_per_sec = 32000 * 5 # 5 seconds of audio at 32 kHz self.is_pcm_input = self.args.pcm_input - self.debug = False # State management self.is_stopping = False self.silence = False self.silence_duration = 0.0 self.tokens = [] + self.last_validated_token = 0 self.translated_segments = [] self.buffer_transcription = Transcript() self.end_buffer = 0 @@ -138,7 +138,7 @@ class AudioProcessor: async def add_dummy_token(self): """Placeholder token when no transcription is available.""" async with self.lock: - current_time = time() - self.beg_loop if self.beg_loop else 0 + current_time = time() - self.beg_loop self.tokens.append(ASRToken( start=current_time, end=current_time + 1, text=".", speaker=-1, is_dummy=True @@ -161,6 +161,7 @@ class AudioProcessor: return State( tokens=self.tokens.copy(), + last_validated_token=self.last_validated_token, translated_segments=self.translated_segments.copy(), buffer_transcription=self.buffer_transcription, end_buffer=self.end_buffer, @@ -428,35 +429,23 @@ class AudioProcessor: """Format processing results for output.""" while True: try: - # If FFmpeg error occurred, notify front-end if self._ffmpeg_error: - yield FrontData( - status="error", - error=f"FFmpeg error: {self._ffmpeg_error}" - ) + yield FrontData(status="error", error=f"FFmpeg error: {self._ffmpeg_error}") self._ffmpeg_error = None await asyncio.sleep(1) continue - # Get current state state = await self.get_current_state() - - # Add dummy tokens if needed - if (not state.tokens or state.tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization: - await self.add_dummy_token() - sleep(0.5) - state = await self.get_current_state() - # Format output - lines, undiarized_text, end_w_silence = format_output( + + lines, undiarized_text = format_output( state, self.silence, - current_time = time() - self.beg_loop if self.beg_loop else None, + current_time = time() - self.beg_loop, args = self.args, - debug = self.debug, sep=self.sep ) - if end_w_silence: + if lines and lines[-1].speaker == -2: buffer_transcription = Transcript() else: buffer_transcription = state.buffer_transcription diff --git a/whisperlivekit/remove_silences.py b/whisperlivekit/remove_silences.py index cdbb442..6ff472f 100644 --- a/whisperlivekit/remove_silences.py +++ b/whisperlivekit/remove_silences.py @@ -78,16 +78,8 @@ def no_token_to_silence(tokens): return new_tokens def ends_with_silence(tokens, current_time, vac_detected_silence): - end_w_silence = False - if not tokens: - return [], end_w_silence last_token = tokens[-1] - if tokens and current_time and ( - current_time - last_token.end >= END_SILENCE_DURATION - or - (current_time - last_token.end >= 3 and vac_detected_silence) - ): - end_w_silence = True + if vac_detected_silence or (current_time - last_token.end >= END_SILENCE_DURATION): if last_token.speaker == -2: last_token.end = current_time else: @@ -99,12 +91,14 @@ def ends_with_silence(tokens, current_time, vac_detected_silence): probability=0.95 ) ) - return tokens, end_w_silence + return tokens def handle_silences(tokens, current_time, vac_detected_silence): + if not tokens: + return [] tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text tokens = no_token_to_silence(tokens) - tokens, end_w_silence = ends_with_silence(tokens, current_time, vac_detected_silence) - return tokens, end_w_silence + tokens = ends_with_silence(tokens, current_time, vac_detected_silence) + return tokens \ No newline at end of file diff --git a/whisperlivekit/results_formater.py b/whisperlivekit/results_formater.py index 327ba27..22df26d 100644 --- a/whisperlivekit/results_formater.py +++ b/whisperlivekit/results_formater.py @@ -7,6 +7,8 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) CHECK_AROUND = 4 +DEBUG = False + def is_punctuation(token): if token.is_punctuation(): @@ -30,99 +32,96 @@ def next_speaker_change(i, tokens, speaker): def new_line( token, - speaker, - debug_info = "" ): return Line( - speaker = speaker, - text = token.text + debug_info, + speaker = token.corrected_speaker, + text = token.text + (f"[{format_time(token.start)} : {format_time(token.end)}]" if DEBUG else ""), start = token.start, end = token.end, detected_language=token.detected_language ) -def append_token_to_last_line(lines, sep, token, debug_info): - if token.text: - lines[-1].text += sep + token.text + debug_info - lines[-1].end = token.end - if not lines[-1].detected_language and token.detected_language: - lines[-1].detected_language = token.detected_language - +def append_token_to_last_line(lines, sep, token): + if not lines: + lines.append(new_line(token)) + else: + if token.text: + lines[-1].text += sep + token.text + (f"[{format_time(token.start)} : {format_time(token.end)}]" if DEBUG else "") + lines[-1].end = token.end + if not lines[-1].detected_language and token.detected_language: + lines[-1].detected_language = token.detected_language + -def format_output(state, silence, current_time, args, debug, sep): +def format_output(state, silence, current_time, args, sep): diarization = args.diarization disable_punctuation_split = args.disable_punctuation_split tokens = state.tokens translated_segments = state.translated_segments # Here we will attribute the speakers only based on the timestamps of the segments - end_attributed_speaker = state.end_attributed_speaker + last_validated_token = state.last_validated_token - previous_speaker = -1 - lines = [] + previous_speaker = 1 undiarized_text = [] - tokens, end_w_silence = handle_silences(tokens, current_time, silence) + tokens = handle_silences(tokens, current_time, silence) last_punctuation = None - for i, token in enumerate(tokens): - speaker = token.speaker - if not diarization and speaker == -1: #Speaker -1 means no attributed by diarization. In the frontend, it should appear under 'Speaker 1' - speaker = 1 - if diarization and not tokens[-1].speaker == -2: - if (speaker in [-1, 0]) and token.end >= end_attributed_speaker: - undiarized_text.append(token.text) - continue - elif (speaker in [-1, 0]) and token.end < end_attributed_speaker: - speaker = previous_speaker - debug_info = "" - if debug: - debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]" - - if not lines: - lines.append(new_line(token, speaker, debug_info = "")) - continue + for i, token in enumerate(tokens[last_validated_token:]): + speaker = int(token.speaker) + token.corrected_speaker = speaker + if not diarization: + if speaker == -1: #Speaker -1 means no attributed by diarization. In the frontend, it should appear under 'Speaker 1' + token.corrected_speaker = 1 + token.validated_speaker = True else: - previous_speaker = lines[-1].speaker - - if is_punctuation(token): - last_punctuation = i - - - if last_punctuation == i-1: - if speaker != previous_speaker: - # perfect, diarization perfectly aligned - lines.append(new_line(token, speaker, debug_info = "")) - last_punctuation, next_punctuation = None, None - continue - - speaker_change_pos, new_speaker = next_speaker_change(i, tokens, speaker) - if speaker_change_pos: - # Corrects delay: - # That was the idea. Okay haha |SPLIT SPEAKER| that's a good one - # should become: - # That was the idea. |SPLIT SPEAKER| Okay haha that's a good one - lines.append(new_line(token, new_speaker, debug_info = "")) - else: - # No speaker change to come - append_token_to_last_line(lines, sep, token, debug_info) - continue - + # if token.end > end_attributed_speaker and token.speaker != -2: + # if tokens[-1].speaker == -2: #if it finishes by a silence, we want to append the undiarized text to the last speaker. + # token.corrected_speaker = previous_speaker + # else: + # undiarized_text.append(token.text) + # continue + # else: + if is_punctuation(token): + last_punctuation = i + + if last_punctuation == i-1: + if token.speaker != previous_speaker: + token.validated_speaker = True + # perfect, diarization perfectly aligned + last_punctuation = None + else: + speaker_change_pos, new_speaker = next_speaker_change(i, tokens, speaker) + if speaker_change_pos: + # Corrects delay: + # That was the idea. haha |SPLIT SPEAKER| that's a good one + # should become: + # That was the idea. |SPLIT SPEAKER| haha that's a good one + token.corrected_speaker = new_speaker + token.validated_speaker = True + elif speaker != previous_speaker: + if not (speaker == -2 or previous_speaker == -2): + if next_punctuation_change(i, tokens): + # Corrects advance: + # Are you |SPLIT SPEAKER| ? yeah, sure. Absolutely + # should become: + # Are you ? |SPLIT SPEAKER| yeah, sure. Absolutely + token.corrected_speaker = previous_speaker + token.validated_speaker = True + else: #Problematic, except if the language has no punctuation. We append to previous line, except if disable_punctuation_split is set to True. + if not disable_punctuation_split: + token.corrected_speaker = previous_speaker + token.validated_speaker = False + if token.validated_speaker: + state.last_validated_token = i + previous_speaker = token.corrected_speaker - if speaker != previous_speaker: - if speaker == -2 or previous_speaker == -2: #silences can happen anytime - lines.append(new_line(token, speaker, debug_info = "")) - continue - elif next_punctuation_change(i, tokens): - # Corrects advance: - # Are you |SPLIT SPEAKER| okay? yeah, sure. Absolutely - # should become: - # Are you okay? |SPLIT SPEAKER| yeah, sure. Absolutely - append_token_to_last_line(lines, sep, token, debug_info) - continue - else: #we create a new speaker, but that's no ideal. We are not sure about the split. We prefer to append to previous line - if disable_punctuation_split: - lines.append(new_line(token, speaker, debug_info = "")) - continue - pass - - append_token_to_last_line(lines, sep, token, debug_info) + previous_speaker = 1 + + lines = [] + for token in tokens: + if int(token.corrected_speaker) != int(previous_speaker): + lines.append(new_line(token)) + else: + append_token_to_last_line(lines, sep, token) + + previous_speaker = token.corrected_speaker if lines and translated_segments: unassigned_translated_segments = [] @@ -158,4 +157,4 @@ def format_output(state, silence, current_time, args, debug, sep): if state.buffer_transcription and lines: lines[-1].end = max(state.buffer_transcription.end, lines[-1].end) - return lines, undiarized_text, end_w_silence + return lines, undiarized_text diff --git a/whisperlivekit/timed_objects.py b/whisperlivekit/timed_objects.py index 0637ed8..103ed33 100644 --- a/whisperlivekit/timed_objects.py +++ b/whisperlivekit/timed_objects.py @@ -43,6 +43,12 @@ class TimedText: @dataclass() class ASRToken(TimedText): + + corrected_speaker: Optional[int] = -1 + validated_speaker: bool = False + validated_text: bool = False + validated_language: bool = False + def with_offset(self, offset: float) -> "ASRToken": """Return a new token with the time offset added.""" return ASRToken(self.start + offset, self.end + offset, self.text, self.speaker, self.probability, detected_language=self.detected_language) @@ -169,6 +175,7 @@ class ChangeSpeaker: @dataclass class State(): tokens: list + last_validated_token: int translated_segments: list buffer_transcription: str end_buffer: float