Align backend and frontend time handling
This commit is contained in:
parent
9feb0e597b
commit
b32dd8bfc4
3 changed files with 29 additions and 22 deletions
|
|
@ -47,6 +47,7 @@ class AudioProcessor:
|
||||||
self.last_ffmpeg_activity = time()
|
self.last_ffmpeg_activity = time()
|
||||||
self.ffmpeg_health_check_interval = 5
|
self.ffmpeg_health_check_interval = 5
|
||||||
self.ffmpeg_max_idle_time = 10
|
self.ffmpeg_max_idle_time = 10
|
||||||
|
self.debug = False
|
||||||
|
|
||||||
# State management
|
# State management
|
||||||
self.is_stopping = False
|
self.is_stopping = False
|
||||||
|
|
@ -58,7 +59,7 @@ class AudioProcessor:
|
||||||
self.end_buffer = 0
|
self.end_buffer = 0
|
||||||
self.end_attributed_speaker = 0
|
self.end_attributed_speaker = 0
|
||||||
self.lock = asyncio.Lock()
|
self.lock = asyncio.Lock()
|
||||||
self.beg_loop = time()
|
self.beg_loop = None #to deal with a potential little lag at the websocket initialization, this is now set in process_audio
|
||||||
self.sep = " " # Default separator
|
self.sep = " " # Default separator
|
||||||
self.last_response_content = ""
|
self.last_response_content = ""
|
||||||
|
|
||||||
|
|
@ -298,11 +299,10 @@ class AudioProcessor:
|
||||||
|
|
||||||
asr_internal_buffer_duration_s = len(getattr(self.online, 'audio_buffer', [])) / self.online.SAMPLING_RATE
|
asr_internal_buffer_duration_s = len(getattr(self.online, 'audio_buffer', [])) / self.online.SAMPLING_RATE
|
||||||
transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
|
transcription_lag_s = max(0.0, time() - self.beg_loop - self.end_buffer)
|
||||||
|
asr_processing_logs = f"internal_buffer={asr_internal_buffer_duration_s:.2f}s | lag={transcription_lag_s:.2f}s |"
|
||||||
logger.info(
|
if type(item) is Silence:
|
||||||
f"ASR processing: internal_buffer={asr_internal_buffer_duration_s:.2f}s, "
|
asr_processing_logs += f" + Silence of = {item.duration :.2fs} | last_end = {self.tokens[-1].end} |"
|
||||||
f"lag={transcription_lag_s:.2f}s."
|
logger.info(asr_processing_logs)
|
||||||
)
|
|
||||||
|
|
||||||
if type(item) is Silence:
|
if type(item) is Silence:
|
||||||
cumulative_pcm_duration_stream_time += item.duration
|
cumulative_pcm_duration_stream_time += item.duration
|
||||||
|
|
@ -444,8 +444,8 @@ class AudioProcessor:
|
||||||
lines = []
|
lines = []
|
||||||
last_end_diarized = 0
|
last_end_diarized = 0
|
||||||
undiarized_text = []
|
undiarized_text = []
|
||||||
current_time = time() - self.beg_loop
|
current_time = time() - self.beg_loop if self.beg_loop else None
|
||||||
tokens = handle_silences(tokens, current_time, self.silence)
|
tokens, buffer_transcription = handle_silences(tokens, buffer_transcription, current_time, self.silence)
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
speaker = token.speaker
|
speaker = token.speaker
|
||||||
|
|
||||||
|
|
@ -459,21 +459,23 @@ class AudioProcessor:
|
||||||
if speaker not in [-1, 0]:
|
if speaker not in [-1, 0]:
|
||||||
last_end_diarized = max(token.end, last_end_diarized)
|
last_end_diarized = max(token.end, last_end_diarized)
|
||||||
|
|
||||||
# Group by speaker
|
debug_info = ""
|
||||||
|
if self.debug:
|
||||||
|
debug_info = f"[{format_time(token.start)} : {format_time(token.end)}]"
|
||||||
if speaker != previous_speaker or not lines:
|
if speaker != previous_speaker or not lines:
|
||||||
lines.append({
|
lines.append({
|
||||||
"speaker": speaker,
|
"speaker": speaker,
|
||||||
"text": token.text,
|
"text": token.text + debug_info,
|
||||||
"beg": format_time(token.start),
|
"beg": format_time(token.start),
|
||||||
"end": format_time(token.end),
|
"end": format_time(token.end),
|
||||||
"diff": round(token.end - last_end_diarized, 2)
|
"diff": round(token.end - last_end_diarized, 2)
|
||||||
})
|
})
|
||||||
previous_speaker = speaker
|
previous_speaker = speaker
|
||||||
elif token.text: # Only append if text isn't empty
|
elif token.text: # Only append if text isn't empty
|
||||||
lines[-1]["text"] += sep + token.text
|
lines[-1]["text"] += sep + token.text + debug_info
|
||||||
lines[-1]["end"] = format_time(token.end)
|
lines[-1]["end"] = format_time(token.end)
|
||||||
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
|
||||||
|
|
||||||
# Handle undiarized text
|
# Handle undiarized text
|
||||||
if undiarized_text:
|
if undiarized_text:
|
||||||
combined = sep.join(undiarized_text)
|
combined = sep.join(undiarized_text)
|
||||||
|
|
@ -634,6 +636,10 @@ class AudioProcessor:
|
||||||
|
|
||||||
async def process_audio(self, message):
|
async def process_audio(self, message):
|
||||||
"""Process incoming audio data."""
|
"""Process incoming audio data."""
|
||||||
|
|
||||||
|
if not self.beg_loop:
|
||||||
|
self.beg_loop = time()
|
||||||
|
|
||||||
if not message:
|
if not message:
|
||||||
logger.info("Empty audio message received, initiating stop sequence.")
|
logger.info("Empty audio message received, initiating stop sequence.")
|
||||||
self.is_stopping = True
|
self.is_stopping = True
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ def no_token_to_silence(tokens):
|
||||||
new_tokens.append(token)
|
new_tokens.append(token)
|
||||||
return new_tokens
|
return new_tokens
|
||||||
|
|
||||||
def ends_with_silence(tokens, current_time, vac_detected_silence):
|
def ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence):
|
||||||
if not tokens:
|
if not tokens:
|
||||||
return []
|
return []
|
||||||
last_token = tokens[-1]
|
last_token = tokens[-1]
|
||||||
|
|
@ -97,12 +97,13 @@ def ends_with_silence(tokens, current_time, vac_detected_silence):
|
||||||
probability=0.95
|
probability=0.95
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# We validate the buffer because of the silence
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def handle_silences(tokens, current_time, vac_detected_silence):
|
def handle_silences(tokens, buffer_transcription, current_time, vac_detected_silence):
|
||||||
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
|
tokens = blank_to_silence(tokens) #useful for simulstreaming backend which tends to generate [BLANK_AUDIO] text
|
||||||
tokens = no_token_to_silence(tokens)
|
tokens = no_token_to_silence(tokens)
|
||||||
tokens = ends_with_silence(tokens, current_time, vac_detected_silence)
|
tokens = ends_with_silence(tokens, buffer_transcription, current_time, vac_detected_silence)
|
||||||
return tokens
|
return tokens, buffer_transcription
|
||||||
|
|
||||||
|
|
@ -36,7 +36,6 @@ class SimulStreamingOnlineProcessor:
|
||||||
):
|
):
|
||||||
self.asr = asr
|
self.asr = asr
|
||||||
self.logfile = logfile
|
self.logfile = logfile
|
||||||
self.is_last = False
|
|
||||||
self.end = 0.0
|
self.end = 0.0
|
||||||
self.global_time_offset = 0.0
|
self.global_time_offset = 0.0
|
||||||
|
|
||||||
|
|
@ -57,12 +56,13 @@ class SimulStreamingOnlineProcessor:
|
||||||
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
|
If silences are > 5s, we do a complete context clear. Otherwise, we just insert a small silence and shift the last_attend_frame
|
||||||
"""
|
"""
|
||||||
if silence_duration < 5:
|
if silence_duration < 5:
|
||||||
gap_silence = torch.zeros(int(16000*min(silence_duration, 1.0)))
|
gap_silence = torch.zeros(int(16000*silence_duration))
|
||||||
self.model.insert_audio(gap_silence)
|
self.model.insert_audio(gap_silence)
|
||||||
self.global_time_offset = silence_duration - 1.0
|
# self.global_time_offset += silence_duration
|
||||||
else:
|
else:
|
||||||
|
self.process_iter(is_last=True) #we want to totally process what remains in the buffer.
|
||||||
self.model.refresh_segment(complete=True)
|
self.model.refresh_segment(complete=True)
|
||||||
self.global_time_offset += silence_duration
|
self.global_time_offset += silence_duration + offset
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -132,14 +132,14 @@ class SimulStreamingOnlineProcessor:
|
||||||
logger.debug(f"TS-WORD:\t{start_time:.2f}\t{end_time:.2f}\t{word}")
|
logger.debug(f"TS-WORD:\t{start_time:.2f}\t{end_time:.2f}\t{word}")
|
||||||
return timestamped_words
|
return timestamped_words
|
||||||
|
|
||||||
def process_iter(self) -> Tuple[List[ASRToken], float]:
|
def process_iter(self, is_last=False) -> Tuple[List[ASRToken], float]:
|
||||||
"""
|
"""
|
||||||
Process accumulated audio chunks using SimulStreaming.
|
Process accumulated audio chunks using SimulStreaming.
|
||||||
|
|
||||||
Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
|
Returns a tuple: (list of committed ASRToken objects, float representing the audio processed up to time).
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
tokens, generation_progress = self.model.infer(is_last=self.is_last)
|
tokens, generation_progress = self.model.infer(is_last=is_last)
|
||||||
ts_words = self.timestamped_text(tokens, generation_progress)
|
ts_words = self.timestamped_text(tokens, generation_progress)
|
||||||
|
|
||||||
new_tokens = []
|
new_tokens = []
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue