From 9556d07484769e3edeca9a5b19674d8040395a79 Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Fri, 1 Dec 2023 17:33:46 -0300 Subject: [PATCH 1/5] vad --- mic_test_whisper_simple.py | 95 +++++++++++++++++++++++++++ mic_test_whisper_streaming.py | 71 +++++++++++++++++++++ microphone_stream.py | 82 ++++++++++++++++++++++++ voice_activity_controller.py | 117 ++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+) create mode 100644 mic_test_whisper_simple.py create mode 100644 mic_test_whisper_streaming.py create mode 100644 microphone_stream.py create mode 100644 voice_activity_controller.py diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py new file mode 100644 index 0000000..58d3a8d --- /dev/null +++ b/mic_test_whisper_simple.py @@ -0,0 +1,95 @@ +from microphone_stream import MicrophoneStream +from voice_activity_controller import VoiceActivityController +from whisper_online import * +import numpy as np +import librosa +import io +import soundfile +import sys + + + + +class SimpleASRProcessor: + + def __init__(self, asr, sampling_rate = 16000): + """run this when starting or restarting processing""" + self.audio_buffer = np.array([],dtype=np.float32) + self.prompt_buffer = "" + self.asr = asr + self.sampling_rate = sampling_rate + self.init_prompt = '' + + def ts_words(self, segments): + result = "" + for segment in segments: + if segment.no_speech_prob > 0.9: + continue + for word in segment.words: + w = word.word + t = (word.start, word.end, w) + result +=w + return result + + def stream_process(self, vad_result): + iter_in_phrase = 0 + for chunk, is_final in vad_result: + iter_in_phrase += 1 + + if chunk is not None: + sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") + audio, _ = librosa.load(sf,sr=SAMPLING_RATE) + # self.audio_buffer.append(chunk) + out = [] + out.append(audio) + a = np.concatenate(out) + self.audio_buffer = np.append(self.audio_buffer, a) + + if is_final and len(self.audio_buffer) > 0: + res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) + # use custom ts_words + tsw = self.ts_words(res) + self.init_prompt = self.init_prompt + tsw + self.init_prompt = self.init_prompt [-100:] + self.audio_buffer.resize(0) + iter_in_phrase =0 + yield True, tsw + # show progress evry 10 chunks + elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0: + res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) + # use custom ts_words + tsw = self.ts_words(res) + yield False, tsw + + + + + + + +SAMPLING_RATE = 16000 + +model = "large-v2" +src_lan = "en" # source language +tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used +use_vad_result = True +min_sample_length = 1 * SAMPLING_RATE + + + +vad = VoiceActivityController(use_vad_result = use_vad_result) +asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model + +tokenizer = create_tokenizer(tgt_lan) +online = SimpleASRProcessor(asr) + + +stream = MicrophoneStream() +stream = vad.detect_user_speech(stream, audio_in_int16 = False) +stream = online.stream_process(stream) + +for isFinal, text in stream: + if isFinal: + print( text, end="\r\n") + else: + print( text, end="\r") diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py new file mode 100644 index 0000000..26c0ba5 --- /dev/null +++ b/mic_test_whisper_streaming.py @@ -0,0 +1,71 @@ +from microphone_stream import MicrophoneStream +from voice_activity_controller import VoiceActivityController +from whisper_online import * +import numpy as np +import librosa +import io +import soundfile +import sys + + +SAMPLING_RATE = 16000 +model = "large-v2" +src_lan = "en" # source language +tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used +use_vad_result = True +min_sample_length = 1 * SAMPLING_RATE + + + +asr = FasterWhisperASR(src_lan, model) # loads and wraps Whisper model +tokenizer = create_tokenizer(tgt_lan) # sentence segmenter for the target language +online = OnlineASRProcessor(asr, tokenizer) # create processing object + +microphone_stream = MicrophoneStream() +vad = VoiceActivityController(use_vad_result = use_vad_result) + +complete_text = '' +final_processing_pending = False +out = [] +out_len = 0 +for iter in vad.detect_user_speech(microphone_stream): # processing loop: + raw_bytes= iter[0] + is_final = iter[1] + + if raw_bytes: + sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") + audio, _ = librosa.load(sf,sr=SAMPLING_RATE) + out.append(audio) + out_len += len(audio) + + + if (is_final or out_len >= min_sample_length) and out_len>0: + a = np.concatenate(out) + online.insert_audio_chunk(a) + + if out_len > min_sample_length: + o = online.process_iter() + print('-----'*10) + complete_text = complete_text + o[2] + print('PARTIAL - '+ complete_text) # do something with current partial output + print('-----'*10) + out = [] + out_len = 0 + + if is_final: + o = online.finish() + online.init() + # final_processing_pending = False + print('-----'*10) + complete_text = complete_text + o[2] + print('FINAL - '+ complete_text) # do something with current partial output + print('-----'*10) + out = [] + out_len = 0 + + + + + + + diff --git a/microphone_stream.py b/microphone_stream.py new file mode 100644 index 0000000..c317844 --- /dev/null +++ b/microphone_stream.py @@ -0,0 +1,82 @@ + + +### mic stream + +import queue +import re +import sys +import pyaudio + + +class MicrophoneStream: + def __init__( + self, + sample_rate: int = 16000, + ): + """ + Creates a stream of audio from the microphone. + + Args: + chunk_size: The size of each chunk of audio to read from the microphone. + channels: The number of channels to record audio from. + sample_rate: The sample rate to record audio at. + """ + try: + import pyaudio + except ImportError: + raise Exception('py audio not installed') + + self._pyaudio = pyaudio.PyAudio() + self.sample_rate = sample_rate + + self._chunk_size = int(self.sample_rate * 0.1) + self._stream = self._pyaudio.open( + format=pyaudio.paInt16, + channels=1, + rate=sample_rate, + input=True, + frames_per_buffer=self._chunk_size, + ) + + self._open = True + + def __iter__(self): + """ + Returns the iterator object. + """ + + return self + + def __next__(self): + """ + Reads a chunk of audio from the microphone. + """ + if not self._open: + raise StopIteration + + try: + return self._stream.read(self._chunk_size) + except KeyboardInterrupt: + raise StopIteration + + def close(self): + """ + Closes the stream. + """ + + self._open = False + + if self._stream.is_active(): + self._stream.stop_stream() + + self._stream.close() + self._pyaudio.terminate() + + + + + + + + + diff --git a/voice_activity_controller.py b/voice_activity_controller.py new file mode 100644 index 0000000..d1cf031 --- /dev/null +++ b/voice_activity_controller.py @@ -0,0 +1,117 @@ +import torch +import numpy as np +# import sounddevice as sd +import torch +import numpy as np + + +class VoiceActivityController: + def __init__( + self, + sampling_rate = 16000, + second_ofSilence = 0.5, + second_ofSpeech = 0.25, + second_ofMinRecording = 10, + use_vad_result = True, + activity_detected_callback=None, + ): + self.activity_detected_callback=activity_detected_callback + self.model, self.utils = torch.hub.load( + repo_or_dir='snakers4/silero-vad', + model='silero_vad' + ) + (self.get_speech_timestamps, + save_audio, + read_audio, + VADIterator, + collect_chunks) = self.utils + + self.sampling_rate = sampling_rate + self.silence_limit = second_ofSilence * self.sampling_rate + self.speech_limit = second_ofSpeech *self.sampling_rate + self.MIN_RECORDING_LENGTH = second_ofMinRecording * self.sampling_rate + + self.use_vad_result = use_vad_result + self.vad_iterator = VADIterator( + model =self.model, + threshold = 0.3, + sampling_rate= 16000, + min_silence_duration_ms = 500, #100 + speech_pad_ms = 400 #30 + ) + self.last_marked_chunk = None + + + def int2float(self, sound): + abs_max = np.abs(sound).max() + sound = sound.astype('float32') + if abs_max > 0: + sound *= 1/32768 + sound = sound.squeeze() # depends on the use case + return sound + + def apply_vad(self, audio): + audio_float32 = self.int2float(audio) + chunk = self.vad_iterator(audio_float32, return_seconds=False) + + if chunk is not None: + if "start" in chunk: + start = chunk["start"] + self.last_marked_chunk = chunk + return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0 + + if "end" in chunk: + #todo: pending get the padding from the next chunk + end = chunk["end"] if chunk["end"] < len(audio) else len(audio) + self.last_marked_chunk = chunk + return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end + + if self.last_marked_chunk is not None: + if "start" in self.last_marked_chunk: + return audio, len(audio) ,0 + + if "end" in self.last_marked_chunk: + return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio) + + return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0 + + + + def detect_user_speech(self, audio_stream, audio_in_int16 = False): + silence_len= 0 + speech_len = 0 + + for data in audio_stream: # replace with your condition of choice + # if isinstance(data, EndOfTransmission): + # raise EndOfTransmission("End of transmission detected") + + + audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data + wav = audio_block + + + is_final = False + voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav) + # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}') + + if speech_in_wav > 0 : + silence_len= 0 + speech_len += speech_in_wav + if self.activity_detected_callback is not None: + self.activity_detected_callback() + + silence_len = silence_len + last_silent_duration_in_wav + if silence_len>= self.silence_limit and speech_len >= self.speech_limit: + is_final = True + silence_len= 0 + speech_len = 0 + + + yield voice_audio.tobytes(), is_final + + + + + + + From 3fad8133b40e0efd51f3eb810486ba7e517cd46a Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Fri, 1 Dec 2023 18:08:43 -0300 Subject: [PATCH 2/5] delete unused var --- voice_activity_controller.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/voice_activity_controller.py b/voice_activity_controller.py index d1cf031..e9083cb 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -11,7 +11,6 @@ class VoiceActivityController: sampling_rate = 16000, second_ofSilence = 0.5, second_ofSpeech = 0.25, - second_ofMinRecording = 10, use_vad_result = True, activity_detected_callback=None, ): @@ -29,13 +28,12 @@ class VoiceActivityController: self.sampling_rate = sampling_rate self.silence_limit = second_ofSilence * self.sampling_rate self.speech_limit = second_ofSpeech *self.sampling_rate - self.MIN_RECORDING_LENGTH = second_ofMinRecording * self.sampling_rate self.use_vad_result = use_vad_result self.vad_iterator = VADIterator( model =self.model, - threshold = 0.3, - sampling_rate= 16000, + threshold = 0.3, # 0.5 + sampling_rate= self.sampling_rate, min_silence_duration_ms = 500, #100 speech_pad_ms = 400 #30 ) From c8c786af4fb0fdf709cb0748acf91b5001d39bbe Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Wed, 6 Dec 2023 12:17:55 -0300 Subject: [PATCH 3/5] use of silero model instead of silero VadIterator --- mic_test_whisper_simple.py | 8 +-- mic_test_whisper_streaming.py | 2 +- microphone_stream.py | 2 +- voice_activity_controller.py | 106 ++++++++++++++++++---------------- whisper_online.py | 9 ++- 5 files changed, 69 insertions(+), 58 deletions(-) diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py index 58d3a8d..63160e0 100644 --- a/mic_test_whisper_simple.py +++ b/mic_test_whisper_simple.py @@ -39,7 +39,6 @@ class SimpleASRProcessor: if chunk is not None: sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW") audio, _ = librosa.load(sf,sr=SAMPLING_RATE) - # self.audio_buffer.append(chunk) out = [] out.append(audio) a = np.concatenate(out) @@ -47,15 +46,16 @@ class SimpleASRProcessor: if is_final and len(self.audio_buffer) > 0: res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) - # use custom ts_words tsw = self.ts_words(res) + self.init_prompt = self.init_prompt + tsw self.init_prompt = self.init_prompt [-100:] self.audio_buffer.resize(0) iter_in_phrase =0 + yield True, tsw - # show progress evry 10 chunks - elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0: + # show progress evry 50 chunks + elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0: res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt) # use custom ts_words tsw = self.ts_words(res) diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py index 26c0ba5..bd68832 100644 --- a/mic_test_whisper_streaming.py +++ b/mic_test_whisper_streaming.py @@ -13,7 +13,7 @@ model = "large-v2" src_lan = "en" # source language tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used use_vad_result = True -min_sample_length = 1 * SAMPLING_RATE +min_sample_length = 1.5 * SAMPLING_RATE diff --git a/microphone_stream.py b/microphone_stream.py index c317844..63d5019 100644 --- a/microphone_stream.py +++ b/microphone_stream.py @@ -29,7 +29,7 @@ class MicrophoneStream: self._pyaudio = pyaudio.PyAudio() self.sample_rate = sample_rate - self._chunk_size = int(self.sample_rate * 0.1) + self._chunk_size = int(self.sample_rate * 40 / 1000) self._stream = self._pyaudio.open( format=pyaudio.paInt16, channels=1, diff --git a/voice_activity_controller.py b/voice_activity_controller.py index e9083cb..533daab 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -3,16 +3,27 @@ import numpy as np # import sounddevice as sd import torch import numpy as np +import datetime +def int2float(sound): + abs_max = np.abs(sound).max() + sound = sound.astype('float32') + if abs_max > 0: + sound *= 1/32768 + sound = sound.squeeze() # depends on the use case + return sound + class VoiceActivityController: def __init__( self, sampling_rate = 16000, - second_ofSilence = 0.5, - second_ofSpeech = 0.25, + min_silence_to_final_ms = 500, + min_speech_to_final_ms = 100, + min_silence_duration_ms = 100, use_vad_result = True, activity_detected_callback=None, + threshold =0.3 ): self.activity_detected_callback=activity_detected_callback self.model, self.utils = torch.hub.load( @@ -26,84 +37,77 @@ class VoiceActivityController: collect_chunks) = self.utils self.sampling_rate = sampling_rate - self.silence_limit = second_ofSilence * self.sampling_rate - self.speech_limit = second_ofSpeech *self.sampling_rate + self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 + self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000 + self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 self.use_vad_result = use_vad_result - self.vad_iterator = VADIterator( - model =self.model, - threshold = 0.3, # 0.5 - sampling_rate= self.sampling_rate, - min_silence_duration_ms = 500, #100 - speech_pad_ms = 400 #30 - ) self.last_marked_chunk = None - - - def int2float(self, sound): - abs_max = np.abs(sound).max() - sound = sound.astype('float32') - if abs_max > 0: - sound *= 1/32768 - sound = sound.squeeze() # depends on the use case - return sound + self.threshold = threshold + self.reset_states() + + def reset_states(self): + self.model.reset_states() + self.temp_end = 0 + self.current_sample = 0 def apply_vad(self, audio): - audio_float32 = self.int2float(audio) - chunk = self.vad_iterator(audio_float32, return_seconds=False) + x = int2float(audio) + if not torch.is_tensor(x): + try: + x = torch.Tensor(x) + except: + raise TypeError("Audio cannot be casted to tensor. Cast it manually") - if chunk is not None: - if "start" in chunk: - start = chunk["start"] - self.last_marked_chunk = chunk - return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0 - - if "end" in chunk: - #todo: pending get the padding from the next chunk - end = chunk["end"] if chunk["end"] < len(audio) else len(audio) - self.last_marked_chunk = chunk - return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end + speech_prob = self.model(x, self.sampling_rate).item() + + window_size_samples = len(x[0]) if x.dim() == 2 else len(x) + self.current_sample += window_size_samples - if self.last_marked_chunk is not None: - if "start" in self.last_marked_chunk: - return audio, len(audio) ,0 - if "end" in self.last_marked_chunk: - return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio) + if (speech_prob >= self.threshold): + self.temp_end = 0 + return audio, window_size_samples, 0 + + else : + if not self.temp_end: + self.temp_end = self.current_sample + + if self.current_sample - self.temp_end < self.min_silence_samples: + return audio, 0, window_size_samples + else: + return np.array([], dtype=np.float16) , 0, window_size_samples + - return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0 def detect_user_speech(self, audio_stream, audio_in_int16 = False): - silence_len= 0 + last_silence_len= 0 speech_len = 0 for data in audio_stream: # replace with your condition of choice - # if isinstance(data, EndOfTransmission): - # raise EndOfTransmission("End of transmission detected") audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data wav = audio_block - is_final = False - voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav) - # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}') + voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav) + if speech_in_wav > 0 : - silence_len= 0 + last_silence_len= 0 speech_len += speech_in_wav if self.activity_detected_callback is not None: self.activity_detected_callback() - silence_len = silence_len + last_silent_duration_in_wav - if silence_len>= self.silence_limit and speech_len >= self.speech_limit: + last_silence_len += last_silent_in_wav + if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit: + is_final = True - silence_len= 0 - speech_len = 0 - + last_silence_len= 0 + speech_len = 0 yield voice_audio.tobytes(), is_final diff --git a/whisper_online.py b/whisper_online.py index 8efbbab..dc23c18 100644 --- a/whisper_online.py +++ b/whisper_online.py @@ -4,7 +4,7 @@ import numpy as np import librosa from functools import lru_cache import time - +import datetime @lru_cache @@ -118,14 +118,21 @@ class FasterWhisperASR(ASRBase): return model def transcribe(self, audio, init_prompt=""): + + # tiempo_inicio = datetime.datetime.now() # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01) segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs) + + # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take { (datetime.datetime.now() -tiempo_inicio) } ms.') + return list(segments) def ts_words(self, segments): o = [] for segment in segments: for word in segment.words: + if segment.no_speech_prob > 0.9: + continue # not stripping the spaces -- should not be merged with them! w = word.word t = (word.start, word.end, w) From ea2a9ca2e65c1945c4607c3b5570d190923d1552 Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Wed, 6 Dec 2023 12:52:29 -0300 Subject: [PATCH 4/5] use of silero model instead of silero VadIterator --- voice_activity_controller.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/voice_activity_controller.py b/voice_activity_controller.py index 533daab..59aceca 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -30,11 +30,11 @@ class VoiceActivityController: repo_or_dir='snakers4/silero-vad', model='silero_vad' ) - (self.get_speech_timestamps, - save_audio, - read_audio, - VADIterator, - collect_chunks) = self.utils + # (self.get_speech_timestamps, + # save_audio, + # read_audio, + # VADIterator, + # collect_chunks) = self.utils self.sampling_rate = sampling_rate self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 From 324dee03e7a5403232cc668e0ca07f64204a953b Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Sat, 9 Dec 2023 17:12:43 -0300 Subject: [PATCH 5/5] vad --- mic_test_whisper_simple.py | 6 +++--- mic_test_whisper_streaming.py | 4 ++-- voice_activity_controller.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py index 63160e0..3b2b61d 100644 --- a/mic_test_whisper_simple.py +++ b/mic_test_whisper_simple.py @@ -72,12 +72,12 @@ SAMPLING_RATE = 16000 model = "large-v2" src_lan = "en" # source language tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used -use_vad_result = True +use_vad = False min_sample_length = 1 * SAMPLING_RATE -vad = VoiceActivityController(use_vad_result = use_vad_result) +vac = VoiceActivityController(use_vad_result = use_vad) asr = FasterWhisperASR(src_lan, "large-v2") # loads and wraps Whisper model tokenizer = create_tokenizer(tgt_lan) @@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr) stream = MicrophoneStream() -stream = vad.detect_user_speech(stream, audio_in_int16 = False) +stream = vac.detect_user_speech(stream, audio_in_int16 = False) stream = online.stream_process(stream) for isFinal, text in stream: diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py index bd68832..b427015 100644 --- a/mic_test_whisper_streaming.py +++ b/mic_test_whisper_streaming.py @@ -13,7 +13,7 @@ model = "large-v2" src_lan = "en" # source language tgt_lan = "en" # target language -- same as source for ASR, "en" if translate task is used use_vad_result = True -min_sample_length = 1.5 * SAMPLING_RATE +min_sample_length = 1 * SAMPLING_RATE @@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream): # processing loop: if is_final: o = online.finish() - online.init() # final_processing_pending = False print('-----'*10) complete_text = complete_text + o[2] print('FINAL - '+ complete_text) # do something with current partial output print('-----'*10) + online.init() out = [] out_len = 0 diff --git a/voice_activity_controller.py b/voice_activity_controller.py index 59aceca..3ccc29a 100644 --- a/voice_activity_controller.py +++ b/voice_activity_controller.py @@ -76,7 +76,7 @@ class VoiceActivityController: if self.current_sample - self.temp_end < self.min_silence_samples: return audio, 0, window_size_samples else: - return np.array([], dtype=np.float16) , 0, window_size_samples + return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples