From 9556d07484769e3edeca9a5b19674d8040395a79 Mon Sep 17 00:00:00 2001
From: Rodrigo <ro.goab@gmail.com>
Date: Fri, 1 Dec 2023 17:33:46 -0300
Subject: [PATCH 1/5] vad

---
 mic_test_whisper_simple.py    |  95 +++++++++++++++++++++++++++
 mic_test_whisper_streaming.py |  71 +++++++++++++++++++++
 microphone_stream.py          |  82 ++++++++++++++++++++++++
 voice_activity_controller.py  | 117 ++++++++++++++++++++++++++++++++++
 4 files changed, 365 insertions(+)
 create mode 100644 mic_test_whisper_simple.py
 create mode 100644 mic_test_whisper_streaming.py
 create mode 100644 microphone_stream.py
 create mode 100644 voice_activity_controller.py

diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py
new file mode 100644
index 0000000..58d3a8d
--- /dev/null
+++ b/mic_test_whisper_simple.py
@@ -0,0 +1,95 @@
+from microphone_stream import MicrophoneStream
+from voice_activity_controller import VoiceActivityController
+from whisper_online import *
+import numpy as np
+import librosa  
+import io
+import soundfile
+import sys
+
+
+
+
+class SimpleASRProcessor:
+
+    def __init__(self, asr, sampling_rate = 16000):
+        """run this when starting or restarting processing"""
+        self.audio_buffer = np.array([],dtype=np.float32)
+        self.prompt_buffer = ""
+        self.asr = asr
+        self.sampling_rate = sampling_rate
+        self.init_prompt = ''
+
+    def ts_words(self, segments):
+        """Return the text of all words in `segments`, concatenated in order.
+
+        Segments whose `no_speech_prob` exceeds 0.9 are skipped entirely.
+        """
+        return "".join(
+            word.word
+            for segment in segments if not segment.no_speech_prob > 0.9
+            for word in segment.words
+        )
+
+    def stream_process(self, vad_result):
+        iter_in_phrase = 0
+        for chunk, is_final in vad_result:
+            iter_in_phrase += 1
+
+            if chunk is not None:
+                sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
+                audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
+                # self.audio_buffer.append(chunk)
+                out = []
+                out.append(audio)
+                a = np.concatenate(out)
+                self.audio_buffer = np.append(self.audio_buffer, a)
+
+            if is_final and len(self.audio_buffer) > 0:
+                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                # use custom ts_words
+                tsw = self.ts_words(res)
+                self.init_prompt = self.init_prompt + tsw
+                self.init_prompt  = self.init_prompt [-100:]
+                self.audio_buffer.resize(0)
+                iter_in_phrase =0
+                yield True, tsw
+            # show progress evry 10 chunks
+            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+                res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                # use custom ts_words
+                tsw = self.ts_words(res)
+                yield False, tsw
+            
+        
+
+
+
+
+
+SAMPLING_RATE = 16000
+
+model = "large-v2"
+src_lan = "en"  # source language
+tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
+use_vad_result = True
+min_sample_length = 1 * SAMPLING_RATE
+
+
+
+vad = VoiceActivityController(use_vad_result = use_vad_result)
+asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model
+
+tokenizer = create_tokenizer(tgt_lan)
+online = SimpleASRProcessor(asr)
+
+
+stream = MicrophoneStream()
+stream = vad.detect_user_speech(stream, audio_in_int16 = False) 
+stream = online.stream_process(stream)
+
+for isFinal, text in stream:
+    if isFinal:
+        print( text,  end="\r\n")
+    else:
+        print( text,  end="\r")
diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py
new file mode 100644
index 0000000..26c0ba5
--- /dev/null
+++ b/mic_test_whisper_streaming.py
@@ -0,0 +1,71 @@
+from microphone_stream import MicrophoneStream
+from voice_activity_controller import VoiceActivityController
+from whisper_online import *
+import numpy as np
+import librosa  
+import io
+import soundfile
+import sys
+
+
+SAMPLING_RATE = 16000
+model = "large-v2"
+src_lan = "en"  # source language
+tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
+use_vad_result = True
+min_sample_length = 1 * SAMPLING_RATE
+
+
+
+asr = FasterWhisperASR(src_lan, model)  # loads and wraps Whisper model
+tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
+online = OnlineASRProcessor(asr, tokenizer)  # create processing object
+
+microphone_stream = MicrophoneStream() 
+vad = VoiceActivityController(use_vad_result = use_vad_result)
+
+complete_text = ''
+final_processing_pending = False
+out = []
+out_len = 0
+for iter in vad.detect_user_speech(microphone_stream):   # processing loop:
+    raw_bytes=  iter[0]
+    is_final =  iter[1]
+
+    if  raw_bytes:
+        sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
+        audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
+        out.append(audio)
+        out_len += len(audio)
+
+    # Feed the ASR once enough audio is buffered or the VAD marked the phrase as final.
+    if (is_final or out_len >= min_sample_length) and out_len>0:
+        a = np.concatenate(out)
+        online.insert_audio_chunk(a)    
+    # Same >= bound as the insert above: with ">" a buffer of exactly min_sample_length was inserted but not cleared, then re-inserted on the next pass.
+    if out_len >= min_sample_length:
+        o = online.process_iter()
+        print('-----'*10)
+        complete_text = complete_text + o[2]
+        print('PARTIAL - '+ complete_text) # do something with current partial output
+        print('-----'*10)     
+        out = []
+        out_len = 0   
+
+    if is_final:
+        o = online.finish()
+        online.init()   
+        # final_processing_pending = False         
+        print('-----'*10)
+        complete_text = complete_text + o[2]
+        print('FINAL - '+ complete_text) # do something with current partial output
+        print('-----'*10)   
+        out = []
+        out_len = 0    
+        
+
+
+
+
+
+
diff --git a/microphone_stream.py b/microphone_stream.py
new file mode 100644
index 0000000..c317844
--- /dev/null
+++ b/microphone_stream.py
@@ -0,0 +1,82 @@
+
+
+### mic stream
+
+import queue
+import re
+import sys
+import pyaudio
+
+
+class MicrophoneStream:
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+    ):
+        """
+        Creates a stream of audio from the microphone.
+
+        Args:
+            sample_rate: The sample rate to record audio at, in Hz. The stream is
+                always mono 16-bit PCM, and the size of each chunk read from the
+                microphone is derived from this rate (it is not configurable).
+        """
+        try:
+            import pyaudio
+        except ImportError:
+            raise Exception('py audio not installed')
+
+        self._pyaudio = pyaudio.PyAudio()
+        self.sample_rate = sample_rate
+
+        self._chunk_size = int(self.sample_rate * 0.1)
+        self._stream = self._pyaudio.open(
+            format=pyaudio.paInt16,
+            channels=1,
+            rate=sample_rate,
+            input=True,
+            frames_per_buffer=self._chunk_size,
+        )
+
+        self._open = True
+
+    def __iter__(self):
+        """
+        Returns the iterator object.
+        """
+
+        return self
+
+    def __next__(self):
+        """
+        Reads a chunk of audio from the microphone.
+        """
+        if not self._open:
+            raise StopIteration
+
+        try:
+            return self._stream.read(self._chunk_size)
+        except KeyboardInterrupt:
+            raise StopIteration
+
+    def close(self):
+        """
+        Closes the stream.
+        """
+
+        self._open = False
+
+        if self._stream.is_active():
+            self._stream.stop_stream()
+
+        self._stream.close()
+        self._pyaudio.terminate()
+
+
+
+
+
+
+
+
+
diff --git a/voice_activity_controller.py b/voice_activity_controller.py
new file mode 100644
index 0000000..d1cf031
--- /dev/null
+++ b/voice_activity_controller.py
@@ -0,0 +1,117 @@
+import torch
+import numpy as np
+# import sounddevice as sd
+import torch
+import numpy as np
+
+
+class VoiceActivityController:
+    def __init__(
+            self, 
+            sampling_rate = 16000,
+            second_ofSilence = 0.5,
+            second_ofSpeech = 0.25,
+            second_ofMinRecording = 10,
+            use_vad_result = True,
+            activity_detected_callback=None,
+        ):
+        self.activity_detected_callback=activity_detected_callback
+        self.model, self.utils = torch.hub.load(
+            repo_or_dir='snakers4/silero-vad',
+            model='silero_vad'
+        )
+        (self.get_speech_timestamps,
+        save_audio,
+        read_audio,
+        VADIterator,
+        collect_chunks) = self.utils
+
+        self.sampling_rate = sampling_rate  
+        self.silence_limit = second_ofSilence * self.sampling_rate 
+        self.speech_limit = second_ofSpeech *self.sampling_rate 
+        self.MIN_RECORDING_LENGTH =  second_ofMinRecording * self.sampling_rate 
+
+        self.use_vad_result = use_vad_result
+        self.vad_iterator = VADIterator(
+            model =self.model,
+            threshold = 0.3,
+            sampling_rate= 16000,
+            min_silence_duration_ms = 500, #100
+            speech_pad_ms = 400 #30
+        )
+        self.last_marked_chunk = None
+        
+    
+    def int2float(self, sound):
+        abs_max = np.abs(sound).max()
+        sound = sound.astype('float32')
+        if abs_max > 0:
+            sound *= 1/32768
+        sound = sound.squeeze()  # depends on the use case
+        return sound
+
+    def apply_vad(self, audio):
+        audio_float32 = self.int2float(audio)
+        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+
+        if chunk is not None:        
+            if "start" in chunk:
+                start = chunk["start"]
+                self.last_marked_chunk = chunk
+                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
+            
+            if "end" in chunk:
+                #todo: pending get the padding from the next chunk
+                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
+                self.last_marked_chunk = chunk
+                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+
+        if self.last_marked_chunk is not None:
+            if "start" in self.last_marked_chunk:
+                return audio, len(audio)  ,0
+
+            if "end" in self.last_marked_chunk:
+                return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio) 
+
+        return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0 
+
+
+
+    def detect_user_speech(self, audio_stream, audio_in_int16 = False):
+        silence_len= 0
+        speech_len = 0
+
+        for data in audio_stream:  # replace with your condition of choice
+            # if isinstance(data, EndOfTransmission):
+            #     raise EndOfTransmission("End of transmission detected")
+            
+            
+            audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
+            wav = audio_block
+            
+
+            is_final = False
+            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
+            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+
+            if speech_in_wav > 0 :
+                silence_len= 0                
+                speech_len += speech_in_wav
+                if self.activity_detected_callback is not None:
+                    self.activity_detected_callback()
+
+            silence_len = silence_len + last_silent_duration_in_wav
+            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
+                is_final = True
+                silence_len= 0
+                speech_len = 0
+            
+
+            yield voice_audio.tobytes(), is_final
+
+
+
+
+
+
+

From 3fad8133b40e0efd51f3eb810486ba7e517cd46a Mon Sep 17 00:00:00 2001
From: Rodrigo <ro.goab@gmail.com>
Date: Fri, 1 Dec 2023 18:08:43 -0300
Subject: [PATCH 2/5] delete unused var

---
 voice_activity_controller.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index d1cf031..e9083cb 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -11,7 +11,6 @@ class VoiceActivityController:
             sampling_rate = 16000,
             second_ofSilence = 0.5,
             second_ofSpeech = 0.25,
-            second_ofMinRecording = 10,
             use_vad_result = True,
             activity_detected_callback=None,
         ):
@@ -29,13 +28,12 @@ class VoiceActivityController:
         self.sampling_rate = sampling_rate  
         self.silence_limit = second_ofSilence * self.sampling_rate 
         self.speech_limit = second_ofSpeech *self.sampling_rate 
-        self.MIN_RECORDING_LENGTH =  second_ofMinRecording * self.sampling_rate 
 
         self.use_vad_result = use_vad_result
         self.vad_iterator = VADIterator(
             model =self.model,
-            threshold = 0.3,
-            sampling_rate= 16000,
+            threshold = 0.3, # 0.5
+            sampling_rate= self.sampling_rate,
             min_silence_duration_ms = 500, #100
             speech_pad_ms = 400 #30
         )

From c8c786af4fb0fdf709cb0748acf91b5001d39bbe Mon Sep 17 00:00:00 2001
From: Rodrigo <ro.goab@gmail.com>
Date: Wed, 6 Dec 2023 12:17:55 -0300
Subject: [PATCH 3/5] use of silero model instead of silero VadIterator

---
 mic_test_whisper_simple.py    |   8 +--
 mic_test_whisper_streaming.py |   2 +-
 microphone_stream.py          |   2 +-
 voice_activity_controller.py  | 106 ++++++++++++++++++----------------
 whisper_online.py             |   9 ++-
 5 files changed, 69 insertions(+), 58 deletions(-)

diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py
index 58d3a8d..63160e0 100644
--- a/mic_test_whisper_simple.py
+++ b/mic_test_whisper_simple.py
@@ -39,7 +39,6 @@ class SimpleASRProcessor:
             if chunk is not None:
                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
                 audio, _ = librosa.load(sf,sr=SAMPLING_RATE)
-                # self.audio_buffer.append(chunk)
                 out = []
                 out.append(audio)
                 a = np.concatenate(out)
@@ -47,15 +46,16 @@ class SimpleASRProcessor:
 
             if is_final and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-                # use custom ts_words
                 tsw = self.ts_words(res)
+                
                 self.init_prompt = self.init_prompt + tsw
                 self.init_prompt  = self.init_prompt [-100:]
                 self.audio_buffer.resize(0)
                 iter_in_phrase =0
+                
                 yield True, tsw
-            # show progress evry 10 chunks
-            elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+            # show progress every 50 chunks
+            elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
                 # use custom ts_words
                 tsw = self.ts_words(res)
diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py
index 26c0ba5..bd68832 100644
--- a/mic_test_whisper_streaming.py
+++ b/mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE
 
 
 
diff --git a/microphone_stream.py b/microphone_stream.py
index c317844..63d5019 100644
--- a/microphone_stream.py
+++ b/microphone_stream.py
@@ -29,7 +29,7 @@ class MicrophoneStream:
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate
 
-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40  / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,
diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index e9083cb..533daab 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -3,16 +3,27 @@ import numpy as np
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime
 
 
+def int2float(sound):
+    """Convert int16 PCM samples to float32 scaled by 1/32768, i.e. into [-1, 1)."""
+    # Scale unconditionally: a peak guard (np.abs(sound).max() > 0) raises on an empty
+    # chunk and can skip scaling because abs(-32768) overflows in int16.
+    sound = sound.astype('float32') * (1/32768)
+    sound = sound.squeeze()  # depends on the use case
+    return sound
+
 class VoiceActivityController:
     def __init__(
             self, 
             sampling_rate = 16000,
-            second_ofSilence = 0.5,
-            second_ofSpeech = 0.25,
+            min_silence_to_final_ms = 500,
+            min_speech_to_final_ms = 100,
+            min_silence_duration_ms = 100,
             use_vad_result = True,
             activity_detected_callback=None,
+            threshold =0.3
         ):
         self.activity_detected_callback=activity_detected_callback
         self.model, self.utils = torch.hub.load(
@@ -26,84 +37,77 @@ class VoiceActivityController:
         collect_chunks) = self.utils
 
         self.sampling_rate = sampling_rate  
-        self.silence_limit = second_ofSilence * self.sampling_rate 
-        self.speech_limit = second_ofSpeech *self.sampling_rate 
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 
+        self.final_speech_limit = min_speech_to_final_ms *self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
 
         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model =self.model,
-            threshold = 0.3, # 0.5
-            sampling_rate= self.sampling_rate,
-            min_silence_duration_ms = 500, #100
-            speech_pad_ms = 400 #30
-        )
         self.last_marked_chunk = None
-        
-    
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze()  # depends on the use case
-        return sound
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0
 
     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
 
-        if chunk is not None:        
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-            
-            if "end" in chunk:
-                #todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end ,len(audio) - end
+        speech_prob = self.model(x, self.sampling_rate).item()
+        
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples 
 
-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio)  ,0
 
-            if "end" in self.last_marked_chunk:
-                return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 ,len(audio) 
+        if (speech_prob >= self.threshold):
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+
+        else :
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16) , 0, window_size_samples
+
 
-        return  np.array([], dtype=np.float16) if self.use_vad_result else audio, 0 , 0 
 
 
 
     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len= 0
+        last_silence_len= 0
         speech_len = 0
 
         for data in audio_stream:  # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")
             
             
             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block
             
-
             is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)
+
 
             if speech_in_wav > 0 :
-                silence_len= 0                
+                last_silence_len= 0                
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()
 
-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len>= self.silence_limit and speech_len >= self.speech_limit:
+            last_silence_len +=  last_silent_in_wav
+            if last_silence_len>= self.final_silence_limit and speech_len >= self.final_speech_limit:
+
                 is_final = True
-                silence_len= 0
-                speech_len = 0
-            
+                last_silence_len= 0
+                speech_len = 0                
 
             yield voice_audio.tobytes(), is_final
 
diff --git a/whisper_online.py b/whisper_online.py
index 8efbbab..dc23c18 100644
--- a/whisper_online.py
+++ b/whisper_online.py
@@ -4,7 +4,7 @@ import numpy as np
 import librosa  
 from functools import lru_cache
 import time
-
+import datetime
 
 
 @lru_cache
@@ -118,14 +118,21 @@ class FasterWhisperASR(ASRBase):
         return model
 
     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+        
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe  take { (datetime.datetime.now() -tiempo_inicio)  } ms.')
+
         return list(segments)
 
     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)

From ea2a9ca2e65c1945c4607c3b5570d190923d1552 Mon Sep 17 00:00:00 2001
From: Rodrigo <ro.goab@gmail.com>
Date: Wed, 6 Dec 2023 12:52:29 -0300
Subject: [PATCH 4/5] comment out unused silero utils unpacking (VADIterator no longer used)

---
 voice_activity_controller.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index 533daab..59aceca 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -30,11 +30,11 @@ class VoiceActivityController:
             repo_or_dir='snakers4/silero-vad',
             model='silero_vad'
         )
-        (self.get_speech_timestamps,
-        save_audio,
-        read_audio,
-        VADIterator,
-        collect_chunks) = self.utils
+        # (self.get_speech_timestamps,
+        # save_audio,
+        # read_audio,
+        # VADIterator,
+        # collect_chunks) = self.utils
 
         self.sampling_rate = sampling_rate  
         self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000 

From 324dee03e7a5403232cc668e0ca07f64204a953b Mon Sep 17 00:00:00 2001
From: Rodrigo <ro.goab@gmail.com>
Date: Sat, 9 Dec 2023 17:12:43 -0300
Subject: [PATCH 5/5] vad

---
 mic_test_whisper_simple.py    | 6 +++---
 mic_test_whisper_streaming.py | 4 ++--
 voice_activity_controller.py  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mic_test_whisper_simple.py b/mic_test_whisper_simple.py
index 63160e0..3b2b61d 100644
--- a/mic_test_whisper_simple.py
+++ b/mic_test_whisper_simple.py
@@ -72,12 +72,12 @@ SAMPLING_RATE = 16000
 model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
-use_vad_result = True
+use_vad = False
 min_sample_length = 1 * SAMPLING_RATE
 
 
 
-vad = VoiceActivityController(use_vad_result = use_vad_result)
+vac = VoiceActivityController(use_vad_result = use_vad)
 asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model
 
 tokenizer = create_tokenizer(tgt_lan)
@@ -85,7 +85,7 @@ online = SimpleASRProcessor(asr)
 
 
 stream = MicrophoneStream()
-stream = vad.detect_user_speech(stream, audio_in_int16 = False) 
+stream = vac.detect_user_speech(stream, audio_in_int16 = False) 
 stream = online.stream_process(stream)
 
 for isFinal, text in stream:
diff --git a/mic_test_whisper_streaming.py b/mic_test_whisper_streaming.py
index bd68832..b427015 100644
--- a/mic_test_whisper_streaming.py
+++ b/mic_test_whisper_streaming.py
@@ -13,7 +13,7 @@ model = "large-v2"
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language  -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1.5 * SAMPLING_RATE
+min_sample_length = 1 * SAMPLING_RATE
 
 
 
@@ -54,12 +54,12 @@ for iter in vad.detect_user_speech(microphone_stream):   # processing loop:
 
     if is_final:
         o = online.finish()
-        online.init()   
         # final_processing_pending = False         
         print('-----'*10)
         complete_text = complete_text + o[2]
         print('FINAL - '+ complete_text) # do something with current partial output
         print('-----'*10)   
+        online.init()   
         out = []
         out_len = 0    
         
diff --git a/voice_activity_controller.py b/voice_activity_controller.py
index 59aceca..3ccc29a 100644
--- a/voice_activity_controller.py
+++ b/voice_activity_controller.py
@@ -76,7 +76,7 @@ class VoiceActivityController:
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return audio, 0, window_size_samples
             else:
-                return np.array([], dtype=np.float16) , 0, window_size_samples
+                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, window_size_samples