added on_recorded_chunk cb

KoljaB · Mar 16, 2024 · ba6e549 · ba6e549
1 parent e5613ca
commit ba6e549
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 17 deletions.
diff --git a/RealtimeSTT/audio_recorder.py b/RealtimeSTT/audio_recorder.py
@@ -125,6 +125,8 @@ def __init__(self,
                  on_wakeword_timeout=None,
                  on_wakeword_detection_start=None,
                  on_wakeword_detection_end=None,
+                 on_recorded_chunk=None,
+                 debug_mode=False
                  ):
         """
         Initializes an audio recorder and transcription
@@ -246,6 +248,11 @@ def __init__(self,
         - on_wakeword_detection_end (callable, default=None): Callback
             function to be called when the system stops listening for
             wake words (e.g. because of timeout or wake word detected)
+        - on_recorded_chunk (callable, default=None): Callback function to be
+            called when a chunk of audio is recorded. The function is called
+            with the recorded audio chunk as its argument.
+        - debug_mode (bool, default=False): If set to True, the system will
+            print additional debug information to the console.
 
         Raises:
             Exception: Errors related to initializing transcription
@@ -278,6 +285,7 @@ def __init__(self,
         self.on_vad_detect_stop = on_vad_detect_stop
         self.on_wakeword_detection_start = on_wakeword_detection_start
         self.on_wakeword_detection_end = on_wakeword_detection_end
+        self.on_recorded_chunk = on_recorded_chunk
         self.on_transcription_start = on_transcription_start
         self.enable_realtime_transcription = enable_realtime_transcription
         self.realtime_model_type = realtime_model_type
@@ -288,6 +296,7 @@ def __init__(self,
         self.on_realtime_transcription_stabilized = (
             on_realtime_transcription_stabilized
         )
+        self.debug_mode = debug_mode
         self.allowed_latency_limit = ALLOWED_LATENCY_LIMIT
 
         self.level = level
@@ -578,9 +587,6 @@ def _transcription_worker(conn,
                         transcription = " ".join(seg.text for seg in segments)
                         transcription = transcription.strip()
                         conn.send(('success', transcription))
-                    except faster_whisper.WhisperError as e:
-                        logging.error(f"Whisper transcription error: {e}")
-                        conn.send(('error', str(e)))
                     except Exception as e:
                         logging.error(f"General transcription error: {e}")
                         conn.send(('error', str(e)))
@@ -633,13 +639,14 @@ def _audio_data_worker(audio_queue,
 
         try:
             audio_interface = pyaudio.PyAudio()
-            stream = audio_interface.open(rate=sample_rate,
-                                          format=pyaudio.paInt16,
-                                          channels=1,
-                                          input=True,
-                                          frames_per_buffer=buffer_size,
-                                          input_device_index=input_device_index,
-                                          )
+            stream = audio_interface.open(
+                rate=sample_rate,
+                format=pyaudio.paInt16,
+                channels=1,
+                input=True,
+                frames_per_buffer=buffer_size,
+                input_device_index=input_device_index,
+                )
 
         except Exception as e:
             logging.exception("Error initializing pyaudio "
@@ -978,6 +985,8 @@ def _recording_worker(self):
                 try:
 
                     data = self.audio_queue.get()
+                    if self.on_recorded_chunk:
+                        self.on_recorded_chunk(data)
 
                     # Handle queue overflow
                     queue_overflow_logged = False
@@ -1326,10 +1335,20 @@ def _is_webrtc_speech(self, data, all_frames_must_be_true=False):
             if self.webrtc_vad_model.is_speech(frame, self.sample_rate):
                 speech_frames += 1
                 if not all_frames_must_be_true:
+                    if self.debug_mode:
+                        print(f"Speech detected in frame {i + 1}"
+                              f" of {num_frames}")
                     return True
         if all_frames_must_be_true:
+            if self.debug_mode and speech_frames == num_frames:
+                print(f"Speech detected in {speech_frames} of "
+                      f"{num_frames} frames")
+            elif self.debug_mode:
+                print(f"Speech not detected in all {num_frames} frames")
             return speech_frames == num_frames
         else:
+            if self.debug_mode:
+                print(f"Speech not detected in any of {num_frames} frames")
             return False
 
     def _check_voice_activity(self, data):

diff --git a/requirements-gpu.txt b/requirements-gpu.txt
@@ -1,5 +1,5 @@
 PyAudio==0.2.14
-faster-whisper==0.10.0
+faster-whisper==1.0.1
 pvporcupine==1.9.5
 webrtcvad==2.0.10
 halo==0.0.31
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 PyAudio==0.2.14
-faster-whisper==0.10.0
+faster-whisper==1.0.1
 pvporcupine==1.9.5
 webrtcvad==2.0.10
 halo==0.0.31

diff --git a/requirements_raw.txt b/requirements_raw.txt