aarnphm
diff --git a/BUILD.bazel b/BUILD.bazel
+1
diff --git a/examples/stream/stream.py b/examples/stream/stream.py
+24-14
diff --git a/rules/deps.bzl b/rules/deps.bzl
+3-3
diff --git a/src/whispercpp/__init__.py b/src/whispercpp/__init__.py
+23-17
diff --git a/src/whispercpp/__init__.pyi b/src/whispercpp/__init__.pyi
+3-3
@@ -128,6 +128,7 @@ pybind_extension(
     deps = [
         ":audio_lib",
         ":context_lib",
+        "@com_github_ggerganov_whisper//:common",
     ],
 )
 
 
@@ -1,26 +1,23 @@
 """Some streaming examples."""
 
+import os
 import sys
 import typing as t
 
 import whispercpp as w
 
 
 def main(**kwargs: t.Any):
+    kwargs.pop("list_audio_devices")
+    mname = kwargs.pop("model_name", os.getenv("GGML_MODEL", "tiny.en"))
     iterator: t.Iterator[str] | None = None
     try:
-        iterator = w.Whisper.from_pretrained(kwargs["model_name"]).stream_transcribe(
-            length_ms=kwargs["length_ms"],
-            device_id=kwargs["device_id"],
-            sample_rate=kwargs["sample_rate"],
-            step_ms=kwargs["step_ms"],
-            n_threads=kwargs["n_threads"],
-            no_timestamp=True,
-        )
+        iterator = w.Whisper.from_pretrained(mname).stream_transcribe(**kwargs)
     finally:
         assert iterator is not None, "Something went wrong!"
-        sys.stderr.writelines(f"- {it}\n" for it in iterator)
-        sys.stderr.write("Transcriptions:\n")
+        sys.stderr.writelines(
+            ["\nTranscription (line by line):\n"] + [f"{it}\n" for it in iterator]
+        )
         sys.stderr.flush()
 
 
@@ -46,18 +43,31 @@ def main(**kwargs: t.Any):
         help="Sample rate of the audio device",
         default=w.api.SAMPLE_RATE,
     )
+    parser.add_argument(
+        "--n_threads",
+        type=int,
+        help="Number of threads to use for decoding",
+        default=8,
+    )
     parser.add_argument(
         "--step_ms",
         type=int,
         help="Step size of the audio buffer in milliseconds",
-        default=500,
+        default=2000,
     )
     parser.add_argument(
-        "--n_threads",
+        "--keep_ms",
         type=int,
-        help="Number of threads to use for decoding",
-        default=4,
+        help="Length of the audio buffer to keep in milliseconds",
+        default=200,
+    )
+    parser.add_argument(
+        "--max_tokens",
+        type=int,
+        help="Maximum number of tokens to decode",
+        default=32,
     )
+    parser.add_argument("--audio_ctx", type=int, help="Audio context", default=0)
     parser.add_argument(
         "--list_audio_devices",
         action="store_true",
 
@@ -81,9 +81,9 @@ def internal_deps():
     http_archive(
         name = "com_github_libsdl_sdl2",
         build_file = Label("//extern:sdl2.BUILD"),
-        sha256 = "03ab539ff65f6f544969eb3fed138a3fd7224496aa8404eda5e8355877b6dca1",
-        strip_prefix = "SDL-6c495a92f0bbc5637d565b5339afa943a78108f7",
-        urls = ["https://github.com/libsdl-org/SDL/archive/6c495a92f0bbc5637d565b5339afa943a78108f7.zip"],
+        sha256 = "e2ac043bd2b67be328f875043617b904a0bb7d277ba239fe8ac6b9c94b85cbac",
+        strip_prefix = "SDL-dca3fd8307c2c9ebda8d8ea623bbbf19649f5e22",
+        urls = ["https://github.com/libsdl-org/SDL/archive/dca3fd8307c2c9ebda8d8ea623bbbf19649f5e22.zip"],
     )
 
     git_repository(
 
@@ -73,6 +73,7 @@ def __init__(self, *args: t.Any, **kwargs: t.Any):
         params: api.Params
         no_state: bool
         basedir: str | None
+        _transcript: list[str]
 
     _context_initialized: bool = False
 
@@ -112,6 +113,7 @@ def from_pretrained(
         )
         context.reset_timings()
         _context_initialized = not no_state
+        _transcript = []
         _ref.__dict__.update(locals())
         return _ref
 
@@ -149,6 +151,7 @@ def from_params(
         )
         context.reset_timings()
         _context_initialized = not no_state
+        _transcript = []
         _ref.__dict__.update(locals())
         return _ref
 
@@ -214,9 +217,8 @@ def stream_transcribe(
         device_id: int = 0,
         sample_rate: int | None = None,
         **kwargs: t.Any,
-    ) -> t.Generator[str, None, list[str]]:
-        """
-        Streaming transcription from microphone. Note that this function is blocking.
+    ) -> list[str]:
+        """Streaming transcription from microphone. Note that this function is blocking.
 
         Args:
             length_ms (int, optional): Length of audio to transcribe in milliseconds. Defaults to 10000.
@@ -227,30 +229,34 @@ def stream_transcribe(
         Returns:
             A generator of all transcripted text from given audio device.
         """
-        is_running = True
-
         if sample_rate is None:
             sample_rate = api.SAMPLE_RATE
-        length_ms = kwargs.pop("length_ms", 10000)
+        if "length_ms" not in kwargs:
+            kwargs["length_ms"] = 5000
+        if "step_ms" not in kwargs:
+            kwargs["step_ms"] = 700
+
+        if kwargs["step_ms"] < 500:
+            raise ValueError("step_ms must be >= 500")
 
-        ac = audio.AudioCapture(length_ms)
+        ac = audio.AudioCapture(kwargs["length_ms"])
         if not ac.init_device(device_id, sample_rate):
             raise RuntimeError("Failed to initialize audio capture device.")
 
+        self.params.on_new_segment(self._store_transcript_handler, self._transcript)
+
         try:
-            while is_running:
-                is_running = audio.sdl_poll_events()
-                if not is_running:
-                    break
-                ac.stream_transcribe(
-                    self.context, self.params, length_ms=length_ms, **kwargs
-                )
+            ac.stream_transcribe(self.context, self.params, **kwargs)
         except KeyboardInterrupt:
             # handled from C++
             pass
-        finally:
-            yield from ac.transcript
-            return ac.transcript
+        return self._transcript
+
+    def _store_transcript_handler(self, ctx: api.Context, n_new: int, data: list[str]):
+        segment = ctx.full_n_segments() - n_new
+        while segment < ctx.full_n_segments():
+            data.append(ctx.full_get_segment_text(segment))
+            segment += 1
 
 
 __all__ = ["Whisper", "api", "utils", "audio"]
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
+from typing import Iterator
 from typing import overload
-from typing import Generator
 from typing import TYPE_CHECKING
 
 from . import api as api
@@ -31,7 +31,7 @@ class Whisper:
         self, filename: str, num_proc: int = ..., strict: bool = ...
     ) -> str: ...
     @overload
-    def stream_transcribe(self) -> Generator[str, None, list[str]]: ...
+    def stream_transcribe(self) -> Iterator[str]: ...
     @overload
     def stream_transcribe(
         self,
@@ -40,7 +40,7 @@ class Whisper:
         device_id: int = ...,
         sample_rate: int | None = ...,
         step_ms: int = ...,
-    ) -> Generator[str, None, list[str]]: ...
+    ) -> Iterator[str]: ...
     @classmethod
     @overload
     def from_pretrained(cls, model_name: str) -> Whisper: ...
Original file line number	Diff line number	Diff line change
`@@ -128,6 +128,7 @@ pybind_extension(`
`128`	`128`	`deps = [`
`129`	`129`	`":audio_lib",`
`130`	`130`	`":context_lib",`
	`131`	`+ "@com_github_ggerganov_whisper//:common",`
`131`	`132`	`],`
`132`	`133`	`)`
`133`	`134`