feat(#132): process highlights in groups (#133)

Fixes #132
MartinBernstorff · Apr 4, 2024 · cde04e4 · cde04e4
1 parent 59ef776
commit cde04e4
Show file tree

Hide file tree

Showing 17 changed files with 121 additions and 130 deletions.
diff --git a/memorymarker/__main__.py b/memorymarker/__main__.py
@@ -23,6 +23,7 @@
     OpenAIModelCompleter,
 )
 from memorymarker.question_generator.flows.question_flow import QuestionFlow
+from memorymarker.question_generator.main import chunk_highlights
 from memorymarker.question_generator.qa_responses import QAResponses
 from memorymarker.question_generator.steps.qa_extractor import QuestionExtractionStep
 from memorymarker.question_generator.steps.qa_generation import QuestionGenerationStep
@@ -131,12 +132,17 @@ def typer_cli(
     base_completer = AnthropicCompleter(
         api_key=anthropic_api_key, model="claude-3-opus-20240229"
     )
+    chunked_highlights = (
+        highlights.groupby(lambda _: _.source_document.title)
+        .map(lambda _: chunk_highlights(_, 5))
+        .flatten()
+    )
     questions = asyncio.run(
         QuestionFlow(
             _name="simplified_reasoning",
             steps=(
                 ReasoningStep(completer=base_completer),
-                QuestionGenerationStep(completer=base_completer),
+                QuestionGenerationStep(completer=base_completer, n_questions=(1, 5)),
                 QuestionExtractionStep(
                     completer=OpenAIModelCompleter(
                         api_key=openai_api_key,
@@ -145,7 +151,7 @@ def typer_cli(
                     )
                 ),
             ),
-        )(highlights[0:max_n])
+        )(chunked_highlights[0:max_n])
     )
 
     logging.info("Writing questions to markdown...")

diff --git a/memorymarker/document_providers/base.py b/memorymarker/document_providers/base.py
@@ -7,7 +7,7 @@
 
     from iterpy.iter import Iter
 
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
     from .omnivore_document import OmnivoreDocument
 
@@ -30,5 +30,5 @@ class HighlightManager(Protocol):
 
     def get_highlights_since_update(
         self, date: "dt.datetime"
-    ) -> Sequence["ReasonedHighlight"]:
+    ) -> Sequence["Highlights"]:
         ...
diff --git a/memorymarker/document_providers/hydrator/main.py b/memorymarker/document_providers/hydrator/main.py
@@ -7,7 +7,7 @@
 from bs4 import BeautifulSoup, NavigableString, Tag
 from joblib import Memory
 
-from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+from memorymarker.question_generator.reasoned_highlight import Highlights
 
 if TYPE_CHECKING:
     from memorymarker.document_providers.base import OrphanHighlight
@@ -76,8 +76,8 @@ def __init__(self, soup_downloader: Callable[[str], BeautifulSoup]) -> None:
 
     def hydrate_highlights(
         self, highlights: Sequence["OrphanHighlight"]
-    ) -> Sequence[ReasonedHighlight | None]:
-        hydrated_highlights: list[ReasonedHighlight | None] = []
+    ) -> Sequence[Highlights | None]:
+        hydrated_highlights: list[Highlights | None] = []
         for highlight in highlights:
             try:
                 page = urlopen(highlight.uri)
@@ -91,7 +91,7 @@ def hydrate_highlights(
                 soup=soup, highlight=highlight.highlight
             )
             hydrated_highlights.append(
-                ReasonedHighlight(
+                Highlights(
                     highlighted_text=highlight.highlight,
                     prefix=context[:100],
                     suffix=context[-100:],

diff --git a/memorymarker/document_providers/omnivore_document.py b/memorymarker/document_providers/omnivore_document.py
@@ -4,7 +4,7 @@
 from pydantic import BaseModel
 
 from memorymarker.question_generator.reasoned_highlight import (
-    ReasonedHighlight,
+    Highlights,
     SourceDocument,
 )
 
@@ -19,13 +19,11 @@ class OmnivoreDocument(BaseModel):
     slug: str
     highlights: Sequence[Mapping[str, Any]]
 
-    def _parse_highlight(
-        self, highlight: Mapping[str, str]
-    ) -> ReasonedHighlight | None:
+    def _parse_highlight(self, highlight: Mapping[str, str]) -> Highlights | None:
         if "quote" not in highlight or highlight["quote"] is None:  # type: ignore
             return None
 
-        return ReasonedHighlight(
+        return Highlights(
             source_document=SourceDocument(
                 title=self.title,
                 uri=f"https://omnivore.app/me/{self.slug}#{highlight["id"]}",
@@ -41,6 +39,6 @@ def _parse_highlight(
             updated_at=highlight["updatedAt"],  # type: ignore # Will be recast on init.
         )
 
-    def get_highlights(self) -> Iter[ReasonedHighlight]:
+    def get_highlights(self) -> Iter[Highlights]:
         highlights = Iter(self.highlights).map(self._parse_highlight)
         return highlights.filter(lambda _: _ is not None)  # type: ignore
diff --git a/memorymarker/persist_questions/markdown.py b/memorymarker/persist_questions/markdown.py
@@ -5,7 +5,7 @@
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 
 def clean_filename(filename: str) -> str:
@@ -16,7 +16,7 @@ def clean_filename(filename: str) -> str:
 
 
 def highlight_group_to_file(
-    output_dir: "Path", group: tuple[str, Sequence["ReasonedHighlight"]]
+    output_dir: "Path", group: tuple[str, Sequence["Highlights"]]
 ) -> None:
     save_path = output_dir / f"{clean_filename(group[0])}.md"
 

diff --git a/memorymarker/persist_questions/test_markdown.py b/memorymarker/persist_questions/test_markdown.py
@@ -6,12 +6,12 @@
 import memorymarker.persist_questions.markdown as markdown
 from memorymarker.question_generator.qa_responses import QAPrompt
 from memorymarker.question_generator.reasoned_highlight import (
-    ReasonedHighlight,
+    Highlights,
     SourceDocument,
 )
 
 
-class FakeHydratedHighlight(ReasonedHighlight):
+class FakeHydratedHighlight(Highlights):
     source_document: SourceDocument = SourceDocument(
         title="The Hitchhiker's Guide to the Galaxy",
         uri="https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy#meaning_of_life",

diff --git a/memorymarker/question_generator/example_repo_airtable.py b/memorymarker/question_generator/example_repo_airtable.py
@@ -9,7 +9,7 @@
 if TYPE_CHECKING:
     from iterpy.iter import Iter
 
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 
 class PipelineHighlightIdentity:
@@ -64,7 +64,7 @@ def get_existing_examples(self) -> Sequence[QATableRow]:
 
 
 def update_repository(
-    new_responses: "Iter[ReasonedHighlight]", repository: AirtableExampleRepo
+    new_responses: "Iter[Highlights]", repository: AirtableExampleRepo
 ):
     for example in new_responses:
         for qa_pair in example.question_answer_pairs:

diff --git a/memorymarker/question_generator/flows/question_flow.py b/memorymarker/question_generator/flows/question_flow.py
@@ -5,7 +5,7 @@
 from iterpy.iter import Iter
 
 if TYPE_CHECKING:
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
     from memorymarker.question_generator.steps.step import FlowStep
 
 sem = asyncio.Semaphore(4)
@@ -16,19 +16,15 @@ class QuestionFlow:
     _name: str
     steps: tuple["FlowStep"]
 
-    async def _process_item(
-        self, highlight: "ReasonedHighlight"
-    ) -> "ReasonedHighlight":
+    async def _process_item(self, highlight: "Highlights") -> "Highlights":
         result = highlight
         async with sem:
             for step in self.steps:
                 result = await step(highlight)
         result.pipeline_name = self.name
         return result
 
-    async def __call__(
-        self, highlights: Iter["ReasonedHighlight"]
-    ) -> Iter["ReasonedHighlight"]:
+    async def __call__(self, highlights: Iter["Highlights"]) -> Iter["Highlights"]:
         results = await asyncio.gather(
             *[self._process_item(highlight) for highlight in highlights]
         )

diff --git a/memorymarker/question_generator/main.py b/memorymarker/question_generator/main.py
@@ -28,14 +28,14 @@
 from memorymarker.question_generator.steps.reasoning import ReasoningStep
 
 if TYPE_CHECKING:
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 omnivore_cache = Memory(".cache/omnivore")
 
 
 @dataclass(frozen=True)
 class HighlightWithPipeline(PipelineHighlightIdentity):
-    highlight: "ReasonedHighlight"
+    highlight: "Highlights"
     pipeline: QuestionFlow
 
     def identity(self) -> int:
@@ -45,7 +45,7 @@ def identity(self) -> int:
 
 
 def _generate_highlight_pipeline_pairs(
-    selected_highlights: Iter["ReasonedHighlight"], pipelines: Sequence[QuestionFlow]
+    selected_highlights: Iter["Highlights"], pipelines: Sequence[QuestionFlow]
 ) -> Iter[HighlightWithPipeline]:
     return Iter(
         [
@@ -57,9 +57,7 @@ def _generate_highlight_pipeline_pairs(
 
 
 @omnivore_cache.cache()  # type: ignore
-def _select_highlights_from_omnivore(
-    search_terms: set[str],
-) -> Iter["ReasonedHighlight"]:
+def _select_highlights_from_omnivore() -> Iter["Highlights"]:
     highlights = (
         Omnivore(
             api_key=os.getenv("OMNIVORE_API_KEY", "No OMNIVORE_API_KEY in environment")
@@ -69,25 +67,51 @@ def _select_highlights_from_omnivore(
         .flatten()
     )
 
-    selected_highlights = highlights.filter(
-        lambda _: any(term in _.highlighted_text for term in search_terms)
-    )
+    return highlights
+
+
+def chunk_highlights(
+    group: tuple[str, Sequence["Highlights"]], chunk_size: int
+) -> Sequence["Highlights"]:
+    """Merge one group's highlights into chunks of at most `chunk_size`.
+
+    Args:
+        group: A `(key, highlights)` pair. Only the highlights are read; the
+            key is ignored.
+        chunk_size: Maximum number of highlights combined into one chunk.
+
+    Returns:
+        One highlight per chunk, in input order. Each is a shallow copy of the
+        chunk's last highlight whose `highlighted_text` holds every member
+        rendered as "prefix <HIGHLIGHT>text</HIGHLIGHT> suffix", joined by a
+        "---" separator line, and whose `prefix` and `suffix` are emptied.
+        An empty group yields an empty list.
+
+    Raises:
+        ValueError: If `chunk_size` is 0 (raised by `range`).
+    """
+    import copy
+
+    highlights = group[1]
+    chunks: list["Highlights"] = []
+
+    # Step by `chunk_size` (not a fixed 5) so consecutive slices neither
+    # overlap nor skip highlights when a different size is requested.
+    for start in range(0, len(highlights), chunk_size):
+        subset = highlights[start : start + chunk_size]
+        combined_text = "\n\n---\n\n".join(
+            f"{_.prefix} <HIGHLIGHT>{_.highlighted_text}</HIGHLIGHT> {_.suffix}"
+            for _ in subset
+        )
+        # Copy before editing so the caller's highlight objects keep their
+        # original text, prefix and suffix.
+        merged = copy.copy(subset[-1])
+        merged.highlighted_text = combined_text
+        merged.prefix = ""
+        merged.suffix = ""
+        chunks.append(merged)
 
-    return selected_highlights
+    return chunks
 
 
 async def main():
     repository = AirtableExampleRepo()
-    selected_highlights = _select_highlights_from_omnivore(
-        search_terms={
-            "drenge og mænd ikke har nogen værdi",
-            "The quality of a model",
-            "Dependency injection is not effective if",
-            "The essence of writing code then is to internalize the problem domain",
-            "stack is a data structure that contains a collection of elements where you can add and delete elements from just one end ",
-            "A semaphore manages an internal counter",
-        }
+    # content_filter = {
+    #     "drenge og mænd ikke har nogen værdi",
+    #     "The quality of a model",
+    #     "Dependency injection is not effective if",
+    #     "The essence of writing code then is to internalize the problem domain",
+    #     "stack is a data structure that contains a collection of elements where you can add and delete elements from just one end ",
+    #     "A semaphore manages an internal counter",
+    # }
+    document_titles = {"Singly Linked List", "Jeg har set mit køns smerte"}
+    input_highlights = _select_highlights_from_omnivore()
+    selected_highlights = input_highlights.filter(
+        lambda _: any(title in _.source_document.title for title in document_titles)
     )
+
+    grouped_highlights = (
+        selected_highlights.groupby(lambda _: _.source_document.title)
+        .map(lambda group: chunk_highlights(group=group, chunk_size=5))
+        .flatten()
+    )
+
     old_example_hashes = (
         Iter(repository.get_existing_examples()).map(lambda _: _.__hash__()).to_list()
     )
@@ -99,13 +123,15 @@ async def main():
     #     api_key=os.getenv("OPENAI_API_KEY", None), model="gpt-4-turbo-preview"
     # )
     new_highlights = _generate_highlight_pipeline_pairs(
-        selected_highlights,
+        grouped_highlights,
         [
             QuestionFlow(
-                _name="simplified_reasoning",
+                _name="chunked_reasoning",
                 steps=(
                     ReasoningStep(completer=base_completer),
-                    QuestionGenerationStep(completer=base_completer),
+                    QuestionGenerationStep(
+                        completer=base_completer, n_questions=(1, 5)
+                    ),
                     QuestionExtractionStep(
                         completer=OpenAIModelCompleter(
                             api_key=os.getenv("OPENAI_API_KEY", "No OPENAI_API"),

diff --git a/memorymarker/question_generator/pipeline_runner.py b/memorymarker/question_generator/pipeline_runner.py
@@ -6,22 +6,20 @@
 if TYPE_CHECKING:
     from memorymarker.question_generator.flows.question_flow import QuestionFlow
     from memorymarker.question_generator.main import HighlightWithPipeline
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 
 async def run_pipeline(
     pipeline_name: str,
     pipelinename2pipeline: Mapping[str, "QuestionFlow"],
-    highlights: Sequence["ReasonedHighlight"],
-) -> Iter["ReasonedHighlight"]:
+    highlights: Sequence["Highlights"],
+) -> Iter["Highlights"]:
     pipeline = pipelinename2pipeline[pipeline_name]
     prompts = await pipeline(Iter(highlights))
     return prompts
 
 
-async def run_pipelines(
-    pairs: Iter["HighlightWithPipeline"],
-) -> Iter["ReasonedHighlight"]:
+async def run_pipelines(pairs: Iter["HighlightWithPipeline"]) -> Iter["Highlights"]:
     pipelinename2pipeline = {pair.pipeline.name: pair.pipeline for pair in pairs}
     pipelines_with_highlights = pairs.groupby(lambda _: _.pipeline.name)
 

diff --git a/memorymarker/question_generator/qa_responses.py b/memorymarker/question_generator/qa_responses.py
@@ -5,12 +5,12 @@
 from pydantic import BaseModel
 
 if TYPE_CHECKING:
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 
 @dataclass(frozen=True)
 class QAPrompt:
-    hydrated_highlight: "ReasonedHighlight | None"
+    hydrated_highlight: "Highlights | None"
     question: str
     answer: str
     title: str
@@ -25,7 +25,7 @@ class QAPromptResponseModel(BaseModel):
     question: str
     answer: str
 
-    def to_qaprompt(self, reasoned_highlight: "ReasonedHighlight") -> QAPrompt:
+    def to_qaprompt(self, reasoned_highlight: "Highlights") -> QAPrompt:
         return QAPrompt(
             hydrated_highlight=reasoned_highlight,
             question=self.question,

diff --git a/memorymarker/question_generator/reasoned_highlight.py b/memorymarker/question_generator/reasoned_highlight.py
@@ -18,7 +18,7 @@ def to_markdown_quote(text: str) -> str:
     return "\n".join(lines)
 
 
-class ReasonedHighlight(BaseModel):
+class Highlights(BaseModel):
     source_document: SourceDocument
 
     updated_at: dt.datetime

diff --git a/memorymarker/question_generator/steps/qa_extractor.py b/memorymarker/question_generator/steps/qa_extractor.py
@@ -8,7 +8,7 @@
         ModelCompleter,
     )
     from memorymarker.question_generator.qa_responses import QAResponses
-    from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
+    from memorymarker.question_generator.reasoned_highlight import Highlights
 
 
 @dataclass(frozen=True)
@@ -18,9 +18,7 @@ class QuestionExtractionStep(FlowStep):
     def identity(self) -> str:
         return f"{self.__class__.__name__}_{self.completer.identity()}"
 
-    async def __call__(
-        self, reasoned_highlight: "ReasonedHighlight"
-    ) -> "ReasonedHighlight":
+    async def __call__(self, reasoned_highlight: "Highlights") -> "Highlights":
         responses: QAResponses = await self.completer(
             f"""Extract:
 {reasoned_highlight.qa_string}