Skip to content

Commit

Permalink
feat(#132): process highlights in groups (#133)
Browse files Browse the repository at this point in the history
Fixes #132
  • Loading branch information
MartinBernstorff authored Apr 4, 2024
1 parent 59ef776 commit cde04e4
Show file tree
Hide file tree
Showing 17 changed files with 121 additions and 130 deletions.
10 changes: 8 additions & 2 deletions memorymarker/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
OpenAIModelCompleter,
)
from memorymarker.question_generator.flows.question_flow import QuestionFlow
from memorymarker.question_generator.main import chunk_highlights
from memorymarker.question_generator.qa_responses import QAResponses
from memorymarker.question_generator.steps.qa_extractor import QuestionExtractionStep
from memorymarker.question_generator.steps.qa_generation import QuestionGenerationStep
Expand Down Expand Up @@ -131,12 +132,17 @@ def typer_cli(
base_completer = AnthropicCompleter(
api_key=anthropic_api_key, model="claude-3-opus-20240229"
)
chunked_highlights = (
highlights.groupby(lambda _: _.source_document.title)
.map(lambda _: chunk_highlights(_, 5))
.flatten()
)
questions = asyncio.run(
QuestionFlow(
_name="simplified_reasoning",
steps=(
ReasoningStep(completer=base_completer),
QuestionGenerationStep(completer=base_completer),
QuestionGenerationStep(completer=base_completer, n_questions=(1, 5)),
QuestionExtractionStep(
completer=OpenAIModelCompleter(
api_key=openai_api_key,
Expand All @@ -145,7 +151,7 @@ def typer_cli(
)
),
),
)(highlights[0:max_n])
)(chunked_highlights[0:max_n])
)

logging.info("Writing questions to markdown...")
Expand Down
4 changes: 2 additions & 2 deletions memorymarker/document_providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from iterpy.iter import Iter

from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights

from .omnivore_document import OmnivoreDocument

Expand All @@ -30,5 +30,5 @@ class HighlightManager(Protocol):

def get_highlights_since_update(
self, date: "dt.datetime"
) -> Sequence["ReasonedHighlight"]:
) -> Sequence["Highlights"]:
...
8 changes: 4 additions & 4 deletions memorymarker/document_providers/hydrator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bs4 import BeautifulSoup, NavigableString, Tag
from joblib import Memory

from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights

if TYPE_CHECKING:
from memorymarker.document_providers.base import OrphanHighlight
Expand Down Expand Up @@ -76,8 +76,8 @@ def __init__(self, soup_downloader: Callable[[str], BeautifulSoup]) -> None:

def hydrate_highlights(
self, highlights: Sequence["OrphanHighlight"]
) -> Sequence[ReasonedHighlight | None]:
hydrated_highlights: list[ReasonedHighlight | None] = []
) -> Sequence[Highlights | None]:
hydrated_highlights: list[Highlights | None] = []
for highlight in highlights:
try:
page = urlopen(highlight.uri)
Expand All @@ -91,7 +91,7 @@ def hydrate_highlights(
soup=soup, highlight=highlight.highlight
)
hydrated_highlights.append(
ReasonedHighlight(
Highlights(
highlighted_text=highlight.highlight,
prefix=context[:100],
suffix=context[-100:],
Expand Down
10 changes: 4 additions & 6 deletions memorymarker/document_providers/omnivore_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pydantic import BaseModel

from memorymarker.question_generator.reasoned_highlight import (
ReasonedHighlight,
Highlights,
SourceDocument,
)

Expand All @@ -19,13 +19,11 @@ class OmnivoreDocument(BaseModel):
slug: str
highlights: Sequence[Mapping[str, Any]]

def _parse_highlight(
self, highlight: Mapping[str, str]
) -> ReasonedHighlight | None:
def _parse_highlight(self, highlight: Mapping[str, str]) -> Highlights | None:
if "quote" not in highlight or highlight["quote"] is None: # type: ignore
return None

return ReasonedHighlight(
return Highlights(
source_document=SourceDocument(
title=self.title,
uri=f"https://omnivore.app/me/{self.slug}#{highlight["id"]}",
Expand All @@ -41,6 +39,6 @@ def _parse_highlight(
updated_at=highlight["updatedAt"], # type: ignore # Will be recast on init.
)

def get_highlights(self) -> Iter[ReasonedHighlight]:
def get_highlights(self) -> Iter[Highlights]:
highlights = Iter(self.highlights).map(self._parse_highlight)
return highlights.filter(lambda _: _ is not None) # type: ignore
4 changes: 2 additions & 2 deletions memorymarker/persist_questions/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
if TYPE_CHECKING:
from pathlib import Path

from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights


def clean_filename(filename: str) -> str:
Expand All @@ -16,7 +16,7 @@ def clean_filename(filename: str) -> str:


def highlight_group_to_file(
output_dir: "Path", group: tuple[str, Sequence["ReasonedHighlight"]]
output_dir: "Path", group: tuple[str, Sequence["Highlights"]]
) -> None:
save_path = output_dir / f"{clean_filename(group[0])}.md"

Expand Down
4 changes: 2 additions & 2 deletions memorymarker/persist_questions/test_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import memorymarker.persist_questions.markdown as markdown
from memorymarker.question_generator.qa_responses import QAPrompt
from memorymarker.question_generator.reasoned_highlight import (
ReasonedHighlight,
Highlights,
SourceDocument,
)


class FakeHydratedHighlight(ReasonedHighlight):
class FakeHydratedHighlight(Highlights):
source_document: SourceDocument = SourceDocument(
title="The Hitchhiker's Guide to the Galaxy",
uri="https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy#meaning_of_life",
Expand Down
4 changes: 2 additions & 2 deletions memorymarker/question_generator/example_repo_airtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
if TYPE_CHECKING:
from iterpy.iter import Iter

from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights


class PipelineHighlightIdentity:
Expand Down Expand Up @@ -64,7 +64,7 @@ def get_existing_examples(self) -> Sequence[QATableRow]:


def update_repository(
new_responses: "Iter[ReasonedHighlight]", repository: AirtableExampleRepo
new_responses: "Iter[Highlights]", repository: AirtableExampleRepo
):
for example in new_responses:
for qa_pair in example.question_answer_pairs:
Expand Down
10 changes: 3 additions & 7 deletions memorymarker/question_generator/flows/question_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from iterpy.iter import Iter

if TYPE_CHECKING:
from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights
from memorymarker.question_generator.steps.step import FlowStep

sem = asyncio.Semaphore(4)
Expand All @@ -16,19 +16,15 @@ class QuestionFlow:
_name: str
steps: tuple["FlowStep"]

async def _process_item(
self, highlight: "ReasonedHighlight"
) -> "ReasonedHighlight":
async def _process_item(self, highlight: "Highlights") -> "Highlights":
result = highlight
async with sem:
for step in self.steps:
result = await step(highlight)
result.pipeline_name = self.name
return result

async def __call__(
self, highlights: Iter["ReasonedHighlight"]
) -> Iter["ReasonedHighlight"]:
async def __call__(self, highlights: Iter["Highlights"]) -> Iter["Highlights"]:
results = await asyncio.gather(
*[self._process_item(highlight) for highlight in highlights]
)
Expand Down
70 changes: 48 additions & 22 deletions memorymarker/question_generator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@
from memorymarker.question_generator.steps.reasoning import ReasoningStep

if TYPE_CHECKING:
from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights

omnivore_cache = Memory(".cache/omnivore")


@dataclass(frozen=True)
class HighlightWithPipeline(PipelineHighlightIdentity):
highlight: "ReasonedHighlight"
highlight: "Highlights"
pipeline: QuestionFlow

def identity(self) -> int:
Expand All @@ -45,7 +45,7 @@ def identity(self) -> int:


def _generate_highlight_pipeline_pairs(
selected_highlights: Iter["ReasonedHighlight"], pipelines: Sequence[QuestionFlow]
selected_highlights: Iter["Highlights"], pipelines: Sequence[QuestionFlow]
) -> Iter[HighlightWithPipeline]:
return Iter(
[
Expand All @@ -57,9 +57,7 @@ def _generate_highlight_pipeline_pairs(


@omnivore_cache.cache() # type: ignore
def _select_highlights_from_omnivore(
search_terms: set[str],
) -> Iter["ReasonedHighlight"]:
def _select_highlights_from_omnivore() -> Iter["Highlights"]:
highlights = (
Omnivore(
api_key=os.getenv("OMNIVORE_API_KEY", "No OMNIVORE_API_KEY in environment")
Expand All @@ -69,25 +67,51 @@ def _select_highlights_from_omnivore(
.flatten()
)

selected_highlights = highlights.filter(
lambda _: any(term in _.highlighted_text for term in search_terms)
)
return highlights


def chunk_highlights(
    group: tuple[str, Sequence["Highlights"]], chunk_size: int
) -> Sequence["Highlights"]:
    """Merge one group's highlights into combined highlights of up to ``chunk_size`` each.

    Args:
        group: A ``(key, highlights)`` pair; only ``group[1]`` (the highlights)
            is read. Each highlight must expose ``prefix``, ``highlighted_text``
            and ``suffix`` attributes.
        chunk_size: Maximum number of source highlights merged into one
            combined highlight. Must be at least 1.

    Returns:
        One combined highlight per chunk, in input order. Each is a shallow
        copy of the chunk's last highlight whose ``highlighted_text`` holds
        every member rendered as
        ``"<prefix> <HIGHLIGHT><text></HIGHLIGHT> <suffix>"`` and joined by
        the separator, with ``prefix`` and ``suffix`` blanked. An empty group
        yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Function-scope import so the block does not depend on the module's
    # top-of-file import list.
    import copy

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    highlights = group[1]
    chunks: list["Highlights"] = []

    # Step by chunk_size (previously hard-coded to 5): with any other size the
    # old stride either skipped highlights or emitted overlapping chunks.
    for start in range(0, len(highlights), chunk_size):
        subset = highlights[start : start + chunk_size]
        # NOTE(review): "---n" looks like a typo for "---\n" (a horizontal
        # rule); kept byte-for-byte because the text feeds the prompt. Confirm.
        combined_text = "\n\n---n\n".join(
            f"{_.prefix} <HIGHLIGHT>{_.highlighted_text}</HIGHLIGHT> {_.suffix}"
            for _ in subset
        )
        # Copy instead of mutating the caller's object, so the last highlight
        # of every chunk keeps its original text/prefix/suffix.
        combined = copy.copy(subset[-1])
        combined.highlighted_text = combined_text
        combined.prefix = ""
        combined.suffix = ""
        chunks.append(combined)

    return chunks


async def main():
repository = AirtableExampleRepo()
selected_highlights = _select_highlights_from_omnivore(
search_terms={
"drenge og mænd ikke har nogen værdi",
"The quality of a model",
"Dependency injection is not effective if",
"The essence of writing code then is to internalize the problem domain",
"stack is a data structure that contains a collection of elements where you can add and delete elements from just one end ",
"A semaphore manages an internal counter",
}
# content_filter = {
# "drenge og mænd ikke har nogen værdi",
# "The quality of a model",
# "Dependency injection is not effective if",
# "The essence of writing code then is to internalize the problem domain",
# "stack is a data structure that contains a collection of elements where you can add and delete elements from just one end ",
# "A semaphore manages an internal counter",
# }
document_titles = {"Singly Linked List", "Jeg har set mit køns smerte"}
input_highlights = _select_highlights_from_omnivore()
selected_highlights = input_highlights.filter(
lambda _: any(title in _.source_document.title for title in document_titles)
)

grouped_highlights = (
selected_highlights.groupby(lambda _: _.source_document.title)
.map(lambda group: chunk_highlights(group=group, chunk_size=5))
.flatten()
)

old_example_hashes = (
Iter(repository.get_existing_examples()).map(lambda _: _.__hash__()).to_list()
)
Expand All @@ -99,13 +123,15 @@ async def main():
# api_key=os.getenv("OPENAI_API_KEY", None), model="gpt-4-turbo-preview"
# )
new_highlights = _generate_highlight_pipeline_pairs(
selected_highlights,
grouped_highlights,
[
QuestionFlow(
_name="simplified_reasoning",
_name="chunked_reasoning",
steps=(
ReasoningStep(completer=base_completer),
QuestionGenerationStep(completer=base_completer),
QuestionGenerationStep(
completer=base_completer, n_questions=(1, 5)
),
QuestionExtractionStep(
completer=OpenAIModelCompleter(
api_key=os.getenv("OPENAI_API_KEY", "No OPENAI_API"),
Expand Down
10 changes: 4 additions & 6 deletions memorymarker/question_generator/pipeline_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,20 @@
if TYPE_CHECKING:
from memorymarker.question_generator.flows.question_flow import QuestionFlow
from memorymarker.question_generator.main import HighlightWithPipeline
from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights


async def run_pipeline(
pipeline_name: str,
pipelinename2pipeline: Mapping[str, "QuestionFlow"],
highlights: Sequence["ReasonedHighlight"],
) -> Iter["ReasonedHighlight"]:
highlights: Sequence["Highlights"],
) -> Iter["Highlights"]:
pipeline = pipelinename2pipeline[pipeline_name]
prompts = await pipeline(Iter(highlights))
return prompts


async def run_pipelines(
pairs: Iter["HighlightWithPipeline"],
) -> Iter["ReasonedHighlight"]:
async def run_pipelines(pairs: Iter["HighlightWithPipeline"]) -> Iter["Highlights"]:
pipelinename2pipeline = {pair.pipeline.name: pair.pipeline for pair in pairs}
pipelines_with_highlights = pairs.groupby(lambda _: _.pipeline.name)

Expand Down
6 changes: 3 additions & 3 deletions memorymarker/question_generator/qa_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from pydantic import BaseModel

if TYPE_CHECKING:
from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights


@dataclass(frozen=True)
class QAPrompt:
hydrated_highlight: "ReasonedHighlight | None"
hydrated_highlight: "Highlights | None"
question: str
answer: str
title: str
Expand All @@ -25,7 +25,7 @@ class QAPromptResponseModel(BaseModel):
question: str
answer: str

def to_qaprompt(self, reasoned_highlight: "ReasonedHighlight") -> QAPrompt:
def to_qaprompt(self, reasoned_highlight: "Highlights") -> QAPrompt:
return QAPrompt(
hydrated_highlight=reasoned_highlight,
question=self.question,
Expand Down
2 changes: 1 addition & 1 deletion memorymarker/question_generator/reasoned_highlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def to_markdown_quote(text: str) -> str:
return "\n".join(lines)


class ReasonedHighlight(BaseModel):
class Highlights(BaseModel):
source_document: SourceDocument

updated_at: dt.datetime
Expand Down
6 changes: 2 additions & 4 deletions memorymarker/question_generator/steps/qa_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
ModelCompleter,
)
from memorymarker.question_generator.qa_responses import QAResponses
from memorymarker.question_generator.reasoned_highlight import ReasonedHighlight
from memorymarker.question_generator.reasoned_highlight import Highlights


@dataclass(frozen=True)
Expand All @@ -18,9 +18,7 @@ class QuestionExtractionStep(FlowStep):
def identity(self) -> str:
return f"{self.__class__.__name__}_{self.completer.identity()}"

async def __call__(
self, reasoned_highlight: "ReasonedHighlight"
) -> "ReasonedHighlight":
async def __call__(self, reasoned_highlight: "Highlights") -> "Highlights":
responses: QAResponses = await self.completer(
f"""Extract:
{reasoned_highlight.qa_string}
Expand Down
Loading

0 comments on commit cde04e4

Please sign in to comment.