Commit b8d7136

petyosi and Kludex authored
Improve Algolia indexing (pydantic#721)
Co-authored-by: Marcelo Trylesinski <marcelotryle@gmail.com>
1 parent 4725053 commit b8d7136

File tree

1 file changed: +33 -4 lines changed


docs/.hooks/algolia.py (+33 -4)

@@ -25,6 +25,10 @@ class AlgoliaRecord(TypedDict):
 ALGOLIA_INDEX_NAME = 'pydantic-ai-docs'
 ALGOLIA_WRITE_API_KEY = os.environ.get('ALGOLIA_WRITE_API_KEY')
 
+# Algolia has a limit of 100kb per record in the paid plan,
+# leave some space for the other fields as well.
+MAX_CONTENT_LENGTH = 90_000
+
 
 def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
     if not ALGOLIA_WRITE_API_KEY:
@@ -35,6 +39,26 @@ def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
 
     soup = BeautifulSoup(html, 'html.parser')
 
+    # Clean up presentational and UI elements
+    for element in soup.find_all(['autoref']):
+        element.decompose()
+
+    # this removes the large source code embeds from Github
+    for element in soup.find_all('details'):
+        element.decompose()
+
+    # Cleanup code examples
+    for extra in soup.find_all('div', attrs={'class': 'language-python highlight'}):
+        extra.replace_with(BeautifulSoup(f'<pre>{extra.find('code').get_text()}</pre>', 'html.parser'))
+
+    # Cleanup code examples, part 2
+    for extra in soup.find_all('div', attrs={'class': 'language-python doc-signature highlight'}):
+        extra.replace_with(BeautifulSoup(f'<pre>{extra.find('code').get_text()}</pre>', 'html.parser'))
+
+    # The API reference generates HTML tables with line numbers, this strips the line numbers cell and goes back to a code block
+    for extra in soup.find_all('table', attrs={'class': 'highlighttable'}):
+        extra.replace_with(BeautifulSoup(f'<pre>{extra.find('code').get_text()}</pre>', 'html.parser'))
+
     # Find all h1 and h2 headings
     headings = soup.find_all(['h1', 'h2'])
 
@@ -75,14 +99,19 @@ def on_post_build(config: Config) -> None:
 
     client = SearchClientSync(ALGOLIA_APP_ID, ALGOLIA_WRITE_API_KEY)
 
-    # temporary filter the records from the index if the content is bigger than 10k characters
-    filtered_records = list(filter(lambda record: len(record['content']) < 9000, records))
+    for record in records:
+        if len(record['content']) > MAX_CONTENT_LENGTH:
+            print(
+                f"Record with title '{record['title']}' has more than {MAX_CONTENT_LENGTH} characters, {len(record['content'])}."
+            )
+            print(record['content'])
+
+    # Filter the records from the index if the content is bigger than 100kb, Algolia limit
+    filtered_records = list(filter(lambda record: len(record['content']) < MAX_CONTENT_LENGTH, records))
     print(f'Uploading {len(filtered_records)} out of {len(records)} records to Algolia...')
 
-    # Clear the index first
     client.clear_objects(index_name=ALGOLIA_INDEX_NAME)
 
-    # Execute batch operation
     client.batch(
         index_name=ALGOLIA_INDEX_NAME,
         batch_write_params={'requests': [{'action': 'addObject', 'body': record} for record in filtered_records]},
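
For context on what the new cleanup loops do, here is a small standalone sketch. It is not part of the commit: the sample HTML and the strip_noise helper are invented for illustration, and it only assumes bs4 (BeautifulSoup) is installed. It applies the same decompose/replace_with calls used in the diff above.

from bs4 import BeautifulSoup

# Hypothetical sample page fragment: a heading, a collapsible source-code embed,
# and a syntax-highlighted example block.
SAMPLE_HTML = """
<h1>Agents</h1>
<details><summary>Source code</summary><pre>very long embed</pre></details>
<div class="language-python highlight"><code>agent.run_sync('hello')</code></div>
"""

def strip_noise(html: str) -> str:
    # Illustrative helper mirroring the cleanup steps added to on_page_content.
    soup = BeautifulSoup(html, 'html.parser')

    # Drop cross-reference tags and the large source-code embeds entirely.
    for element in soup.find_all(['autoref', 'details']):
        element.decompose()

    # Collapse highlighted code blocks to plain <pre> text so only the code itself is indexed.
    for extra in soup.find_all('div', attrs={'class': 'language-python highlight'}):
        code = extra.find('code')
        extra.replace_with(BeautifulSoup(f'<pre>{code.get_text()}</pre>', 'html.parser'))

    return str(soup)

print(strip_noise(SAMPLE_HTML))
# Prints roughly: <h1>Agents</h1> ... <pre>agent.run_sync('hello')</pre>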

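Similarly, a minimal sketch of the new size guard in on_post_build, using made-up records rather than real docs pages, shows which records get reported and which get dropped before the Algolia upload.

# Minimal sketch of the size guard; the records below are invented for illustration.
MAX_CONTENT_LENGTH = 90_000  # Algolia allows ~100kb per record; leave headroom for other fields

records = [
    {'title': 'Agents', 'content': 'short page body'},
    {'title': 'API Reference', 'content': 'x' * 120_000},  # over the limit, will be dropped
]

for record in records:
    if len(record['content']) > MAX_CONTENT_LENGTH:
        print(
            f"Record with title '{record['title']}' has more than {MAX_CONTENT_LENGTH} characters, {len(record['content'])}."
        )

filtered_records = [r for r in records if len(r['content']) < MAX_CONTENT_LENGTH]
print(f'Uploading {len(filtered_records)} out of {len(records)} records to Algolia...')
# Prints: Uploading 1 out of 2 records to Algolia...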