@@ -25,6 +25,10 @@ class AlgoliaRecord(TypedDict):
25
25
ALGOLIA_INDEX_NAME = 'pydantic-ai-docs'
26
26
ALGOLIA_WRITE_API_KEY = os .environ .get ('ALGOLIA_WRITE_API_KEY' )
27
27
28
+ # Algolia has a limit of 100kb per record in the paid plan,
29
+ # so cap the content below that limit to leave room for the record's other fields.
30
+ MAX_CONTENT_LENGTH = 90_000
31
+
28
32
29
33
def on_page_content (html : str , page : Page , config : Config , files : Files ) -> str :
30
34
if not ALGOLIA_WRITE_API_KEY :
@@ -35,6 +39,26 @@ def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
35
39
36
40
soup = BeautifulSoup (html , 'html.parser' )
37
41
42
+ # Clean up presentational and UI elements
43
+ for element in soup .find_all (['autoref' ]):
44
+ element .decompose ()
45
+
46
+ # Remove the <details> elements, which hold the large source code embeds from GitHub
47
+ for element in soup .find_all ('details' ):
48
+ element .decompose ()
49
+
50
+ # Clean up code examples: replace each highlighted Python block with a plain <pre> holding only its code text
51
+ for extra in soup .find_all ('div' , attrs = {'class' : 'language-python highlight' }):
52
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ('code' ).get_text ()} </pre>' , 'html.parser' ))
53
+
54
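+ # Clean up code examples, part 2: replace each highlighted Python doc-signature block with a plain <pre> holding only its code text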
55
+ for extra in soup .find_all ('div' , attrs = {'class' : 'language-python doc-signature highlight' }):
56
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ('code' ).get_text ()} </pre>' , 'html.parser' ))
57
+
58
+ # The API reference generates HTML tables with line numbers; strip the line-number cell and turn each table back into a plain code block
59
+ for extra in soup .find_all ('table' , attrs = {'class' : 'highlighttable' }):
60
+ extra .replace_with (BeautifulSoup (f'<pre>{ extra .find ('code' ).get_text ()} </pre>' , 'html.parser' ))
61
+
38
62
# Find all h1 and h2 headings
39
63
headings = soup .find_all (['h1' , 'h2' ])
40
64
@@ -75,14 +99,19 @@ def on_post_build(config: Config) -> None:
75
99
76
100
client = SearchClientSync (ALGOLIA_APP_ID , ALGOLIA_WRITE_API_KEY )
77
101
78
- # temporary filter the records from the index if the content is bigger than 10k characters
79
- filtered_records = list (filter (lambda record : len (record ['content' ]) < 9000 , records ))
102
+ for record in records :
103
+ if len (record ['content' ]) > MAX_CONTENT_LENGTH :
104
+ print (
105
+ f"Record with title '{ record ['title' ]} ' has more than { MAX_CONTENT_LENGTH } characters, { len (record ['content' ])} ."
106
+ )
107
+ print (record ['content' ])
108
+
109
+ # Skip records whose content is MAX_CONTENT_LENGTH characters or longer, to stay under Algolia's 100 KB per-record limit
110
+ filtered_records = list (filter (lambda record : len (record ['content' ]) < MAX_CONTENT_LENGTH , records ))
80
111
print (f'Uploading { len (filtered_records )} out of { len (records )} records to Algolia...' )
81
112
82
- # Clear the index first
83
113
client .clear_objects (index_name = ALGOLIA_INDEX_NAME )
84
114
85
- # Execute batch operation
86
115
client .batch (
87
116
index_name = ALGOLIA_INDEX_NAME ,
88
117
batch_write_params = {'requests' : [{'action' : 'addObject' , 'body' : record } for record in filtered_records ]},
0 commit comments