@@ -209,26 +209,83 @@ def get_artist_works(self):
209
209
# Find the parent element corresponding to the text heading
210
210
parent_element = element .find_element ('xpath' , '../..' )
211
211
212
- # Find right arrow button
213
- right_arrow_element = parent_element .find_element ('xpath' , \
214
- './/*[contains(@data-gaaction,"rightArrow")]' )
212
+ # Initialize total number of artworks
213
+ # (set to number of artworks by artist with the most artworks)
214
+ total_num_artworks = 200000
215
+
216
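+ # Find the total number of artworks (parsed from the h3 heading whose text contains 'items')
+ # NOTE(review): r'\d+' matches only the first run of digits, so a count rendered as "1,234 items" would parse as 1 - confirm the heading format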
217
+ # Find elements with tag name 'h3'
218
+ items_elements = parent_element .find_elements ('tag name' , 'h3' )
219
+ for element in items_elements :
220
+ if 'items' in element .text :
221
+ match = re .search (r'\d+' , element .text )
222
+ if match :
223
+ total_num_artworks = int (match .group ())
224
+ break
225
+
226
+ # Find right arrow element
227
def _find_right_arrow_element(parent_element):
    """Return the right-arrow control located under *parent_element*.

    Args:
        parent_element: Element exposing ``find_element(by, value)``
            (presumably a Selenium WebElement for the carousel container).

    Returns:
        The first descendant whose ``data-gaaction`` attribute contains
        ``"rightArrow"``.

    Raises:
        Whatever ``parent_element.find_element`` raises when nothing matches.
    """
    # The leading ".//" keeps the search relative to parent_element.
    return parent_element.find_element(
        'xpath', './/*[contains(@data-gaaction,"rightArrow")]')
233
+
234
+ # Get list of artwork links
235
def _get_list_links(parent_element):
    """Return the ``href`` of every element that links to an artwork asset.

    Args:
        parent_element: Element exposing ``find_element(by, value)``; it is
            used to locate the right-arrow control.

    Returns:
        A list with the ``href`` attribute of each element whose ``href``
        contains ``"/asset/"``, in the order the lookup returns them.

    Raises:
        Whatever ``parent_element.find_element`` raises when no right-arrow
        control exists under *parent_element*.
    """
    # Find right arrow button (search is relative to parent_element).
    right_arrow_element = parent_element.find_element(
        'xpath', './/*[contains(@data-gaaction,"rightArrow")]')

    # NOTE(review): this XPath starts with "//" rather than ".//", so per
    # XPath semantics it is evaluated from the document root and is not
    # restricted to right_arrow_element - confirm the page-wide search is
    # intended before narrowing it.
    elements = right_arrow_element.find_elements(
        'xpath', '//*[contains(@href,"/asset/")]')

    # Get the links from the matched elements.
    return [element.get_attribute('href') for element in elements]
249
+
250
+ # Click on right arrow
251
def _click_on_right_arrow(parent_element):
    """Click the right-arrow control located under *parent_element*.

    The click is issued by running ``arguments[0].click();`` through
    ``self.driver.execute_script``; ``self`` is taken from the enclosing
    ``get_artist_works(self)`` scope, so this helper is only usable as a
    nested function of that method.

    Args:
        parent_element: Element exposing ``find_element(by, value)``.

    Returns:
        None.

    Raises:
        Whatever ``parent_element.find_element`` raises when no right-arrow
        control exists under *parent_element*.
    """
    # Find right arrow button (search is relative to parent_element).
    right_arrow_element = parent_element.find_element(
        'xpath', './/*[contains(@data-gaaction,"rightArrow")]')

    # Click on right arrow button via JavaScript.
    self.driver.execute_script("arguments[0].click();", right_arrow_element)
223
- # Wait for page to load
224
- time .sleep (random_wait_time (min_wait = self .min_wait_time ))
225
258
226
- # List of all elements with links to artworks
227
- elements = right_arrow_element .find_elements ('xpath' , \
228
- '//*[contains(@href,"/asset/")]' )
259
+ list_links = _get_list_links (parent_element )
260
+
261
+ # Initialize count of number of iterations for which the number of artworks remains the same
262
+ n_tries = 0
263
+
264
+ while (len (list_links ) < total_num_artworks and n_tries < 3 ):
265
+
266
+ # Save current number of artworks
267
+ old_num_artworks = len (list_links )
268
+
269
+ # Find right arrow element
270
+ right_arrow_element = _find_right_arrow_element (parent_element )
271
+
272
+ # Check if right arrow button can still be clicked
273
+ if right_arrow_element .get_attribute ('tabindex' ) is not None :
274
+
275
+ # Click on right arrow
276
+ _click_on_right_arrow (parent_element )
277
+
278
+ # Wait for page to load
279
+ time .sleep (random_wait_time (min_wait = self .min_wait_time ))
280
+
281
+ # Obtain new list of artworks
282
+ list_links = _get_list_links (parent_element )
229
283
230
- # Get the links from the XPath elements
231
- list_links = [element .get_attribute ('href' ) for element in elements ]
284
+ if len (list_links ) == old_num_artworks :
285
+ # Count number of iterations for which the number of artworks remains the same
286
+ n_tries = n_tries + 1
287
+ else :
288
+ n_tries = 0
232
289
233
290
return list_links
234
291
0 commit comments