Skip to content

Commit 726aad9

Browse files
authored
Fix pagination issue (#31)
* fix pagination issue
* remove trailing whitespace
* avoid possibility of infinite loop
* refine handling of edge cases
* remove check for total number of artworks being 0
1 parent fc13ed6 commit 726aad9

File tree

1 file changed

+69
-12
lines changed

1 file changed

+69
-12
lines changed

artscraper/find_artworks.py

+69-12
Original file line numberDiff line numberDiff line change
@@ -209,26 +209,83 @@ def get_artist_works(self):
209209
# Find the parent element corresponding to the text heading
210210
parent_element = element.find_element('xpath', '../..')
211211

212-
# Find right arrow button
213-
right_arrow_element = parent_element.find_element('xpath', \
214-
'.//*[contains(@data-gaaction,"rightArrow")]')
212+
# Initialize total number of artworks
213+
# (set to number of artworks by artist with the most artworks)
214+
total_num_artworks = 200000
215+
216+
# Find number of artworks
217+
# Find elements with tag name 'h3'
218+
items_elements = parent_element.find_elements('tag name', 'h3')
219+
for element in items_elements:
220+
if 'items' in element.text:
221+
match = re.search(r'\d+', element.text)
222+
if match:
223+
total_num_artworks = int(match.group())
224+
break
225+
226+
# Find right arrow element
227+
def _find_right_arrow_element(parent_element):
228+
229+
right_arrow_element = parent_element.find_element('xpath', \
230+
'.//*[contains(@data-gaaction,"rightArrow")]')
231+
232+
return right_arrow_element
233+
234+
# Get list of artwork links
235+
def _get_list_links(parent_element):
236+
237+
# Find right arrow button
238+
right_arrow_element = parent_element.find_element('xpath', \
239+
'.//*[contains(@data-gaaction,"rightArrow")]')
240+
241+
# List of all elements with links to artworks
242+
elements = right_arrow_element.find_elements('xpath', \
243+
'//*[contains(@href,"/asset/")]')
244+
245+
# Get the links from the XPath elements
246+
list_links = [element.get_attribute('href') for element in elements]
247+
248+
return list_links
249+
250+
# Click on right arrow
251+
def _click_on_right_arrow(parent_element):
215252

216-
# Check if right arrow button can still be clicked
217-
while right_arrow_element.get_attribute('tabindex') is not None:
218253
# Find right arrow button
219254
right_arrow_element = parent_element.find_element('xpath', \
220255
'.//*[contains(@data-gaaction,"rightArrow")]')
221256
# Click on right arrow button
222257
self.driver.execute_script("arguments[0].click();", right_arrow_element)
223-
# Wait for page to load
224-
time.sleep(random_wait_time(min_wait=self.min_wait_time))
225258

226-
# List of all elements with links to artworks
227-
elements = right_arrow_element.find_elements('xpath', \
228-
'//*[contains(@href,"/asset/")]')
259+
list_links = _get_list_links(parent_element)
260+
261+
# Initialize the count of consecutive iterations in which the number of artworks did not change
262+
n_tries = 0
263+
264+
while (len(list_links) < total_num_artworks and n_tries < 3):
265+
266+
# Save current number of artworks
267+
old_num_artworks = len(list_links)
268+
269+
# Find right arrow element
270+
right_arrow_element = _find_right_arrow_element(parent_element)
271+
272+
# Check if right arrow button can still be clicked
273+
if right_arrow_element.get_attribute('tabindex') is not None:
274+
275+
# Click on right arrow
276+
_click_on_right_arrow(parent_element)
277+
278+
# Wait for page to load
279+
time.sleep(random_wait_time(min_wait=self.min_wait_time))
280+
281+
# Obtain new list of artworks
282+
list_links = _get_list_links(parent_element)
229283

230-
# Get the links from the XPath elements
231-
list_links = [element.get_attribute('href') for element in elements]
284+
if len(list_links) == old_num_artworks:
285+
# Count consecutive iterations in which the number of artworks did not change
286+
n_tries = n_tries + 1
287+
else:
288+
n_tries = 0
232289

233290
return list_links
234291

0 commit comments

Comments
 (0)