import argparse
import asyncio
import os
import pickle
from typing import List

import aiohttp
from playwright.async_api import async_playwright, Page
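
# Overview (documentation comment added for clarity; behavior is unchanged):
# workers share an asyncio.LifoQueue of (link, path) pairs. Pages for
# intermediate regions list their children in a table, and each child is
# pushed back onto the queue; paths containing 'tps'/'pos' mark leaf pages
# (TPS = polling station), where the scanned "Form Pindai" images are
# downloaded. ref_queue mirrors the queue's contents and is pickled to
# '.queue.cache' after every item, so an interrupted run can be restarted
# with --resume.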


async def worker(worker_id, page: Page, queue: asyncio.Queue, ref_queue: List, BASE_URL: str):
    while True:
        link, path = await queue.get()
        # Mirror the LIFO pop in ref_queue so the on-disk cache tracks pending
        # work. With several workers this pairing is racy, so the cache is a
        # best-effort snapshot rather than an exact record.
        ref_queue.pop()
        print(worker_id, '[PROCESSING]', BASE_URL + link)
        await page.goto(BASE_URL + link, wait_until='networkidle')
        print(worker_id, '[LOADED]', BASE_URL + link)
        if 'tps' in path.lower() or 'pos' in path.lower():
            # Leaf page (polling station): save the link and download the
            # scanned tally-form images behind the "Form Pindai" button.
            os.makedirs(path, exist_ok=True)
            with open(os.path.join(path, 'link.txt'), 'w') as file:
                file.write(BASE_URL + link)
            btn = page.locator('button.btn.btn-dark.float-end', has_text='Form Pindai')
            await btn.click()
            try:
                await page.wait_for_selector('div.card-body div.row div.col-md-4')
                images = await page.query_selector_all('div.card-body div.row div.col-md-4 a img')
                for i, image in enumerate(images):
                    img_link = await image.get_attribute('src')
                    ext = img_link.split('.')[-1]
                    async with aiohttp.ClientSession() as session:
                        async with session.get(img_link) as response:
                            if response.status == 200:
                                with open(os.path.join(path, f'{i}.{ext}'), 'wb') as file:
                                    file.write(await response.read())
                            else:
                                print('Failed to download image', img_link)
            except Exception as e:
                print('Error on', BASE_URL + link, ', Path:', path)
                print(e)
        else:
            # Intermediate page: every row of the region table links to a
            # child region; enqueue each one for another worker to process.
            items = await page.query_selector_all('table.table.table-hover tr')
            for item in items:
                data = await item.query_selector('td a')
                if data is None:
                    continue
                text = await data.inner_text()
                next_link = await data.get_attribute('href')
                next_path = os.path.join(path, text)
                queue.put_nowait((next_link, next_path))
                ref_queue.append((next_link, next_path))
        queue.task_done()
        # Persist the remaining work so a crashed run can restart via --resume.
        with open('.queue.cache', 'wb') as file:
            pickle.dump(ref_queue, file)
        print(worker_id, '[DONE]', BASE_URL + link)


async def main(base_url: str, start_url: str, output: str, timeout: int, workers: int, headless: bool, resume: bool):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        context = await browser.new_context()
        context.set_default_timeout(timeout)
        # One Playwright page per worker so navigations run concurrently.
        pages = [await context.new_page() for _ in range(workers)]
        if resume and os.path.exists('.queue.cache'):
            print('Resuming previous session')
            with open('.queue.cache', 'rb') as file:
                ref_queue = pickle.load(file)
        else:
            ref_queue = [(start_url, output)]
        # A LIFO queue makes the crawl depth-first, so leaf (TPS) pages are
        # reached and saved early instead of after the full breadth of regions.
        queue = asyncio.LifoQueue()
        for pending_link, pending_path in ref_queue:
            queue.put_nowait((pending_link, pending_path))
        tasks = [asyncio.create_task(worker(f'Worker-{i}:', pages[i], queue, ref_queue, base_url))
                 for i in range(workers)]
        await queue.join()
        # Workers loop forever; cancel them explicitly once the queue drains.
        for task in tasks:
            task.cancel()
        await browser.close()


if __name__ == '__main__':
    BASE_URL = 'https://pemilu2024.kpu.go.id'
    parser = argparse.ArgumentParser(description='Indonesia Pemilu 2024 Scraper')
    parser.add_argument('--start-url', type=str, help=f'Starting URL. Base is {BASE_URL}', default='/pilpres/hitung-suara')
    parser.add_argument('--output', type=str, help='Output directory', default='results')
    parser.add_argument('--timeout', type=int, help='Timeout for each request in milliseconds', default=3000)
    parser.add_argument('--workers', type=int, help='Total concurrent workers', default=15)
    parser.add_argument('--headless', action='store_true', help='Run in headless mode')
    parser.add_argument('--resume', action='store_true', help='Resume previous session if available')
    args = parser.parse_args()
    print(args)
    asyncio.run(main(BASE_URL, args.start_url, args.output, args.timeout, args.workers, args.headless, args.resume))
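
# Example invocation (a sketch using only the flags defined above; the
# --start-url, --output, and --workers values shown are the defaults):
#   python main.py --start-url /pilpres/hitung-suara --output results --workers 15 --headless
# Rerunning with --resume picks up the pending (link, path) pairs saved in
# '.queue.cache' instead of starting over from --start-url.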