
Commit 37ccfff

✨ Events (#99)
* ✨ Events
* Fix lint
* 📚 Add Event documentation - Add sample output
1 parent b23916b commit 37ccfff

21 files changed: +631 −211 lines changed

.gitignore (+2)

@@ -138,3 +138,5 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
 
+# Draw.io
+*.drawio

README.md (+31)

@@ -65,6 +65,37 @@ You can run your scraper from terminal/shell/command-line by supplying URLs, the
 dude scrape --url "<url>" --output data.json path/to/script.py
 ```
 
+The output in `data.json` should contain the actual URL and the metadata prefixed with an underscore.
+
+```json5
+[
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 0,
+    "url": "/url-1.html"
+  },
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 1,
+    "url": "/url-2.html"
+  },
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 2,
+    "url": "/url-3.html"
+  }
+]
+```
+
 ## Features
 
 - Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
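
The underscore-prefixed keys make it possible to regroup flattened records after a run. Below is a minimal sketch (not part of the commit) that reads a `data.json` shaped like the sample above and regroups the scraped URLs by page and element group; the file name and field usage follow the sample output only.

```python
import json
from collections import defaultdict

# Hypothetical post-processing sketch: regroup records from data.json
# (as shown in the sample above) using the underscore-prefixed metadata.
with open("data.json") as f:
    records = json.load(f)

groups = defaultdict(list)
for record in records:
    # `_page_url` identifies the scraped page; `_group_index` identifies the
    # element group the record was extracted from on that page.
    groups[(record["_page_url"], record["_group_index"])].append(record["url"])

for (page_url, group_index), urls in groups.items():
    print(f"{page_url} [group {group_index}]: {urls}")
```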

docs/advanced/14_events.md (new file, +69)

# Events

Functions can be registered to be called on specific events, which makes it possible to run custom actions like setting
up databases or calling an API for authentication, and to perform additional actions on page objects
(these can be soup, driver, selector or tree objects), like taking screenshots.

Here is a diagram showing when events are executed.

![Events](../diagrams/events.png)

## Startup Event

The Startup event is executed at the start of the process.

The `@startup()` decorator can be used to register a function for startup.
This can be used to set up databases, authenticate to APIs, or handle other use cases prior to the actual web scraping.

```python
from pathlib import Path

from dude import startup

SAVE_DIR: Path


@startup()
def initialize_csv():
    global SAVE_DIR
    SAVE_DIR = Path(__file__).resolve().parent / "temp"
    SAVE_DIR.mkdir(exist_ok=True)
```

## Pre-Setup Event

The Pre-Setup event is executed after loading a page or getting an HTTP response.

The `@pre_setup()` decorator can be used to register a function for pre-setup.
Note that the function should accept one argument, which can be a page, driver, soup, Parsel selector or LXML tree.

```python
import uuid
from dude import pre_setup

...

@pre_setup()
def screenshot(page):
    unique_name = str(uuid.uuid4())
    page.screenshot(path=SAVE_DIR / f"{unique_name}.png")
```

## Post-Setup Event

The Post-Setup event is executed after running the [setup functions](https://roniemartinez.github.io/dude/advanced/01_setup.html).

The `@post_setup()` decorator can be used to register a function for post-setup.
Note that the function should accept one argument, which can be a page, driver, soup, Parsel selector or LXML tree.

```python
import uuid
from dude import post_setup

...


@post_setup()
def print_pdf(page):
    unique_name = str(uuid.uuid4())
    page.pdf(path=SAVE_DIR / f"{unique_name}.pdf")
```
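
For context, the three decorators can be combined in a single script. The following is a minimal sketch, not part of the committed docs: the URL is the demo site from the README, `run(urls=...)` follows the documented usage, and the `@select()` handlers that would do the actual extraction are omitted for brevity.

```python
from pathlib import Path
import uuid

from dude import post_setup, pre_setup, run, startup

SAVE_DIR: Path


@startup()
def create_save_dir():
    # Runs once, before any page is visited.
    global SAVE_DIR
    SAVE_DIR = Path(__file__).resolve().parent / "temp"
    SAVE_DIR.mkdir(exist_ok=True)


@pre_setup()
def screenshot(page):
    # Runs after each page is loaded, before the setup functions.
    page.screenshot(path=SAVE_DIR / f"{uuid.uuid4()}.png")


@post_setup()
def print_pdf(page):
    # Runs after the setup functions, before data is extracted.
    page.pdf(path=SAVE_DIR / f"{uuid.uuid4()}.pdf")


if __name__ == "__main__":
    # @select() handler functions are omitted here for brevity.
    run(urls=["https://dude.ron.sh/"])
```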

docs/diagrams/events.png (binary image, 99.7 KB)

docs/index.md (+31)

@@ -65,6 +65,37 @@ To start scraping, use any of the following options. Click on the annotations (+
 
 1. You can also use **dude.run()** function and run **python path/to/script.py** from terminal.
 
+The output in `data.json` should contain the actual URL and the metadata prefixed with an underscore.
+
+```json5
+[
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 0,
+    "url": "/url-1.html"
+  },
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 1,
+    "url": "/url-2.html"
+  },
+  {
+    "_page_number": 1,
+    "_page_url": "https://dude.ron.sh/",
+    "_group_id": 4502003824,
+    "_group_index": 0,
+    "_element_index": 2,
+    "url": "/url-3.html"
+  }
+]
+```
+
 ## License
 
 This project is licensed under the terms of the GNU AGPLv3+ license.

dude/__init__.py (+2 −2)

@@ -3,10 +3,10 @@
 from pathlib import Path
 from typing import Any
 
-from .context import group, run, save, select  # noqa: F401
+from .context import group, post_setup, pre_setup, run, save, select, startup  # noqa: F401
 from .scraper import Scraper  # noqa: F401
 
-__al__ = ["Scraper", "group", "run", "save", "select"]
+__al__ = ["Scraper", "group", "run", "save", "select", "startup", "pre_setup", "post_setup"]
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

dude/base.py (+120 −21)

@@ -10,6 +10,7 @@
     AsyncIterable,
     Callable,
     Coroutine,
+    DefaultDict,
     Deque,
     Dict,
     Iterable,
@@ -43,12 +44,14 @@ def __init__(
         rules: List[Rule] = None,
         groups: Dict[Callable, Selector] = None,
         save_rules: Dict[str, Any] = None,
+        events: Optional[DefaultDict] = None,
         has_async: bool = False,
         scraper: Optional["ScraperAbstract"] = None,
     ) -> None:
         self.rules: List[Rule] = rules or []
         self.groups: Dict[Callable, Selector] = groups or {}
         self.save_rules: Dict[str, Any] = save_rules or {"json": save_json}
+        self.events: DefaultDict = events or collections.defaultdict(list)
         self.has_async = has_async
         self.scraper = scraper
         self.adblock = Adblocker()
@@ -129,10 +132,8 @@ def wrapper(func: Callable) -> Union[Callable, Coroutine]:
                 navigate=navigate,
                 priority=priority,
             )
-            if not self.scraper:
-                self.rules.append(rule)
-            else:
-                self.scraper.rules.append(rule)
+            rules = self.scraper.rules if self.scraper else self.rules
+            rules.append(rule)
             return func
 
         return wrapper
@@ -198,10 +199,61 @@ def wrapper(func: Callable) -> Callable:
             if asyncio.iscoroutinefunction(func):
                 self.has_async = True
 
-            if not self.scraper:
-                self.save_rules[format] = func
-            else:
-                self.scraper.save_rules[format] = func
+            save_rules = self.scraper.save_rules if self.scraper else self.save_rules
+            save_rules[format] = func
+            return func
+
+        return wrapper
+
+    def startup(self) -> Callable:
+        """
+        Decorator to register a function to startup events.
+
+        Startup events are executed before any actual scraping happens to, for example, setup databases, etc.
+        """
+
+        def wrapper(func: Callable) -> Callable:
+            if asyncio.iscoroutinefunction(func):
+                self.has_async = True
+
+            events = self.scraper.events if self.scraper else self.events
+            events["startup"].append(func)
+            return func
+
+        return wrapper
+
+    def pre_setup(self) -> Callable:
+        """
+        Decorator to register a function to pre-setup events.
+
+        Pre-setup events are executed after a page is loaded (or HTTP response in case of HTTPX) and
+        before running the setup functions.
+        """
+
+        def wrapper(func: Callable) -> Callable:
+            if asyncio.iscoroutinefunction(func):
+                self.has_async = True
+
+            events = self.scraper.events if self.scraper else self.events
+            events["pre-setup"].append(func)
+            return func
+
+        return wrapper
+
+    def post_setup(self) -> Callable:
+        """
+        Decorator to register a function to post-setup events.
+
+        Post-setup events are executed after running the setup functions and before the actual web-scraping happens.
+        This is useful when "page clean-ups" are done in the setup functions.
+        """
+
+        def wrapper(func: Callable) -> Callable:
+            if asyncio.iscoroutinefunction(func):
+                self.has_async = True
+
+            events = self.scraper.events if self.scraper else self.events
+            events["post-setup"].append(func)
             return func
 
         return wrapper
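
All of the decorators above share the same registration pattern: the wrapper appends the decorated function to a `defaultdict(list)` keyed by event name (on the delegate scraper when one is set). Here is a simplified, standalone sketch of that pattern, using hypothetical names rather than the library's classes:

```python
import collections
from typing import Callable, DefaultDict, List

# Standalone illustration of the registry pattern above (hypothetical names,
# not the dude API): each decorator appends the wrapped function to a
# defaultdict(list) keyed by event name, so several callbacks can be
# registered per event and later run in registration order.
events: DefaultDict[str, List[Callable]] = collections.defaultdict(list)


def on(event_name: str) -> Callable:
    def wrapper(func: Callable) -> Callable:
        events[event_name].append(func)
        return func  # the decorated function is returned unchanged

    return wrapper


@on("startup")
def say_hello() -> None:
    print("starting up")


for func in events["startup"]:
    func()  # prints "starting up"
```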
@@ -220,16 +272,47 @@ def iter_urls(self) -> Iterable[str]:
        except IndexError:
            pass
 
+    def _update_rule_groups(self) -> Iterable[Rule]:
+        for rule in self.rules:
+            if rule.group:
+                yield rule
+            elif rule.handler in self.groups:
+                yield rule._replace(group=self.groups[rule.handler])
+            else:
+                yield rule._replace(group=Selector(selector=":root"))
+
+    def initialize_scraper(self, urls: Sequence[str]) -> None:
+        self.rules = [rule for rule in self._update_rule_groups()]
+        self.urls = collections.deque(urls)
+        self.allowed_domains = {urlparse(url).netloc for url in urls}
+        self.event_startup()
+
+    def event_startup(self) -> None:
+        """
+        Run all startup events
+        """
+        loop = None
+        if self.has_async:
+            loop = asyncio.get_event_loop()
+
+        for func in self.events["startup"]:
+            if asyncio.iscoroutinefunction(func):
+                assert loop is not None
+                loop.run_until_complete(func())
+            else:
+                func()
+
 
 class ScraperAbstract(ScraperBase):
     def __init__(
         self,
         rules: List[Rule] = None,
         groups: Dict[Callable, Selector] = None,
         save_rules: Dict[str, Any] = None,
+        events: Optional[DefaultDict] = None,
         has_async: bool = False,
     ) -> None:
-        super(ScraperAbstract, self).__init__(rules, groups, save_rules, has_async)
+        super(ScraperAbstract, self).__init__(rules, groups, save_rules, events, has_async)
         self.collected_data: List[ScrapedData] = []
 
     @abstractmethod
@@ -248,18 +331,6 @@ def navigate(self) -> bool:
     async def navigate_async(self) -> bool:
         raise NotImplementedError  # pragma: no cover
 
-    def update_rule_groups(self) -> None:
-        self.rules = [rule for rule in self._update_rule_groups()]
-
-    def _update_rule_groups(self) -> Iterable[Rule]:
-        for rule in self.rules:
-            if rule.group:
-                yield rule
-            elif rule.handler in self.groups:
-                yield rule._replace(group=self.groups[rule.handler])
-            else:
-                yield rule._replace(group=Selector(selector=":root"))
-
     @abstractmethod
     def collect_elements(self) -> Iterable[Tuple[str, int, int, int, Any, Callable]]:
         """
@@ -276,6 +347,34 @@ async def collect_elements_async(self) -> AsyncIterable[Tuple[str, int, int, int
         yield "", 0, 0, 0, 0, str  # HACK: mypy does not identify this as AsyncIterable  # pragma: no cover
         raise NotImplementedError  # pragma: no cover
 
+    def event_pre_setup(self, *args: Any) -> None:
+        """
+        Run all pre-setup events.
+        """
+        for func in self.events["pre-setup"]:
+            func(*args)
+
+    def event_post_setup(self, *args: Any) -> None:
+        """
+        Run all post-setup events.
+        """
+        for func in self.events["post-setup"]:
+            func(*args)
+
+    async def event_pre_setup_async(self, *args: Any) -> None:
+        """
+        Run all pre-setup events.
+        """
+        for func in self.events["pre-setup"]:
+            await func(*args)
+
+    async def event_post_setup_async(self, *args: Any) -> None:
+        """
+        Run all post-setup events.
+        """
+        for func in self.events["post-setup"]:
+            await func(*args)
+
     def extract_all(self, page_number: int, **kwargs: Any) -> Iterable[ScrapedData]:
         """
         Extracts all the data using the registered handler functions.
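
`event_startup` above has to cope with both plain functions and coroutine functions, so it grabs an event loop when async handlers are registered and drives coroutines to completion while calling synchronous callbacks directly. A standalone sketch of that dispatch logic (simplified, not the library's code):

```python
import asyncio
from typing import Any, Callable, List


# Simplified illustration of the sync/async dispatch used by event_startup:
# coroutine functions are run to completion on an event loop, plain functions
# are called directly. Names here are illustrative, not the dude API.
def run_callbacks(callbacks: List[Callable[..., Any]], *args: Any) -> None:
    loop = asyncio.new_event_loop()
    try:
        for func in callbacks:
            if asyncio.iscoroutinefunction(func):
                loop.run_until_complete(func(*args))  # await the coroutine
            else:
                func(*args)  # synchronous callback
    finally:
        loop.close()


def sync_hook() -> None:
    print("sync hook")


async def async_hook() -> None:
    print("async hook")


run_callbacks([sync_hook, async_hook])
```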

dude/context.py (+3)

@@ -10,6 +10,9 @@
 run = _scraper.run
 save = _scraper.save
 select = _scraper.select
+startup = _scraper.startup
+pre_setup = _scraper.pre_setup
+post_setup = _scraper.post_setup
 
 
 """
