
Commit d19cc29

🔨 Run adblock on HTTPX request event hook (#126)
1 parent 90ceb53 commit d19cc29

7 files changed: +93 -19 lines
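
This commit moves ad blocking out of URL iteration in dude/base.py and into an HTTPX request event hook shared by the BeautifulSoup, lxml, and Parsel backends, so every outgoing request is checked rather than only the seed URLs. As a rough illustration of the mechanism (a sketch, not code from this commit), a "request" event hook registered on an httpx client can veto a request by raising before anything is sent to the network:

    # Illustrative sketch only: a request hook that blocks matching URLs.
    import httpx

    BLOCKLIST = ("ads.", "/analytics/")  # hypothetical patterns, not from the commit

    def block_if_needed(request: httpx.Request) -> None:
        url = str(request.url)
        if any(pattern in url for pattern in BLOCKLIST):
            raise httpx.RequestError(f"URL {url} has been blocked.", request=request)

    with httpx.Client(event_hooks={"request": [block_if_needed]}) as client:
        try:
            client.get("https://example.com/analytics/track")
        except httpx.RequestError as exc:
            print(exc)  # the request never left the client

In the diffs below, the hook is the bound method self._block_httpx_request_if_needed provided by the new HTTPXMixin in dude/optional/utils.py.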

dude/base.py (-3)

@@ -331,9 +331,6 @@ def iter_urls(self) -> Iterable[str]:
                 if urlparse(url).netloc not in self.allowed_domains:
                     logger.info("URL %s is not in allowed domains.", url)
                     continue
-                if self.adblock.check_network_urls(url=url, source_url=url, request_type="document"):
-                    logger.info("URL %s has been blocked.", url)
-                    continue
                 yield url
         except IndexError:
             pass

dude/optional/beautifulsoup_scraper.py (+6 -4)

@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class BeautifulSoupScraper(ScraperAbstract):
+class BeautifulSoupScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using BeautifulSoup4 parser and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -96,7 +96,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/lxml_scraper.py (+6 -4)

@@ -10,12 +10,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class LxmlScraper(ScraperAbstract):
+class LxmlScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using lxml parser backend and HTTPX for requests
     """
@@ -63,7 +63,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -99,7 +99,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/parsel_scraper.py (+6 -4)

@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class ParselScraper(ScraperAbstract):
+class ParselScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using Parsel parser backend and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
    ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -97,7 +97,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/utils.py (+18 -2)

@@ -3,6 +3,7 @@
 from typing import Optional, Tuple
 
 import httpx
+from httpx import Request
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +36,7 @@ async def async_http_get(client: httpx.AsyncClient, url: str) -> Tuple[Optional[
         response = await client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
 
@@ -49,6 +50,21 @@ def http_get(client: httpx.Client, url: str) -> Tuple[Optional[str], str]:
         response = client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
+
+
+class HTTPXMixin:
+    def _block_httpx_request_if_needed(self, request: Request) -> None:
+        url = str(request.url)
+        source_url = (
+            request.headers.get("referer") or request.headers.get("origin") or request.headers.get("host") or url
+        )
+        if self.adblock.check_network_urls(  # type: ignore
+            url=url,
+            source_url=source_url,
+            request_type=request.headers.get("sec-fetch-dest") or "other",
+        ):
+            logger.info("URL %s has been blocked.", url)
+            raise httpx.RequestError(message=f"URL {url} has been blocked.", request=request)
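
The new HTTPXMixin assumes the scraper instance carries an adblock attribute (a braveblock Adblocker), which is what check_network_urls is called on. A minimal standalone sketch of the same idea outside the mixin, with an illustrative filter rule and URLs that are not taken from this commit:

    # Hedged sketch: braveblock + an httpx request hook, outside the mixin.
    import httpx
    from braveblock import Adblocker

    adblock = Adblocker(rules=["||ads.example.com^"])  # hypothetical Adblock-style rule

    def block_request_if_needed(request: httpx.Request) -> None:
        url = str(request.url)
        source_url = request.headers.get("referer") or url
        if adblock.check_network_urls(url=url, source_url=source_url, request_type="other"):
            raise httpx.RequestError(f"URL {url} has been blocked.", request=request)

    with httpx.Client(event_hooks={"request": [block_request_if_needed]}) as client:
        try:
            client.get("https://ads.example.com/banner.js")
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            print(exc)  # blocked in the hook, before any network I/O

Because http_get and async_http_get now also catch httpx.RequestError, a blocked URL is logged as a warning and the page is simply skipped instead of failing the scrape.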

pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pydude"
-version = "0.14.0"
+version = "0.15.0"
 repository = "https://github.com/roniemartinez/dude"
 description = "dude uncomplicated data extraction"
 authors = ["Ronie Martinez <ronmarti18@gmail.com>"]

tests/test_bs4.py (+56 -1)

@@ -1,12 +1,21 @@
 import sys
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Optional
 from unittest import mock
 
 import httpx
 import pytest
+from braveblock import Adblocker
 from bs4 import BeautifulSoup
 
 from dude import Scraper
+from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper
+
+
+@pytest.fixture()
+def scraper_application_with_bs4_parser() -> Scraper:
+    scraper = BeautifulSoupScraper()
+    scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
+    return Scraper(scraper=scraper)
 
 
 @pytest.fixture()
@@ -88,6 +97,35 @@ def url(element: BeautifulSoup) -> Dict:
         return {"url": element["href"]}
 
 
+@pytest.fixture()
+def bs4_select_with_parser(scraper_application_with_bs4_parser: Scraper) -> None:
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title")
+    def title(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".title", group_css=".custom-group")
+    def empty(element: BeautifulSoup) -> Dict:
+        return {}
+
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title", url="example.com")
+    def url_dont_match(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".url", group_css=".custom-group")
+    def url(element: BeautifulSoup) -> Dict:
+        return {"url": element["href"]}
+
+
+@pytest.fixture()
+def scraper_with_parser_save(scraper_application_with_bs4_parser: Scraper, mock_database: mock.MagicMock) -> None:
+    @scraper_application_with_bs4_parser.save("custom")
+    def save_to_database(data: Any, output: Optional[str]) -> bool:
+        mock_database.save(data)
+        return True
+
+
 def test_full_flow_bs4(
     scraper_application: Scraper,
     bs4_select: None,
@@ -269,3 +307,20 @@ def test_unsupported_regex(
 
     with pytest.raises(Exception):
         scraper_application.run(urls=[test_url], pages=2, format="custom", parser="bs4")
+
+
+def test_scraper_with_parser(
+    scraper_application_with_bs4_parser: Scraper,
+    bs4_select_with_parser: None,
+    scraper_with_parser_save: None,
+    mock_database: mock.MagicMock,
+) -> None:
+    assert scraper_application_with_bs4_parser.has_async is False
+    assert scraper_application_with_bs4_parser.scraper is not None
+    assert len(scraper_application_with_bs4_parser.scraper.rules) == 4
+
+    scraper_application_with_bs4_parser.run(
+        urls=["https://dude.ron.sh/blockme.css"], pages=2, format="custom", parser="bs4"
+    )
+
+    mock_database.save.assert_not_called()
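
The new test relies on the fixture's single filter rule matching the only URL it scrapes, so the request is blocked at the hook and nothing is ever saved. A quick illustrative check of that assumption (assuming braveblock treats the bare URL as a plain matching filter, which is what the test depends on):

    # Illustrative check, not part of the commit: the fixture's rule matches the test URL.
    from braveblock import Adblocker

    adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
    assert adblock.check_network_urls(
        url="https://dude.ron.sh/blockme.css",
        source_url="https://dude.ron.sh/blockme.css",
        request_type="other",  # httpx sends no sec-fetch-dest header, so the hook falls back to "other"
    )

This is why mock_database.save.assert_not_called() passes: the blocked request surfaces as an httpx.RequestError inside http_get, which returns no content, so nothing reaches the save callback.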
