Skip to content

Commit f0cf10a

Browse files
✨ Block ads (#74)
* ✨ Block ads * Block ads in Pyppeteer * Block ads in Selenium - Use Selenium-Wire - Expose WebDriver in Setup and Navigate handlers * Update grouping description * Bump version * Tests
1 parent babff66 commit f0cf10a

14 files changed

+475
-43
lines changed

.github/workflows/python.yml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ on:
1010
pull_request:
1111
branches:
1212
- master
13+
workflow_dispatch:
1314

1415
concurrency:
1516
group: ${{ github.ref }}

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ dude scrape --url "<url>" --output data.json path/to/script.py
6969

7070
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
7171
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
72-
- Data grouping - group related scraping data.
72+
- Data grouping - group related results.
7373
- URL pattern matching - run functions on specific URLs.
7474
- Priority - reorder functions based on priority.
7575
- Setup function - enable setup steps (clicking dialogs or login).

docs/advanced/01_setup.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Setup handlers could perform any of the following:
77
2. Click on dialog buttons
88

99
To create a Setup handler, you can pass `setup=True` parameter to `@select()` decorator.
10-
The only difference with Setup and normal element handler is that setup functions should accept 2 parameters, the element matched by the selector and the Page object.
10+
The only difference between a Setup handler and a normal element handler is that Setup functions should accept 2 parameters: the element matched by the selector and the Page object (or the WebDriver object in Selenium).
1111
Click on the annotations (+ sign) for more details.
1212

1313
=== "Python"

docs/advanced/02_navigate.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Navigate handlers are used to move from page to page.
44

55
To create a Navigate handler, you can pass `navigate=True` parameter to `@select()` decorator.
6-
Like Setup handlers, Navigate handlers should accept 2 parameters, the element matched by the selector and the Page object.
6+
Like Setup handlers, Navigate handlers should accept 2 parameters, the element matched by the selector and the Page object (or WebDriver object in Selenium).
77
Click on the annotations (+ sign) for more details.
88

99
=== "Python"

docs/features.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
44
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
5-
- Data grouping - group related scraping data.
5+
- Data grouping - group related results.
66
- URL pattern matching - run functions on specific URLs.
77
- Priority - reorder functions based on priority.
88
- Setup function - enable setup steps (clicking dialogs or login).

dude/optional/pyppeteer_scraper.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import logging
44
from typing import Any, AsyncIterable, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
55

6+
from braveblock import Adblocker
67
from pyppeteer import launch
78
from pyppeteer.element_handle import ElementHandle
9+
from pyppeteer.network_manager import Request
810
from pyppeteer.page import Page
911

1012
from ..base import ScraperAbstract
@@ -19,6 +21,10 @@ class PyppeteerScraper(ScraperAbstract):
1921
Pyppeteer-based scraper
2022
"""
2123

24+
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """
    Forward all arguments to the parent scraper, then create the ad blocker.

    The Adblocker instance is stored on ``self.adblock`` and is consulted by
    ``_block_url_if_needed`` for every intercepted request.
    """
    super().__init__(*args, **kwargs)
    self.adblock = Adblocker()
27+
2228
def run(
2329
self,
2430
urls: Sequence[str],
@@ -97,6 +103,21 @@ async def navigate_async(self, page: Page = None) -> bool:
97103
return True
98104
return False
99105

106+
async def _block_url_if_needed(self, request: Request) -> Any:
    """
    Abort the intercepted request if the ad blocker flags it, otherwise let it continue.

    The source URL handed to the ad blocker is the first non-empty value among the
    "referer", "origin" and "host" request headers; when none is set, the request URL
    itself is used.

    :param request: Intercepted Pyppeteer request.
    :return: Whatever ``request.abort()`` or ``request.continue_()`` resolves to.
    """
    target_url = request.url
    headers = request.headers
    origin_url = headers.get("referer") or headers.get("origin") or headers.get("host") or target_url

    should_block = self.adblock.check_network_urls(
        url=target_url,
        source_url=origin_url,
        request_type=request.resourceType,
    )
    if not should_block:
        return await request.continue_()

    logger.info("URL %s has been blocked.", target_url)
    return await request.abort()
120+
100121
async def _run_async(
101122
self,
102123
urls: Sequence[str],
@@ -106,7 +127,7 @@ async def _run_async(
106127
output: Optional[str],
107128
format: str,
108129
) -> None:
109-
launch_args: Dict[str, Any] = {"headless": headless, "args": ["--no-sandbox", "--disable-notifications"]}
130+
launch_args: Dict[str, Any] = {"headless": headless, "args": ["--disable-notifications"]}
110131
if proxy:
111132
launch_args["args"] = [f"--proxy-server={proxy['server']}"]
112133

@@ -116,6 +137,9 @@ async def _run_async(
116137
if proxy and proxy["username"] and proxy["password"]:
117138
await page.authenticate(credentials={"username": proxy["username"], "password": proxy["password"]})
118139

140+
await page.setRequestInterception(True)
141+
page.on("request", lambda res: asyncio.ensure_future(self._block_url_if_needed(res)))
142+
119143
for url in urls:
120144
await page.goto(url)
121145
logger.info("Loaded page %s", page.url)

dude/optional/selenium_scraper.py

+41-18
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@
44
import os
55
from typing import Any, AsyncIterable, Callable, Iterable, Optional, Sequence, Tuple, Union
66

7-
from selenium import webdriver
7+
from braveblock import Adblocker
88
from selenium.webdriver.chrome.options import Options as ChromeOptions
99
from selenium.webdriver.chrome.service import Service as ChromeService
1010
from selenium.webdriver.common.by import By
1111
from selenium.webdriver.firefox.options import Options as FirefoxOptions
1212
from selenium.webdriver.firefox.service import Service as FirefoxService
1313
from selenium.webdriver.remote.webdriver import WebDriver
1414
from selenium.webdriver.remote.webelement import WebElement
15+
from seleniumwire.request import Request
16+
from seleniumwire.webdriver import Chrome, Firefox
1517
from webdriver_manager.chrome import ChromeDriverManager
1618
from webdriver_manager.firefox import GeckoDriverManager
1719
from webdriver_manager.utils import ChromeType
@@ -27,6 +29,10 @@ class SeleniumScraper(ScraperAbstract):
2729
Selenium-based scraper
2830
"""
2931

32+
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """
    Forward all arguments to the parent scraper, then create the ad blocker.

    The Adblocker instance is stored on ``self.adblock`` and is consulted by
    ``_block_url_if_needed`` for every intercepted request.
    """
    super().__init__(*args, **kwargs)
    self.adblock = Adblocker()
35+
3036
def run(
3137
self,
3238
urls: Sequence[str],
@@ -100,7 +106,7 @@ def setup(self, driver: WebDriver = None) -> None:
100106
assert driver is not None
101107
for rule in self.get_setup_rules(driver.current_url):
102108
for element in self._get_elements(driver, rule.selector):
103-
rule.handler(element)
109+
rule.handler(element, driver)
104110

105111
async def setup_async(self, driver: WebDriver = None) -> None:
106112
"""
@@ -112,9 +118,9 @@ async def setup_async(self, driver: WebDriver = None) -> None:
112118
for rule in self.get_setup_rules(driver.current_url):
113119
for element in self._get_elements(driver, rule.selector):
114120
if asyncio.iscoroutinefunction(rule.handler):
115-
await rule.handler(element)
121+
await rule.handler(element, driver)
116122
else:
117-
rule.handler(element)
123+
rule.handler(element, driver)
118124

119125
def navigate(self, driver: WebDriver = None) -> bool:
120126
"""
@@ -125,7 +131,7 @@ def navigate(self, driver: WebDriver = None) -> bool:
125131
assert driver is not None
126132
for rule in self.get_navigate_rules(driver.current_url):
127133
for element in self._get_elements(driver, rule.selector):
128-
rule.handler(element)
134+
rule.handler(element, driver)
129135
logger.info("Navigated to %s", driver.current_url)
130136
return True
131137
return False
@@ -140,9 +146,9 @@ async def navigate_async(self, driver: WebDriver = None) -> bool:
140146
for rule in self.get_navigate_rules(driver.current_url):
141147
for element in self._get_elements(driver, rule.selector):
142148
if asyncio.iscoroutinefunction(rule.handler):
143-
await rule.handler(element)
149+
await rule.handler(element, driver)
144150
else:
145-
rule.handler(element)
151+
rule.handler(element, driver)
146152
logger.info("Navigated to %s", driver.current_url)
147153
return True
148154
return False
@@ -203,23 +209,40 @@ async def _run_async(
203209
driver.close()
204210
await self._save_async(format, output)
205211

206-
@staticmethod
207-
def _get_driver(browser_type: str, headless: bool) -> WebDriver:
212+
def _block_url_if_needed(self, request: Request) -> None:
    """
    Abort the intercepted request if the ad blocker flags it; otherwise leave it untouched.

    The source URL handed to the ad blocker is the first non-empty value among the
    "referer", "origin" and "host" request headers; when none is set, the request URL
    itself is used. The request type is read from the "sec-fetch-dest" header and
    defaults to "other" when that header is missing or empty.

    :param request: Request object passed in by the Selenium-Wire request interceptor.
    """
    target_url = request.url
    headers = request.headers
    origin_url = headers.get("referer") or headers.get("origin") or headers.get("host") or target_url
    resource_kind = headers.get("sec-fetch-dest") or "other"

    flagged = self.adblock.check_network_urls(
        url=target_url,
        source_url=origin_url,
        request_type=resource_kind,
    )
    if not flagged:
        return

    logger.info("URL %s has been blocked.", target_url)
    request.abort()
224+
225+
def _get_driver(self, browser_type: str, headless: bool) -> WebDriver:
208226
# TODO: Add more drivers: https://github.com/SergeyPirogov/webdriver_manager#webdriver-manager-for-python
209227
if browser_type == "firefox":
210228
executable_path = GeckoDriverManager().install()
211229
firefox_options = FirefoxOptions()
212230
firefox_options.headless = headless
213231
firefox_options.set_preference("dom.webnotifications.enabled", False)
214-
return webdriver.Firefox(service=FirefoxService(executable_path=executable_path), options=firefox_options)
215-
216-
chrome_options = ChromeOptions()
217-
chrome_options.headless = headless
218-
chrome_options.add_argument("disable-notifications")
219-
executable_path = ChromeDriverManager(
220-
chrome_type=ChromeType.CHROMIUM, version=os.getenv("CHROMEDRIVER_VERSION", "latest")
221-
).install()
222-
return webdriver.Chrome(service=ChromeService(executable_path=executable_path), options=chrome_options)
232+
driver = Firefox(service=FirefoxService(executable_path=executable_path), options=firefox_options)
233+
else:
234+
chrome_options = ChromeOptions()
235+
chrome_options.headless = headless
236+
chrome_options.add_argument("disable-notifications")
237+
executable_path = ChromeDriverManager(
238+
chrome_type=ChromeType.CHROMIUM, version=os.getenv("CHROMEDRIVER_VERSION", "latest")
239+
).install()
240+
driver = Chrome(service=ChromeService(executable_path=executable_path), options=chrome_options)
241+
242+
driver.implicitly_wait(10)
243+
driver.request_interceptor = self._block_url_if_needed
244+
245+
return driver
223246

224247
def collect_elements(self, driver: WebDriver = None) -> Iterable[Tuple[str, int, int, int, Any, Callable]]:
225248
"""

dude/playwright_scraper.py

+24
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
from typing import Any, AsyncIterable, Callable, Dict, Iterable, Optional, Sequence, Tuple, Union
55

6+
from braveblock import Adblocker
67
from playwright import async_api, sync_api
78
from playwright.async_api import async_playwright
89
from playwright.sync_api import sync_playwright
@@ -18,6 +19,10 @@ class PlaywrightScraper(ScraperAbstract):
1819
Playwright-based scraper
1920
"""
2021

22+
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """
    Forward all arguments to the parent scraper, then create the ad blocker.

    The Adblocker instance is stored on ``self.adblock`` and is consulted by
    ``_block_url_if_needed`` for every routed request.
    """
    super().__init__(*args, **kwargs)
    self.adblock = Adblocker()
25+
2126
def run(
2227
self,
2328
urls: Sequence[str],
@@ -148,6 +153,23 @@ def _get_launch_kwargs(browser_type: str) -> Dict[str, Any]:
148153
args.append("--disable-notifications")
149154
return {"args": args, "firefox_user_prefs": {"dom.webnotifications.enabled": False}}
150155

156+
def _block_url_if_needed(self, route: Union[sync_api.Route, async_api.Route]) -> Any:
    """
    Abort the routed request if the ad blocker flags it, otherwise let it continue.

    The source URL handed to the ad blocker is the first non-empty value among the
    "referer", "origin" and "host" request headers; when none is set, the request URL
    itself is used.

    :param route: Playwright route (sync or async API) for the intercepted request.
    :return: The unmodified result of ``route.abort()`` or ``route.continue_()``.
    """
    request = route.request
    target_url = request.url
    headers = request.headers
    origin_url = headers.get("referer") or headers.get("origin") or headers.get("host") or target_url

    flagged = self.adblock.check_network_urls(
        url=target_url,
        source_url=origin_url,
        request_type=request.resource_type,
    )
    if not flagged:
        return route.continue_()

    logger.info("URL %s has been blocked.", target_url)
    return route.abort()
172+
151173
def _run_sync(
152174
self,
153175
urls: Sequence[str],
@@ -163,6 +185,7 @@ def _run_sync(
163185
with sync_playwright() as p:
164186
browser = p[browser_type].launch(headless=headless, proxy=proxy, **launch_kwargs)
165187
page = browser.new_page()
188+
page.route("**/*", self._block_url_if_needed)
166189
self._scrape_sync(page, urls, pages)
167190
browser.close()
168191
self._save(format, output)
@@ -193,6 +216,7 @@ async def _run_async(
193216
async with async_playwright() as p:
194217
browser = await p[browser_type].launch(headless=headless, proxy=proxy, **launch_kwargs)
195218
page = await browser.new_page()
219+
await page.route("**/*", self._block_url_if_needed)
196220
for url in urls:
197221
await page.goto(url)
198222
logger.info("Loaded page %s", page.url)

examples/dude.html

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
<head>
44
<meta charset="UTF-8">
55
<title>A simple sandbox for dude</title>
6+
<link rel="stylesheet" href="https://dude.ron.sh/blockme.css">
67
</head>
78
<body>
89
<div class="custom-group">

0 commit comments

Comments
 (0)