
Commit d19cc29

🔨 Run adblock on HTTPX request event hook (#126)
1 parent 90ceb53 commit d19cc29

7 files changed: +93 -19 lines
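
This commit moves ad blocking out of URL iteration in dude/base.py and into an HTTPX request event hook shared by the BeautifulSoup, lxml, and Parsel backends, so every outgoing request is checked rather than only the seed URLs. As a rough illustration of the mechanism (a sketch, not code from this commit), a "request" event hook registered on an httpx client can veto a request by raising before anything is sent to the network:

    # Illustrative sketch only: a request hook that blocks matching URLs.
    import httpx

    BLOCKLIST = ("ads.", "/analytics/")  # hypothetical patterns, not from the commit

    def block_if_needed(request: httpx.Request) -> None:
        url = str(request.url)
        if any(pattern in url for pattern in BLOCKLIST):
            raise httpx.RequestError(f"URL {url} has been blocked.", request=request)

    with httpx.Client(event_hooks={"request": [block_if_needed]}) as client:
        try:
            client.get("https://example.com/analytics/track")
        except httpx.RequestError as exc:
            print(exc)  # the request never left the client

In the diffs below, the hook is the bound method self._block_httpx_request_if_needed provided by the new HTTPXMixin in dude/optional/utils.py.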

dude/base.py (-3)

@@ -331,9 +331,6 @@ def iter_urls(self) -> Iterable[str]:
                 if urlparse(url).netloc not in self.allowed_domains:
                     logger.info("URL %s is not in allowed domains.", url)
                     continue
-                if self.adblock.check_network_urls(url=url, source_url=url, request_type="document"):
-                    logger.info("URL %s has been blocked.", url)
-                    continue
                 yield url
         except IndexError:
             pass

dude/optional/beautifulsoup_scraper.py (+6 -4)

@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class BeautifulSoupScraper(ScraperAbstract):
+class BeautifulSoupScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using BeautifulSoup4 parser and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -96,7 +96,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/lxml_scraper.py (+6 -4)

@@ -10,12 +10,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class LxmlScraper(ScraperAbstract):
+class LxmlScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using lxml parser backend and HTTPX for requests
     """
@@ -63,7 +63,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -99,7 +99,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/parsel_scraper.py (+6 -4)

@@ -9,12 +9,12 @@
 
 from ..base import ScraperAbstract
 from ..rule import Selector, SelectorType, rule_grouper, rule_sorter
-from .utils import async_http_get, http_get
+from .utils import HTTPXMixin, async_http_get, http_get
 
 logger = logging.getLogger(__name__)
 
 
-class ParselScraper(ScraperAbstract):
+class ParselScraper(ScraperAbstract, HTTPXMixin):
     """
     Scraper using Parsel parser backend and HTTPX for requests
     """
@@ -62,7 +62,7 @@ def run_sync(
         save_per_page: bool,
         **kwargs: Any,
    ) -> None:
-        with httpx.Client(proxies=proxy) as client:
+        with httpx.Client(proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):
@@ -97,7 +97,9 @@ async def run_async(
         save_per_page: bool,
         **kwargs: Any,
     ) -> None:
-        async with httpx.AsyncClient(proxies=proxy) as client:
+        async with httpx.AsyncClient(
+            proxies=proxy, event_hooks={"request": [self._block_httpx_request_if_needed]}
+        ) as client:
             for url in self.iter_urls():
                 logger.info("Requesting url %s", url)
                 for i in range(1, pages + 1):

dude/optional/utils.py (+18 -2)

@@ -3,6 +3,7 @@
 from typing import Optional, Tuple
 
 import httpx
+from httpx import Request
 
 logger = logging.getLogger(__name__)
 
@@ -35,7 +36,7 @@ async def async_http_get(client: httpx.AsyncClient, url: str) -> Tuple[Optional[
         response = await client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
 
@@ -49,6 +50,21 @@ def http_get(client: httpx.Client, url: str) -> Tuple[Optional[str], str]:
         response = client.get(url)
         response.raise_for_status()
         return response.text, str(response.url)
-    except httpx.HTTPStatusError as e:
+    except (httpx.HTTPStatusError, httpx.RequestError) as e:
         logger.warning(e)
         return None, url
+
+
+class HTTPXMixin:
+    def _block_httpx_request_if_needed(self, request: Request) -> None:
+        url = str(request.url)
+        source_url = (
+            request.headers.get("referer") or request.headers.get("origin") or request.headers.get("host") or url
+        )
+        if self.adblock.check_network_urls(  # type: ignore
+            url=url,
+            source_url=source_url,
+            request_type=request.headers.get("sec-fetch-dest") or "other",
+        ):
+            logger.info("URL %s has been blocked.", url)
+            raise httpx.RequestError(message=f"URL {url} has been blocked.", request=request)
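
The new HTTPXMixin assumes the scraper instance carries an adblock attribute (a braveblock Adblocker), which is what check_network_urls is called on. A minimal standalone sketch of the same idea outside the mixin, with an illustrative filter rule and URLs that are not taken from this commit:

    # Hedged sketch: braveblock + an httpx request hook, outside the mixin.
    import httpx
    from braveblock import Adblocker

    adblock = Adblocker(rules=["||ads.example.com^"])  # hypothetical Adblock-style rule

    def block_request_if_needed(request: httpx.Request) -> None:
        url = str(request.url)
        source_url = request.headers.get("referer") or url
        if adblock.check_network_urls(url=url, source_url=source_url, request_type="other"):
            raise httpx.RequestError(f"URL {url} has been blocked.", request=request)

    with httpx.Client(event_hooks={"request": [block_request_if_needed]}) as client:
        try:
            client.get("https://ads.example.com/banner.js")
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            print(exc)  # blocked in the hook, before any network I/O

Because http_get and async_http_get now also catch httpx.RequestError, a blocked URL is logged as a warning and the page is simply skipped instead of failing the scrape.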

pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pydude"
-version = "0.14.0"
+version = "0.15.0"
 repository = "https://github.com/roniemartinez/dude"
 description = "dude uncomplicated data extraction"
 authors = ["Ronie Martinez <ronmarti18@gmail.com>"]

tests/test_bs4.py (+56 -1)

@@ -1,12 +1,21 @@
 import sys
-from typing import Any, Callable, Dict, List
+from typing import Any, Callable, Dict, List, Optional
 from unittest import mock
 
 import httpx
 import pytest
+from braveblock import Adblocker
 from bs4 import BeautifulSoup
 
 from dude import Scraper
+from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper
+
+
+@pytest.fixture()
+def scraper_application_with_bs4_parser() -> Scraper:
+    scraper = BeautifulSoupScraper()
+    scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
+    return Scraper(scraper=scraper)
 
 
 @pytest.fixture()
@@ -88,6 +97,35 @@ def url(element: BeautifulSoup) -> Dict:
         return {"url": element["href"]}
 
 
+@pytest.fixture()
+def bs4_select_with_parser(scraper_application_with_bs4_parser: Scraper) -> None:
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title")
+    def title(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".title", group_css=".custom-group")
+    def empty(element: BeautifulSoup) -> Dict:
+        return {}
+
+    @scraper_application_with_bs4_parser.group(css=".custom-group")
+    @scraper_application_with_bs4_parser.select(css=".title", url="example.com")
+    def url_dont_match(element: BeautifulSoup) -> Dict:
+        return {"title": element.get_text()}
+
+    @scraper_application_with_bs4_parser.select(css=".url", group_css=".custom-group")
+    def url(element: BeautifulSoup) -> Dict:
+        return {"url": element["href"]}
+
+
+@pytest.fixture()
+def scraper_with_parser_save(scraper_application_with_bs4_parser: Scraper, mock_database: mock.MagicMock) -> None:
+    @scraper_application_with_bs4_parser.save("custom")
+    def save_to_database(data: Any, output: Optional[str]) -> bool:
+        mock_database.save(data)
+        return True
+
+
 def test_full_flow_bs4(
     scraper_application: Scraper,
     bs4_select: None,
@@ -269,3 +307,20 @@ def test_unsupported_regex(
 
     with pytest.raises(Exception):
         scraper_application.run(urls=[test_url], pages=2, format="custom", parser="bs4")
+
+
+def test_scraper_with_parser(
+    scraper_application_with_bs4_parser: Scraper,
+    bs4_select_with_parser: None,
+    scraper_with_parser_save: None,
+    mock_database: mock.MagicMock,
+) -> None:
+    assert scraper_application_with_bs4_parser.has_async is False
+    assert scraper_application_with_bs4_parser.scraper is not None
+    assert len(scraper_application_with_bs4_parser.scraper.rules) == 4
+
+    scraper_application_with_bs4_parser.run(
+        urls=["https://dude.ron.sh/blockme.css"], pages=2, format="custom", parser="bs4"
+    )
+
+    mock_database.save.assert_not_called()
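
The new test relies on the fixture's single filter rule matching the only URL it scrapes, so the request is blocked at the hook and nothing is ever saved. A quick illustrative check of that assumption (assuming braveblock treats the bare URL as a plain matching filter, which is what the test depends on):

    # Illustrative check, not part of the commit: the fixture's rule matches the test URL.
    from braveblock import Adblocker

    adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
    assert adblock.check_network_urls(
        url="https://dude.ron.sh/blockme.css",
        source_url="https://dude.ron.sh/blockme.css",
        request_type="other",  # httpx sends no sec-fetch-dest header, so the hook falls back to "other"
    )

This is why mock_database.save.assert_not_called() passes: the blocked request surfaces as an httpx.RequestError inside http_get, which returns no content, so nothing reaches the save callback.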
