|
1 | 1 | import sys
|
2 |
| -from typing import Any, Callable, Dict, List |
| 2 | +from typing import Any, Callable, Dict, List, Optional |
3 | 3 | from unittest import mock
|
4 | 4 |
|
5 | 5 | import httpx
|
6 | 6 | import pytest
|
| 7 | +from braveblock import Adblocker |
7 | 8 | from bs4 import BeautifulSoup
|
8 | 9 |
|
9 | 10 | from dude import Scraper
|
| 11 | +from dude.optional.beautifulsoup_scraper import BeautifulSoupScraper |
| 12 | + |
| 13 | + |
@pytest.fixture()
def scraper_application_with_bs4_parser() -> Scraper:
    """Scraper application backed by an explicit BeautifulSoupScraper whose
    adblocker is configured to block the test URL used in the blocked-fetch test."""
    bs4_scraper = BeautifulSoupScraper()
    bs4_scraper.adblock = Adblocker(rules=["https://dude.ron.sh/blockme.css"])
    return Scraper(scraper=bs4_scraper)
10 | 19 |
|
11 | 20 |
|
12 | 21 | @pytest.fixture()
|
@@ -88,6 +97,35 @@ def url(element: BeautifulSoup) -> Dict:
|
88 | 97 | return {"url": element["href"]}
|
89 | 98 |
|
90 | 99 |
|
@pytest.fixture()
def bs4_select_with_parser(scraper_application_with_bs4_parser: Scraper) -> None:
    """Register four selector handlers on the bs4-parser scraper application.

    Four registrations so the companion test can assert ``len(scraper.rules) == 4``.
    Handlers mix the ``@group`` decorator form and the ``group_css=`` keyword form
    of attaching a group — presumably equivalent in the dude API (TODO confirm).
    """

    # Group attached via stacked @group decorator.
    @scraper_application_with_bs4_parser.group(css=".custom-group")
    @scraper_application_with_bs4_parser.select(css=".title")
    def title(element: BeautifulSoup) -> Dict:
        return {"title": element.get_text()}

    # Same selection, group attached via the group_css keyword; yields no data.
    @scraper_application_with_bs4_parser.select(css=".title", group_css=".custom-group")
    def empty(element: BeautifulSoup) -> Dict:
        return {}

    # url= restricts the rule to a URL that the test never fetches,
    # so this handler should not fire.
    @scraper_application_with_bs4_parser.group(css=".custom-group")
    @scraper_application_with_bs4_parser.select(css=".title", url="example.com")
    def url_dont_match(element: BeautifulSoup) -> Dict:
        return {"title": element.get_text()}

    @scraper_application_with_bs4_parser.select(css=".url", group_css=".custom-group")
    def url(element: BeautifulSoup) -> Dict:
        return {"url": element["href"]}
| 120 | + |
@pytest.fixture()
def scraper_with_parser_save(scraper_application_with_bs4_parser: Scraper, mock_database: mock.MagicMock) -> None:
    """Register a "custom" format save handler that forwards scraped data
    to the mock database, so calls can be asserted (or asserted absent)."""

    def save_to_database(data: Any, output: Optional[str]) -> bool:
        mock_database.save(data)
        return True

    # Explicit decorator application — equivalent to stacking @...save("custom").
    scraper_application_with_bs4_parser.save("custom")(save_to_database)
| 128 | + |
91 | 129 | def test_full_flow_bs4(
|
92 | 130 | scraper_application: Scraper,
|
93 | 131 | bs4_select: None,
|
@@ -269,3 +307,20 @@ def test_unsupported_regex(
|
269 | 307 |
|
270 | 308 | with pytest.raises(Exception):
|
271 | 309 | scraper_application.run(urls=[test_url], pages=2, format="custom", parser="bs4")
|
| 310 | + |
| 311 | + |
def test_scraper_with_parser(
    scraper_application_with_bs4_parser: Scraper,
    bs4_select_with_parser: None,
    scraper_with_parser_save: None,
    mock_database: mock.MagicMock,
) -> None:
    """The only URL given to run() is on the adblock rule list, so nothing is
    scraped and the "custom" save handler must never reach the database."""
    app = scraper_application_with_bs4_parser

    assert app.has_async is False
    assert app.scraper is not None
    # All four handlers from bs4_select_with_parser were registered.
    assert len(app.scraper.rules) == 4

    app.run(urls=["https://dude.ron.sh/blockme.css"], pages=2, format="custom", parser="bs4")

    mock_database.save.assert_not_called()
0 commit comments