Skip to content

Commit 3f76a0f

Browse files
✨ Use fnmatch (#122)
1 parent ee25611 commit 3f76a0f

14 files changed

+31
-28
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ Changing the output to `--output data.csv` should result in the following CSV co
105105
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
106106
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
107107
- Data grouping - group related results.
108-
- URL pattern matching - run functions on specific URLs.
108+
- URL pattern matching - run functions on matched URLs.
109109
- Priority - reorder functions based on priority.
110110
- Setup function - enable setup steps (clicking dialogs or login).
111111
- Navigate function - enable navigation steps to move to other pages.

docs/advanced/04_url_pattern_matching.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# URL Pattern Matching
22

33
To make a handler function run only on specific websites, a `url` pattern parameter can be passed to the `@select()` decorator.
4-
The `url` pattern parameter should be a valid regular expression.
5-
The example below will only run if the URL of the current page matches `.*\.com`.
4+
The `url` pattern parameter should be a valid Unix shell-style wildcard pattern (see https://docs.python.org/3/library/fnmatch.html).
5+
The example below will only run if the URL of the current page matches `*.com/*`.
66

77
=== "Python"
88

99
```python
1010
from dude import select
1111

1212

13-
@select(css=".title", url=r".*\.com")
13+
@select(css=".title", url="*.com/*")
1414
def result_title(element):
1515
return {"title": element.text_content()}
1616
```

docs/features.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
44
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
55
- Data grouping - group related results.
6-
- URL pattern matching - run functions on specific URLs.
6+
- URL pattern matching - run functions on matched URLs.
77
- Priority - reorder functions based on priority.
88
- Setup function - enable setup steps (clicking dialogs or login).
99
- Navigate function - enable navigation steps to move to other pages.

dude/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def select(
133133
group: str = None,
134134
setup: bool = False,
135135
navigate: bool = False,
136-
url: str = "",
136+
url: str = "*",
137137
priority: int = 100,
138138
css: str = None,
139139
xpath: str = None,
@@ -151,7 +151,7 @@ def select(
151151
:param group: (Optional) Element selector where the matched element should be grouped. Defaults to ":root".
152152
:param setup: Flag to register a setup handler.
153153
:param navigate: Flag to register a navigate handler.
154-
:param url: URL pattern. Run the handler function only when the pattern matches (defaults to empty string).
154+
:param url: URL pattern. Run the handler function only when the pattern matches (defaults to *).
155155
:param priority: Priority, the lowest value will be executed first (default 100).
156156
:param css: CSS selector.
157157
:param xpath: XPath selector.

dude/rule.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import re
1+
import fnmatch
22
from enum import Enum, auto
33
from typing import Callable, NamedTuple, Optional, Tuple
44

@@ -77,6 +77,6 @@ def rule_grouper(rule: Rule) -> Selector:
7777

7878
def rule_filter(url: str, setup: bool = False, navigate: bool = False) -> Callable:
7979
def wrapper(rule: Rule) -> bool:
80-
return re.search(rule.url_pattern, url) is not None and rule.setup is setup and rule.navigate is navigate
80+
return fnmatch.fnmatch(url, rule.url_pattern) and rule.setup is setup and rule.navigate is navigate
8181

8282
return wrapper

examples/url_pattern.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
from dude import select
22

33

4-
@select(css="a.url", url=r".*\.html")
4+
@select(css="a.url", url="*.html")
5+
@select(css="a.url", url="*dude.ron.sh/*")
56
def result_url(element):
67
return {"url": element.get_attribute("href")}
78

89

9-
@select(css=".title", url=r".*\.html")
10+
@select(css=".title", url="*.html")
11+
@select(css=".title", url="*dude.ron.sh/*")
1012
def result_title(element):
1113
return {"title": element.text_content()}
1214

1315

14-
@select(css=".description", url=r".*\.html")
16+
@select(css=".description", url="*.html")
17+
@select(css=".description", url="*dude.ron.sh/*")
1518
def result_description(element):
1619
return {"description": element.text_content()}
1720

@@ -22,4 +25,4 @@ def result_description(element):
2225
import dude
2326

2427
html = f"file://{(Path(__file__).resolve().parent / 'dude.html').absolute()}"
25-
dude.run(urls=[html])
28+
dude.run(urls=[html, "https://dude.ron.sh"])

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pydude"
3-
version = "0.13.0"
3+
version = "0.14.0"
44
repository = "https://github.com/roniemartinez/dude"
55
description = "dude uncomplicated data extraction"
66
authors = ["Ronie Martinez <ronmarti18@gmail.com>"]

tests/test_bs4.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(element: BeautifulSoup) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
def url_dont_match(element: BeautifulSoup) -> Dict:
2626
return {"title": element.get_text()}
2727

@@ -42,7 +42,7 @@ async def empty(element: BeautifulSoup) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title", url=r"example\.com")
45+
@scraper_application.select(css=".title", url="example.com")
4646
async def url_dont_match(element: BeautifulSoup) -> Dict:
4747
return {"title": element.get_text()}
4848

tests/test_lxml.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(element: _Element) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
def url_dont_match(element: _Element) -> Dict:
2626
return {"title": element.text}
2727

@@ -42,7 +42,7 @@ async def empty(element: _Element) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title", url=r"example\.com")
45+
@scraper_application.select(css=".title", url="example.com")
4646
async def url_dont_match(element: _Element) -> Dict:
4747
return {"title": element.text}
4848

tests/test_parsel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(selector: parsel.Selector) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title::text", url=r"example\.com")
24+
@scraper_application.select(css=".title::text", url="example.com")
2525
def url_dont_match(selector: parsel.Selector) -> Dict:
2626
return {"title": selector.get()}
2727

@@ -42,7 +42,7 @@ async def empty(selector: parsel.Selector) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title::text", url=r"example\.com")
45+
@scraper_application.select(css=".title::text", url="example.com")
4646
async def url_dont_match(selector: parsel.Selector) -> Dict:
4747
return {"title": selector.get()}
4848

tests/test_playwright_async.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ async def empty(element: async_api.ElementHandle) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
async def url_dont_match(element: async_api.ElementHandle) -> Dict:
2626
return {"title": await element.text_content()}
2727

tests/test_playwright_sync.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def empty(element: sync_api.ElementHandle) -> Dict:
2525
return {}
2626

2727
@scraper_application.group(css=".custom-group")
28-
@scraper_application.select(css=".title", url=r"example\.com")
28+
@scraper_application.select(css=".title", url="example.com")
2929
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
3030
return {"title": element.text_content()}
3131

@@ -62,7 +62,7 @@ def empty(element: sync_api.ElementHandle) -> Dict:
6262
return {}
6363

6464
@scraper_application_with_parser.group(css=".custom-group")
65-
@scraper_application_with_parser.select(css=".title", url=r"example\.com")
65+
@scraper_application_with_parser.select(css=".title", url="example.com")
6666
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
6767
return {"title": element.text_content()}
6868

tests/test_pyppeteer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ async def empty(element: ElementHandle, page: Page) -> Dict:
2929
return {}
3030

3131
@scraper_application.group(css=".custom-group")
32-
@scraper_application.select(css=".title", url=r"example\.com")
32+
@scraper_application.select(css=".title", url="example.com")
3333
async def url_dont_match(element: ElementHandle, page: Page) -> Dict:
3434
return {"title": await page.evaluate("(element) => element.textContent", element)}
3535

@@ -51,7 +51,7 @@ async def empty(element: ElementHandle, page: Page) -> Dict:
5151
return {}
5252

5353
@scraper_application_with_pyppeteer_parser.group(css=".custom-group")
54-
@scraper_application_with_pyppeteer_parser.select(css=".title", url=r"example\.com")
54+
@scraper_application_with_pyppeteer_parser.select(css=".title", url="example.com")
5555
async def url_dont_match(element: ElementHandle, page: Page) -> Dict:
5656
return {"title": await page.evaluate("(element) => element.textContent", element)}
5757

tests/test_selenium.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def empty(element: WebElement) -> Dict:
2929
return {}
3030

3131
@scraper_application.group(css=".custom-group")
32-
@scraper_application.select(css=".title", url=r"example\.com")
32+
@scraper_application.select(css=".title", url="example.com")
3333
def url_dont_match(element: WebElement) -> Dict:
3434
return {"title": element.text}
3535

@@ -50,7 +50,7 @@ def empty(element: WebElement) -> Dict:
5050
return {}
5151

5252
@scraper_application_with_selenium_parser.group(css=".custom-group")
53-
@scraper_application_with_selenium_parser.select(css=".title", url=r"example\.com")
53+
@scraper_application_with_selenium_parser.select(css=".title", url="example.com")
5454
def url_dont_match(element: WebElement) -> Dict:
5555
return {"title": element.text}
5656

@@ -119,7 +119,7 @@ async def empty(element: WebElement) -> Dict:
119119
return {}
120120

121121
@scraper_application.group(css=".custom-group")
122-
@scraper_application.select(css=".title", url=r"example\.com")
122+
@scraper_application.select(css=".title", url="example.com")
123123
async def url_dont_match(element: WebElement) -> Dict:
124124
return {"title": element.text}
125125

0 commit comments

Comments
 (0)