Skip to content

Commit 3f76a0f

Browse files
✨ Use fnmatch (#122)
1 parent ee25611 commit 3f76a0f

14 files changed

+31
-28
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ Changing the output to `--output data.csv` should result in the following CSV co
105105
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
106106
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
107107
- Data grouping - group related results.
108-
- URL pattern matching - run functions on specific URLs.
108+
- URL pattern matching - run functions on matched URLs.
109109
- Priority - reorder functions based on priority.
110110
- Setup function - enable setup steps (clicking dialogs or login).
111111
- Navigate function - enable navigation steps to move to other pages.

docs/advanced/04_url_pattern_matching.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
# URL Pattern Matching
22

33
To make a handler function run only on specific websites, a `url` pattern parameter can be passed to the `@select()` decorator.
4-
The `url` pattern parameter should be a valid regular expression.
5-
The example below will only run if the URL of the current page matches `.*\.com`.
4+
The `url` pattern parameter should be a valid Unix shell-style wildcard pattern (see https://docs.python.org/3/library/fnmatch.html).
5+
The example below will only run if the URL of the current page matches `*.com/*`.
66

77
=== "Python"
88

99
```python
1010
from dude import select
1111

1212

13-
@select(css=".title", url=r".*\.com")
13+
@select(css=".title", url="*.com/*")
1414
def result_title(element):
1515
return {"title": element.text_content()}
1616
```

docs/features.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
- Simple [Flask](https://github.com/pallets/flask)-inspired design - build a scraper with decorators.
44
- Uses [Playwright](https://playwright.dev/python/) API - run your scraper in Chrome, Firefox and Webkit and leverage Playwright's powerful selector engine supporting CSS, XPath, text, regex, etc.
55
- Data grouping - group related results.
6-
- URL pattern matching - run functions on specific URLs.
6+
- URL pattern matching - run functions on matched URLs.
77
- Priority - reorder functions based on priority.
88
- Setup function - enable setup steps (clicking dialogs or login).
99
- Navigate function - enable navigation steps to move to other pages.

dude/base.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def select(
133133
group: str = None,
134134
setup: bool = False,
135135
navigate: bool = False,
136-
url: str = "",
136+
url: str = "*",
137137
priority: int = 100,
138138
css: str = None,
139139
xpath: str = None,
@@ -151,7 +151,7 @@ def select(
151151
:param group: (Optional) Element selector where the matched element should be grouped. Defaults to ":root".
152152
:param setup: Flag to register a setup handler.
153153
:param navigate: Flag to register a navigate handler.
154-
:param url: URL pattern. Run the handler function only when the pattern matches (defaults to empty string).
154+
:param url: URL pattern. Run the handler function only when the pattern matches (defaults to *).
155155
:param priority: Priority, the lowest value will be executed first (default 100).
156156
:param css: CSS selector.
157157
:param xpath: XPath selector.

dude/rule.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import re
1+
import fnmatch
22
from enum import Enum, auto
33
from typing import Callable, NamedTuple, Optional, Tuple
44

@@ -77,6 +77,6 @@ def rule_grouper(rule: Rule) -> Selector:
7777

7878
def rule_filter(url: str, setup: bool = False, navigate: bool = False) -> Callable:
7979
def wrapper(rule: Rule) -> bool:
80-
return re.search(rule.url_pattern, url) is not None and rule.setup is setup and rule.navigate is navigate
80+
return fnmatch.fnmatch(url, rule.url_pattern) and rule.setup is setup and rule.navigate is navigate
8181

8282
return wrapper

examples/url_pattern.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
from dude import select
22

33

4-
@select(css="a.url", url=r".*\.html")
4+
@select(css="a.url", url="*.html")
5+
@select(css="a.url", url="*dude.ron.sh/*")
56
def result_url(element):
67
return {"url": element.get_attribute("href")}
78

89

9-
@select(css=".title", url=r".*\.html")
10+
@select(css=".title", url="*.html")
11+
@select(css=".title", url="*dude.ron.sh/*")
1012
def result_title(element):
1113
return {"title": element.text_content()}
1214

1315

14-
@select(css=".description", url=r".*\.html")
16+
@select(css=".description", url="*.html")
17+
@select(css=".description", url="*dude.ron.sh/*")
1518
def result_description(element):
1619
return {"description": element.text_content()}
1720

@@ -22,4 +25,4 @@ def result_description(element):
2225
import dude
2326

2427
html = f"file://{(Path(__file__).resolve().parent / 'dude.html').absolute()}"
25-
dude.run(urls=[html])
28+
dude.run(urls=[html, "https://dude.ron.sh"])

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pydude"
3-
version = "0.13.0"
3+
version = "0.14.0"
44
repository = "https://github.com/roniemartinez/dude"
55
description = "dude uncomplicated data extraction"
66
authors = ["Ronie Martinez <ronmarti18@gmail.com>"]

tests/test_bs4.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(element: BeautifulSoup) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
def url_dont_match(element: BeautifulSoup) -> Dict:
2626
return {"title": element.get_text()}
2727

@@ -42,7 +42,7 @@ async def empty(element: BeautifulSoup) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title", url=r"example\.com")
45+
@scraper_application.select(css=".title", url="example.com")
4646
async def url_dont_match(element: BeautifulSoup) -> Dict:
4747
return {"title": element.get_text()}
4848

tests/test_lxml.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(element: _Element) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
def url_dont_match(element: _Element) -> Dict:
2626
return {"title": element.text}
2727

@@ -42,7 +42,7 @@ async def empty(element: _Element) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title", url=r"example\.com")
45+
@scraper_application.select(css=".title", url="example.com")
4646
async def url_dont_match(element: _Element) -> Dict:
4747
return {"title": element.text}
4848

tests/test_parsel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def empty(selector: parsel.Selector) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title::text", url=r"example\.com")
24+
@scraper_application.select(css=".title::text", url="example.com")
2525
def url_dont_match(selector: parsel.Selector) -> Dict:
2626
return {"title": selector.get()}
2727

@@ -42,7 +42,7 @@ async def empty(selector: parsel.Selector) -> Dict:
4242
return {}
4343

4444
@scraper_application.group(css=".custom-group")
45-
@scraper_application.select(css=".title::text", url=r"example\.com")
45+
@scraper_application.select(css=".title::text", url="example.com")
4646
async def url_dont_match(selector: parsel.Selector) -> Dict:
4747
return {"title": selector.get()}
4848

tests/test_playwright_async.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ async def empty(element: async_api.ElementHandle) -> Dict:
2121
return {}
2222

2323
@scraper_application.group(css=".custom-group")
24-
@scraper_application.select(css=".title", url=r"example\.com")
24+
@scraper_application.select(css=".title", url="example.com")
2525
async def url_dont_match(element: async_api.ElementHandle) -> Dict:
2626
return {"title": await element.text_content()}
2727

tests/test_playwright_sync.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def empty(element: sync_api.ElementHandle) -> Dict:
2525
return {}
2626

2727
@scraper_application.group(css=".custom-group")
28-
@scraper_application.select(css=".title", url=r"example\.com")
28+
@scraper_application.select(css=".title", url="example.com")
2929
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
3030
return {"title": element.text_content()}
3131

@@ -62,7 +62,7 @@ def empty(element: sync_api.ElementHandle) -> Dict:
6262
return {}
6363

6464
@scraper_application_with_parser.group(css=".custom-group")
65-
@scraper_application_with_parser.select(css=".title", url=r"example\.com")
65+
@scraper_application_with_parser.select(css=".title", url="example.com")
6666
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
6767
return {"title": element.text_content()}
6868

tests/test_pyppeteer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ async def empty(element: ElementHandle, page: Page) -> Dict:
2929
return {}
3030

3131
@scraper_application.group(css=".custom-group")
32-
@scraper_application.select(css=".title", url=r"example\.com")
32+
@scraper_application.select(css=".title", url="example.com")
3333
async def url_dont_match(element: ElementHandle, page: Page) -> Dict:
3434
return {"title": await page.evaluate("(element) => element.textContent", element)}
3535

@@ -51,7 +51,7 @@ async def empty(element: ElementHandle, page: Page) -> Dict:
5151
return {}
5252

5353
@scraper_application_with_pyppeteer_parser.group(css=".custom-group")
54-
@scraper_application_with_pyppeteer_parser.select(css=".title", url=r"example\.com")
54+
@scraper_application_with_pyppeteer_parser.select(css=".title", url="example.com")
5555
async def url_dont_match(element: ElementHandle, page: Page) -> Dict:
5656
return {"title": await page.evaluate("(element) => element.textContent", element)}
5757

tests/test_selenium.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def empty(element: WebElement) -> Dict:
2929
return {}
3030

3131
@scraper_application.group(css=".custom-group")
32-
@scraper_application.select(css=".title", url=r"example\.com")
32+
@scraper_application.select(css=".title", url="example.com")
3333
def url_dont_match(element: WebElement) -> Dict:
3434
return {"title": element.text}
3535

@@ -50,7 +50,7 @@ def empty(element: WebElement) -> Dict:
5050
return {}
5151

5252
@scraper_application_with_selenium_parser.group(css=".custom-group")
53-
@scraper_application_with_selenium_parser.select(css=".title", url=r"example\.com")
53+
@scraper_application_with_selenium_parser.select(css=".title", url="example.com")
5454
def url_dont_match(element: WebElement) -> Dict:
5555
return {"title": element.text}
5656

@@ -119,7 +119,7 @@ async def empty(element: WebElement) -> Dict:
119119
return {}
120120

121121
@scraper_application.group(css=".custom-group")
122-
@scraper_application.select(css=".title", url=r"example\.com")
122+
@scraper_application.select(css=".title", url="example.com")
123123
async def url_dont_match(element: WebElement) -> Dict:
124124
return {"title": element.text}
125125

0 commit comments

Comments
 (0)