diff --git a/scrapegraph-py/CONTRIBUTING.md b/scrapegraph-py/CONTRIBUTING.md
index 0510252..914b3de 100644
--- a/scrapegraph-py/CONTRIBUTING.md
+++ b/scrapegraph-py/CONTRIBUTING.md
@@ -13,11 +13,36 @@ Thank you for your interest in contributing to **ScrapeGraphAI**! We welcome con
## Getting Started
-To get started with contributing, follow these steps:
+### Development Setup
1. Fork the repository on GitHub **(FROM pre/beta branch)**.
-2. Clone your forked repository to your local machine.
-3. Install the necessary dependencies from requirements.txt or via pyproject.toml as you prefere :).
+2. Clone your forked repository:
+ ```bash
+ git clone https://github.com/ScrapeGraphAI/scrapegraph-sdk.git
+ cd scrapegraph-sdk/scrapegraph-py
+ ```
+
+3. Install dependencies using uv (recommended):
+ ```bash
+ # Install uv if you haven't already
+ pip install uv
+
+ # Install dependencies
+ uv sync
+
+ # Install pre-commit hooks
+ uv run pre-commit install
+ ```
+
+4. Run tests:
+ ```bash
+ # Run all tests
+ uv run pytest
+
+ # Run specific test file
+ uv run pytest tests/test_client.py
+ ```
+
4. Make your changes or additions.
5. Test your changes thoroughly.
6. Commit your changes with descriptive commit messages.
diff --git a/scrapegraph-py/README.md b/scrapegraph-py/README.md
index 221698e..abccd51 100644
--- a/scrapegraph-py/README.md
+++ b/scrapegraph-py/README.md
@@ -6,164 +6,175 @@
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Documentation Status](https://readthedocs.org/projects/scrapegraph-py/badge/?version=latest)](https://scrapegraph-py.readthedocs.io/en/latest/?badge=latest)
-Official Python SDK for the ScrapeGraph AI API - Smart web scraping powered by AI.
-
-## 🚀 Features
-
-- ✨ Smart web scraping with AI
-- 🔄 Both sync and async clients
-- 📊 Structured output with Pydantic schemas
-- 🔍 Detailed logging with emojis
-- ⚡ Automatic retries and error handling
-- 🔐 Secure API authentication
+Official Python SDK for the ScrapeGraph API - Smart web scraping powered by AI.
## 📦 Installation
-### Using pip
-
-```
+```bash
pip install scrapegraph-py
```
-### Using uv
+## 🚀 Features
-We recommend using [uv](https://docs.astral.sh/uv/) to install the dependencies and pre-commit hooks.
+- 🤖 AI-powered web scraping
+- 🔄 Both sync and async clients
+- 📊 Structured output with Pydantic schemas
+- 🔍 Detailed logging
+- ⚡ Automatic retries
+- 🔐 Secure authentication
-```
-# Install uv if you haven't already
-pip install uv
+## 🎯 Quick Start
-# Install dependencies
-uv sync
+```python
+from scrapegraph_py import Client
-# Install pre-commit hooks
-uv run pre-commit install
+client = Client(api_key="your-api-key-here")
```
-## 🔧 Quick Start
-
> [!NOTE]
-> If you prefer, you can use the environment variables to configure the API key and load them using `load_dotenv()`
+> You can set the `SGAI_API_KEY` environment variable and initialize the client without parameters: `client = Client()`
-```python
-from scrapegraph_py import SyncClient
-from scrapegraph_py.logger import get_logger
+## 📚 Available Endpoints
+
+### 🔍 SmartScraper
-# Enable debug logging
-logger = get_logger(level="DEBUG")
+Scrapes any webpage using AI to extract specific information.
+
+```python
+from scrapegraph_py import Client
-# Initialize client
-sgai_client = SyncClient(api_key="your-api-key-here")
+client = Client(api_key="your-api-key-here")
-# Make a request
-response = sgai_client.smartscraper(
+# Basic usage
+response = client.smartscraper(
website_url="https://example.com",
user_prompt="Extract the main heading and description"
)
-print(response["result"])
-```
-
-## 🎯 Examples
-
-### Async Usage
-
-```python
-import asyncio
-from scrapegraph_py import AsyncClient
-
-async def main():
- async with AsyncClient(api_key="your-api-key-here") as sgai_client:
- response = await sgai_client.smartscraper(
- website_url="https://example.com",
- user_prompt="Summarize the main content"
- )
- print(response["result"])
-
-asyncio.run(main())
+print(response)
```
-With Output Schema
+Output Schema (Optional)
```python
from pydantic import BaseModel, Field
-from scrapegraph_py import SyncClient
+from scrapegraph_py import Client
+
+client = Client(api_key="your-api-key-here")
class WebsiteData(BaseModel):
title: str = Field(description="The page title")
description: str = Field(description="The meta description")
-sgai_client = SyncClient(api_key="your-api-key-here")
-response = sgai_client.smartscraper(
+response = client.smartscraper(
website_url="https://example.com",
user_prompt="Extract the title and description",
output_schema=WebsiteData
)
-
-print(response["result"])
```
+
+We are a technology company focused on AI solutions.
+Email: contact@example.com
+We are a technology company focused on AI solutions.
+Email: contact@example.com
+Phone: (555) 123-4567
+Content
", + description="HTML content, maximum size 2MB", + ) + output_schema: Optional[Type[BaseModel]] = None + + @model_validator(mode="after") + def validate_user_prompt(self) -> "LocalScraperRequest": + if self.user_prompt is None or not self.user_prompt.strip(): + raise ValueError("User prompt cannot be empty") + if not any(c.isalnum() for c in self.user_prompt): + raise ValueError("User prompt must contain a valid prompt") + return self + + @model_validator(mode="after") + def validate_website_html(self) -> "LocalScraperRequest": + if self.website_html is None or not self.website_html.strip(): + raise ValueError("Website HTML cannot be empty") + + if len(self.website_html.encode("utf-8")) > 2 * 1024 * 1024: + raise ValueError("Website HTML content exceeds maximum size of 2MB") + + try: + soup = BeautifulSoup(self.website_html, "html.parser") + if not soup.find(): + raise ValueError("Invalid HTML - no parseable content found") + except Exception as e: + raise ValueError(f"Invalid HTML structure: {str(e)}") + + return self + + def model_dump(self, *args, **kwargs) -> dict: + data = super().model_dump(*args, **kwargs) + # Convert the Pydantic model schema to dict if present + if self.output_schema is not None: + data["output_schema"] = self.output_schema.model_json_schema() + return data + + +class GetLocalScraperRequest(BaseModel): + """Request model for get_localscraper endpoint""" + + request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") + + @model_validator(mode="after") + def validate_request_id(self) -> "GetLocalScraperRequest": + try: + # Validate the request_id is a valid UUID + UUID(self.request_id) + except ValueError: + raise ValueError("request_id must be a valid UUID") + return self diff --git a/scrapegraph-py/scrapegraph_py/models/markdownify.py b/scrapegraph-py/scrapegraph_py/models/markdownify.py new file mode 100644 index 0000000..5b12aa2 --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/markdownify.py @@ -0,0 +1,35 @@ +# Models for markdownify endpoint + +from uuid import UUID + +from pydantic import BaseModel, Field, model_validator + + +class MarkdownifyRequest(BaseModel): + website_url: str = Field(..., example="https://scrapegraphai.com/") + + @model_validator(mode="after") + def validate_url(self) -> "MarkdownifyRequest": + if self.website_url is None or not self.website_url.strip(): + raise ValueError("Website URL cannot be empty") + if not ( + self.website_url.startswith("http://") + or self.website_url.startswith("https://") + ): + raise ValueError("Invalid URL") + return self + + +class GetMarkdownifyRequest(BaseModel): + """Request model for get_markdownify endpoint""" + + request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") + + @model_validator(mode="after") + def validate_request_id(self) -> "GetMarkdownifyRequest": + try: + # Validate the request_id is a valid UUID + UUID(self.request_id) + except ValueError: + raise ValueError("request_id must be a valid UUID") + return self diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index 8ebb43b..78f9717 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -101,3 +101,78 @@ async def test_api_error(mock_api_key): ) assert exc_info.value.status_code == 400 assert "Bad request" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_markdownify(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/markdownify", + 
payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": "# Example Page\n\nThis is markdown content.", + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.markdownify(website_url="https://example.com") + assert response["status"] == "completed" + assert "# Example Page" in response["result"] + + +@pytest.mark.asyncio +async def test_get_markdownify(mock_api_key, mock_uuid): + with aioresponses() as mocked: + mocked.get( + f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", + payload={ + "request_id": mock_uuid, + "status": "completed", + "result": "# Example Page\n\nThis is markdown content.", + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.get_markdownify(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid + + +@pytest.mark.asyncio +async def test_localscraper(mock_api_key): + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/localscraper", + payload={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"extracted_info": "Test content"}, + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.localscraper( + user_prompt="Extract info", + website_html="Test content
", + ) + assert response["status"] == "completed" + assert "extracted_info" in response["result"] + + +@pytest.mark.asyncio +async def test_get_localscraper(mock_api_key, mock_uuid): + with aioresponses() as mocked: + mocked.get( + f"https://api.scrapegraphai.com/v1/localscraper/{mock_uuid}", + payload={ + "request_id": mock_uuid, + "status": "completed", + "result": {"extracted_info": "Test content"}, + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.get_localscraper(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid diff --git a/scrapegraph-py/tests/test_client.py b/scrapegraph-py/tests/test_client.py index 6163a6d..23c1162 100644 --- a/scrapegraph-py/tests/test_client.py +++ b/scrapegraph-py/tests/test_client.py @@ -97,3 +97,78 @@ def test_network_error(mock_api_key): client.smartscraper( website_url="https://example.com", user_prompt="Describe this page." ) + + +@responses.activate +def test_markdownify(mock_api_key): + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/markdownify", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": "# Example Page\n\nThis is markdown content.", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.markdownify(website_url="https://example.com") + assert response["status"] == "completed" + assert "# Example Page" in response["result"] + + +@responses.activate +def test_get_markdownify(mock_api_key, mock_uuid): + responses.add( + responses.GET, + f"https://api.scrapegraphai.com/v1/markdownify/{mock_uuid}", + json={ + "request_id": mock_uuid, + "status": "completed", + "result": "# Example Page\n\nThis is markdown content.", + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.get_markdownify(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid + + +@responses.activate +def test_localscraper(mock_api_key): + responses.add( + responses.POST, + "https://api.scrapegraphai.com/v1/localscraper", + json={ + "request_id": str(uuid4()), + "status": "completed", + "result": {"extracted_info": "Test content"}, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.localscraper( + user_prompt="Extract info", + website_html="Test content
", + ) + assert response["status"] == "completed" + assert "extracted_info" in response["result"] + + +@responses.activate +def test_get_localscraper(mock_api_key, mock_uuid): + responses.add( + responses.GET, + f"https://api.scrapegraphai.com/v1/localscraper/{mock_uuid}", + json={ + "request_id": mock_uuid, + "status": "completed", + "result": {"extracted_info": "Test content"}, + }, + ) + + with Client(api_key=mock_api_key) as client: + response = client.get_localscraper(mock_uuid) + assert response["status"] == "completed" + assert response["request_id"] == mock_uuid diff --git a/scrapegraph-py/tests/test_models.py b/scrapegraph-py/tests/test_models.py index 3e75169..70841aa 100644 --- a/scrapegraph-py/tests/test_models.py +++ b/scrapegraph-py/tests/test_models.py @@ -2,6 +2,11 @@ from pydantic import BaseModel, ValidationError from scrapegraph_py.models.feedback import FeedbackRequest +from scrapegraph_py.models.localscraper import ( + GetLocalScraperRequest, + LocalScraperRequest, +) +from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.smartscraper import ( GetSmartScraperRequest, SmartScraperRequest, @@ -80,3 +85,62 @@ def test_feedback_request_validation(): FeedbackRequest( request_id="invalid-uuid", rating=5, feedback_text="Great service!" ) + + +def test_markdownify_request_validation(): + # Valid input + request = MarkdownifyRequest(website_url="https://example.com") + assert request.website_url == "https://example.com" + + # Invalid URL + with pytest.raises(ValidationError): + MarkdownifyRequest(website_url="invalid-url") + + # Empty URL + with pytest.raises(ValidationError): + MarkdownifyRequest(website_url="") + + +def test_get_markdownify_request_validation(): + # Valid UUID + request = GetMarkdownifyRequest(request_id="123e4567-e89b-12d3-a456-426614174000") + assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" + + # Invalid UUID + with pytest.raises(ValidationError): + GetMarkdownifyRequest(request_id="invalid-uuid") + + +def test_localscraper_request_validation(): + # Valid input + request = LocalScraperRequest( + user_prompt="Extract info", + website_html="Test content
", + ) + assert request.user_prompt == "Extract info" + assert "Test content
" in request.website_html + + # Empty prompt + with pytest.raises(ValidationError): + LocalScraperRequest( + user_prompt="", website_html="Test content
" + ) + + # Invalid HTML + with pytest.raises(ValidationError): + LocalScraperRequest(user_prompt="Extract info", website_html="not valid html") + + # HTML too large (>2MB) + large_html = "x" * (2 * 1024 * 1024 + 1) + with pytest.raises(ValidationError): + LocalScraperRequest(user_prompt="Extract info", website_html=large_html) + + +def test_get_localscraper_request_validation(): + # Valid UUID + request = GetLocalScraperRequest(request_id="123e4567-e89b-12d3-a456-426614174000") + assert request.request_id == "123e4567-e89b-12d3-a456-426614174000" + + # Invalid UUID + with pytest.raises(ValidationError): + GetLocalScraperRequest(request_id="invalid-uuid") diff --git a/scrapegraph-py/uv.lock b/scrapegraph-py/uv.lock index 8214b93..1990785 100644 --- a/scrapegraph-py/uv.lock +++ b/scrapegraph-py/uv.lock @@ -1489,10 +1489,10 @@ version = "0.0.3" source = { editable = "." } dependencies = [ { name = "aiohttp" }, + { name = "beautifulsoup4" }, { name = "pydantic" }, { name = "python-dotenv" }, { name = "requests" }, - { name = "validators" }, ] [package.optional-dependencies] @@ -1526,12 +1526,12 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiohttp", specifier = ">=3.11.8" }, + { name = "beautifulsoup4", specifier = ">=4.12.3" }, { name = "furo", marker = "extra == 'docs'", specifier = "==2024.5.6" }, { name = "pydantic", specifier = ">=2.10.2" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "requests", specifier = ">=2.32.3" }, { name = "sphinx", marker = "extra == 'docs'", specifier = "==6.0" }, - { name = "validators", specifier = ">=0.34.0" }, ] [package.metadata.requires-dev] @@ -1730,15 +1730,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, ] -[[package]] -name = "validators" -version = "0.34.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/64/07/91582d69320f6f6daaf2d8072608a4ad8884683d4840e7e4f3a9dbdcc639/validators-0.34.0.tar.gz", hash = "sha256:647fe407b45af9a74d245b943b18e6a816acf4926974278f6dd617778e1e781f", size = 70955 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/78/36828a4d857b25896f9774c875714ba4e9b3bc8a92d2debe3f4df3a83d4f/validators-0.34.0-py3-none-any.whl", hash = "sha256:c804b476e3e6d3786fa07a30073a4ef694e617805eb1946ceee3fe5a9b8b1321", size = 43536 }, -] - [[package]] name = "virtualenv" version = "20.28.0"