Skip to content

feat: add infinite scroll #37

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions scrapegraph-py/examples/async/smartscraper_infinite_scroll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Example of using SmartScraper with infinite scrolling in asynchronous mode.
This example demonstrates how to scrape content from multiple webpages concurrently using infinite scrolling.
"""

import asyncio
from scrapegraph_py import AsyncClient
from scrapegraph_py.logger import sgai_logger

# Set up logging
sgai_logger.set_logging(level="INFO")

async def scrape_with_infinite_scroll(client: AsyncClient, url: str, prompt: str, max_pages: int = 10):
    """Run a single infinite-scroll smartscraper request and return its response.

    Args:
        client: An open AsyncClient to issue the request through.
        url: The page to scrape.
        prompt: Natural-language extraction instructions.
        max_pages: Upper bound on pages loaded while scrolling (default 10).
    """
    # Gather the request arguments once, then await the API call directly.
    request_kwargs = {
        "website_url": url,
        "user_prompt": prompt,
        "infinite_scrolling": True,
        "max_pages": max_pages,
    }
    return await client.smartscraper(**request_kwargs)

async def main():
    """Run the infinite-scroll examples: three concurrent scrapes, then one static scrape."""
    # Initialize the async client with your API key; the async context manager
    # guarantees the underlying session is closed even if a request fails.
    async with AsyncClient(api_key="your-api-key-here") as sgai_client:
        # Example 1: Scrape multiple pages concurrently.
        # Each (url, prompt, max_pages) triple becomes one scraping task.
        jobs = [
            ("https://example.com/products", "Extract all product names and prices", 20),
            ("https://example.com/articles", "Extract all article titles and authors", 15),
            ("https://example.com/news", "Extract all news headlines and dates", 10),
        ]
        tasks = [
            scrape_with_infinite_scroll(sgai_client, url, prompt, max_pages=pages)
            for url, prompt, pages in jobs
        ]

        # Wait for all scraping tasks to complete
        results = await asyncio.gather(*tasks)

        # Process and print results
        for i, result in enumerate(results, 1):
            print(f"\nExample {i} Results:")
            print(f"Request ID: {result['request_id']}")
            print(f"Result: {result['result']}")

        # Example 2: Single page without infinite scrolling
        response = await sgai_client.smartscraper(
            website_url="https://example.com/static-page",
            user_prompt="Extract the main heading and first paragraph",
            infinite_scrolling=False
        )
        # Bug fix: this is Example 2, but the label previously said "Example 4".
        print("\nExample 2 - Without infinite scrolling:")
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")

# Script entry point: run the async demo under the asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())
33 changes: 33 additions & 0 deletions scrapegraph-py/examples/sync/smartscraper_infinite_scroll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Example of using SmartScraper with infinite scrolling in synchronous mode.
This example demonstrates how to scrape content from a webpage that requires scrolling to load more content.
"""

from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
import time

# Set up logging
sgai_logger.set_logging(level="INFO")

def main():
    """Demonstrate SmartScraper with infinite scrolling in synchronous mode."""
    # Initialize the client with your API key
    sgai_client = Client(api_key="your-api-key-here")

    try:
        # Scroll through up to 10 pages of the listing before extracting.
        response1 = sgai_client.smartscraper(
            website_url="https://www.ycombinator.com/companies",
            user_prompt="Extract all the companies and their info",
            infinite_scrolling=True,
            max_pages=10,
        )

        print("\nExample 1 - Basic infinite scrolling:")
        print(f"Request ID: {response1['request_id']}")
        print(f"Result: {response1['result']}")
    finally:
        # Bug fix: close the client even when the request raises, so the
        # underlying HTTP session is never leaked.
        sgai_client.close()

# Script entry point: run the synchronous demo.
if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions scrapegraph-py/scrapegraph_py/async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ async def smartscraper(
website_html: Optional[str] = None,
headers: Optional[dict[str, str]] = None,
output_schema: Optional[BaseModel] = None,
infinite_scrolling: bool = False,
max_pages: int = 10,
):
"""Send a smartscraper request"""
logger.info("🔍 Starting smartscraper request")
Expand All @@ -184,13 +186,17 @@ async def smartscraper(
if headers:
logger.debug("🔧 Using custom headers")
logger.debug(f"📝 Prompt: {user_prompt}")
if infinite_scrolling:
logger.debug(f"🔄 Infinite scrolling enabled with max_pages={max_pages}")

request = SmartScraperRequest(
website_url=website_url,
website_html=website_html,
headers=headers,
user_prompt=user_prompt,
output_schema=output_schema,
infinite_scrolling=infinite_scrolling,
max_pages=max_pages,
)
logger.debug("✅ Request validation passed")

Expand Down
6 changes: 6 additions & 0 deletions scrapegraph-py/scrapegraph_py/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ def smartscraper(
website_html: Optional[str] = None,
headers: Optional[dict[str, str]] = None,
output_schema: Optional[BaseModel] = None,
infinite_scrolling: bool = False,
max_pages: int = 10,
):
"""Send a smartscraper request"""
logger.info("🔍 Starting smartscraper request")
Expand All @@ -192,13 +194,17 @@ def smartscraper(
if headers:
logger.debug("🔧 Using custom headers")
logger.debug(f"📝 Prompt: {user_prompt}")
if infinite_scrolling:
logger.debug(f"🔄 Infinite scrolling enabled with max_pages={max_pages}")

request = SmartScraperRequest(
website_url=website_url,
website_html=website_html,
headers=headers,
user_prompt=user_prompt,
output_schema=output_schema,
infinite_scrolling=infinite_scrolling,
max_pages=max_pages,
)
logger.debug("✅ Request validation passed")

Expand Down
10 changes: 9 additions & 1 deletion scrapegraph-py/scrapegraph_py/models/smartscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from uuid import UUID

from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, Field, model_validator, conint


class SmartScraperRequest(BaseModel):
Expand All @@ -28,6 +28,14 @@ class SmartScraperRequest(BaseModel):
},
description="Optional headers to send with the request, including cookies and user agent",
)
infinite_scrolling: bool = Field(
default=False,
description="Enable infinite scrolling to load more content dynamically",
)
max_pages: conint(ge=1, le=1000) = Field(
default=10,
description="Maximum number of pages to scroll when infinite_scrolling is enabled",
)
output_schema: Optional[Type[BaseModel]] = None

@model_validator(mode="after")
Expand Down
2 changes: 1 addition & 1 deletion scrapegraph-py/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.