# Page Interaction

Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.

## JavaScript Execution

### Basic Execution

```python
from crawl4ai.async_configs import CrawlerRunConfig |
|
|
|
# Single JavaScript command |
|
config = CrawlerRunConfig( |
|
js_code="window.scrollTo(0, document.body.scrollHeight);" |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
|
|
# Multiple commands |
|
js_commands = [ |
|
"window.scrollTo(0, document.body.scrollHeight);", |
|
"document.querySelector('.load-more').click();", |
|
"document.querySelector('#consent-button').click();" |
|
] |
|
config = CrawlerRunConfig(js_code=js_commands) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

## Wait Conditions

### CSS-Based Waiting

Wait for elements to appear:

```python
config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content' |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

### JavaScript-Based Waiting

Wait for custom conditions:

```python
# Wait for number of elements |
|
wait_condition = """() => { |
|
return document.querySelectorAll('.item').length > 10; |
|
}""" |
|
|
|
config = CrawlerRunConfig(wait_for=f"js:{wait_condition}") |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
|
|
# Wait for dynamic content to load |
|
wait_for_content = """() => { |
|
const content = document.querySelector('.content'); |
|
return content && content.innerText.length > 100; |
|
}""" |
|
|
|
config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}") |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

## Handling Dynamic Content

### Load More Content

Handle infinite scroll or load more buttons:

```python
config = CrawlerRunConfig( |
|
js_code=[ |
|
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom |
|
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more |
|
], |
|
wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

### Form Interaction

Handle forms and inputs:

```python
js_form_interaction = """ |
|
document.querySelector('#search').value = 'search term'; // Fill form fields |
|
document.querySelector('form').submit(); // Submit form |
|
""" |
|
|
|
config = CrawlerRunConfig( |
|
js_code=js_form_interaction, |
|
wait_for="css:.results" # Wait for results to load |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

## Timing Control

### Delays and Timeouts

Control timing of interactions:

```python
config = CrawlerRunConfig( |
|
page_timeout=60000, # Page load timeout (ms) |
|
delay_before_return_html=2.0 # Wait before capturing content |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```

## Complex Interactions Example

Here's an example of handling a dynamic page with multiple interactions:

```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


async def crawl_dynamic_content():
    """Load a dynamic page, then trigger "load more" three times in one session.

    The initial load dismisses the cookie banner and waits for the main
    content. Each following step records the current '.item' count, scrolls,
    clicks the '.load-more' button if present, and waits until the count grows.
    """
    session_id = "dynamic_session"  # Keep session for multiple interactions

    # True once the page holds more '.item' elements than were counted
    # (in window.previousCount) just before the load was triggered.
    wait_for_new_items = """js:() => {
        const currentCount = document.querySelectorAll('.item').length;
        return currentCount > window.previousCount;
    }"""

    async with AsyncWebCrawler() as crawler:
        # Initial page load; it joins the session so the loop below continues
        # on this already-loaded page instead of loading it a second time.
        config = CrawlerRunConfig(
            session_id=session_id,
            js_code="document.querySelector('.cookie-accept')?.click();",  # Handle cookie consent
            wait_for="css:.main-content",
        )
        result = await crawler.arun(url="https://example.com", config=config)

        # Load more content
        for page in range(3):  # Load 3 pages of content
            config = CrawlerRunConfig(
                session_id=session_id,
                js_code=[
                    # Store the count before scrolling: on infinite-scroll
                    # pages the scroll itself may already add items.
                    "window.previousCount = document.querySelectorAll('.item').length;",
                    "window.scrollTo(0, document.body.scrollHeight);",  # Scroll to bottom
                    "document.querySelector('.load-more')?.click();",  # Click load more
                ],
                wait_for=wait_for_new_items,
                js_only=True,  # Execute JS in the session page without reloading it
            )
            result = await crawler.arun(url="https://example.com", config=config)
            # cleaned_html is a string, so this is its length in characters,
            # not the number of items on the page.
            print(f"Page {page + 1} cleaned HTML length:", len(result.cleaned_html or ""))

        # Clean up session
        await crawler.crawler_strategy.kill_session(session_id)
```

## Using with Extraction Strategies

Combine page interaction with structured extraction:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy |
|
from crawl4ai.async_configs import CrawlerRunConfig |
|
|
|
# Pattern-based extraction after interaction |
|
schema = { |
|
"name": "Dynamic Items", |
|
"baseSelector": ".item", |
|
"fields": [ |
|
{"name": "title", "selector": "h2", "type": "text"}, |
|
{"name": "description", "selector": ".desc", "type": "text"} |
|
] |
|
} |
|
|
|
config = CrawlerRunConfig( |
|
js_code="window.scrollTo(0, document.body.scrollHeight);", |
|
wait_for="css:.item:nth-child(10)", # Wait for 10 items |
|
extraction_strategy=JsonCssExtractionStrategy(schema) |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
|
|
# Or use LLM to analyze dynamic content |
|
class ContentAnalysis(BaseModel): |
|
topics: List[str] |
|
summary: str |
|
|
|
config = CrawlerRunConfig( |
|
js_code="document.querySelector('.show-more').click();", |
|
wait_for="css:.full-content", |
|
extraction_strategy=LLMExtractionStrategy( |
|
provider="ollama/nemotron", |
|
schema=ContentAnalysis.schema(), |
|
instruction="Analyze the full content" |
|
) |
|
) |
|
result = await crawler.arun(url="https://example.com", config=config) |
|
```