Complete Parameter Guide for arun()
The following parameters can be passed to the arun() method. They are organized by their primary usage context and functionality.
Core Parameters
await crawler.arun(
    url="https://example.com",     # Required: URL to crawl
    verbose=True,                  # Enable detailed logging
    cache_mode=CacheMode.ENABLED,  # Control cache behavior
    warmup=True                    # Whether to run warmup check
)
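For context, here is a minimal end-to-end sketch showing where arun() fits, assuming the usual AsyncWebCrawler async context manager and the result's markdown attribute:

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def main():
    # The context manager handles browser startup and teardown
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            cache_mode=CacheMode.ENABLED
        )
        print(result.markdown[:500])  # First 500 chars of the extracted markdown

asyncio.run(main())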
Cache Control
from crawl4ai import CacheMode

await crawler.arun(
    cache_mode=CacheMode.ENABLED,  # Normal caching (read/write)
    # Other cache modes:
    # cache_mode=CacheMode.DISABLED    # No caching at all
    # cache_mode=CacheMode.READ_ONLY   # Only read from cache
    # cache_mode=CacheMode.WRITE_ONLY  # Only write to cache
    # cache_mode=CacheMode.BYPASS      # Skip cache for this operation
)
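As a sketch of how these modes combine in practice, a first call can populate the cache and later calls can reuse or skip it (the URL is a placeholder):

# First pass: fetch the page and store it (reads and writes cache)
await crawler.arun(url="https://example.com", cache_mode=CacheMode.ENABLED)

# Later pass: serve from cache only, never write
await crawler.arun(url="https://example.com", cache_mode=CacheMode.READ_ONLY)

# Force a fresh fetch for this call, skipping the cache entirely
await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)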
Content Processing Parameters
Text Processing
await crawler.arun(
    word_count_threshold=10,                 # Minimum words per content block
    image_description_min_word_threshold=5,  # Minimum words for image descriptions
    only_text=False,                         # Extract only text content
    excluded_tags=['form', 'nav'],           # HTML tags to exclude
    keep_data_attributes=False,              # Preserve data-* attributes
)
Content Selection
await crawler.arun(
    css_selector=".main-content",  # CSS selector for content extraction
    remove_forms=True,             # Remove all form elements
    remove_overlay_elements=True,  # Remove popups/modals/overlays
)
Link Handling
await crawler.arun(
    exclude_external_links=True,          # Remove external links
    exclude_social_media_links=True,      # Remove social media links
    exclude_external_images=True,         # Remove external images
    exclude_domains=["ads.example.com"],  # Specific domains to exclude
    social_media_domains=[                # Additional social media domains
        "facebook.com",
        "twitter.com",
        "instagram.com"
    ]
)
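The links that survive filtering can then be inspected on the result object; a sketch, assuming the links attribute with "internal"/"external" buckets exposed on the crawl result:

result = await crawler.arun(
    url="https://example.com",
    exclude_external_links=True,
    exclude_domains=["ads.example.com"]
)
# result.links groups the links that passed the filters
for link in result.links.get("internal", []):
    print(link["href"], "-", link.get("text", ""))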
Browser Control Parameters
Basic Browser Settings
await crawler.arun(
    headless=True,              # Run browser in headless mode
    browser_type="chromium",    # Browser engine: "chromium", "firefox", "webkit"
    page_timeout=60000,         # Page load timeout in milliseconds
    user_agent="custom-agent",  # Custom user agent string
)
Navigation and Waiting
await crawler.arun(
    wait_for="css:.dynamic-content",  # Wait for an element/condition before returning
    delay_before_return_html=2.0,     # Extra wait before returning HTML (seconds)
)
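Besides CSS selectors, wait_for accepts a js: prefix for an arbitrary JavaScript predicate; a sketch (the selector and count are illustrative):

# Wait until a JS condition evaluates to true instead of matching a CSS selector
await crawler.arun(
    url="https://example.com",
    wait_for="js:() => document.querySelectorAll('.item').length > 10"
)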
JavaScript Execution
await crawler.arun(
    js_code=[  # JavaScript to execute (string or list of strings)
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more').click();"
    ],
    js_only=False,  # If True, run js_code in the existing page without re-navigating (use with session_id)
)
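Combining js_code with wait_for is the usual way to pick up content behind a "load more" button; a sketch with hypothetical selectors:

await crawler.arun(
    url="https://example.com/articles",     # hypothetical listing page
    js_code="document.querySelector('.load-more').click();",
    wait_for="css:.article:nth-child(20)",  # wait for the new batch to render
    delay_before_return_html=1.0
)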
Anti-Bot Features
await crawler.arun(
    magic=True,               # Enable all anti-detection features
    simulate_user=True,       # Simulate human-like behavior
    override_navigator=True   # Override navigator properties
)
Session Management
await crawler.arun(
    session_id="my_session",  # Session identifier for persistent browsing
)
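A session keeps the same browser tab alive across calls, so a follow-up call can run JavaScript in the already-loaded page with js_only=True. A sketch; the kill_session cleanup call is an assumption based on the project's session examples:

session_id = "my_session"

# First call: navigate and render the page normally
result1 = await crawler.arun(url="https://example.com/page/1", session_id=session_id)

# Second call: reuse the same tab, run JS only, no re-navigation
result2 = await crawler.arun(
    url="https://example.com/page/1",
    session_id=session_id,
    js_code="document.querySelector('.next-page').click();",
    js_only=True
)

# Release the session when done (assumed cleanup hook)
await crawler.crawler_strategy.kill_session(session_id)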
Screenshot Options
await crawler.arun(
    screenshot=True,          # Take a page screenshot
    screenshot_wait_for=2.0,  # Wait before the screenshot (seconds)
)
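The screenshot comes back base64-encoded on the result; a sketch for saving it to disk, assuming result.screenshot holds the base64 string:

import base64

result = await crawler.arun(url="https://example.com", screenshot=True)
if result.screenshot:
    with open("page.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))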
Proxy Configuration
await crawler.arun(
    proxy="http://proxy.example.com:8080",  # Simple proxy URL
    proxy_config={                          # Advanced proxy settings
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    }
)
Content Extraction Parameters
Extraction Strategy
from crawl4ai.extraction_strategy import LLMExtractionStrategy

await crawler.arun(
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",
        schema=MySchema.schema(),
        instruction="Extract specific data"
    )
)
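A fuller sketch with a Pydantic schema and the JSON string that comes back in result.extracted_content; MySchema and its fields are illustrative:

import json
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class MySchema(BaseModel):  # hypothetical target structure
    title: str
    price: float

result = await crawler.arun(
    url="https://example.com/product",  # hypothetical page
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",
        schema=MySchema.schema(),
        instruction="Extract the product title and price"
    )
)
data = json.loads(result.extracted_content)  # extracted_content is a JSON string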
Chunking Strategy
from crawl4ai.chunking_strategy import RegexChunking

await crawler.arun(
    chunking_strategy=RegexChunking(
        patterns=[r'\n\n', r'\.\s+']
    )
)
HTML to Text Options
await crawler.arun(
    html2text={
        "ignore_links": False,
        "ignore_images": False,
        "escape_dot": False,
        "body_width": 0,
        "protect_links": True,
        "unicode_snob": True
    }
)
Debug Options
await crawler.arun(
    log_console=True,  # Log browser console messages
)
Parameter Interactions and Notes
Cache and Performance Setup
# Optimal caching for repeated crawls
await crawler.arun(
    cache_mode=CacheMode.ENABLED,
    word_count_threshold=10,
    process_iframes=False
)
Dynamic Content Handling
# Handle lazy-loaded content
await crawler.arun(
    js_code="window.scrollTo(0, document.body.scrollHeight);",
    wait_for="css:.lazy-content",
    delay_before_return_html=2.0,
    cache_mode=CacheMode.WRITE_ONLY  # Cache results after the dynamic load
)
Content Extraction Pipeline
# Complete extraction setup
await crawler.arun(
    css_selector=".main-content",
    word_count_threshold=20,
    extraction_strategy=my_strategy,
    chunking_strategy=my_chunking,
    process_iframes=True,
    remove_overlay_elements=True,
    cache_mode=CacheMode.ENABLED
)
Best Practices
Performance Optimization
await crawler.arun(
    cache_mode=CacheMode.ENABLED,  # Use full caching
    word_count_threshold=10,       # Filter out noise
    process_iframes=False          # Skip iframes if not needed
)
Reliable Scraping
await crawler.arun(
    magic=True,                      # Enable anti-detection
    delay_before_return_html=1.0,    # Wait for dynamic content
    page_timeout=60000,              # Longer timeout for slow pages
    cache_mode=CacheMode.WRITE_ONLY  # Cache results after a successful crawl
)
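Reliability also means checking the outcome; a sketch, assuming the success and error_message attributes on the crawl result:

result = await crawler.arun(
    url="https://example.com",
    magic=True,
    page_timeout=60000
)
if not result.success:
    print(f"Crawl failed: {result.error_message}")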
Clean Content
await crawler.arun(
    remove_overlay_elements=True,    # Remove popups
    excluded_tags=['nav', 'aside'],  # Remove unnecessary elements
    keep_data_attributes=False,      # Remove data attributes
    cache_mode=CacheMode.ENABLED     # Use cache for faster processing
)