import os, sys
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
import asyncio
import time
import json
import re
from typing import Dict, List
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
    CosineStrategy,  # used by cosine_similarity_extraction() below
)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
# Basic Example - Simple Crawl
async def simple_crawl():
    print("\n--- Basic Usage ---")
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])
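# Content Cleaning Example - prune boilerplate and compare raw vs. fit markdown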
async def clean_content():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            ),
            options={"ignore_links": True},
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=crawler_config,
        )
        full_markdown_length = len(result.markdown_v2.raw_markdown)
        fit_markdown_length = len(result.markdown_v2.fit_markdown)
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")
async def link_analysis():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_social_media_links=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=crawler_config,
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")
        for link in result.links['internal'][:5]:
            print(f"Href: {link['href']}\nText: {link['text']}\n")
# JavaScript Execution Example
async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])
# CSS Selector Example
async def simple_example_with_css_selector():
    print("\n--- Using CSS Selectors ---")
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])
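# Media Handling Example - inspect scored images discovered during the crawl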
async def media_handling():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        exclude_external_images=True,
        screenshot=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=crawler_config,
        )
        for img in result.media['images'][:5]:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
async def custom_hook_workflow(verbose=True):
    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
        # (accept **kwargs so any extra hook arguments are tolerated)
        crawler.crawler_strategy.set_hook(
            "before_goto",
            lambda page, context, **kwargs: print("[Hook] Preparing to navigate..."),
        )
        # Perform the crawl operation
        result = await crawler.arun(url="https://crawl4ai.com")
        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
async def use_proxy():
    print("\n--- Using a Proxy ---")
    browser_config = BrowserConfig(
        headless=True,
        proxy_config={
            "server": "http://proxy.example.com:8080",
            "username": "username",
            "password": "password",
        },
    )
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        if result.success:
            print(result.markdown[:500])
# Screenshot Example
async def capture_and_save_screenshot(url: str, output_path: str):
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)
        if result.success and result.screenshot:
            import base64
            screenshot_data = base64.b64decode(result.screenshot)
            with open(output_path, "wb") as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
# LLM Extraction Example
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(
        ..., description="Fee for output token for the OpenAI model."
    )
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    print(f"\n--- Extracting Structured Data with {provider} ---")
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return
    browser_config = BrowserConfig(headless=True)
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/", config=crawler_config
        )
        print(result.extracted_content)
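# Note: provider strings follow LiteLLM naming, e.g. "openai/gpt-4o"; a local model
# such as "ollama/llama3" can be passed without an API token (the exact model name
# is illustrative and depends on your local setup).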
# CSS Extraction Example
async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .w-tab-content > div",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }
    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
        for(let tab of tabs) {
            tab.scrollIntoView();
            tab.click();
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology", config=crawler_config
        )
        courses = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(courses)} course entries")
        print(json.dumps(courses[0], indent=2))
# Dynamic Content Examples - Method 1: wait for new content from a Python hook
async def crawl_dynamic_content_pages_method_1():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""
    async def on_execution_started(page, **kwargs):
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await commit.evaluate("(element) => element.textContent")
                commit = re.sub(r"\s+", "", commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
    browser_config = BrowserConfig(headless=False, java_script_enabled=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """
        for page in range(3):
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if page > 0 else None,
                js_only=page > 0,
                session_id=session_id,
            )
            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page + 1}"
            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Dynamic Content Examples - Method 2: wait for new content inside the injected JavaScript
async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    browser_config = BrowserConfig(headless=False, java_script_enabled=True)
    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };
        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        while (true) {
            await new Promise(resolve => setTimeout(resolve, 100));
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [
            {
                "name": "title",
                "selector": "h4.markdown-title",
                "type": "text",
                "transform": "strip",
            },
        ],
    }
    async with AsyncWebCrawler(config=browser_config) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        extraction_strategy = JsonCssExtractionStrategy(schema)
        for page in range(3):
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
                session_id=session_id,
            )
            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page + 1}"
            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def cosine_similarity_extraction():
    crawl_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,  # Maximum distance between two words
            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
            top_k=3,  # Number of top keywords to extract
            sim_threshold=0.3,  # Similarity threshold for clustering
            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
            verbose=True,
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
            config=crawl_config,
        )
        print(json.loads(result.extracted_content)[:5])
# Browser Comparison
async def crawl_custom_browser_type():
    print("\n--- Browser Comparison ---")
    # Firefox
    browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Firefox:", time.time() - start)
        print(result.markdown[:500])
    # WebKit
    browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("WebKit:", time.time() - start)
        print(result.markdown[:500])
    # Chromium (default)
    browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Chromium:", time.time() - start)
        print(result.markdown[:500])
# Anti-Bot and User Simulation
async def crawl_with_user_simulation():
    browser_config = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    )
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)
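# SSL Certificate Example - fetch the site's certificate and export it in several formats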
async def ssl_certification():
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )
    # Directory for exported certificate files
    tmp_dir = os.path.join(__location__, "tmp")
    os.makedirs(tmp_dir, exist_ok=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate
            # 1. Access certificate properties directly
            print("\nCertificate Information:")
            print(f"Issuer: {cert.issuer.get('CN', '')}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")
            # 2. Export certificate in different formats
            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
            print("\nCertificate exported to:")
            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
            pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
            der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
# Speed Comparison
async def speed_comparison():
    print("\n--- Speed Comparison ---")
    # Firecrawl comparison
    from firecrawl import FirecrawlApp
    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    start = time.time()
    scrape_status = app.scrape_url(
        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
    print()
    # Crawl4AI comparisons
    browser_config = BrowserConfig(headless=True)
    # Simple crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS, word_count_threshold=0
            ),
        )
        end = time.time()
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()
        # Advanced filtering
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=0,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(
                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
                    )
                ),
            ),
        )
        end = time.time()
        print("Crawl4AI (Markdown Plus):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()
# Main execution
async def main():
    # Basic examples
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
    # Advanced examples
    # await extract_structured_data_using_css_extractor()
    await extract_structured_data_using_llm(
        "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
    )
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    # Browser comparisons
    # await crawl_custom_browser_type()
    # Performance testing
    # await speed_comparison()
    # Screenshot example
    # await capture_and_save_screenshot(
    #     "https://www.example.com",
    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
    # )
if __name__ == "__main__":
    asyncio.run(main())