""" | |
Example demonstrating different extraction strategies with various input formats. | |
This example shows how to: | |
1. Use different input formats (markdown, HTML, fit_markdown) | |
2. Work with JSON-based extractors (CSS and XPath) | |
3. Use LLM-based extraction with different input formats | |
4. Configure browser and crawler settings properly | |
""" | |

import asyncio
import os
from typing import Dict, Any

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy
)
# Imported for reference alongside the extraction strategies; not used directly below
from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Helper function to run extraction with proper configuration"""
    try:
        # Configure the crawler run settings
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
            )
        )

        # Run the crawler
        result = await crawler.arun(url=url, config=config)

        if result.success:
            print(f"\n=== {name} Results ===")
            print(f"Extracted Content: {result.extracted_content}")
            print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
        else:
            print(f"Error in {name}: Crawl failed")
    except Exception as e:
        print(f"Error in {name}: {str(e)}")


async def main():
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(
        headless=True,
        verbose=True
    )

    # Initialize extraction strategies
    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Extract product information including name, price, and description"
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Extract product information from HTML including structured data"
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Extract product information from cleaned markdown"
    )
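
    # Note on input_format (based on crawl4ai's documented behavior; treat as an
    # assumption for your installed version): "markdown" is the default input sent
    # to the LLM, "html" sends the cleaned HTML instead, and "fit_markdown" uses
    # the filtered markdown produced by the PruningContentFilter configured in
    # run_extraction().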

    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "description", "selector": ".description", "type": "text"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
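
    # Sketch: the same schema format also supports attribute extraction; a
    # hypothetical image field (selector and attribute names are assumptions)
    # could look like:
    # {"name": "image", "selector": "img.product-image", "type": "attribute", "attribute": "src"}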

    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
        ]
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
        await run_extraction(crawler, url, html_strategy, "HTML LLM")
        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")


if __name__ == "__main__":
    asyncio.run(main())