|
# LLM Extraction with AsyncWebCrawler |
|
|
|
Crawl4AI's `AsyncWebCrawler` allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. The examples below demonstrate how to use `LLMExtractionStrategy` for different purposes with the `AsyncWebCrawler`.
|
|
|
## Example 1: Extract Structured Data |
|
|
|
In this example, we use the `LLMExtractionStrategy` to extract structured data (model names and their fees) from the OpenAI pricing page. |
|
|
|
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input tokens for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output tokens for the OpenAI model.")

async def extract_openai_fees():
    url = 'https://openai.com/api/pricing/'

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",  # Or use ollama like provider="ollama/nemotron"
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their "
                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
                'One extracted model JSON format should look like this: '
                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
            ),
            bypass_cache=True,
        )

        model_fees = json.loads(result.extracted_content)
        print(f"Number of models extracted: {len(model_fees)}")

        os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
        with open(".data/openai_fees.json", "w", encoding="utf-8") as f:
            json.dump(model_fees, f, indent=2)

asyncio.run(extract_openai_fees())
```
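
Because the extraction is schema-driven, you can round-trip the parsed JSON through the same Pydantic model to catch malformed items before using them downstream. Here is a minimal sketch, assuming `extracted_content` parsed into a list of dicts matching the schema (the `validate_fees` helper is illustrative, not part of Crawl4AI):

```python
from pydantic import ValidationError

def validate_fees(raw_items: list[dict]) -> list[OpenAIModelFee]:
    """Validate extracted items against the schema, skipping malformed ones."""
    validated = []
    for item in raw_items:
        try:
            validated.append(OpenAIModelFee.model_validate(item))
        except ValidationError as e:
            print(f"Skipping malformed item: {e}")
    return validated

fees = validate_fees(model_fees)
print(f"{len(fees)} of {len(model_fees)} items passed validation")
```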
|
|
|
## Example 2: Extract Relevant Content |
|
|
|
In this example, we instruct the LLM to extract only content related to technology from the NBC News business page. |
|
|
|
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_tech_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Extract only content related to technology"
            ),
            bypass_cache=True,
        )

        tech_content = json.loads(result.extracted_content)
        print(f"Number of tech-related items extracted: {len(tech_content)}")

        os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
        with open(".data/tech_content.json", "w", encoding="utf-8") as f:
            json.dump(tech_content, f, indent=2)

asyncio.run(extract_tech_content())
```
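
When no schema is supplied, `LLMExtractionStrategy` returns the LLM's output as JSON, but the exact shape of each item can vary between Crawl4AI versions. Before relying on specific field names, it can help to preview a few items first; a small inspection helper (our own, purely illustrative):

```python
def preview_items(items, n=3):
    """Print the first few extracted items so you can see their structure."""
    for i, item in enumerate(items[:n]):
        print(f"--- item {i} ---")
        print(json.dumps(item, indent=2)[:400])  # truncate long items

preview_items(tech_content)
```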
|
|
|
## Advanced Usage: Combining JS Execution with LLM Extraction |
|
|
|
This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content: |
|
|
|
```python
import os
import json
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_dynamic_content():
    # Click the "Load More" button (if present) and give new content time to render.
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    if (loadMoreButton) {
        loadMoreButton.click();
        await new Promise(resolve => setTimeout(resolve, 2000));
    }
    """

    # Wait until more than 10 article cards are present before extracting.
    wait_for = """
    () => {
        const articles = document.querySelectorAll('article.tease-card');
        return articles.length > 10;
    }
    """

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            wait_for=wait_for,
            css_selector="article.tease-card",
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                instruction="Summarize each article, focusing on technology-related content"
            ),
            bypass_cache=True,
        )

        summaries = json.loads(result.extracted_content)
        print(f"Number of summarized articles: {len(summaries)}")

        os.makedirs(".data", exist_ok=True)  # ensure the output directory exists
        with open(".data/tech_summaries.json", "w", encoding="utf-8") as f:
            json.dump(summaries, f, indent=2)

asyncio.run(extract_dynamic_content())
```
|
|
|
## Customizing LLM Provider |
|
|
|
Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token: |
|
|
|
```python
extraction_strategy=LLMExtractionStrategy(
    provider="your_llm_provider/model_name",
    api_token="your_api_token",
    instruction="Your extraction instruction"
)
```
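
For instance, a local model served by Ollama or a hosted Anthropic model can be plugged in using litellm's `provider/model` naming. The model names below are illustrative; substitute whatever models you actually have access to:

```python
# Local model via an Ollama server (model name is an example;
# a local server typically does not check the API token).
local_strategy = LLMExtractionStrategy(
    provider="ollama/llama3",
    api_token="no-token-needed",
    instruction="Extract the main points of the page"
)

# Hosted provider, e.g. Anthropic (model name is an example).
hosted_strategy = LLMExtractionStrategy(
    provider="anthropic/claude-3-opus-20240229",
    api_token=os.getenv("ANTHROPIC_API_KEY"),
    instruction="Extract the main points of the page"
)
```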
|
|
|
This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs. |
|
|
|
## Error Handling and Retries |
|
|
|
When working with external LLM APIs, it's important to handle potential errors and implement retry logic. Here's an example of how you might do this: |
|
|
|
```python
import os
import json
import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class LLMExtractionError(Exception):
    pass

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def extract_with_retry(crawler, url, extraction_strategy):
    try:
        result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
        return json.loads(result.extracted_content)
    except Exception as e:
        raise LLMExtractionError(f"Failed to extract content: {str(e)}") from e

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        try:
            content = await extract_with_retry(
                crawler,
                "https://www.example.com",
                LLMExtractionStrategy(
                    provider="openai/gpt-4o",
                    api_token=os.getenv('OPENAI_API_KEY'),
                    instruction="Extract and summarize main points"
                )
            )
            print("Extracted content:", content)
        except LLMExtractionError as e:
            print(f"Extraction failed after retries: {e}")

asyncio.run(main())
```
|
|
|
This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which can help handle temporary failures or rate limiting from the LLM API. |
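
If you prefer to retry only on failures that are plausibly transient (timeouts, rate limits) rather than on every exception, `tenacity` can be restricted to specific exception types. A sketch along the same lines, where `TransientLLMError` is our own marker exception:

```python
import json
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

class TransientLLMError(Exception):
    """Raised for failures worth retrying, such as timeouts or rate limits."""

@retry(
    retry=retry_if_exception_type(TransientLLMError),  # only retry transient errors
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
async def extract_transient_aware(crawler, url, extraction_strategy):
    result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True)
    if not result.success:
        # Treat crawl-level failures as retryable; any other exception
        # propagates immediately and is not retried.
        raise TransientLLMError(f"Crawl failed: {result.error_message}")
    return json.loads(result.extracted_content)
```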