# File: async_webcrawler_multiple_urls_example.py
import os, sys

# Append the directory two levels above this file to sys.path so that
# crawl4ai can be imported when running this example from the repository.
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # Initialize the AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com"
        ]

        # Set up crawling parameters
        word_count_threshold = 100

        # Run the crawling process for multiple URLs
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True
        )

        # Process the results
        for result in results:
            if result.success:
                print(f"Successfully crawled: {result.url}")
                print(f"Title: {result.metadata.get('title', 'N/A')}")
                print(f"Word count: {len(result.markdown.split())}")
                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
                print(f"Number of images: {len(result.media.get('images', []))}")
                print("---")
            else:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")

if __name__ == "__main__":
    asyncio.run(main())
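
# Alternative sketch (an assumption, not part of the original example): if
# each URL needs its own parameters, the same batch can be expressed with
# individual `arun` calls gathered concurrently instead of `arun_many`.
#
# async def crawl_individually(urls):
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         tasks = [crawler.arun(url=url, bypass_cache=True) for url in urls]
#         return await asyncio.gather(*tasks)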