# agenticAi/agents/web_browsing_agent.py
import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, List
from loguru import logger
from utils.llm_orchestrator import LLMOrchestrator
import asyncio
from urllib.parse import urljoin, urlparse
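# LLMOrchestrator (from utils/llm_orchestrator.py) supplies the generate_completion()
# calls used below for summarization and targeted information extraction.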
class WebBrowsingAgent:
def __init__(self, llm_api_key: str):
"""Initialize the Web Browsing Agent."""
logger.info("Initializing WebBrowsingAgent")
self.llm_orchestrator = LLMOrchestrator(llm_api_key)
self.session = None
self.setup_logger()
self.visited_urls = set()
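# A desktop-browser User-Agent; some sites block or alter responses for the
# default aiohttp client identifier.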
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
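# Capability tags; they are not used inside this module and are presumably
# read by an external orchestrator that routes tasks to agents.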
self.capabilities = [
"web_browsing",
"data_collection",
"content_processing",
"information_extraction",
"link_crawling"
]
def setup_logger(self):
"""Configure logging for the agent."""
logger.add("logs/web_browsing_agent.log", rotation="500 MB")
async def initialize(self):
"""Initialize the aiohttp session."""
logger.info("Initializing aiohttp session")
if not self.session:
self.session = aiohttp.ClientSession(headers=self.headers)
async def execute(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""Execute a web browsing task."""
logger.info(f"Executing task: {task}")
await self.initialize()
if 'url' not in task:
logger.error("URL not provided in task")
raise ValueError("URL not provided in task")
try:
content = await self.collect_data(task['url'])
processed_data = await self.process_content(content, task)
logger.info(f"Successfully executed task: {task}")
return {
'status': 'success',
'data': processed_data,
'url': task['url']
}
except Exception as e:
logger.error(f"Error executing task: {str(e)}")
return {
'status': 'error',
'error': str(e),
'url': task['url']
}
async def collect_data(self, url: str, retries: int = 3,
delay: int = 1) -> Dict[str, Any]:
"""Collect data from a URL with error handling and retries."""
for attempt in range(retries):
try:
async with self.session.get(url) as response:
if response.status == 200:
html = await response.text()
soup = BeautifulSoup(html, 'html.parser')
# Extract various types of content
text_content = soup.get_text(separator=' ', strip=True)
links = [
link.get('href') for link in soup.find_all(
'a', href=True)]
images = [
img.get('src') for img in soup.find_all(
'img', src=True)]
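# Unlike the links below, image srcs are kept exactly as found in the page,
# so they may still be relative URLs.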
# Process links to get absolute URLs
processed_links = [urljoin(url, link)
for link in links]
logger.info(f"Successfully collected data from {url}")
return {
'url': url,
'text_content': text_content,
'links': processed_links,
'images': images,
'status_code': response.status,
'headers': dict(response.headers)
}
else:
logger.error(
f"HTTP {response.status}: Failed to fetch {url} on attempt {attempt + 1}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP {response.status}: Failed to fetch {url} after multiple retries")
except aiohttp.ClientError as e:
logger.error(
f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"Network error: Failed to fetch {url} after multiple retries")
# HttpProcessingError is defined in aiohttp.http_exceptions; referencing it
# there avoids an AttributeError on aiohttp versions that don't re-export it
# at the top level.
except aiohttp.http_exceptions.HttpProcessingError as e:
logger.error(
f"HTTP processing error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP processing error: Failed to fetch {url} after multiple retries")
except Exception as e:
logger.error(
f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"Unexpected error: Failed to fetch {url} after multiple retries")
async def process_content(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Process collected content using LLM."""
logger.info(f"Processing content for {content['url']}")
try:
# Generate summary of the content
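# Only the first 1000 characters are sent, to keep the prompt small; longer
# pages are truncated rather than chunked.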
summary = await self.llm_orchestrator.generate_completion(
f"Summarize the following content:\n{content['text_content'][:1000]}..."
)
# Extract key information based on task requirements
extracted_info = await self.extract_relevant_information(content, task)
logger.info(f"Successfully processed content for {content['url']}")
return {
'summary': summary,
'extracted_info': extracted_info,
'metadata': {
'url': content['url'],
'num_links': len(content['links']),
'num_images': len(content['images'])
}
}
except Exception as e:
logger.error(f"Error processing content: {str(e)}")
raise
async def extract_relevant_information(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Extract relevant information based on task requirements."""
logger.info(f"Extracting relevant information for {content['url']}")
# Use LLM to extract specific information based on task requirements
prompt = f"""
Extract relevant information from the following content based on these requirements:
Task requirements: {task.get('requirements', 'general information')}
Content:
{content['text_content'][:1500]}...
"""
extracted_info = await self.llm_orchestrator.generate_completion(prompt)
logger.info(f"Successfully extracted information for {content['url']}")
return {'extracted_information': extracted_info}
async def crawl_links(self, base_url: str,
max_depth: int = 2) -> List[Dict[str, Any]]:
"""Crawl links starting from a base URL up to a maximum depth."""
logger.info(f"Crawling links from {base_url} up to depth {max_depth}")
results = []
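# Note: self.visited_urls persists across calls, so pages crawled by an
# earlier invocation are skipped on later ones.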
async def crawl(url: str, depth: int):
if depth > max_depth or url in self.visited_urls:
return
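# Mark the URL as visited before the first await so sibling coroutines
# scheduled by asyncio.gather don't fetch it twice.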
self.visited_urls.add(url)
try:
content = await self.collect_data(url)
results.append(content)
if depth < max_depth:
tasks = []
for link in content['links']:
    # Skip pages already seen and non-HTTP(S) links (mailto:, javascript:, etc.)
    if link in self.visited_urls or urlparse(link).scheme not in ('http', 'https'):
        continue
    tasks.append(crawl(link, depth + 1))
# Fetch all child pages of this page concurrently
await asyncio.gather(*tasks)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
await crawl(base_url, 0)
logger.info(f"Finished crawling links from {base_url}")
return results
async def shutdown(self):
"""Cleanup resources."""
logger.info("Shutting down WebBrowsingAgent")
if self.session:
await self.session.close()
self.session = None
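# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original agent). It assumes the
# module is run directly and that an LLM API key is provided through the
# hypothetical LLM_API_KEY environment variable; adjust names to your setup.
# It only demonstrates the intended lifecycle: execute() -> shutdown().
# ---------------------------------------------------------------------------
import os

async def _example_run():
    agent = WebBrowsingAgent(llm_api_key=os.environ.get("LLM_API_KEY", ""))
    try:
        result = await agent.execute({
            'url': 'https://example.com',
            'requirements': 'summarize the page and list its main topics'
        })
        print(result['status'], result.get('data') or result.get('error'))
    finally:
        await agent.shutdown()

if __name__ == "__main__":
    asyncio.run(_example_run())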