# agenticAi/agents/web_browsing_agent.py
import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, List
from loguru import logger
from utils.llm_orchestrator import LLMOrchestrator
import asyncio
from urllib.parse import urljoin, urlparse
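# LLMOrchestrator (from utils/llm_orchestrator.py) supplies the generate_completion()
# calls used below for summarization and targeted information extraction.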
class WebBrowsingAgent:
def __init__(self, llm_api_key: str):
"""Initialize the Web Browsing Agent."""
logger.info("Initializing WebBrowsingAgent")
self.llm_orchestrator = LLMOrchestrator(llm_api_key)
self.session = None
self.setup_logger()
self.visited_urls = set()
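# A desktop-browser User-Agent; some sites block or alter responses for the
# default aiohttp client identifier.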
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
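# Capability tags; they are not used inside this module and are presumably
# read by an external orchestrator that routes tasks to agents.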
self.capabilities = [
"web_browsing",
"data_collection",
"content_processing",
"information_extraction",
"link_crawling"
]
def setup_logger(self):
"""Configure logging for the agent."""
logger.add("logs/web_browsing_agent.log", rotation="500 MB")
async def initialize(self):
"""Initialize the aiohttp session."""
logger.info("Initializing aiohttp session")
if not self.session:
self.session = aiohttp.ClientSession(headers=self.headers)
async def execute(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""Execute a web browsing task."""
logger.info(f"Executing task: {task}")
await self.initialize()
if 'url' not in task:
logger.error("URL not provided in task")
raise ValueError("URL not provided in task")
try:
content = await self.collect_data(task['url'])
processed_data = await self.process_content(content, task)
logger.info(f"Successfully executed task: {task}")
return {
'status': 'success',
'data': processed_data,
'url': task['url']
}
except Exception as e:
logger.error(f"Error executing task: {str(e)}")
return {
'status': 'error',
'error': str(e),
'url': task['url']
}
async def collect_data(self, url: str, retries: int = 3,
delay: int = 1) -> Dict[str, Any]:
"""Collect data from a URL with error handling and retries."""
for attempt in range(retries):
try:
async with self.session.get(url) as response:
if response.status == 200:
html = await response.text()
soup = BeautifulSoup(html, 'html.parser')
# Extract various types of content
text_content = soup.get_text(separator=' ', strip=True)
links = [
link.get('href') for link in soup.find_all(
'a', href=True)]
images = [
img.get('src') for img in soup.find_all(
'img', src=True)]
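# Unlike the links below, image srcs are kept exactly as found in the page,
# so they may still be relative URLs.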
# Process links to get absolute URLs
processed_links = [urljoin(url, link)
for link in links]
logger.info(f"Successfully collected data from {url}")
return {
'url': url,
'text_content': text_content,
'links': processed_links,
'images': images,
'status_code': response.status,
'headers': dict(response.headers)
}
else:
logger.error(
f"HTTP {response.status}: Failed to fetch {url} on attempt {attempt + 1}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP {response.status}: Failed to fetch {url} after multiple retries")
except aiohttp.ClientError as e:
logger.error(
f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"Network error: Failed to fetch {url} after multiple retries")
# HttpProcessingError is defined in aiohttp.http_exceptions; referencing it
# there avoids an AttributeError on aiohttp versions that don't re-export it
# at the top level.
except aiohttp.http_exceptions.HttpProcessingError as e:
logger.error(
f"HTTP processing error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP processing error: Failed to fetch {url} after multiple retries")
except Exception as e:
logger.error(
f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"Unexpected error: Failed to fetch {url} after multiple retries")
async def process_content(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Process collected content using LLM."""
logger.info(f"Processing content for {content['url']}")
try:
# Generate summary of the content
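# Only the first 1000 characters are sent, to keep the prompt small; longer
# pages are truncated rather than chunked.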
summary = await self.llm_orchestrator.generate_completion(
f"Summarize the following content:\n{content['text_content'][:1000]}..."
)
# Extract key information based on task requirements
extracted_info = await self.extract_relevant_information(content, task)
logger.info(f"Successfully processed content for {content['url']}")
return {
'summary': summary,
'extracted_info': extracted_info,
'metadata': {
'url': content['url'],
'num_links': len(content['links']),
'num_images': len(content['images'])
}
}
except Exception as e:
logger.error(f"Error processing content: {str(e)}")
raise
async def extract_relevant_information(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Extract relevant information based on task requirements."""
logger.info(f"Extracting relevant information for {content['url']}")
# Use LLM to extract specific information based on task requirements
prompt = f"""
Extract relevant information from the following content based on these requirements:
Task requirements: {task.get('requirements', 'general information')}
Content:
{content['text_content'][:1500]}...
"""
extracted_info = await self.llm_orchestrator.generate_completion(prompt)
logger.info(f"Successfully extracted information for {content['url']}")
return {'extracted_information': extracted_info}
async def crawl_links(self, base_url: str,
max_depth: int = 2) -> List[Dict[str, Any]]:
"""Crawl links starting from a base URL up to a maximum depth."""
logger.info(f"Crawling links from {base_url} up to depth {max_depth}")
results = []
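# Note: self.visited_urls persists across calls, so pages crawled by an
# earlier invocation are skipped on later ones.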
async def crawl(url: str, depth: int):
if depth > max_depth or url in self.visited_urls:
return
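# Mark the URL as visited before the first await so sibling coroutines
# scheduled by asyncio.gather don't fetch it twice.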
self.visited_urls.add(url)
try:
content = await self.collect_data(url)
results.append(content)
if depth < max_depth:
tasks = []
for link in content['links']:
    # Skip pages already seen and non-HTTP(S) links (mailto:, javascript:, etc.)
    if link in self.visited_urls or urlparse(link).scheme not in ('http', 'https'):
        continue
    tasks.append(crawl(link, depth + 1))
# Fetch all child pages of this page concurrently
await asyncio.gather(*tasks)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
await crawl(base_url, 0)
logger.info(f"Finished crawling links from {base_url}")
return results
async def shutdown(self):
"""Cleanup resources."""
logger.info("Shutting down WebBrowsingAgent")
if self.session:
await self.session.close()
self.session = None
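# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original agent). It assumes the
# module is run directly and that an LLM API key is provided through the
# hypothetical LLM_API_KEY environment variable; adjust names to your setup.
# It only demonstrates the intended lifecycle: execute() -> shutdown().
# ---------------------------------------------------------------------------
import os

async def _example_run():
    agent = WebBrowsingAgent(llm_api_key=os.environ.get("LLM_API_KEY", ""))
    try:
        result = await agent.execute({
            'url': 'https://example.com',
            'requirements': 'summarize the page and list its main topics'
        })
        print(result['status'], result.get('data') or result.get('error'))
    finally:
        await agent.shutdown()

if __name__ == "__main__":
    asyncio.run(_example_run())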