import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, List
from loguru import logger
from utils.llm_orchestrator import LLMOrchestrator
import asyncio
from urllib.parse import urljoin
class WebBrowsingAgent:
def __init__(self, llm_api_key: str):
"""Initialize the Web Browsing Agent."""
logger.info("Initializing WebBrowsingAgent")
self.llm_orchestrator = LLMOrchestrator(llm_api_key)
self.session = None
self.setup_logger()
self.visited_urls = set()
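        # Browser-like User-Agent header; some sites return reduced or blocked
        # content to clients that identify themselves as generic HTTP libraries.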
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
self.capabilities = [
"web_browsing",
"data_collection",
"content_processing",
"information_extraction",
"link_crawling"
]
def setup_logger(self):
"""Configure logging for the agent."""
logger.add("logs/web_browsing_agent.log", rotation="500 MB")
async def initialize(self):
"""Initialize the aiohttp session."""
logger.info("Initializing aiohttp session")
if not self.session:
self.session = aiohttp.ClientSession(headers=self.headers)
async def execute(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""Execute a web browsing task."""
logger.info(f"Executing task: {task}")
await self.initialize()
if 'url' not in task:
logger.error("URL not provided in task")
raise ValueError("URL not provided in task")
try:
content = await self.collect_data(task['url'])
processed_data = await self.process_content(content, task)
logger.info(f"Successfully executed task: {task}")
return {
'status': 'success',
'data': processed_data,
'url': task['url']
}
except Exception as e:
logger.error(f"Error executing task: {str(e)}")
return {
'status': 'error',
'error': str(e),
'url': task['url']
}
async def collect_data(self, url: str, retries: int = 3,
delay: int = 1) -> Dict[str, Any]:
"""Collect data from a URL with error handling and retries."""
for attempt in range(retries):
try:
async with self.session.get(url) as response:
if response.status == 200:
html = await response.text()
soup = BeautifulSoup(html, 'html.parser')
# Extract various types of content
text_content = soup.get_text(separator=' ', strip=True)
links = [
link.get('href') for link in soup.find_all(
'a', href=True)]
images = [
img.get('src') for img in soup.find_all(
'img', src=True)]
# Process links to get absolute URLs
processed_links = [urljoin(url, link)
for link in links]
logger.info(f"Successfully collected data from {url}")
return {
'url': url,
'text_content': text_content,
'links': processed_links,
'images': images,
'status_code': response.status,
'headers': dict(response.headers)
}
else:
logger.error(
f"HTTP {response.status}: Failed to fetch {url} on attempt {attempt + 1}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP {response.status}: Failed to fetch {url} after multiple retries")
            except aiohttp.ClientError as e:
                # aiohttp's ClientError covers connection, timeout, and
                # response-level failures raised by the client session.
                logger.error(
                    f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
                if attempt < retries - 1:
                    # Exponential backoff
                    await asyncio.sleep(delay * (2 ** attempt))
                else:
                    raise Exception(
                        f"Network error: Failed to fetch {url} after multiple retries")
            except Exception as e:
                logger.error(
                    f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
                if attempt < retries - 1:
                    # Exponential backoff
                    await asyncio.sleep(delay * (2 ** attempt))
                else:
                    raise Exception(
                        f"Unexpected error: Failed to fetch {url} after multiple retries")
async def process_content(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Process collected content using LLM."""
logger.info(f"Processing content for {content['url']}")
try:
# Generate summary of the content
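            # Note: only the first 1,000 characters of the page text are sent,
            # so very long pages are summarized from their opening portion only.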
summary = await self.llm_orchestrator.generate_completion(
f"Summarize the following content:\n{content['text_content'][:1000]}..."
)
# Extract key information based on task requirements
extracted_info = await self.extract_relevant_information(content, task)
logger.info(f"Successfully processed content for {content['url']}")
return {
'summary': summary,
'extracted_info': extracted_info,
'metadata': {
'url': content['url'],
'num_links': len(content['links']),
'num_images': len(content['images'])
}
}
except Exception as e:
logger.error(f"Error processing content: {str(e)}")
raise
async def extract_relevant_information(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Extract relevant information based on task requirements."""
logger.info(f"Extracting relevant information for {content['url']}")
# Use LLM to extract specific information based on task requirements
prompt = f"""
Extract relevant information from the following content based on these requirements:
Task requirements: {task.get('requirements', 'general information')}
Content:
{content['text_content'][:1500]}...
"""
extracted_info = await self.llm_orchestrator.generate_completion(prompt)
logger.info(f"Successfully extracted information for {content['url']}")
return {'extracted_information': extracted_info}
async def crawl_links(self, base_url: str,
max_depth: int = 2) -> List[Dict[str, Any]]:
"""Crawl links starting from a base URL up to a maximum depth."""
logger.info(f"Crawling links from {base_url} up to depth {max_depth}")
results = []
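        # Depth-limited recursive crawl: each page's outgoing links are fetched
        # concurrently via asyncio.gather, while self.visited_urls ensures a URL
        # is requested at most once per agent instance.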
async def crawl(url: str, depth: int):
if depth > max_depth or url in self.visited_urls:
return
self.visited_urls.add(url)
try:
content = await self.collect_data(url)
results.append(content)
if depth < max_depth:
tasks = []
for link in content['links']:
if link not in self.visited_urls:
tasks.append(crawl(link, depth + 1))
await asyncio.gather(*tasks)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
await crawl(base_url, 0)
logger.info(f"Finished crawling links from {base_url}")
return results
async def shutdown(self):
"""Cleanup resources."""
logger.info("Shutting down WebBrowsingAgent")
if self.session:
await self.session.close()
self.session = None
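

# Example usage (illustrative sketch, not part of the agent itself): the API key
# and URL below are placeholders, and running this requires the local
# utils.llm_orchestrator module that the agent depends on.
if __name__ == "__main__":
    async def main():
        agent = WebBrowsingAgent(llm_api_key="YOUR_API_KEY")  # placeholder key
        try:
            result = await agent.execute({
                'url': 'https://example.com',
                'requirements': 'main topics covered on the page'
            })
            print(result['status'])
        finally:
            # Always release the aiohttp session
            await agent.shutdown()

    asyncio.run(main())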