import aiohttp
from bs4 import BeautifulSoup
from typing import Dict, Any, List
from loguru import logger
from utils.llm_orchestrator import LLMOrchestrator
import asyncio
from urllib.parse import urljoin
class WebBrowsingAgent:
def __init__(self, llm_api_key: str):
"""Initialize the Web Browsing Agent."""
logger.info("Initializing WebBrowsingAgent")
self.llm_orchestrator = LLMOrchestrator(llm_api_key)
self.session = None
self.setup_logger()
self.visited_urls = set()
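        # Browser-like User-Agent header; some sites return reduced or blocked
        # content to clients that identify themselves as generic HTTP libraries.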
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
self.capabilities = [
"web_browsing",
"data_collection",
"content_processing",
"information_extraction",
"link_crawling"
]
def setup_logger(self):
"""Configure logging for the agent."""
logger.add("logs/web_browsing_agent.log", rotation="500 MB")
async def initialize(self):
"""Initialize the aiohttp session."""
logger.info("Initializing aiohttp session")
if not self.session:
self.session = aiohttp.ClientSession(headers=self.headers)
async def execute(self, task: Dict[str, Any]) -> Dict[str, Any]:
"""Execute a web browsing task."""
logger.info(f"Executing task: {task}")
await self.initialize()
if 'url' not in task:
logger.error("URL not provided in task")
raise ValueError("URL not provided in task")
try:
content = await self.collect_data(task['url'])
processed_data = await self.process_content(content, task)
logger.info(f"Successfully executed task: {task}")
return {
'status': 'success',
'data': processed_data,
'url': task['url']
}
except Exception as e:
logger.error(f"Error executing task: {str(e)}")
return {
'status': 'error',
'error': str(e),
'url': task['url']
}
async def collect_data(self, url: str, retries: int = 3,
delay: int = 1) -> Dict[str, Any]:
"""Collect data from a URL with error handling and retries."""
for attempt in range(retries):
try:
async with self.session.get(url) as response:
if response.status == 200:
html = await response.text()
soup = BeautifulSoup(html, 'html.parser')
# Extract various types of content
text_content = soup.get_text(separator=' ', strip=True)
links = [
link.get('href') for link in soup.find_all(
'a', href=True)]
images = [
img.get('src') for img in soup.find_all(
'img', src=True)]
# Process links to get absolute URLs
processed_links = [urljoin(url, link)
for link in links]
logger.info(f"Successfully collected data from {url}")
return {
'url': url,
'text_content': text_content,
'links': processed_links,
'images': images,
'status_code': response.status,
'headers': dict(response.headers)
}
else:
logger.error(
f"HTTP {response.status}: Failed to fetch {url} on attempt {attempt + 1}")
if attempt < retries - 1:
# Exponential backoff
await asyncio.sleep(delay * (2 ** attempt))
else:
raise Exception(
f"HTTP {response.status}: Failed to fetch {url} after multiple retries")
            except aiohttp.ClientError as e:
                # aiohttp's ClientError covers connection, timeout, and
                # response-level failures raised by the client session.
                logger.error(
                    f"Network error on attempt {attempt + 1} for {url}: {str(e)}")
                if attempt < retries - 1:
                    # Exponential backoff
                    await asyncio.sleep(delay * (2 ** attempt))
                else:
                    raise Exception(
                        f"Network error: Failed to fetch {url} after multiple retries")
            except Exception as e:
                logger.error(
                    f"Unexpected error on attempt {attempt + 1} for {url}: {str(e)}")
                if attempt < retries - 1:
                    # Exponential backoff
                    await asyncio.sleep(delay * (2 ** attempt))
                else:
                    raise Exception(
                        f"Unexpected error: Failed to fetch {url} after multiple retries")
async def process_content(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Process collected content using LLM."""
logger.info(f"Processing content for {content['url']}")
try:
# Generate summary of the content
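            # Note: only the first 1,000 characters of the page text are sent,
            # so very long pages are summarized from their opening portion only.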
summary = await self.llm_orchestrator.generate_completion(
f"Summarize the following content:\n{content['text_content'][:1000]}..."
)
# Extract key information based on task requirements
extracted_info = await self.extract_relevant_information(content, task)
logger.info(f"Successfully processed content for {content['url']}")
return {
'summary': summary,
'extracted_info': extracted_info,
'metadata': {
'url': content['url'],
'num_links': len(content['links']),
'num_images': len(content['images'])
}
}
except Exception as e:
logger.error(f"Error processing content: {str(e)}")
raise
async def extract_relevant_information(
self, content: Dict[str, Any], task: Dict[str, Any]) -> Dict[str, Any]:
"""Extract relevant information based on task requirements."""
logger.info(f"Extracting relevant information for {content['url']}")
# Use LLM to extract specific information based on task requirements
prompt = f"""
Extract relevant information from the following content based on these requirements:
Task requirements: {task.get('requirements', 'general information')}
Content:
{content['text_content'][:1500]}...
"""
extracted_info = await self.llm_orchestrator.generate_completion(prompt)
logger.info(f"Successfully extracted information for {content['url']}")
return {'extracted_information': extracted_info}
async def crawl_links(self, base_url: str,
max_depth: int = 2) -> List[Dict[str, Any]]:
"""Crawl links starting from a base URL up to a maximum depth."""
logger.info(f"Crawling links from {base_url} up to depth {max_depth}")
results = []
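        # Depth-limited recursive crawl: each page's outgoing links are fetched
        # concurrently via asyncio.gather, while self.visited_urls ensures a URL
        # is requested at most once per agent instance.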
async def crawl(url: str, depth: int):
if depth > max_depth or url in self.visited_urls:
return
self.visited_urls.add(url)
try:
content = await self.collect_data(url)
results.append(content)
if depth < max_depth:
tasks = []
for link in content['links']:
if link not in self.visited_urls:
tasks.append(crawl(link, depth + 1))
await asyncio.gather(*tasks)
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
await crawl(base_url, 0)
logger.info(f"Finished crawling links from {base_url}")
return results
async def shutdown(self):
"""Cleanup resources."""
logger.info("Shutting down WebBrowsingAgent")
if self.session:
await self.session.close()
self.session = None
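

# Example usage (illustrative sketch, not part of the agent itself): the API key
# and URL below are placeholders, and running this requires the local
# utils.llm_orchestrator module that the agent depends on.
if __name__ == "__main__":
    async def main():
        agent = WebBrowsingAgent(llm_api_key="YOUR_API_KEY")  # placeholder key
        try:
            result = await agent.execute({
                'url': 'https://example.com',
                'requirements': 'main topics covered on the page'
            })
            print(result['status'])
        finally:
            # Always release the aiohttp session
            await agent.shutdown()

    asyncio.run(main())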