File size: 2,721 Bytes
fd18d93
8fe992b
487a7fe
861422e
bf6d34c
799b253
8fe992b
 
 
 
 
 
 
 
 
 
 
 
e993dbb
8fe992b
 
 
e993dbb
8fe992b
 
799b253
 
 
 
 
 
8fe992b
e993dbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fe992b
e993dbb
8fe992b
e993dbb
 
 
8fe992b
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from typing import Any, Optional
from smolagents.tools import Tool
import requests
import markdownify
import smolagents
import re  # Add re import here

class VisitWebpageTool(Tool):
    """Fetch a webpage and return its main content converted to markdown.

    The page is downloaded with a desktop-browser User-Agent, stripped of
    boilerplate elements (scripts, styles, nav, header, footer, iframes),
    reduced to its primary content region when one can be identified, then
    converted to markdown and truncated.
    """

    name = "visit_webpage"
    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
    output_type = "string"

    def __init__(self, *args, **kwargs):
        # FIX: the original accepted *args/**kwargs but never called
        # super().__init__, so the Tool base class's own initialization
        # was silently skipped. Forward them to the base class first.
        super().__init__(*args, **kwargs)
        self.is_initialized = False

    def forward(self, url: str) -> str:
        """Download ``url`` and return its main content as markdown.

        Args:
            url: The address of the webpage to visit.

        Returns:
            Markdown text of the page's main content (truncated via
            ``truncate_content``), or a human-readable error string on
            failure — network errors are returned rather than raised so
            the calling agent can read them.

        Raises:
            ImportError: If markdownify, requests, or beautifulsoup4 is
                not installed.
        """
        # Availability check for the optional third-party dependencies;
        # names imported here are also the ones used below.
        try:
            import requests
            from markdownify import markdownify
            from requests.exceptions import RequestException
            from bs4 import BeautifulSoup
            from smolagents.utils import truncate_content
        except ImportError as e:
            raise ImportError(
                "You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: run `pip install markdownify requests beautifulsoup4`."
            ) from e
        try:
            # Some sites refuse requests that lack a browser-like User-Agent.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = requests.get(url, timeout=20, headers=headers)
            response.raise_for_status()

            # Parse the HTML and drop boilerplate elements that would only
            # add noise to the markdown output.
            soup = BeautifulSoup(response.text, 'html.parser')
            for element in soup(['script', 'style', 'nav', 'footer', 'iframe', 'header']):
                element.decompose()

            # Prefer a semantic content region over the full document body.
            main_content = soup.find('article') or soup.find('main') or soup.find('body')

            if main_content:
                # Convert only the identified main region to markdown.
                markdown_content = markdownify(str(main_content)).strip()
            else:
                # No <article>/<main>/<body> found (e.g. an HTML fragment):
                # fall back to converting the raw response text.
                markdown_content = markdownify(response.text).strip()

            # Collapse runs of 3+ consecutive newlines into a single blank line.
            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)

            # Truncate the result; the limit unit is whatever
            # truncate_content uses (presumably characters, not words —
            # the original "4000 words" comment overstated it; confirm
            # against smolagents.utils.truncate_content).
            return truncate_content(markdown_content, 4000)

        except requests.exceptions.Timeout:
            return "The request timed out. Please try again later or check the URL."
        except RequestException as e:
            return f"Error fetching the webpage: {str(e)}"
        except Exception as e:
            return f"An unexpected error occurred: {str(e)}"