webmetaextractor

Runtime error

File size: 4,283 Bytes

9b5b26a
 
 
 
c19d193
6aae614
27f26c0
8fe992b
9b5b26a
 
5df72d6
930fb8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b5b26a
27f26c0
 
 
9b5b26a
27f26c0
 
 
 
9b5b26a
27f26c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b5b26a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c01ffb
 
6aae614
ae7a494
 
 
 
e121372
bf6d34c
 
29ec968
fe328e0
13d500a
8c01ffb
 
9b5b26a
 
8c01ffb
861422e
 
9b5b26a
8c01ffb
8fe992b
930fb8e
8c01ffb
 
 
 
 
 
861422e
8fe992b
 
9b5b26a
8c01ffb

from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
import datetime
import requests
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
from bs4 import BeautifulSoup

from Gradio_UI import GradioUI

# Custom tools defined with the smolagents @tool decorator.
@tool
def scrape_webpage(url: str, tag: str = "p", class_name: str = None) -> dict:
    """Extract text content from a web page by HTML tag and optional CSS class.

    Args:
        url: URL of the page to scrape.
        tag: HTML tag to extract (defaults to "p").
        class_name: Optional CSS class used to filter the matched elements.

    Returns:
        A dictionary with the keys "url" and "scraped_data" (the stripped text
        of at most the first 20 matching elements), or a dictionary with a
        single "error" key when the page cannot be fetched or processed.
    """
    max_results = 20  # cap the payload handed back to the agent

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Without a timeout requests waits indefinitely, so an unresponsive
        # host would block the whole agent step. A timeout raises a
        # RequestException subclass, which is handled below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Only pass class_ when a class was actually requested; the filter is
        # skipped entirely for None or an empty string.
        if class_name:
            elements = soup.find_all(tag, class_=class_name)
        else:
            elements = soup.find_all(tag)

        extracted_data = [element.get_text(strip=True) for element in elements]

        return {"url": url, "scraped_data": extracted_data[:max_results]}

    except requests.exceptions.RequestException as e:
        return {"error": f"Error al acceder a la URL: {str(e)}"}
    except Exception as e:
        # Tool boundary: report unexpected failures to the agent as data
        # instead of raising.
        return {"error": f"Error inesperado: {str(e)}"}


@tool
def extract_metadata_from_url(url: str) -> dict:
    """Extract the <meta> tag metadata of a web page.

    Args:
        url: The URL of the web page to analyze.

    Returns:
        A dictionary mapping each meta tag's "name" (or, failing that,
        "property") attribute to its "content" value. If no such tags are
        found, or the page cannot be fetched or processed, a dictionary with a
        single "error" key is returned instead.
    """
    try:
        # Fetch the page content.
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Without a timeout requests waits indefinitely, so an unresponsive
        # host would block the whole agent step. A timeout raises a
        # RequestException subclass, which is handled below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise if the request failed (4xx/5xx)

        # Parse the HTML with BeautifulSoup.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect name/property meta tags that carry a content attribute.
        # A later tag with the same key overwrites an earlier one.
        metadata = {}
        for meta in soup.find_all('meta'):
            if 'name' in meta.attrs and 'content' in meta.attrs:
                metadata[meta['name']] = meta['content']
            elif 'property' in meta.attrs and 'content' in meta.attrs:
                metadata[meta['property']] = meta['content']

        return metadata if metadata else {"error": "No se encontraron metadatos en la página."}

    except requests.exceptions.RequestException as e:
        return {"error": f"Error al acceder a la URL: {str(e)}"}
    except Exception as e:
        # Same tool-boundary handling as scrape_webpage: report unexpected
        # failures to the agent as data instead of raising.
        return {"error": f"Error inesperado: {str(e)}"}

@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """Report the current local time in the requested timezone.

    Args:
        timezone: A string naming a valid timezone (e.g., 'America/New_York').
    """
    try:
        # Resolve the zone name, then read the clock in that zone.
        zone = pytz.timezone(timezone)
        now_in_zone = datetime.datetime.now(zone)
        local_time = now_in_zone.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as exc:
        # Any failure is reported back as a message rather than raised.
        return f"Error fetching time for timezone '{timezone}': {str(exc)}"
    else:
        return f"The current local time in {timezone} is: {local_time}"


final_answer = FinalAnswerTool()

# If the agent does not answer, the model may be overloaded. In that case use
# another model, or the following Hugging Face endpoint that also serves
# Qwen2.5 Coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # this model may be overloaded at times
    custom_role_conversions=None,
)


# Import tool from Hub.
# NOTE(review): image_generation_tool is loaded here but is not passed to the
# agent's tools list below; get_current_time_in_timezone is likewise defined
# but not registered. Confirm whether that is intentional.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Read the prompt templates with an explicit encoding so the result does not
# depend on the platform's default locale.
with open("prompts.yaml", 'r', encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)

agent = CodeAgent(
    model=model,
    tools=[final_answer, extract_metadata_from_url, scrape_webpage],  # add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)


GradioUI(agent).launch()