Final_Assignment_Template

Sleeping

App Files Files Community

altozachmo committed on Apr 28

Commit

968a67a

1 Parent(s): 9ff7774

add start

Browse files

Files changed (9) hide show

agents/agent.py +30 -14
agents/video_agent.py +70 -0
app.py +1 -0
prompts/__init__.py +0 -0
prompts/helium.py +45 -0
pyproject.toml +3 -0
tools/video_analyzer.py +232 -157
tools/web_utils.py +39 -0
uv.lock +119 -0

agents/agent.py CHANGED Viewed

@@ -3,27 +3,46 @@ from smolagents import (
     DuckDuckGoSearchTool,
     WikipediaSearchTool,
     LiteLLMModel,
 )
 from tools.text_search import TextSearch
 from tools.text_splitter import text_splitter
-from tools.video_analyzer import YouTubeObjectCounterTool
 class MyAgent:
     def __init__(
         self,
         provider: str = "litellm",
-        model_id: str = "ollama_chat/gemma3:12b-it-qat",
         api_base: str | None = None,
         api_key: str | None = None,
         planning_interval: int = 3,
         num_ctx: int = 8192,
     ):
         """
         Initializes the agent depending on the provider and model ID.
         Args:
             provider (str): The provider of the model (e.g., "litellm", "huggingface").
             model_id (str): The ID of the model to be used.
         Returns:
             None: None
         """
@@ -33,30 +52,27 @@ class MyAgent:
         self.api_key = api_key
         self.planning_interval = planning_interval
         self.num_ctx = num_ctx
         model = LiteLLMModel(
             model_id=self.model_id,
             api_base=self.api_base,
             api_key=self.api_key,
             num_ctx=self.num_ctx,
-            add_base_tools=True,
         )
-        tools = [
-            DuckDuckGoSearchTool(),  # Search tool for web queries
-            WikipediaSearchTool(),  # Search tool for Wikipedia queries
-            TextSearch(),  # Search tool for text queries
-            text_splitter,  # Text splitter tool for breaking down large texts
-            # into manageable lists.
-            YouTubeObjectCounterTool(),  # Tool for analyzing YouTube videos
-        ]
         # Initialize the agent with the specified provider and model ID
         if provider == "litellm":
             self.agent = CodeAgent(
                 model=model,
                 tools=tools,
-                planning_interval=planning_interval,
             )
         else:
             raise ValueError(f"Unsupported provider: {provider}")

     DuckDuckGoSearchTool,
     WikipediaSearchTool,
     LiteLLMModel,
+    Tool,
 )
 from tools.text_search import TextSearch
 from tools.text_splitter import text_splitter
+from tools.video_analyzer import WebVideoAnalyzerTool
+from typing import Callable
 class MyAgent:
     def __init__(
         self,
         provider: str = "litellm",
+        model_id: str = "gemini/gemini-2.0-flash-lite",
         api_base: str | None = None,
         api_key: str | None = None,
         planning_interval: int = 3,
         num_ctx: int = 8192,
+        tools: list[Tool] = [],
+        add_base_tools: bool = True,
+        temperature: float = 0.2,
+        additional_authorized_imports: list[str] = [],
+        step_callbacks: list[Callable] = [],
+        max_steps: int = 20,
+        verbosity_level: int = 2,
     ):
         """
         Initializes the agent depending on the provider and model ID.
         Args:
             provider (str): The provider of the model (e.g., "litellm", "huggingface").
             model_id (str): The ID of the model to be used.
+            tools (list[Tool]): The tools to be used by the agent.
+            api_base (str | None): The base URL of the API.
+            api_key (str | None): The API key.
+            planning_interval (int): The interval for planning.
+            num_ctx (int): The number of context tokens.
+            add_base_tools (bool): Whether to add base tools.
+            temperature (float): The temperature for the model.
+            additional_authorized_imports (list[str]): The additional authorized imports.
+            step_callbacks (list[Callable]): The step callbacks.
+            max_steps (int): The maximum steps.
+            verbosity_level (int): The verbosity level.
         Returns:
             None: None
         """
         self.api_key = api_key
         self.planning_interval = planning_interval
         self.num_ctx = num_ctx
+        self.temperature = temperature
         model = LiteLLMModel(
             model_id=self.model_id,
             api_base=self.api_base,
             api_key=self.api_key,
             num_ctx=self.num_ctx,
+            add_base_tools=add_base_tools,
+            temperature=self.temperature,
         )
         # Initialize the agent with the specified provider and model ID
         if provider == "litellm":
             self.agent = CodeAgent(
                 model=model,
                 tools=tools,
+                planning_interval=self.planning_interval,
+                additional_authorized_imports=additional_authorized_imports,
+                step_callbacks=step_callbacks,
+                max_steps=max_steps,
+                verbosity_level=verbosity_level,
             )
         else:
             raise ValueError(f"Unsupported provider: {provider}")

agents/video_agent.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from io import BytesIO
+from time import sleep
+import os
+import sys
+# Add the parent directory to the Python path so modules can be found
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import helium
+from dotenv import load_dotenv
+from PIL import Image
+from selenium import webdriver
+from smolagents import CodeAgent
+from smolagents.agents import ActionStep
+from agents.agent import MyAgent
+from prompts.helium import HELIUM_PROMPT
+load_dotenv()
+# Configure Chrome options
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_argument("--force-device-scale-factor=1")
+chrome_options.add_argument("--window-size=1000,1350")
+chrome_options.add_argument("--disable-pdf-viewer")
+chrome_options.add_argument("--window-position=0,0")
+# Initialize the browser
+driver = helium.start_chrome(headless=False, options=chrome_options)
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to ``memory_step``.

    Screenshots held by steps that are two or more steps older than the
    current one are dropped so the agent memory only carries recent images.
    When helium has no active driver the step is left untouched, because there
    is neither a page to capture nor a URL to report.

    Args:
        memory_step: The step being recorded; its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The running agent whose ``memory.steps`` are pruned.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # The original read driver.current_url unconditionally, which raised
        # AttributeError here; without a browser there is nothing to record.
        return

    current_step = memory_step.step_number
    # Remove previous screenshots for lean processing
    for previous_memory_step in agent.memory.steps:
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
+video_agent = MyAgent(
+    api_key=os.getenv("GEMINI_API_KEY"),
+    temperature=0.0,
+    add_base_tools=False,
+    additional_authorized_imports=["helium"],
+    step_callbacks=[save_screenshot],
+    max_steps=20,
+    verbosity_level=2,
+)
+video_agent.agent.python_executor("from helium import *", video_agent.agent.state)
+search_request = """
+Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
+"""
+agent_output = video_agent(search_request + HELIUM_PROMPT)
+print("Final output:")
+print(agent_output)

app.py CHANGED Viewed

@@ -36,6 +36,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             api_key=os.getenv("GEMINI_API_KEY"),
             planning_interval=3,
             num_ctx=8192,
         )
     except Exception as e:

             api_key=os.getenv("GEMINI_API_KEY"),
             planning_interval=3,
             num_ctx=8192,
+            temperature=0.2,
         )
     except Exception as e:

prompts/__init__.py ADDED Viewed

File without changes

prompts/helium.py ADDED Viewed

	@@ -0,0 +1,45 @@

+HELIUM_PROMPT = """
+You can use helium to access websites. Don't bother about the helium driver, it's already managed.
+We've already ran "from helium import *"
+Then you can go to pages!
+Code:
+```py
+go_to('github.com/trending')
+```<end_code>
+You can directly click clickable elements by inputting the text that appears on them.
+Code:
+```py
+click("Top products")
+```<end_code>
+If it's a link:
+Code:
+```py
+click(Link("Top products"))
+```<end_code>
+If you try to interact with an element and it's not found, you'll get a LookupError.
+In general stop your action after each button click to see what happens on your screenshot.
+Never try to login in a page.
+To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
+Code:
+```py
+scroll_down(num_pixels=1200) # This will scroll one viewport down
+```<end_code>
+When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
+Just use your built-in tool `close_popups` to close them:
+Code:
+```py
+close_popups()
+```<end_code>
+You can use .exists() to check for the existence of an element. For example:
+Code:
+```py
+if Text('Accept cookies?').exists():
+    click('I accept')
+```<end_code>
+"""

pyproject.toml CHANGED Viewed

@@ -7,13 +7,16 @@ requires-python = ">=3.12"
 dependencies = [
     "ffmpeg>=1.4",
     "gradio[oauth]>=5.27.0",
     "litellm==1.67.1",
     "numpy>=2.2.5",
     "openai>=1.76.0",
     "opencv-python>=4.11.0.86",
     "pandas>=2.2.3",
     "python-dotenv>=1.1.0",
     "requests>=2.32.3",
     "smolagents[litellm]>=1.14.0",
     "timm>=1.0.15",
     "torch>=2.7.0",

 dependencies = [
     "ffmpeg>=1.4",
     "gradio[oauth]>=5.27.0",
+    "helium>=5.1.1",
     "litellm==1.67.1",
     "numpy>=2.2.5",
     "openai>=1.76.0",
     "opencv-python>=4.11.0.86",
     "pandas>=2.2.3",
+    "pillow>=11.2.1",
     "python-dotenv>=1.1.0",
     "requests>=2.32.3",
+    "selenium>=4.31.0",
     "smolagents[litellm]>=1.14.0",
     "timm>=1.0.15",
     "torch>=2.7.0",

tools/video_analyzer.py CHANGED Viewed

@@ -1,199 +1,274 @@
 from smolagents import Tool
 import os
-import cv2
 import tempfile
-from yt_dlp import YoutubeDL
 from transformers import pipeline
-from typing import Any
 from PIL import Image
-class YouTubeObjectCounterTool(Tool):
-    name = "youtube_object_counter"
-    description = "Analyzes a YouTube video frame by frame and counts the number of objects of a specified type visible in each frame."
     inputs = {
         "url": {
             "type": "string",
-            "description": "The URL of the YouTube video to analyze.",
         },
         "label": {
             "type": "string",
             "description": "The type of object to count (e.g., 'bird', 'person', 'car', 'dog'). Use common object names recognized by standard object detection models.",
         },
     }
     output_type = "string"
-    def _download_video(self, url):
-        """Downloads the YouTube video to a temporary file."""
-        print(f"Downloading video from {url}...")
-        temp_dir = tempfile.mkdtemp()
-        video_path = os.path.join(temp_dir, "video.mp4")
-        ydl_opts = {
-            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
-            "outtmpl": video_path,
-            "quiet": True,
-            "no_warnings": True,
-        }
         try:
-            with YoutubeDL(ydl_opts) as ydl:
-                ydl.download([url])
-            print(f"Video downloaded to {video_path}")
-            return video_path
         except Exception as e:
-            error_msg = f"Error downloading video: {str(e)}"
-            print(error_msg)
-            raise RuntimeError(error_msg)
-    def _count_objects_in_frame(self, frame, label: str):
-        """Counts objects of specified label in a single frame using the object detection model."""
         try:
-            # Convert OpenCV BGR frame to RGB
-            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            # Convert numpy array to PIL Image
-            pil_image = Image.fromarray(rgb_frame)
-            # Load the detector
-            detector = pipeline("object-detection", model="facebook/detr-resnet-50")
-            # Run detection with PIL Image
-            results = detector(pil_image)
             # Count objects matching the label
-            object_count = sum(
-                1 for result in results if label.lower() in result["label"].lower()
-            )
             return object_count
         except Exception as e:
-            print(f"Error detecting objects in frame: {str(e)}")
             return 0
-    def _analyze_video(self, video_path: str, label: str) -> dict[str, Any]:
-        """Analyzes the video frame by frame and counts objects of the specified label."""
-        sample_rate = 30
-        print(
-            f"Analyzing video {video_path}, looking for '{label}' objects, sampling every {sample_rate} frames..."
-        )
-        # Open the video file
-        cap = cv2.VideoCapture(video_path)
-        if not cap.isOpened():
-            raise RuntimeError(f"Error: Could not open video file {video_path}")
-        # Get video properties
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        duration = frame_count / fps
-        # Initialize results
-        frame_results = []
-        total_objects = 0
-        max_objects = 0
-        max_objects_frame = 0
-        frame_idx = 0
-        # Process frames
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            # Only process every nth frame
-            if frame_idx % sample_rate == 0:
-                time_point = frame_idx / fps
-                print(f"Processing frame {frame_idx} at time {time_point:.2f}s...")
-                object_count = self._count_objects_in_frame(frame, label)
-                total_objects += object_count
-                if object_count > max_objects:
-                    max_objects = object_count
-                    max_objects_frame = frame_idx
-                frame_results.append(
-                    {
-                        "frame": frame_idx,
-                        "time": time_point,
-                        "object_count": object_count,
-                    }
-                )
-            frame_idx += 1
-        # Release resources
-        cap.release()
-        # Calculate statistics
-        avg_objects_per_frame = (
-            total_objects / len(frame_results) if frame_results else 0
-        )
-        max_objects_time = max_objects_frame / fps if max_objects_frame else 0
-        # Clean up the temporary file
-        try:
-            os.remove(video_path)
-            print(f"Deleted temporary video file: {video_path}")
-        except Exception as e:
-            print(
-                f"Warning: Failed to delete temporary video file: {video_path} | {str(e)}"
-            )
-        return {
-            "frame_results": frame_results,
-            "total_frames_analyzed": len(frame_results),
-            "video_duration": duration,
-            "fps": fps,
-            "total_frames": frame_count,
-            "average_objects_per_analyzed_frame": avg_objects_per_frame,
-            "max_objects_in_single_frame": max_objects,
-            "max_objects_frame": max_objects_frame,
-            "max_objects_time": max_objects_time,
-            "label": label,
-        }
-    def forward(self, url: str, label: str) -> str:
         """
-        Analyzes a YouTube video frame by frame and counts objects of the specified type.
         Args:
-            url (str): The URL of the YouTube video to analyze.
             label (str): The type of object to count (e.g., 'bird', 'person', 'car', 'dog').
         Returns:
-            str: A detailed report of object counts per frame and summary statistics.
         """
         try:
-            # Download the video
-            video_path = self._download_video(url)
-            # Analyze the video
-            results = self._analyze_video(video_path, label)
             # Generate a report
             report = [
-                f"# {label.title()} Count Analysis for YouTube Video",
                 f"Video URL: {url}",
-                f"Video duration: {results['video_duration']:.2f} seconds",
-                f"Analyzed {results['total_frames_analyzed']} frames out of {results['total_frames']} total frames",
-                f"Sampling rate: 1 frame every 30 frames (approximately {results['fps'] / 30:.2f} frames per second)",
                 "## Summary",
-                f"Average {label}s per analyzed frame: {results['average_objects_per_analyzed_frame']:.2f}",
-                f"Maximum {label}s in a single frame: {results['max_objects_in_single_frame']} (at {results['max_objects_time']:.2f} seconds)",
             ]
             # Add frame-by-frame details
-            report.append("## Frame-by-Frame Analysis")
-            for result in results["frame_results"]:
-                report.append(
-                    f"Frame {result['frame']} (Time: {result['time']:.2f}s): {result['object_count']} {label}s"
-                )
             return "\n".join(report)
         except Exception as e:
             return f"Error analyzing video: {str(e)}"

 from smolagents import Tool
 import os
+import time
 import tempfile
 from transformers import pipeline
+from typing import List, Dict
 from PIL import Image
+import io
+# Import required browser automation libraries
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import helium
class WebVideoAnalyzerTool(Tool):
    """Count objects of one label in screenshots of a video playing in Chrome.

    The tool opens the page with helium/selenium, tries to start playback,
    captures a browser screenshot every ``interval`` seconds for ``duration``
    seconds, runs an object-detection pipeline on each screenshot and returns
    a text report.

    NOTE(review): frame times are loop counters, not the player's position;
    detection time is not subtracted from the sleep between captures, so the
    reported seconds only approximate the video timeline.
    """

    name = "web_video_analyzer"
    description = "Analyzes a video on a webpage (YouTube, Vimeo, etc.) by taking screenshots at intervals and counting objects of a specified type in each frame."
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the web page containing the video to analyze.",
        },
        "label": {
            "type": "string",
            "description": "The type of object to count (e.g., 'bird', 'person', 'car', 'dog'). Use common object names recognized by standard object detection models.",
        },
        # `forward` gives these two parameters defaults, so they are declared
        # nullable to keep the schema consistent with the signature.
        "duration": {
            "type": "integer",
            "description": "How many seconds of the video to analyze (default: 30)",
            "nullable": True,
        },
        "interval": {
            "type": "integer",
            "description": "How often to take screenshots (in seconds, default: 1)",
            "nullable": True,
        },
    }
    output_type = "string"

    # Browser handle; None while no browser is open. Declared on the class so
    # the `self.driver` reads below work before the first browser start.
    driver = None
    # Object-detection pipeline, created on first use and then reused.
    _detector = None

    def _setup_browser(self):
        """Start Chrome through helium (once) and return the driver."""
        if self.driver is not None:
            return self.driver
        print("Setting up browser...")
        # Configure Chrome options
        chrome_options = webdriver.ChromeOptions()
        for argument in (
            "--force-device-scale-factor=1",
            "--window-size=1280,720",
            "--disable-pdf-viewer",
            "--window-position=0,0",
            "--autoplay-policy=no-user-gesture-required",
        ):
            chrome_options.add_argument(argument)
        # Initialize the driver
        self.driver = helium.start_chrome(headless=False, options=chrome_options)
        return self.driver

    def _shutdown_browser(self) -> None:
        """Close the browser if one is open; never raises."""
        if self.driver is None:
            return
        try:
            helium.kill_browser()
        except Exception:
            print("Warning: Could not properly close the browser")
        finally:
            # Forget the handle even if closing failed so a later call starts
            # a fresh browser instead of reusing a dead one.
            self.driver = None

    def _start_youtube_playback(self) -> None:
        """Dismiss the YouTube consent dialog and try to get the video playing."""
        try:
            # Accept cookies if prompted
            if helium.Button("Accept all").exists():
                helium.click("Accept all")
            elif helium.Button("I agree").exists():
                helium.click("I agree")
            try:
                # Find the video player element and click it
                video_element = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "video"))
                )
                video_element.click()
            except Exception:
                print("Could not locate video element to click")
                return
            try:
                # If the play button still offers "Play", the video is paused:
                # click it. A missing aria-label is treated as "not paused".
                play_button = self.driver.find_element(By.CLASS_NAME, "ytp-play-button")
                if "Play" in (play_button.get_attribute("aria-label") or ""):
                    play_button.click()
            except Exception:
                # Best effort only: the play button may be absent or covered.
                pass
        except Exception as e:
            print(f"Error during YouTube setup: {str(e)}")

    def _navigate_to_video(self, url: str) -> bool:
        """Open ``url`` and try to start playback.

        Returns:
            bool: True when the page was opened (playback itself is best
            effort), False when navigation raised.
        """
        try:
            print(f"Navigating to {url}...")
            helium.go_to(url)
            # Wait for page to load
            time.sleep(3)
            if "youtube.com" in url or "youtu.be" in url:
                self._start_youtube_playback()
            else:
                # General approach - click the first <video> element, if any
                try:
                    video_elements = self.driver.find_elements(By.TAG_NAME, "video")
                    if video_elements:
                        video_elements[0].click()
                except Exception as e:
                    print(f"Could not find or click video element: {str(e)}")
            # Allow video to start
            time.sleep(2)
            return True
        except Exception as e:
            print(f"Error navigating to {url}: {str(e)}")
            return False

    def _close_popups(self):
        """Attempt to close any popups or overlays."""
        try:
            # Try pressing Escape key to close general popups
            webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
            # YouTube-specific: click ad overlay close / skip buttons if present
            if "youtube.com" in self.driver.current_url:
                try:
                    close_buttons = self.driver.find_elements(
                        By.CSS_SELECTOR,
                        "button.ytp-ad-overlay-close-button, button.ytp-ad-skip-button",
                    )
                    for button in close_buttons:
                        button.click()
                except Exception:
                    # A button may be hidden or stale; closing is best effort.
                    pass
        except Exception as e:
            print(f"Error closing popups: {str(e)}")

    def _take_screenshot(self) -> Image.Image:
        """Take a screenshot of the current browser window as an RGB image."""
        png_bytes = self.driver.get_screenshot_as_png()
        # Convert to RGB: the frames are later saved as JPEG, which cannot
        # store an alpha channel if the PNG screenshot carries one.
        return Image.open(io.BytesIO(png_bytes)).convert("RGB")

    def _analyze_screenshot(self, image: Image.Image, label: str) -> int:
        """Count detections whose label contains ``label`` (case-insensitive).

        Returns 0 when detection itself fails.
        """
        if self._detector is None:
            # Load the model once instead of once per screenshot.
            self._detector = pipeline("object-detection", model="facebook/detr-resnet-50")
        try:
            # Run detection on the image
            results = self._detector(image)
            wanted = label.lower()
            object_count = sum(1 for result in results if wanted in result["label"].lower())
            # Debug: print detected classes
            detected_classes = [result["label"] for result in results]
            if detected_classes:
                print(f"Detected classes: {', '.join(detected_classes)}")
            return object_count
        except Exception as e:
            print(f"Error detecting objects in screenshot: {str(e)}")
            return 0

    def _capture_video_frames(self, duration: int = 30, interval: int = 1, label: str = "") -> List[Dict]:
        """Capture and analyze one screenshot every ``interval`` seconds.

        Returns:
            List[Dict]: One entry per successfully processed frame with the
            keys ``time``, ``object_count`` and ``screenshot_path``.
        """
        results = []
        print(f"Starting frame capture for {duration} seconds with {interval} second intervals...")
        # Screenshots are kept on disk for debugging; the directory is not removed here.
        temp_dir = tempfile.mkdtemp()
        for seconds_elapsed in range(0, duration, interval):
            try:
                print(f"Capturing frame at {seconds_elapsed} seconds...")
                screenshot = self._take_screenshot()
                # Save screenshot for debugging (optional)
                screenshot_path = os.path.join(temp_dir, f"frame_{seconds_elapsed}.jpg")
                screenshot.save(screenshot_path)
                # Analyze screenshot and store results
                object_count = self._analyze_screenshot(screenshot, label)
                results.append({
                    "time": seconds_elapsed,
                    "object_count": object_count,
                    "screenshot_path": screenshot_path,
                })
                # Wait for next interval (skipped after the last frame)
                if seconds_elapsed + interval < duration:
                    time.sleep(interval)
            except Exception as e:
                print(f"Error capturing frame at {seconds_elapsed} seconds: {str(e)}")
        return results

    def forward(self, url: str, label: str, duration: int = 30, interval: int = 1) -> str:
        """
        Analyzes a video on a webpage by taking screenshots and counting objects.
        Args:
            url (str): The URL of the webpage containing the video.
            label (str): The type of object to count (e.g., 'bird', 'person', 'car', 'dog').
            duration (int): How many seconds of the video to analyze.
            interval (int): How often to take screenshots (in seconds).
        Returns:
            str: A detailed report of object counts over time, or a message
            starting with "Error" when the analysis could not be completed.
        """
        # The inputs are nullable, so fall back to the documented defaults.
        duration = 30 if duration is None else duration
        interval = 1 if interval is None else interval
        if duration <= 0 or interval <= 0:
            # Reject before opening a browser; range() would fail or be empty.
            return (
                "Error: duration and interval must be positive integers "
                f"(got duration={duration}, interval={interval})"
            )
        try:
            # Setup the browser
            self._setup_browser()
            # Navigate to the video
            if not self._navigate_to_video(url):
                return f"Error: Could not navigate to or play the video at {url}"
            # Close any popups or overlays
            self._close_popups()
            # Capture and analyze frames
            frame_results = self._capture_video_frames(duration, interval, label)
            if not frame_results:
                return "Error: No frames were successfully captured and analyzed"
            # Calculate summary statistics
            total_objects = sum(result["object_count"] for result in frame_results)
            avg_objects = total_objects / len(frame_results)
            peak = max(frame_results, key=lambda x: x["object_count"])
            # Generate a report
            report = [
                f"# {label.title()} Count Analysis for Video",
                f"Video URL: {url}",
                f"Analysis duration: {duration} seconds",
                f"Screenshots taken: {len(frame_results)} (every {interval} second(s))",
                "",
                "## Summary",
                f"Total {label}s detected: {total_objects}",
                f"Average {label}s per screenshot: {avg_objects:.2f}",
                f"Maximum {label}s in a single screenshot: {peak['object_count']} (at {peak['time']} seconds)",
                "",
                "## Time-based Analysis",
            ]
            # Add frame-by-frame details
            report.extend(
                f"Time {result['time']} seconds: {result['object_count']} {label}s"
                for result in frame_results
            )
            return "\n".join(report)
        except Exception as e:
            return f"Error analyzing video: {str(e)}"
        finally:
            # Runs on every exit path, including the early error returns, so
            # the browser is never left open.
            self._shutdown_browser()

tools/web_utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from smolagents import tool
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+driver = None
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # NOTE(review): the module-level `driver` starts as None and nothing in
    # this module assigns it; confirm that the caller sets it.
    if not driver:
        # Previously this path fell through to `return result` with `result`
        # never assigned, raising UnboundLocalError.
        return f"No browser driver is available, so '{text}' could not be searched."
    if nth_result < 1:
        # Zero or negative values would silently index from the end of the list.
        raise ValueError(f"nth_result must be 1 or greater (got {nth_result})")

    # Build an XPath string literal that stays valid when the text itself
    # contains quotes (XPath 1.0 has no escape character inside literals).
    if "'" not in text:
        needle = f"'{text}'"
    elif '"' not in text:
        needle = f'"{text}"'
    else:
        pieces = [f"'{piece}'" for piece in text.split("'")]
        needle = "concat(" + ", \"'\", ".join(pieces) + ")"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {needle})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    return (
        f"Found {len(elements)} matches for '{text}'. "
        f"Focused on element {nth_result} of {len(elements)}"
    )
@tool
def go_back() -> None:
    """Goes back to previous page.

    Does nothing when the module-level ``driver`` is unset.
    """
    if not driver:
        return
    driver.back()
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
    This does not work on cookie consent banners.
    """
    # The function is annotated as returning a string but previously returned
    # None on every path; report what happened instead.
    if not driver:
        return "No browser driver is available; nothing was closed."
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent the Escape key to the page to dismiss pop-ups."

uv.lock CHANGED Viewed

@@ -530,6 +530,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 },
 ]
 [[package]]
 name = "hf-agents-gaia-agent"
 version = "0.1.0"
@@ -537,13 +546,16 @@ source = { virtual = "." }
 dependencies = [
     { name = "ffmpeg" },
     { name = "gradio", extra = ["oauth"] },
     { name = "litellm" },
     { name = "numpy" },
     { name = "openai" },
     { name = "opencv-python" },
     { name = "pandas" },
     { name = "python-dotenv" },
     { name = "requests" },
     { name = "smolagents", extra = ["litellm"] },
     { name = "timm" },
     { name = "torch" },
@@ -557,13 +569,16 @@ dependencies = [
 requires-dist = [
     { name = "ffmpeg", specifier = ">=1.4" },
     { name = "gradio", extras = ["oauth"], specifier = ">=5.27.0" },
     { name = "litellm", specifier = "==1.67.1" },
     { name = "numpy", specifier = ">=2.2.5" },
     { name = "openai", specifier = ">=1.76.0" },
     { name = "opencv-python", specifier = ">=4.11.0.86" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "python-dotenv", specifier = ">=1.1.0" },
     { name = "requests", specifier = ">=2.32.3" },
     { name = "smolagents", extras = ["litellm"], specifier = ">=1.14.0" },
     { name = "timm", specifier = ">=1.0.15" },
     { name = "torch", specifier = ">=2.7.0" },
@@ -1180,6 +1195,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/81/9c/b66ce9245ff319df2c3278acd351a3f6145ef34b4a2d7f4b0f739368370f/orjson-3.10.16-cp313-cp313-win_amd64.whl", hash = "sha256:fe0a145e96d51971407cb8ba947e63ead2aa915db59d6631a355f5f2150b56b7", size = 133954 },
 ]
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -1421,6 +1448,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
 ]
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -1672,6 +1708,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/69/e2/b011c38e5394c4c18fb5500778a55ec43ad6106126e74723ffaee246f56e/safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11", size = 308878 },
 ]
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
@@ -1741,6 +1794,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
 ]
 [[package]]
 name = "soupsieve"
 version = "2.7"
@@ -1948,6 +2010,37 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl", hash = "sha256:fd3279633ceb2b777013234bbf0b4f5c2d23c4626b05497691f00cfda55e8a83", size = 10383940 },
 ]
 [[package]]
 name = "triton"
 version = "3.3.0"
@@ -2015,6 +2108,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
 ]
 [[package]]
 name = "uvicorn"
 version = "0.34.2"
@@ -2028,6 +2126,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b1/4b/4cef6ce21a2aaca9d852a6e84ef4f135d99fcd74fa75105e2fc0c8308acd/uvicorn-0.34.2-py3-none-any.whl", hash = "sha256:deb49af569084536d269fe0a6d67e3754f104cf03aba7c11c40f01aadf33c403", size = 62483 },
 ]
 [[package]]
 name = "websockets"
 version = "15.0.1"
@@ -2068,6 +2175,18 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b9/aa/2e35be124dfc7e581480705f912040172f6570cc12e68a245ba9258c32ef/wikipedia_api-0.8.1.tar.gz", hash = "sha256:b31e93b3f5407c1a1ba413ed7326a05379a3c270df6cf6a211aca67a14c5658b", size = 19934 }
 [[package]]
 name = "yarl"
 version = "1.20.0"

     { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 },
 ]
+[[package]]
+name = "helium"
+version = "5.1.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "selenium" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d1/71/6f2bef5db8741467848f2b2c7f7818df44234df0de0917428a16da3f6e81/helium-5.1.1.tar.gz", hash = "sha256:8232b6597d24b435cda4e18a95ae883ff0bdcdbff70f98a6cb41133864d2d493", size = 40494 }
 [[package]]
 name = "hf-agents-gaia-agent"
 version = "0.1.0"
 dependencies = [
     { name = "ffmpeg" },
     { name = "gradio", extra = ["oauth"] },
+    { name = "helium" },
     { name = "litellm" },
     { name = "numpy" },
     { name = "openai" },
     { name = "opencv-python" },
     { name = "pandas" },
+    { name = "pillow" },
     { name = "python-dotenv" },
     { name = "requests" },
+    { name = "selenium" },
     { name = "smolagents", extra = ["litellm"] },
     { name = "timm" },
     { name = "torch" },
 requires-dist = [
     { name = "ffmpeg", specifier = ">=1.4" },
     { name = "gradio", extras = ["oauth"], specifier = ">=5.27.0" },
+    { name = "helium", specifier = ">=5.1.1" },
     { name = "litellm", specifier = "==1.67.1" },
     { name = "numpy", specifier = ">=2.2.5" },
     { name = "openai", specifier = ">=1.76.0" },
     { name = "opencv-python", specifier = ">=4.11.0.86" },
     { name = "pandas", specifier = ">=2.2.3" },
+    { name = "pillow", specifier = ">=11.2.1" },
     { name = "python-dotenv", specifier = ">=1.1.0" },
     { name = "requests", specifier = ">=2.32.3" },
+    { name = "selenium", specifier = ">=4.31.0" },
     { name = "smolagents", extras = ["litellm"], specifier = ">=1.14.0" },
     { name = "timm", specifier = ">=1.0.15" },
     { name = "torch", specifier = ">=2.7.0" },
     { url = "https://files.pythonhosted.org/packages/81/9c/b66ce9245ff319df2c3278acd351a3f6145ef34b4a2d7f4b0f739368370f/orjson-3.10.16-cp313-cp313-win_amd64.whl", hash = "sha256:fe0a145e96d51971407cb8ba947e63ead2aa915db59d6631a355f5f2150b56b7", size = 133954 },
 ]
+[[package]]
+name = "outcome"
+version = "1.3.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692 },
+]
 [[package]]
 name = "packaging"
 version = "25.0"
     { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
 ]
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725 },
+]
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
     { url = "https://files.pythonhosted.org/packages/69/e2/b011c38e5394c4c18fb5500778a55ec43ad6106126e74723ffaee246f56e/safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11", size = 308878 },
 ]
+[[package]]
+name = "selenium"
+version = "4.31.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "trio" },
+    { name = "trio-websocket" },
+    { name = "typing-extensions" },
+    { name = "urllib3", extra = ["socks"] },
+    { name = "websocket-client" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e0/bf/642cce8b5a9edad8e4880fdefbeb24f69bec2086b1121c63f883c412b797/selenium-4.31.0.tar.gz", hash = "sha256:441cffc436a2e6659fe3cfb012692435652efd38b0d368d16f661a5db47825f5", size = 855418 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/53/212db779d2481b0a8428365960596f8d5a4d482ae12c441d0507fd54aaf2/selenium-4.31.0-py3-none-any.whl", hash = "sha256:7b8b8d5e424d7133cb7aa656263b19ac505ec26d65c0f921a696e7e2c5ccd95b", size = 9350584 },
+]
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
     { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
 ]
+[[package]]
+name = "sortedcontainers"
+version = "2.4.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575 },
+]
 [[package]]
 name = "soupsieve"
 version = "2.7"
     { url = "https://files.pythonhosted.org/packages/a9/b6/5257d04ae327b44db31f15cce39e6020cc986333c715660b1315a9724d82/transformers-4.51.3-py3-none-any.whl", hash = "sha256:fd3279633ceb2b777013234bbf0b4f5c2d23c4626b05497691f00cfda55e8a83", size = 10383940 },
 ]
+[[package]]
+name = "trio"
+version = "0.30.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "attrs" },
+    { name = "cffi", marker = "(implementation_name != 'pypy' and os_name == 'nt' and platform_machine != 'aarch64' and sys_platform == 'linux') or (implementation_name != 'pypy' and os_name == 'nt' and sys_platform != 'darwin' and sys_platform != 'linux')" },
+    { name = "idna" },
+    { name = "outcome" },
+    { name = "sniffio" },
+    { name = "sortedcontainers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/01/c1/68d582b4d3a1c1f8118e18042464bb12a7c1b75d64d75111b297687041e3/trio-0.30.0.tar.gz", hash = "sha256:0781c857c0c81f8f51e0089929a26b5bb63d57f927728a5586f7e36171f064df", size = 593776 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/8e/3f6dfda475ecd940e786defe6df6c500734e686c9cd0a0f8ef6821e9b2f2/trio-0.30.0-py3-none-any.whl", hash = "sha256:3bf4f06b8decf8d3cf00af85f40a89824669e2d033bb32469d34840edcfc22a5", size = 499194 },
+]
+[[package]]
+name = "trio-websocket"
+version = "0.12.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "outcome" },
+    { name = "trio" },
+    { name = "wsproto" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221 },
+]
 [[package]]
 name = "triton"
 version = "3.3.0"
     { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680 },
 ]
+[package.optional-dependencies]
+socks = [
+    { name = "pysocks" },
+]
 [[package]]
 name = "uvicorn"
 version = "0.34.2"
     { url = "https://files.pythonhosted.org/packages/b1/4b/4cef6ce21a2aaca9d852a6e84ef4f135d99fcd74fa75105e2fc0c8308acd/uvicorn-0.34.2-py3-none-any.whl", hash = "sha256:deb49af569084536d269fe0a6d67e3754f104cf03aba7c11c40f01aadf33c403", size = 62483 },
 ]
+[[package]]
+name = "websocket-client"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e6/30/fba0d96b4b5fbf5948ed3f4681f7da2f9f64512e1d303f94b4cc174c24a5/websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da", size = 54648 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/84/44687a29792a70e111c5c477230a72c4b957d88d16141199bf9acb7537a3/websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526", size = 58826 },
+]
 [[package]]
 name = "websockets"
 version = "15.0.1"
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b9/aa/2e35be124dfc7e581480705f912040172f6570cc12e68a245ba9258c32ef/wikipedia_api-0.8.1.tar.gz", hash = "sha256:b31e93b3f5407c1a1ba413ed7326a05379a3c270df6cf6a211aca67a14c5658b", size = 19934 }
+[[package]]
+name = "wsproto"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c9/4a/44d3c295350d776427904d73c189e10aeae66d7f555bb2feee16d1e4ba5a/wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", size = 53425 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/58/e860788190eba3bcce367f74d29c4675466ce8dddfba85f7827588416f01/wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736", size = 24226 },
+]
 [[package]]
 name = "yarl"
 version = "1.20.0"