M-Rique committed on
Commit
64b82de
·
1 Parent(s): 5cc8c9a

Fix smolagents integration

Browse files
Files changed (3) hide show
  1. app.py +14 -28
  2. e2bqwen.py +50 -223
  3. eval.py +3 -3
app.py CHANGED
@@ -16,6 +16,10 @@ from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
  from model_replay import FakeModelReplayLog
17
  from gradio_modal import Modal
18
 
 
 
 
 
19
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
20
 
21
  E2B_API_KEY = os.getenv("E2B_API_KEY")
@@ -28,7 +32,7 @@ TMP_DIR = './tmp/'
28
  if not os.path.exists(TMP_DIR):
29
  os.makedirs(TMP_DIR)
30
 
31
- hf_token = os.getenv("HUGGINGFACE_API_KEY")
32
  login(token=hf_token)
33
 
34
  custom_css = """
@@ -297,25 +301,6 @@ custom_js = """function() {
297
  }
298
  """
299
 
300
- def write_to_console_log(log_file_path, message):
301
- """
302
- Appends a message to the specified log file with a newline character.
303
-
304
- Parameters:
305
- log_file_path (str): Path to the log file
306
- message (str): Message to append to the log file
307
- """
308
- if log_file_path is None:
309
- return False
310
- try:
311
- # Open the file in append mode
312
- with open(log_file_path, 'a') as log_file:
313
- # Write the message followed by a newline
314
- log_file.write(f"{message}\n")
315
- return True
316
- except Exception as e:
317
- print(f"Error writing to log file: {str(e)}")
318
- return False
319
 
320
  def upload_to_hf_and_remove(folder_path):
321
 
@@ -472,16 +457,16 @@ def create_agent(data_dir, desktop):
472
  hf_token = hf_token,
473
  )
474
 
475
- model = OpenAIServerModel(
476
- "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
477
- )
478
  return E2BVisionAgent(
479
  model=model,
480
  data_dir=data_dir,
481
  desktop=desktop,
482
  max_steps=200,
483
  verbosity_level=2,
484
- planning_interval=10,
485
  use_v1_prompt=True
486
  )
487
 
@@ -527,13 +512,14 @@ class EnrichedGradioUI(GradioUI):
527
  yield stored_messages
528
 
529
  # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
530
- if consent_storage:
531
- summary = get_agent_summary_erase_images(session_state["agent"])
532
- save_final_status(data_dir, "completed", summary = summary)
533
  yield stored_messages
534
 
535
  except Exception as e:
536
  error_message=f"Error in interaction: {str(e)}"
 
537
  print(error_message)
538
  stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
539
  if consent_storage:
@@ -584,7 +570,7 @@ _Please note that we store the task logs by default so **do not write any person
584
  "Check the commuting time between Bern and Zurich on Google maps",
585
  "Write 'Hello World' in a text editor",
586
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
587
- "Search a flight Rome - Berlin for tomorrow",
588
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
589
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
590
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
 
16
  from model_replay import FakeModelReplayLog
17
  from gradio_modal import Modal
18
 
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv(override=True)
22
+
23
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
24
 
25
  E2B_API_KEY = os.getenv("E2B_API_KEY")
 
32
  if not os.path.exists(TMP_DIR):
33
  os.makedirs(TMP_DIR)
34
 
35
+ hf_token = os.getenv("HF_TOKEN")
36
  login(token=hf_token)
37
 
38
  custom_css = """
 
301
  }
302
  """
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
  def upload_to_hf_and_remove(folder_path):
306
 
 
457
  hf_token = hf_token,
458
  )
459
 
460
+ # model = OpenAIServerModel(
461
+ # "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
462
+ # )
463
  return E2BVisionAgent(
464
  model=model,
465
  data_dir=data_dir,
466
  desktop=desktop,
467
  max_steps=200,
468
  verbosity_level=2,
469
+ # planning_interval=10,
470
  use_v1_prompt=True
471
  )
472
 
 
512
  yield stored_messages
513
 
514
  # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
515
+ # if consent_storage:
516
+ # summary = get_agent_summary_erase_images(session_state["agent"])
517
+ # save_final_status(data_dir, "completed", summary = summary)
518
  yield stored_messages
519
 
520
  except Exception as e:
521
  error_message=f"Error in interaction: {str(e)}"
522
+ raise e
523
  print(error_message)
524
  stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
525
  if consent_storage:
 
570
  "Check the commuting time between Bern and Zurich on Google maps",
571
  "Write 'Hello World' in a text editor",
572
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
573
+ "Search a flight from Rome to Berlin for tomorrow on Skyscanner",
574
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
575
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
576
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
e2bqwen.py CHANGED
@@ -1,15 +1,9 @@
1
  import os
2
  import time
3
- import base64
4
  from io import BytesIO
5
- from textwrap import dedent
6
- from typing import Any, Dict, List, Optional, Tuple
7
- import json
8
  import unicodedata
9
 
10
- # HF API params
11
- from huggingface_hub import InferenceClient
12
-
13
  # E2B imports
14
  from e2b_desktop import Sandbox
15
  from PIL import Image
@@ -17,7 +11,8 @@ from PIL import Image
17
  # SmolaAgents imports
18
  from smolagents import CodeAgent, tool, HfApiModel
19
  from smolagents.memory import ActionStep
20
- from smolagents.models import ChatMessage, MessageRole, Model
 
21
  from smolagents.monitoring import LogLevel
22
  from smolagents.agent_types import AgentImage
23
  from PIL import ImageDraw
@@ -48,7 +43,7 @@ On top of performing computations in the Python code snippets that you create, y
48
  {%- endfor %}
49
 
50
  The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates.
51
- If you clicked somewhere in the previous action, a red crosshair will appear at the exact location of the previous click.
52
  The image might have change since then but the cross stays at the previous click. If your click seems to have changed nothing, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
53
  </tools>
54
 
@@ -98,7 +93,7 @@ click(251, 441)
98
  Step 4:
99
  Short term goal: I want to open a text editor.
100
  Where I am: I am still under the Accessories menu.
101
- What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The red crosshair is off from the element.
102
  Reflection: My last click must have been off. Let's correct this.
103
  Action: I will click the correct place, right in the middle of the element.
104
  Code:
@@ -145,7 +140,7 @@ On each step, look at the last screenshot and action to validate if previous ste
145
  Use click to move through menus on the desktop and scroll for web and specific applications.
146
  Always analyze the latest screenshot carefully before performing actions.
147
  Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
148
- Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
149
  </general_guidelines>
150
  """
151
 
@@ -153,21 +148,13 @@ def draw_marker_on_image(image_copy, click_coordinates):
153
  x, y = click_coordinates
154
  draw = ImageDraw.Draw(image_copy)
155
  cross_size, linewidth = 10, 3
156
- # Draw red cross lines
157
- draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
158
- draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
159
  # Add a circle around it for better visibility
160
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
161
  return image_copy
162
 
163
- from jinja2 import StrictUndefined, Template
164
-
165
-
166
- def populate_template(template: str, variables: Dict[str, Any]) -> str:
167
- compiled_template = Template(template, undefined=StrictUndefined)
168
- return compiled_template.render(**variables)
169
-
170
-
171
 
172
  class E2BVisionAgent(CodeAgent):
173
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
@@ -179,7 +166,7 @@ class E2BVisionAgent(CodeAgent):
179
  tools: List[tool] = None,
180
  max_steps: int = 200,
181
  verbosity_level: LogLevel = 2,
182
- planning_interval: int = 10,
183
  use_v1_prompt: bool = False,
184
  **kwargs
185
  ):
@@ -216,7 +203,7 @@ class E2BVisionAgent(CodeAgent):
216
  self.step_callbacks.append(self.take_screenshot_callback)
217
 
218
  def initialize_system_prompt(self) -> str:
219
- if self.use_v1_prompt:
220
  return """You are a desktop automation assistant that can control a remote desktop environment.
221
  You only have access to the following tools to interact with the desktop, no additional ones:
222
  - click(x, y): Performs a left-click at the specified coordinates
@@ -228,6 +215,8 @@ You only have access to the following tools to interact with the desktop, no add
228
  - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
229
  - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
230
  - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
 
 
231
  - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
232
  The desktop has a resolution of {resolution_x}x{resolution_y}.
233
  IMPORTANT:
@@ -247,13 +236,13 @@ After each action, you'll receive an updated screenshot. Review it carefully bef
247
  COMMAND FORMAT:
248
  Always format your actions as Python code blocks. For example:
249
  ```python
250
- click(250, 300)
251
  ```<end_code>
252
  TASK EXAMPLE:
253
  For a task like "Open a text editor and type 'Hello World'":
254
  1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
255
  ```python
256
- click(50, 10)
257
  ```<end_code>
258
  2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
259
  ```python
@@ -280,6 +269,7 @@ Use click to move through menus on the desktop and scroll for web and specific a
280
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
281
  """.format(resolution_x=self.width, resolution_y=self.height)
282
  else:
 
283
  system_prompt = populate_template(
284
  self.prompt_templates["system_prompt"],
285
  variables={
@@ -405,16 +395,18 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
405
  @tool
406
  def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
407
  """
408
- Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
409
  Args:
410
  x: The x coordinate (horizontal position) of the element to scroll/zoom
411
  y: The y coordinate (vertical position) of the element to scroll/zoom
412
  direction: The direction to scroll ("up" or "down"), defaults to "down"
413
  amount: The amount to scroll. A good amount is 1 or 2.
414
  """
 
415
  self.desktop.scroll(direction=direction, amount=amount)
416
- self.logger.log(f"Scrolled {direction} by {amount}")
417
- return f"Scrolled {direction} by {amount}"
 
418
 
419
  @tool
420
  def wait(seconds: float) -> str:
@@ -430,7 +422,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
430
  @tool
431
  def open_url(url: str) -> str:
432
  """
433
- Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
434
  Args:
435
  url: The URL to open
436
  """
@@ -494,9 +486,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
494
 
495
  image_copy = image.copy()
496
 
497
- if getattr(self, "click_coordinates", None):
498
- print("DRAWING MARKER")
499
- image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
500
 
501
  self.last_marked_screenshot = AgentImage(screenshot_path)
502
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
@@ -506,7 +498,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
506
  ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
507
  if (
508
  isinstance(previous_memory_step, ActionStep)
509
- and previous_memory_step.step_number <= current_step - 2
510
  ):
511
  previous_memory_step.observations_images = None
512
 
@@ -535,81 +527,27 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
535
  print("E2B sandbox terminated")
536
 
537
 
538
- # class QwenVLAPIModel(Model):
539
- # """Model wrapper for Qwen2.5VL API with fallback mechanism"""
540
-
541
- # def __init__(
542
- # self,
543
- # model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
544
- # hf_token: str = None,
545
- # ):
546
- # super().__init__()
547
- # self.model_id = model_id
548
- # self.base_model = HfApiModel(
549
- # model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
550
- # token=hf_token,
551
- # max_tokens=4096,
552
- # )
553
- # self.fallback_model = HfApiModel(
554
- # model_id,
555
- # provider="nebius",
556
- # token=hf_token,
557
- # max_tokens=4096,
558
- # )
559
-
560
- # def __call__(
561
- # self,
562
- # messages: List[Dict[str, Any]],
563
- # stop_sequences: Optional[List[str]] = None,
564
- # **kwargs
565
- # ) -> ChatMessage:
566
-
567
- # try:
568
- # message = self.base_model(messages, stop_sequences, **kwargs)
569
- # return message
570
- # except Exception as e:
571
- # print(f"Base model failed with error: {e}. Calling fallback model.")
572
-
573
- # # Continue to fallback
574
- # try:
575
- # message = self.fallback_model(messages, stop_sequences, **kwargs)
576
- # return message
577
- # except Exception as e:
578
- # raise Exception(f"Both endpoints failed. Last error: {e}")
579
-
580
  class QwenVLAPIModel(Model):
581
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
582
 
583
  def __init__(
584
  self,
585
- model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
586
- provider: str = "hyperbolic",
587
  hf_token: str = None,
588
- #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
589
- #hf_base_url: str = "https://s41ydkv0iyjeokyj.us-east-1.aws.endpoints.huggingface.cloud/v1/"
590
- #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
591
- hf_base_url: str= "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
592
  ):
593
  super().__init__()
594
- self.model_path = model_path
595
- self.model_id = model_path
596
- self.provider = provider
597
- self.hf_token = hf_token
598
- self.hf_base_url = hf_base_url
599
-
600
- # Initialize hyperbolic client
601
- self.hyperbolic_client = InferenceClient(
602
- provider=self.provider,
 
 
603
  )
604
-
605
- # Initialize HF OpenAI-compatible client if token is provided
606
- self.hf_client = None
607
- if hf_token:
608
- from openai import OpenAI
609
- self.hf_client = OpenAI(
610
- base_url=self.hf_base_url,
611
- api_key=self.hf_token
612
- )
613
 
614
  def __call__(
615
  self,
@@ -617,129 +555,18 @@ class QwenVLAPIModel(Model):
617
  stop_sequences: Optional[List[str]] = None,
618
  **kwargs
619
  ) -> ChatMessage:
620
- """Convert a list of messages to an API request with fallback mechanism"""
621
- print(messages)
622
- # Format messages once for both APIs
623
- formatted_messages = self._format_messages(messages)
624
-
625
- # First try the HF endpoint if available
626
- if self.hf_client:
627
- try:
628
- completion = self._call_hf_endpoint(
629
- formatted_messages,
630
- stop_sequences,
631
- **kwargs
632
- )
633
- return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
634
- except Exception as e:
635
- print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
636
- # Continue to fallback
637
 
638
- # Fallback to hyperbolic
639
  try:
640
- return self._call_hyperbolic(formatted_messages, stop_sequences, **kwargs)
 
 
 
 
 
 
 
 
 
641
  except Exception as e:
 
642
  raise Exception(f"Both endpoints failed. Last error: {e}")
643
-
644
- def _format_messages(self, messages: List[Dict[str, Any]]):
645
- """Format messages for API requests - works for both endpoints"""
646
-
647
- formatted_messages = []
648
-
649
- for msg in messages:
650
- role = msg["role"]
651
- content = []
652
-
653
- if isinstance(msg["content"], list):
654
- for item in msg["content"]:
655
- if item["type"] == "text":
656
- content.append({"type": "text", "text": item["text"]})
657
- elif item["type"] == "image":
658
- # Handle image path or direct image object
659
- if isinstance(item["image"], str):
660
- # Image is a path
661
- with open(item["image"], "rb") as image_file:
662
- base64_image = base64.b64encode(image_file.read()).decode("utf-8")
663
- else:
664
- # Image is a PIL image or similar object
665
- img_byte_arr = io.BytesIO()
666
- item["image"].save(img_byte_arr, format="PNG")
667
- base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
668
-
669
- content.append({
670
- "type": "image_url",
671
- "image_url": {
672
- "url": f"data:image/png;base64,{base64_image}"
673
- }
674
- })
675
- else:
676
- # Plain text message
677
- content = [{"type": "text", "text": msg["content"]}]
678
-
679
- formatted_messages.append({"role": role, "content": content})
680
-
681
- return formatted_messages
682
-
683
- def _call_hf_endpoint(self, formatted_messages, stop_sequences=None, **kwargs):
684
- """Call the Hugging Face OpenAI-compatible endpoint"""
685
-
686
- # Extract parameters with defaults
687
- max_tokens = kwargs.get("max_new_tokens", 512)
688
- temperature = kwargs.get("temperature", 0.7)
689
- top_p = kwargs.get("top_p", 0.9)
690
- stream = kwargs.get("stream", False)
691
-
692
- completion = self.hf_client.chat.completions.create(
693
- model="tgi", # Model name for the endpoint
694
- messages=formatted_messages,
695
- max_tokens=max_tokens,
696
- temperature=temperature,
697
- top_p=top_p,
698
- stream=stream,
699
- stop=stop_sequences
700
- )
701
-
702
- if stream:
703
- # For streaming responses, return a generator
704
- def stream_generator():
705
- for chunk in completion:
706
- yield chunk.choices[0].delta.content or ""
707
- return stream_generator()
708
- else:
709
- # For non-streaming, return the full text
710
- return completion.choices[0].message.content
711
-
712
- def _call_hyperbolic(self, formatted_messages, stop_sequences=None, **kwargs):
713
- """Call the hyperbolic API"""
714
-
715
- completion = self.hyperbolic_client.chat.completions.create(
716
- model=self.model_path,
717
- messages=formatted_messages,
718
- max_tokens=kwargs.get("max_new_tokens", 512),
719
- temperature=kwargs.get("temperature", 0.7),
720
- top_p=kwargs.get("top_p", 0.9),
721
- )
722
-
723
- # Extract the response text
724
- output_text = completion.choices[0].message.content
725
-
726
- return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
727
-
728
- def to_dict(self) -> Dict[str, Any]:
729
- """Convert the model to a dictionary"""
730
- return {
731
- "class": self.__class__.__name__,
732
- "model_path": self.model_path,
733
- "provider": self.provider,
734
- "hf_base_url": self.hf_base_url,
735
- # We don't save the API keys for security reasons
736
- }
737
-
738
- @classmethod
739
- def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
740
- """Create a model from a dictionary"""
741
- return cls(
742
- model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
743
- provider=data.get("provider", "hyperbolic"),
744
- hf_base_url=data.get("hf_base_url", "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"),
745
- )
 
1
  import os
2
  import time
 
3
  from io import BytesIO
4
+ from typing import Any, Dict, List, Optional
 
 
5
  import unicodedata
6
 
 
 
 
7
  # E2B imports
8
  from e2b_desktop import Sandbox
9
  from PIL import Image
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
  from smolagents.memory import ActionStep
14
+ from smolagents.models import ChatMessage, Model
15
+ from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
17
  from smolagents.agent_types import AgentImage
18
  from PIL import ImageDraw
 
43
  {%- endfor %}
44
 
45
  The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates.
46
+ If you clicked somewhere in the previous action, a green crosshair will appear at the exact location of the previous click.
47
  The image might have change since then but the cross stays at the previous click. If your click seems to have changed nothing, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
48
  </tools>
49
 
 
93
  Step 4:
94
  Short term goal: I want to open a text editor.
95
  Where I am: I am still under the Accessories menu.
96
+ What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The green cross is off from the element.
97
  Reflection: My last click must have been off. Let's correct this.
98
  Action: I will click the correct place, right in the middle of the element.
99
  Code:
 
140
  Use click to move through menus on the desktop and scroll for web and specific applications.
141
  Always analyze the latest screenshot carefully before performing actions.
142
  Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
143
+ NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url
144
  </general_guidelines>
145
  """
146
 
 
148
  x, y = click_coordinates
149
  draw = ImageDraw.Draw(image_copy)
150
  cross_size, linewidth = 10, 3
151
+ # Draw cross
152
+ draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
153
+ draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
154
  # Add a circle around it for better visibility
155
+ draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
156
  return image_copy
157
 
 
 
 
 
 
 
 
 
158
 
159
  class E2BVisionAgent(CodeAgent):
160
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
166
  tools: List[tool] = None,
167
  max_steps: int = 200,
168
  verbosity_level: LogLevel = 2,
169
+ planning_interval: int = None,
170
  use_v1_prompt: bool = False,
171
  **kwargs
172
  ):
 
203
  self.step_callbacks.append(self.take_screenshot_callback)
204
 
205
  def initialize_system_prompt(self) -> str:
206
+ if True:
207
  return """You are a desktop automation assistant that can control a remote desktop environment.
208
  You only have access to the following tools to interact with the desktop, no additional ones:
209
  - click(x, y): Performs a left-click at the specified coordinates
 
215
  - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
216
  - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
217
  - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
218
+ - drag_and_drop(x1, y1, x2, y2): Clicks [x1, y1], drags mouse to [x2, y2], then releases click.
219
+ - find_on_page_ctrl_f(search_string): Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F.
220
  - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
221
  The desktop has a resolution of {resolution_x}x{resolution_y}.
222
  IMPORTANT:
 
236
  COMMAND FORMAT:
237
  Always format your actions as Python code blocks. For example:
238
  ```python
239
+ click(250, 304)
240
  ```<end_code>
241
  TASK EXAMPLE:
242
  For a task like "Open a text editor and type 'Hello World'":
243
  1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
244
  ```python
245
+ click(52, 10)
246
  ```<end_code>
247
  2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
248
  ```python
 
269
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
270
  """.format(resolution_x=self.width, resolution_y=self.height)
271
  else:
272
+ print("USING v2 prompt")
273
  system_prompt = populate_template(
274
  self.prompt_templates["system_prompt"],
275
  variables={
 
395
  @tool
396
  def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
397
  """
398
+ Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
399
  Args:
400
  x: The x coordinate (horizontal position) of the element to scroll/zoom
401
  y: The y coordinate (vertical position) of the element to scroll/zoom
402
  direction: The direction to scroll ("up" or "down"), defaults to "down"
403
  amount: The amount to scroll. A good amount is 1 or 2.
404
  """
405
+ self.desktop.move_mouse(x, y)
406
  self.desktop.scroll(direction=direction, amount=amount)
407
+ message = f"Scrolled {direction} by {amount}"
408
+ self.logger.log(message)
409
+ return message
410
 
411
  @tool
412
  def wait(seconds: float) -> str:
 
422
  @tool
423
  def open_url(url: str) -> str:
424
  """
425
+ Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
426
  Args:
427
  url: The URL to open
428
  """
 
486
 
487
  image_copy = image.copy()
488
 
489
+ # if getattr(self, "click_coordinates", None):
490
+ # print("DRAWING MARKER")
491
+ # image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
492
 
493
  self.last_marked_screenshot = AgentImage(screenshot_path)
494
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
 
498
  ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
499
  if (
500
  isinstance(previous_memory_step, ActionStep)
501
+ and previous_memory_step.step_number <= current_step - 1
502
  ):
503
  previous_memory_step.observations_images = None
504
 
 
527
  print("E2B sandbox terminated")
528
 
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  class QwenVLAPIModel(Model):
531
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
532
 
533
  def __init__(
534
  self,
535
+ model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
 
536
  hf_token: str = None,
 
 
 
 
537
  ):
538
  super().__init__()
539
+ self.model_id = model_id
540
+ self.base_model = HfApiModel(
541
+ model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
542
+ token=hf_token,
543
+ max_tokens=4096,
544
+ )
545
+ self.fallback_model = HfApiModel(
546
+ model_id,
547
+ provider="nebius",
548
+ token=hf_token,
549
+ max_tokens=4096,
550
  )
 
 
 
 
 
 
 
 
 
551
 
552
  def __call__(
553
  self,
 
555
  stop_sequences: Optional[List[str]] = None,
556
  **kwargs
557
  ) -> ChatMessage:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
 
559
  try:
560
+ message = self.base_model(messages, stop_sequences, **kwargs)
561
+ return message
562
+ except Exception as e:
563
+ raise e
564
+ print(f"Base model failed with error: {e}. Calling fallback model.")
565
+
566
+ # Continue to fallback
567
+ try:
568
+ message = self.fallback_model(messages, stop_sequences, **kwargs)
569
+ return message
570
  except Exception as e:
571
+ raise e
572
  raise Exception(f"Both endpoints failed. Last error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval.py CHANGED
@@ -18,7 +18,7 @@ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
19
  from dotenv import load_dotenv
20
 
21
- load_dotenv()
22
  # Environment variables and constants
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  # Try to get token dynamically, fall back to environment variable
@@ -290,7 +290,7 @@ def main():
290
  "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
291
  "flight": "Search a flight Rome - Berlin for tomorrow",
292
  "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
293
- "flux": "Go generate a picture of the Golden Gate bridge on a FLUX1.dev space",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
 
@@ -298,7 +298,7 @@ def main():
298
  os.makedirs(args.output_dir, exist_ok=True)
299
 
300
  # Run the evaluation
301
- eval_dir = run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
302
 
303
  if __name__ == "__main__":
304
  main()
 
18
 
19
  from dotenv import load_dotenv
20
 
21
+ load_dotenv(override=True)
22
  # Environment variables and constants
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  # Try to get token dynamically, fall back to environment variable
 
290
  "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
291
  "flight": "Search a flight Rome - Berlin for tomorrow",
292
  "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
293
+ "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
 
 
298
  os.makedirs(args.output_dir, exist_ok=True)
299
 
300
  # Run the evaluation
301
+ run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
302
 
303
  if __name__ == "__main__":
304
  main()