Fix smolagents integration
Files changed:
- app.py +14 -28
- e2bqwen.py +50 -223
- eval.py +3 -3
app.py
CHANGED
````diff
@@ -16,6 +16,10 @@ from smolagents.gradio_ui import GradioUI, stream_to_gradio
 from model_replay import FakeModelReplayLog
 from gradio_modal import Modal
 
+from dotenv import load_dotenv
+
+load_dotenv(override=True)
+
 from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 
 E2B_API_KEY = os.getenv("E2B_API_KEY")
@@ -28,7 +32,7 @@ TMP_DIR = './tmp/'
 if not os.path.exists(TMP_DIR):
     os.makedirs(TMP_DIR)
 
-hf_token = os.getenv("
+hf_token = os.getenv("HF_TOKEN")
 login(token=hf_token)
 
 custom_css = """
@@ -297,25 +301,6 @@ custom_js = """function() {
 }
 """
 
-def write_to_console_log(log_file_path, message):
-    """
-    Appends a message to the specified log file with a newline character.
-
-    Parameters:
-        log_file_path (str): Path to the log file
-        message (str): Message to append to the log file
-    """
-    if log_file_path is None:
-        return False
-    try:
-        # Open the file in append mode
-        with open(log_file_path, 'a') as log_file:
-            # Write the message followed by a newline
-            log_file.write(f"{message}\n")
-        return True
-    except Exception as e:
-        print(f"Error writing to log file: {str(e)}")
-        return False
 
 def upload_to_hf_and_remove(folder_path):
@@ -472,16 +457,16 @@ def create_agent(data_dir, desktop):
         hf_token = hf_token,
     )
 
-    model = OpenAIServerModel(
-        "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
-    )
+    # model = OpenAIServerModel(
+    #     "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
+    # )
     return E2BVisionAgent(
         model=model,
         data_dir=data_dir,
         desktop=desktop,
         max_steps=200,
         verbosity_level=2,
-        planning_interval=10,
+        # planning_interval=10,
         use_v1_prompt=True
     )
@@ -527,13 +512,14 @@ class EnrichedGradioUI(GradioUI):
                 yield stored_messages
 
                 # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
-                if consent_storage:
-                    summary = get_agent_summary_erase_images(session_state["agent"])
-                    save_final_status(data_dir, "completed", summary = summary)
+                # if consent_storage:
+                #     summary = get_agent_summary_erase_images(session_state["agent"])
+                #     save_final_status(data_dir, "completed", summary = summary)
                 yield stored_messages
 
             except Exception as e:
                 error_message=f"Error in interaction: {str(e)}"
+                raise e
                 print(error_message)
                 stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
                 if consent_storage:
@@ -584,7 +570,7 @@ _Please note that we store the task logs by default so **do not write any person
     "Check the commuting time between Bern and Zurich on Google maps",
     "Write 'Hello World' in a text editor",
     "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
-    "Search a flight Rome - Berlin for tomorrow",
+    "Search a flight from Rome to Berlin for tomorrow on Skyscanner",
     "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
     "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
     "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
````
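The functional core of the app.py changes is the switch to `.env`-based configuration before the Hugging Face login. A minimal sketch of the resulting startup sequence, assuming only that `HF_TOKEN` is defined in a local `.env` file (the `if hf_token` guard is added here for illustration; the commit calls `login()` unconditionally):

```python
import os

from dotenv import load_dotenv
from huggingface_hub import login

# override=True lets values from .env take precedence over variables already
# exported in the shell, so a stale HF_TOKEN in the environment cannot win.
load_dotenv(override=True)

hf_token = os.getenv("HF_TOKEN")
if hf_token:  # illustration only: the committed code logs in unconditionally
    login(token=hf_token)
```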
e2bqwen.py
CHANGED
````diff
@@ -1,15 +1,9 @@
 import os
 import time
-import base64
 from io import BytesIO
-from
-from typing import Any, Dict, List, Optional, Tuple
-import json
+from typing import Any, Dict, List, Optional
 import unicodedata
 
-# HF API params
-from huggingface_hub import InferenceClient
-
 # E2B imports
 from e2b_desktop import Sandbox
 from PIL import Image
@@ -17,7 +11,8 @@ from PIL import Image
 # SmolaAgents imports
 from smolagents import CodeAgent, tool, HfApiModel
 from smolagents.memory import ActionStep
-from smolagents.models import ChatMessage,
+from smolagents.models import ChatMessage, Model
+from smolagents.agents import populate_template
 from smolagents.monitoring import LogLevel
 from smolagents.agent_types import AgentImage
 from PIL import ImageDraw
@@ -48,7 +43,7 @@ On top of performing computations in the Python code snippets that you create, y
 {%- endfor %}
 
 The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates.
-If you clicked somewhere in the previous action, a
+If you clicked somewhere in the previous action, a green crosshair will appear at the exact location of the previous click.
 The image might have change since then but the cross stays at the previous click. If your click seems to have changed nothing, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
 </tools>
 
@@ -98,7 +93,7 @@ click(251, 441)
 Step 4:
 Short term goal: I want to open a text editor.
 Where I am: I am still under the Accessories menu.
-What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The
+What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The green cross is off from the element.
 Reflection: My last click must have been off. Let's correct this.
 Action: I will click the correct place, right in the middle of the element.
 Code:
@@ -145,7 +140,7 @@ On each step, look at the last screenshot and action to validate if previous ste
 Use click to move through menus on the desktop and scroll for web and specific applications.
 Always analyze the latest screenshot carefully before performing actions.
 Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
-
+NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url
 </general_guidelines>
 """
 
@@ -153,21 +148,13 @@ def draw_marker_on_image(image_copy, click_coordinates):
     x, y = click_coordinates
     draw = ImageDraw.Draw(image_copy)
     cross_size, linewidth = 10, 3
-    # Draw
-    draw.line((x - cross_size, y, x + cross_size, y), fill="
-    draw.line((x, y - cross_size, x, y + cross_size), fill="
+    # Draw cross
+    draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
+    draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
     # Add a circle around it for better visibility
-    draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="
+    draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
     return image_copy
 
-from jinja2 import StrictUndefined, Template
-
-
-def populate_template(template: str, variables: Dict[str, Any]) -> str:
-    compiled_template = Template(template, undefined=StrictUndefined)
-    return compiled_template.render(**variables)
-
-
 
 class E2BVisionAgent(CodeAgent):
     """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
@@ -179,7 +166,7 @@ class E2BVisionAgent(CodeAgent):
         tools: List[tool] = None,
         max_steps: int = 200,
         verbosity_level: LogLevel = 2,
-        planning_interval: int =
+        planning_interval: int = None,
         use_v1_prompt: bool = False,
         **kwargs
     ):
@@ -216,7 +203,7 @@ class E2BVisionAgent(CodeAgent):
         self.step_callbacks.append(self.take_screenshot_callback)
 
     def initialize_system_prompt(self) -> str:
-        if
+        if True:
             return """You are a desktop automation assistant that can control a remote desktop environment.
 You only have access to the following tools to interact with the desktop, no additional ones:
 - click(x, y): Performs a left-click at the specified coordinates
@@ -228,6 +215,8 @@ You only have access to the following tools to interact with the desktop, no add
 - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
 - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
 - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
+- drag_and_drop(x1, y1, x2, y2): Clicks [x1, y1], drags mouse to [x2, y2], then releases click.
+- find_on_page_ctrl_f(search_string): Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F.
 - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
 The desktop has a resolution of {resolution_x}x{resolution_y}.
 IMPORTANT:
@@ -247,13 +236,13 @@ After each action, you'll receive an updated screenshot. Review it carefully bef
 COMMAND FORMAT:
 Always format your actions as Python code blocks. For example:
 ```python
-click(250,
+click(250, 304)
 ```<end_code>
 TASK EXAMPLE:
 For a task like "Open a text editor and type 'Hello World'":
 1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
 ```python
-click(
+click(52, 10)
 ```<end_code>
 2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
 ```python
@@ -280,6 +269,7 @@ Use click to move through menus on the desktop and scroll for web and specific a
 REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
 """.format(resolution_x=self.width, resolution_y=self.height)
         else:
+            print("USING v2 prompt")
            system_prompt = populate_template(
                 self.prompt_templates["system_prompt"],
                 variables={
@@ -405,16 +395,18 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         @tool
         def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
             """
-
+            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
             Args:
                 x: The x coordinate (horizontal position) of the element to scroll/zoom
                 y: The y coordinate (vertical position) of the element to scroll/zoom
                 direction: The direction to scroll ("up" or "down"), defaults to "down"
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
+            self.desktop.move_mouse(x, y)
             self.desktop.scroll(direction=direction, amount=amount)
-
-
+            message = f"Scrolled {direction} by {amount}"
+            self.logger.log(message)
+            return message
 
         @tool
         def wait(seconds: float) -> str:
@@ -430,7 +422,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         @tool
         def open_url(url: str) -> str:
             """
-            Directly opens a browser with the specified url
+            Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
             Args:
                 url: The URL to open
             """
@@ -494,9 +486,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
 
         image_copy = image.copy()
 
-        if getattr(self, "click_coordinates", None):
-            print("DRAWING MARKER")
-            image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
+        # if getattr(self, "click_coordinates", None):
+        #     print("DRAWING MARKER")
+        #     image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
 
         self.last_marked_screenshot = AgentImage(screenshot_path)
         print(f"Saved screenshot for step {current_step} to {screenshot_path}")
@@ -506,7 +498,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         ) in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
             if (
                 isinstance(previous_memory_step, ActionStep)
-                and previous_memory_step.step_number <= current_step -
+                and previous_memory_step.step_number <= current_step - 1
             ):
                 previous_memory_step.observations_images = None
 
@@ -535,81 +527,27 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
         print("E2B sandbox terminated")
 
 
-# class QwenVLAPIModel(Model):
-#     """Model wrapper for Qwen2.5VL API with fallback mechanism"""
-
-#     def __init__(
-#         self,
-#         model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
-#         hf_token: str = None,
-#     ):
-#         super().__init__()
-#         self.model_id = model_id
-#         self.base_model = HfApiModel(
-#             model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
-#             token=hf_token,
-#             max_tokens=4096,
-#         )
-#         self.fallback_model = HfApiModel(
-#             model_id,
-#             provider="nebius",
-#             token=hf_token,
-#             max_tokens=4096,
-#         )
-
-#     def __call__(
-#         self,
-#         messages: List[Dict[str, Any]],
-#         stop_sequences: Optional[List[str]] = None,
-#         **kwargs
-#     ) -> ChatMessage:
-
-#         try:
-#             message = self.base_model(messages, stop_sequences, **kwargs)
-#             return message
-#         except Exception as e:
-#             print(f"Base model failed with error: {e}. Calling fallback model.")
-
-#         # Continue to fallback
-#         try:
-#             message = self.fallback_model(messages, stop_sequences, **kwargs)
-#             return message
-#         except Exception as e:
-#             raise Exception(f"Both endpoints failed. Last error: {e}")
-
 class QwenVLAPIModel(Model):
     """Model wrapper for Qwen2.5VL API with fallback mechanism"""
 
     def __init__(
         self,
-
-        provider: str = "hyperbolic",
+        model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
         hf_token: str = None,
-        #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
-        #hf_base_url: str = "https://s41ydkv0iyjeokyj.us-east-1.aws.endpoints.huggingface.cloud/v1/"
-        #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
-        hf_base_url: str= "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
     ):
         super().__init__()
-        self.
-        self.
-
-
-
-
-
-
-            provider=
+        self.model_id = model_id
+        self.base_model = HfApiModel(
+            model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
+            token=hf_token,
+            max_tokens=4096,
+        )
+        self.fallback_model = HfApiModel(
+            model_id,
+            provider="nebius",
+            token=hf_token,
+            max_tokens=4096,
         )
-
-        # Initialize HF OpenAI-compatible client if token is provided
-        self.hf_client = None
-        if hf_token:
-            from openai import OpenAI
-            self.hf_client = OpenAI(
-                base_url=self.hf_base_url,
-                api_key=self.hf_token
-            )
 
     def __call__(
         self,
@@ -617,129 +555,18 @@ class QwenVLAPIModel(Model):
         stop_sequences: Optional[List[str]] = None,
         **kwargs
     ) -> ChatMessage:
-        """Convert a list of messages to an API request with fallback mechanism"""
-        print(messages)
-        # Format messages once for both APIs
-        formatted_messages = self._format_messages(messages)
-
-        # First try the HF endpoint if available
-        if self.hf_client:
-            try:
-                completion = self._call_hf_endpoint(
-                    formatted_messages,
-                    stop_sequences,
-                    **kwargs
-                )
-                return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
-            except Exception as e:
-                print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
-                # Continue to fallback
 
-        # Fallback to hyperbolic
         try:
-
+            message = self.base_model(messages, stop_sequences, **kwargs)
+            return message
+        except Exception as e:
+            raise e
+            print(f"Base model failed with error: {e}. Calling fallback model.")
+
+            # Continue to fallback
+            try:
+                message = self.fallback_model(messages, stop_sequences, **kwargs)
+                return message
         except Exception as e:
+            raise e
             raise Exception(f"Both endpoints failed. Last error: {e}")
-
-    def _format_messages(self, messages: List[Dict[str, Any]]):
-        """Format messages for API requests - works for both endpoints"""
-
-        formatted_messages = []
-
-        for msg in messages:
-            role = msg["role"]
-            content = []
-
-            if isinstance(msg["content"], list):
-                for item in msg["content"]:
-                    if item["type"] == "text":
-                        content.append({"type": "text", "text": item["text"]})
-                    elif item["type"] == "image":
-                        # Handle image path or direct image object
-                        if isinstance(item["image"], str):
-                            # Image is a path
-                            with open(item["image"], "rb") as image_file:
-                                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
-                        else:
-                            # Image is a PIL image or similar object
-                            img_byte_arr = io.BytesIO()
-                            item["image"].save(img_byte_arr, format="PNG")
-                            base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
-
-                        content.append({
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{base64_image}"
-                            }
-                        })
-            else:
-                # Plain text message
-                content = [{"type": "text", "text": msg["content"]}]
-
-            formatted_messages.append({"role": role, "content": content})
-
-        return formatted_messages
-
-    def _call_hf_endpoint(self, formatted_messages, stop_sequences=None, **kwargs):
-        """Call the Hugging Face OpenAI-compatible endpoint"""
-
-        # Extract parameters with defaults
-        max_tokens = kwargs.get("max_new_tokens", 512)
-        temperature = kwargs.get("temperature", 0.7)
-        top_p = kwargs.get("top_p", 0.9)
-        stream = kwargs.get("stream", False)
-
-        completion = self.hf_client.chat.completions.create(
-            model="tgi",  # Model name for the endpoint
-            messages=formatted_messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=stream,
-            stop=stop_sequences
-        )
-
-        if stream:
-            # For streaming responses, return a generator
-            def stream_generator():
-                for chunk in completion:
-                    yield chunk.choices[0].delta.content or ""
-            return stream_generator()
-        else:
-            # For non-streaming, return the full text
-            return completion.choices[0].message.content
-
-    def _call_hyperbolic(self, formatted_messages, stop_sequences=None, **kwargs):
-        """Call the hyperbolic API"""
-
-        completion = self.hyperbolic_client.chat.completions.create(
-            model=self.model_path,
-            messages=formatted_messages,
-            max_tokens=kwargs.get("max_new_tokens", 512),
-            temperature=kwargs.get("temperature", 0.7),
-            top_p=kwargs.get("top_p", 0.9),
-        )
-
-        # Extract the response text
-        output_text = completion.choices[0].message.content
-
-        return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert the model to a dictionary"""
-        return {
-            "class": self.__class__.__name__,
-            "model_path": self.model_path,
-            "provider": self.provider,
-            "hf_base_url": self.hf_base_url,
-            # We don't save the API keys for security reasons
-        }
-
-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
-        """Create a model from a dictionary"""
-        return cls(
-            model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
-            provider=data.get("provider", "hyperbolic"),
-            hf_base_url=data.get("hf_base_url", "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"),
-        )
````
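The updated `draw_marker_on_image` helper can also be exercised on its own, even though the commit comments out its only call site in `take_screenshot_callback`, so click markers are currently disabled in the app. A usage sketch (file names hypothetical; the coordinates reuse the `click(251, 441)` example from the system prompt):

```python
from PIL import Image, ImageDraw

def draw_marker_on_image(image_copy, click_coordinates):
    # Mirrors the committed helper: a green cross plus a circle at the click.
    x, y = click_coordinates
    draw = ImageDraw.Draw(image_copy)
    cross_size, linewidth = 10, 3
    draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
    draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
    draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
    return image_copy

screenshot = Image.open("screenshot.png")  # hypothetical input file
marked = draw_marker_on_image(screenshot.copy(), (251, 441))
marked.save("screenshot_marked.png")
```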
eval.py
CHANGED
````diff
@@ -18,7 +18,7 @@ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 
 from dotenv import load_dotenv
 
-load_dotenv()
+load_dotenv(override=True)
 # Environment variables and constants
 E2B_API_KEY = os.getenv("E2B_API_KEY")
 # Try to get token dynamically, fall back to environment variable
@@ -290,7 +290,7 @@ def main():
         "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
         "flight": "Search a flight Rome - Berlin for tomorrow",
         "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
-        "flux": "Go generate a picture of the Golden Gate bridge
+        "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
         "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
     }
 
@@ -298,7 +298,7 @@ def main():
     os.makedirs(args.output_dir, exist_ok=True)
 
     # Run the evaluation
-
+    run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
 
 if __name__ == "__main__":
     main()
````