Spaces: Running on CPU Upgrade
Add last click marker on screenshots
Browse files
- app.py +1 -1
- e2bqwen.py +20 -7
app.py
CHANGED
@@ -580,7 +580,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
|
|
580 |
|
581 |
gr.Examples(
|
582 |
examples=[
|
583 |
-
"Check the commuting time between Bern and Zurich",
|
584 |
"Write 'Hello World' in a text editor",
|
585 |
"Search a flight Paris - Berlin for tomorrow",
|
586 |
"Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
|
|
|
580 |
|
581 |
gr.Examples(
|
582 |
examples=[
|
583 |
+
"Check the commuting time between Bern and Zurich on Google maps",
|
584 |
"Write 'Hello World' in a text editor",
|
585 |
"Search a flight Paris - Berlin for tomorrow",
|
586 |
"Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
|
e2bqwen.py
CHANGED
@@ -19,7 +19,7 @@ from smolagents.memory import ActionStep
|
|
19 |
from smolagents.models import ChatMessage, MessageRole, Model
|
20 |
from smolagents.monitoring import LogLevel
|
21 |
from smolagents.agent_types import AgentImage
|
22 |
-
|
23 |
|
24 |
E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
|
25 |
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
@@ -35,7 +35,7 @@ IMPORTANT:
|
|
35 |
- Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
|
36 |
- Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE. In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate the coordinates extremely well. A mistake here can make the full task fail.
|
37 |
- To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
|
38 |
-
- Always analyze the latest screenshot carefully before performing actions. If you clicked somewhere in the previous action
|
39 |
|
40 |
You must proceed step by step:
|
41 |
1. Understand the task thoroughly
|
@@ -81,7 +81,6 @@ final_answer("Done")
|
|
81 |
```<end_code>
|
82 |
|
83 |
Remember to:
|
84 |
-
|
85 |
Always wait for appropriate loading times
|
86 |
Use precise coordinates based on the current screenshot
|
87 |
Execute one action at a time
|
@@ -149,6 +148,7 @@ class E2BVisionAgent(CodeAgent):
|
|
149 |
self.desktop.move_mouse(x, y)
|
150 |
self.desktop.left_click()
|
151 |
self.logger.log(f"Clicked at coordinates ({x}, {y})")
|
|
|
152 |
return f"Clicked at coordinates ({x}, {y})"
|
153 |
|
154 |
@tool
|
@@ -162,6 +162,7 @@ class E2BVisionAgent(CodeAgent):
|
|
162 |
self.desktop.move_mouse(x, y)
|
163 |
self.desktop.right_click()
|
164 |
self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
|
|
|
165 |
return f"Right-clicked at coordinates ({x}, {y})"
|
166 |
|
167 |
@tool
|
@@ -175,6 +176,7 @@ class E2BVisionAgent(CodeAgent):
|
|
175 |
self.desktop.move_mouse(x, y)
|
176 |
self.desktop.double_click()
|
177 |
self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
|
|
|
178 |
return f"Double-clicked at coordinates ({x}, {y})"
|
179 |
|
180 |
@tool
|
@@ -204,12 +206,10 @@ class E2BVisionAgent(CodeAgent):
|
|
204 |
@tool
|
205 |
def press_key(key: str) -> str:
|
206 |
"""
|
207 |
-
Presses a keyboard key
|
208 |
Args:
|
209 |
-
key: The key to press (e.g
|
210 |
"""
|
211 |
-
if key == "enter":
|
212 |
-
key = "Return"
|
213 |
self.desktop.press(key)
|
214 |
self.logger.log(f"Pressed key: {key}")
|
215 |
return f"Pressed key: {key}"
|
@@ -304,6 +304,17 @@ class E2BVisionAgent(CodeAgent):
|
|
304 |
screenshot_bytes = self.desktop.screenshot()
|
305 |
image = Image.open(BytesIO(screenshot_bytes))
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
# Create a filename with step number
|
308 |
screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
|
309 |
image.save(screenshot_path)
|
@@ -324,6 +335,8 @@ class E2BVisionAgent(CodeAgent):
|
|
324 |
|
325 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
326 |
|
|
|
|
|
327 |
|
328 |
def close(self):
|
329 |
"""Clean up resources"""
|
|
|
19 |
from smolagents.models import ChatMessage, MessageRole, Model
|
20 |
from smolagents.monitoring import LogLevel
|
21 |
from smolagents.agent_types import AgentImage
|
22 |
+
from PIL import ImageDraw
|
23 |
|
24 |
E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
|
25 |
On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
|
|
|
35 |
- Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
|
36 |
- Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE. In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate the coordinates extremely well. A mistake here can make the full task fail.
|
37 |
- To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
|
38 |
+
- Always analyze the latest screenshot carefully before performing actions. If you clicked somewhere in the previous action, a red crosshair will appear at the exact click location: if nothing happened, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
|
39 |
|
40 |
You must proceed step by step:
|
41 |
1. Understand the task thoroughly
|
|
|
81 |
```<end_code>
|
82 |
|
83 |
Remember to:
|
|
|
84 |
Always wait for appropriate loading times
|
85 |
Use precise coordinates based on the current screenshot
|
86 |
Execute one action at a time
|
|
|
148 |
self.desktop.move_mouse(x, y)
|
149 |
self.desktop.left_click()
|
150 |
self.logger.log(f"Clicked at coordinates ({x}, {y})")
|
151 |
+
self.click_coordinates = [x, y]
|
152 |
return f"Clicked at coordinates ({x}, {y})"
|
153 |
|
154 |
@tool
|
|
|
162 |
self.desktop.move_mouse(x, y)
|
163 |
self.desktop.right_click()
|
164 |
self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
|
165 |
+
self.click_coordinates = [x, y]
|
166 |
return f"Right-clicked at coordinates ({x}, {y})"
|
167 |
|
168 |
@tool
|
|
|
176 |
self.desktop.move_mouse(x, y)
|
177 |
self.desktop.double_click()
|
178 |
self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
|
179 |
+
self.click_coordinates = [x, y]
|
180 |
return f"Double-clicked at coordinates ({x}, {y})"
|
181 |
|
182 |
@tool
|
|
|
206 |
@tool
|
207 |
def press_key(key: str) -> str:
|
208 |
"""
|
209 |
+
Presses a keyboard key
|
210 |
Args:
|
211 |
+
key: The key to press (e.g. "enter", "space", "backspace", etc.).
|
212 |
"""
|
|
|
|
|
213 |
self.desktop.press(key)
|
214 |
self.logger.log(f"Pressed key: {key}")
|
215 |
return f"Pressed key: {key}"
|
|
|
304 |
screenshot_bytes = self.desktop.screenshot()
|
305 |
image = Image.open(BytesIO(screenshot_bytes))
|
306 |
|
307 |
+
if getattr(self, "click_coordinates", None):
|
308 |
+
# If a click was performed in the last action, mark it on the image
|
309 |
+
x, y = self.click_coordinates
|
310 |
+
draw = ImageDraw.Draw(image)
|
311 |
+
cross_size, linewidth = 10, 3
|
312 |
+
# Draw red cross lines
|
313 |
+
draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
|
314 |
+
draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
|
315 |
+
# Add a circle around it for better visibility
|
316 |
+
draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
|
317 |
+
|
318 |
# Create a filename with step number
|
319 |
screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
|
320 |
image.save(screenshot_path)
|
|
|
335 |
|
336 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
337 |
|
338 |
+
self.click_coordinates = None
|
339 |
+
|
340 |
|
341 |
def close(self):
|
342 |
"""Clean up resources"""
|