Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Adjust resolution
Browse files- app.py +4 -4
- e2bqwen.py +17 -21
app.py
CHANGED
@@ -20,8 +20,8 @@ E2B_API_KEY = os.getenv("E2B_API_KEY")
|
|
20 |
SANDBOXES = {}
|
21 |
SANDBOX_METADATA = {}
|
22 |
SANDBOX_TIMEOUT = 600
|
23 |
-
WIDTH =
|
24 |
-
HEIGHT =
|
25 |
TMP_DIR = './tmp/'
|
26 |
if not os.path.exists(TMP_DIR):
|
27 |
os.makedirs(TMP_DIR)
|
@@ -528,7 +528,7 @@ class EnrichedGradioUI(GradioUI):
|
|
528 |
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
529 |
stored_messages.append(gr.ChatMessage(
|
530 |
role="assistant",
|
531 |
-
content={"path": session_state["agent"].
|
532 |
))
|
533 |
stored_messages.append(msg)
|
534 |
yield stored_messages
|
@@ -619,7 +619,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
|
|
619 |
left: 110px;
|
620 |
}
|
621 |
.sandbox-iframe {
|
622 |
-
transform: scale(0.
|
623 |
/* transform: scale(0.59); */
|
624 |
}
|
625 |
|
|
|
20 |
SANDBOXES = {}
|
21 |
SANDBOX_METADATA = {}
|
22 |
SANDBOX_TIMEOUT = 600
|
23 |
+
WIDTH = 1024
|
24 |
+
HEIGHT = 768
|
25 |
TMP_DIR = './tmp/'
|
26 |
if not os.path.exists(TMP_DIR):
|
27 |
os.makedirs(TMP_DIR)
|
|
|
528 |
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
529 |
stored_messages.append(gr.ChatMessage(
|
530 |
role="assistant",
|
531 |
+
content={"path": session_state["agent"].last_marked_screenshot.to_string(), "mime_type": "image/png"},
|
532 |
))
|
533 |
stored_messages.append(msg)
|
534 |
yield stored_messages
|
|
|
619 |
left: 110px;
|
620 |
}
|
621 |
.sandbox-iframe {
|
622 |
+
transform: scale(0.667);
|
623 |
/* transform: scale(0.59); */
|
624 |
}
|
625 |
|
e2bqwen.py
CHANGED
@@ -138,27 +138,27 @@ Whenever you click, MAKE SURE to click in the middle of the button, text, link o
|
|
138 |
</click_guidelines>
|
139 |
|
140 |
<general_guidelines>
|
|
|
141 |
You can wait for appropriate loading times using the wait() tool. But don't wait forever, sometimes you've just misclicked and the process didn't launch.
|
142 |
-
Use precise coordinates based on the current screenshot.
|
143 |
Execute one action at a time: don't try to pack a click and typing in one action.
|
144 |
On each step, look at the last screenshot and action to validate if previous steps worked and decide the next action. If you repeated an action already without effect, it means that this action is useless: don't repeat it and try something else.
|
145 |
Use click to move through menus on the desktop and scroll for web and specific applications.
|
146 |
Always analyze the latest screenshot carefully before performing actions. Make sure to:
|
147 |
To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
|
148 |
-
Always analyze the latest screenshot carefully before performing actions.
|
149 |
-
The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels.
|
150 |
</general_guidelines>
|
151 |
"""
|
152 |
|
153 |
-
def draw_marker_on_image(
|
154 |
x, y = click_coordinates
|
155 |
-
draw = ImageDraw.Draw(
|
156 |
cross_size, linewidth = 10, 3
|
157 |
# Draw red cross lines
|
158 |
draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
|
159 |
draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
|
160 |
# Add a circle around it for better visibility
|
161 |
draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
|
|
|
162 |
|
163 |
class E2BVisionAgent(CodeAgent):
|
164 |
"""Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
|
@@ -194,7 +194,7 @@ class E2BVisionAgent(CodeAgent):
|
|
194 |
**kwargs
|
195 |
)
|
196 |
self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
|
197 |
-
|
198 |
|
199 |
# Add screen info to state
|
200 |
self.state["screen_width"] = self.width
|
@@ -396,25 +396,21 @@ class E2BVisionAgent(CodeAgent):
|
|
396 |
|
397 |
current_step = memory_step.step_number
|
398 |
|
399 |
-
time.sleep(
|
400 |
screenshot_bytes = self.desktop.screenshot(format="bytes")
|
401 |
image = Image.open(BytesIO(screenshot_bytes))
|
402 |
|
403 |
-
if getattr(self, "click_coordinates", None):
|
404 |
-
# If a click was performed in the last action, mark it on the image
|
405 |
-
x, y = self.click_coordinates
|
406 |
-
draw = ImageDraw.Draw(image)
|
407 |
-
cross_size, linewidth = 10, 3
|
408 |
-
# Draw red cross lines
|
409 |
-
draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
|
410 |
-
draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
|
411 |
-
# Add a circle around it for better visibility
|
412 |
-
draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
|
413 |
-
|
414 |
# Create a filename with step number
|
415 |
screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
|
416 |
image.save(screenshot_path)
|
417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
print(f"Saved screenshot for step {current_step} to {screenshot_path}")
|
419 |
|
420 |
for (
|
@@ -433,8 +429,8 @@ class E2BVisionAgent(CodeAgent):
|
|
433 |
if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
|
434 |
memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
|
435 |
|
436 |
-
# Add to the current memory step
|
437 |
-
memory_step.observations_images = [
|
438 |
|
439 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
440 |
|
|
|
138 |
</click_guidelines>
|
139 |
|
140 |
<general_guidelines>
|
141 |
+
Always analyze the latest screenshot carefully before performing actions.
|
142 |
You can wait for appropriate loading times using the wait() tool. But don't wait forever, sometimes you've just misclicked and the process didn't launch.
|
143 |
+
Use precise coordinates based on the current screenshot. The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels: NEVER USE HYPOTHETIC COORDINATES, USE TRUE COORDINATES that you can see from the screenshot.
|
144 |
Execute one action at a time: don't try to pack a click and typing in one action.
|
145 |
On each step, look at the last screenshot and action to validate if previous steps worked and decide the next action. If you repeated an action already without effect, it means that this action is useless: don't repeat it and try something else.
|
146 |
Use click to move through menus on the desktop and scroll for web and specific applications.
|
147 |
Always analyze the latest screenshot carefully before performing actions. Make sure to:
|
148 |
To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
|
|
|
|
|
149 |
</general_guidelines>
|
150 |
"""
|
151 |
|
152 |
+
def draw_marker_on_image(image_copy, click_coordinates):
    """Stamp a red click marker onto ``image_copy`` and return that same image.

    The marker is a cross centred on ``click_coordinates`` with a circle
    around it whose radius is twice the cross arm length. Drawing is done
    directly on the image passed in; this function does not copy it.

    Args:
        image_copy: Image handed to ``ImageDraw.Draw``; it is drawn on in place.
        click_coordinates: Two-item ``(x, y)`` position of the marker centre.

    Returns:
        The ``image_copy`` object that was passed in, now carrying the marker.
    """
    center_x, center_y = click_coordinates
    pen = ImageDraw.Draw(image_copy)
    arm = 10
    stroke = 3
    radius = arm * 2

    # Red cross: horizontal arm first, then the vertical arm.
    pen.line((center_x - arm, center_y, center_x + arm, center_y), fill="red", width=stroke)
    pen.line((center_x, center_y - arm, center_x, center_y + arm), fill="red", width=stroke)

    # Circle around the cross for better visibility.
    pen.ellipse(
        (center_x - radius, center_y - radius, center_x + radius, center_y + radius),
        outline="red",
        width=stroke,
    )
    return image_copy
|
162 |
|
163 |
class E2BVisionAgent(CodeAgent):
|
164 |
"""Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
|
|
|
194 |
**kwargs
|
195 |
)
|
196 |
self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
|
197 |
+
print("PROMPT TEMPLATE:", self.prompt_templates["system_prompt"])
|
198 |
|
199 |
# Add screen info to state
|
200 |
self.state["screen_width"] = self.width
|
|
|
396 |
|
397 |
current_step = memory_step.step_number
|
398 |
|
399 |
+
time.sleep(3.0) # Let things happen on the desktop
|
400 |
screenshot_bytes = self.desktop.screenshot(format="bytes")
|
401 |
image = Image.open(BytesIO(screenshot_bytes))
|
402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
403 |
# Create a filename with step number
|
404 |
screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
|
405 |
image.save(screenshot_path)
|
406 |
+
|
407 |
+
image_copy = image.copy()
|
408 |
+
|
409 |
+
if getattr(self, "click_coordinates", None):
|
410 |
+
print("DRAWING MARKER")
|
411 |
+
image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
|
412 |
+
|
413 |
+
self.last_marked_screenshot = AgentImage(screenshot_path)
|
414 |
print(f"Saved screenshot for step {current_step} to {screenshot_path}")
|
415 |
|
416 |
for (
|
|
|
429 |
if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
|
430 |
memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
|
431 |
|
432 |
+
# Add the marker-edited image to the current memory step
|
433 |
+
memory_step.observations_images = [image_copy]
|
434 |
|
435 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
436 |
|