m-ric HF Staff committed on
Commit
052302c
·
1 Parent(s): 6992bed

Refactor e2b agent to make system prompt dynamically accept tool descriptions

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. e2bqwen.py +83 -76
app.py CHANGED
@@ -558,7 +558,8 @@ with gr.Blocks(css=custom_css, js=custom_js, fill_width=True) as demo:
558
  examples=[
559
  "Check the commuting time between Bern and Zurich",
560
  "Write 'Hello World' in a text editor",
561
- "Search a flight Paris - Berlin for tomorrow"
 
562
  ],
563
  inputs = task_input,
564
  label= "Example Tasks",
 
558
  examples=[
559
  "Check the commuting time between Bern and Zurich",
560
  "Write 'Hello World' in a text editor",
561
+ "Search a flight Paris - Berlin for tomorrow",
562
+ "Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
563
  ],
564
  inputs = task_input,
565
  label= "Example Tasks",
e2bqwen.py CHANGED
@@ -39,77 +39,15 @@ def write_to_console_log(log_file_path, message):
39
  print(f"Error writing to log file: {str(e)}")
40
  return False
41
 
42
- class E2BVisionAgent(CodeAgent):
43
- """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
44
- def __init__(
45
- self,
46
- model: HfApiModel,
47
- data_dir: str,
48
- desktop: Sandbox,
49
- tools: List[tool] = None,
50
- max_steps: int = 200,
51
- verbosity_level: LogLevel = 4,
52
- planning_interval: int = 15,
53
- log_file = None,
54
- **kwargs
55
- ):
56
- self.desktop = desktop
57
- self.data_dir = data_dir
58
- self.log_path = log_file
59
- write_to_console_log(self.log_path, "Booting agent...")
60
- self.planning_interval = planning_interval
61
- # Initialize Desktop
62
- self.width, self.height = self.desktop.get_screen_size()
63
- print(f"Screen size: {self.width}x{self.height}")
64
- write_to_console_log(self.log_path, f"Desktop resolution detected: {self.width}x{self.height}")
65
-
66
 
67
-
68
- # Set up temp directory
69
- os.makedirs(self.data_dir, exist_ok=True)
70
- print(f"Screenshots and steps will be saved to: {self.data_dir}")
71
- print(f"Verbosity level set to {verbosity_level}")
72
-
73
-
74
- # Initialize base agent
75
- super().__init__(
76
- tools=tools or [],
77
- model=model,
78
- max_steps=max_steps,
79
- verbosity_level=verbosity_level,
80
- planning_interval = self.planning_interval,
81
- **kwargs
82
- )
83
-
84
-
85
- # Add screen info to state
86
- self.state["screen_width"] = self.width
87
- self.state["screen_height"] = self.height
88
-
89
-
90
- # Add default tools
91
- self._setup_desktop_tools()
92
- write_to_console_log(self.log_path, "Setting up agent tools...")
93
- self.step_callbacks.append(self.take_snapshot_callback)
94
- write_to_console_log(self.log_path, "Studying an action plan... that will take a bit.")
95
-
96
-
97
- def initialize_system_prompt(self):
98
- return """You are a desktop automation assistant that can control a remote desktop environment.
99
- You only have access to the following tools to interact with the desktop, no additional ones:
100
-
101
- - click(x, y): Performs a left-click at the specified coordinates
102
- - right_click(x, y): Performs a right-click at the specified coordinates
103
- - double_click(x, y): Performs a double-click at the specified coordinates
104
- - move_mouse(x, y): Moves the mouse cursor to the specified coordinates
105
- - type_text(text): Types the specified text at the current cursor position
106
- - press_key(key): Presses a keyboard key (e.g., "Return", "tab", "ctrl+c")
107
- - scroll(direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus.
108
- - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
109
- - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
110
- - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
111
-
112
- The desktop has a resolution of {resolution_x}x{resolution_y}.
113
 
114
  IMPORTANT:
115
  - Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
@@ -168,7 +106,60 @@ Execute one action at a time
168
  Verify the result before proceeding to the next step
169
  Use click to move through menus on the desktop and scroll for web and specific applications.
170
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
171
- """.format(resolution_x=self.width, resolution_y=self.height)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  def _setup_desktop_tools(self):
174
  """Register all desktop tools"""
@@ -226,7 +217,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
226
  @tool
227
  def type_text(text: str, delay_in_ms: int = 75) -> str:
228
  """
229
- Types the specified text at the current cursor position
230
  Args:
231
  text: The text to type
232
  delay_in_ms: Delay between keystrokes in milliseconds
@@ -238,7 +229,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
238
  @tool
239
  def press_key(key: str) -> str:
240
  """
241
- Presses a keyboard key
242
  Args:
243
  key: The key to press (e.g., "Return", "tab", "ctrl+c")
244
  """
@@ -258,10 +249,25 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
258
  write_to_console_log(self.log_path, "Went back one page")
259
  return "Went back one page"
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  @tool
262
  def scroll(direction: str = "down", amount: int = 1) -> str:
263
  """
264
- Scrolls the page
265
  Args:
266
  direction: The direction to scroll ("up" or "down"), defaults to "down"
267
  amount: The amount to scroll. A good amount is 1 or 2.
@@ -273,7 +279,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
273
  @tool
274
  def wait(seconds: float) -> str:
275
  """
276
- Waits for the specified number of seconds
277
  Args:
278
  seconds: Number of seconds to wait
279
  """
@@ -284,7 +290,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
284
  @tool
285
  def open_url(url: str) -> str:
286
  """
287
- Opens the specified URL in the default browser
288
  Args:
289
  url: The URL to open
290
  """
@@ -310,6 +316,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
310
  self.tools["wait"] = wait
311
  self.tools["open_url"] = open_url
312
  self.tools["go_back"] = go_back
 
313
 
314
 
315
  def store_metadata_to_file(self, agent) -> None:
 
39
  print(f"Error writing to log file: {str(e)}")
40
  return False
41
 
42
+ E2B_SYSTEM_PROMPT_TEMPLATE = """You are a desktop automation assistant that can control a remote desktop environment.
43
+ On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
44
+ {%- for tool in tools.values() %}
45
+ - {{ tool.name }}: {{ tool.description }}
46
+ Takes inputs: {{tool.inputs}}
47
+ Returns an output of type: {{tool.output_type}}
48
+ {%- endfor %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ The desktop has a resolution of <<resolution_x>>x<<resolution_y>>.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  IMPORTANT:
53
  - Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
 
106
  Verify the result before proceeding to the next step
107
  Use click to move through menus on the desktop and scroll for web and specific applications.
108
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
109
+ """
110
+
111
+ class E2BVisionAgent(CodeAgent):
112
+ """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
113
+ def __init__(
114
+ self,
115
+ model: HfApiModel,
116
+ data_dir: str,
117
+ desktop: Sandbox,
118
+ tools: List[tool] = None,
119
+ max_steps: int = 200,
120
+ verbosity_level: LogLevel = 4,
121
+ planning_interval: int = 15,
122
+ log_file = None,
123
+ **kwargs
124
+ ):
125
+ self.desktop = desktop
126
+ self.data_dir = data_dir
127
+ self.log_path = log_file
128
+ write_to_console_log(self.log_path, "Booting agent...")
129
+ self.planning_interval = planning_interval
130
+ # Initialize Desktop
131
+ self.width, self.height = self.desktop.get_screen_size()
132
+ print(f"Screen size: {self.width}x{self.height}")
133
+ write_to_console_log(self.log_path, f"Desktop resolution detected: {self.width}x{self.height}")
134
+
135
+
136
+ # Set up temp directory
137
+ os.makedirs(self.data_dir, exist_ok=True)
138
+ print(f"Screenshots and steps will be saved to: {self.data_dir}")
139
+ print(f"Verbosity level set to {verbosity_level}")
140
+
141
+ # Initialize base agent
142
+ super().__init__(
143
+ tools=tools or [],
144
+ model=model,
145
+ max_steps=max_steps,
146
+ verbosity_level=verbosity_level,
147
+ planning_interval = self.planning_interval,
148
+ **kwargs
149
+ )
150
+ self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
151
+
152
+
153
+ # Add screen info to state
154
+ self.state["screen_width"] = self.width
155
+ self.state["screen_height"] = self.height
156
+
157
+
158
+ # Add default tools
159
+ self._setup_desktop_tools()
160
+ write_to_console_log(self.log_path, "Setting up agent tools...")
161
+ self.step_callbacks.append(self.take_snapshot_callback)
162
+ write_to_console_log(self.log_path, "Studying an action plan... that will take a bit.")
163
 
164
  def _setup_desktop_tools(self):
165
  """Register all desktop tools"""
 
217
  @tool
218
  def type_text(text: str, delay_in_ms: int = 75) -> str:
219
  """
220
+ Types the specified text at the current cursor position.
221
  Args:
222
  text: The text to type
223
  delay_in_ms: Delay between keystrokes in milliseconds
 
229
  @tool
230
  def press_key(key: str) -> str:
231
  """
232
+ Presses a keyboard key (e.g., "Return", "tab", "ctrl+c")
233
  Args:
234
  key: The key to press (e.g., "Return", "tab", "ctrl+c")
235
  """
 
249
  write_to_console_log(self.log_path, "Went back one page")
250
  return "Went back one page"
251
 
252
+ @tool
253
+ def drag_and_drop(x1: int, y1: int, x2: int, y2: int) -> str:
254
+ """
255
+ Clicks [x1, y1], drags mouse to [x2, y2], then release click.
256
+ Args:
257
+ x1: origin x coordinate
258
+ y1: origin y coordinate
259
+ x2: end x coordinate
260
+ y2: end y coordinate
261
+ """
262
+ self.desktop.drag([x1, y1], [x2, y2])
263
+ message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
264
+ write_to_console_log(self.log_path, message)
265
+ return message
266
+
267
  @tool
268
  def scroll(direction: str = "down", amount: int = 1) -> str:
269
  """
270
+ Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
271
  Args:
272
  direction: The direction to scroll ("up" or "down"), defaults to "down"
273
  amount: The amount to scroll. A good amount is 1 or 2.
 
279
  @tool
280
  def wait(seconds: float) -> str:
281
  """
282
+ Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
283
  Args:
284
  seconds: Number of seconds to wait
285
  """
 
290
  @tool
291
  def open_url(url: str) -> str:
292
  """
293
+ Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
294
  Args:
295
  url: The URL to open
296
  """
 
316
  self.tools["wait"] = wait
317
  self.tools["open_url"] = open_url
318
  self.tools["go_back"] = go_back
319
+ self.tools["drag_and_drop"] = drag_and_drop
320
 
321
 
322
  def store_metadata_to_file(self, agent) -> None: