Miquel Farré committed on
Commit
94436e0
·
1 Parent(s): 4ac90bc

adding terminal

Browse files
Files changed (2) hide show
  1. app.py +127 -38
  2. e2bqwen.py +39 -101
app.py CHANGED
@@ -10,6 +10,7 @@ from textwrap import dedent
10
  import time
11
  from threading import Timer
12
 
 
13
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
14
 
15
  E2B_API_KEY = os.getenv("E2B_API_KEY")
@@ -242,9 +243,34 @@ function() {
242
  setTimeout(monitorForErrors, 3000);
243
  }
244
  });
 
 
 
 
 
 
245
  }
246
  """
247
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  def upload_to_hf_and_remove(folder_path):
249
 
250
  repo_id = "open-agents/os-agent-logs"
@@ -367,8 +393,38 @@ def save_final_status(folder, status, details = None):
367
  a.write(json.dumps({"status":status,"details":str(details)}))
368
  a.close()
369
 
370
- def run_agent_task(task_input, request: gr.Request):
 
 
 
 
 
 
 
 
 
 
 
371
  session_hash = request.session_hash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  interaction_id = generate_interaction_id(request)
373
  desktop = get_or_create_sandbox(session_hash)
374
 
@@ -377,7 +433,7 @@ def run_agent_task(task_input, request: gr.Request):
377
  if not os.path.exists(data_dir):
378
  os.makedirs(data_dir)
379
 
380
-
381
  # Create the agent
382
  agent = E2BVisionAgent(
383
  model=model,
@@ -386,6 +442,7 @@ def run_agent_task(task_input, request: gr.Request):
386
  max_steps=200,
387
  verbosity_level=LogLevel.INFO,
388
  planning_interval=5,
 
389
  )
390
 
391
  # Construct the full task with instructions
@@ -404,28 +461,28 @@ def run_agent_task(task_input, request: gr.Request):
404
  """)
405
 
406
  try:
407
-
408
  # Run the agent
409
  result = agent.run(full_task)
410
- save_final_status(data_dir, "completed", details = result)
411
- return f"Task completed: {result}"
412
-
413
  except Exception as e:
414
  error_message = f"Error running agent: {str(e)} Details {traceback.format_exc()}"
415
  save_final_status(data_dir, "failed", details = error_message)
416
  print(error_message)
417
- if 'Both endpoints failed' in error_message:
418
- return "Error running agent - Model inference endpoints not ready. Try again later."
419
- return "Error running agent"
420
 
421
  finally:
422
  upload_to_hf_and_remove(data_dir)
423
 
 
 
424
  # Create a Gradio app with Blocks
425
  with gr.Blocks(css=custom_css, js=custom_js) as demo:
426
- #gr.HTML("""<h1 style="text-align: center">Personal Computer Assistant</h1>""")
427
-
428
- # HTML output with simulated image and iframe - default to interactive
429
  html_output = gr.HTML(
430
  value=html_template.format(
431
  stream_url="",
@@ -435,13 +492,11 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
435
  label="Output"
436
  )
437
  with gr.Row():
438
- # Text input for task
439
  task_input = gr.Textbox(
440
  value="Find picture of cute puppies",
441
  label="Enter your command",
442
  )
443
 
444
- # Examples
445
  gr.Examples(
446
  examples=[
447
  "Check the commuting time between Bern and Zurich",
@@ -452,23 +507,49 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
452
  label= "Example Tasks",
453
  examples_per_page=4
454
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
-
457
- # Results output
458
- results_output = gr.Textbox(
459
- label="Results",
460
- interactive=False,
461
- elem_id="results-output"
462
- )
463
-
464
- # Update button
465
  update_btn = gr.Button("Let's go!")
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  # Function to set view-only mode
468
  def clear_and_set_view_only(task_input, request: gr.Request):
469
  # First clear the results, then set view-only mode
470
- return "", update_html(False, request)
471
-
472
  # Function to set interactive mode
473
  def set_interactive_mode(request: gr.Request):
474
  return update_html(True, request)
@@ -484,34 +565,42 @@ with gr.Blocks(css=custom_css, js=custom_js) as demo:
484
  # This will keep the BSOD visible
485
  return gr.update()
486
 
 
487
  # Chain the events
488
- # 1. Set view-only mode when button is clicked
489
  view_only_event = update_btn.click(
490
  fn=clear_and_set_view_only,
491
  inputs=[task_input],
492
- outputs=[results_output, html_output]
493
  )
494
 
495
- # 2. Then run the agent task
496
  task_result = view_only_event.then(
497
  fn=run_agent_task,
498
- inputs=[task_input],
499
- outputs=results_output
500
  )
501
 
502
- # 3. Then check the result and conditionally set to interactive mode
503
  task_result.then(
504
  fn=check_and_set_interactive,
505
- inputs=[results_output], # Pass the result text to check
506
  outputs=html_output
507
  )
508
-
509
- # Load the sandbox on app start with initial HTML
510
  demo.load(
511
- fn=update_html,
512
- inputs=[gr.Checkbox(value=True, visible=False)], # Hidden checkbox with True value
513
- outputs=html_output
514
  )
 
 
 
 
 
 
 
 
515
 
516
  # Launch the app
517
  if __name__ == "__main__":
 
10
  import time
11
  from threading import Timer
12
 
13
+
14
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
15
 
16
  E2B_API_KEY = os.getenv("E2B_API_KEY")
 
243
  setTimeout(monitorForErrors, 3000);
244
  }
245
  });
246
+
247
+ // Set up an interval to click the refresh button every 5 seconds
248
+ setInterval(function() {
249
+ const btn = document.getElementById('refresh-log-btn');
250
+ if (btn) btn.click();
251
+ }, 5000);
252
  }
253
  """
254
+ def write_to_console_log(log_file_path, message):
255
+ """
256
+ Appends a message to the specified log file with a newline character.
257
+
258
+ Parameters:
259
+ log_file_path (str): Path to the log file
260
+ message (str): Message to append to the log file
261
+ """
262
+ if log_file_path is None:
263
+ return False
264
+ try:
265
+ # Open the file in append mode
266
+ with open(log_file_path, 'a') as log_file:
267
+ # Write the message followed by a newline
268
+ log_file.write(f"{message}\n")
269
+ return True
270
+ except Exception as e:
271
+ print(f"Error writing to log file: {str(e)}")
272
+ return False
273
+
274
  def upload_to_hf_and_remove(folder_path):
275
 
276
  repo_id = "open-agents/os-agent-logs"
 
393
  a.write(json.dumps({"status":status,"details":str(details)}))
394
  a.close()
395
 
396
+ def get_log_file_path(session_hash):
397
+ """
398
+ Creates a log file path based on the session hash.
399
+ Makes sure the directory exists.
400
+ """
401
+ log_dir = os.path.join(TMP_DIR, session_hash)
402
+ if not os.path.exists(log_dir):
403
+ os.makedirs(log_dir)
404
+
405
+ return os.path.join(log_dir, 'console.log')
406
+
407
+ def initialize_session(interactive_mode, request: gr.Request):
408
  session_hash = request.session_hash
409
+ # Create session-specific log file
410
+ log_path = get_log_file_path(session_hash)
411
+ # Initialize log file if it doesn't exist
412
+ if not os.path.exists(log_path):
413
+ with open(log_path, 'w') as f:
414
+ f.write(f"Ready to go...\n")
415
+ # Return HTML and session hash
416
+ return update_html(interactive_mode, request), session_hash
417
+
418
+ # Function to read log content that gets the path from session hash
419
+ def update_terminal_from_session(session_hash):
420
+ if not session_hash:
421
+ return "Waiting for session..."
422
+
423
+ log_path = get_log_file_path(session_hash)
424
+ return read_log_content(log_path)
425
+
426
+
427
+ def run_agent_task(task_input, session_hash, request: gr.Request):
428
  interaction_id = generate_interaction_id(request)
429
  desktop = get_or_create_sandbox(session_hash)
430
 
 
433
  if not os.path.exists(data_dir):
434
  os.makedirs(data_dir)
435
 
436
+ log_file = get_log_file_path(session_hash)
437
  # Create the agent
438
  agent = E2BVisionAgent(
439
  model=model,
 
442
  max_steps=200,
443
  verbosity_level=LogLevel.INFO,
444
  planning_interval=5,
445
+ log_file = log_file
446
  )
447
 
448
  # Construct the full task with instructions
 
461
  """)
462
 
463
  try:
 
464
  # Run the agent
465
  result = agent.run(full_task)
466
+ save_final_status(data_dir, "completed", details = result)
467
+ return f"Task completed: {result}", gr.update(visible=True), gr.update(visible=False)
468
+
469
  except Exception as e:
470
  error_message = f"Error running agent: {str(e)} Details {traceback.format_exc()}"
471
  save_final_status(data_dir, "failed", details = error_message)
472
  print(error_message)
473
+ error_result = "Error running agent - Model inference endpoints not ready. Try again later." if 'Both endpoints failed' in error_message else "Error running agent"
474
+ return error_result, gr.update(visible=True), gr.update(visible=False)
 
475
 
476
  finally:
477
  upload_to_hf_and_remove(data_dir)
478
 
479
+
480
+
481
  # Create a Gradio app with Blocks
482
  with gr.Blocks(css=custom_css, js=custom_js) as demo:
483
+ #Storing session hash in a state variable
484
+ session_hash_state = gr.State(None)
485
+
486
  html_output = gr.HTML(
487
  value=html_template.format(
488
  stream_url="",
 
492
  label="Output"
493
  )
494
  with gr.Row():
 
495
  task_input = gr.Textbox(
496
  value="Find picture of cute puppies",
497
  label="Enter your command",
498
  )
499
 
 
500
  gr.Examples(
501
  examples=[
502
  "Check the commuting time between Bern and Zurich",
 
507
  label= "Example Tasks",
508
  examples_per_page=4
509
  )
510
+
511
+ with gr.Group(visible=True) as terminal_container:
512
+ terminal = gr.Textbox(
513
+ value="Initializing...",
514
+ label='Console',
515
+ lines=5,
516
+ max_lines=10,
517
+ interactive=False
518
+ )
519
+
520
+ # Hidden refresh button
521
+ refresh_btn = gr.Button("Refresh", visible=False, elem_id="refresh-log-btn")
522
+
523
+ with gr.Group(visible=False) as results_container:
524
+ results_output = gr.Textbox(
525
+ label="Results",
526
+ interactive=False,
527
+ elem_id="results-output"
528
+ )
529
 
 
 
 
 
 
 
 
 
 
530
  update_btn = gr.Button("Let's go!")
531
 
532
+
533
+ def read_log_content(log_file, tail=4):
534
+ """Read the contents of a log file for a specific session"""
535
+ if not log_file:
536
+ return "Waiting for session..."
537
+
538
+ if not os.path.exists(log_file):
539
+ return "Waiting for machine from the future to boot..."
540
+
541
+ try:
542
+ with open(log_file, 'r') as f:
543
+ lines = f.readlines()
544
+ return "".join(lines[-tail:] if len(lines) > tail else lines)
545
+ except Exception as e:
546
+ return f"Guru meditation: {str(e)}"
547
+
548
  # Function to set view-only mode
549
  def clear_and_set_view_only(task_input, request: gr.Request):
550
  # First clear the results, then set view-only mode
551
+ return "", update_html(False, request), gr.update(visible=False), gr.update(visible=True)
552
+
553
  # Function to set interactive mode
554
  def set_interactive_mode(request: gr.Request):
555
  return update_html(True, request)
 
565
  # This will keep the BSOD visible
566
  return gr.update()
567
 
568
+
569
  # Chain the events
570
+ # 1. Set view-only mode when button is clicked and reset visibility
571
  view_only_event = update_btn.click(
572
  fn=clear_and_set_view_only,
573
  inputs=[task_input],
574
+ outputs=[results_output, html_output, results_container, terminal_container]
575
  )
576
 
577
+ # 2. Then run the agent task and update visibility
578
  task_result = view_only_event.then(
579
  fn=run_agent_task,
580
+ inputs=[task_input,session_hash_state],
581
+ outputs=[results_output, results_container, terminal_container]
582
  )
583
 
584
+ # 3. Set interactive mode when task completes successfully
585
  task_result.then(
586
  fn=check_and_set_interactive,
587
+ inputs=[results_output],
588
  outputs=html_output
589
  )
590
+
 
591
  demo.load(
592
+ fn=initialize_session,
593
+ inputs=[gr.Checkbox(value=True, visible=False)],
594
+ outputs=[html_output, session_hash_state]
595
  )
596
+
597
+ # Connect refresh button to update terminal
598
+ refresh_btn.click(
599
+ fn=update_terminal_from_session,
600
+ inputs=[session_hash_state],
601
+ outputs=[terminal]
602
+ )
603
+
604
 
605
  # Launch the app
606
  if __name__ == "__main__":
e2bqwen.py CHANGED
@@ -19,7 +19,26 @@ from smolagents.memory import ActionStep
19
  from smolagents.models import ChatMessage, MessageRole, Model
20
  from smolagents.monitoring import LogLevel
21
 
22
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  class E2BVisionAgent(CodeAgent):
24
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
25
  def __init__(
@@ -31,14 +50,19 @@ class E2BVisionAgent(CodeAgent):
31
  max_steps: int = 200,
32
  verbosity_level: LogLevel = 4,
33
  planning_interval: int = 15,
 
34
  **kwargs
35
  ):
36
  self.desktop = desktop
37
  self.data_dir = data_dir
 
 
38
  self.planning_interval = planning_interval
39
  # Initialize Desktop
40
  self.width, self.height = self.desktop.get_screen_size()
41
  print(f"Screen size: {self.width}x{self.height}")
 
 
42
 
43
 
44
  # Set up temp directory
@@ -65,7 +89,9 @@ class E2BVisionAgent(CodeAgent):
65
 
66
  # Add default tools
67
  self._setup_desktop_tools()
 
68
  self.step_callbacks.append(self.take_snapshot_callback)
 
69
 
70
 
71
  def initialize_system_prompt(self):
@@ -156,6 +182,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
156
  """
157
  self.desktop.move_mouse(x, y)
158
  self.desktop.left_click()
 
159
  return f"Clicked at coordinates ({x}, {y})"
160
 
161
  @tool
@@ -168,6 +195,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
168
  """
169
  self.desktop.move_mouse(x, y)
170
  self.desktop.right_click()
 
171
  return f"Right-clicked at coordinates ({x}, {y})"
172
 
173
  @tool
@@ -180,6 +208,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
180
  """
181
  self.desktop.move_mouse(x, y)
182
  self.desktop.double_click()
 
183
  return f"Double-clicked at coordinates ({x}, {y})"
184
 
185
  @tool
@@ -191,6 +220,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
191
  y: The y coordinate (vertical position)
192
  """
193
  self.desktop.move_mouse(x, y)
 
194
  return f"Moved mouse to coordinates ({x}, {y})"
195
 
196
  @tool
@@ -202,6 +232,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
202
  delay_in_ms: Delay between keystrokes in milliseconds
203
  """
204
  self.desktop.write(text, delay_in_ms=delay_in_ms)
 
205
  return f"Typed text: '{text}'"
206
 
207
  @tool
@@ -214,6 +245,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
214
  if key == "enter":
215
  key = "Return"
216
  self.desktop.press(key)
 
217
  return f"Pressed key: {key}"
218
 
219
  @tool
@@ -223,6 +255,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
223
  Args:
224
  """
225
  self.desktop.press(["alt", "left"])
 
226
  return "Went back one page"
227
 
228
  @tool
@@ -234,6 +267,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
234
  amount: The amount to scroll. A good amount is 1 or 2.
235
  """
236
  self.desktop.scroll(direction=direction, amount=amount)
 
237
  return f"Scrolled {direction} by {amount}"
238
 
239
  @tool
@@ -244,6 +278,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
244
  seconds: Number of seconds to wait
245
  """
246
  time.sleep(seconds)
 
247
  return f"Waited for {seconds} seconds"
248
 
249
  @tool
@@ -260,6 +295,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
260
  self.desktop.open(url)
261
  # Give it time to load
262
  time.sleep(2)
 
263
  return f"Opened URL: {url}"
264
 
265
 
@@ -289,7 +325,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
289
  messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": self.system_prompt}]}]
290
  # Get the last memory step
291
  last_step = self.memory.steps[-1] if self.memory.steps else None
292
-
293
  for memory_step in self.memory.steps:
294
  if hasattr(memory_step, "task") and memory_step.task:
295
  # Add task message if it exists
@@ -359,6 +394,8 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
359
 
360
  def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
361
  """Callback that takes a screenshot + memory snapshot after a step completes"""
 
 
362
  current_step = memory_step.step_number
363
  print(f"Taking screenshot for step {current_step}")
364
  # Check if desktop is still running
@@ -407,105 +444,6 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
407
  print("E2B sandbox terminated")
408
 
409
 
410
-
411
- # class QwenVLAPIModel(Model):
412
- # """Model wrapper for Qwen2.5VL API"""
413
-
414
- # def __init__(
415
- # self,
416
- # model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
417
- # provider: str = "hyperbolic"
418
- # ):
419
- # super().__init__()
420
- # self.model_path = model_path
421
- # self.model_id = model_path
422
- # self.provider = provider
423
-
424
- # self.client = InferenceClient(
425
- # provider=self.provider,
426
- # )
427
-
428
- # def __call__(
429
- # self,
430
- # messages: List[Dict[str, Any]],
431
- # stop_sequences: Optional[List[str]] = None,
432
- # **kwargs
433
- # ) -> ChatMessage:
434
- # """Convert a list of messages to an API request and return the response"""
435
- # # # Count images in messages - debug
436
- # # image_count = 0
437
- # # for msg in messages:
438
- # # if isinstance(msg.get("content"), list):
439
- # # for item in msg["content"]:
440
- # # if isinstance(item, dict) and item.get("type") == "image":
441
- # # image_count += 1
442
-
443
- # # print(f"QwenVLAPIModel received {len(messages)} messages with {image_count} images")
444
-
445
- # # Format the messages for the API
446
-
447
- # formatted_messages = []
448
-
449
- # for msg in messages:
450
- # role = msg["role"]
451
- # if isinstance(msg["content"], list):
452
- # content = []
453
- # for item in msg["content"]:
454
- # if item["type"] == "text":
455
- # content.append({"type": "text", "text": item["text"]})
456
- # elif item["type"] == "image":
457
- # # Handle image path or direct image object
458
- # if isinstance(item["image"], str):
459
- # # Image is a path
460
- # with open(item["image"], "rb") as image_file:
461
- # base64_image = base64.b64encode(image_file.read()).decode("utf-8")
462
- # else:
463
- # # Image is a PIL image or similar object
464
- # img_byte_arr = io.BytesIO()
465
- # item["image"].save(img_byte_arr, format="PNG")
466
- # base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
467
-
468
- # content.append({
469
- # "type": "image_url",
470
- # "image_url": {
471
- # "url": f"data:image/png;base64,{base64_image}"
472
- # }
473
- # })
474
- # else:
475
- # content = [{"type": "text", "text": msg["content"]}]
476
-
477
- # formatted_messages.append({"role": role, "content": content})
478
-
479
- # # Make the API request
480
- # completion = self.client.chat.completions.create(
481
- # model=self.model_path,
482
- # messages=formatted_messages,
483
- # max_tokens=kwargs.get("max_new_tokens", 512),
484
- # temperature=kwargs.get("temperature", 0.7),
485
- # top_p=kwargs.get("top_p", 0.9),
486
- # )
487
-
488
- # # Extract the response text
489
- # output_text = completion.choices[0].message.content
490
-
491
- # return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
492
-
493
- # def to_dict(self) -> Dict[str, Any]:
494
- # """Convert the model to a dictionary"""
495
- # return {
496
- # "class": self.__class__.__name__,
497
- # "model_path": self.model_path,
498
- # "provider": self.provider,
499
- # # We don't save the API key for security reasons
500
- # }
501
-
502
- # @classmethod
503
- # def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
504
- # """Create a model from a dictionary"""
505
- # return cls(
506
- # model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
507
- # provider=data.get("provider", "hyperbolic"),
508
- # )
509
  class QwenVLAPIModel(Model):
510
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
511
 
 
19
  from smolagents.models import ChatMessage, MessageRole, Model
20
  from smolagents.monitoring import LogLevel
21
 
22
+ def write_to_console_log(log_file_path, message):
23
+ """
24
+ Appends a message to the specified log file with a newline character.
25
+
26
+ Parameters:
27
+ log_file_path (str): Path to the log file
28
+ message (str): Message to append to the log file
29
+ """
30
+ if log_file_path is None:
31
+ return False
32
+ try:
33
+ # Open the file in append mode
34
+ with open(log_file_path, 'a') as log_file:
35
+ # Write the message followed by a newline
36
+ log_file.write(f"{message}\n")
37
+ return True
38
+ except Exception as e:
39
+ print(f"Error writing to log file: {str(e)}")
40
+ return False
41
+
42
  class E2BVisionAgent(CodeAgent):
43
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
44
  def __init__(
 
50
  max_steps: int = 200,
51
  verbosity_level: LogLevel = 4,
52
  planning_interval: int = 15,
53
+ log_file = None,
54
  **kwargs
55
  ):
56
  self.desktop = desktop
57
  self.data_dir = data_dir
58
+ self.log_path = log_file
59
+ write_to_console_log(self.log_path, "Booting agent...")
60
  self.planning_interval = planning_interval
61
  # Initialize Desktop
62
  self.width, self.height = self.desktop.get_screen_size()
63
  print(f"Screen size: {self.width}x{self.height}")
64
+ write_to_console_log(self.log_path, f"Desktop resolution detected: {self.width}x{self.height}")
65
+
66
 
67
 
68
  # Set up temp directory
 
89
 
90
  # Add default tools
91
  self._setup_desktop_tools()
92
+ write_to_console_log(self.log_path, "Setting up agent tools...")
93
  self.step_callbacks.append(self.take_snapshot_callback)
94
+ write_to_console_log(self.log_path, "Studying an action plan... that will take a bit.")
95
 
96
 
97
  def initialize_system_prompt(self):
 
182
  """
183
  self.desktop.move_mouse(x, y)
184
  self.desktop.left_click()
185
+ write_to_console_log(self.log_path, f"Clicked at coordinates ({x}, {y})")
186
  return f"Clicked at coordinates ({x}, {y})"
187
 
188
  @tool
 
195
  """
196
  self.desktop.move_mouse(x, y)
197
  self.desktop.right_click()
198
+ write_to_console_log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
199
  return f"Right-clicked at coordinates ({x}, {y})"
200
 
201
  @tool
 
208
  """
209
  self.desktop.move_mouse(x, y)
210
  self.desktop.double_click()
211
+ write_to_console_log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
212
  return f"Double-clicked at coordinates ({x}, {y})"
213
 
214
  @tool
 
220
  y: The y coordinate (vertical position)
221
  """
222
  self.desktop.move_mouse(x, y)
223
+ write_to_console_log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
224
  return f"Moved mouse to coordinates ({x}, {y})"
225
 
226
  @tool
 
232
  delay_in_ms: Delay between keystrokes in milliseconds
233
  """
234
  self.desktop.write(text, delay_in_ms=delay_in_ms)
235
+ write_to_console_log(self.log_path, f"Typed text: '{text}'")
236
  return f"Typed text: '{text}'"
237
 
238
  @tool
 
245
  if key == "enter":
246
  key = "Return"
247
  self.desktop.press(key)
248
+ write_to_console_log(self.log_path, f"Pressed key: {key}")
249
  return f"Pressed key: {key}"
250
 
251
  @tool
 
255
  Args:
256
  """
257
  self.desktop.press(["alt", "left"])
258
+ write_to_console_log(self.log_path, "Went back one page")
259
  return "Went back one page"
260
 
261
  @tool
 
267
  amount: The amount to scroll. A good amount is 1 or 2.
268
  """
269
  self.desktop.scroll(direction=direction, amount=amount)
270
+ write_to_console_log(self.log_path, f"Scrolled {direction} by {amount}")
271
  return f"Scrolled {direction} by {amount}"
272
 
273
  @tool
 
278
  seconds: Number of seconds to wait
279
  """
280
  time.sleep(seconds)
281
+ write_to_console_log(self.log_path, f"Waited for {seconds} seconds")
282
  return f"Waited for {seconds} seconds"
283
 
284
  @tool
 
295
  self.desktop.open(url)
296
  # Give it time to load
297
  time.sleep(2)
298
+ write_to_console_log(self.log_path, f"Opening URL: {url}")
299
  return f"Opened URL: {url}"
300
 
301
 
 
325
  messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": self.system_prompt}]}]
326
  # Get the last memory step
327
  last_step = self.memory.steps[-1] if self.memory.steps else None
 
328
  for memory_step in self.memory.steps:
329
  if hasattr(memory_step, "task") and memory_step.task:
330
  # Add task message if it exists
 
394
 
395
  def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
396
  """Callback that takes a screenshot + memory snapshot after a step completes"""
397
+ write_to_console_log(self.log_path, "Analyzing screen content...")
398
+
399
  current_step = memory_step.step_number
400
  print(f"Taking screenshot for step {current_step}")
401
  # Check if desktop is still running
 
444
  print("E2B sandbox terminated")
445
 
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  class QwenVLAPIModel(Model):
448
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
449