M-Rique committed on
Commit
64b82de
·
1 Parent(s): 5cc8c9a

Fix smolagents integration

Browse files
Files changed (3) hide show
  1. app.py +14 -28
  2. e2bqwen.py +50 -223
  3. eval.py +3 -3
app.py CHANGED
@@ -16,6 +16,10 @@ from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
  from model_replay import FakeModelReplayLog
17
  from gradio_modal import Modal
18
 
 
 
 
 
19
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
20
 
21
  E2B_API_KEY = os.getenv("E2B_API_KEY")
@@ -28,7 +32,7 @@ TMP_DIR = './tmp/'
28
  if not os.path.exists(TMP_DIR):
29
  os.makedirs(TMP_DIR)
30
 
31
- hf_token = os.getenv("HUGGINGFACE_API_KEY")
32
  login(token=hf_token)
33
 
34
  custom_css = """
@@ -297,25 +301,6 @@ custom_js = """function() {
297
  }
298
  """
299
 
300
- def write_to_console_log(log_file_path, message):
301
- """
302
- Appends a message to the specified log file with a newline character.
303
-
304
- Parameters:
305
- log_file_path (str): Path to the log file
306
- message (str): Message to append to the log file
307
- """
308
- if log_file_path is None:
309
- return False
310
- try:
311
- # Open the file in append mode
312
- with open(log_file_path, 'a') as log_file:
313
- # Write the message followed by a newline
314
- log_file.write(f"{message}\n")
315
- return True
316
- except Exception as e:
317
- print(f"Error writing to log file: {str(e)}")
318
- return False
319
 
320
  def upload_to_hf_and_remove(folder_path):
321
 
@@ -472,16 +457,16 @@ def create_agent(data_dir, desktop):
472
  hf_token = hf_token,
473
  )
474
 
475
- model = OpenAIServerModel(
476
- "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
477
- )
478
  return E2BVisionAgent(
479
  model=model,
480
  data_dir=data_dir,
481
  desktop=desktop,
482
  max_steps=200,
483
  verbosity_level=2,
484
- planning_interval=10,
485
  use_v1_prompt=True
486
  )
487
 
@@ -527,13 +512,14 @@ class EnrichedGradioUI(GradioUI):
527
  yield stored_messages
528
 
529
  # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
530
- if consent_storage:
531
- summary = get_agent_summary_erase_images(session_state["agent"])
532
- save_final_status(data_dir, "completed", summary = summary)
533
  yield stored_messages
534
 
535
  except Exception as e:
536
  error_message=f"Error in interaction: {str(e)}"
 
537
  print(error_message)
538
  stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
539
  if consent_storage:
@@ -584,7 +570,7 @@ _Please note that we store the task logs by default so **do not write any person
584
  "Check the commuting time between Bern and Zurich on Google maps",
585
  "Write 'Hello World' in a text editor",
586
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
587
- "Search a flight Rome - Berlin for tomorrow",
588
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
589
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
590
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
 
16
  from model_replay import FakeModelReplayLog
17
  from gradio_modal import Modal
18
 
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv(override=True)
22
+
23
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
24
 
25
  E2B_API_KEY = os.getenv("E2B_API_KEY")
 
32
  if not os.path.exists(TMP_DIR):
33
  os.makedirs(TMP_DIR)
34
 
35
+ hf_token = os.getenv("HF_TOKEN")
36
  login(token=hf_token)
37
 
38
  custom_css = """
 
301
  }
302
  """
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
  def upload_to_hf_and_remove(folder_path):
306
 
 
457
  hf_token = hf_token,
458
  )
459
 
460
+ # model = OpenAIServerModel(
461
+ # "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
462
+ # )
463
  return E2BVisionAgent(
464
  model=model,
465
  data_dir=data_dir,
466
  desktop=desktop,
467
  max_steps=200,
468
  verbosity_level=2,
469
+ # planning_interval=10,
470
  use_v1_prompt=True
471
  )
472
 
 
512
  yield stored_messages
513
 
514
  # THIS ERASES IMAGES FROM AGENT MEMORY, USE WITH CAUTION
515
+ # if consent_storage:
516
+ # summary = get_agent_summary_erase_images(session_state["agent"])
517
+ # save_final_status(data_dir, "completed", summary = summary)
518
  yield stored_messages
519
 
520
  except Exception as e:
521
  error_message=f"Error in interaction: {str(e)}"
522
+ raise e
523
  print(error_message)
524
  stored_messages.append(gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message))
525
  if consent_storage:
 
570
  "Check the commuting time between Bern and Zurich on Google maps",
571
  "Write 'Hello World' in a text editor",
572
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
573
+ "Search a flight from Rome to Berlin for tomorrow on Skyscanner",
574
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
575
  "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
576
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
e2bqwen.py CHANGED
@@ -1,15 +1,9 @@
1
  import os
2
  import time
3
- import base64
4
  from io import BytesIO
5
- from textwrap import dedent
6
- from typing import Any, Dict, List, Optional, Tuple
7
- import json
8
  import unicodedata
9
 
10
- # HF API params
11
- from huggingface_hub import InferenceClient
12
-
13
  # E2B imports
14
  from e2b_desktop import Sandbox
15
  from PIL import Image
@@ -17,7 +11,8 @@ from PIL import Image
17
  # SmolaAgents imports
18
  from smolagents import CodeAgent, tool, HfApiModel
19
  from smolagents.memory import ActionStep
20
- from smolagents.models import ChatMessage, MessageRole, Model
 
21
  from smolagents.monitoring import LogLevel
22
  from smolagents.agent_types import AgentImage
23
  from PIL import ImageDraw
@@ -48,7 +43,7 @@ On top of performing computations in the Python code snippets that you create, y
48
  {%- endfor %}
49
 
50
  The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates.
51
- If you clicked somewhere in the previous action, a red crosshair will appear at the exact location of the previous click.
52
  The image might have change since then but the cross stays at the previous click. If your click seems to have changed nothing, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
53
  </tools>
54
 
@@ -98,7 +93,7 @@ click(251, 441)
98
  Step 4:
99
  Short term goal: I want to open a text editor.
100
  Where I am: I am still under the Accessories menu.
101
- What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The red crosshair is off from the element.
102
  Reflection: My last click must have been off. Let's correct this.
103
  Action: I will click the correct place, right in the middle of the element.
104
  Code:
@@ -145,7 +140,7 @@ On each step, look at the last screenshot and action to validate if previous ste
145
  Use click to move through menus on the desktop and scroll for web and specific applications.
146
  Always analyze the latest screenshot carefully before performing actions.
147
  Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
148
- Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
149
  </general_guidelines>
150
  """
151
 
@@ -153,21 +148,13 @@ def draw_marker_on_image(image_copy, click_coordinates):
153
  x, y = click_coordinates
154
  draw = ImageDraw.Draw(image_copy)
155
  cross_size, linewidth = 10, 3
156
- # Draw red cross lines
157
- draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
158
- draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
159
  # Add a circle around it for better visibility
160
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
161
  return image_copy
162
 
163
- from jinja2 import StrictUndefined, Template
164
-
165
-
166
- def populate_template(template: str, variables: Dict[str, Any]) -> str:
167
- compiled_template = Template(template, undefined=StrictUndefined)
168
- return compiled_template.render(**variables)
169
-
170
-
171
 
172
  class E2BVisionAgent(CodeAgent):
173
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
@@ -179,7 +166,7 @@ class E2BVisionAgent(CodeAgent):
179
  tools: List[tool] = None,
180
  max_steps: int = 200,
181
  verbosity_level: LogLevel = 2,
182
- planning_interval: int = 10,
183
  use_v1_prompt: bool = False,
184
  **kwargs
185
  ):
@@ -216,7 +203,7 @@ class E2BVisionAgent(CodeAgent):
216
  self.step_callbacks.append(self.take_screenshot_callback)
217
 
218
  def initialize_system_prompt(self) -> str:
219
- if self.use_v1_prompt:
220
  return """You are a desktop automation assistant that can control a remote desktop environment.
221
  You only have access to the following tools to interact with the desktop, no additional ones:
222
  - click(x, y): Performs a left-click at the specified coordinates
@@ -228,6 +215,8 @@ You only have access to the following tools to interact with the desktop, no add
228
  - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
229
  - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
230
  - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
 
 
231
  - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
232
  The desktop has a resolution of {resolution_x}x{resolution_y}.
233
  IMPORTANT:
@@ -247,13 +236,13 @@ After each action, you'll receive an updated screenshot. Review it carefully bef
247
  COMMAND FORMAT:
248
  Always format your actions as Python code blocks. For example:
249
  ```python
250
- click(250, 300)
251
  ```<end_code>
252
  TASK EXAMPLE:
253
  For a task like "Open a text editor and type 'Hello World'":
254
  1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
255
  ```python
256
- click(50, 10)
257
  ```<end_code>
258
  2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
259
  ```python
@@ -280,6 +269,7 @@ Use click to move through menus on the desktop and scroll for web and specific a
280
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
281
  """.format(resolution_x=self.width, resolution_y=self.height)
282
  else:
 
283
  system_prompt = populate_template(
284
  self.prompt_templates["system_prompt"],
285
  variables={
@@ -405,16 +395,18 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
405
  @tool
406
  def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
407
  """
408
- Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
409
  Args:
410
  x: The x coordinate (horizontal position) of the element to scroll/zoom
411
  y: The y coordinate (vertical position) of the element to scroll/zoom
412
  direction: The direction to scroll ("up" or "down"), defaults to "down"
413
  amount: The amount to scroll. A good amount is 1 or 2.
414
  """
 
415
  self.desktop.scroll(direction=direction, amount=amount)
416
- self.logger.log(f"Scrolled {direction} by {amount}")
417
- return f"Scrolled {direction} by {amount}"
 
418
 
419
  @tool
420
  def wait(seconds: float) -> str:
@@ -430,7 +422,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
430
  @tool
431
  def open_url(url: str) -> str:
432
  """
433
- Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
434
  Args:
435
  url: The URL to open
436
  """
@@ -494,9 +486,9 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
494
 
495
  image_copy = image.copy()
496
 
497
- if getattr(self, "click_coordinates", None):
498
- print("DRAWING MARKER")
499
- image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
500
 
501
  self.last_marked_screenshot = AgentImage(screenshot_path)
502
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
@@ -506,7 +498,7 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
506
  ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
507
  if (
508
  isinstance(previous_memory_step, ActionStep)
509
- and previous_memory_step.step_number <= current_step - 2
510
  ):
511
  previous_memory_step.observations_images = None
512
 
@@ -535,81 +527,27 @@ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
535
  print("E2B sandbox terminated")
536
 
537
 
538
- # class QwenVLAPIModel(Model):
539
- # """Model wrapper for Qwen2.5VL API with fallback mechanism"""
540
-
541
- # def __init__(
542
- # self,
543
- # model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
544
- # hf_token: str = None,
545
- # ):
546
- # super().__init__()
547
- # self.model_id = model_id
548
- # self.base_model = HfApiModel(
549
- # model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
550
- # token=hf_token,
551
- # max_tokens=4096,
552
- # )
553
- # self.fallback_model = HfApiModel(
554
- # model_id,
555
- # provider="nebius",
556
- # token=hf_token,
557
- # max_tokens=4096,
558
- # )
559
-
560
- # def __call__(
561
- # self,
562
- # messages: List[Dict[str, Any]],
563
- # stop_sequences: Optional[List[str]] = None,
564
- # **kwargs
565
- # ) -> ChatMessage:
566
-
567
- # try:
568
- # message = self.base_model(messages, stop_sequences, **kwargs)
569
- # return message
570
- # except Exception as e:
571
- # print(f"Base model failed with error: {e}. Calling fallback model.")
572
-
573
- # # Continue to fallback
574
- # try:
575
- # message = self.fallback_model(messages, stop_sequences, **kwargs)
576
- # return message
577
- # except Exception as e:
578
- # raise Exception(f"Both endpoints failed. Last error: {e}")
579
-
580
  class QwenVLAPIModel(Model):
581
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
582
 
583
  def __init__(
584
  self,
585
- model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
586
- provider: str = "hyperbolic",
587
  hf_token: str = None,
588
- #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
589
- #hf_base_url: str = "https://s41ydkv0iyjeokyj.us-east-1.aws.endpoints.huggingface.cloud/v1/"
590
- #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
591
- hf_base_url: str= "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
592
  ):
593
  super().__init__()
594
- self.model_path = model_path
595
- self.model_id = model_path
596
- self.provider = provider
597
- self.hf_token = hf_token
598
- self.hf_base_url = hf_base_url
599
-
600
- # Initialize hyperbolic client
601
- self.hyperbolic_client = InferenceClient(
602
- provider=self.provider,
 
 
603
  )
604
-
605
- # Initialize HF OpenAI-compatible client if token is provided
606
- self.hf_client = None
607
- if hf_token:
608
- from openai import OpenAI
609
- self.hf_client = OpenAI(
610
- base_url=self.hf_base_url,
611
- api_key=self.hf_token
612
- )
613
 
614
  def __call__(
615
  self,
@@ -617,129 +555,18 @@ class QwenVLAPIModel(Model):
617
  stop_sequences: Optional[List[str]] = None,
618
  **kwargs
619
  ) -> ChatMessage:
620
- """Convert a list of messages to an API request with fallback mechanism"""
621
- print(messages)
622
- # Format messages once for both APIs
623
- formatted_messages = self._format_messages(messages)
624
-
625
- # First try the HF endpoint if available
626
- if self.hf_client:
627
- try:
628
- completion = self._call_hf_endpoint(
629
- formatted_messages,
630
- stop_sequences,
631
- **kwargs
632
- )
633
- return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
634
- except Exception as e:
635
- print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
636
- # Continue to fallback
637
 
638
- # Fallback to hyperbolic
639
  try:
640
- return self._call_hyperbolic(formatted_messages, stop_sequences, **kwargs)
 
 
 
 
 
 
 
 
 
641
  except Exception as e:
 
642
  raise Exception(f"Both endpoints failed. Last error: {e}")
643
-
644
- def _format_messages(self, messages: List[Dict[str, Any]]):
645
- """Format messages for API requests - works for both endpoints"""
646
-
647
- formatted_messages = []
648
-
649
- for msg in messages:
650
- role = msg["role"]
651
- content = []
652
-
653
- if isinstance(msg["content"], list):
654
- for item in msg["content"]:
655
- if item["type"] == "text":
656
- content.append({"type": "text", "text": item["text"]})
657
- elif item["type"] == "image":
658
- # Handle image path or direct image object
659
- if isinstance(item["image"], str):
660
- # Image is a path
661
- with open(item["image"], "rb") as image_file:
662
- base64_image = base64.b64encode(image_file.read()).decode("utf-8")
663
- else:
664
- # Image is a PIL image or similar object
665
- img_byte_arr = io.BytesIO()
666
- item["image"].save(img_byte_arr, format="PNG")
667
- base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
668
-
669
- content.append({
670
- "type": "image_url",
671
- "image_url": {
672
- "url": f"data:image/png;base64,{base64_image}"
673
- }
674
- })
675
- else:
676
- # Plain text message
677
- content = [{"type": "text", "text": msg["content"]}]
678
-
679
- formatted_messages.append({"role": role, "content": content})
680
-
681
- return formatted_messages
682
-
683
- def _call_hf_endpoint(self, formatted_messages, stop_sequences=None, **kwargs):
684
- """Call the Hugging Face OpenAI-compatible endpoint"""
685
-
686
- # Extract parameters with defaults
687
- max_tokens = kwargs.get("max_new_tokens", 512)
688
- temperature = kwargs.get("temperature", 0.7)
689
- top_p = kwargs.get("top_p", 0.9)
690
- stream = kwargs.get("stream", False)
691
-
692
- completion = self.hf_client.chat.completions.create(
693
- model="tgi", # Model name for the endpoint
694
- messages=formatted_messages,
695
- max_tokens=max_tokens,
696
- temperature=temperature,
697
- top_p=top_p,
698
- stream=stream,
699
- stop=stop_sequences
700
- )
701
-
702
- if stream:
703
- # For streaming responses, return a generator
704
- def stream_generator():
705
- for chunk in completion:
706
- yield chunk.choices[0].delta.content or ""
707
- return stream_generator()
708
- else:
709
- # For non-streaming, return the full text
710
- return completion.choices[0].message.content
711
-
712
- def _call_hyperbolic(self, formatted_messages, stop_sequences=None, **kwargs):
713
- """Call the hyperbolic API"""
714
-
715
- completion = self.hyperbolic_client.chat.completions.create(
716
- model=self.model_path,
717
- messages=formatted_messages,
718
- max_tokens=kwargs.get("max_new_tokens", 512),
719
- temperature=kwargs.get("temperature", 0.7),
720
- top_p=kwargs.get("top_p", 0.9),
721
- )
722
-
723
- # Extract the response text
724
- output_text = completion.choices[0].message.content
725
-
726
- return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
727
-
728
- def to_dict(self) -> Dict[str, Any]:
729
- """Convert the model to a dictionary"""
730
- return {
731
- "class": self.__class__.__name__,
732
- "model_path": self.model_path,
733
- "provider": self.provider,
734
- "hf_base_url": self.hf_base_url,
735
- # We don't save the API keys for security reasons
736
- }
737
-
738
- @classmethod
739
- def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
740
- """Create a model from a dictionary"""
741
- return cls(
742
- model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
743
- provider=data.get("provider", "hyperbolic"),
744
- hf_base_url=data.get("hf_base_url", "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"),
745
- )
 
1
  import os
2
  import time
 
3
  from io import BytesIO
4
+ from typing import Any, Dict, List, Optional
 
 
5
  import unicodedata
6
 
 
 
 
7
  # E2B imports
8
  from e2b_desktop import Sandbox
9
  from PIL import Image
 
11
  # SmolaAgents imports
12
  from smolagents import CodeAgent, tool, HfApiModel
13
  from smolagents.memory import ActionStep
14
+ from smolagents.models import ChatMessage, Model
15
+ from smolagents.agents import populate_template
16
  from smolagents.monitoring import LogLevel
17
  from smolagents.agent_types import AgentImage
18
  from PIL import ImageDraw
 
43
  {%- endfor %}
44
 
45
  The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels, take it into account to decide clicking coordinates.
46
+ If you clicked somewhere in the previous action, a green crosshair will appear at the exact location of the previous click.
47
  The image might have change since then but the cross stays at the previous click. If your click seems to have changed nothing, check that this location is exactly where you intended to click. Otherwise correct the click coordinates.
48
  </tools>
49
 
 
93
  Step 4:
94
  Short term goal: I want to open a text editor.
95
  Where I am: I am still under the Accessories menu.
96
+ What I see: Nothing has changed compared to previous screenshot. Under the open submenu Accessories, I still see 'Text Editor'. The green cross is off from the element.
97
  Reflection: My last click must have been off. Let's correct this.
98
  Action: I will click the correct place, right in the middle of the element.
99
  Code:
 
140
  Use click to move through menus on the desktop and scroll for web and specific applications.
141
  Always analyze the latest screenshot carefully before performing actions.
142
  Desktop menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
143
+ NEVER CLICK THE WEB BROWSER ICON TO OPEN THE WEB BROWSER: use open_url
144
  </general_guidelines>
145
  """
146
 
 
148
  x, y = click_coordinates
149
  draw = ImageDraw.Draw(image_copy)
150
  cross_size, linewidth = 10, 3
151
+ # Draw cross
152
+ draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
153
+ draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
154
  # Add a circle around it for better visibility
155
+ draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="green", width=linewidth)
156
  return image_copy
157
 
 
 
 
 
 
 
 
 
158
 
159
  class E2BVisionAgent(CodeAgent):
160
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
166
  tools: List[tool] = None,
167
  max_steps: int = 200,
168
  verbosity_level: LogLevel = 2,
169
+ planning_interval: int = None,
170
  use_v1_prompt: bool = False,
171
  **kwargs
172
  ):
 
203
  self.step_callbacks.append(self.take_screenshot_callback)
204
 
205
  def initialize_system_prompt(self) -> str:
206
+ if True:
207
  return """You are a desktop automation assistant that can control a remote desktop environment.
208
  You only have access to the following tools to interact with the desktop, no additional ones:
209
  - click(x, y): Performs a left-click at the specified coordinates
 
215
  - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
216
  - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
217
  - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
218
+ - drag_and_drop(x1, y1, x2, y2): Clicks [x1, y1], drags mouse to [x2, y2], then releases click.
219
+ - find_on_page_ctrl_f(search_string): Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F.
220
  - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
221
  The desktop has a resolution of {resolution_x}x{resolution_y}.
222
  IMPORTANT:
 
236
  COMMAND FORMAT:
237
  Always format your actions as Python code blocks. For example:
238
  ```python
239
+ click(250, 304)
240
  ```<end_code>
241
  TASK EXAMPLE:
242
  For a task like "Open a text editor and type 'Hello World'":
243
  1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
244
  ```python
245
+ click(52, 10)
246
  ```<end_code>
247
  2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
248
  ```python
 
269
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
270
  """.format(resolution_x=self.width, resolution_y=self.height)
271
  else:
272
+ print("USING v2 prompt")
273
  system_prompt = populate_template(
274
  self.prompt_templates["system_prompt"],
275
  variables={
 
395
  @tool
396
  def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
397
  """
398
+ Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
399
  Args:
400
  x: The x coordinate (horizontal position) of the element to scroll/zoom
401
  y: The y coordinate (vertical position) of the element to scroll/zoom
402
  direction: The direction to scroll ("up" or "down"), defaults to "down"
403
  amount: The amount to scroll. A good amount is 1 or 2.
404
  """
405
+ self.desktop.move_mouse(x, y)
406
  self.desktop.scroll(direction=direction, amount=amount)
407
+ message = f"Scrolled {direction} by {amount}"
408
+ self.logger.log(message)
409
+ return message
410
 
411
  @tool
412
  def wait(seconds: float) -> str:
 
422
  @tool
423
  def open_url(url: str) -> str:
424
  """
425
+ Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
426
  Args:
427
  url: The URL to open
428
  """
 
486
 
487
  image_copy = image.copy()
488
 
489
+ # if getattr(self, "click_coordinates", None):
490
+ # print("DRAWING MARKER")
491
+ # image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
492
 
493
  self.last_marked_screenshot = AgentImage(screenshot_path)
494
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
 
498
  ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
499
  if (
500
  isinstance(previous_memory_step, ActionStep)
501
+ and previous_memory_step.step_number <= current_step - 1
502
  ):
503
  previous_memory_step.observations_images = None
504
 
 
527
  print("E2B sandbox terminated")
528
 
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  class QwenVLAPIModel(Model):
531
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
532
 
533
  def __init__(
534
  self,
535
+ model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
 
536
  hf_token: str = None,
 
 
 
 
537
  ):
538
  super().__init__()
539
+ self.model_id = model_id
540
+ self.base_model = HfApiModel(
541
+ model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
542
+ token=hf_token,
543
+ max_tokens=4096,
544
+ )
545
+ self.fallback_model = HfApiModel(
546
+ model_id,
547
+ provider="nebius",
548
+ token=hf_token,
549
+ max_tokens=4096,
550
  )
 
 
 
 
 
 
 
 
 
551
 
552
  def __call__(
553
  self,
 
555
  stop_sequences: Optional[List[str]] = None,
556
  **kwargs
557
  ) -> ChatMessage:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
 
559
  try:
560
+ message = self.base_model(messages, stop_sequences, **kwargs)
561
+ return message
562
+ except Exception as e:
563
+ raise e
564
+ print(f"Base model failed with error: {e}. Calling fallback model.")
565
+
566
+ # Continue to fallback
567
+ try:
568
+ message = self.fallback_model(messages, stop_sequences, **kwargs)
569
+ return message
570
  except Exception as e:
571
+ raise e
572
  raise Exception(f"Both endpoints failed. Last error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval.py CHANGED
@@ -18,7 +18,7 @@ from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
19
  from dotenv import load_dotenv
20
 
21
- load_dotenv()
22
  # Environment variables and constants
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  # Try to get token dynamically, fall back to environment variable
@@ -290,7 +290,7 @@ def main():
290
  "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
291
  "flight": "Search a flight Rome - Berlin for tomorrow",
292
  "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
293
- "flux": "Go generate a picture of the Golden Gate bridge on a FLUX1.dev space",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
 
@@ -298,7 +298,7 @@ def main():
298
  os.makedirs(args.output_dir, exist_ok=True)
299
 
300
  # Run the evaluation
301
- eval_dir = run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
302
 
303
  if __name__ == "__main__":
304
  main()
 
18
 
19
  from dotenv import load_dotenv
20
 
21
+ load_dotenv(override=True)
22
  # Environment variables and constants
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  # Try to get token dynamically, fall back to environment variable
 
290
  "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
291
  "flight": "Search a flight Rome - Berlin for tomorrow",
292
  "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
293
+ "flux": "Go on the Hugging Face Hub, find a Space for FLUX1.dev, and generate a picture of the Golden Gate bridge.",
294
  "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
295
  }
296
 
 
298
  os.makedirs(args.output_dir, exist_ok=True)
299
 
300
  # Run the evaluation
301
+ run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
302
 
303
  if __name__ == "__main__":
304
  main()