Implement replay function
- app.py +18 -9
- e2bqwen.py +11 -4
- model_replay.py +10 -7
app.py
CHANGED
@@ -12,7 +12,7 @@ from e2b_desktop import Sandbox
 from smolagents import CodeAgent
 from smolagents.monitoring import LogLevel
 from smolagents.gradio_ui import GradioUI, stream_to_gradio
-from model_replay import
+from model_replay import FakeModelReplayLog
 
 from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 
@@ -488,7 +488,7 @@ class EnrichedGradioUI(GradioUI):
             gr.Button(interactive=False),
         )
 
-    def interact_with_agent(self, task_input, stored_messages, session_state, session_hash,
+    def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
         import gradio as gr
 
         interaction_id = generate_interaction_id(request)
@@ -504,9 +504,10 @@ class EnrichedGradioUI(GradioUI):
         else:
             session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
 
-        if replay_log is not None:
+        if "replay_log" in session_state and session_state["replay_log"] is not None:
             original_model = session_state["agent"].model
-            session_state["agent"].model = FakeModelReplayLog(replay_log)
+            session_state["agent"].model = FakeModelReplayLog(session_state["replay_log"])
+
 
         try:
             stored_messages.append(gr.ChatMessage(role="user", content=task_input))
@@ -539,8 +540,9 @@ class EnrichedGradioUI(GradioUI):
             save_final_status(data_dir, "failed", summary=[], error_message=error_message)
 
         finally:
-            if replay_log: # Replace the model with original model
+            if "replay_log" in session_state and session_state["replay_log"] is not None: # Replace the model with original model
                 session_state["agent"].model = original_model
+                session_state["replay_log"] = None
             upload_to_hf_and_remove(data_dir)
 
 theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
@@ -573,7 +575,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
             "Check the commuting time between Bern and Zurich on Google maps",
             "Write 'Hello World' in a text editor",
             "Search a flight Paris - Berlin for tomorrow",
-            "
+            "Search for Château de Fontainebleau in Google Maps",
             "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
         ],
         inputs = task_input,
@@ -685,9 +687,10 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
         fn=clear_and_set_view_only,
         inputs=[task_input],
         outputs=[sandbox_html]
-    )
+    )
+    view_only_event.then(
         agent_ui.interact_with_agent,
-        inputs=[task_input, stored_messages, session_state, session_hash_state
+        inputs=[task_input, stored_messages, session_state, session_hash_state],
         outputs=[chatbot_display]
     ).then(
         fn=set_interactive,
@@ -695,13 +698,19 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
         outputs=[sandbox_html]
     )
 
+    def set_logs_source(session_state):
+        session_state["replay_log"] = "udupp2fyavq_1743170323"
+
     replay_btn.click(
         fn=clear_and_set_view_only,
         inputs=[task_input],
         outputs=[sandbox_html]
+    ).then(
+        set_logs_source,
+        inputs=[session_state]
     ).then(
         agent_ui.interact_with_agent,
-        inputs=[task_input, stored_messages, session_state, session_hash_state
+        inputs=[task_input, stored_messages, session_state, session_hash_state],
         outputs=[chatbot_display]
     ).then(
         fn=set_interactive,
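Note on the pattern above: the replay path temporarily swaps the agent's live model for a FakeModelReplayLog and always restores the original (and clears the replay flag) in the finally block, so a failed replay cannot leave the session stuck on canned outputs. Below is a minimal sketch of that swap-and-restore pattern; agent, session_state and run_task are stand-ins for the real objects in app.py, not part of the diff.

from contextlib import contextmanager
from model_replay import FakeModelReplayLog

@contextmanager
def replayed_model(agent, session_state):
    # Temporarily replace agent.model with a replay model, restoring it afterwards.
    replay_log = session_state.get("replay_log")
    if replay_log is None:
        yield agent
        return
    original_model = agent.model
    agent.model = FakeModelReplayLog(replay_log)   # serve pre-recorded outputs
    try:
        yield agent
    finally:
        agent.model = original_model               # always restore the live model
        session_state["replay_log"] = None         # replay is one-shot

# Hypothetical usage; run_task stands in for the streaming loop in interact_with_agent:
# with replayed_model(session_state["agent"], session_state) as agent:
#     run_task(agent, task_input)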
e2bqwen.py
CHANGED
@@ -5,6 +5,7 @@ from io import BytesIO
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
 import json
+import unicodedata
 
 # HF API params
 from huggingface_hub import InferenceClient
@@ -260,6 +261,9 @@ class E2BVisionAgent(CodeAgent):
             self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
 
+        def normalize_text(text):
+            return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))
+
         @tool
         def type_text(text: str, delay_in_ms: int = 75) -> str:
             """
@@ -268,9 +272,10 @@ class E2BVisionAgent(CodeAgent):
                 text: The text to type
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
-
-            self.
-
+            clean_text = normalize_text(text)
+            self.desktop.write(clean_text, delay_in_ms=delay_in_ms)
+            self.logger.log(f"Typed text: '{clean_text}'")
+            return f"Typed text: '{clean_text}'"
 
         @tool
         def press_key(key: str) -> str:
@@ -309,10 +314,12 @@ class E2BVisionAgent(CodeAgent):
             return message
 
         @tool
-        def scroll(direction: str = "down", amount: int = 1) -> str:
+        def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
             """
             Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
             Args:
+                x: The x coordinate (horizontal position) of the element to scroll/zoom
+                y: The y coordinate (vertical position) of the element to scroll/zoom
                 direction: The direction to scroll ("up" or "down"), defaults to "down"
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
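The new normalize_text helper strips accents before typing, presumably because the sandbox keyboard emulation is more reliable with plain ASCII input. A quick standalone illustration of what NFD decomposition plus dropping combining marks does (the example strings are arbitrary):

import unicodedata

def normalize_text(text):
    # NFD splits each accented character into a base letter plus combining marks;
    # filtering out the combining marks leaves the unaccented text.
    return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))

print(normalize_text("Château de Fontainebleau"))  # Chateau de Fontainebleau
print(normalize_text("Zürich"))                    # Zurich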
model_replay.py
CHANGED
@@ -1,7 +1,11 @@
 from smolagents.models import Model, ChatMessage, Tool, MessageRole
-from time import
+from time import sleep
+from typing import List, Dict, Optional
+from huggingface_hub import hf_hub_download
+import json
 
-
+
+class FakeModelReplayLog(Model):
     """A model class that returns pre-recorded responses from a log file.
 
     This class is useful for testing and debugging purposes, as it doesn't make
@@ -19,7 +23,7 @@ class FakeModelClass(Model):
         **kwargs
     ):
         super().__init__(**kwargs)
-        self.dataset_name = "smolagents/computer-agent-logs"
+        self.dataset_name = "smolagents/computer-agent-logs"
         self.log_folder = log_folder
         self.call_counter = 0
         self.model_outputs = self._load_model_outputs()
@@ -40,9 +44,8 @@ class FakeModelClass(Model):
         # Extract only the model_output from each step in tool_calls
         model_outputs = []
 
-        for step in log_data
-
-            model_outputs.append(step["model_output_message"])
+        for step in log_data["summary"][1:]:
+            model_outputs.append(step["model_output_message"]["content"])
 
         print(f"Loaded {len(model_outputs)} model outputs from log file")
         return model_outputs
@@ -67,7 +70,7 @@ class FakeModelClass(Model):
         Returns:
             ChatMessage: The next pre-recorded response.
         """
-
+        sleep(1.0)
 
         # Get the next model output
         if self.call_counter < len(self.model_outputs):
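Taken together, model_replay.py downloads a recorded agent run from the smolagents/computer-agent-logs dataset, keeps only each step's model output, and hands those back one call at a time with a one-second pause to mimic generation latency. A condensed sketch of how the class plausibly fits together follows; anything not visible in the diff (the exact file name inside the log folder, the __call__ signature) is an assumption, not the Space's actual code.

import json
from time import sleep
from huggingface_hub import hf_hub_download
from smolagents.models import Model, ChatMessage, MessageRole

class FakeModelReplayLog(Model):
    """Returns pre-recorded assistant outputs instead of calling a real model."""

    def __init__(self, log_folder: str, **kwargs):
        super().__init__(**kwargs)
        self.dataset_name = "smolagents/computer-agent-logs"
        self.log_folder = log_folder
        self.call_counter = 0
        self.model_outputs = self._load_model_outputs()

    def _load_model_outputs(self):
        # Fetch the recorded run from the dataset repo (the file name is assumed).
        path = hf_hub_download(
            repo_id=self.dataset_name,
            filename=f"{self.log_folder}/metadata.json",
            repo_type="dataset",
        )
        with open(path) as f:
            log_data = json.load(f)
        # Skip the first summary entry, as in the diff, and keep each step's output text.
        return [step["model_output_message"]["content"] for step in log_data["summary"][1:]]

    def __call__(self, messages, **kwargs) -> ChatMessage:
        sleep(1.0)  # pace the replay so the UI remains readable
        content = self.model_outputs[min(self.call_counter, len(self.model_outputs) - 1)]
        self.call_counter += 1
        return ChatMessage(role=MessageRole.ASSISTANT, content=content)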