computer-agent

Running on CPU Upgrade

App Files Files Community

m-ric HF Staff committed on Mar 28

Commit

dc86104

1 Parent(s): cf80979

Working export log to dataset

Browse files

Files changed (2) hide show

app.py +60 -54
e2bqwen.py +14 -30

app.py CHANGED Viewed

@@ -425,44 +425,51 @@ def generate_interaction_id(request):
     """Generate a unique ID combining session hash and timestamp"""
     return f"{request.session_hash}_{int(time.time())}"
-def save_final_status(folder, status, details = None):
-    a = open(os.path.join(folder,"status.json"),"w")
-    a.write(json.dumps({"status":status,"details":details}))
-    a.close()
-def get_log_file_path(session_hash):
-    """
-    Creates a log file path based on the session hash.
-    Makes sure the directory exists.
-    """
-    log_dir = os.path.join(TMP_DIR, session_hash)
-    if not os.path.exists(log_dir):
-        os.makedirs(log_dir)
-    return os.path.join(log_dir, 'console.log')
 def initialize_session(interactive_mode, request: gr.Request):
     session_hash = request.session_hash
-    # Create session-specific log file
-    log_path = get_log_file_path(session_hash)
-    # Initialize log file if it doesn't exist
-    if not os.path.exists(log_path):
-        with open(log_path, 'w') as f:
-            f.write(f"Ready to go...\n")
     # Return HTML and session hash
     return update_html(interactive_mode, request), session_hash
-# Function to read log content that gets the path from session hash
-def update_terminal_from_session(session_hash):
-    if not session_hash:
-        return "Waiting for session..."
-    log_path = get_log_file_path(session_hash)
-    return read_log_content(log_path)
-def create_agent(data_dir, desktop, log_file):
     model = QwenVLAPIModel(
         model_id="Qwen/Qwen2.5-VL-72B-Instruct",
         hf_token = hf_token,
@@ -474,7 +481,6 @@ def create_agent(data_dir, desktop, log_file):
         max_steps=200,
         verbosity_level=2,
         planning_interval=10,
-        log_file = log_file
     )
 class EnrichedGradioUI(GradioUI):
@@ -497,10 +503,9 @@ class EnrichedGradioUI(GradioUI):
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
-        log_file = get_log_file_path(session_hash)
         if "agent" not in session_state:
-            session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop, log_file=log_file)
         # Construct the full task with instructions
         full_task = task_input + dedent(f"""
@@ -517,31 +522,32 @@ class EnrichedGradioUI(GradioUI):
             We can only execute one action at a time. On each step, answer only a python blob with the action to perform
         """)
-        # try:
-        stored_messages.append(gr.ChatMessage(role="user", content=task_input))
-        yield stored_messages
-        for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
-            if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
-                stored_messages.append(gr.ChatMessage(
-                    role="assistant",
-                    content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
-                ))
-            stored_messages.append(msg)
             yield stored_messages
-        yield stored_messages
-        # TODO: uncomment below after testing
-        #     save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
-        # except Exception as e:
-        #     error_message=f"Error in interaction: {str(e)}"
-        #     stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
-        #     yield stored_messages
-        #     save_final_status(data_dir, "failed", details = str(error_message))
-        # finally:
-        #     upload_to_hf_and_remove(data_dir)
 theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")

     """Generate a unique ID combining session hash and timestamp"""
     return f"{request.session_hash}_{int(time.time())}"
+def chat_message_to_json(obj):
+    """Convert a ChatMessage-like object into JSON-friendly data.
+
+    Objects exposing ``__dict__`` become a shallow copy of that dict with the
+    ``raw`` entry dropped; a ``content`` value that itself has a ``__dict__``
+    and every item of ``tool_calls`` are converted recursively. Lists and
+    tuples become lists of converted items. Anything else is returned as is.
+    """
+    if not hasattr(obj, '__dict__'):
+        if isinstance(obj, (list, tuple)):
+            return [chat_message_to_json(element) for element in obj]
+        return obj
+    # Work on a copy so the original object's attributes are left untouched
+    fields = obj.__dict__.copy()
+    # 'raw' may hold non-serializable data, so it is never exported
+    fields.pop('raw', None)
+    content = fields.get('content')
+    if content is not None and hasattr(content, '__dict__'):
+        fields['content'] = chat_message_to_json(content)
+    tool_calls = fields.get('tool_calls')
+    if tool_calls is not None:
+        fields['tool_calls'] = [chat_message_to_json(call) for call in tool_calls]
+    return fields
+def save_final_status(folder, status: str, memory, error_message = None) -> None:
+    """Write the outcome of a run to ``metadata.json`` inside ``folder``.
+
+    Args:
+        folder: Directory that receives ``metadata.json``.
+        status: Status label stored under the "status" key.
+        memory: Agent memory; must expose ``steps`` and ``get_succinct_steps()``.
+        error_message: Optional text stored under the "error_message" key.
+
+    Note:
+        THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION: every step's
+        ``observations_images`` is reset to None before the summary is built.
+    """
+    metadata_path = os.path.join(folder, "metadata.json")
+    for memory_step in memory.steps:
+        if getattr(memory_step, "observations_images", None):
+            memory_step.observations_images = None
+    # Serialize before opening the file so a failure cannot leave a truncated metadata.json behind
+    payload = json.dumps(
+        {"status": status, "summary": memory.get_succinct_steps(), "error_message": error_message},
+        default=chat_message_to_json,
+    )
+    # The context manager closes the handle even if the write itself fails
+    with open(metadata_path, "w") as metadata_file:
+        metadata_file.write(payload)
 def initialize_session(interactive_mode, request: gr.Request):
     session_hash = request.session_hash
     # Return HTML and session hash
     return update_html(interactive_mode, request), session_hash
+def create_agent(data_dir, desktop):
     model = QwenVLAPIModel(
         model_id="Qwen/Qwen2.5-VL-72B-Instruct",
         hf_token = hf_token,
         max_steps=200,
         verbosity_level=2,
         planning_interval=10,
     )
 class EnrichedGradioUI(GradioUI):
         if not os.path.exists(data_dir):
             os.makedirs(data_dir)
         if "agent" not in session_state:
+            session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
         # Construct the full task with instructions
         full_task = task_input + dedent(f"""
             We can only execute one action at a time. On each step, answer only a python blob with the action to perform
         """)
+        try:
+            stored_messages.append(gr.ChatMessage(role="user", content=task_input))
             yield stored_messages
+            for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
+                if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
+                    stored_messages.append(gr.ChatMessage(
+                        role="assistant",
+                        content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
+                    ))
+                stored_messages.append(msg)
+                yield stored_messages
+            yield stored_messages
+            save_final_status(data_dir, "completed", memory = session_state["agent"].memory)
+        # # TODO: uncomment below after testing
+        except Exception as e:
+            error_message=f"Error in interaction: {str(e)}"
+            stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
+            yield stored_messages
+            raise e
+            save_final_status(data_dir, "failed", summary={}, error_message=error_message)
+        finally:
+            upload_to_hf_and_remove(data_dir)
 theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")

e2bqwen.py CHANGED Viewed

@@ -101,12 +101,10 @@ class E2BVisionAgent(CodeAgent):
         max_steps: int = 200,
         verbosity_level: LogLevel = 2,
         planning_interval: int = 10,
-        log_file = None,
         **kwargs
     ):
         self.desktop = desktop
         self.data_dir = data_dir
-        self.log_path = log_file
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
@@ -137,7 +135,6 @@ class E2BVisionAgent(CodeAgent):
         self.logger.log("Setting up agent tools...")
         self._setup_desktop_tools()
         self.step_callbacks.append(self.take_screenshot_callback)
-        self.final_answer_checks = [self.store_metadata_to_file]
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
@@ -151,7 +148,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
-            self.logger.log(self.log_path, f"Clicked at coordinates ({x}, {y})")
             return f"Clicked at coordinates ({x}, {y})"
         @tool
@@ -164,7 +161,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
-            self.logger.log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
@@ -177,7 +174,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
-            self.logger.log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
@@ -189,7 +186,7 @@ class E2BVisionAgent(CodeAgent):
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
-            self.logger.log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
@@ -201,7 +198,7 @@ class E2BVisionAgent(CodeAgent):
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
-            self.logger.log(self.log_path, f"Typed text: '{text}'")
             return f"Typed text: '{text}'"
         @tool
@@ -214,7 +211,7 @@ class E2BVisionAgent(CodeAgent):
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
-            self.logger.log(self.log_path, f"Pressed key: {key}")
             return f"Pressed key: {key}"
         @tool
@@ -224,7 +221,7 @@ class E2BVisionAgent(CodeAgent):
             Args:
             """
             self.desktop.press(["alt", "left"])
-            self.logger.log(self.log_path, "Went back one page")
             return "Went back one page"
         @tool
@@ -239,7 +236,7 @@ class E2BVisionAgent(CodeAgent):
             """
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
-            self.logger.log(self.log_path, message)
             return message
         @tool
@@ -251,7 +248,7 @@ class E2BVisionAgent(CodeAgent):
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
-            self.logger.log(self.log_path, f"Scrolled {direction} by {amount}")
             return f"Scrolled {direction} by {amount}"
         @tool
@@ -262,7 +259,7 @@ class E2BVisionAgent(CodeAgent):
                 seconds: Number of seconds to wait, generally 3 is enough.
             """
             time.sleep(seconds)
-            self.logger.log(self.log_path, f"Waited for {seconds} seconds")
             return f"Waited for {seconds} seconds"
         @tool
@@ -279,7 +276,7 @@ class E2BVisionAgent(CodeAgent):
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
-            self.logger.log(self.log_path, f"Opening URL: {url}")
             return f"Opened URL: {url}"
@@ -297,22 +294,9 @@ class E2BVisionAgent(CodeAgent):
         self.tools["drag_and_drop"] = drag_and_drop
-    def store_metadata_to_file(self, final_answer, memory) -> None:
-        metadata_path = os.path.join(self.data_dir, "metadata.json")
-        output = {}
-        # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
-        for memory_step in self.memory.steps:
-            if getattr(memory_step, "observations_images", None):
-                memory_step.observations_images = None
-        a = open(metadata_path,"w")
-        a.write(json.dumps(self.write_memory_to_messages()))
-        a.close()
-        return True
     def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
-        self.logger.log(self.log_path, "Analyzing screen content...")
         current_step = memory_step.step_number
@@ -362,12 +346,12 @@ class QwenVLAPIModel(Model):
         self.model_id = model_id
         self.base_model = HfApiModel(
             model_id,
-            provider="nebius",
             token=hf_token,
         )
         self.fallback_model = HfApiModel(
             model_id,
-            provider="hyperbolic",
             token=hf_token,
         )

         max_steps: int = 200,
         verbosity_level: LogLevel = 2,
         planning_interval: int = 10,
         **kwargs
     ):
         self.desktop = desktop
         self.data_dir = data_dir
         self.planning_interval = planning_interval
         # Initialize Desktop
         self.width, self.height = self.desktop.get_screen_size()
         self.logger.log("Setting up agent tools...")
         self._setup_desktop_tools()
         self.step_callbacks.append(self.take_screenshot_callback)
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
             """
             self.desktop.move_mouse(x, y)
             self.desktop.left_click()
+            self.logger.log(f"Clicked at coordinates ({x}, {y})")
             return f"Clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.right_click()
+            self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
             return f"Right-clicked at coordinates ({x}, {y})"
         @tool
             """
             self.desktop.move_mouse(x, y)
             self.desktop.double_click()
+            self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
             return f"Double-clicked at coordinates ({x}, {y})"
         @tool
                 y: The y coordinate (vertical position)
             """
             self.desktop.move_mouse(x, y)
+            self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
         @tool
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
             self.desktop.write(text, delay_in_ms=delay_in_ms)
+            self.logger.log(f"Typed text: '{text}'")
             return f"Typed text: '{text}'"
         @tool
             if key == "enter":
                 key = "Return"
             self.desktop.press(key)
+            self.logger.log(f"Pressed key: {key}")
             return f"Pressed key: {key}"
         @tool
             Args:
             """
             self.desktop.press(["alt", "left"])
+            self.logger.log("Went back one page")
             return "Went back one page"
         @tool
             """
             self.desktop.drag([x1, y1], [x2, y2])
             message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
+            self.logger.log(message)
             return message
         @tool
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
             self.desktop.scroll(direction=direction, amount=amount)
+            self.logger.log(f"Scrolled {direction} by {amount}")
             return f"Scrolled {direction} by {amount}"
         @tool
                 seconds: Number of seconds to wait, generally 3 is enough.
             """
             time.sleep(seconds)
+            self.logger.log(f"Waited for {seconds} seconds")
             return f"Waited for {seconds} seconds"
         @tool
             self.desktop.open(url)
             # Give it time to load
             time.sleep(2)
+            self.logger.log(f"Opening URL: {url}")
             return f"Opened URL: {url}"
         self.tools["drag_and_drop"] = drag_and_drop
     def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
+        self.logger.log("Analyzing screen content...")
         current_step = memory_step.step_number
         self.model_id = model_id
         self.base_model = HfApiModel(
             model_id,
+            provider="hyperbolic",
             token=hf_token,
         )
         self.fallback_model = HfApiModel(
             model_id,
+            provider="nebius",
             token=hf_token,
         )