computer-agent

Running on CPU Upgrade

App Files Community

m-ric HF Staff committed on Mar 27

Commit

c5a6fe8

1 Parent(s): ba38624

Try logging pointer clicks

Browse files

Files changed (2) hide show

app.py +78 -2
e2bqwen.py +22 -22

app.py CHANGED Viewed

@@ -389,6 +389,82 @@ def get_or_create_sandbox(session_hash):
     desktop.stream.start(require_auth=True)
     setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
     desktop.commands.run(setup_cmd)
     # Store sandbox with metadata
     SANDBOXES[session_hash] = desktop
@@ -473,7 +549,7 @@ def create_agent(data_dir, desktop, log_file):
         desktop=desktop,
         max_steps=200,
         verbosity_level=LogLevel.INFO,
-        planning_interval=5,
         log_file = log_file
     )
@@ -511,7 +587,7 @@ class EnrichedGradioUI(GradioUI):
             1. Look at elements on the screen to determine what to click or interact with
             2. Use precise coordinates for mouse movements and clicks
             3. Wait for page loads or animations to complete using the wait() tool
-            4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
             When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
             We can only execute one action at a time. On each step, answer only a python blob with the action to perform

     desktop.stream.start(require_auth=True)
     setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
     desktop.commands.run(setup_cmd)
+    pointer_highlight_cmd = """#!/bin/bash
+sudo apt update
+sudo apt install -y x11-apps xinput
+cat << 'EOF' > /tmp/click_marker.sh
+#!/bin/bash
+echo "$(date): Script started" >> /tmp/click_debug.log
+# Hardcoded mouse ID from your output
+MOUSE_ID=6
+# Keep track of current absolute position
+CURRENT_X=0
+CURRENT_Y=0
+# Monitor raw mouse events
+xinput test $MOUSE_ID | while read event; do
+    echo "$(date): Event: $event" >> /tmp/click_debug.log
+    # Update position from motion events
+    if echo "$event" | grep -q "motion"; then
+        # Extract absolute position values
+        if echo "$event" | grep -q "absolute"; then
+            X_VAL=$(echo "$event" | grep -o "a\[0\]=.*" | cut -d= -f2 | cut -d' ' -f1)
+            Y_VAL=$(echo "$event" | grep -o "a\[1\]=.*" | cut -d= -f2 | cut -d' ' -f1)
+            if [ ! -z "$X_VAL" ]; then
+                CURRENT_X=$X_VAL
+            fi
+            if [ ! -z "$Y_VAL" ]; then
+                CURRENT_Y=$Y_VAL
+            fi
+            echo "$(date): Position updated to $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
+        fi
+    fi
+    # Check if this is a button press event
+    if echo "$event" | grep -q "button press"; then
+        echo "$(date): Button press detected at $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
+        # Show xlogo at current position
+        xlogo -geometry 40x40+$CURRENT_X+$CURRENT_Y &
+        LOGO_PID=$!
+        echo "$(date): Started xlogo with PID $LOGO_PID" >> /tmp/click_debug.log
+        # Keep xlogo open for 2 seconds
+        sleep 2
+        # Kill xlogo
+        kill $LOGO_PID 2>/dev/null
+        echo "$(date): Closed xlogo" >> /tmp/click_debug.log
+    fi
+done
+echo "$(date): Script exited unexpectedly" >> /tmp/click_debug.log
+EOF
+# Make the script executable
+chmod +x /tmp/click_marker.sh
+# Create a setup log entry
+echo "Click marker setup completed at $(date)" > /tmp/click_marker_setup.log
+# Launch the script with nohup to keep it running after terminal closes
+nohup /tmp/click_marker.sh > /dev/null 2>&1 &
+# Record the PID in the log file
+echo "Running with PID: $!" >> /tmp/click_marker_setup.log
+echo "To stop it, run: kill $!" >> /tmp/click_marker_setup.log
+"""
+    desktop.commands.run(pointer_highlight_cmd)
     # Store sandbox with metadata
     SANDBOXES[session_hash] = desktop
         desktop=desktop,
         max_steps=200,
         verbosity_level=LogLevel.INFO,
+        planning_interval=10,
         log_file = log_file
     )
             1. Look at elements on the screen to determine what to click or interact with
             2. Use precise coordinates for mouse movements and clicks
             3. Wait for page loads or animations to complete using the wait() tool
+            4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked. In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
             When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
             We can only execute one action at a time. On each step, answer only a python blob with the action to perform

e2bqwen.py CHANGED Viewed

@@ -98,7 +98,7 @@ class E2BVisionAgent(CodeAgent):
         tools: List[tool] = None,
         max_steps: int = 200,
         verbosity_level: LogLevel = 4,
-        planning_interval: int = 15,
         log_file = None,
         **kwargs
     ):
@@ -340,8 +340,7 @@ class E2BVisionAgent(CodeAgent):
         memory_step.observations_images = [image.copy()]  # This takes the original image directly.
         # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
     def close(self):
         """Clean up resources"""
@@ -458,7 +457,7 @@ class QwenVLAPIModel(Model):
         )
         assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
         # Initialize HF OpenAI-compatible client if token is provided
         self.hf_client = None
         if hf_token:
@@ -512,22 +511,23 @@ class QwenVLAPIModel(Model):
                     if item["type"] == "text":
                         content.append({"type": "text", "text": item["text"]})
                     elif item["type"] == "image":
-                        # Handle image path or direct image object
-                        if isinstance(item["image"], str):
-                            # Image is a path
-                            with open(item["image"], "rb") as image_file:
-                                base64_image = base64.b64encode(image_file.read()).decode("utf-8")
-                        else:
-                            # Image is a PIL image or similar object
-                            img_byte_arr = BytesIO()
-                            base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
-                        content.append({
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{base64_image}"
-                            }
-                        })
             else:
                 # Plain text message
                 content = [{"type": "text", "text": msg["content"]}]
@@ -540,7 +540,7 @@ class QwenVLAPIModel(Model):
         """Call the Hugging Face OpenAI-compatible endpoint"""
         # Extract parameters with defaults
-        max_tokens = kwargs.get("max_new_tokens", 1024)
         temperature = kwargs.get("temperature", 0.7)
         top_p = kwargs.get("top_p", 0.9)
         stream = kwargs.get("stream", False)
@@ -571,7 +571,7 @@ class QwenVLAPIModel(Model):
         completion = self.hyperbolic_client.chat.completions.create(
             model=self.model_path,
             messages=formatted_messages,
-            max_tokens=kwargs.get("max_new_tokens", 1024),
             temperature=kwargs.get("temperature", 0.7),
             top_p=kwargs.get("top_p", 0.9),
             stop=stop_sequences

         tools: List[tool] = None,
         max_steps: int = 200,
         verbosity_level: LogLevel = 4,
+        planning_interval: int = 10,
         log_file = None,
         **kwargs
     ):
         memory_step.observations_images = [image.copy()]  # This takes the original image directly.
         # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
     def close(self):
         """Clean up resources"""
         )
         assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
         # Initialize HF OpenAI-compatible client if token is provided
         self.hf_client = None
         if hf_token:
                     if item["type"] == "text":
                         content.append({"type": "text", "text": item["text"]})
                     elif item["type"] == "image":
+                        # # Handle image path or direct image object
+                        # if isinstance(item["image"], str):
+                        #     # Image is a path
+                        #     with open(item["image"], "rb") as image_file:
+                        #         base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+                        # else:
+                        #     # Image is a PIL image or similar object
+                        #     img_byte_arr = BytesIO()
+                        #     base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+                        # content.append({
+                        #     "type": "image_url",
+                        #     "image_url": {
+                        #         "url": f"data:image/png;base64,{base64_image}"
+                        #     }
+                        # })
+                        pass
             else:
                 # Plain text message
                 content = [{"type": "text", "text": msg["content"]}]
         """Call the Hugging Face OpenAI-compatible endpoint"""
         # Extract parameters with defaults
+        max_tokens = kwargs.get("max_new_tokens", 4096)
         temperature = kwargs.get("temperature", 0.7)
         top_p = kwargs.get("top_p", 0.9)
         stream = kwargs.get("stream", False)
         completion = self.hyperbolic_client.chat.completions.create(
             model=self.model_path,
             messages=formatted_messages,
+            max_tokens=kwargs.get("max_new_tokens", 4096),
             temperature=kwargs.get("temperature", 0.7),
             top_p=kwargs.get("top_p", 0.9),
             stop=stop_sequences