Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Try logging pointer clicks
Browse files
- app.py +78 -2
- e2bqwen.py +22 -22
app.py
CHANGED
@@ -389,6 +389,82 @@ def get_or_create_sandbox(session_hash):
|
|
389 |
desktop.stream.start(require_auth=True)
|
390 |
setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
|
391 |
desktop.commands.run(setup_cmd)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
# Store sandbox with metadata
|
394 |
SANDBOXES[session_hash] = desktop
|
@@ -473,7 +549,7 @@ def create_agent(data_dir, desktop, log_file):
|
|
473 |
desktop=desktop,
|
474 |
max_steps=200,
|
475 |
verbosity_level=LogLevel.INFO,
|
476 |
-
planning_interval=
|
477 |
log_file = log_file
|
478 |
)
|
479 |
|
@@ -511,7 +587,7 @@ class EnrichedGradioUI(GradioUI):
|
|
511 |
1. Look at elements on the screen to determine what to click or interact with
|
512 |
2. Use precise coordinates for mouse movements and clicks
|
513 |
3. Wait for page loads or animations to complete using the wait() tool
|
514 |
-
4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
|
515 |
|
516 |
When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
|
517 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
|
|
389 |
desktop.stream.start(require_auth=True)
|
390 |
setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
|
391 |
desktop.commands.run(setup_cmd)
|
392 |
+
|
393 |
+
pointer_highlight_cmd = """#!/bin/bash
|
394 |
+
sudo apt update
|
395 |
+
sudo apt install -y x11-apps xinput
|
396 |
+
|
397 |
+
cat << 'EOF' > /tmp/click_marker.sh
|
398 |
+
#!/bin/bash
|
399 |
+
|
400 |
+
echo "$(date): Script started" >> /tmp/click_debug.log
|
401 |
+
|
402 |
+
# Hardcoded mouse ID from your output
|
403 |
+
MOUSE_ID=6
|
404 |
+
|
405 |
+
# Keep track of current absolute position
|
406 |
+
CURRENT_X=0
|
407 |
+
CURRENT_Y=0
|
408 |
+
|
409 |
+
|
410 |
+
# Monitor raw mouse events
|
411 |
+
xinput test $MOUSE_ID | while read event; do
|
412 |
+
echo "$(date): Event: $event" >> /tmp/click_debug.log
|
413 |
+
|
414 |
+
# Update position from motion events
|
415 |
+
if echo "$event" | grep -q "motion"; then
|
416 |
+
# Extract absolute position values
|
417 |
+
if echo "$event" | grep -q "absolute"; then
|
418 |
+
X_VAL=$(echo "$event" | grep -o "a\[0\]=.*" | cut -d= -f2 | cut -d' ' -f1)
|
419 |
+
Y_VAL=$(echo "$event" | grep -o "a\[1\]=.*" | cut -d= -f2 | cut -d' ' -f1)
|
420 |
+
|
421 |
+
if [ ! -z "$X_VAL" ]; then
|
422 |
+
CURRENT_X=$X_VAL
|
423 |
+
fi
|
424 |
+
if [ ! -z "$Y_VAL" ]; then
|
425 |
+
CURRENT_Y=$Y_VAL
|
426 |
+
fi
|
427 |
+
|
428 |
+
echo "$(date): Position updated to $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
|
429 |
+
fi
|
430 |
+
fi
|
431 |
+
|
432 |
+
# Check if this is a button press event
|
433 |
+
if echo "$event" | grep -q "button press"; then
|
434 |
+
echo "$(date): Button press detected at $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
|
435 |
+
|
436 |
+
# Show xlogo at current position
|
437 |
+
xlogo -geometry 40x40+$CURRENT_X+$CURRENT_Y &
|
438 |
+
LOGO_PID=$!
|
439 |
+
echo "$(date): Started xlogo with PID $LOGO_PID" >> /tmp/click_debug.log
|
440 |
+
|
441 |
+
# Keep xlogo open for 2 seconds
|
442 |
+
sleep 2
|
443 |
+
|
444 |
+
# Kill xlogo
|
445 |
+
kill $LOGO_PID 2>/dev/null
|
446 |
+
echo "$(date): Closed xlogo" >> /tmp/click_debug.log
|
447 |
+
fi
|
448 |
+
done
|
449 |
+
|
450 |
+
echo "$(date): Script exited unexpectedly" >> /tmp/click_debug.log
|
451 |
+
EOF
|
452 |
+
|
453 |
+
# Make the script executable
|
454 |
+
chmod +x /tmp/click_marker.sh
|
455 |
+
|
456 |
+
# Create a setup log entry
|
457 |
+
echo "Click marker setup completed at $(date)" > /tmp/click_marker_setup.log
|
458 |
+
|
459 |
+
# Launch the script with nohup to keep it running after terminal closes
|
460 |
+
nohup /tmp/click_marker.sh > /dev/null 2>&1 &
|
461 |
+
|
462 |
+
# Record the PID in the log file
|
463 |
+
echo "Running with PID: $!" >> /tmp/click_marker_setup.log
|
464 |
+
echo "To stop it, run: kill $!" >> /tmp/click_marker_setup.log
|
465 |
+
"""
|
466 |
+
desktop.commands.run(pointer_highlight_cmd)
|
467 |
+
|
468 |
|
469 |
# Store sandbox with metadata
|
470 |
SANDBOXES[session_hash] = desktop
|
|
|
549 |
desktop=desktop,
|
550 |
max_steps=200,
|
551 |
verbosity_level=LogLevel.INFO,
|
552 |
+
planning_interval=10,
|
553 |
log_file = log_file
|
554 |
)
|
555 |
|
|
|
587 |
1. Look at elements on the screen to determine what to click or interact with
|
588 |
2. Use precise coordinates for mouse movements and clicks
|
589 |
3. Wait for page loads or animations to complete using the wait() tool
|
590 |
+
4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked. In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
|
591 |
|
592 |
When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
|
593 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
e2bqwen.py
CHANGED
@@ -98,7 +98,7 @@ class E2BVisionAgent(CodeAgent):
|
|
98 |
tools: List[tool] = None,
|
99 |
max_steps: int = 200,
|
100 |
verbosity_level: LogLevel = 4,
|
101 |
-
planning_interval: int =
|
102 |
log_file = None,
|
103 |
**kwargs
|
104 |
):
|
@@ -340,8 +340,7 @@ class E2BVisionAgent(CodeAgent):
|
|
340 |
memory_step.observations_images = [image.copy()] # This takes the original image directly.
|
341 |
|
342 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
343 |
-
|
344 |
-
|
345 |
|
346 |
def close(self):
|
347 |
"""Clean up resources"""
|
@@ -458,7 +457,7 @@ class QwenVLAPIModel(Model):
|
|
458 |
)
|
459 |
|
460 |
assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
|
461 |
-
|
462 |
# Initialize HF OpenAI-compatible client if token is provided
|
463 |
self.hf_client = None
|
464 |
if hf_token:
|
@@ -512,22 +511,23 @@ class QwenVLAPIModel(Model):
|
|
512 |
if item["type"] == "text":
|
513 |
content.append({"type": "text", "text": item["text"]})
|
514 |
elif item["type"] == "image":
|
515 |
-
# Handle image path or direct image object
|
516 |
-
if isinstance(item["image"], str):
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
else:
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
content.append({
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
})
|
|
|
531 |
else:
|
532 |
# Plain text message
|
533 |
content = [{"type": "text", "text": msg["content"]}]
|
@@ -540,7 +540,7 @@ class QwenVLAPIModel(Model):
|
|
540 |
"""Call the Hugging Face OpenAI-compatible endpoint"""
|
541 |
|
542 |
# Extract parameters with defaults
|
543 |
-
max_tokens = kwargs.get("max_new_tokens",
|
544 |
temperature = kwargs.get("temperature", 0.7)
|
545 |
top_p = kwargs.get("top_p", 0.9)
|
546 |
stream = kwargs.get("stream", False)
|
@@ -571,7 +571,7 @@ class QwenVLAPIModel(Model):
|
|
571 |
completion = self.hyperbolic_client.chat.completions.create(
|
572 |
model=self.model_path,
|
573 |
messages=formatted_messages,
|
574 |
-
max_tokens=kwargs.get("max_new_tokens",
|
575 |
temperature=kwargs.get("temperature", 0.7),
|
576 |
top_p=kwargs.get("top_p", 0.9),
|
577 |
stop=stop_sequences
|
|
|
98 |
tools: List[tool] = None,
|
99 |
max_steps: int = 200,
|
100 |
verbosity_level: LogLevel = 4,
|
101 |
+
planning_interval: int = 10,
|
102 |
log_file = None,
|
103 |
**kwargs
|
104 |
):
|
|
|
340 |
memory_step.observations_images = [image.copy()] # This takes the original image directly.
|
341 |
|
342 |
# memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
|
343 |
+
|
|
|
344 |
|
345 |
def close(self):
|
346 |
"""Clean up resources"""
|
|
|
457 |
)
|
458 |
|
459 |
assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
|
460 |
+
|
461 |
# Initialize HF OpenAI-compatible client if token is provided
|
462 |
self.hf_client = None
|
463 |
if hf_token:
|
|
|
511 |
if item["type"] == "text":
|
512 |
content.append({"type": "text", "text": item["text"]})
|
513 |
elif item["type"] == "image":
|
514 |
+
# # Handle image path or direct image object
|
515 |
+
# if isinstance(item["image"], str):
|
516 |
+
# # Image is a path
|
517 |
+
# with open(item["image"], "rb") as image_file:
|
518 |
+
# base64_image = base64.b64encode(image_file.read()).decode("utf-8")
|
519 |
+
# else:
|
520 |
+
# # Image is a PIL image or similar object
|
521 |
+
# img_byte_arr = BytesIO()
|
522 |
+
# base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
523 |
+
|
524 |
+
# content.append({
|
525 |
+
# "type": "image_url",
|
526 |
+
# "image_url": {
|
527 |
+
# "url": f"data:image/png;base64,{base64_image}"
|
528 |
+
# }
|
529 |
+
# })
|
530 |
+
pass
|
531 |
else:
|
532 |
# Plain text message
|
533 |
content = [{"type": "text", "text": msg["content"]}]
|
|
|
540 |
"""Call the Hugging Face OpenAI-compatible endpoint"""
|
541 |
|
542 |
# Extract parameters with defaults
|
543 |
+
max_tokens = kwargs.get("max_new_tokens", 4096)
|
544 |
temperature = kwargs.get("temperature", 0.7)
|
545 |
top_p = kwargs.get("top_p", 0.9)
|
546 |
stream = kwargs.get("stream", False)
|
|
|
571 |
completion = self.hyperbolic_client.chat.completions.create(
|
572 |
model=self.model_path,
|
573 |
messages=formatted_messages,
|
574 |
+
max_tokens=kwargs.get("max_new_tokens", 4096),
|
575 |
temperature=kwargs.get("temperature", 0.7),
|
576 |
top_p=kwargs.get("top_p", 0.9),
|
577 |
stop=stop_sequences
|