m-ric HF Staff committed on
Commit
c5a6fe8
·
1 Parent(s): ba38624

Try logging pointer clicks

Browse files
Files changed (2) hide show
  1. app.py +78 -2
  2. e2bqwen.py +22 -22
app.py CHANGED
@@ -389,6 +389,82 @@ def get_or_create_sandbox(session_hash):
389
  desktop.stream.start(require_auth=True)
390
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
391
  desktop.commands.run(setup_cmd)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  # Store sandbox with metadata
394
  SANDBOXES[session_hash] = desktop
@@ -473,7 +549,7 @@ def create_agent(data_dir, desktop, log_file):
473
  desktop=desktop,
474
  max_steps=200,
475
  verbosity_level=LogLevel.INFO,
476
- planning_interval=5,
477
  log_file = log_file
478
  )
479
 
@@ -511,7 +587,7 @@ class EnrichedGradioUI(GradioUI):
511
  1. Look at elements on the screen to determine what to click or interact with
512
  2. Use precise coordinates for mouse movements and clicks
513
  3. Wait for page loads or animations to complete using the wait() tool
514
- 4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
515
 
516
  When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
517
  We can only execute one action at a time. On each step, answer only a python blob with the action to perform
 
389
  desktop.stream.start(require_auth=True)
390
  setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
391
  desktop.commands.run(setup_cmd)
392
+
393
+ pointer_highlight_cmd = """#!/bin/bash
394
+ sudo apt update
395
+ sudo apt install -y x11-apps xinput
396
+
397
+ cat << 'EOF' > /tmp/click_marker.sh
398
+ #!/bin/bash
399
+
400
+ echo "$(date): Script started" >> /tmp/click_debug.log
401
+
402
+ # Hardcoded mouse ID from your output
403
+ MOUSE_ID=6
404
+
405
+ # Keep track of current absolute position
406
+ CURRENT_X=0
407
+ CURRENT_Y=0
408
+
409
+
410
+ # Monitor raw mouse events
411
+ xinput test $MOUSE_ID | while read event; do
412
+ echo "$(date): Event: $event" >> /tmp/click_debug.log
413
+
414
+ # Update position from motion events
415
+ if echo "$event" | grep -q "motion"; then
416
+ # Extract absolute position values
417
+ if echo "$event" | grep -q "absolute"; then
418
+ X_VAL=$(echo "$event" | grep -o "a\[0\]=.*" | cut -d= -f2 | cut -d' ' -f1)
419
+ Y_VAL=$(echo "$event" | grep -o "a\[1\]=.*" | cut -d= -f2 | cut -d' ' -f1)
420
+
421
+ if [ ! -z "$X_VAL" ]; then
422
+ CURRENT_X=$X_VAL
423
+ fi
424
+ if [ ! -z "$Y_VAL" ]; then
425
+ CURRENT_Y=$Y_VAL
426
+ fi
427
+
428
+ echo "$(date): Position updated to $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
429
+ fi
430
+ fi
431
+
432
+ # Check if this is a button press event
433
+ if echo "$event" | grep -q "button press"; then
434
+ echo "$(date): Button press detected at $CURRENT_X,$CURRENT_Y" >> /tmp/click_debug.log
435
+
436
+ # Show xlogo at current position
437
+ xlogo -geometry 40x40+$CURRENT_X+$CURRENT_Y &
438
+ LOGO_PID=$!
439
+ echo "$(date): Started xlogo with PID $LOGO_PID" >> /tmp/click_debug.log
440
+
441
+ # Keep xlogo open for 2 seconds
442
+ sleep 2
443
+
444
+ # Kill xlogo
445
+ kill $LOGO_PID 2>/dev/null
446
+ echo "$(date): Closed xlogo" >> /tmp/click_debug.log
447
+ fi
448
+ done
449
+
450
+ echo "$(date): Script exited unexpectedly" >> /tmp/click_debug.log
451
+ EOF
452
+
453
+ # Make the script executable
454
+ chmod +x /tmp/click_marker.sh
455
+
456
+ # Create a setup log entry
457
+ echo "Click marker setup completed at $(date)" > /tmp/click_marker_setup.log
458
+
459
+ # Launch the script with nohup to keep it running after terminal closes
460
+ nohup /tmp/click_marker.sh > /dev/null 2>&1 &
461
+
462
+ # Record the PID in the log file
463
+ echo "Running with PID: $!" >> /tmp/click_marker_setup.log
464
+ echo "To stop it, run: kill $!" >> /tmp/click_marker_setup.log
465
+ """
466
+ desktop.commands.run(pointer_highlight_cmd)
467
+
468
 
469
  # Store sandbox with metadata
470
  SANDBOXES[session_hash] = desktop
 
549
  desktop=desktop,
550
  max_steps=200,
551
  verbosity_level=LogLevel.INFO,
552
+ planning_interval=10,
553
  log_file = log_file
554
  )
555
 
 
587
  1. Look at elements on the screen to determine what to click or interact with
588
  2. Use precise coordinates for mouse movements and clicks
589
  3. Wait for page loads or animations to complete using the wait() tool
590
+ 4. Sometimes you may have missed a click, so never assume that you're on the right page, always make sure that your previous action worked. In the screenshot you can see if the mouse is out of the clickable area. Pay special attention to this.
591
 
592
  When you receive a task, break it down into step-by-step actions. On each step, look at the current screenshot to validate if previous steps worked and decide the next action.
593
  We can only execute one action at a time. On each step, answer only a python blob with the action to perform
e2bqwen.py CHANGED
@@ -98,7 +98,7 @@ class E2BVisionAgent(CodeAgent):
98
  tools: List[tool] = None,
99
  max_steps: int = 200,
100
  verbosity_level: LogLevel = 4,
101
- planning_interval: int = 15,
102
  log_file = None,
103
  **kwargs
104
  ):
@@ -340,8 +340,7 @@ class E2BVisionAgent(CodeAgent):
340
  memory_step.observations_images = [image.copy()] # This takes the original image directly.
341
 
342
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
343
-
344
-
345
 
346
  def close(self):
347
  """Clean up resources"""
@@ -458,7 +457,7 @@ class QwenVLAPIModel(Model):
458
  )
459
 
460
  assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
461
-
462
  # Initialize HF OpenAI-compatible client if token is provided
463
  self.hf_client = None
464
  if hf_token:
@@ -512,22 +511,23 @@ class QwenVLAPIModel(Model):
512
  if item["type"] == "text":
513
  content.append({"type": "text", "text": item["text"]})
514
  elif item["type"] == "image":
515
- # Handle image path or direct image object
516
- if isinstance(item["image"], str):
517
- # Image is a path
518
- with open(item["image"], "rb") as image_file:
519
- base64_image = base64.b64encode(image_file.read()).decode("utf-8")
520
- else:
521
- # Image is a PIL image or similar object
522
- img_byte_arr = BytesIO()
523
- base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
524
-
525
- content.append({
526
- "type": "image_url",
527
- "image_url": {
528
- "url": f"data:image/png;base64,{base64_image}"
529
- }
530
- })
 
531
  else:
532
  # Plain text message
533
  content = [{"type": "text", "text": msg["content"]}]
@@ -540,7 +540,7 @@ class QwenVLAPIModel(Model):
540
  """Call the Hugging Face OpenAI-compatible endpoint"""
541
 
542
  # Extract parameters with defaults
543
- max_tokens = kwargs.get("max_new_tokens", 1024)
544
  temperature = kwargs.get("temperature", 0.7)
545
  top_p = kwargs.get("top_p", 0.9)
546
  stream = kwargs.get("stream", False)
@@ -571,7 +571,7 @@ class QwenVLAPIModel(Model):
571
  completion = self.hyperbolic_client.chat.completions.create(
572
  model=self.model_path,
573
  messages=formatted_messages,
574
- max_tokens=kwargs.get("max_new_tokens", 1024),
575
  temperature=kwargs.get("temperature", 0.7),
576
  top_p=kwargs.get("top_p", 0.9),
577
  stop=stop_sequences
 
98
  tools: List[tool] = None,
99
  max_steps: int = 200,
100
  verbosity_level: LogLevel = 4,
101
+ planning_interval: int = 10,
102
  log_file = None,
103
  **kwargs
104
  ):
 
340
  memory_step.observations_images = [image.copy()] # This takes the original image directly.
341
 
342
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
343
+
 
344
 
345
  def close(self):
346
  """Clean up resources"""
 
457
  )
458
 
459
  assert not self.hf_base_url.endswith("/v1/"), "Enter your base url without '/v1/' suffix."
460
+
461
  # Initialize HF OpenAI-compatible client if token is provided
462
  self.hf_client = None
463
  if hf_token:
 
511
  if item["type"] == "text":
512
  content.append({"type": "text", "text": item["text"]})
513
  elif item["type"] == "image":
514
+ # # Handle image path or direct image object
515
+ # if isinstance(item["image"], str):
516
+ # # Image is a path
517
+ # with open(item["image"], "rb") as image_file:
518
+ # base64_image = base64.b64encode(image_file.read()).decode("utf-8")
519
+ # else:
520
+ # # Image is a PIL image or similar object
521
+ # img_byte_arr = BytesIO()
522
+ # base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
523
+
524
+ # content.append({
525
+ # "type": "image_url",
526
+ # "image_url": {
527
+ # "url": f"data:image/png;base64,{base64_image}"
528
+ # }
529
+ # })
530
+ pass
531
  else:
532
  # Plain text message
533
  content = [{"type": "text", "text": msg["content"]}]
 
540
  """Call the Hugging Face OpenAI-compatible endpoint"""
541
 
542
  # Extract parameters with defaults
543
+ max_tokens = kwargs.get("max_new_tokens", 4096)
544
  temperature = kwargs.get("temperature", 0.7)
545
  top_p = kwargs.get("top_p", 0.9)
546
  stream = kwargs.get("stream", False)
 
571
  completion = self.hyperbolic_client.chat.completions.create(
572
  model=self.model_path,
573
  messages=formatted_messages,
574
+ max_tokens=kwargs.get("max_new_tokens", 4096),
575
  temperature=kwargs.get("temperature", 0.7),
576
  top_p=kwargs.get("top_p", 0.9),
577
  stop=stop_sequences