m-ric (HF Staff) committed
Commit ba38624 · Parent: a088a9f

Working agent with saving after task

Files changed (2)
  1. app.py +18 -17
  2. e2bqwen.py +127 -49
app.py CHANGED
@@ -387,8 +387,7 @@ def get_or_create_sandbox(session_hash):
     print(f"Creating new sandbox for session {session_hash}")
     desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
     desktop.stream.start(require_auth=True)
-    setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}'
-    sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
+    setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
     desktop.commands.run(setup_cmd)
 
     # Store sandbox with metadata
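Review note on the hunk above: the old setup_cmd was two separate shell statements, so the echo output never reached policies.json; the fix is the added pipe into sudo tee. A minimal sketch of the same pipeline, runnable outside the sandbox (temp directory instead of sudo and the real /usr/lib/firefox-esr path, purely illustrative):

    import json, os, subprocess, tempfile

    # Hypothetical local stand-in; the real command targets
    # /usr/lib/firefox-esr/distribution/policies.json inside the E2B desktop.
    dist_dir = tempfile.mkdtemp()
    policies = {"policies": {"OverrideFirstRunPage": "", "OverridePostUpdatePage": "",
                             "DisableProfileImport": True, "DontCheckDefaultBrowser": True}}

    # echo '<json>' | tee <path> -- the pipe is exactly what the fixed setup_cmd adds.
    cmd = f"echo '{json.dumps(policies)}' | tee {dist_dir}/policies.json > /dev/null"
    subprocess.run(cmd, shell=True, check=True)

    with open(os.path.join(dist_dir, "policies.json")) as f:
        assert json.load(f)["policies"]["DisableProfileImport"] is True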
@@ -486,6 +485,7 @@ class EnrichedGradioUI(GradioUI):
     text_input,
     gr.Button(interactive=False),
     )
+
     def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
         import gradio as gr
 
@@ -517,24 +517,24 @@ class EnrichedGradioUI(GradioUI):
         We can only execute one action at a time. On each step, answer only a python blob with the action to perform
         """)
 
-        try:
-            stored_messages.append(gr.ChatMessage(role="user", content=task_input))
-            yield stored_messages
-
-            for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
-                stored_messages.append(msg)
-                yield stored_messages
-
-            yield stored_messages
-            save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
-        except Exception as e:
-            error_message=f"Error in interaction: {str(e)}"
-            stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
-            yield stored_messages
-            save_final_status(data_dir, "failed", details = str(error_message))
-
-        finally:
-            upload_to_hf_and_remove(data_dir)
+        # try:
+        stored_messages.append(gr.ChatMessage(role="user", content=task_input))
+        yield stored_messages
+
+        for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
+            stored_messages.append(msg)
+            yield stored_messages
+
+        yield stored_messages
+        # save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
+        # except Exception as e:
+        #     error_message=f"Error in interaction: {str(e)}"
+        #     stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
+        #     yield stored_messages
+        #     save_final_status(data_dir, "failed", details = str(error_message))
+        #
+        # finally:
+        #     upload_to_hf_and_remove(data_dir)
 
 theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
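Aside: interact_with_agent above works because Gradio treats a generator event handler as a stream, re-rendering the Chatbot every time it yields the accumulated message list. A self-contained sketch of the same pattern, with a fake streamer standing in for smolagents' stream_to_gradio (all names here are illustrative):

    import time
    import gradio as gr

    def fake_stream(task):
        # Stand-in for stream_to_gradio: emits agent messages one at a time.
        for step in ("Thinking...", f"Executing: {task}", "Done."):
            time.sleep(0.5)
            yield gr.ChatMessage(role="assistant", content=step)

    def interact(task, history):
        history = (history or []) + [gr.ChatMessage(role="user", content=task)]
        yield history                  # render the user turn immediately
        for msg in fake_stream(task):
            history.append(msg)
            yield history              # re-render after every agent step

    with gr.Blocks() as demo:
        chat = gr.Chatbot(type="messages")
        box = gr.Textbox()
        box.submit(interact, [box, chat], chat)

    demo.launch()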
@@ -568,6 +568,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
     "Write 'Hello World' in a text editor",
     "Search a flight Paris - Berlin for tomorrow",
     "Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
+    "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
     ],
     inputs = task_input,
     label= "Example Tasks",
 
e2bqwen.py CHANGED
@@ -135,9 +135,11 @@ class E2BVisionAgent(CodeAgent):
         # Add default tools
         self._setup_desktop_tools()
         self.logger.log("Setting up agent tools...")
-        self.step_callbacks.append(self.take_snapshot_callback)
+        self.step_callbacks.append(self.take_screenshot_callback)
         self.logger.log("Studying an action plan... that will take a bit.")
 
+        self.final_answer_checks = [self.store_metadata_to_file]
+
     def _setup_desktop_tools(self):
         """Register all desktop tools"""
         @tool
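Review note on self.final_answer_checks: in smolagents these callables run when the agent proposes a final answer, receiving (final_answer, memory); a falsy return (or an exception) rejects the answer, which is why store_metadata_to_file now ends with return True. A minimal sketch of the hook, assuming that signature (persist_run_log is a hypothetical helper):

    from smolagents import CodeAgent, HfApiModel

    def persist_run_log(final_answer, memory) -> bool:
        # Side effect: dump a succinct step trace, then accept the answer.
        with open("run_log.txt", "w") as f:
            for step in memory.steps:
                f.write(f"{type(step).__name__}: {getattr(step, 'step_number', '-')}\n")
        return True  # False would send the agent back for more steps

    agent = CodeAgent(tools=[], model=HfApiModel())
    agent.final_answer_checks = [persist_run_log]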
@@ -296,55 +298,50 @@ class E2BVisionAgent(CodeAgent):
         self.tools["drag_and_drop"] = drag_and_drop
 
 
-    def store_metadata_to_file(self, agent) -> None:
+    def store_metadata_to_file(self, final_answer, memory) -> None:
         metadata_path = os.path.join(self.data_dir, "metadata.json")
         output = {}
-        output_memory = self.write_memory_to_messages()
+        # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
+        for memory_step in self.memory.steps:
+            if getattr(memory_step, "observations_images", None):
+                memory_step.observations_images = None
         a = open(metadata_path,"w")
-        a.write(json.dumps(output_memory))
+        a.write(json.dumps(self.write_memory_to_messages()))
         a.close()
+        return True
 
 
-    def take_snapshot_callback(self, memory_step: ActionStep, agent=None) -> None:
+    def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
         """Callback that takes a screenshot + memory snapshot after a step completes"""
         self.logger.log(self.log_path, "Analyzing screen content...")
 
         current_step = memory_step.step_number
         print(f"Taking screenshot for step {current_step}")
-        # Check if desktop is still running
-        if not self.desktop.is_running():
-            print("Desktop is no longer running. Terminating agent.")
-            self.close()
-            # Add a final observation indicating why the agent was terminated
-            memory_step.observations = "Desktop session ended. Agent terminated."
-            # Store final metadata before exiting
-            self.store_metadata_to_file(agent)
-            return # Exit the callback without attempting to take a screenshot
 
-        try:
-            time.sleep(2.0) # Let things happen on the desktop
-            screenshot_bytes = self.desktop.screenshot()
-            image = Image.open(BytesIO(screenshot_bytes))
+        time.sleep(2.0) # Let things happen on the desktop
+        screenshot_bytes = self.desktop.screenshot()
+        image = Image.open(BytesIO(screenshot_bytes))
 
-            # Create a filename with step number
-            screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
-            image.save(screenshot_path)
-            print(f"Saved screenshot to {screenshot_path}")
+        # Create a filename with step number
+        screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
+        image.save(screenshot_path)
+        print(f"Saved screenshot to {screenshot_path}")
 
-            for previous_memory_step in agent.memory.steps: # Remove previous screenshots from logs for lean processing
-                if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
-                    previous_memory_step.observations_images = None
+        for (
+            previous_memory_step
+        ) in agent.memory.steps: # Remove previous screenshots from logs for lean processing
+            if (
+                isinstance(previous_memory_step, ActionStep)
+                and previous_memory_step.step_number <= current_step - 2
+            ):
+                previous_memory_step.observations_images = None
 
-            # Add to the current memory step
-            memory_step.observations_images = [image.copy()] # This takes the original image directly.
-            # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
+        # Add to the current memory step
+        memory_step.observations_images = [image.copy()] # This takes the original image directly.
 
-            #Storing memory and metadata to file:
-            self.store_metadata_to_file(agent)
-
-        except Exception as e:
-            print(f"Error taking screenshot: {e}")
+        # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
+
 
     def close(self):
         """Clean up resources"""
@@ -356,6 +353,87 @@ class E2BVisionAgent(CodeAgent):
         self.desktop.kill()
         print("E2B sandbox terminated")
 
+from smolagents import HfApiModel
+
+# class QwenVLAPIModel(Model):
+#     """Model wrapper for Qwen2.5VL API with fallback mechanism"""
+
+#     def __init__(
+#         self,
+#         model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
+#         provider: str = "hyperbolic",
+#         hf_token: str = None,
+#         hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud"
+#     ):
+#         super().__init__()
+#         self.model_id = model_path
+#         self.hf_base_url = hf_base_url
+#         self.dedicated_endpoint_model = HfApiModel(
+#             hf_base_url,
+#             token=hf_token
+#         )
+#         self.fallback_model = HfApiModel(
+#             model_path,
+#             provider=provider,
+#             token=hf_token,
+#         )
+
+#     def __call__(
+#         self,
+#         messages: List[Dict[str, Any]],
+#         stop_sequences: Optional[List[str]] = None,
+#         **kwargs
+#     ) -> ChatMessage:
+
+#         try:
+#             return self.dedicated_endpoint_model(messages, stop_sequences, **kwargs)
+#         except Exception as e:
+#             print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
+
+#         # Continue to fallback
+#         try:
+#             return self.fallback_model(messages, stop_sequences, **kwargs)
+#         except Exception as e:
+#             raise Exception(f"Both endpoints failed. Last error: {e}")
+
+#     def _format_messages(self, messages: List[Dict[str, Any]]):
+#         """Format messages for API requests - works for both endpoints"""
+
+#         formatted_messages = []
+
+#         for msg in messages:
+#             role = msg["role"]
+#             content = []
+
+#             if isinstance(msg["content"], list):
+#                 for item in msg["content"]:
+#                     if item["type"] == "text":
+#                         content.append({"type": "text", "text": item["text"]})
+#                     elif item["type"] == "image":
+#                         # Handle image path or direct image object
+#                         if isinstance(item["image"], str):
+#                             # Image is a path
+#                             with open(item["image"], "rb") as image_file:
+#                                 base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+#                         else:
+#                             # Image is a PIL image or similar object
+#                             img_byte_arr = BytesIO()
+#                             item["image"].save(img_byte_arr, format="PNG")
+#                             base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+
+#                         content.append({
+#                             "type": "image_url",
+#                             "image_url": {
+#                                 "url": f"data:image/png;base64,{base64_image}"
+#                             }
+#                         })
+#             else:
+#                 # Plain text message
+#                 content = [{"type": "text", "text": msg["content"]}]
+
+#             formatted_messages.append({"role": role, "content": content})
+
+#         return formatted_messages
 
 class QwenVLAPIModel(Model):
     """Model wrapper for Qwen2.5VL API with fallback mechanism"""
@@ -401,18 +479,18 @@ class QwenVLAPIModel(Model):
         # Format messages once for both APIs
         formatted_messages = self._format_messages(messages)
 
-        # First try the HF endpoint if available
-        if self.hf_client:
-            try:
-                completion = self._call_hf_endpoint(
-                    formatted_messages,
-                    stop_sequences,
-                    **kwargs
-                )
-                return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
-            except Exception as e:
-                print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
-                # Continue to fallback
+        # First try the HF endpoint if available - THIS ALWAYS FAILS SO SKIPPING
+        # if self.hf_client:
+        #     try:
+        #         completion = self._call_hf_endpoint(
+        #             formatted_messages,
+        #             stop_sequences,
+        #             **kwargs
+        #         )
+        #         return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
+        #     except Exception as e:
+        #         print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
+        #         # Continue to fallback
 
         # Fallback to hyperbolic
         try:
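Review note: with the endpoint branch commented out above, only the Hyperbolic path ever runs; the two-tier design now survives only in the commented-out draft class earlier in the file. If it comes back, the fallback itself is small. A sketch assuming smolagents' HfApiModel call convention (the endpoint URL and HF_TOKEN are placeholders, not from the source):

    from smolagents import HfApiModel

    class FallbackModel:
        """Try a primary model; on any exception, retry on a fallback."""
        def __init__(self, primary, fallback):
            self.primary, self.fallback = primary, fallback

        def __call__(self, messages, stop_sequences=None, **kwargs):
            try:
                return self.primary(messages, stop_sequences, **kwargs)
            except Exception as e:
                print(f"Primary model failed ({e}); falling back.")
                return self.fallback(messages, stop_sequences, **kwargs)

    # model = FallbackModel(
    #     HfApiModel("https://<your-endpoint>.endpoints.huggingface.cloud", token=HF_TOKEN),
    #     HfApiModel("Qwen/Qwen2.5-VL-72B-Instruct", provider="hyperbolic", token=HF_TOKEN),
    # )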
@@ -442,7 +520,6 @@ class QwenVLAPIModel(Model):
         else:
             # Image is a PIL image or similar object
             img_byte_arr = BytesIO()
-            item["image"].save(img_byte_arr, format="PNG")
             base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
 
         content.append({
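Review note on the hunk above: deleting the save call leaves img_byte_arr empty, so the base64 line now encodes zero bytes for PIL images. For reference, the usual PIL-to-data-URL round trip that the removed line performed (standalone sketch):

    import base64
    from io import BytesIO
    from PIL import Image

    image = Image.new("RGB", (8, 8), "red")   # stand-in for item["image"]
    buf = BytesIO()
    image.save(buf, format="PNG")             # the line removed in this hunk
    data_url = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")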
@@ -463,7 +540,7 @@ class QwenVLAPIModel(Model):
         """Call the Hugging Face OpenAI-compatible endpoint"""
 
         # Extract parameters with defaults
-        max_tokens = kwargs.get("max_new_tokens", 512)
+        max_tokens = kwargs.get("max_new_tokens", 1024)
         temperature = kwargs.get("temperature", 0.7)
         top_p = kwargs.get("top_p", 0.9)
         stream = kwargs.get("stream", False)
@@ -494,9 +571,10 @@ class QwenVLAPIModel(Model):
         completion = self.hyperbolic_client.chat.completions.create(
             model=self.model_path,
             messages=formatted_messages,
-            max_tokens=kwargs.get("max_new_tokens", 512),
+            max_tokens=kwargs.get("max_new_tokens", 1024),
             temperature=kwargs.get("temperature", 0.7),
             top_p=kwargs.get("top_p", 0.9),
+            stop=stop_sequences
         )
 
         # Extract the response text
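Aside: Hyperbolic serves an OpenAI-compatible API, so the stop= parameter added here is the standard Chat Completions field; passing smolagents' stop sequences keeps the model from running past its code block. A sketch of an equivalent raw call (the base URL and model name are assumptions, matching what this file appears to target):

    import os
    from openai import OpenAI

    client = OpenAI(base_url="https://api.hyperbolic.xyz/v1",
                    api_key=os.environ["HYPERBOLIC_API_KEY"])

    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-72B-Instruct",
        messages=[{"role": "user", "content": "Say hi, then stop."}],
        max_tokens=1024,
        temperature=0.7,
        top_p=0.9,
        stop=["<end_code>"],  # smolagents' code-block delimiter
    )
    print(completion.choices[0].message.content)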
 