Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Working agent with saving after task
Browse files- app.py +18 -17
- e2bqwen.py +127 -49
app.py
CHANGED
@@ -387,8 +387,7 @@ def get_or_create_sandbox(session_hash):
|
|
387 |
print(f"Creating new sandbox for session {session_hash}")
|
388 |
desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
|
389 |
desktop.stream.start(require_auth=True)
|
390 |
-
setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}'
|
391 |
-
sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
|
392 |
desktop.commands.run(setup_cmd)
|
393 |
|
394 |
# Store sandbox with metadata
|
@@ -486,6 +485,7 @@ class EnrichedGradioUI(GradioUI):
|
|
486 |
text_input,
|
487 |
gr.Button(interactive=False),
|
488 |
)
|
|
|
489 |
def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
|
490 |
import gradio as gr
|
491 |
|
@@ -517,24 +517,24 @@ class EnrichedGradioUI(GradioUI):
|
|
517 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
518 |
""")
|
519 |
|
520 |
-
try:
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
525 |
-
stored_messages.append(msg)
|
526 |
-
yield stored_messages
|
527 |
|
|
|
|
|
528 |
yield stored_messages
|
529 |
-
save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
|
530 |
-
except Exception as e:
|
531 |
-
error_message=f"Error in interaction: {str(e)}"
|
532 |
-
stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
|
533 |
-
yield stored_messages
|
534 |
-
save_final_status(data_dir, "failed", details = str(error_message))
|
535 |
|
536 |
-
|
537 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
|
539 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
540 |
|
@@ -568,6 +568,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
|
|
568 |
"Write 'Hello World' in a text editor",
|
569 |
"Search a flight Paris - Berlin for tomorrow",
|
570 |
"Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
|
|
|
571 |
],
|
572 |
inputs = task_input,
|
573 |
label= "Example Tasks",
|
|
|
387 |
print(f"Creating new sandbox for session {session_hash}")
|
388 |
desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
|
389 |
desktop.stream.start(require_auth=True)
|
390 |
+
setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
|
|
|
391 |
desktop.commands.run(setup_cmd)
|
392 |
|
393 |
# Store sandbox with metadata
|
|
|
485 |
text_input,
|
486 |
gr.Button(interactive=False),
|
487 |
)
|
488 |
+
|
489 |
def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
|
490 |
import gradio as gr
|
491 |
|
|
|
517 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
518 |
""")
|
519 |
|
520 |
+
# try:
|
521 |
+
stored_messages.append(gr.ChatMessage(role="user", content=task_input))
|
522 |
+
yield stored_messages
|
|
|
|
|
|
|
|
|
523 |
|
524 |
+
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
525 |
+
stored_messages.append(msg)
|
526 |
yield stored_messages
|
|
|
|
|
|
|
|
|
|
|
|
|
527 |
|
528 |
+
yield stored_messages
|
529 |
+
# save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
|
530 |
+
# except Exception as e:
|
531 |
+
# error_message=f"Error in interaction: {str(e)}"
|
532 |
+
# stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
|
533 |
+
# yield stored_messages
|
534 |
+
# save_final_status(data_dir, "failed", details = str(error_message))
|
535 |
+
|
536 |
+
# finally:
|
537 |
+
# upload_to_hf_and_remove(data_dir)
|
538 |
|
539 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
540 |
|
|
|
568 |
"Write 'Hello World' in a text editor",
|
569 |
"Search a flight Paris - Berlin for tomorrow",
|
570 |
"Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
|
571 |
+
"Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
|
572 |
],
|
573 |
inputs = task_input,
|
574 |
label= "Example Tasks",
|
e2bqwen.py
CHANGED
@@ -135,9 +135,11 @@ class E2BVisionAgent(CodeAgent):
|
|
135 |
# Add default tools
|
136 |
self._setup_desktop_tools()
|
137 |
self.logger.log("Setting up agent tools...")
|
138 |
-
self.step_callbacks.append(self.
|
139 |
self.logger.log("Studying an action plan... that will take a bit.")
|
140 |
|
|
|
|
|
141 |
def _setup_desktop_tools(self):
|
142 |
"""Register all desktop tools"""
|
143 |
@tool
|
@@ -296,55 +298,50 @@ class E2BVisionAgent(CodeAgent):
|
|
296 |
self.tools["drag_and_drop"] = drag_and_drop
|
297 |
|
298 |
|
299 |
-
def store_metadata_to_file(self,
|
300 |
metadata_path = os.path.join(self.data_dir, "metadata.json")
|
301 |
output = {}
|
302 |
-
|
|
|
|
|
|
|
303 |
a = open(metadata_path,"w")
|
304 |
-
a.write(json.dumps(
|
305 |
a.close()
|
|
|
306 |
|
307 |
|
308 |
-
def
|
309 |
"""Callback that takes a screenshot + memory snapshot after a step completes"""
|
310 |
self.logger.log(self.log_path, "Analyzing screen content...")
|
311 |
|
312 |
current_step = memory_step.step_number
|
313 |
print(f"Taking screenshot for step {current_step}")
|
314 |
-
# Check if desktop is still running
|
315 |
-
if not self.desktop.is_running():
|
316 |
-
print("Desktop is no longer running. Terminating agent.")
|
317 |
-
self.close()
|
318 |
-
# Add a final observation indicating why the agent was terminated
|
319 |
-
memory_step.observations = "Desktop session ended. Agent terminated."
|
320 |
-
# Store final metadata before exiting
|
321 |
-
self.store_metadata_to_file(agent)
|
322 |
-
return # Exit the callback without attempting to take a screenshot
|
323 |
-
|
324 |
-
try:
|
325 |
-
time.sleep(2.0) # Let things happen on the desktop
|
326 |
-
screenshot_bytes = self.desktop.screenshot()
|
327 |
-
image = Image.open(BytesIO(screenshot_bytes))
|
328 |
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
print(f"Saved screenshot to {screenshot_path}")
|
333 |
|
334 |
-
|
335 |
-
|
336 |
-
|
|
|
337 |
|
338 |
-
|
339 |
-
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
341 |
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
|
346 |
-
|
347 |
-
|
|
|
348 |
|
349 |
def close(self):
|
350 |
"""Clean up resources"""
|
@@ -356,6 +353,87 @@ class E2BVisionAgent(CodeAgent):
|
|
356 |
self.desktop.kill()
|
357 |
print("E2B sandbox terminated")
|
358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
|
360 |
class QwenVLAPIModel(Model):
|
361 |
"""Model wrapper for Qwen2.5VL API with fallback mechanism"""
|
@@ -401,18 +479,18 @@ class QwenVLAPIModel(Model):
|
|
401 |
# Format messages once for both APIs
|
402 |
formatted_messages = self._format_messages(messages)
|
403 |
|
404 |
-
# First try the HF endpoint if available
|
405 |
-
if self.hf_client:
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
|
417 |
# Fallback to hyperbolic
|
418 |
try:
|
@@ -442,7 +520,6 @@ class QwenVLAPIModel(Model):
|
|
442 |
else:
|
443 |
# Image is a PIL image or similar object
|
444 |
img_byte_arr = BytesIO()
|
445 |
-
item["image"].save(img_byte_arr, format="PNG")
|
446 |
base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
447 |
|
448 |
content.append({
|
@@ -463,7 +540,7 @@ class QwenVLAPIModel(Model):
|
|
463 |
"""Call the Hugging Face OpenAI-compatible endpoint"""
|
464 |
|
465 |
# Extract parameters with defaults
|
466 |
-
max_tokens = kwargs.get("max_new_tokens",
|
467 |
temperature = kwargs.get("temperature", 0.7)
|
468 |
top_p = kwargs.get("top_p", 0.9)
|
469 |
stream = kwargs.get("stream", False)
|
@@ -494,9 +571,10 @@ class QwenVLAPIModel(Model):
|
|
494 |
completion = self.hyperbolic_client.chat.completions.create(
|
495 |
model=self.model_path,
|
496 |
messages=formatted_messages,
|
497 |
-
max_tokens=kwargs.get("max_new_tokens",
|
498 |
temperature=kwargs.get("temperature", 0.7),
|
499 |
top_p=kwargs.get("top_p", 0.9),
|
|
|
500 |
)
|
501 |
|
502 |
# Extract the response text
|
|
|
135 |
# Add default tools
|
136 |
self._setup_desktop_tools()
|
137 |
self.logger.log("Setting up agent tools...")
|
138 |
+
self.step_callbacks.append(self.take_screenshot_callback)
|
139 |
self.logger.log("Studying an action plan... that will take a bit.")
|
140 |
|
141 |
+
self.final_answer_checks = [self.store_metadata_to_file]
|
142 |
+
|
143 |
def _setup_desktop_tools(self):
|
144 |
"""Register all desktop tools"""
|
145 |
@tool
|
|
|
298 |
self.tools["drag_and_drop"] = drag_and_drop
|
299 |
|
300 |
|
301 |
+
def store_metadata_to_file(self, final_answer, memory) -> bool:
    """Write the agent's message history to ``metadata.json`` in ``self.data_dir``.

    The ``(final_answer, memory)`` signature exists because this method is
    registered in ``self.final_answer_checks``; neither argument is read here,
    the method works on ``self.memory`` instead.

    WARNING: this erases ``observations_images`` from every step in
    ``self.memory.steps`` before serializing. Use with caution.

    Args:
        final_answer: Unused.
        memory: Unused.

    Returns:
        Always ``True`` (presumably so the final-answer check passes --
        confirm against the smolagents contract).

    Raises:
        TypeError: If the message history is not JSON-serializable. In that
            case no file is created or truncated.
    """
    metadata_path = os.path.join(self.data_dir, "metadata.json")

    # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
    for memory_step in self.memory.steps:
        if getattr(memory_step, "observations_images", None):
            memory_step.observations_images = None

    # Serialize first: opening with "w" truncates the file, so a failure in
    # json.dumps after opening would leave an empty metadata.json behind.
    serialized = json.dumps(self.write_memory_to_messages())

    # The context manager closes the handle even if the write fails.
    with open(metadata_path, "w", encoding="utf-8") as metadata_file:
        metadata_file.write(serialized)
    return True
|
312 |
|
313 |
|
314 |
+
def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
    """Callback that takes a screenshot + memory snapshot after a step completes.

    Saves the screenshot as ``step_NNN.png`` in ``self.data_dir``, attaches a
    copy of it to ``memory_step``, and clears the screenshots of steps that
    are at least two steps old.

    Args:
        memory_step: The step that just completed; its ``step_number`` names
            the screenshot file and it receives the new image.
        agent: Agent whose ``memory.steps`` are pruned. Falls back to ``self``
            when omitted, so the callback can also be invoked without it.
    """
    self.logger.log(self.log_path, "Analyzing screen content...")

    current_step = memory_step.step_number
    print(f"Taking screenshot for step {current_step}")

    time.sleep(2.0)  # Let things happen on the desktop
    screenshot_bytes = self.desktop.screenshot()
    image = Image.open(BytesIO(screenshot_bytes))

    # Create a filename with step number
    screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
    image.save(screenshot_path)
    print(f"Saved screenshot to {screenshot_path}")

    # The original dereferenced `agent` unconditionally even though it
    # defaults to None; fall back to this agent's own memory in that case.
    owner = self if agent is None else agent

    # Remove previous screenshots from logs for lean processing
    for previous_memory_step in owner.memory.steps:
        if (
            isinstance(previous_memory_step, ActionStep)
            and previous_memory_step.step_number <= current_step - 2
        ):
            previous_memory_step.observations_images = None

    # Add to the current memory step. A copy of the image is stored, not the
    # object opened from the screenshot bytes.
    memory_step.observations_images = [image.copy()]

    # NOTE: storing [screenshot_path] here instead of the image breaks
    # launching a second task.
|
343 |
+
|
344 |
+
|
345 |
|
346 |
def close(self):
|
347 |
"""Clean up resources"""
|
|
|
353 |
self.desktop.kill()
|
354 |
print("E2B sandbox terminated")
|
355 |
|
356 |
+
from smolagents import HfApiModel
|
357 |
+
|
358 |
+
# class QwenVLAPIModel(Model):
|
359 |
+
# """Model wrapper for Qwen2.5VL API with fallback mechanism"""
|
360 |
+
|
361 |
+
# def __init__(
|
362 |
+
# self,
|
363 |
+
# model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
|
364 |
+
# provider: str = "hyperbolic",
|
365 |
+
# hf_token: str = None,
|
366 |
+
# hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud"
|
367 |
+
# ):
|
368 |
+
# super().__init__()
|
369 |
+
# self.model_id = model_path
|
370 |
+
# self.hf_base_url = hf_base_url
|
371 |
+
# self.dedicated_endpoint_model = HfApiModel(
|
372 |
+
# hf_base_url,
|
373 |
+
# token=hf_token
|
374 |
+
# )
|
375 |
+
# self.fallback_model = HfApiModel(
|
376 |
+
# model_path,
|
377 |
+
# provider=provider,
|
378 |
+
# token=hf_token,
|
379 |
+
# )
|
380 |
+
|
381 |
+
# def __call__(
|
382 |
+
# self,
|
383 |
+
# messages: List[Dict[str, Any]],
|
384 |
+
# stop_sequences: Optional[List[str]] = None,
|
385 |
+
# **kwargs
|
386 |
+
# ) -> ChatMessage:
|
387 |
+
|
388 |
+
# try:
|
389 |
+
# return self.dedicated_endpoint_model(messages, stop_sequences, **kwargs)
|
390 |
+
# except Exception as e:
|
391 |
+
# print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
|
392 |
+
|
393 |
+
# # Continue to fallback
|
394 |
+
# try:
|
395 |
+
# return self.fallback_model(messages, stop_sequences, **kwargs)
|
396 |
+
# except Exception as e:
|
397 |
+
# raise Exception(f"Both endpoints failed. Last error: {e}")
|
398 |
+
|
399 |
+
# def _format_messages(self, messages: List[Dict[str, Any]]):
|
400 |
+
# """Format messages for API requests - works for both endpoints"""
|
401 |
+
|
402 |
+
# formatted_messages = []
|
403 |
+
|
404 |
+
# for msg in messages:
|
405 |
+
# role = msg["role"]
|
406 |
+
# content = []
|
407 |
+
|
408 |
+
# if isinstance(msg["content"], list):
|
409 |
+
# for item in msg["content"]:
|
410 |
+
# if item["type"] == "text":
|
411 |
+
# content.append({"type": "text", "text": item["text"]})
|
412 |
+
# elif item["type"] == "image":
|
413 |
+
# # Handle image path or direct image object
|
414 |
+
# if isinstance(item["image"], str):
|
415 |
+
# # Image is a path
|
416 |
+
# with open(item["image"], "rb") as image_file:
|
417 |
+
# base64_image = base64.b64encode(image_file.read()).decode("utf-8")
|
418 |
+
# else:
|
419 |
+
# # Image is a PIL image or similar object
|
420 |
+
# img_byte_arr = BytesIO()
|
421 |
+
# item["image"].save(img_byte_arr, format="PNG")
|
422 |
+
# base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
423 |
+
|
424 |
+
# content.append({
|
425 |
+
# "type": "image_url",
|
426 |
+
# "image_url": {
|
427 |
+
# "url": f"data:image/png;base64,{base64_image}"
|
428 |
+
# }
|
429 |
+
# })
|
430 |
+
# else:
|
431 |
+
# # Plain text message
|
432 |
+
# content = [{"type": "text", "text": msg["content"]}]
|
433 |
+
|
434 |
+
# formatted_messages.append({"role": role, "content": content})
|
435 |
+
|
436 |
+
# return formatted_messages
|
437 |
|
438 |
class QwenVLAPIModel(Model):
|
439 |
"""Model wrapper for Qwen2.5VL API with fallback mechanism"""
|
|
|
479 |
# Format messages once for both APIs
|
480 |
formatted_messages = self._format_messages(messages)
|
481 |
|
482 |
+
# First try the HF endpoint if available - THIS ALWAYS FAILS SO SKIPPING
|
483 |
+
# if self.hf_client:
|
484 |
+
# try:
|
485 |
+
# completion = self._call_hf_endpoint(
|
486 |
+
# formatted_messages,
|
487 |
+
# stop_sequences,
|
488 |
+
# **kwargs
|
489 |
+
# )
|
490 |
+
# return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
|
491 |
+
# except Exception as e:
|
492 |
+
# print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
|
493 |
+
# # Continue to fallback
|
494 |
|
495 |
# Fallback to hyperbolic
|
496 |
try:
|
|
|
520 |
else:
|
521 |
# Image is a PIL image or similar object
|
522 |
img_byte_arr = BytesIO()
|
|
|
523 |
base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
524 |
|
525 |
content.append({
|
|
|
540 |
"""Call the Hugging Face OpenAI-compatible endpoint"""
|
541 |
|
542 |
# Extract parameters with defaults
|
543 |
+
max_tokens = kwargs.get("max_new_tokens", 1024)
|
544 |
temperature = kwargs.get("temperature", 0.7)
|
545 |
top_p = kwargs.get("top_p", 0.9)
|
546 |
stream = kwargs.get("stream", False)
|
|
|
571 |
completion = self.hyperbolic_client.chat.completions.create(
|
572 |
model=self.model_path,
|
573 |
messages=formatted_messages,
|
574 |
+
max_tokens=kwargs.get("max_new_tokens", 1024),
|
575 |
temperature=kwargs.get("temperature", 0.7),
|
576 |
top_p=kwargs.get("top_p", 0.9),
|
577 |
+
stop=stop_sequences
|
578 |
)
|
579 |
|
580 |
# Extract the response text
|