Implement replay function
- app.py +18 -9
- e2bqwen.py +11 -4
- model_replay.py +10 -7
app.py
CHANGED
@@ -12,7 +12,7 @@ from e2b_desktop import Sandbox
 from smolagents import CodeAgent
 from smolagents.monitoring import LogLevel
 from smolagents.gradio_ui import GradioUI, stream_to_gradio
-from model_replay import
+from model_replay import FakeModelReplayLog
 
 from e2bqwen import QwenVLAPIModel, E2BVisionAgent
 
@@ -488,7 +488,7 @@ class EnrichedGradioUI(GradioUI):
             gr.Button(interactive=False),
         )
 
-    def interact_with_agent(self, task_input, stored_messages, session_state, session_hash,
+    def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
         import gradio as gr
 
         interaction_id = generate_interaction_id(request)
@@ -504,9 +504,10 @@ class EnrichedGradioUI(GradioUI):
         else:
             session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
 
-        if replay_log is not None:
+        if "replay_log" in session_state and session_state["replay_log"] is not None:
             original_model = session_state["agent"].model
-            session_state["agent"].model = FakeModelReplayLog(replay_log)
+            session_state["agent"].model = FakeModelReplayLog(session_state["replay_log"])
+
 
         try:
             stored_messages.append(gr.ChatMessage(role="user", content=task_input))
@@ -539,8 +540,9 @@ class EnrichedGradioUI(GradioUI):
             save_final_status(data_dir, "failed", summary=[], error_message=error_message)
 
         finally:
-            if replay_log: # Replace the model with original model
+            if "replay_log" in session_state and session_state["replay_log"] is not None: # Replace the model with original model
                 session_state["agent"].model = original_model
+                session_state["replay_log"] = None
             upload_to_hf_and_remove(data_dir)
 
 theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
@@ -573,7 +575,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
             "Check the commuting time between Bern and Zurich on Google maps",
             "Write 'Hello World' in a text editor",
             "Search a flight Paris - Berlin for tomorrow",
-            "
+            "Search for Château de Fontainebleau in Google Maps",
             "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
         ],
         inputs = task_input,
@@ -685,9 +687,10 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
         fn=clear_and_set_view_only,
         inputs=[task_input],
         outputs=[sandbox_html]
-    )
+    )
+    view_only_event.then(
         agent_ui.interact_with_agent,
-        inputs=[task_input, stored_messages, session_state, session_hash_state
+        inputs=[task_input, stored_messages, session_state, session_hash_state],
         outputs=[chatbot_display]
     ).then(
         fn=set_interactive,
@@ -695,13 +698,19 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
         outputs=[sandbox_html]
     )
 
+    def set_logs_source(session_state):
+        session_state["replay_log"] = "udupp2fyavq_1743170323"
+
     replay_btn.click(
         fn=clear_and_set_view_only,
         inputs=[task_input],
         outputs=[sandbox_html]
+    ).then(
+        set_logs_source,
+        inputs=[session_state]
     ).then(
         agent_ui.interact_with_agent,
-        inputs=[task_input, stored_messages, session_state, session_hash_state
+        inputs=[task_input, stored_messages, session_state, session_hash_state],
         outputs=[chatbot_display]
     ).then(
         fn=set_interactive,
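Note on the pattern above: the replay path temporarily swaps the agent's live model for a FakeModelReplayLog and always restores the original (and clears the replay flag) in the finally block, so a failed replay cannot leave the session stuck on canned outputs. Below is a minimal sketch of that swap-and-restore pattern; agent, session_state and run_task are stand-ins for the real objects in app.py, not part of the diff.

from contextlib import contextmanager
from model_replay import FakeModelReplayLog

@contextmanager
def replayed_model(agent, session_state):
    # Temporarily replace agent.model with a replay model, restoring it afterwards.
    replay_log = session_state.get("replay_log")
    if replay_log is None:
        yield agent
        return
    original_model = agent.model
    agent.model = FakeModelReplayLog(replay_log)   # serve pre-recorded outputs
    try:
        yield agent
    finally:
        agent.model = original_model               # always restore the live model
        session_state["replay_log"] = None         # replay is one-shot

# Hypothetical usage; run_task stands in for the streaming loop in interact_with_agent:
# with replayed_model(session_state["agent"], session_state) as agent:
#     run_task(agent, task_input)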
e2bqwen.py
CHANGED
@@ -5,6 +5,7 @@ from io import BytesIO
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
 import json
+import unicodedata
 
 # HF API params
 from huggingface_hub import InferenceClient
@@ -260,6 +261,9 @@ class E2BVisionAgent(CodeAgent):
             self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
             return f"Moved mouse to coordinates ({x}, {y})"
 
+        def normalize_text(text):
+            return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))
+
         @tool
         def type_text(text: str, delay_in_ms: int = 75) -> str:
             """
@@ -268,9 +272,10 @@ class E2BVisionAgent(CodeAgent):
                 text: The text to type
                 delay_in_ms: Delay between keystrokes in milliseconds
             """
-
-            self.
-
+            clean_text = normalize_text(text)
+            self.desktop.write(clean_text, delay_in_ms=delay_in_ms)
+            self.logger.log(f"Typed text: '{clean_text}'")
+            return f"Typed text: '{clean_text}'"
 
         @tool
         def press_key(key: str) -> str:
@@ -309,10 +314,12 @@ class E2BVisionAgent(CodeAgent):
             return message
 
         @tool
-        def scroll(direction: str = "down", amount: int = 1) -> str:
+        def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
             """
             Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
             Args:
+                x: The x coordinate (horizontal position) of the element to scroll/zoom
+                y: The y coordinate (vertical position) of the element to scroll/zoom
                 direction: The direction to scroll ("up" or "down"), defaults to "down"
                 amount: The amount to scroll. A good amount is 1 or 2.
             """
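The new normalize_text helper strips accents before typing, presumably because the sandbox keyboard emulation is more reliable with plain ASCII input. A quick standalone illustration of what NFD decomposition plus dropping combining marks does (the example strings are arbitrary):

import unicodedata

def normalize_text(text):
    # NFD splits each accented character into a base letter plus combining marks;
    # filtering out the combining marks leaves the unaccented text.
    return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))

print(normalize_text("Château de Fontainebleau"))  # Chateau de Fontainebleau
print(normalize_text("Zürich"))                    # Zurich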
model_replay.py
CHANGED
@@ -1,7 +1,11 @@
 from smolagents.models import Model, ChatMessage, Tool, MessageRole
-from time import
+from time import sleep
+from typing import List, Dict, Optional
+from huggingface_hub import hf_hub_download
+import json
 
-
+
+class FakeModelReplayLog(Model):
     """A model class that returns pre-recorded responses from a log file.
 
     This class is useful for testing and debugging purposes, as it doesn't make
@@ -19,7 +23,7 @@ class FakeModelClass(Model):
         **kwargs
     ):
         super().__init__(**kwargs)
-        self.dataset_name = "smolagents/computer-agent-logs"
+        self.dataset_name = "smolagents/computer-agent-logs"
         self.log_folder = log_folder
         self.call_counter = 0
         self.model_outputs = self._load_model_outputs()
@@ -40,9 +44,8 @@ class FakeModelClass(Model):
         # Extract only the model_output from each step in tool_calls
         model_outputs = []
 
-        for step in log_data
-
-            model_outputs.append(step["model_output_message"])
+        for step in log_data["summary"][1:]:
+            model_outputs.append(step["model_output_message"]["content"])
 
         print(f"Loaded {len(model_outputs)} model outputs from log file")
         return model_outputs
@@ -67,7 +70,7 @@ class FakeModelClass(Model):
         Returns:
             ChatMessage: The next pre-recorded response.
         """
-
+        sleep(1.0)
 
         # Get the next model output
         if self.call_counter < len(self.model_outputs):
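Taken together, model_replay.py downloads a recorded agent run from the smolagents/computer-agent-logs dataset, keeps only each step's model output, and hands those back one call at a time with a one-second pause to mimic generation latency. A condensed sketch of how the class plausibly fits together follows; anything not visible in the diff (the exact file name inside the log folder, the __call__ signature) is an assumption, not the Space's actual code.

import json
from time import sleep
from huggingface_hub import hf_hub_download
from smolagents.models import Model, ChatMessage, MessageRole

class FakeModelReplayLog(Model):
    """Returns pre-recorded assistant outputs instead of calling a real model."""

    def __init__(self, log_folder: str, **kwargs):
        super().__init__(**kwargs)
        self.dataset_name = "smolagents/computer-agent-logs"
        self.log_folder = log_folder
        self.call_counter = 0
        self.model_outputs = self._load_model_outputs()

    def _load_model_outputs(self):
        # Fetch the recorded run from the dataset repo (the file name is assumed).
        path = hf_hub_download(
            repo_id=self.dataset_name,
            filename=f"{self.log_folder}/metadata.json",
            repo_type="dataset",
        )
        with open(path) as f:
            log_data = json.load(f)
        # Skip the first summary entry, as in the diff, and keep each step's output text.
        return [step["model_output_message"]["content"] for step in log_data["summary"][1:]]

    def __call__(self, messages, **kwargs) -> ChatMessage:
        sleep(1.0)  # pace the replay so the UI remains readable
        content = self.model_outputs[min(self.call_counter, len(self.model_outputs) - 1)]
        self.call_counter += 1
        return ChatMessage(role=MessageRole.ASSISTANT, content=content)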