Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Working export log to dataset
Browse files- app.py +60 -54
- e2bqwen.py +14 -30
app.py
CHANGED
@@ -425,44 +425,51 @@ def generate_interaction_id(request):
|
|
425 |
"""Generate a unique ID combining session hash and timestamp"""
|
426 |
return f"{request.session_hash}_{int(time.time())}"
|
427 |
|
428 |
-
def save_final_status(folder, status, details = None):
|
429 |
-
a = open(os.path.join(folder,"status.json"),"w")
|
430 |
-
a.write(json.dumps({"status":status,"details":details}))
|
431 |
-
a.close()
|
432 |
|
433 |
-
def
|
434 |
-
"""
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
|
444 |
def initialize_session(interactive_mode, request: gr.Request):
|
445 |
session_hash = request.session_hash
|
446 |
-
# Create session-specific log file
|
447 |
-
log_path = get_log_file_path(session_hash)
|
448 |
-
# Initialize log file if it doesn't exist
|
449 |
-
if not os.path.exists(log_path):
|
450 |
-
with open(log_path, 'w') as f:
|
451 |
-
f.write(f"Ready to go...\n")
|
452 |
# Return HTML and session hash
|
453 |
return update_html(interactive_mode, request), session_hash
|
454 |
|
455 |
|
456 |
-
|
457 |
-
def update_terminal_from_session(session_hash):
|
458 |
-
if not session_hash:
|
459 |
-
return "Waiting for session..."
|
460 |
-
|
461 |
-
log_path = get_log_file_path(session_hash)
|
462 |
-
return read_log_content(log_path)
|
463 |
-
|
464 |
-
|
465 |
-
def create_agent(data_dir, desktop, log_file):
|
466 |
model = QwenVLAPIModel(
|
467 |
model_id="Qwen/Qwen2.5-VL-72B-Instruct",
|
468 |
hf_token = hf_token,
|
@@ -474,7 +481,6 @@ def create_agent(data_dir, desktop, log_file):
|
|
474 |
max_steps=200,
|
475 |
verbosity_level=2,
|
476 |
planning_interval=10,
|
477 |
-
log_file = log_file
|
478 |
)
|
479 |
|
480 |
class EnrichedGradioUI(GradioUI):
|
@@ -497,10 +503,9 @@ class EnrichedGradioUI(GradioUI):
|
|
497 |
if not os.path.exists(data_dir):
|
498 |
os.makedirs(data_dir)
|
499 |
|
500 |
-
log_file = get_log_file_path(session_hash)
|
501 |
|
502 |
if "agent" not in session_state:
|
503 |
-
session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop
|
504 |
|
505 |
# Construct the full task with instructions
|
506 |
full_task = task_input + dedent(f"""
|
@@ -517,31 +522,32 @@ class EnrichedGradioUI(GradioUI):
|
|
517 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
518 |
""")
|
519 |
|
520 |
-
|
521 |
-
|
522 |
-
yield stored_messages
|
523 |
-
|
524 |
-
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
525 |
-
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
526 |
-
stored_messages.append(gr.ChatMessage(
|
527 |
-
role="assistant",
|
528 |
-
content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
|
529 |
-
))
|
530 |
-
stored_messages.append(msg)
|
531 |
yield stored_messages
|
532 |
|
533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
#
|
539 |
-
|
540 |
-
|
541 |
-
|
|
|
|
|
|
|
542 |
|
543 |
-
|
544 |
-
|
545 |
|
546 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
547 |
|
|
|
425 |
"""Generate a unique ID combining session hash and timestamp"""
|
426 |
return f"{request.session_hash}_{int(time.time())}"
|
427 |
|
|
|
|
|
|
|
|
|
428 |
|
429 |
+
def chat_message_to_json(obj):
    """Turn a ChatMessage-like object into plain data for JSON encoding.

    Objects that carry a ``__dict__`` are returned as a shallow copy of that
    dict with the ``raw`` entry dropped, a nested object under ``content``
    converted recursively, and every element of ``tool_calls`` converted
    recursively. Lists and tuples are converted element by element into a
    list. Any other value is handed back unchanged.

    The original object is never modified; only the copied dict is edited.
    """
    if not hasattr(obj, '__dict__'):
        if isinstance(obj, (list, tuple)):
            return [chat_message_to_json(element) for element in obj]
        return obj

    # Work on a copy so the caller's object keeps all of its attributes.
    fields = obj.__dict__.copy()

    # 'raw' may contain non-serializable data, so it is left out entirely.
    fields.pop('raw', None)

    nested_content = fields.get('content')
    if nested_content is not None and hasattr(nested_content, '__dict__'):
        fields['content'] = chat_message_to_json(nested_content)

    calls = fields.get('tool_calls')
    if calls is not None:
        fields['tool_calls'] = [chat_message_to_json(call) for call in calls]

    return fields
|
452 |
+
|
453 |
+
|
454 |
+
def save_final_status(folder, status: str, memory=None, error_message=None, summary=None) -> None:
    """Write the final run status to ``metadata.json`` inside ``folder``.

    The file receives one JSON object with the keys ``status``, ``summary``
    and ``error_message``. Values that ``json`` cannot encode natively are
    passed through ``chat_message_to_json``.

    Args:
        folder: Directory in which ``metadata.json`` is written.
        status: Final status label, stored under ``status``.
        memory: Object exposing ``steps`` and ``get_succinct_steps()``. When
            given, the stored summary is ``memory.get_succinct_steps()``.
            WARNING: ``observations_images`` is set to ``None`` on every step
            of this object, so the caller's memory loses its images.
        error_message: Optional error text, stored under ``error_message``.
        summary: Summary to store when ``memory`` is ``None`` (for example a
            failed run that has no usable memory). Ignored when ``memory`` is
            provided.
    """
    if memory is not None:
        # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
        for memory_step in memory.steps:
            if getattr(memory_step, "observations_images", None):
                memory_step.observations_images = None
        summary = memory.get_succinct_steps()

    # Serialize before touching the file: if summarizing or encoding fails,
    # no empty or truncated metadata.json is left behind.
    payload = json.dumps(
        {"status": status, "summary": summary, "error_message": error_message},
        default=chat_message_to_json,
    )

    metadata_path = os.path.join(folder, "metadata.json")
    # The context manager closes the handle even if the write itself raises.
    with open(metadata_path, "w") as metadata_file:
        metadata_file.write(payload)
|
465 |
|
466 |
def initialize_session(interactive_mode, request: gr.Request):
    """Return the rendered HTML together with the session hash.

    The session hash is read from ``request`` first; the HTML comes from
    ``update_html(interactive_mode, request)``.

    Returns:
        A ``(html, session_hash)`` tuple.
    """
    current_hash = request.session_hash
    rendered_html = update_html(interactive_mode, request)
    return rendered_html, current_hash
|
470 |
|
471 |
|
472 |
+
def create_agent(data_dir, desktop):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
model = QwenVLAPIModel(
|
474 |
model_id="Qwen/Qwen2.5-VL-72B-Instruct",
|
475 |
hf_token = hf_token,
|
|
|
481 |
max_steps=200,
|
482 |
verbosity_level=2,
|
483 |
planning_interval=10,
|
|
|
484 |
)
|
485 |
|
486 |
class EnrichedGradioUI(GradioUI):
|
|
|
503 |
if not os.path.exists(data_dir):
|
504 |
os.makedirs(data_dir)
|
505 |
|
|
|
506 |
|
507 |
if "agent" not in session_state:
|
508 |
+
session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
|
509 |
|
510 |
# Construct the full task with instructions
|
511 |
full_task = task_input + dedent(f"""
|
|
|
522 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
523 |
""")
|
524 |
|
525 |
+
try:
|
526 |
+
stored_messages.append(gr.ChatMessage(role="user", content=task_input))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
527 |
yield stored_messages
|
528 |
|
529 |
+
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
530 |
+
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
531 |
+
stored_messages.append(gr.ChatMessage(
|
532 |
+
role="assistant",
|
533 |
+
content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
|
534 |
+
))
|
535 |
+
stored_messages.append(msg)
|
536 |
+
yield stored_messages
|
537 |
|
538 |
+
yield stored_messages
|
539 |
+
save_final_status(data_dir, "completed", memory = session_state["agent"].memory)
|
540 |
+
|
541 |
+
# # TODO: uncomment below after testing
|
542 |
+
except Exception as e:
|
543 |
+
error_message=f"Error in interaction: {str(e)}"
|
544 |
+
stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
|
545 |
+
yield stored_messages
|
546 |
+
raise e
|
547 |
+
save_final_status(data_dir, "failed", summary={}, error_message=error_message)
|
548 |
|
549 |
+
finally:
|
550 |
+
upload_to_hf_and_remove(data_dir)
|
551 |
|
552 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
553 |
|
e2bqwen.py
CHANGED
@@ -101,12 +101,10 @@ class E2BVisionAgent(CodeAgent):
|
|
101 |
max_steps: int = 200,
|
102 |
verbosity_level: LogLevel = 2,
|
103 |
planning_interval: int = 10,
|
104 |
-
log_file = None,
|
105 |
**kwargs
|
106 |
):
|
107 |
self.desktop = desktop
|
108 |
self.data_dir = data_dir
|
109 |
-
self.log_path = log_file
|
110 |
self.planning_interval = planning_interval
|
111 |
# Initialize Desktop
|
112 |
self.width, self.height = self.desktop.get_screen_size()
|
@@ -137,7 +135,6 @@ class E2BVisionAgent(CodeAgent):
|
|
137 |
self.logger.log("Setting up agent tools...")
|
138 |
self._setup_desktop_tools()
|
139 |
self.step_callbacks.append(self.take_screenshot_callback)
|
140 |
-
self.final_answer_checks = [self.store_metadata_to_file]
|
141 |
|
142 |
def _setup_desktop_tools(self):
|
143 |
"""Register all desktop tools"""
|
@@ -151,7 +148,7 @@ class E2BVisionAgent(CodeAgent):
|
|
151 |
"""
|
152 |
self.desktop.move_mouse(x, y)
|
153 |
self.desktop.left_click()
|
154 |
-
self.logger.log(
|
155 |
return f"Clicked at coordinates ({x}, {y})"
|
156 |
|
157 |
@tool
|
@@ -164,7 +161,7 @@ class E2BVisionAgent(CodeAgent):
|
|
164 |
"""
|
165 |
self.desktop.move_mouse(x, y)
|
166 |
self.desktop.right_click()
|
167 |
-
self.logger.log(
|
168 |
return f"Right-clicked at coordinates ({x}, {y})"
|
169 |
|
170 |
@tool
|
@@ -177,7 +174,7 @@ class E2BVisionAgent(CodeAgent):
|
|
177 |
"""
|
178 |
self.desktop.move_mouse(x, y)
|
179 |
self.desktop.double_click()
|
180 |
-
self.logger.log(
|
181 |
return f"Double-clicked at coordinates ({x}, {y})"
|
182 |
|
183 |
@tool
|
@@ -189,7 +186,7 @@ class E2BVisionAgent(CodeAgent):
|
|
189 |
y: The y coordinate (vertical position)
|
190 |
"""
|
191 |
self.desktop.move_mouse(x, y)
|
192 |
-
self.logger.log(
|
193 |
return f"Moved mouse to coordinates ({x}, {y})"
|
194 |
|
195 |
@tool
|
@@ -201,7 +198,7 @@ class E2BVisionAgent(CodeAgent):
|
|
201 |
delay_in_ms: Delay between keystrokes in milliseconds
|
202 |
"""
|
203 |
self.desktop.write(text, delay_in_ms=delay_in_ms)
|
204 |
-
self.logger.log(
|
205 |
return f"Typed text: '{text}'"
|
206 |
|
207 |
@tool
|
@@ -214,7 +211,7 @@ class E2BVisionAgent(CodeAgent):
|
|
214 |
if key == "enter":
|
215 |
key = "Return"
|
216 |
self.desktop.press(key)
|
217 |
-
self.logger.log(
|
218 |
return f"Pressed key: {key}"
|
219 |
|
220 |
@tool
|
@@ -224,7 +221,7 @@ class E2BVisionAgent(CodeAgent):
|
|
224 |
Args:
|
225 |
"""
|
226 |
self.desktop.press(["alt", "left"])
|
227 |
-
self.logger.log(
|
228 |
return "Went back one page"
|
229 |
|
230 |
@tool
|
@@ -239,7 +236,7 @@ class E2BVisionAgent(CodeAgent):
|
|
239 |
"""
|
240 |
self.desktop.drag([x1, y1], [x2, y2])
|
241 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
242 |
-
self.logger.log(
|
243 |
return message
|
244 |
|
245 |
@tool
|
@@ -251,7 +248,7 @@ class E2BVisionAgent(CodeAgent):
|
|
251 |
amount: The amount to scroll. A good amount is 1 or 2.
|
252 |
"""
|
253 |
self.desktop.scroll(direction=direction, amount=amount)
|
254 |
-
self.logger.log(
|
255 |
return f"Scrolled {direction} by {amount}"
|
256 |
|
257 |
@tool
|
@@ -262,7 +259,7 @@ class E2BVisionAgent(CodeAgent):
|
|
262 |
seconds: Number of seconds to wait, generally 3 is enough.
|
263 |
"""
|
264 |
time.sleep(seconds)
|
265 |
-
self.logger.log(
|
266 |
return f"Waited for {seconds} seconds"
|
267 |
|
268 |
@tool
|
@@ -279,7 +276,7 @@ class E2BVisionAgent(CodeAgent):
|
|
279 |
self.desktop.open(url)
|
280 |
# Give it time to load
|
281 |
time.sleep(2)
|
282 |
-
self.logger.log(
|
283 |
return f"Opened URL: {url}"
|
284 |
|
285 |
|
@@ -297,22 +294,9 @@ class E2BVisionAgent(CodeAgent):
|
|
297 |
self.tools["drag_and_drop"] = drag_and_drop
|
298 |
|
299 |
|
300 |
-
def store_metadata_to_file(self, final_answer, memory) -> None:
|
301 |
-
metadata_path = os.path.join(self.data_dir, "metadata.json")
|
302 |
-
output = {}
|
303 |
-
# THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
|
304 |
-
for memory_step in self.memory.steps:
|
305 |
-
if getattr(memory_step, "observations_images", None):
|
306 |
-
memory_step.observations_images = None
|
307 |
-
a = open(metadata_path,"w")
|
308 |
-
a.write(json.dumps(self.write_memory_to_messages()))
|
309 |
-
a.close()
|
310 |
-
return True
|
311 |
-
|
312 |
-
|
313 |
def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
|
314 |
"""Callback that takes a screenshot + memory snapshot after a step completes"""
|
315 |
-
self.logger.log(
|
316 |
|
317 |
current_step = memory_step.step_number
|
318 |
|
@@ -362,12 +346,12 @@ class QwenVLAPIModel(Model):
|
|
362 |
self.model_id = model_id
|
363 |
self.base_model = HfApiModel(
|
364 |
model_id,
|
365 |
-
provider="
|
366 |
token=hf_token,
|
367 |
)
|
368 |
self.fallback_model = HfApiModel(
|
369 |
model_id,
|
370 |
-
provider="
|
371 |
token=hf_token,
|
372 |
)
|
373 |
|
|
|
101 |
max_steps: int = 200,
|
102 |
verbosity_level: LogLevel = 2,
|
103 |
planning_interval: int = 10,
|
|
|
104 |
**kwargs
|
105 |
):
|
106 |
self.desktop = desktop
|
107 |
self.data_dir = data_dir
|
|
|
108 |
self.planning_interval = planning_interval
|
109 |
# Initialize Desktop
|
110 |
self.width, self.height = self.desktop.get_screen_size()
|
|
|
135 |
self.logger.log("Setting up agent tools...")
|
136 |
self._setup_desktop_tools()
|
137 |
self.step_callbacks.append(self.take_screenshot_callback)
|
|
|
138 |
|
139 |
def _setup_desktop_tools(self):
|
140 |
"""Register all desktop tools"""
|
|
|
148 |
"""
|
149 |
self.desktop.move_mouse(x, y)
|
150 |
self.desktop.left_click()
|
151 |
+
self.logger.log(f"Clicked at coordinates ({x}, {y})")
|
152 |
return f"Clicked at coordinates ({x}, {y})"
|
153 |
|
154 |
@tool
|
|
|
161 |
"""
|
162 |
self.desktop.move_mouse(x, y)
|
163 |
self.desktop.right_click()
|
164 |
+
self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
|
165 |
return f"Right-clicked at coordinates ({x}, {y})"
|
166 |
|
167 |
@tool
|
|
|
174 |
"""
|
175 |
self.desktop.move_mouse(x, y)
|
176 |
self.desktop.double_click()
|
177 |
+
self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
|
178 |
return f"Double-clicked at coordinates ({x}, {y})"
|
179 |
|
180 |
@tool
|
|
|
186 |
y: The y coordinate (vertical position)
|
187 |
"""
|
188 |
self.desktop.move_mouse(x, y)
|
189 |
+
self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
|
190 |
return f"Moved mouse to coordinates ({x}, {y})"
|
191 |
|
192 |
@tool
|
|
|
198 |
delay_in_ms: Delay between keystrokes in milliseconds
|
199 |
"""
|
200 |
self.desktop.write(text, delay_in_ms=delay_in_ms)
|
201 |
+
self.logger.log(f"Typed text: '{text}'")
|
202 |
return f"Typed text: '{text}'"
|
203 |
|
204 |
@tool
|
|
|
211 |
if key == "enter":
|
212 |
key = "Return"
|
213 |
self.desktop.press(key)
|
214 |
+
self.logger.log(f"Pressed key: {key}")
|
215 |
return f"Pressed key: {key}"
|
216 |
|
217 |
@tool
|
|
|
221 |
Args:
|
222 |
"""
|
223 |
self.desktop.press(["alt", "left"])
|
224 |
+
self.logger.log("Went back one page")
|
225 |
return "Went back one page"
|
226 |
|
227 |
@tool
|
|
|
236 |
"""
|
237 |
self.desktop.drag([x1, y1], [x2, y2])
|
238 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
239 |
+
self.logger.log(message)
|
240 |
return message
|
241 |
|
242 |
@tool
|
|
|
248 |
amount: The amount to scroll. A good amount is 1 or 2.
|
249 |
"""
|
250 |
self.desktop.scroll(direction=direction, amount=amount)
|
251 |
+
self.logger.log(f"Scrolled {direction} by {amount}")
|
252 |
return f"Scrolled {direction} by {amount}"
|
253 |
|
254 |
@tool
|
|
|
259 |
seconds: Number of seconds to wait, generally 3 is enough.
|
260 |
"""
|
261 |
time.sleep(seconds)
|
262 |
+
self.logger.log(f"Waited for {seconds} seconds")
|
263 |
return f"Waited for {seconds} seconds"
|
264 |
|
265 |
@tool
|
|
|
276 |
self.desktop.open(url)
|
277 |
# Give it time to load
|
278 |
time.sleep(2)
|
279 |
+
self.logger.log(f"Opening URL: {url}")
|
280 |
return f"Opened URL: {url}"
|
281 |
|
282 |
|
|
|
294 |
self.tools["drag_and_drop"] = drag_and_drop
|
295 |
|
296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
|
298 |
"""Callback that takes a screenshot + memory snapshot after a step completes"""
|
299 |
+
self.logger.log("Analyzing screen content...")
|
300 |
|
301 |
current_step = memory_step.step_number
|
302 |
|
|
|
346 |
self.model_id = model_id
|
347 |
self.base_model = HfApiModel(
|
348 |
model_id,
|
349 |
+
provider="hyperbolic",
|
350 |
token=hf_token,
|
351 |
)
|
352 |
self.fallback_model = HfApiModel(
|
353 |
model_id,
|
354 |
+
provider="nebius",
|
355 |
token=hf_token,
|
356 |
)
|
357 |
|