Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Working export log to dataset
Browse files
- app.py +60 -54
- e2bqwen.py +14 -30
app.py
CHANGED
|
@@ -425,44 +425,51 @@ def generate_interaction_id(request):
|
|
| 425 |
"""Generate a unique ID combining session hash and timestamp"""
|
| 426 |
return f"{request.session_hash}_{int(time.time())}"
|
| 427 |
|
| 428 |
-
def save_final_status(folder, status, details = None):
|
| 429 |
-
a = open(os.path.join(folder,"status.json"),"w")
|
| 430 |
-
a.write(json.dumps({"status":status,"details":details}))
|
| 431 |
-
a.close()
|
| 432 |
|
| 433 |
-
def
|
| 434 |
-
"""
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
|
| 444 |
def initialize_session(interactive_mode, request: gr.Request):
|
| 445 |
session_hash = request.session_hash
|
| 446 |
-
# Create session-specific log file
|
| 447 |
-
log_path = get_log_file_path(session_hash)
|
| 448 |
-
# Initialize log file if it doesn't exist
|
| 449 |
-
if not os.path.exists(log_path):
|
| 450 |
-
with open(log_path, 'w') as f:
|
| 451 |
-
f.write(f"Ready to go...\n")
|
| 452 |
# Return HTML and session hash
|
| 453 |
return update_html(interactive_mode, request), session_hash
|
| 454 |
|
| 455 |
|
| 456 |
-
|
| 457 |
-
def update_terminal_from_session(session_hash):
|
| 458 |
-
if not session_hash:
|
| 459 |
-
return "Waiting for session..."
|
| 460 |
-
|
| 461 |
-
log_path = get_log_file_path(session_hash)
|
| 462 |
-
return read_log_content(log_path)
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
def create_agent(data_dir, desktop, log_file):
|
| 466 |
model = QwenVLAPIModel(
|
| 467 |
model_id="Qwen/Qwen2.5-VL-72B-Instruct",
|
| 468 |
hf_token = hf_token,
|
|
@@ -474,7 +481,6 @@ def create_agent(data_dir, desktop, log_file):
|
|
| 474 |
max_steps=200,
|
| 475 |
verbosity_level=2,
|
| 476 |
planning_interval=10,
|
| 477 |
-
log_file = log_file
|
| 478 |
)
|
| 479 |
|
| 480 |
class EnrichedGradioUI(GradioUI):
|
|
@@ -497,10 +503,9 @@ class EnrichedGradioUI(GradioUI):
|
|
| 497 |
if not os.path.exists(data_dir):
|
| 498 |
os.makedirs(data_dir)
|
| 499 |
|
| 500 |
-
log_file = get_log_file_path(session_hash)
|
| 501 |
|
| 502 |
if "agent" not in session_state:
|
| 503 |
-
session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop
|
| 504 |
|
| 505 |
# Construct the full task with instructions
|
| 506 |
full_task = task_input + dedent(f"""
|
|
@@ -517,31 +522,32 @@ class EnrichedGradioUI(GradioUI):
|
|
| 517 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
| 518 |
""")
|
| 519 |
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
yield stored_messages
|
| 523 |
-
|
| 524 |
-
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
| 525 |
-
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
| 526 |
-
stored_messages.append(gr.ChatMessage(
|
| 527 |
-
role="assistant",
|
| 528 |
-
content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
|
| 529 |
-
))
|
| 530 |
-
stored_messages.append(msg)
|
| 531 |
yield stored_messages
|
| 532 |
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
#
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
| 542 |
|
| 543 |
-
|
| 544 |
-
|
| 545 |
|
| 546 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
| 547 |
|
|
|
|
| 425 |
"""Generate a unique ID combining session hash and timestamp"""
|
| 426 |
return f"{request.session_hash}_{int(time.time())}"
|
| 427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
+
def chat_message_to_json(obj):
    """Convert a ChatMessage-like object into JSON-serializable data.

    Usable both directly and as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: An object exposing ``__dict__``, a list/tuple, a dict, or a
            JSON-native scalar (``str``, ``int``, ``float``, ``bool``,
            ``None``).

    Returns:
        For objects with ``__dict__``: a shallow copy of that dict with the
        ``raw`` key removed, ``content`` converted when it is itself an
        object with ``__dict__``, and every ``tool_calls`` item converted.
        For lists/tuples: a list of converted items. For dicts: a new dict
        with converted values. JSON-native scalars are returned unchanged.

    Raises:
        TypeError: If ``obj`` is none of the above. Returning such an
            object unchanged from a ``json.dumps`` default hook would make
            the encoder report a misleading "Circular reference detected".
    """
    if hasattr(obj, "__dict__"):
        # Shallow copy so the original object's attributes are never mutated.
        result = obj.__dict__.copy()

        # 'raw' may contain non-serializable data, so it is dropped.
        result.pop("raw", None)

        content = result.get("content")
        if content is not None and hasattr(content, "__dict__"):
            result["content"] = chat_message_to_json(content)

        tool_calls = result.get("tool_calls")
        if tool_calls is not None:
            result["tool_calls"] = [chat_message_to_json(tc) for tc in tool_calls]

        return result

    if isinstance(obj, (list, tuple)):
        return [chat_message_to_json(item) for item in obj]

    if isinstance(obj, dict):
        return {key: chat_message_to_json(value) for key, value in obj.items()}

    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    raise TypeError(
        f"Object of type {type(obj).__name__} is not JSON serializable"
    )
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def save_final_status(folder, status: str, memory=None, error_message=None, summary=None) -> None:
    """Write the final run status to ``metadata.json`` inside ``folder``.

    WARNING: when ``memory`` is given, this is destructive. Every step in
    ``memory.steps`` that has a truthy ``observations_images`` attribute gets
    it set to ``None`` before the summary is taken.

    Args:
        folder: Directory in which ``metadata.json`` is written.
        status: Status label stored under the ``"status"`` key.
        memory: Optional agent memory exposing ``steps`` and
            ``get_succinct_steps()``. When provided, the summary is taken
            from ``memory.get_succinct_steps()``.
        error_message: Optional text stored under ``"error_message"``.
        summary: Summary to store when no ``memory`` is available (for
            example on a failure path). Ignored if ``memory`` is given.
    """
    metadata_path = os.path.join(folder, "metadata.json")

    if memory is not None:
        # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
        for memory_step in memory.steps:
            if getattr(memory_step, "observations_images", None):
                memory_step.observations_images = None
        summary = memory.get_succinct_steps()

    # Serialize before opening the file: if serialization fails, an existing
    # metadata.json is not truncated and no file handle is left open.
    payload = json.dumps(
        {"status": status, "summary": summary, "error_message": error_message},
        default=chat_message_to_json,
    )

    with open(metadata_path, "w", encoding="utf-8") as metadata_file:
        metadata_file.write(payload)
|
| 465 |
|
| 466 |
def initialize_session(interactive_mode, request: gr.Request):
    """Return the HTML for this session together with its session hash.

    Args:
        interactive_mode: Forwarded unchanged to ``update_html``.
        request: Gradio request; its ``session_hash`` identifies the session.

    Returns:
        A ``(html, session_hash)`` tuple, where ``html`` is whatever
        ``update_html(interactive_mode, request)`` returns.
    """
    # Read the hash before rendering, matching the original evaluation order.
    current_hash = request.session_hash
    rendered_html = update_html(interactive_mode, request)
    return rendered_html, current_hash
|
| 470 |
|
| 471 |
|
| 472 |
+
def create_agent(data_dir, desktop):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
model = QwenVLAPIModel(
|
| 474 |
model_id="Qwen/Qwen2.5-VL-72B-Instruct",
|
| 475 |
hf_token = hf_token,
|
|
|
|
| 481 |
max_steps=200,
|
| 482 |
verbosity_level=2,
|
| 483 |
planning_interval=10,
|
|
|
|
| 484 |
)
|
| 485 |
|
| 486 |
class EnrichedGradioUI(GradioUI):
|
|
|
|
| 503 |
if not os.path.exists(data_dir):
|
| 504 |
os.makedirs(data_dir)
|
| 505 |
|
|
|
|
| 506 |
|
| 507 |
if "agent" not in session_state:
|
| 508 |
+
session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
|
| 509 |
|
| 510 |
# Construct the full task with instructions
|
| 511 |
full_task = task_input + dedent(f"""
|
|
|
|
| 522 |
We can only execute one action at a time. On each step, answer only a python blob with the action to perform
|
| 523 |
""")
|
| 524 |
|
| 525 |
+
try:
|
| 526 |
+
stored_messages.append(gr.ChatMessage(role="user", content=task_input))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
yield stored_messages
|
| 528 |
|
| 529 |
+
for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
|
| 530 |
+
if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
|
| 531 |
+
stored_messages.append(gr.ChatMessage(
|
| 532 |
+
role="assistant",
|
| 533 |
+
content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
|
| 534 |
+
))
|
| 535 |
+
stored_messages.append(msg)
|
| 536 |
+
yield stored_messages
|
| 537 |
|
| 538 |
+
yield stored_messages
|
| 539 |
+
save_final_status(data_dir, "completed", memory = session_state["agent"].memory)
|
| 540 |
+
|
| 541 |
+
# # TODO: uncomment below after testing
|
| 542 |
+
except Exception as e:
|
| 543 |
+
error_message=f"Error in interaction: {str(e)}"
|
| 544 |
+
stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
|
| 545 |
+
yield stored_messages
|
| 546 |
+
raise e
|
| 547 |
+
save_final_status(data_dir, "failed", summary={}, error_message=error_message)
|
| 548 |
|
| 549 |
+
finally:
|
| 550 |
+
upload_to_hf_and_remove(data_dir)
|
| 551 |
|
| 552 |
theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
|
| 553 |
|
e2bqwen.py
CHANGED
|
@@ -101,12 +101,10 @@ class E2BVisionAgent(CodeAgent):
|
|
| 101 |
max_steps: int = 200,
|
| 102 |
verbosity_level: LogLevel = 2,
|
| 103 |
planning_interval: int = 10,
|
| 104 |
-
log_file = None,
|
| 105 |
**kwargs
|
| 106 |
):
|
| 107 |
self.desktop = desktop
|
| 108 |
self.data_dir = data_dir
|
| 109 |
-
self.log_path = log_file
|
| 110 |
self.planning_interval = planning_interval
|
| 111 |
# Initialize Desktop
|
| 112 |
self.width, self.height = self.desktop.get_screen_size()
|
|
@@ -137,7 +135,6 @@ class E2BVisionAgent(CodeAgent):
|
|
| 137 |
self.logger.log("Setting up agent tools...")
|
| 138 |
self._setup_desktop_tools()
|
| 139 |
self.step_callbacks.append(self.take_screenshot_callback)
|
| 140 |
-
self.final_answer_checks = [self.store_metadata_to_file]
|
| 141 |
|
| 142 |
def _setup_desktop_tools(self):
|
| 143 |
"""Register all desktop tools"""
|
|
@@ -151,7 +148,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 151 |
"""
|
| 152 |
self.desktop.move_mouse(x, y)
|
| 153 |
self.desktop.left_click()
|
| 154 |
-
self.logger.log(
|
| 155 |
return f"Clicked at coordinates ({x}, {y})"
|
| 156 |
|
| 157 |
@tool
|
|
@@ -164,7 +161,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 164 |
"""
|
| 165 |
self.desktop.move_mouse(x, y)
|
| 166 |
self.desktop.right_click()
|
| 167 |
-
self.logger.log(
|
| 168 |
return f"Right-clicked at coordinates ({x}, {y})"
|
| 169 |
|
| 170 |
@tool
|
|
@@ -177,7 +174,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 177 |
"""
|
| 178 |
self.desktop.move_mouse(x, y)
|
| 179 |
self.desktop.double_click()
|
| 180 |
-
self.logger.log(
|
| 181 |
return f"Double-clicked at coordinates ({x}, {y})"
|
| 182 |
|
| 183 |
@tool
|
|
@@ -189,7 +186,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 189 |
y: The y coordinate (vertical position)
|
| 190 |
"""
|
| 191 |
self.desktop.move_mouse(x, y)
|
| 192 |
-
self.logger.log(
|
| 193 |
return f"Moved mouse to coordinates ({x}, {y})"
|
| 194 |
|
| 195 |
@tool
|
|
@@ -201,7 +198,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 201 |
delay_in_ms: Delay between keystrokes in milliseconds
|
| 202 |
"""
|
| 203 |
self.desktop.write(text, delay_in_ms=delay_in_ms)
|
| 204 |
-
self.logger.log(
|
| 205 |
return f"Typed text: '{text}'"
|
| 206 |
|
| 207 |
@tool
|
|
@@ -214,7 +211,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 214 |
if key == "enter":
|
| 215 |
key = "Return"
|
| 216 |
self.desktop.press(key)
|
| 217 |
-
self.logger.log(
|
| 218 |
return f"Pressed key: {key}"
|
| 219 |
|
| 220 |
@tool
|
|
@@ -224,7 +221,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 224 |
Args:
|
| 225 |
"""
|
| 226 |
self.desktop.press(["alt", "left"])
|
| 227 |
-
self.logger.log(
|
| 228 |
return "Went back one page"
|
| 229 |
|
| 230 |
@tool
|
|
@@ -239,7 +236,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 239 |
"""
|
| 240 |
self.desktop.drag([x1, y1], [x2, y2])
|
| 241 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
| 242 |
-
self.logger.log(
|
| 243 |
return message
|
| 244 |
|
| 245 |
@tool
|
|
@@ -251,7 +248,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 251 |
amount: The amount to scroll. A good amount is 1 or 2.
|
| 252 |
"""
|
| 253 |
self.desktop.scroll(direction=direction, amount=amount)
|
| 254 |
-
self.logger.log(
|
| 255 |
return f"Scrolled {direction} by {amount}"
|
| 256 |
|
| 257 |
@tool
|
|
@@ -262,7 +259,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 262 |
seconds: Number of seconds to wait, generally 3 is enough.
|
| 263 |
"""
|
| 264 |
time.sleep(seconds)
|
| 265 |
-
self.logger.log(
|
| 266 |
return f"Waited for {seconds} seconds"
|
| 267 |
|
| 268 |
@tool
|
|
@@ -279,7 +276,7 @@ class E2BVisionAgent(CodeAgent):
|
|
| 279 |
self.desktop.open(url)
|
| 280 |
# Give it time to load
|
| 281 |
time.sleep(2)
|
| 282 |
-
self.logger.log(
|
| 283 |
return f"Opened URL: {url}"
|
| 284 |
|
| 285 |
|
|
@@ -297,22 +294,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 297 |
self.tools["drag_and_drop"] = drag_and_drop
|
| 298 |
|
| 299 |
|
| 300 |
-
def store_metadata_to_file(self, final_answer, memory) -> None:
|
| 301 |
-
metadata_path = os.path.join(self.data_dir, "metadata.json")
|
| 302 |
-
output = {}
|
| 303 |
-
# THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
|
| 304 |
-
for memory_step in self.memory.steps:
|
| 305 |
-
if getattr(memory_step, "observations_images", None):
|
| 306 |
-
memory_step.observations_images = None
|
| 307 |
-
a = open(metadata_path,"w")
|
| 308 |
-
a.write(json.dumps(self.write_memory_to_messages()))
|
| 309 |
-
a.close()
|
| 310 |
-
return True
|
| 311 |
-
|
| 312 |
-
|
| 313 |
def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
|
| 314 |
"""Callback that takes a screenshot + memory snapshot after a step completes"""
|
| 315 |
-
self.logger.log(
|
| 316 |
|
| 317 |
current_step = memory_step.step_number
|
| 318 |
|
|
@@ -362,12 +346,12 @@ class QwenVLAPIModel(Model):
|
|
| 362 |
self.model_id = model_id
|
| 363 |
self.base_model = HfApiModel(
|
| 364 |
model_id,
|
| 365 |
-
provider="
|
| 366 |
token=hf_token,
|
| 367 |
)
|
| 368 |
self.fallback_model = HfApiModel(
|
| 369 |
model_id,
|
| 370 |
-
provider="
|
| 371 |
token=hf_token,
|
| 372 |
)
|
| 373 |
|
|
|
|
| 101 |
max_steps: int = 200,
|
| 102 |
verbosity_level: LogLevel = 2,
|
| 103 |
planning_interval: int = 10,
|
|
|
|
| 104 |
**kwargs
|
| 105 |
):
|
| 106 |
self.desktop = desktop
|
| 107 |
self.data_dir = data_dir
|
|
|
|
| 108 |
self.planning_interval = planning_interval
|
| 109 |
# Initialize Desktop
|
| 110 |
self.width, self.height = self.desktop.get_screen_size()
|
|
|
|
| 135 |
self.logger.log("Setting up agent tools...")
|
| 136 |
self._setup_desktop_tools()
|
| 137 |
self.step_callbacks.append(self.take_screenshot_callback)
|
|
|
|
| 138 |
|
| 139 |
def _setup_desktop_tools(self):
|
| 140 |
"""Register all desktop tools"""
|
|
|
|
| 148 |
"""
|
| 149 |
self.desktop.move_mouse(x, y)
|
| 150 |
self.desktop.left_click()
|
| 151 |
+
self.logger.log(f"Clicked at coordinates ({x}, {y})")
|
| 152 |
return f"Clicked at coordinates ({x}, {y})"
|
| 153 |
|
| 154 |
@tool
|
|
|
|
| 161 |
"""
|
| 162 |
self.desktop.move_mouse(x, y)
|
| 163 |
self.desktop.right_click()
|
| 164 |
+
self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
|
| 165 |
return f"Right-clicked at coordinates ({x}, {y})"
|
| 166 |
|
| 167 |
@tool
|
|
|
|
| 174 |
"""
|
| 175 |
self.desktop.move_mouse(x, y)
|
| 176 |
self.desktop.double_click()
|
| 177 |
+
self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
|
| 178 |
return f"Double-clicked at coordinates ({x}, {y})"
|
| 179 |
|
| 180 |
@tool
|
|
|
|
| 186 |
y: The y coordinate (vertical position)
|
| 187 |
"""
|
| 188 |
self.desktop.move_mouse(x, y)
|
| 189 |
+
self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
|
| 190 |
return f"Moved mouse to coordinates ({x}, {y})"
|
| 191 |
|
| 192 |
@tool
|
|
|
|
| 198 |
delay_in_ms: Delay between keystrokes in milliseconds
|
| 199 |
"""
|
| 200 |
self.desktop.write(text, delay_in_ms=delay_in_ms)
|
| 201 |
+
self.logger.log(f"Typed text: '{text}'")
|
| 202 |
return f"Typed text: '{text}'"
|
| 203 |
|
| 204 |
@tool
|
|
|
|
| 211 |
if key == "enter":
|
| 212 |
key = "Return"
|
| 213 |
self.desktop.press(key)
|
| 214 |
+
self.logger.log(f"Pressed key: {key}")
|
| 215 |
return f"Pressed key: {key}"
|
| 216 |
|
| 217 |
@tool
|
|
|
|
| 221 |
Args:
|
| 222 |
"""
|
| 223 |
self.desktop.press(["alt", "left"])
|
| 224 |
+
self.logger.log("Went back one page")
|
| 225 |
return "Went back one page"
|
| 226 |
|
| 227 |
@tool
|
|
|
|
| 236 |
"""
|
| 237 |
self.desktop.drag([x1, y1], [x2, y2])
|
| 238 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
| 239 |
+
self.logger.log(message)
|
| 240 |
return message
|
| 241 |
|
| 242 |
@tool
|
|
|
|
| 248 |
amount: The amount to scroll. A good amount is 1 or 2.
|
| 249 |
"""
|
| 250 |
self.desktop.scroll(direction=direction, amount=amount)
|
| 251 |
+
self.logger.log(f"Scrolled {direction} by {amount}")
|
| 252 |
return f"Scrolled {direction} by {amount}"
|
| 253 |
|
| 254 |
@tool
|
|
|
|
| 259 |
seconds: Number of seconds to wait, generally 3 is enough.
|
| 260 |
"""
|
| 261 |
time.sleep(seconds)
|
| 262 |
+
self.logger.log(f"Waited for {seconds} seconds")
|
| 263 |
return f"Waited for {seconds} seconds"
|
| 264 |
|
| 265 |
@tool
|
|
|
|
| 276 |
self.desktop.open(url)
|
| 277 |
# Give it time to load
|
| 278 |
time.sleep(2)
|
| 279 |
+
self.logger.log(f"Opening URL: {url}")
|
| 280 |
return f"Opened URL: {url}"
|
| 281 |
|
| 282 |
|
|
|
|
| 294 |
self.tools["drag_and_drop"] = drag_and_drop
|
| 295 |
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
|
| 298 |
"""Callback that takes a screenshot + memory snapshot after a step completes"""
|
| 299 |
+
self.logger.log("Analyzing screen content...")
|
| 300 |
|
| 301 |
current_step = memory_step.step_number
|
| 302 |
|
|
|
|
| 346 |
self.model_id = model_id
|
| 347 |
self.base_model = HfApiModel(
|
| 348 |
model_id,
|
| 349 |
+
provider="hyperbolic",
|
| 350 |
token=hf_token,
|
| 351 |
)
|
| 352 |
self.fallback_model = HfApiModel(
|
| 353 |
model_id,
|
| 354 |
+
provider="nebius",
|
| 355 |
token=hf_token,
|
| 356 |
)
|
| 357 |
|