m-ric HF Staff committed on
Commit
dc86104
·
1 Parent(s): cf80979

Working export log to dataset

Browse files
Files changed (2) hide show
  1. app.py +60 -54
  2. e2bqwen.py +14 -30
app.py CHANGED
@@ -425,44 +425,51 @@ def generate_interaction_id(request):
425
  """Generate a unique ID combining session hash and timestamp"""
426
  return f"{request.session_hash}_{int(time.time())}"
427
 
428
- def save_final_status(folder, status, details = None):
429
- a = open(os.path.join(folder,"status.json"),"w")
430
- a.write(json.dumps({"status":status,"details":details}))
431
- a.close()
432
 
433
- def get_log_file_path(session_hash):
434
- """
435
- Creates a log file path based on the session hash.
436
- Makes sure the directory exists.
437
- """
438
- log_dir = os.path.join(TMP_DIR, session_hash)
439
- if not os.path.exists(log_dir):
440
- os.makedirs(log_dir)
441
-
442
- return os.path.join(log_dir, 'console.log')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
  def initialize_session(interactive_mode, request: gr.Request):
445
  session_hash = request.session_hash
446
- # Create session-specific log file
447
- log_path = get_log_file_path(session_hash)
448
- # Initialize log file if it doesn't exist
449
- if not os.path.exists(log_path):
450
- with open(log_path, 'w') as f:
451
- f.write(f"Ready to go...\n")
452
  # Return HTML and session hash
453
  return update_html(interactive_mode, request), session_hash
454
 
455
 
456
- # Function to read log content that gets the path from session hash
457
- def update_terminal_from_session(session_hash):
458
- if not session_hash:
459
- return "Waiting for session..."
460
-
461
- log_path = get_log_file_path(session_hash)
462
- return read_log_content(log_path)
463
-
464
-
465
- def create_agent(data_dir, desktop, log_file):
466
  model = QwenVLAPIModel(
467
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
468
  hf_token = hf_token,
@@ -474,7 +481,6 @@ def create_agent(data_dir, desktop, log_file):
474
  max_steps=200,
475
  verbosity_level=2,
476
  planning_interval=10,
477
- log_file = log_file
478
  )
479
 
480
  class EnrichedGradioUI(GradioUI):
@@ -497,10 +503,9 @@ class EnrichedGradioUI(GradioUI):
497
  if not os.path.exists(data_dir):
498
  os.makedirs(data_dir)
499
 
500
- log_file = get_log_file_path(session_hash)
501
 
502
  if "agent" not in session_state:
503
- session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop, log_file=log_file)
504
 
505
  # Construct the full task with instructions
506
  full_task = task_input + dedent(f"""
@@ -517,31 +522,32 @@ class EnrichedGradioUI(GradioUI):
517
  We can only execute one action at a time. On each step, answer only a python blob with the action to perform
518
  """)
519
 
520
- # try:
521
- stored_messages.append(gr.ChatMessage(role="user", content=task_input))
522
- yield stored_messages
523
-
524
- for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
525
- if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
526
- stored_messages.append(gr.ChatMessage(
527
- role="assistant",
528
- content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
529
- ))
530
- stored_messages.append(msg)
531
  yield stored_messages
532
 
533
- yield stored_messages
 
 
 
 
 
 
 
534
 
535
- # TODO: uncomment below after testing
536
- # save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
537
- # except Exception as e:
538
- # error_message=f"Error in interaction: {str(e)}"
539
- # stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
540
- # yield stored_messages
541
- # save_final_status(data_dir, "failed", details = str(error_message))
 
 
 
542
 
543
- # finally:
544
- # upload_to_hf_and_remove(data_dir)
545
 
546
  theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
547
 
 
425
  """Generate a unique ID combining session hash and timestamp"""
426
  return f"{request.session_hash}_{int(time.time())}"
427
 
 
 
 
 
428
 
429
+ def chat_message_to_json(obj):
430
+ """Custom JSON serializer for ChatMessage and related objects"""
431
+ if hasattr(obj, '__dict__'):
432
+ # Create a copy of the object's __dict__ to avoid modifying the original
433
+ result = obj.__dict__.copy()
434
+
435
+ # Remove the 'raw' field which may contain non-serializable data
436
+ if 'raw' in result:
437
+ del result['raw']
438
+
439
+ # Process the content or tool_calls if they exist
440
+ if 'content' in result and result['content'] is not None:
441
+ if hasattr(result['content'], '__dict__'):
442
+ result['content'] = chat_message_to_json(result['content'])
443
+
444
+ if 'tool_calls' in result and result['tool_calls'] is not None:
445
+ result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
446
+
447
+ return result
448
+ elif isinstance(obj, (list, tuple)):
449
+ return [chat_message_to_json(item) for item in obj]
450
+ else:
451
+ return obj
452
+
453
+
454
+ def save_final_status(folder, status: str, memory, error_message = None) -> None:
455
+ metadata_path = os.path.join(folder, "metadata.json")
456
+ output = {}
457
+ # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
458
+ for memory_step in memory.steps:
459
+ if getattr(memory_step, "observations_images", None):
460
+ memory_step.observations_images = None
461
+ a = open(metadata_path,"w")
462
+ summary = memory.get_succinct_steps()
463
+ a.write(json.dumps({"status":status, "summary":summary, "error_message": error_message}, default=chat_message_to_json))
464
+ a.close()
465
 
466
  def initialize_session(interactive_mode, request: gr.Request):
467
  session_hash = request.session_hash
 
 
 
 
 
 
468
  # Return HTML and session hash
469
  return update_html(interactive_mode, request), session_hash
470
 
471
 
472
+ def create_agent(data_dir, desktop):
 
 
 
 
 
 
 
 
 
473
  model = QwenVLAPIModel(
474
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
475
  hf_token = hf_token,
 
481
  max_steps=200,
482
  verbosity_level=2,
483
  planning_interval=10,
 
484
  )
485
 
486
  class EnrichedGradioUI(GradioUI):
 
503
  if not os.path.exists(data_dir):
504
  os.makedirs(data_dir)
505
 
 
506
 
507
  if "agent" not in session_state:
508
+ session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
509
 
510
  # Construct the full task with instructions
511
  full_task = task_input + dedent(f"""
 
522
  We can only execute one action at a time. On each step, answer only a python blob with the action to perform
523
  """)
524
 
525
+ try:
526
+ stored_messages.append(gr.ChatMessage(role="user", content=task_input))
 
 
 
 
 
 
 
 
 
527
  yield stored_messages
528
 
529
+ for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
530
+ if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
531
+ stored_messages.append(gr.ChatMessage(
532
+ role="assistant",
533
+ content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
534
+ ))
535
+ stored_messages.append(msg)
536
+ yield stored_messages
537
 
538
+ yield stored_messages
539
+ save_final_status(data_dir, "completed", memory = session_state["agent"].memory)
540
+
541
+ # # TODO: uncomment below after testing
542
+ except Exception as e:
543
+ error_message=f"Error in interaction: {str(e)}"
544
+ stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
545
+ yield stored_messages
546
+ raise e
547
+ save_final_status(data_dir, "failed", summary={}, error_message=error_message)
548
 
549
+ finally:
550
+ upload_to_hf_and_remove(data_dir)
551
 
552
  theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
553
 
e2bqwen.py CHANGED
@@ -101,12 +101,10 @@ class E2BVisionAgent(CodeAgent):
101
  max_steps: int = 200,
102
  verbosity_level: LogLevel = 2,
103
  planning_interval: int = 10,
104
- log_file = None,
105
  **kwargs
106
  ):
107
  self.desktop = desktop
108
  self.data_dir = data_dir
109
- self.log_path = log_file
110
  self.planning_interval = planning_interval
111
  # Initialize Desktop
112
  self.width, self.height = self.desktop.get_screen_size()
@@ -137,7 +135,6 @@ class E2BVisionAgent(CodeAgent):
137
  self.logger.log("Setting up agent tools...")
138
  self._setup_desktop_tools()
139
  self.step_callbacks.append(self.take_screenshot_callback)
140
- self.final_answer_checks = [self.store_metadata_to_file]
141
 
142
  def _setup_desktop_tools(self):
143
  """Register all desktop tools"""
@@ -151,7 +148,7 @@ class E2BVisionAgent(CodeAgent):
151
  """
152
  self.desktop.move_mouse(x, y)
153
  self.desktop.left_click()
154
- self.logger.log(self.log_path, f"Clicked at coordinates ({x}, {y})")
155
  return f"Clicked at coordinates ({x}, {y})"
156
 
157
  @tool
@@ -164,7 +161,7 @@ class E2BVisionAgent(CodeAgent):
164
  """
165
  self.desktop.move_mouse(x, y)
166
  self.desktop.right_click()
167
- self.logger.log(self.log_path, f"Right-clicked at coordinates ({x}, {y})")
168
  return f"Right-clicked at coordinates ({x}, {y})"
169
 
170
  @tool
@@ -177,7 +174,7 @@ class E2BVisionAgent(CodeAgent):
177
  """
178
  self.desktop.move_mouse(x, y)
179
  self.desktop.double_click()
180
- self.logger.log(self.log_path, f"Double-clicked at coordinates ({x}, {y})")
181
  return f"Double-clicked at coordinates ({x}, {y})"
182
 
183
  @tool
@@ -189,7 +186,7 @@ class E2BVisionAgent(CodeAgent):
189
  y: The y coordinate (vertical position)
190
  """
191
  self.desktop.move_mouse(x, y)
192
- self.logger.log(self.log_path, f"Moved mouse to coordinates ({x}, {y})")
193
  return f"Moved mouse to coordinates ({x}, {y})"
194
 
195
  @tool
@@ -201,7 +198,7 @@ class E2BVisionAgent(CodeAgent):
201
  delay_in_ms: Delay between keystrokes in milliseconds
202
  """
203
  self.desktop.write(text, delay_in_ms=delay_in_ms)
204
- self.logger.log(self.log_path, f"Typed text: '{text}'")
205
  return f"Typed text: '{text}'"
206
 
207
  @tool
@@ -214,7 +211,7 @@ class E2BVisionAgent(CodeAgent):
214
  if key == "enter":
215
  key = "Return"
216
  self.desktop.press(key)
217
- self.logger.log(self.log_path, f"Pressed key: {key}")
218
  return f"Pressed key: {key}"
219
 
220
  @tool
@@ -224,7 +221,7 @@ class E2BVisionAgent(CodeAgent):
224
  Args:
225
  """
226
  self.desktop.press(["alt", "left"])
227
- self.logger.log(self.log_path, "Went back one page")
228
  return "Went back one page"
229
 
230
  @tool
@@ -239,7 +236,7 @@ class E2BVisionAgent(CodeAgent):
239
  """
240
  self.desktop.drag([x1, y1], [x2, y2])
241
  message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
242
- self.logger.log(self.log_path, message)
243
  return message
244
 
245
  @tool
@@ -251,7 +248,7 @@ class E2BVisionAgent(CodeAgent):
251
  amount: The amount to scroll. A good amount is 1 or 2.
252
  """
253
  self.desktop.scroll(direction=direction, amount=amount)
254
- self.logger.log(self.log_path, f"Scrolled {direction} by {amount}")
255
  return f"Scrolled {direction} by {amount}"
256
 
257
  @tool
@@ -262,7 +259,7 @@ class E2BVisionAgent(CodeAgent):
262
  seconds: Number of seconds to wait, generally 3 is enough.
263
  """
264
  time.sleep(seconds)
265
- self.logger.log(self.log_path, f"Waited for {seconds} seconds")
266
  return f"Waited for {seconds} seconds"
267
 
268
  @tool
@@ -279,7 +276,7 @@ class E2BVisionAgent(CodeAgent):
279
  self.desktop.open(url)
280
  # Give it time to load
281
  time.sleep(2)
282
- self.logger.log(self.log_path, f"Opening URL: {url}")
283
  return f"Opened URL: {url}"
284
 
285
 
@@ -297,22 +294,9 @@ class E2BVisionAgent(CodeAgent):
297
  self.tools["drag_and_drop"] = drag_and_drop
298
 
299
 
300
- def store_metadata_to_file(self, final_answer, memory) -> None:
301
- metadata_path = os.path.join(self.data_dir, "metadata.json")
302
- output = {}
303
- # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
304
- for memory_step in self.memory.steps:
305
- if getattr(memory_step, "observations_images", None):
306
- memory_step.observations_images = None
307
- a = open(metadata_path,"w")
308
- a.write(json.dumps(self.write_memory_to_messages()))
309
- a.close()
310
- return True
311
-
312
-
313
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
314
  """Callback that takes a screenshot + memory snapshot after a step completes"""
315
- self.logger.log(self.log_path, "Analyzing screen content...")
316
 
317
  current_step = memory_step.step_number
318
 
@@ -362,12 +346,12 @@ class QwenVLAPIModel(Model):
362
  self.model_id = model_id
363
  self.base_model = HfApiModel(
364
  model_id,
365
- provider="nebius",
366
  token=hf_token,
367
  )
368
  self.fallback_model = HfApiModel(
369
  model_id,
370
- provider="hyperbolic",
371
  token=hf_token,
372
  )
373
 
 
101
  max_steps: int = 200,
102
  verbosity_level: LogLevel = 2,
103
  planning_interval: int = 10,
 
104
  **kwargs
105
  ):
106
  self.desktop = desktop
107
  self.data_dir = data_dir
 
108
  self.planning_interval = planning_interval
109
  # Initialize Desktop
110
  self.width, self.height = self.desktop.get_screen_size()
 
135
  self.logger.log("Setting up agent tools...")
136
  self._setup_desktop_tools()
137
  self.step_callbacks.append(self.take_screenshot_callback)
 
138
 
139
  def _setup_desktop_tools(self):
140
  """Register all desktop tools"""
 
148
  """
149
  self.desktop.move_mouse(x, y)
150
  self.desktop.left_click()
151
+ self.logger.log(f"Clicked at coordinates ({x}, {y})")
152
  return f"Clicked at coordinates ({x}, {y})"
153
 
154
  @tool
 
161
  """
162
  self.desktop.move_mouse(x, y)
163
  self.desktop.right_click()
164
+ self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
165
  return f"Right-clicked at coordinates ({x}, {y})"
166
 
167
  @tool
 
174
  """
175
  self.desktop.move_mouse(x, y)
176
  self.desktop.double_click()
177
+ self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
178
  return f"Double-clicked at coordinates ({x}, {y})"
179
 
180
  @tool
 
186
  y: The y coordinate (vertical position)
187
  """
188
  self.desktop.move_mouse(x, y)
189
+ self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
190
  return f"Moved mouse to coordinates ({x}, {y})"
191
 
192
  @tool
 
198
  delay_in_ms: Delay between keystrokes in milliseconds
199
  """
200
  self.desktop.write(text, delay_in_ms=delay_in_ms)
201
+ self.logger.log(f"Typed text: '{text}'")
202
  return f"Typed text: '{text}'"
203
 
204
  @tool
 
211
  if key == "enter":
212
  key = "Return"
213
  self.desktop.press(key)
214
+ self.logger.log(f"Pressed key: {key}")
215
  return f"Pressed key: {key}"
216
 
217
  @tool
 
221
  Args:
222
  """
223
  self.desktop.press(["alt", "left"])
224
+ self.logger.log("Went back one page")
225
  return "Went back one page"
226
 
227
  @tool
 
236
  """
237
  self.desktop.drag([x1, y1], [x2, y2])
238
  message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
239
+ self.logger.log(message)
240
  return message
241
 
242
  @tool
 
248
  amount: The amount to scroll. A good amount is 1 or 2.
249
  """
250
  self.desktop.scroll(direction=direction, amount=amount)
251
+ self.logger.log(f"Scrolled {direction} by {amount}")
252
  return f"Scrolled {direction} by {amount}"
253
 
254
  @tool
 
259
  seconds: Number of seconds to wait, generally 3 is enough.
260
  """
261
  time.sleep(seconds)
262
+ self.logger.log(f"Waited for {seconds} seconds")
263
  return f"Waited for {seconds} seconds"
264
 
265
  @tool
 
276
  self.desktop.open(url)
277
  # Give it time to load
278
  time.sleep(2)
279
+ self.logger.log(f"Opening URL: {url}")
280
  return f"Opened URL: {url}"
281
 
282
 
 
294
  self.tools["drag_and_drop"] = drag_and_drop
295
 
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
298
  """Callback that takes a screenshot + memory snapshot after a step completes"""
299
+ self.logger.log("Analyzing screen content...")
300
 
301
  current_step = memory_step.step_number
302
 
 
346
  self.model_id = model_id
347
  self.base_model = HfApiModel(
348
  model_id,
349
+ provider="hyperbolic",
350
  token=hf_token,
351
  )
352
  self.fallback_model = HfApiModel(
353
  model_id,
354
+ provider="nebius",
355
  token=hf_token,
356
  )
357