m-ric HF Staff committed on
Commit
379f8cb
·
1 Parent(s): 4208d01

Implement replay function

Browse files
Files changed (3) hide show
  1. app.py +18 -9
  2. e2bqwen.py +11 -4
  3. model_replay.py +10 -7
app.py CHANGED
@@ -12,7 +12,7 @@ from e2b_desktop import Sandbox
12
  from smolagents import CodeAgent
13
  from smolagents.monitoring import LogLevel
14
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
15
- from model_replay import FakeModelClass
16
 
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
@@ -488,7 +488,7 @@ class EnrichedGradioUI(GradioUI):
488
  gr.Button(interactive=False),
489
  )
490
 
491
- def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, replay_log, request: gr.Request):
492
  import gradio as gr
493
 
494
  interaction_id = generate_interaction_id(request)
@@ -504,9 +504,10 @@ class EnrichedGradioUI(GradioUI):
504
  else:
505
  session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
506
 
507
- if replay_log is not None:
508
  original_model = session_state["agent"].model
509
- session_state["agent"].model = FakeModelReplayLog(replay_log)
 
510
 
511
  try:
512
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
@@ -539,8 +540,9 @@ class EnrichedGradioUI(GradioUI):
539
  save_final_status(data_dir, "failed", summary=[], error_message=error_message)
540
 
541
  finally:
542
- if replay_log: # Replace the model with original model
543
  session_state["agent"].model = original_model
 
544
  upload_to_hf_and_remove(data_dir)
545
 
546
  theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
@@ -573,7 +575,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
573
  "Check the commuting time between Bern and Zurich on Google maps",
574
  "Write 'Hello World' in a text editor",
575
  "Search a flight Paris - Berlin for tomorrow",
576
- "Could you head to Fontainebleau (France) in Google Maps, and get me the name of the pond just south of the castle?",
577
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
578
  ],
579
  inputs = task_input,
@@ -685,9 +687,10 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
685
  fn=clear_and_set_view_only,
686
  inputs=[task_input],
687
  outputs=[sandbox_html]
688
- ).then(
 
689
  agent_ui.interact_with_agent,
690
- inputs=[task_input, stored_messages, session_state, session_hash_state, None],
691
  outputs=[chatbot_display]
692
  ).then(
693
  fn=set_interactive,
@@ -695,13 +698,19 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
695
  outputs=[sandbox_html]
696
  )
697
 
 
 
 
698
  replay_btn.click(
699
  fn=clear_and_set_view_only,
700
  inputs=[task_input],
701
  outputs=[sandbox_html]
 
 
 
702
  ).then(
703
  agent_ui.interact_with_agent,
704
- inputs=[task_input, stored_messages, session_state, session_hash_state, "udupp2fyavq_1743170323"],
705
  outputs=[chatbot_display]
706
  ).then(
707
  fn=set_interactive,
 
12
  from smolagents import CodeAgent
13
  from smolagents.monitoring import LogLevel
14
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
15
+ from model_replay import FakeModelReplayLog
16
 
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
 
488
  gr.Button(interactive=False),
489
  )
490
 
491
+ def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
492
  import gradio as gr
493
 
494
  interaction_id = generate_interaction_id(request)
 
504
  else:
505
  session_state["agent"] = create_agent(data_dir=data_dir, desktop=desktop)
506
 
507
+ if "replay_log" in session_state and session_state["replay_log"] is not None:
508
  original_model = session_state["agent"].model
509
+ session_state["agent"].model = FakeModelReplayLog(session_state["replay_log"])
510
+
511
 
512
  try:
513
  stored_messages.append(gr.ChatMessage(role="user", content=task_input))
 
540
  save_final_status(data_dir, "failed", summary=[], error_message=error_message)
541
 
542
  finally:
543
+ if "replay_log" in session_state and session_state["replay_log"] is not None: # Replace the model with original model
544
  session_state["agent"].model = original_model
545
+ session_state["replay_log"] = None
546
  upload_to_hf_and_remove(data_dir)
547
 
548
  theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")
 
575
  "Check the commuting time between Bern and Zurich on Google maps",
576
  "Write 'Hello World' in a text editor",
577
  "Search a flight Paris - Berlin for tomorrow",
578
+ "Search for Château de Fontainebleau in Google Maps",
579
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
580
  ],
581
  inputs = task_input,
 
687
  fn=clear_and_set_view_only,
688
  inputs=[task_input],
689
  outputs=[sandbox_html]
690
+ )
691
+ view_only_event.then(
692
  agent_ui.interact_with_agent,
693
+ inputs=[task_input, stored_messages, session_state, session_hash_state],
694
  outputs=[chatbot_display]
695
  ).then(
696
  fn=set_interactive,
 
698
  outputs=[sandbox_html]
699
  )
700
 
701
+ def set_logs_source(session_state):
702
+ session_state["replay_log"] = "udupp2fyavq_1743170323"
703
+
704
  replay_btn.click(
705
  fn=clear_and_set_view_only,
706
  inputs=[task_input],
707
  outputs=[sandbox_html]
708
+ ).then(
709
+ set_logs_source,
710
+ inputs=[session_state]
711
  ).then(
712
  agent_ui.interact_with_agent,
713
+ inputs=[task_input, stored_messages, session_state, session_hash_state],
714
  outputs=[chatbot_display]
715
  ).then(
716
  fn=set_interactive,
e2bqwen.py CHANGED
@@ -5,6 +5,7 @@ from io import BytesIO
5
  from textwrap import dedent
6
  from typing import Any, Dict, List, Optional, Tuple
7
  import json
 
8
 
9
  # HF API params
10
  from huggingface_hub import InferenceClient
@@ -260,6 +261,9 @@ class E2BVisionAgent(CodeAgent):
260
  self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
261
  return f"Moved mouse to coordinates ({x}, {y})"
262
 
 
 
 
263
  @tool
264
  def type_text(text: str, delay_in_ms: int = 75) -> str:
265
  """
@@ -268,9 +272,10 @@ class E2BVisionAgent(CodeAgent):
268
  text: The text to type
269
  delay_in_ms: Delay between keystrokes in milliseconds
270
  """
271
- self.desktop.write(text, delay_in_ms=delay_in_ms)
272
- self.logger.log(f"Typed text: '{text}'")
273
- return f"Typed text: '{text}'"
 
274
 
275
  @tool
276
  def press_key(key: str) -> str:
@@ -309,10 +314,12 @@ class E2BVisionAgent(CodeAgent):
309
  return message
310
 
311
  @tool
312
- def scroll(direction: str = "down", amount: int = 1) -> str:
313
  """
314
  Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
315
  Args:
 
 
316
  direction: The direction to scroll ("up" or "down"), defaults to "down"
317
  amount: The amount to scroll. A good amount is 1 or 2.
318
  """
 
5
  from textwrap import dedent
6
  from typing import Any, Dict, List, Optional, Tuple
7
  import json
8
+ import unicodedata
9
 
10
  # HF API params
11
  from huggingface_hub import InferenceClient
 
261
  self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
262
  return f"Moved mouse to coordinates ({x}, {y})"
263
 
264
+ def normalize_text(text):
265
+ return ''.join(c for c in unicodedata.normalize('NFD', text) if not unicodedata.combining(c))
266
+
267
  @tool
268
  def type_text(text: str, delay_in_ms: int = 75) -> str:
269
  """
 
272
  text: The text to type
273
  delay_in_ms: Delay between keystrokes in milliseconds
274
  """
275
+ clean_text = normalize_text(text)
276
+ self.desktop.write(clean_text, delay_in_ms=delay_in_ms)
277
+ self.logger.log(f"Typed text: '{clean_text}'")
278
+ return f"Typed text: '{clean_text}'"
279
 
280
  @tool
281
  def press_key(key: str) -> str:
 
314
  return message
315
 
316
  @tool
317
+ def scroll(x: int, y: int, direction: str = "down", amount: int = 1) -> str:
318
  """
319
  Uses scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
320
  Args:
321
+ x: The x coordinate (horizontal position) of the element to scroll/zoom
322
+ y: The y coordinate (vertical position) of the element to scroll/zoom
323
  direction: The direction to scroll ("up" or "down"), defaults to "down"
324
  amount: The amount to scroll. A good amount is 1 or 2.
325
  """
model_replay.py CHANGED
@@ -1,7 +1,11 @@
1
  from smolagents.models import Model, ChatMessage, Tool, MessageRole
2
- from time import time
 
 
 
3
 
4
- class FakeModelClass(Model):
 
5
  """A model class that returns pre-recorded responses from a log file.
6
 
7
  This class is useful for testing and debugging purposes, as it doesn't make
@@ -19,7 +23,7 @@ class FakeModelClass(Model):
19
  **kwargs
20
  ):
21
  super().__init__(**kwargs)
22
- self.dataset_name = "smolagents/computer-agent-logs",
23
  self.log_folder = log_folder
24
  self.call_counter = 0
25
  self.model_outputs = self._load_model_outputs()
@@ -40,9 +44,8 @@ class FakeModelClass(Model):
40
  # Extract only the model_output from each step in tool_calls
41
  model_outputs = []
42
 
43
- for step in log_data.get("tool_calls", []):
44
- if "model_output_message" in step:
45
- model_outputs.append(step["model_output_message"])
46
 
47
  print(f"Loaded {len(model_outputs)} model outputs from log file")
48
  return model_outputs
@@ -67,7 +70,7 @@ class FakeModelClass(Model):
67
  Returns:
68
  ChatMessage: The next pre-recorded response.
69
  """
70
- time.sleep(1.0)
71
 
72
  # Get the next model output
73
  if self.call_counter < len(self.model_outputs):
 
1
  from smolagents.models import Model, ChatMessage, Tool, MessageRole
2
+ from time import sleep
3
+ from typing import List, Dict, Optional
4
+ from huggingface_hub import hf_hub_download
5
+ import json
6
 
7
+
8
+ class FakeModelReplayLog(Model):
9
  """A model class that returns pre-recorded responses from a log file.
10
 
11
  This class is useful for testing and debugging purposes, as it doesn't make
 
23
  **kwargs
24
  ):
25
  super().__init__(**kwargs)
26
+ self.dataset_name = "smolagents/computer-agent-logs"
27
  self.log_folder = log_folder
28
  self.call_counter = 0
29
  self.model_outputs = self._load_model_outputs()
 
44
  # Extract only the model_output from each step in tool_calls
45
  model_outputs = []
46
 
47
+ for step in log_data["summary"][1:]:
48
+ model_outputs.append(step["model_output_message"]["content"])
 
49
 
50
  print(f"Loaded {len(model_outputs)} model outputs from log file")
51
  return model_outputs
 
70
  Returns:
71
  ChatMessage: The next pre-recorded response.
72
  """
73
+ sleep(1.0)
74
 
75
  # Get the next model output
76
  if self.call_counter < len(self.model_outputs):