ALEKSZOSIMOV committed on
Commit
3c2d0d5
·
verified ·
1 Parent(s): 1744fb6

Update app.py

Browse files

Use GPT-4o instead

Files changed (1) hide show
  1. app.py +0 -478
app.py CHANGED
@@ -1,482 +1,4 @@
1
  import json
2
- import os
3
- import shutil
4
- import tempfile
5
- import time
6
- import uuid
7
- from io import BytesIO
8
- from threading import Timer
9
- from typing import Any
10
-
11
- import gradio as gr
12
- from dotenv import load_dotenv
13
- from e2b_desktop import Sandbox
14
- from gradio_modal import Modal
15
- from huggingface_hub import login, upload_folder
16
- from PIL import Image
17
- from smolagents import CodeAgent, InferenceClientModel
18
- from smolagents.gradio_ui import GradioUI
19
-
20
- from e2bqwen import E2BVisionAgent, get_agent_summary_erase_images
21
- from gradio_script import stream_to_gradio
22
- from scripts_and_styling import (
23
- CUSTOM_JS,
24
- FOOTER_HTML,
25
- SANDBOX_CSS_TEMPLATE,
26
- SANDBOX_HTML_TEMPLATE,
27
- apply_theme,
28
- )
29
-
30
# Load variables from a local .env file, overriding values already present
# in the process environment.
load_dotenv(override=True)


# Example tasks offered in the UI. Runs whose task is one of these are not
# copied to the logs dataset (see upload_to_hf_and_remove).
TASK_EXAMPLES = [
    "Use Google Maps to find the Hugging Face HQ in Paris",
    "Go to Wikipedia and find what happened on April 4th",
    "Find out the travel time by train from Bern to Basel on Google Maps",
    "Go to Hugging Face Spaces and then find the Space flux.1 schnell. Use the space to generate an image with the prompt 'a field of gpus'",
]

E2B_API_KEY = os.getenv("E2B_API_KEY")
# Live sandboxes and their bookkeeping ("created_at" / "last_accessed"
# timestamps), both keyed by Gradio session hash.
SANDBOXES: dict[str, Sandbox] = {}
SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
# Compared against time.time() differences, i.e. expressed in seconds.
SANDBOX_TIMEOUT = 300
WIDTH = 1280
HEIGHT = 960
TMP_DIR = "./tmp/"
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(TMP_DIR, exist_ok=True)

hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
login(token=hf_token)

# Both templates carry <<WIDTH>> / <<HEIGHT>> placeholders; the viewer frame is
# sized slightly larger than the sandbox resolution.
custom_css = SANDBOX_CSS_TEMPLATE.replace("<<WIDTH>>", str(WIDTH + 15)).replace(
    "<<HEIGHT>>", str(HEIGHT + 10)
)

sandbox_html_template = SANDBOX_HTML_TEMPLATE.replace(
    "<<WIDTH>>", str(WIDTH + 15)
).replace("<<HEIGHT>>", str(HEIGHT + 10))
60
-
61
-
62
def upload_to_hf_and_remove(folder_paths: list[str]):
    """Copy eligible run folders to a staging directory, delete the originals, and upload.

    A folder is eligible when it contains a ``metadata.jsonl`` whose first
    record has a ``"task"`` that is not one of ``TASK_EXAMPLES``. Eligible
    folders are copied into a temporary directory under ``TMP_DIR``; folders
    that have a ``metadata.jsonl`` are removed from disk afterwards. The
    staging directory is then uploaded to the logs dataset in one call.

    Args:
        folder_paths: Run folders to process. A single path given as a string
            is tolerated and treated as a one-element list (otherwise it would
            be iterated character by character).

    Returns:
        A message stating how many folders were actually uploaded.
    """
    repo_id = "smolagents/computer-agent-logs-2"

    if isinstance(folder_paths, (str, os.PathLike)):
        folder_paths = [os.fspath(folder_paths)]

    uploaded_count = 0
    with tempfile.TemporaryDirectory(dir=TMP_DIR) as temp_dir:
        print(
            f"Uploading {len(folder_paths)} folders to {repo_id} (might end up with 0 folders uploaded if tasks are all examples)..."
        )

        # Copy all folders into the temporary directory
        for folder_path in folder_paths:
            folder_name = os.path.basename(os.path.normpath(folder_path))
            target_path = os.path.join(temp_dir, folder_name)
            metadata_path = os.path.join(folder_path, "metadata.jsonl")
            print("Scanning folder", metadata_path)
            if not os.path.exists(metadata_path):
                continue

            with open(metadata_path, "r") as f:
                # save_final_status prepends "\n" to each record, so a file
                # that never received its task line starts with a blank line.
                json_content = [json.loads(line) for line in f if line.strip()]

            task = json_content[0].get("task") if json_content else None
            # Skip upload if the task is in the examples (or is missing)
            if task is not None and task not in TASK_EXAMPLES:
                print(f"Copying {folder_path} to temporary directory...")
                shutil.copytree(folder_path, target_path)
                uploaded_count += 1

            # Remove the original folder after copying
            shutil.rmtree(folder_path)
            print(f"Original folder {folder_path} removed.")

        if uploaded_count:
            # Upload the entire temporary directory
            print(f"Uploading all folders to {repo_id}...")
            upload_folder(
                folder_path=temp_dir,
                repo_id=repo_id,
                repo_type="dataset",
                ignore_patterns=[".git/*", ".gitignore"],
            )
            print("Upload complete.")
        else:
            print("Nothing to upload.")

    return f"Successfully uploaded {uploaded_count} folders to {repo_id}"
97
-
98
-
99
def cleanup_sandboxes():
    """Remove sandboxes that haven't been accessed for longer than SANDBOX_TIMEOUT.

    For each expired session that still has a sandbox, any data directory
    found at ``TMP_DIR/<session_id>`` is uploaded, then the sandbox is killed
    and its entries are dropped from ``SANDBOXES`` and ``SANDBOX_METADATA``.
    Errors are printed rather than raised so one bad session does not stop the
    sweep.
    """
    current_time = time.time()
    # Collect first: the dicts are mutated below and must not change size
    # while being iterated.
    sandboxes_to_remove = [
        session_id
        for session_id, metadata in SANDBOX_METADATA.items()
        if current_time - metadata["last_accessed"] > SANDBOX_TIMEOUT
    ]

    for session_id in sandboxes_to_remove:
        if session_id not in SANDBOXES:
            continue

        # Upload data before removing if needed. An upload failure must not
        # keep the sandbox alive, so it is handled separately from the kill.
        data_dir = os.path.join(TMP_DIR, session_id)
        if os.path.exists(data_dir):
            try:
                # The uploader takes a list of folders, not a bare path.
                upload_to_hf_and_remove([data_dir])
            except Exception as e:
                print(f"Error uploading data for session {session_id}: {str(e)}")

        try:
            # Close the sandbox
            SANDBOXES[session_id].kill()
            del SANDBOXES[session_id]
            del SANDBOX_METADATA[session_id]
            print(f"Cleaned up sandbox for session {session_id}")
        except Exception as e:
            print(f"Error cleaning up sandbox {session_id}: {str(e)}")
123
-
124
-
125
def get_or_create_sandbox(session_hash: str):
    """Return the desktop sandbox for a session, creating one when needed.

    An existing sandbox is reused while it is younger than ``SANDBOX_TIMEOUT``
    (measured from its creation time); reuse refreshes ``last_accessed``.
    Otherwise any previous sandbox for the session is killed (best effort) and
    a fresh one is started, registered in ``SANDBOXES`` / ``SANDBOX_METADATA``
    and returned.

    Args:
        session_hash: Gradio session hash used as the registry key.

    Returns:
        The ``Sandbox`` instance serving this session.
    """
    current_time = time.time()

    metadata = SANDBOX_METADATA.get(session_hash)
    if (
        session_hash in SANDBOXES
        and metadata is not None
        and current_time - metadata["created_at"] < SANDBOX_TIMEOUT
    ):
        print(f"Reusing Sandbox for session {session_hash}")
        metadata["last_accessed"] = current_time
        return SANDBOXES[session_hash]

    print("No sandbox found, creating a new one")

    if session_hash in SANDBOXES:
        try:
            print(f"Closing expired sandbox for session {session_hash}")
            SANDBOXES[session_hash].kill()
        except Exception as e:
            # Best effort: the old sandbox may already be gone.
            print(f"Error closing expired sandbox: {str(e)}")

    print(f"Creating new sandbox for session {session_hash}")
    desktop = Sandbox(
        api_key=E2B_API_KEY,
        resolution=(WIDTH, HEIGHT),
        dpi=96,
        timeout=SANDBOX_TIMEOUT,
        template="k0wmnzir0zuzye6dndlw",
    )
    desktop.stream.start(require_auth=True)
    # Install a Firefox policy file that blanks the first-run/post-update pages
    # and disables profile import and the default-browser check.
    setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
    desktop.commands.run(setup_cmd)

    print(f"Sandbox ID for session {session_hash} is {desktop.sandbox_id}.")

    SANDBOXES[session_hash] = desktop
    SANDBOX_METADATA[session_hash] = {
        "created_at": current_time,
        "last_accessed": current_time,
    }
    return desktop
167
-
168
-
169
def update_html(interactive_mode: bool, session_hash: str):
    """Build the sandbox viewer HTML for a session.

    Args:
        interactive_mode: When true the stream URL is used as-is and the status
            reads "Interactive"; otherwise ``&view_only=true`` is appended and
            the status reads "Agent running...".
        session_hash: Gradio session hash identifying the sandbox.

    Returns:
        The filled-in HTML template followed by a hidden ``div`` carrying the
        sandbox creation time and the timeout as data attributes.
    """
    desktop = get_or_create_sandbox(session_hash)
    auth_key = desktop.stream.get_auth_key()
    base_url = desktop.stream.get_url(auth_key=auth_key)

    if interactive_mode:
        stream_url = base_url
        status_class = "status-interactive"
        status_text = "Interactive"
    else:
        stream_url = f"{base_url}&view_only=true"
        status_class = "status-view-only"
        status_text = "Agent running..."

    # Fall back to "now" if the session has no metadata entry.
    metadata = SANDBOX_METADATA.get(session_hash)
    creation_time = metadata["created_at"] if metadata is not None else time.time()

    sandbox_html_content = sandbox_html_template.format(
        stream_url=stream_url,
        status_class=status_class,
        status_text=status_text,
    )
    sandbox_html_content += f'<div id="sandbox-creation-time" style="display:none;" data-time="{creation_time}" data-timeout="{SANDBOX_TIMEOUT}"></div>'
    return sandbox_html_content
190
-
191
-
192
def generate_interaction_id(session_hash: str) -> str:
    """Return ``"<session_hash>_<unix seconds>"`` as an id for one agent run."""
    return f"{session_hash}_{int(time.time())}"
194
-
195
-
196
def save_final_status(folder, status: str, summary, error_message=None) -> None:
    """Append the outcome of a run to ``<folder>/metadata.jsonl``.

    Args:
        folder: Directory holding the run's ``metadata.jsonl``.
        status: Final status label for the run.
        summary: JSON-serializable summary of the run.
        error_message: Error text for failed runs, ``None`` otherwise.
    """
    record = {"status": status, "summary": summary, "error_message": error_message}
    with open(os.path.join(folder, "metadata.jsonl"), "a") as output_file:
        # The leading newline separates this record from the task record,
        # which is written without a trailing newline.
        output_file.write("\n" + json.dumps(record))
204
-
205
-
206
def extract_browser_uuid(js_uuid):
    """Log the browser UUID received from the front-end JS and return it unchanged."""
    print(f"[BROWSER] Got browser UUID from JS: {js_uuid}")
    return js_uuid
209
-
210
-
211
def initialize_session(interactive_mode, request: gr.Request):
    """Prepare the sandbox view for a new session.

    Args:
        interactive_mode: Passed through to ``update_html``.
        request: Gradio request; its ``session_hash`` identifies the session.

    Returns:
        A tuple of the sandbox HTML and a freshly generated UUID string.

    Raises:
        gr.Error: If the request carries no session hash.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if request.session_hash is None:
        raise gr.Error("Session hash is missing from the request")
    print("GETTING REQUEST HASH:", request.session_hash)
    new_uuid = str(uuid.uuid4())
    return update_html(interactive_mode, request.session_hash), new_uuid
216
-
217
-
218
def create_agent(data_dir, desktop):
    """Build a fresh ``E2BVisionAgent`` bound to a sandbox.

    Args:
        data_dir: Directory handed to the agent as ``data_dir``.
        desktop: Sandbox the agent will drive.

    Returns:
        An ``E2BVisionAgent`` limited to 20 steps, backed by the hosted
        inference endpoint below.
    """
    model = InferenceClientModel(
        model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
        token=hf_token,
    )

    # model = OpenAIServerModel(
    #     "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
    # )
    return E2BVisionAgent(
        model=model,
        data_dir=data_dir,
        desktop=desktop,
        max_steps=20,
        verbosity_level=2,
        # planning_interval=10,
        use_v1_prompt=True,
    )
236
-
237
-
238
# session hash -> {interaction id: True}; records every run started by a session.
INTERACTION_IDS_PER_SESSION_HASH: dict[str, dict[str, bool]] = {}
239
-
240
-
241
class EnrichedGradioUI(GradioUI):
    """GradioUI variant that runs each task on the session's desktop sandbox
    and optionally records the run under ``TMP_DIR``."""

    def log_user_message(self, text_input):
        """Return the submitted text together with a disabled button."""
        return (
            text_input,
            gr.Button(interactive=False),
        )

    def interact_with_agent(
        self,
        task_input,
        stored_messages,
        session_state,
        consent_storage,
        request: gr.Request,
    ):
        """Run one task with a fresh agent, streaming chat messages as they arrive.

        Args:
            task_input: Task text entered by the user.
            stored_messages: Chat history list; mutated in place and yielded
                after every update.
            session_state: Per-session dict; the new agent is stored under
                ``"agent"``.
            consent_storage: When truthy, the task, final status and agent
                summary are written to ``TMP_DIR/<interaction id>/metadata.jsonl``.
            request: Gradio request; its ``session_hash`` selects the sandbox.

        Yields:
            ``stored_messages`` after each change.

        Raises:
            gr.Error: If ``task_input`` is empty.
        """
        # Validate before creating a sandbox, a data directory or an agent.
        if not task_input:
            raise gr.Error("Task cannot be empty")

        interaction_id = generate_interaction_id(request.session_hash)
        desktop = get_or_create_sandbox(request.session_hash)
        INTERACTION_IDS_PER_SESSION_HASH.setdefault(request.session_hash, {})[
            interaction_id
        ] = True

        data_dir = os.path.join(TMP_DIR, interaction_id)
        print("CREATING DATA DIR", data_dir, "FROM", TMP_DIR, interaction_id)

        if consent_storage:
            os.makedirs(data_dir, exist_ok=True)

        # Always re-create an agent from scratch, else Qwen-VL gets confused with past history
        agent = create_agent(data_dir=data_dir, desktop=desktop)
        session_state["agent"] = agent

        # Defaults for the `finally` block: without them the success path hits
        # an unbound `error_message`, and a generator closed mid-run (e.g. a
        # cancelled event) hits an unbound `status`.
        status = "interrupted"
        error_message = None

        try:
            stored_messages.append(
                gr.ChatMessage(
                    role="user", content=task_input, metadata={"status": "done"}
                )
            )
            yield stored_messages

            if consent_storage:
                with open(os.path.join(data_dir, "metadata.jsonl"), "w") as output_file:
                    output_file.write(
                        json.dumps(
                            {"task": task_input},
                        )
                    )

            screenshot_bytes = agent.desktop.screenshot(format="bytes")
            initial_screenshot = Image.open(BytesIO(screenshot_bytes))
            for msg in stream_to_gradio(
                agent,
                task=task_input,
                reset_agent_memory=False,
                task_images=[initial_screenshot],
            ):
                if (
                    hasattr(agent, "last_marked_screenshot")
                    and isinstance(msg, gr.ChatMessage)
                    and msg.content == "-----"
                ):  # Append the last screenshot before the end of step
                    stored_messages.append(
                        gr.ChatMessage(
                            role="assistant",
                            content={
                                "path": agent.last_marked_screenshot.to_string(),
                                "mime_type": "image/png",
                            },
                            metadata={"status": "done"},
                        )
                    )
                if isinstance(msg, gr.ChatMessage):
                    stored_messages.append(msg)
                elif isinstance(msg, str):  # Then it's only a completion delta
                    # Tolerate messages whose metadata has no "status" entry.
                    last_metadata = stored_messages[-1].metadata or {}
                    if last_metadata.get("status") == "pending":
                        stored_messages[-1].content = msg
                    else:
                        stored_messages.append(
                            gr.ChatMessage(
                                role="assistant",
                                content=msg,
                                metadata={"status": "pending"},
                            )
                        )
                yield stored_messages

            status = "completed"
            yield stored_messages

        except Exception as e:
            error_message = f"Error in interaction: {str(e)}"
            print(error_message)
            stored_messages.append(
                gr.ChatMessage(
                    role="assistant", content="Run failed:\n" + error_message
                )
            )
            status = "failed"
            yield stored_messages
        finally:
            if consent_storage:
                summary = get_agent_summary_erase_images(agent)
                save_final_status(
                    data_dir, status, summary=summary, error_message=error_message
                )
                print("SAVING FINAL STATUS", data_dir, status, summary, error_message)
355
-
356
-
357
# Gradio theme passed to gr.Blocks below.
theme = gr.themes.Default(
    font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
)
360
-
361
- # Create a Gradio app with Blocks
362
- with gr.Blocks(theme=theme, css=custom_css, js=CUSTOM_JS) as demo:
363
- # Storing session hash in a state variable
364
- print("Starting the app!")
365
- with gr.Row():
366
- sandbox_html = gr.HTML(
367
- value=sandbox_html_template.format(
368
- stream_url="",
369
- status_class="status-interactive",
370
- status_text="Interactive",
371
- ),
372
- label="Output",
373
- )
374
- with gr.Sidebar(position="left"):
375
- with Modal(visible=True) as modal:
376
- gr.Markdown("""### Welcome to smolagent's Computer agent demo 🖥️
377
- In this app, you'll be able to interact with an agent powered by [smolagents](https://github.com/huggingface/smolagents) and [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct).
378
-
379
- 👉 Type a task in the left sidebar, click the button, and watch the agent solving your task. ✨
380
-
381
- _Please note that we store the task logs by default so **do not write any personal information**; you can uncheck the logs storing on the task bar._
382
- """)
383
- task_input = gr.Textbox(
384
- placeholder="Find me pictures of cute puppies",
385
- label="Enter your task below:",
386
- elem_classes="primary-color-label",
387
- )
388
-
389
- run_btn = gr.Button("Let's go!", variant="primary")
390
-
391
- gr.Examples(
392
- examples=TASK_EXAMPLES,
393
- inputs=task_input,
394
- label="Example Tasks",
395
- examples_per_page=4,
396
- )
397
-
398
- session_state = gr.State({})
399
- stored_messages = gr.State([])
400
-
401
- minimalist_toggle = gr.Checkbox(label="Innie/Outie", value=False)
402
-
403
- consent_storage = gr.Checkbox(
404
- label="Store task and agent trace?", value=True
405
- )
406
-
407
- gr.Markdown(
408
- """
409
- - **Data**: To opt-out of storing your trace, uncheck the box above.
410
- - **Be patient**: The agent's first step can take a few seconds.
411
- - **Captcha**: Sometimes the VMs get flagged for weird behaviour and are blocked with a captcha. Best is then to interrupt the agent and solve the captcha manually.
412
- - **Restart**: If your agent seems stuck, the simplest way to restart is to refresh the page.
413
- """.strip()
414
- )
415
-
416
- # Hidden HTML element to inject CSS dynamically
417
- theme_styles = gr.HTML(apply_theme(False), visible=False)
418
- minimalist_toggle.change(
419
- fn=apply_theme, inputs=[minimalist_toggle], outputs=[theme_styles]
420
- )
421
-
422
- footer = gr.HTML(value=FOOTER_HTML, label="Footer")
423
-
424
- chatbot_display = gr.Chatbot(
425
- elem_id="chatbot",
426
- label="Agent's execution logs",
427
- type="messages",
428
- avatar_images=(
429
- None,
430
- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
431
- ),
432
- resizable=True,
433
- )
434
-
435
- agent_ui = EnrichedGradioUI(
436
- CodeAgent(tools=[], model=None, name="ok", description="ok")
437
- )
438
-
439
- stop_btn = gr.Button("Stop the agent!", variant="huggingface")
440
-
441
- def read_log_content(log_file, tail=4):
442
- """Read the contents of a log file for a specific session"""
443
- if not log_file:
444
- return "Waiting for session..."
445
-
446
- if not os.path.exists(log_file):
447
- return "Waiting for machine from the future to boot..."
448
-
449
- try:
450
- with open(log_file, "r") as f:
451
- lines = f.readlines()
452
- return "".join(lines[-tail:] if len(lines) > tail else lines)
453
- except Exception as e:
454
- return f"Guru meditation: {str(e)}"
455
-
456
- # Function to set view-only mode
457
- def clear_and_set_view_only(task_input, request: gr.Request):
458
- return update_html(False, request.session_hash)
459
-
460
- def set_interactive(request: gr.Request):
461
- return update_html(True, request.session_hash)
462
-
463
- def reactivate_stop_btn():
464
- return gr.Button("Stop the agent!", variant="huggingface")
465
-
466
- is_interactive = gr.Checkbox(value=True, visible=False)
467
-
468
- # Chain the events
469
- run_event = (
470
- run_btn.click(
471
- fn=clear_and_set_view_only,
472
- inputs=[task_input],
473
- outputs=[sandbox_html],
474
- )
475
- .then(
476
- agent_ui.interact_with_agent,
477
- inputs=[
478
- task_input,
479
- stored_messages,
480
  session_state,
481
  consent_storage,
482
  ],
 
1
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  session_state,
3
  consent_storage,
4
  ],