阳渠 committed
Commit e78b889 · 1 parent: d8d693c

Update space

Files changed (3)
  1. README.md +0 -14
  2. app.py +547 -52
  3. requirements.txt +9 -1
README.md DELETED
@@ -1,14 +0,0 @@
- ---
- title: PC Agent
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.0.1
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
- ---
-
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,64 +1,559 @@
  import gradio as gr
- from huggingface_hub import InferenceClient

  """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]

-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     response = ""

-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content

-         response += token
-         yield response


- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ import ast
+ import re
+ import io
+ import os
+ import json
+ import copy
+ import shutil
+ import base64
+ import random
  import gradio as gr
+ from datetime import datetime
+ from modelscope.pipelines import pipeline
+ from modelscope import snapshot_download
+ from PIL import Image, ImageDraw, ImageFont
+
+ from PCAgent.api import inference_chat
+ from PCAgent.icon_localization import det
+ from PCAgent.text_localization import ocr
+ from PCAgent.prompt_qwen import get_subtask_prompt
+ from PCAgent.chat import init_action_chat, init_memory_chat, add_response
+ from PCAgent.prompt_qwen import get_action_prompt, get_process_prompt, get_memory_prompt
+ from PCAgent.merge_strategy import merge_boxes_and_texts, merge_all_icon_boxes, merge_boxes_and_texts_new
+
+ vl_model_version = os.environ.get('vl_model_version')
+ llm_model_version = os.environ.get('llm_model_version')
+ API_url = os.environ.get('API_url')
+ token = os.environ.get('token')
+ os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
+ os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
+ radius = 100
+
+ chatbot_css = """
+ <style>
+ .chat-container {
+     display: flex;
+     flex-direction: column;
+     overflow-y: auto;
+     max-height: 800px;
+     margin: 10px;
+ }
+ .user-message, .bot-message {
+     margin: 5px;
+     padding: 10px;
+     border-radius: 10px;
+ }
+ .user-message {
+     text-align: right;
+     background-color: #7B68EE;
+     color: white;
+     align-self: flex-end;
+ }
+ .bot-message {
+     text-align: left;
+     background-color: #ADD8E6;
+     color: black;
+     align-self: flex-start;
+ }
+ .user-image {
+     text-align: right;
+     align-self: flex-end;
+     max-width: 150px;
+     max-height: 300px;
+ }
+ .bot-image {
+     text-align: left;
+     align-self: flex-start;
+     max-width: 200px;
+     max-height: 400px;
+ }
+ </style>
  """
+
+ def cmyk_to_rgb(c, m, y, k):
+     r = 255 * (1.0 - c / 255) * (1.0 - k / 255)
+     g = 255 * (1.0 - m / 255) * (1.0 - k / 255)
+     b = 255 * (1.0 - y / 255) * (1.0 - k / 255)
+     return int(r), int(g), int(b)
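As a quick sanity check of the conversion above (the channels here run 0-255 rather than the usual 0-1 fractions):

assert cmyk_to_rgb(0, 0, 0, 255) == (0, 0, 0)        # full key (black) ink -> black
assert cmyk_to_rgb(0, 0, 0, 0) == (255, 255, 255)    # no ink at all -> white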
+
+ def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path, font_path, no_text=0):
+     image = Image.open(image_path)
+     width, height = image.size
+     draw = ImageDraw.Draw(image)
+     total_boxes = len(coordinates)
+     colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+               for _ in range(total_boxes)]

+     for i, coord in enumerate(coordinates):
+         c, m, y, k = colors[i]
+         color = cmyk_to_rgb(c, m, y, k)

+         draw.rectangle(coord, outline=color, width=int(height * 0.0025))
+
+         if no_text != 1:
+             font = ImageFont.truetype(font_path, int(height * 0.012))
+             text_x = coord[0] + int(height * 0.0025)
+             text_y = max(0, coord[1] - int(height * 0.013))
+             draw.text((text_x, text_y), str(i + 1), fill=color, font=font)
+     image = image.convert('RGB')

+     if os.path.exists(output_image_path):
+         os.remove(output_image_path)
+     image.save(output_image_path)
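For orientation, a hypothetical call (paths, boxes, and font are illustrative, not from this commit; boxes are [x1, y1, x2, y2] pixel rectangles and labels are numbered from 1):

boxes = [[10, 10, 120, 60], [200, 40, 360, 90]]
draw_coordinates_boxes_on_image(
    "screenshot.png", boxes, "screenshot_som.png",
    font_path="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # any TTF on the host
    no_text=0,  # set to 1 to draw boxes only, without the numeric labels
)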

+ def get_perception_infos(screenshot_file, screenshot_som_file, font_path):
+     total_width, total_height = Image.open(screenshot_file).size
+
+     # no partition
+     img_list = [screenshot_file]
+     img_x_list = [0]
+     img_y_list = [0]

+     coordinates = []
+     texts = []
+     padding = total_height * 0.0025  # 10

+     for i, img in enumerate(img_list):
+         width, height = Image.open(img).size
+         sub_text, sub_coordinates = ocr(img)  # for api
+         for coordinate in sub_coordinates:
+             coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
+             coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
+             coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
+             coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
+
+         sub_text_merge, sub_coordinates_merge = merge_boxes_and_texts_new(sub_text, sub_coordinates)
+         coordinates.extend(sub_coordinates_merge)
+         texts.extend(sub_text_merge)
+     merged_text, merged_text_coordinates = merge_boxes_and_texts(texts, coordinates)
+
+     # placeholder filter: currently keeps every detected text box
+     filtered_merged_text = []
+     filtered_merged_text_coordinates = []
+     for i in range(len(merged_text)):
+         filtered_merged_text.append(merged_text[i])
+         filtered_merged_text_coordinates.append(merged_text_coordinates[i])
+     merged_text, merged_text_coordinates = filtered_merged_text, filtered_merged_text_coordinates
+
+     coordinates = []
+     for i, img in enumerate(img_list):
+         width, height = Image.open(img).size
+         sub_coordinates = det(img, "icon", groundingdino_model)
+         for coordinate in sub_coordinates:
+             coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
+             coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
+             coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
+             coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
+
+         sub_coordinates = merge_all_icon_boxes(sub_coordinates)
+         coordinates.extend(sub_coordinates)
+     merged_icon_coordinates = merge_all_icon_boxes(coordinates)
+
+     rec_list = merged_text_coordinates + merged_icon_coordinates
+     draw_coordinates_boxes_on_image(screenshot_file, copy.deepcopy(rec_list), screenshot_som_file, font_path)
+
+     mark_number = 0
+     perception_infos = []
+
+     for i in range(len(merged_text_coordinates)):
+         mark_number += 1
+         perception_info = {"text": "mark number: " + str(mark_number) + " text: " + merged_text[i], "coordinates": merged_text_coordinates[i]}
+         perception_infos.append(perception_info)
+
+     for i in range(len(merged_icon_coordinates)):
+         mark_number += 1
+         perception_info = {"text": "mark number: " + str(mark_number) + " icon", "coordinates": merged_icon_coordinates[i]}
+         perception_infos.append(perception_info)
+
+     # collapse each box to its center point
+     for i in range(len(perception_infos)):
+         perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0] + perception_infos[i]['coordinates'][2]) / 2), int((perception_infos[i]['coordinates'][1] + perception_infos[i]['coordinates'][3]) / 2)]
+
+     return perception_infos, total_width, total_height
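Each returned entry pairs a set-of-mark label with a single click point; a sketch of the output shape (values are illustrative):

# Text boxes come first, then icons, and every box is collapsed to its center:
# perception_infos = [
#     {"text": "mark number: 1 text: File", "coordinates": [34, 18]},
#     {"text": "mark number: 2 icon", "coordinates": [812, 40]},
# ]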
+
+ groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
+ groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)
+
+ def analyze_string(s):
+     result = {
+         'type': None,
+         'format_keys': [],
+         'dict_content': None
+     }
+
+     format_pattern = re.compile(r'\{(\w+)\}')
+
+     # {'key': 'value'}
+     dict_pattern = re.compile(
+         r'\{(?:\s*[\'\"]\w+[\'\"]\s*:\s*[\'\"][^{}\'\"]+[\'\"]\s*,?)*\}'
+     )
+
+     dict_matches = dict_pattern.findall(s)
+     dicts = []
+     for match in dict_matches:
+         try:
+             parsed_dict = ast.literal_eval(match)
+             if isinstance(parsed_dict, dict):
+                 dicts.append(parsed_dict)
+         except (ValueError, SyntaxError):
+             continue
+
+     has_dict = len(dicts) > 0
+
+     s_without_dicts = dict_pattern.sub('', s)
+
+     format_keys = format_pattern.findall(s_without_dicts)
+     has_format = len(format_keys) > 0
+
+     has_format_and_dict = has_format and has_dict
+
+     if has_format_and_dict:
+         result['type'] = 4
+     elif has_format:
+         result['type'] = 2
+     elif has_dict:
+         result['type'] = 3
+     else:
+         result['type'] = 1
+
+     if has_format:
+         result['format_keys'] = format_keys
+
+     if has_dict:
+         result['dict_content'] = dicts[0]
+
+     return result
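The four type codes distinguish plain text (1), format keys only (2), a dict literal only (3), and both (4); some illustrative inputs (not from the commit):

assert analyze_string("open the settings page")["type"] == 1
assert analyze_string("open {app}")["type"] == 2                  # format key only
assert analyze_string("{'name': 'report.txt'}")["type"] == 3      # dict literal only
assert analyze_string("save {file} as {'ext': 'pdf'}")["type"] == 4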
+
+ def is_good_string(s):
+     # Regex to match the dictionary-like part {'key1': 'value1', ...}
+     dict_pattern = r"\{('[^']+' *: *'[^']+' *(, *'[^']+' *: *'[^']+')*)?\}"
+     # Regex to match the item list part {item1, item2, ...} with no single quotes in items
+     item_pattern = r"\{([a-zA-Z0-9_]+( *, *[a-zA-Z0-9_]+)*)?\}"
+
+     # Find all parts of the string contained within braces
+     parts = re.findall(r'\{.*?\}', s)
+
+     for part in parts:
+         # Check if the part matches either the dictionary pattern or the item pattern
+         if not re.fullmatch(dict_pattern, part) and not re.fullmatch(item_pattern, part):
+             return False
+     return True
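Every braced group must therefore be either a fully quoted dict or a bare comma-separated item list; two hypothetical cases:

assert is_good_string("press {ctrl, c} then {'mode': 'dark'}") is True
assert is_good_string("bad {'key': unquoted} group") is False  # dict values must be quoted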
+
+ screenshot_root = "screenshot"
+ if os.path.exists(screenshot_root):
+     shutil.rmtree(screenshot_root)
+ os.mkdir(screenshot_root)
+
+ def image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="PNG")
+     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     img_html = f'<img src="data:image/png;base64,{img_str}" />'
+     return img_html
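The returned <img> tag is what gets spliced into the chat HTML below; for instance (hypothetical input):

thumb = image_to_base64(Image.open("example/1-1.jpg"))  # any PIL image works
# thumb == '<img src="data:image/png;base64,iVBORw0..." />'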
+
+ def chatbot(image, instruction, add_info, history, chat_log):
+     if history == {}:
+         output_for_save = []
+         thought_history = []
+         summary_history = []
+         action_history = []
+         summary = ""
+         action = ""
+         completed_requirements = ""
+         memory = ""
+         insight = ""
+         error_flag = False
+         user_msg = "<div class='user-message'>{}</div>".format(instruction)
+         step_idx = 0
+     else:
+         output_for_save = history["output_for_save"]
+         thought_history = history["thought_history"]
+         summary_history = history["summary_history"]
+         action_history = history["action_history"]
+         summary = history["summary"]
+         action = history["action"]
+         completed_requirements = history["completed_requirements"]
+         memory = history["memory"]
+         insight = history["insight"]
+         error_flag = history["error_flag"]
+         user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")
+         step_idx = history["history"]
+
+     current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+     temp_file = f"temp_{current_time}"
+     os.mkdir(temp_file)
+
+     screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
+     image.save(screenshot_file, format="PNG")
+     screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
+     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="C:/Windows/Fonts/arial.ttf")  # NOTE: Windows font path; point at an available TTF on other platforms
+     shutil.rmtree(temp_file)
+     os.mkdir(temp_file)
+
+     output_for_save_this_step = {}
+     prompt_action = get_action_prompt(instruction, perception_infos, width, height, thought_history, summary_history, action_history, [], summary, action, "", add_info, error_flag, completed_requirements, memory)
+     chat_action = init_action_chat()
+     chat_action = add_response("user", prompt_action, chat_action, [screenshot_som_file])
+     output_action = inference_chat(chat_action, vl_model_version, API_url, token)
+     output_for_save_this_step['action'] = output_action
+     action_json = json.loads(output_action.split('```json')[-1].split('```')[0])
+     thought = action_json['Thought']
+     summary = action_json['Summary']
+     action = action_json['Action']
+     chat_action = add_response("assistant", output_action, chat_action)
+
+     if "Double TapIdx" in action:
+         bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Double Tap" in action:
+         bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Triple TapIdx" in action:
+         bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Triple Tap" in action:
+         bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "TapIdx" in action:
+         bot_response = "Please click (click x 1) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Tap" in action:
+         bot_response = "Please click (click x 1) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Shortcut" in action:
+         keys = action.split("(")[-1].split(")")[0].split(", ")
+         key1, key2 = keys[0].lower(), keys[1].lower()
+         bot_response = f"Please press {key1}+{key2} and upload the current screenshot again."
+
+     elif "Press" in action:
+         key = action.split("(")[-1].split(")")[0]
+         bot_response = f"Please press {key} and upload the current screenshot again."
+
+     elif "Open App" in action:
+         app = action.split("(")[-1].split(")")[0]
+         bot_response = f"Please open the {app} app and upload the current screenshot again."
+
+     elif "Type" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             # for claude
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '').replace("'", "")
+             else:
+                 text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please type \"{text}\" in the red circle and upload the current screenshot again."
+
+     elif "Select (" in action:
+         content = action.split("(")[1].split(")")[0]
+         bot_response = f"Please select the text content \"{content}\" and upload the current screenshot again."
+
+     elif "Replace (" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             # for claude
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '')
+             else:
+                 if "] with " in action:
+                     text = action.split("] with ")[-1]
+                     text = text.replace("\"", '').replace("'", '').strip('.')
+                 else:
+                     text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please replace the text in the red circle with \"{text}\" and upload the current screenshot again."
+
+     elif "Append (" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip()
+             else:
+                 text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please insert the text \"{text}\" in the red circle and upload the current screenshot again."
+
+     elif "Stop" in action:
+         output_for_save.append(output_for_save_this_step)
+         bot_response = f"Answer: {output_for_save}, task completed"
+
+     prompt_memory = get_memory_prompt(insight)
+     chat_action = add_response("user", prompt_memory, chat_action)
+     output_memory = inference_chat(chat_action, vl_model_version, API_url, token)
+     chat_action = add_response("assistant", output_memory, chat_action)
+     output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n"
+     if "None" not in output_memory and output_memory not in memory:
+         memory += output_memory
+
+     bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
+     bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
+     bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
+     bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
+     bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
+     if len(memory) > 0:
+         bot_memory = "<div class='bot-message'>{}</div>".format(memory)
+     else:
+         bot_memory = "<div class='bot-message'>{}</div>".format("None")
+     bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
+     if image is not None:
+         bot_img_html = image_to_base64(image)
+         bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response
+
+     chat_log.append(user_msg)
+
+     shutil.rmtree(temp_file)
+     # os.remove(screenshot_file)
+     # os.remove(screenshot_som_file)
+
+     thought_history.append(thought)
+     summary_history.append(summary)
+     action_history.append(action)
+
+     prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history, completed_requirements, add_info)
+     chat_planning = init_memory_chat()
+     chat_planning = add_response("user", prompt_planning, chat_planning)
+     output_planning = inference_chat(chat_planning, llm_model_version, API_url, token)
+     output_for_save_this_step['planning'] = output_planning
+     chat_planning = add_response("assistant", output_planning, chat_planning)
+     completed_requirements = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
+
+     bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
+     output_planning = "<div class='bot-message'>{}</div>".format(output_planning)
+
+     history["thought_history"] = thought_history
+     history["summary_history"] = summary_history
+     history["action_history"] = action_history
+     history["summary"] = summary
+     history["action"] = action
+     history["memory"] = memory
+     history["memory_switch"] = True
+     history["insight"] = insight
+     history["error_flag"] = error_flag
+     history["completed_requirements"] = completed_requirements
+     history["output_for_save"] = output_for_save
+     history["history"] = step_idx + 1  # step counter
+
+     chat_log.append(bot_text3)
+     chat_log.append(output_planning)
+     chat_log.append(bot_text1)
+     chat_log.append(bot_thought)
+     chat_log.append(bot_action)
+     chat_log.append(bot_operation)
+     chat_log.append(bot_text2)
+     chat_log.append(bot_memory)
+     chat_log.append(bot_response)
+
+     chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
+
+     return chatbot_css + chat_html, history, chat_log
+
+
+ def lock_input(instruction):
+     return gr.update(value=instruction, interactive=False), gr.update(value=None)
+
+
+ def reset_demo():
+     return gr.update(value="", interactive=True), gr.update(value=None, interactive=True), "<div class='chat-container'></div>", {}, []
+
+
+ tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
+ <a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ <a href="https://arxiv.org/abs/2502.14282"><img src="https://img.shields.io/badge/Arxiv-2502.14282-red"></a>
+ <a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
+ </div>
+ If you like our project, please give us a star ✨ on GitHub for the latest updates.
+
+ **Terms of use**
+ 1. Input your instruction in "Instruction", for example "Turn on the dark mode".
+ 2. You can input helpful operation knowledge in "Knowledge".
+ 3. Click "Submit" to get the operation. Perform the operation on your PC accordingly, then upload a screenshot taken after the operation.
+ 4. We show two examples below, each with three screenshots. Click and submit them from top to bottom to experience the flow.
+
+ **使用说明**
+ 1. 在“Instruction”中输入你的指令,例如“打开深色模式”。
+ 2. 你可以在“Knowledge”中输入帮助性的操作知识。
+ 3. 点击“Submit”来获得操作。你需要根据输出来操作PC,并且上传操作后的截图。
+ 4. 我们在下方展示了两个例子,每个例子有三张截屏。请从上到下依次点击并提交来体验。""")
+
+ title_markdown = ("""# PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC""")
+
+ instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
+ knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge")
+ with gr.Blocks() as demo:
+     history_state = gr.State(value={})
+     history_output = gr.State(value=[])
+     with gr.Row():
+         gr.Markdown(title_markdown)
+     with gr.Row():
+         with gr.Column(scale=5):
+             gr.Markdown(tos_markdown)
+             image_input = gr.Image(label="Screenshot", type="pil", height=350, width=700)
+             gr.Examples(examples=[
+                 ["./example/1-1.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+                 ["./example/1-2.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+                 ["./example/1-3.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+             ], inputs=[image_input, instruction_input, knowledge_input])
+
+         with gr.Column(scale=6):
+             instruction_input.render()
+             knowledge_input.render()
+             with gr.Row():
+                 start_button = gr.Button("Submit")
+                 clear_button = gr.Button("Clear")
+             output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")
+
+     start_button.click(
+         fn=chatbot,
+         inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
+         outputs=[output_component, history_state, history_output]
+     )
+
+     clear_button.click(
+         fn=reset_demo,
+         inputs=[],
+         outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
+     )
+
+ demo.queue().launch(share=True)
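For reference, the per-session state threaded through chatbot via the two gr.State values looks like this after one step (a sketch assembled from the assignments above):

# history = {
#     "output_for_save": [...],   # raw model outputs per step
#     "thought_history": [...], "summary_history": [...], "action_history": [...],
#     "summary": "...", "action": "...", "completed_requirements": "...",
#     "memory": "...", "memory_switch": True, "insight": "", "error_flag": False,
#     "history": 1,               # step counter
# }
# chat_log is the list of rendered HTML <div> fragments.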
requirements.txt CHANGED
@@ -1 +1,9 @@
- huggingface_hub==0.25.2
+ modelscope==1.15.0
+ supervision==0.21.0
+ alibabacloud_tea_util
+ alibabacloud_tea_openapi
+ alibabacloud_ocr_api20210707
+ openai
+ dashscope
+ torch
+ opencv-python
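The new dependencies back the pipeline above: modelscope pulls the GroundingDINO detector, the alibabacloud_* packages serve the OCR calls, and openai/dashscope cover the chat APIs. A minimal setup sketch:

pip install -r requirements.txt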