import ast import re import io import os import json import copy import shutil import base64 import random import requests import gradio as gr from datetime import datetime from modelscope.pipelines import pipeline from modelscope import snapshot_download from modelscope.utils.constant import Tasks from PIL import Image, ImageDraw, ImageFont from PCAgent.api import inference_chat from PCAgent.icon_localization import det from PCAgent.text_localization_old import ocr from PCAgent.prompt_qwen import get_subtask_prompt as get_subtask_prompt from PCAgent.chat import init_action_chat, init_memory_chat, add_response from PCAgent.prompt_qwen import get_action_prompt, get_process_prompt, get_memory_prompt from PCAgent.merge_strategy import merge_boxes_and_texts, merge_all_icon_boxes, merge_boxes_and_texts_new vl_model_version = os.getenv('vl_model_version') llm_model_version = os.getenv('llm_model_version') API_url = os.getenv('API_url') token = os.getenv('token') # os.environ["OCR_ACCESS_KEY_ID"] = os.getenv('OCR_ACCESS_KEY_ID') # os.environ["OCR_ACCESS_KEY_SECRET"] = os.getenv('OCR_ACCESS_KEY_SECRET') ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo') ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo') tff_file = os.environ.get('tff_file') radius = 100 def download_file(url, save_path): response = requests.get(url, stream=True) # 以流的方式下载 response.raise_for_status() # 确保请求成功 with open(save_path, 'wb') as file: for chunk in response.iter_content(chunk_size=8192): # 分块写入，防止占用过多内存 file.write(chunk) download_file(tff_file, "arial.ttf") chatbot_css = """ """ def cmyk_to_rgb(c, m, y, k): r = 255 * (1.0 - c / 255) * (1.0 - k / 255) g = 255 * (1.0 - m / 255) * (1.0 - k / 255) b = 255 * (1.0 - y / 255) * (1.0 - k / 255) return int(r), int(g), int(b) def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path, font_path, no_text=0): image = Image.open(image_path) width, height = image.size draw = ImageDraw.Draw(image) total_boxes = len(coordinates) colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(total_boxes)] for i, coord in enumerate(coordinates): c, m, y, k = colors[i] color = cmyk_to_rgb(c, m, y, k) draw.rectangle(coord, outline=color, width=int(height * 0.0025)) if no_text != 1: font = ImageFont.truetype(font_path, int(height * 0.012)) text_x = coord[0] + int(height * 0.0025) text_y = max(0, coord[1] - int(height * 0.013)) draw.text((text_x, text_y), str(i + 1), fill=color, font=font) image = image.convert('RGB') if os.path.exists(output_image_path): os.remove(output_image_path) image.save(output_image_path) def get_perception_infos(screenshot_file, screenshot_som_file, font_path): total_width, total_height = Image.open(screenshot_file).size # no partition img_list = [screenshot_file] img_x_list = [0] img_y_list = [0] coordinates = [] texts = [] padding = total_height * 0.0025 # 10 for i, img in enumerate(img_list): width, height = Image.open(img).size sub_text, sub_coordinates = ocr(img, ocr_detection, ocr_recognition) # for api for coordinate in sub_coordinates: coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding)) coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding)) coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding)) coordinate[3] = int(min(total_height,img_y_list[i] + coordinate[3] + padding)) sub_text_merge, sub_coordinates_merge = merge_boxes_and_texts_new(sub_text, sub_coordinates) coordinates.extend(sub_coordinates_merge) texts.extend(sub_text_merge) merged_text, merged_text_coordinates = merge_boxes_and_texts(texts, coordinates) filtered_merged_text = [] filtered_merged_text_coordinates = [] for i in range(len(merged_text)): filtered_merged_text.append(merged_text[i]) filtered_merged_text_coordinates.append(merged_text_coordinates[i]) merged_text, merged_text_coordinates = filtered_merged_text, filtered_merged_text_coordinates coordinates = [] for i, img in enumerate(img_list): width, height = Image.open(img).size sub_coordinates = det(img, "icon", groundingdino_model) for coordinate in sub_coordinates: coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding)) coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding)) coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding)) coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding)) sub_coordinates = merge_all_icon_boxes(sub_coordinates) coordinates.extend(sub_coordinates) merged_icon_coordinates = merge_all_icon_boxes(coordinates) rec_list = merged_text_coordinates + merged_icon_coordinates draw_coordinates_boxes_on_image(screenshot_file, copy.deepcopy(rec_list), screenshot_som_file, font_path) mark_number = 0 perception_infos = [] for i in range(len(merged_text_coordinates)): mark_number += 1 perception_info = {"text": "mark number: " + str(mark_number) + " text: " + merged_text[i], "coordinates": merged_text_coordinates[i]} perception_infos.append(perception_info) for i in range(len(merged_icon_coordinates)): mark_number += 1 perception_info = {"text": "mark number: " + str(mark_number) + " icon", "coordinates": merged_icon_coordinates[i]} perception_infos.append(perception_info) for i in range(len(perception_infos)): perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0]+perception_infos[i]['coordinates'][2])/2), int((perception_infos[i]['coordinates'][1]+perception_infos[i]['coordinates'][3])/2)] return perception_infos, total_width, total_height groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0') groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir) def analyze_string(s): result = { 'type': None, 'format_keys': [], 'dict_content': None } format_pattern = re.compile(r'\{(\w+)\}') # {'key': 'value'} dict_pattern = re.compile( r'\{(?:\s*[\'\"]\w+[\'\"]\s*:\s*[\'\"][^{}\'\"]+[\'\"]\s*,?)*\}' ) dict_matches = dict_pattern.findall(s) dicts = [] for match in dict_matches: try: parsed_dict = ast.literal_eval(match) if isinstance(parsed_dict, dict): dicts.append(parsed_dict) except (ValueError, SyntaxError): continue has_dict = len(dicts) > 0 s_without_dicts = dict_pattern.sub('', s) format_keys = format_pattern.findall(s_without_dicts) has_format = len(format_keys) > 0 has_format_and_dict = has_format and has_dict if has_format_and_dict: result['type'] = 4 elif has_format: result['type'] = 2 elif has_dict: result['type'] = 3 else: result['type'] = 1 if has_format: result['format_keys'] = format_keys if has_dict: result['dict_content'] = dicts[0] return result import re def is_good_string(s): # Regex to match the dictionary-like part {'key1': 'value1', ...} dict_pattern = r"\{('[^']+' *: *'[^']+' *(, *'[^']+' *: *'[^']+')*)?\}" # Regex to match the item list part {item1, item2,...} with no single quotes in items item_pattern = r"\{([a-zA-Z0-9_]+( *, *[a-zA-Z0-9_]+)*)?\}" # Find all parts of the string contained within braces parts = re.findall(r'\{.*?\}', s) for part in parts: # Check if the part matches either the dictionary pattern or item pattern if not re.fullmatch(dict_pattern, part) and not re.fullmatch(item_pattern, part): return False return True screenshot_root = "screenshot" if os.path.exists(screenshot_root): shutil.rmtree(screenshot_root) os.mkdir(screenshot_root) def image_to_base64(image): buffered = io.BytesIO() image.save(buffered, format="PNG") img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") img_html = f'

' return img_html def chatbot(image, instruction, add_info, history, chat_log): if history == {}: output_for_save = [] thought_history = [] summary_history = [] action_history = [] summary = "" action = "" completed_requirements = "" memory = "" insight = "" error_flag = False user_msg = "

{}

".format(instruction) step_idx = 0 else: output_for_save = history["output_for_save"] thought_history = history["thought_history"] summary_history = history["summary_history"] action_history = history["action_history"] summary = history["summary"] action = history["action"] completed_requirements = history["completed_requirements"] memory = history["memory"][0] insight = history["insight"] error_flag = history["error_flag"] user_msg = "

{}

".format("I have uploaded the screenshot. Please continue operating.") step_idx = history["history"] current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") temp_file = f"temp_{current_time}" os.mkdir(temp_file) screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png") image.save(screenshot_file, format="PNG") screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1] perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="arial.ttf") shutil.rmtree(temp_file) os.mkdir(temp_file) output_for_save_this_step = {} prompt_action = get_action_prompt(instruction, perception_infos, width, height, thought_history, summary_history, action_history, [], summary, action, "", add_info, error_flag, completed_requirements, memory) chat_action = init_action_chat() chat_action = add_response("user", prompt_action, chat_action, [screenshot_som_file]) output_action = inference_chat(chat_action, vl_model_version, API_url, token) output_for_save_this_step['action'] = output_action action_json = json.loads(output_action.split("```json")[-1].split("```")[0]) thought = action_json['Thought'] summary = action_json['Summary'] action = action_json['Action'] chat_action = add_response("assistant", output_action, chat_action) if "Double TapIdx" in action: bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again." idx = action.split("(")[-1].split(")")[0] coordinate = perception_infos[idx]['coordinates'] x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "Double Tap" in action: bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again." coordinate = action.split("(")[-1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "Triple TapIdx" in action: bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again." coordinate = action.split("(")[-1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "Triple Tap" in action: bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again." idx = action.split("(")[-1].split(")")[0] coordinate = perception_infos[idx]['coordinates'] x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "TapIdx" in action: bot_response = "Please click (click x 1) the red circle and upload the current screenshot again." idx = action.split("(")[-1].split(")")[0] coordinate = perception_infos[idx]['coordinates'] x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "Tap" in action: bot_response = "Please click (click x 1) the red circle and upload the current screenshot again." coordinate = action.split("(")[-1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) elif "Shortcut" in action: keys = action.split("(")[-1].split(")")[0].split(", ") key1, key2 = keys[0].lower(), keys[1].lower() bot_response = f"Please press {key1}+{key2} and upload the current screenshot again." elif "Press" in action: key = action.split("(")[-1].split(")")[0] bot_response = f"Please press {key} and upload the current screenshot again." elif "Open App" in action: app = action.split("(")[-1].split(")")[0] bot_response = f"Please open {app} app and upload the current screenshot again." elif "Type" in action: coordinate = action.split("(")[1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) if "[text]" not in action: # for claude if '[' not in action or ']' not in action: # text = action.split('),')[-1].strip() text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '').replace("'", "") else: text = action.split("[")[-1].split("]")[0] else: text = action.split(" \"")[-1].split("\"")[0] draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) bot_response = f"Please type \"{text}\" in the red circle and upload the current screenshot again." elif "Select (" in action: content = action.split("(")[1].split(")")[0] bot_response = f"Please select the text content \"{content}\" and upload the current screenshot again." elif "Replace (" in action: coordinate = action.split("(")[1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) if "[text]" not in action: # for claude if '[' not in action or ']' not in action: # text = action.split('),')[-1].strip() text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '') else: if "] with " in action: text = action.split("] with ")[-1] text = text.replace("\"", '').replace("'", '').strip('.') else: text = action.split("[")[-1].split("]")[0] else: text = action.split(" \"")[-1].split("\"")[0] draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) bot_response = f"Please replace the text in the red circle by \"{text}\" and upload the current screenshot again." elif "Append (" in action: coordinate = action.split("(")[1].split(")")[0].split(", ") x, y = int(coordinate[0]), int(coordinate[1]) if "[text]" not in action: if '[' not in action or ']' not in action: text = action.split('),')[-1].strip() else: text = action.split("[")[-1].split("]")[0] else: text = action.split(" \"")[-1].split("\"")[0] draw = ImageDraw.Draw(image) draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20) bot_response = f"Please insert the text \"{text}\" in the red circle and upload the current screenshot again." elif "Stop" in action: output_for_save.append(output_for_save_this_step) bot_response = f"Answer: {output_for_save}, task completed" prompt_memory = get_memory_prompt(insight) chat_action = add_response("user", prompt_memory, chat_action) output_memory = inference_chat(chat_action, vl_model_version, API_url, token) chat_action = add_response("assistant", output_memory, chat_action) output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n" if "None" not in output_memory and output_memory not in memory: memory += output_memory bot_text1 = "

{}

".format("### Decision ###") bot_thought = "

{}

".format("Thought: " + thought) bot_action = "

{}

".format("Action: " + action) bot_operation = "

{}

".format("Operation: " + summary) bot_text2 = "

{}

".format("### Memory ###") if len(memory) > 0: bot_memory = "

{}

".format(memory) else: bot_memory = "

{}

".format("None") bot_response = "

{}

".format(bot_response) if image is not None: bot_img_html = image_to_base64(image) bot_response = "

{}

".format(bot_img_html) + bot_response chat_log.append(user_msg) shutil.rmtree(temp_file) # os.remove(screenshot_file) # os.remove(screenshot_som_file) thought_history.append(thought) summary_history.append(summary) action_history.append(action) prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history, completed_requirements, add_info) chat_planning = init_memory_chat() chat_planning = add_response("user", prompt_planning, chat_planning ) output_planning = inference_chat(chat_planning, llm_model_version, API_url, token) output_for_save_this_step['planning'] = output_planning chat_planning = add_response("assistant", output_planning, chat_planning ) completed_requirements = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip() bot_text3 = "

{}

".format("### Planning ###") output_planning = "

{}

".format(output_planning) history["thought_history"] = thought_history history["summary_history"] = summary_history history["action_history"] = action_history history["summary"] = summary history["action"] = action history["memory"] = memory, history["memory_switch"] = True, history["insight"] = insight history["error_flag"] = error_flag history["completed_requirements"] = completed_requirements history["output_for_save"] = output_for_save history["history"] = step_idx + 1 chat_log.append(bot_text3) chat_log.append(output_planning) chat_log.append(bot_text1) chat_log.append(bot_thought) chat_log.append(bot_action) chat_log.append(bot_operation) chat_log.append(bot_text2) chat_log.append(bot_memory) chat_log.append(bot_response) chat_html = "

{}

".format("".join(chat_log)) return chatbot_css + chat_html, history, chat_log def lock_input(instruction): return gr.update(value=instruction, interactive=False), gr.update(value=None) def reset_demo(): return gr.update(value="", interactive=True), gr.update(value=None, interactive=True), "

", {}, [] tos_markdown = ("""

If you like our project, please give us a star ✨ on Github for latest update. **Terms of use** 1. Input your instruction in \"Instruction\", for example \"Turn on the dark mode\". 2. You can input helpful operation knowledge in \"Knowledge\". 3. Click \"Submit\" to get the operation. You need to operate your PC according to the operation and then upload the screenshot after your operation. 4. We show two examples below, each with three screenshots. Click and submit from top to bottom to experience it. **使用说明** 1. 在“Instruction”中输入你的指令，例如“打开深色模式”。 2. 你可以在“Knowledge”中输入帮助性的操作知识。 3. 点击“Submit”来获得操作。你需要根据输出来操作PC，并且上传操作后的截图。 4. 我们在下方展示了两个例子，每个例子有三张截屏。请从上到下依次点击并提交来体验。""") title_markdowm = ("""# PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC""") instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction") knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge") image_input = gr.Image(label="Screenshot", type="pil", height=350, width=700) with gr.Blocks() as demo: history_state = gr.State(value={}) history_output = gr.State(value=[]) with gr.Row(): gr.Markdown(title_markdowm) with gr.Row(): with gr.Column(scale=5): gr.Markdown(tos_markdown) image_input.render() gr.Examples(examples=[ ["./example/1-1.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索，或输入网址\" written on it."], ["./example/1-2.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索，或输入网址\" written on it."], ["./example/1-3.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索，或输入网址\" written on it."], ], inputs=[image_input, instruction_input, knowledge_input]) with gr.Column(scale=6): instruction_input.render() knowledge_input.render() with gr.Row(): start_button = gr.Button("Submit") clear_button = gr.Button("Clear") output_component = gr.HTML(label="Chat history", value="

") start_button.click( fn=lambda image, instruction, add_info, history, output: chatbot(image, instruction, add_info, history, output), inputs=[image_input, instruction_input, knowledge_input, history_state, history_output], outputs=[output_component, history_state, history_output] ) clear_button.click( fn=reset_demo, inputs=[], outputs=[instruction_input, knowledge_input, output_component, history_state, history_output] ) demo.queue().launch(share=True)