阳渠 committed
Commit e78b889 · 1 parent: d8d693c

Update space

Files changed (3)
  1. README.md +0 -14
  2. app.py +547 -52
  3. requirements.txt +9 -1
README.md DELETED
@@ -1,14 +0,0 @@
- ---
- title: PC Agent
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.0.1
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
- ---
-
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,64 +1,559 @@
  import gradio as gr
- from huggingface_hub import InferenceClient

  """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]

-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     response = ""

-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content

-         response += token
-         yield response


- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ import ast
+ import re
+ import io
+ import os
+ import json
+ import copy
+ import shutil
+ import base64
+ import random
  import gradio as gr
+ from datetime import datetime
+ from modelscope.pipelines import pipeline
+ from modelscope import snapshot_download
+ from PIL import Image, ImageDraw, ImageFont
+
+ from PCAgent.api import inference_chat
+ from PCAgent.icon_localization import det
+ from PCAgent.text_localization import ocr
+ from PCAgent.prompt_qwen import get_subtask_prompt
+ from PCAgent.chat import init_action_chat, init_memory_chat, add_response
+ from PCAgent.prompt_qwen import get_action_prompt, get_process_prompt, get_memory_prompt
+ from PCAgent.merge_strategy import merge_boxes_and_texts, merge_all_icon_boxes, merge_boxes_and_texts_new
+
+ vl_model_version = os.environ.get('vl_model_version')
+ llm_model_version = os.environ.get('llm_model_version')
+ API_url = os.environ.get('API_url')
+ token = os.environ.get('token')
+ os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
+ os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
+ radius = 100
+
+ chatbot_css = """
+ <style>
+ .chat-container {
+     display: flex;
+     flex-direction: column;
+     overflow-y: auto;
+     max-height: 800px;
+     margin: 10px;
+ }
+ .user-message, .bot-message {
+     margin: 5px;
+     padding: 10px;
+     border-radius: 10px;
+ }
+ .user-message {
+     text-align: right;
+     background-color: #7B68EE;
+     color: white;
+     align-self: flex-end;
+ }
+ .bot-message {
+     text-align: left;
+     background-color: #ADD8E6;
+     color: black;
+     align-self: flex-start;
+ }
+ .user-image {
+     text-align: right;
+     align-self: flex-end;
+     max-width: 150px;
+     max-height: 300px;
+ }
+ .bot-image {
+     text-align: left;
+     align-self: flex-start;
+     max-width: 200px;
+     max-height: 400px;
+ }
+ </style>
  """
+
+ def cmyk_to_rgb(c, m, y, k):
+     r = 255 * (1.0 - c / 255) * (1.0 - k / 255)
+     g = 255 * (1.0 - m / 255) * (1.0 - k / 255)
+     b = 255 * (1.0 - y / 255) * (1.0 - k / 255)
+     return int(r), int(g), int(b)
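As a quick sanity check of the conversion above (the channels here run 0-255 rather than the usual 0-1 fractions):

assert cmyk_to_rgb(0, 0, 0, 255) == (0, 0, 0)        # full key (black) ink -> black
assert cmyk_to_rgb(0, 0, 0, 0) == (255, 255, 255)    # no ink at all -> white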
+
+ def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path, font_path, no_text=0):
+     image = Image.open(image_path)
+     width, height = image.size
+     draw = ImageDraw.Draw(image)
+     total_boxes = len(coordinates)
+     colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
+               for _ in range(total_boxes)]

+     for i, coord in enumerate(coordinates):
+         c, m, y, k = colors[i]
+         color = cmyk_to_rgb(c, m, y, k)

+         draw.rectangle(coord, outline=color, width=int(height * 0.0025))
+
+         if no_text != 1:
+             font = ImageFont.truetype(font_path, int(height * 0.012))
+             text_x = coord[0] + int(height * 0.0025)
+             text_y = max(0, coord[1] - int(height * 0.013))
+             draw.text((text_x, text_y), str(i + 1), fill=color, font=font)
+     image = image.convert('RGB')

+     if os.path.exists(output_image_path):
+         os.remove(output_image_path)
+     image.save(output_image_path)
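For orientation, a hypothetical call (paths, boxes, and font are illustrative, not from this commit; boxes are [x1, y1, x2, y2] pixel rectangles and labels are numbered from 1):

boxes = [[10, 10, 120, 60], [200, 40, 360, 90]]
draw_coordinates_boxes_on_image(
    "screenshot.png", boxes, "screenshot_som.png",
    font_path="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # any TTF on the host
    no_text=0,  # set to 1 to draw boxes only, without the numeric labels
)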

+ def get_perception_infos(screenshot_file, screenshot_som_file, font_path):
+     total_width, total_height = Image.open(screenshot_file).size
+
+     # no partition
+     img_list = [screenshot_file]
+     img_x_list = [0]
+     img_y_list = [0]

+     coordinates = []
+     texts = []
+     padding = total_height * 0.0025  # 10

+     for i, img in enumerate(img_list):
+         width, height = Image.open(img).size
+         sub_text, sub_coordinates = ocr(img)  # for api
+         for coordinate in sub_coordinates:
+             coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
+             coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
+             coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
+             coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
+
+         sub_text_merge, sub_coordinates_merge = merge_boxes_and_texts_new(sub_text, sub_coordinates)
+         coordinates.extend(sub_coordinates_merge)
+         texts.extend(sub_text_merge)
+     merged_text, merged_text_coordinates = merge_boxes_and_texts(texts, coordinates)
+
+     # placeholder filter: currently keeps every detected text box
+     filtered_merged_text = []
+     filtered_merged_text_coordinates = []
+     for i in range(len(merged_text)):
+         filtered_merged_text.append(merged_text[i])
+         filtered_merged_text_coordinates.append(merged_text_coordinates[i])
+     merged_text, merged_text_coordinates = filtered_merged_text, filtered_merged_text_coordinates
+
+     coordinates = []
+     for i, img in enumerate(img_list):
+         width, height = Image.open(img).size
+         sub_coordinates = det(img, "icon", groundingdino_model)
+         for coordinate in sub_coordinates:
+             coordinate[0] = int(max(0, img_x_list[i] + coordinate[0] - padding))
+             coordinate[2] = int(min(total_width, img_x_list[i] + coordinate[2] + padding))
+             coordinate[1] = int(max(0, img_y_list[i] + coordinate[1] - padding))
+             coordinate[3] = int(min(total_height, img_y_list[i] + coordinate[3] + padding))
+
+         sub_coordinates = merge_all_icon_boxes(sub_coordinates)
+         coordinates.extend(sub_coordinates)
+     merged_icon_coordinates = merge_all_icon_boxes(coordinates)
+
+     rec_list = merged_text_coordinates + merged_icon_coordinates
+     draw_coordinates_boxes_on_image(screenshot_file, copy.deepcopy(rec_list), screenshot_som_file, font_path)
+
+     mark_number = 0
+     perception_infos = []
+
+     for i in range(len(merged_text_coordinates)):
+         mark_number += 1
+         perception_info = {"text": "mark number: " + str(mark_number) + " text: " + merged_text[i], "coordinates": merged_text_coordinates[i]}
+         perception_infos.append(perception_info)
+
+     for i in range(len(merged_icon_coordinates)):
+         mark_number += 1
+         perception_info = {"text": "mark number: " + str(mark_number) + " icon", "coordinates": merged_icon_coordinates[i]}
+         perception_infos.append(perception_info)
+
+     # collapse each box to its center point
+     for i in range(len(perception_infos)):
+         perception_infos[i]['coordinates'] = [int((perception_infos[i]['coordinates'][0] + perception_infos[i]['coordinates'][2]) / 2), int((perception_infos[i]['coordinates'][1] + perception_infos[i]['coordinates'][3]) / 2)]
+
+     return perception_infos, total_width, total_height
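Each returned entry pairs a set-of-mark label with a single click point; a sketch of the output shape (values are illustrative):

# Text boxes come first, then icons, and every box is collapsed to its center:
# perception_infos = [
#     {"text": "mark number: 1 text: File", "coordinates": [34, 18]},
#     {"text": "mark number: 2 icon", "coordinates": [812, 40]},
# ]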
+
+ groundingdino_dir = snapshot_download('AI-ModelScope/GroundingDINO', revision='v1.0.0')
+ groundingdino_model = pipeline('grounding-dino-task', model=groundingdino_dir)
+
+ def analyze_string(s):
+     result = {
+         'type': None,
+         'format_keys': [],
+         'dict_content': None
+     }
+
+     format_pattern = re.compile(r'\{(\w+)\}')
+
+     # {'key': 'value'}
+     dict_pattern = re.compile(
+         r'\{(?:\s*[\'\"]\w+[\'\"]\s*:\s*[\'\"][^{}\'\"]+[\'\"]\s*,?)*\}'
+     )
+
+     dict_matches = dict_pattern.findall(s)
+     dicts = []
+     for match in dict_matches:
+         try:
+             parsed_dict = ast.literal_eval(match)
+             if isinstance(parsed_dict, dict):
+                 dicts.append(parsed_dict)
+         except (ValueError, SyntaxError):
+             continue
+
+     has_dict = len(dicts) > 0
+
+     s_without_dicts = dict_pattern.sub('', s)
+
+     format_keys = format_pattern.findall(s_without_dicts)
+     has_format = len(format_keys) > 0
+
+     has_format_and_dict = has_format and has_dict
+
+     if has_format_and_dict:
+         result['type'] = 4
+     elif has_format:
+         result['type'] = 2
+     elif has_dict:
+         result['type'] = 3
+     else:
+         result['type'] = 1
+
+     if has_format:
+         result['format_keys'] = format_keys
+
+     if has_dict:
+         result['dict_content'] = dicts[0]
+
+     return result
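The four type codes distinguish plain text (1), format keys only (2), a dict literal only (3), and both (4); some illustrative inputs (not from the commit):

assert analyze_string("open the settings page")["type"] == 1
assert analyze_string("open {app}")["type"] == 2                  # format key only
assert analyze_string("{'name': 'report.txt'}")["type"] == 3      # dict literal only
assert analyze_string("save {file} as {'ext': 'pdf'}")["type"] == 4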
+
+ def is_good_string(s):
+     # Regex to match the dictionary-like part {'key1': 'value1', ...}
+     dict_pattern = r"\{('[^']+' *: *'[^']+' *(, *'[^']+' *: *'[^']+')*)?\}"
+     # Regex to match the item list part {item1, item2, ...} with no single quotes in items
+     item_pattern = r"\{([a-zA-Z0-9_]+( *, *[a-zA-Z0-9_]+)*)?\}"
+
+     # Find all parts of the string contained within braces
+     parts = re.findall(r'\{.*?\}', s)
+
+     for part in parts:
+         # Check if the part matches either the dictionary pattern or the item pattern
+         if not re.fullmatch(dict_pattern, part) and not re.fullmatch(item_pattern, part):
+             return False
+     return True
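Every braced group must therefore be either a fully quoted dict or a bare comma-separated item list; two hypothetical cases:

assert is_good_string("press {ctrl, c} then {'mode': 'dark'}") is True
assert is_good_string("bad {'key': unquoted} group") is False  # dict values must be quoted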
+
+ screenshot_root = "screenshot"
+ if os.path.exists(screenshot_root):
+     shutil.rmtree(screenshot_root)
+ os.mkdir(screenshot_root)
+
+ def image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="PNG")
+     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     img_html = f'<img src="data:image/png;base64,{img_str}" />'
+     return img_html
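The returned <img> tag is what gets spliced into the chat HTML below; for instance (hypothetical input):

thumb = image_to_base64(Image.open("example/1-1.jpg"))  # any PIL image works
# thumb == '<img src="data:image/png;base64,iVBORw0..." />'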
+
+ def chatbot(image, instruction, add_info, history, chat_log):
+     if history == {}:
+         output_for_save = []
+         thought_history = []
+         summary_history = []
+         action_history = []
+         summary = ""
+         action = ""
+         completed_requirements = ""
+         memory = ""
+         insight = ""
+         error_flag = False
+         user_msg = "<div class='user-message'>{}</div>".format(instruction)
+         step_idx = 0
+     else:
+         output_for_save = history["output_for_save"]
+         thought_history = history["thought_history"]
+         summary_history = history["summary_history"]
+         action_history = history["action_history"]
+         summary = history["summary"]
+         action = history["action"]
+         completed_requirements = history["completed_requirements"]
+         memory = history["memory"]
+         insight = history["insight"]
+         error_flag = history["error_flag"]
+         user_msg = "<div class='user-message'>{}</div>".format("I have uploaded the screenshot. Please continue operating.")
+         step_idx = history["history"]
+
+     current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+     temp_file = f"temp_{current_time}"
+     os.mkdir(temp_file)
+
+     screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
+     image.save(screenshot_file, format="PNG")
+     screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
+     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="C:/Windows/Fonts/arial.ttf")  # NOTE: Windows font path; point at an available TTF on other platforms
+     shutil.rmtree(temp_file)
+     os.mkdir(temp_file)
+
+     output_for_save_this_step = {}
+     prompt_action = get_action_prompt(instruction, perception_infos, width, height, thought_history, summary_history, action_history, [], summary, action, "", add_info, error_flag, completed_requirements, memory)
+     chat_action = init_action_chat()
+     chat_action = add_response("user", prompt_action, chat_action, [screenshot_som_file])
+     output_action = inference_chat(chat_action, vl_model_version, API_url, token)
+     output_for_save_this_step['action'] = output_action
+     action_json = json.loads(output_action.split('```json')[-1].split('```')[0])
+     thought = action_json['Thought']
+     summary = action_json['Summary']
+     action = action_json['Action']
+     chat_action = add_response("assistant", output_action, chat_action)
+
+     if "Double TapIdx" in action:
+         bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Double Tap" in action:
+         bot_response = "Please double click (click x 2) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Triple TapIdx" in action:
+         bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Triple Tap" in action:
+         bot_response = "Please triple click (click x 3) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "TapIdx" in action:
+         bot_response = "Please click (click x 1) the red circle and upload the current screenshot again."
+         idx = int(action.split("(")[-1].split(")")[0]) - 1  # mark numbers are 1-based
+         coordinate = perception_infos[idx]['coordinates']
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Tap" in action:
+         bot_response = "Please click (click x 1) the red circle and upload the current screenshot again."
+         coordinate = action.split("(")[-1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+
+     elif "Shortcut" in action:
+         keys = action.split("(")[-1].split(")")[0].split(", ")
+         key1, key2 = keys[0].lower(), keys[1].lower()
+         bot_response = f"Please press {key1}+{key2} and upload the current screenshot again."
+
+     elif "Press" in action:
+         key = action.split("(")[-1].split(")")[0]
+         bot_response = f"Please press {key} and upload the current screenshot again."
+
+     elif "Open App" in action:
+         app = action.split("(")[-1].split(")")[0]
+         bot_response = f"Please open the {app} app and upload the current screenshot again."
+
+     elif "Type" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             # for claude
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '').replace("'", "")
+             else:
+                 text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please type \"{text}\" in the red circle and upload the current screenshot again."
+
+     elif "Select (" in action:
+         content = action.split("(")[1].split(")")[0]
+         bot_response = f"Please select the text content \"{content}\" and upload the current screenshot again."
+
+     elif "Replace (" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             # for claude
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip().split("(")[1].split(")")[0].replace("text: ", '')
+             else:
+                 if "] with " in action:
+                     text = action.split("] with ")[-1]
+                     text = text.replace("\"", '').replace("'", '').strip('.')
+                 else:
+                     text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please replace the text in the red circle with \"{text}\" and upload the current screenshot again."
+
+     elif "Append (" in action:
+         coordinate = action.split("(")[1].split(")")[0].split(", ")
+         x, y = int(coordinate[0]), int(coordinate[1])
+         if "[text]" not in action:
+             if '[' not in action or ']' not in action:
+                 text = action.split('),')[-1].strip()
+             else:
+                 text = action.split("[")[-1].split("]")[0]
+         else:
+             text = action.split(" \"")[-1].split("\"")[0]
+         draw = ImageDraw.Draw(image)
+         draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline='red', width=20)
+         bot_response = f"Please insert the text \"{text}\" in the red circle and upload the current screenshot again."
+
+     elif "Stop" in action:
+         output_for_save.append(output_for_save_this_step)
+         bot_response = f"Answer: {output_for_save}, task completed"
+
+     prompt_memory = get_memory_prompt(insight)
+     chat_action = add_response("user", prompt_memory, chat_action)
+     output_memory = inference_chat(chat_action, vl_model_version, API_url, token)
+     chat_action = add_response("assistant", output_memory, chat_action)
+     output_memory = output_memory.split("### Important content ###")[-1].split("\n\n")[0].strip() + "\n"
+     if "None" not in output_memory and output_memory not in memory:
+         memory += output_memory
+
+     bot_text1 = "<div class='bot-message'>{}</div>".format("### Decision ###")
+     bot_thought = "<div class='bot-message'>{}</div>".format("Thought: " + thought)
+     bot_action = "<div class='bot-message'>{}</div>".format("Action: " + action)
+     bot_operation = "<div class='bot-message'>{}</div>".format("Operation: " + summary)
+     bot_text2 = "<div class='bot-message'>{}</div>".format("### Memory ###")
+     if len(memory) > 0:
+         bot_memory = "<div class='bot-message'>{}</div>".format(memory)
+     else:
+         bot_memory = "<div class='bot-message'>{}</div>".format("None")
+     bot_response = "<div class='bot-message'>{}</div>".format(bot_response)
+     if image is not None:
+         bot_img_html = image_to_base64(image)
+         bot_response = "<div class='bot-image'>{}</div>".format(bot_img_html) + bot_response
+
+     chat_log.append(user_msg)
+
+     shutil.rmtree(temp_file)
+     # os.remove(screenshot_file)
+     # os.remove(screenshot_som_file)
+
+     thought_history.append(thought)
+     summary_history.append(summary)
+     action_history.append(action)
+
+     prompt_planning = get_process_prompt(instruction, thought_history, summary_history, action_history, completed_requirements, add_info)
+     chat_planning = init_memory_chat()
+     chat_planning = add_response("user", prompt_planning, chat_planning)
+     output_planning = inference_chat(chat_planning, llm_model_version, API_url, token)
+     output_for_save_this_step['planning'] = output_planning
+     chat_planning = add_response("assistant", output_planning, chat_planning)
+     completed_requirements = output_planning.split("### Completed contents ###")[-1].replace("\n", " ").strip()
+
+     bot_text3 = "<div class='bot-message'>{}</div>".format("### Planning ###")
+     output_planning = "<div class='bot-message'>{}</div>".format(output_planning)
+
+     history["thought_history"] = thought_history
+     history["summary_history"] = summary_history
+     history["action_history"] = action_history
+     history["summary"] = summary
+     history["action"] = action
+     history["memory"] = memory
+     history["memory_switch"] = True
+     history["insight"] = insight
+     history["error_flag"] = error_flag
+     history["completed_requirements"] = completed_requirements
+     history["output_for_save"] = output_for_save
+     history["history"] = step_idx + 1  # step counter
+
+     chat_log.append(bot_text3)
+     chat_log.append(output_planning)
+     chat_log.append(bot_text1)
+     chat_log.append(bot_thought)
+     chat_log.append(bot_action)
+     chat_log.append(bot_operation)
+     chat_log.append(bot_text2)
+     chat_log.append(bot_memory)
+     chat_log.append(bot_response)
+
+     chat_html = "<div class='chat-container'>{}</div>".format("".join(chat_log))
+
+     return chatbot_css + chat_html, history, chat_log
+
+
+ def lock_input(instruction):
+     return gr.update(value=instruction, interactive=False), gr.update(value=None)
+
+
+ def reset_demo():
+     return gr.update(value="", interactive=True), gr.update(value=None, interactive=True), "<div class='chat-container'></div>", {}, []
+
+
+ tos_markdown = ("""<div style="display:flex; gap: 0.25rem;" align="center">
+ <a href='https://github.com/X-PLUG/MobileAgent'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
+ <a href="https://arxiv.org/abs/2502.14282"><img src="https://img.shields.io/badge/Arxiv-2502.14282-red"></a>
+ <a href='https://github.com/X-PLUG/MobileAgent/stargazers'><img src='https://img.shields.io/github/stars/X-PLUG/MobileAgent.svg?style=social'></a>
+ </div>
+ If you like our project, please give us a star ✨ on GitHub for the latest updates.
+
+ **Terms of use**
+ 1. Input your instruction in "Instruction", for example "Turn on the dark mode".
+ 2. You can input helpful operation knowledge in "Knowledge".
+ 3. Click "Submit" to get the operation. Perform the operation on your PC accordingly, then upload a screenshot taken after the operation.
+ 4. We show two examples below, each with three screenshots. Click and submit them from top to bottom to experience the flow.
+
+ **使用说明**
+ 1. 在“Instruction”中输入你的指令,例如“打开深色模式”。
+ 2. 你可以在“Knowledge”中输入帮助性的操作知识。
+ 3. 点击“Submit”来获得操作。你需要根据输出来操作PC,并且上传操作后的截图。
+ 4. 我们在下方展示了两个例子,每个例子有三张截屏。请从上到下依次点击并提交来体验。""")
+
+ title_markdown = ("""# PC-Agent: A Hierarchical Multi-Agent Collaboration Framework for Complex Task Automation on PC""")
+
+ instruction_input = gr.Textbox(label="Instruction", placeholder="Input your instruction")
+ knowledge_input = gr.Textbox(label="Knowledge", placeholder="Input your knowledge")
+ with gr.Blocks() as demo:
+     history_state = gr.State(value={})
+     history_output = gr.State(value=[])
+     with gr.Row():
+         gr.Markdown(title_markdown)
+     with gr.Row():
+         with gr.Column(scale=5):
+             gr.Markdown(tos_markdown)
+             image_input = gr.Image(label="Screenshot", type="pil", height=350, width=700)
+             gr.Examples(examples=[
+                 ["./example/1-1.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+                 ["./example/1-2.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+                 ["./example/1-3.jpg", "Search for Alibaba's stock price in Chrome", "The Chrome search bar is in the middle of the screen and has \"在Google 中搜索,或输入网址\" written on it."],
+             ], inputs=[image_input, instruction_input, knowledge_input])
+
+         with gr.Column(scale=6):
+             instruction_input.render()
+             knowledge_input.render()
+             with gr.Row():
+                 start_button = gr.Button("Submit")
+                 clear_button = gr.Button("Clear")
+             output_component = gr.HTML(label="Chat history", value="<div class='chat-container'></div>")
+
+     start_button.click(
+         fn=chatbot,
+         inputs=[image_input, instruction_input, knowledge_input, history_state, history_output],
+         outputs=[output_component, history_state, history_output]
+     )
+
+     clear_button.click(
+         fn=reset_demo,
+         inputs=[],
+         outputs=[instruction_input, knowledge_input, output_component, history_state, history_output]
+     )
+
+ demo.queue().launch(share=True)
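For reference, the per-session state threaded through chatbot via the two gr.State values looks like this after one step (a sketch assembled from the assignments above):

# history = {
#     "output_for_save": [...],   # raw model outputs per step
#     "thought_history": [...], "summary_history": [...], "action_history": [...],
#     "summary": "...", "action": "...", "completed_requirements": "...",
#     "memory": "...", "memory_switch": True, "insight": "", "error_flag": False,
#     "history": 1,               # step counter
# }
# chat_log is the list of rendered HTML <div> fragments.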
requirements.txt CHANGED
@@ -1 +1,9 @@
- huggingface_hub==0.25.2
+ modelscope==1.15.0
+ supervision==0.21.0
+ alibabacloud_tea_util
+ alibabacloud_tea_openapi
+ alibabacloud_ocr_api20210707
+ openai
+ dashscope
+ torch
+ opencv-python
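The new dependencies back the pipeline above: modelscope pulls the GroundingDINO detector, the alibabacloud_* packages serve the OCR calls, and openai/dashscope cover the chat APIs. A minimal setup sketch:

pip install -r requirements.txt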