้˜ณๆธ  committed on
Commit
3afb4b6
ยท
1 Parent(s): e78b889
PCAgent/__pycache__/api.cpython-310.pyc ADDED
Binary file (1.47 kB).

PCAgent/__pycache__/chat.cpython-310.pyc ADDED
Binary file (2.45 kB).

PCAgent/__pycache__/crop.cpython-310.pyc ADDED
Binary file (3.18 kB).

PCAgent/__pycache__/icon_localization.cpython-310.pyc ADDED
Binary file (1.75 kB).

PCAgent/__pycache__/merge_strategy.cpython-310.pyc ADDED
Binary file (6.25 kB).

PCAgent/__pycache__/prompt_qwen.cpython-310.pyc ADDED
Binary file (20.5 kB).

PCAgent/__pycache__/text_localization.cpython-310.pyc ADDED
Binary file (2.87 kB).
 
PCAgent/api.py ADDED
@@ -0,0 +1,77 @@
+ import base64
+ import requests
+ import time
+
+ import pdb
+ import dashscope
+ from dashscope import MultiModalConversation
+
+ from PIL import Image
+ import io
+ from openai import OpenAI
+ import json
+
+ def resize_encode_image(image_path, screen_scale_ratio=1):
+     with Image.open(image_path) as img:
+         new_width = int(img.width * screen_scale_ratio)
+         new_height = int(img.height * screen_scale_ratio)
+         resized_img = img.resize((new_width, new_height), Image.LANCZOS)
+
+         buffered = io.BytesIO()
+         resized_img.save(buffered, format="PNG")
+
+         img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+         return img_base64
+     # with open(image_path, "rb") as image_file:
+     #     return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def inference_chat(chat, model, api_url, token):
+
+     messages = []
+     for role, content in chat:
+         messages.append({"role": role, "content": content})
+
+     client = OpenAI(
+         # If the environment variable is not configured, replace the line below with your Model Studio (Bailian) API key: api_key="sk-xxx",
+         api_key=token,
+         base_url=api_url,
+     )
+
+     num_try = 5
+     for _ in range(num_try):
+         try:
+             completion = client.chat.completions.create(
+                 model=model,  # qwen-plus is used here as an example; change the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
+                 messages=messages
+             )
+         except:
+             print("Network Error:")
+             try:
+                 print(completion.model_dump_json())
+             except:
+                 print("Request Failed")
+             time.sleep(2)
+         else:
+             break
+
+     return json.loads(completion.model_dump_json())['choices'][0]['message']['content']
+
+     # headers = {
+     #     "Content-Type": "application/json",
+     #     "Authorization": f"Bearer {token}"
+     # }
+
+     # data = {
+     #     "model": model,
+     #     "messages": [],
+     #     "max_tokens": 2048,
+     #     'temperature': 0.0,
+     #     "seed": 1234
+     # }
+
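A minimal usage sketch for the two helpers above. The endpoint URL, model name, and key below are placeholders for illustration, not part of this commit; any OpenAI-compatible multimodal endpoint is assumed.

```python
# Hypothetical usage of PCAgent.api; api_url, model, and token are placeholders.
from PCAgent.api import resize_encode_image, inference_chat

token = "sk-..."                                                    # assumed API key
api_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"       # assumed OpenAI-compatible base URL
model = "qwen-vl-max"                                               # assumed multimodal model name

b64 = resize_encode_image("screenshot.png", screen_scale_ratio=0.5)
chat = [
    ("system", [{"type": "text", "text": "You are a helpful AI PC operating assistant."}]),
    ("user", [
        {"type": "text", "text": "Describe this screenshot."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
    ]),
]
answer = inference_chat(chat, model, api_url, token)
print(answer)
```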
PCAgent/chat.py ADDED
@@ -0,0 +1,123 @@
+ import copy
+ from PCAgent.api import resize_encode_image
+
+
+ def init_subtask_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_action_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant. You need to help me operate the PC to complete the user's instruction."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_reflect_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_memory_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def add_response_old(role, prompt, chat_history, image=None):
+     new_chat_history = copy.deepcopy(chat_history)
+     if image:
+         base64_image = resize_encode_image(image)
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{base64_image}"
+                 }
+             },
+         ]
+     else:
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+         ]
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response(role, prompt, chat_history, image=[], use_qwen=False):
+     new_chat_history = copy.deepcopy(chat_history)
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+     ]
+     for i in range(len(image)):
+         if not use_qwen:
+             base64_image = resize_encode_image(image[i])
+             content.append(
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/png;base64,{base64_image}"
+                     }
+                 }
+             )
+         else:
+             content.append(
+                 {
+                     "type": "image",
+                     "image": image[i]
+                 }
+             )
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response_two_image(role, prompt, chat_history, image):
+     new_chat_history = copy.deepcopy(chat_history)
+
+     base64_image1 = resize_encode_image(image[0])
+     base64_image2 = resize_encode_image(image[1])
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image1}"
+             }
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image2}"
+             }
+         },
+     ]
+
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def print_status(chat_history):
+     print("*"*100)
+     for chat in chat_history:
+         print("role:", chat[0])
+         print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
+     print("*"*100)
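A short sketch of how these helpers compose a conversation; the screenshot path is a placeholder.

```python
# Hypothetical example of building a chat history with PCAgent.chat helpers.
from PCAgent.chat import init_action_chat, add_response, print_status

history = init_action_chat()                          # system message only
history = add_response("user", "What should I click next?", history,
                       image=["screenshot.png"])      # text plus one base64-encoded image
print_status(history)                                 # prints each role and an "<image>" marker per attached image
```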
PCAgent/crop.py ADDED
@@ -0,0 +1,120 @@
+ import math
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+
+
+ def crop_image(img, position):
+     def distance(x1, y1, x2, y2):
+         return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+     position = position.tolist()
+     for i in range(4):
+         for j in range(i+1, 4):
+             if (position[i][0] > position[j][0]):
+                 tmp = position[j]
+                 position[j] = position[i]
+                 position[i] = tmp
+     if position[0][1] > position[1][1]:
+         tmp = position[0]
+         position[0] = position[1]
+         position[1] = tmp
+
+     if position[2][1] > position[3][1]:
+         tmp = position[2]
+         position[2] = position[3]
+         position[3] = tmp
+
+     x1, y1 = position[0][0], position[0][1]
+     x2, y2 = position[2][0], position[2][1]
+     x3, y3 = position[3][0], position[3][1]
+     x4, y4 = position[1][0], position[1][1]
+
+     corners = np.zeros((4, 2), np.float32)
+     corners[0] = [x1, y1]
+     corners[1] = [x2, y2]
+     corners[2] = [x4, y4]
+     corners[3] = [x3, y3]
+
+     img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
+     img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)
+
+     corners_trans = np.zeros((4, 2), np.float32)
+     corners_trans[0] = [0, 0]
+     corners_trans[1] = [img_width - 1, 0]
+     corners_trans[2] = [0, img_height - 1]
+     corners_trans[3] = [img_width - 1, img_height - 1]
+
+     transform = cv2.getPerspectiveTransform(corners, corners_trans)
+     dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+     return dst
+
+
+ def calculate_size(box):
+     return (box[2]-box[0]) * (box[3]-box[1])
+
+
+ def calculate_iou(box1, box2):
+     xA = max(box1[0], box2[0])
+     yA = max(box1[1], box2[1])
+     xB = min(box1[2], box2[2])
+     yB = min(box1[3], box2[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+     unionArea = box1Area + box2Area - interArea
+     iou = interArea / unionArea
+
+     return iou
+
+
+ def crop(image, box, i, text_data=None):
+     image = Image.open(image)
+
+     if text_data:
+         draw = ImageDraw.Draw(image)
+         draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
+         # font_size = int((text_data[3] - text_data[1])*0.75)
+         # font = ImageFont.truetype("arial.ttf", font_size)
+         # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")
+
+     cropped_image = image.crop(box)
+     cropped_image.save(f"./temp/{i}.jpg")
+
+
+ def in_box(box, target):
+     if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
+         return True
+     else:
+         return False
+
+
+ def crop_for_clip(image, box, i, position):
+     image = Image.open(image)
+     w, h = image.size
+     if position == "left":
+         bound = [0, 0, w/2, h]
+     elif position == "right":
+         bound = [w/2, 0, w, h]
+     elif position == "top":
+         bound = [0, 0, w, h/2]
+     elif position == "bottom":
+         bound = [0, h/2, w, h]
+     elif position == "top left":
+         bound = [0, 0, w/2, h/2]
+     elif position == "top right":
+         bound = [w/2, 0, w, h/2]
+     elif position == "bottom left":
+         bound = [0, h/2, w/2, h]
+     elif position == "bottom right":
+         bound = [w/2, h/2, w, h]
+     else:
+         bound = [0, 0, w, h]
+
+     if in_box(box, bound):
+         cropped_image = image.crop(box)
+         cropped_image.save(f"./temp/{i}.jpg")
+         return True
+     else:
+         return False
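For reference, a tiny hand-checked sanity test of the geometry helpers above (values computed by hand):

```python
# Self-contained check of calculate_size / calculate_iou from PCAgent.crop.
from PCAgent.crop import calculate_size, calculate_iou

box_a = [0, 0, 100, 100]
box_b = [50, 50, 150, 150]
print(calculate_size(box_a))        # 10000
print(calculate_iou(box_a, box_b))  # 2500 / 17500 ≈ 0.1429
```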
PCAgent/icon_localization.py ADDED
@@ -0,0 +1,59 @@
+ from PCAgent.crop import calculate_size, calculate_iou
+ from modelscope.pipelines import pipeline
+ from PIL import Image
+ import torch
+
+ def remove_boxes(boxes_filt, size, iou_threshold=0.5):
+     boxes_to_remove = set()
+
+     for i in range(len(boxes_filt)):
+         if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
+             boxes_to_remove.add(i)
+         for j in range(len(boxes_filt)):
+             if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
+                 boxes_to_remove.add(j)
+             if i == j:
+                 continue
+             if i in boxes_to_remove or j in boxes_to_remove:
+                 continue
+             iou = calculate_iou(boxes_filt[i], boxes_filt[j])
+             if iou >= iou_threshold:
+                 boxes_to_remove.add(j)
+
+     boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]
+
+     return boxes_filt
+
+
+ def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
+     image = Image.open(input_image_path)
+     size = image.size
+
+     caption = caption.lower()
+     caption = caption.strip()
+     if not caption.endswith('.'):
+         caption = caption + '.'
+
+     inputs = {
+         'IMAGE_PATH': input_image_path,
+         'TEXT_PROMPT': caption,
+         'BOX_TRESHOLD': box_threshold,
+         'TEXT_TRESHOLD': text_threshold
+     }
+
+     result = groundingdino_model(inputs)
+     boxes_filt = result['boxes']
+
+     H, W = size[1], size[0]
+     for i in range(boxes_filt.size(0)):
+         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+         boxes_filt[i][2:] += boxes_filt[i][:2]
+
+     boxes_filt = boxes_filt.cpu().int().tolist()
+     filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
+     coordinates = []
+     for box in filtered_boxes:
+         coordinates.append([box[0], box[1], box[2], box[3]])
+
+     return coordinates
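A sketch of how det() might be invoked. It assumes a GroundingDINO detection pipeline constructed elsewhere via ModelScope's pipeline(); the task name and model id below are placeholders, and the pipeline is assumed to accept the IMAGE_PATH/TEXT_PROMPT/BOX_TRESHOLD/TEXT_TRESHOLD dict that det() builds.

```python
# Hypothetical call; the pipeline task/model id are placeholders.
from modelscope.pipelines import pipeline
from PCAgent.icon_localization import det

groundingdino_model = pipeline('grounding-dino-task', model='some/groundingdino-model')  # placeholder ids
boxes = det("screenshot.png", "icon", groundingdino_model, box_threshold=0.05)
for x1, y1, x2, y2 in boxes:
    print(x1, y1, x2, y2)   # pixel-space [x1, y1, x2, y2] boxes after size/IoU filtering
```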
PCAgent/merge_strategy.py ADDED
@@ -0,0 +1,275 @@
+ import numpy as np
+
+
+ def calculate_iou(box1, box2):
+     x1_min, y1_min, x1_max, y1_max = box1
+     x2_min, y2_min, x2_max, y2_max = box2
+
+     inter_x_min = max(x1_min, x2_min)
+     inter_y_min = max(y1_min, y2_min)
+     inter_x_max = min(x1_max, x2_max)
+     inter_y_max = min(y1_max, y2_max)
+
+     inter_area = max(0, inter_x_max - inter_x_min) * max(0, inter_y_max - inter_y_min)
+     box1_area = (x1_max - x1_min) * (y1_max - y1_min)
+     box2_area = (x2_max - x2_min) * (y2_max - y2_min)
+
+     union_area = box1_area + box2_area - inter_area
+     iou = inter_area / union_area
+     return iou
+
+
+ def compute_iou(box1, box2):
+     """
+     Compute the Intersection over Union (IoU) of two bounding boxes.
+
+     Parameters:
+     - box1: list or array [x1, y1, x2, y2]
+     - box2: list or array [x1, y1, x2, y2]
+
+     Returns:
+     - iou: float, IoU value
+     """
+     x1_inter = max(box1[0], box2[0])
+     y1_inter = max(box1[1], box2[1])
+     x2_inter = min(box1[2], box2[2])
+     y2_inter = min(box1[3], box2[3])
+
+     # print(x2_inter, x1_inter, y2_inter, y1_inter)
+
+     inter_area = max(0, x2_inter - x1_inter + 1) * max(0, y2_inter - y1_inter + 1)
+
+     box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
+     box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
+
+     iou = inter_area / float(box1_area + box2_area - inter_area)
+
+     return iou
+
+
+ def merge_boxes(box1, box2):
+     x1_min, y1_min, x1_max, y1_max = box1
+     x2_min, y2_min, x2_max, y2_max = box2
+
+     merged_box = [min(x1_min, x2_min), min(y1_min, y2_min), max(x1_max, x2_max), max(y1_max, y2_max)]
+     return merged_box
+
+
+ def merge_boxes_and_texts(texts, boxes, iou_threshold=0):
+     """
+     Merge bounding boxes and their corresponding texts based on IoU threshold.
+
+     Parameters:
+     - texts: List of texts corresponding to each bounding box.
+     - boxes: List of bounding boxes, with each box represented as [x1, y1, x2, y2].
+     - iou_threshold: Intersection-over-Union threshold for merging boxes.
+
+     Returns:
+     - merged_texts: List of merged texts corresponding to the bounding boxes.
+     - merged_boxes: List of merged bounding boxes.
+     """
+     if len(boxes) == 0:
+         return [], []
+
+     # boxes = np.array(boxes)
+     merged_boxes = []
+     merged_texts = []
+
+     while len(boxes) > 0:
+         box = boxes[0]
+         text = texts[0]
+         boxes = boxes[1:]
+         texts = texts[1:]
+         to_merge_boxes = [box]
+         to_merge_texts = [text]
+         keep_boxes = []
+         keep_texts = []
+
+         for i, other_box in enumerate(boxes):
+             if compute_iou(box, other_box) > iou_threshold:
+                 to_merge_boxes.append(other_box)
+                 to_merge_texts.append(texts[i])
+             else:
+                 keep_boxes.append(other_box)
+                 keep_texts.append(texts[i])
+
+         # Merge the to_merge boxes into a single box
+         if len(to_merge_boxes) > 1:
+             x1 = min(b[0] for b in to_merge_boxes)
+             y1 = min(b[1] for b in to_merge_boxes)
+             x2 = max(b[2] for b in to_merge_boxes)
+             y2 = max(b[3] for b in to_merge_boxes)
+             merged_box = [x1, y1, x2, y2]
+             merged_text = " ".join(to_merge_texts)  # You can change the merging strategy here
+             merged_boxes.append(merged_box)
+             merged_texts.append(merged_text)
+         else:
+             merged_boxes.extend(to_merge_boxes)
+             merged_texts.extend(to_merge_texts)
+
+         # boxes = np.array(keep_boxes)
+         boxes = keep_boxes
+         texts = keep_texts
+
+     return merged_texts, merged_boxes
+
+
+ def is_contained(bbox1, bbox2):
+     x1_min, y1_min, x1_max, y1_max = bbox1
+     x2_min, y2_min, x2_max, y2_max = bbox2
+
+     if (x1_min >= x2_min and y1_min >= y2_min and x1_max <= x2_max and y1_max <= y2_max):
+         return True
+     elif (x2_min >= x1_min and y2_min >= y1_min and x2_max <= x1_max and y2_max <= y1_max):
+         return True
+     return False
+
+
+ def is_overlapping(bbox1, bbox2):
+     x1_min, y1_min, x1_max, y1_max = bbox1
+     x2_min, y2_min, x2_max, y2_max = bbox2
+
+     inter_xmin = max(x1_min, x2_min)
+     inter_ymin = max(y1_min, y2_min)
+     inter_xmax = min(x1_max, x2_max)
+     inter_ymax = min(y1_max, y2_max)
+
+     if inter_xmin < inter_xmax and inter_ymin < inter_ymax:
+         return True
+     return False
+
+
+ def get_area(bbox):
+     x_min, y_min, x_max, y_max = bbox
+     return (x_max - x_min) * (y_max - y_min)
+
+
+ def merge_all_icon_boxes(bboxes):
+     result_bboxes = []
+     while bboxes:
+         bbox = bboxes.pop(0)
+         to_add = True
+
+         for idx, existing_bbox in enumerate(result_bboxes):
+             if is_contained(bbox, existing_bbox):
+                 if get_area(bbox) > get_area(existing_bbox):
+                     result_bboxes[idx] = existing_bbox
+                 to_add = False
+                 break
+             elif is_overlapping(bbox, existing_bbox):
+                 if get_area(bbox) < get_area(existing_bbox):
+                     result_bboxes[idx] = bbox
+                 to_add = False
+                 break
+
+         if to_add:
+             result_bboxes.append(bbox)
+
+     return result_bboxes
+
+
+ def merge_all_icon_boxes_new(elements):
+     result_elements = []
+     while elements:
+         ele = elements.pop(0)
+         bbox = [ele['position'][0], ele['position'][1], ele['position'][0]+ele['size'][0], ele['position'][1]+ele['size'][1]]
+         # bbox = bboxes.pop(0)
+         to_add = True
+
+         for idx, existing_ele in enumerate(result_elements):
+             existing_bbox = [existing_ele['position'][0], existing_ele['position'][1], existing_ele['position'][0]+existing_ele['size'][0], existing_ele['position'][1]+existing_ele['size'][1]]
+             if is_contained(bbox, existing_bbox):
+                 if get_area(bbox) > get_area(existing_bbox):
+                     result_elements[idx] = existing_ele
+                 to_add = False
+                 break
+             elif is_overlapping(bbox, existing_bbox):
+                 if get_area(bbox) < get_area(existing_bbox):
+                     result_elements[idx] = ele
+                 to_add = False
+                 break
+
+         if to_add:
+             result_elements.append(ele)
+
+     return result_elements
+
+
+ def merge_bbox_groups(A, B, iou_threshold=0.8):
+     i = 0
+     while i < len(A):
+         box_a = A[i]
+         has_merged = False
+         for j in range(len(B)):
+             box_b = B[j]
+             iou = calculate_iou(box_a, box_b)
+             if iou > iou_threshold:
+                 merged_box = merge_boxes(box_a, box_b)
+                 A[i] = merged_box
+                 B.pop(j)
+                 has_merged = True
+                 break
+
+         if has_merged:
+             i -= 1
+         i += 1
+
+     return A, B
+
+
+ def bbox_iou(boxA, boxB):
+     # Calculate Intersection over Union (IoU) between two bounding boxes
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+     interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
+     boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
+     boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
+     iou = interArea / float(boxAArea + boxBArea - interArea)
+     return iou
+
+
+ def merge_boxes_and_texts_new(texts, bounding_boxes, iou_threshold=0):
+     if not bounding_boxes:
+         return [], []
+
+     bounding_boxes = np.array(bounding_boxes)
+     merged_boxes = []
+     merged_texts = []
+
+     used = np.zeros(len(bounding_boxes), dtype=bool)
+
+     for i, boxA in enumerate(bounding_boxes):
+         if used[i]:
+             continue
+         x_min, y_min, x_max, y_max = boxA
+         # text = texts[i]
+         text = ''
+
+         overlapping_indices = [i]  # []
+         for j, boxB in enumerate(bounding_boxes):
+             # print(i, j, bbox_iou(boxA, boxB))
+             if i != j and not used[j] and bbox_iou(boxA, boxB) > iou_threshold:
+                 overlapping_indices.append(j)
+
+         # Sort overlapping boxes by vertical position (top to bottom)
+         overlapping_indices.sort(key=lambda idx: (bounding_boxes[idx][1] + bounding_boxes[idx][3])/2)  # TODO
+
+         for idx in overlapping_indices:
+             boxB = bounding_boxes[idx]
+             x_min = min(x_min, boxB[0])
+             y_min = min(y_min, boxB[1])
+             x_max = max(x_max, boxB[2])
+             y_max = max(y_max, boxB[3])
+             # text += " " + texts[idx]
+             text += texts[idx]
+             used[idx] = True
+
+         merged_boxes.append([x_min, y_min, x_max, y_max])
+         merged_texts.append(text)
+         used[i] = True
+
+     return merged_texts, merged_boxes
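A small worked example for merge_boxes_and_texts_new with hand-picked coordinates: the first two boxes overlap and are merged top-to-bottom, the third stays separate.

```python
# Worked example; coordinates are illustrative only.
from PCAgent.merge_strategy import merge_boxes_and_texts_new

texts = ["File", " Edit", "Help"]
boxes = [[10, 10, 60, 30], [55, 12, 110, 32], [300, 10, 350, 30]]
merged_texts, merged_boxes = merge_boxes_and_texts_new(texts, boxes)
print(merged_texts)   # ['File Edit', 'Help']
print(merged_boxes)   # boxes equivalent to [[10, 10, 110, 32], [300, 10, 350, 30]] (entries may be numpy ints)
```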
PCAgent/prompt_qwen.py ADDED
@@ -0,0 +1,360 @@
+ # PC
+ def get_subtask_prompt_cn(instruction):
+     func_prompt = '''ๅคšๆจกๆ€agent้€š่ฟ‡ๆ‰ง่กŒ็‚นๅ‡ปใ€่พ“ๅ…ฅ็ญ‰ไธ€็ณปๅˆ—ๆ“ไฝœๆฅๅฎŒๆˆ็”จๆˆท็š„ๆŒ‡ไปคใ€‚
+ ็”จๆˆทๆŒ‡ไปคๅฏ่ƒฝ็”ฑ่ทจ่ถŠๅคšไธชๅบ”็”จ็จ‹ๅบ็š„ๆ•ฐไธชๅญไปปๅŠก็ป„ๆˆ๏ผŒๆˆ‘ๅธŒๆœ›ไฝ ่ƒฝๅฐ†่ฟ™ไธชๅคๆ‚็š„ๆŒ‡ไปค๏ผŒๅˆ†่งฃไธบไธ€ไบ›ๅญไปปๅŠก๏ผŒๅญไปปๅŠกๆœ‰4็ง็ฑปๅž‹๏ผš
+ 1. ๅธธ่ง„ๅญ—็ฌฆไธฒๅฝขๅผ๏ผšไพ‹ๅฆ‚โ€œๅœจ็ณป็ปŸ่ฎพ็ฝฎไธญ๏ผŒๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผโ€๏ผ›
+ 2. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๅญ—็ฌฆไธฒ๏ผšๅฝ“ๅ‰ๅญไปปๅŠก็š„ๆ‰ง่กŒ็ป“ๆžœ้œ€่ฆไปฅๅญ—ๅ…ธๆ–นๅผไผ ้€’็ป™ๅ…ถไป–ๅญไปปๅŠก๏ผŒไพ‹ๅฆ‚โ€œๅœจOutlookไธญ๏ผŒๆŸฅ็œ‹โ€˜Paulโ€™ๅ‘ๆฅ็š„้‚ฎไปถไฟกๆฏ๏ผŒไปฅdictๅฝขๅผ่พ“ๅ‡บ{'contact': 'Paul', 'mail_content': 'content of the email'}โ€๏ผ›
+ 3. ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผšๅˆฉ็”จๅ‰ๅบๅญไปปๅŠกไผ ้€’็š„ไฟกๆฏ๏ผŒ่กฅๅ…จๅฝ“ๅ‰ๅญไปปๅŠกๅŽ๏ผŒ่ƒฝๅคŸๅฎŒๅ…จ็‹ฌ็ซ‹ๆ‰ง่กŒ๏ผŒไพ‹ๅฆ‚โ€œๅฐ†{mail_content}้€š่ฟ‡็Ÿญไฟกๅ‘้€็ป™โ€˜Joeyโ€™โ€๏ผ›
+ 4. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผšๆ—ข้œ€่ฆๅ‰ๅบๅญไปปๅŠกไผ ้€’็š„ไฟกๆฏ๏ผŒไปฅ่กฅๅ…จๅฝ“ๅ‰ๅญไปปๅŠก๏ผŒๅŒๆ—ถๆ‰ง่กŒ็š„็ป“ๆžœไนŸ้œ€่ฆไปฅๅญ—ๅ…ธๆ–นๅผไผ ้€’็ป™ๅ…ถไป–ๅญไปปๅŠก๏ผŒไพ‹ๅฆ‚โ€œๅœจ่ฐทๆญŒไธญๆœ็ดข{question}๏ผŒๅนถๅฐ†็›ธๅ…ณไฟกๆฏไปฅdictๅฝขๅผ่พ“ๅ‡บ{'info': 'related information'}โ€ใ€‚
+
+ ไธพไพ‹ๆฅ่ฏด๏ผŒๅคๅˆๆŒ‡ไปคโ€œ็ณป็ปŸ่ฎพ็ฝฎไธญๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผ๏ผŒๅœจๅพฎไฟกไธญๆŸฅ็œ‹โ€˜Johnโ€™ๅ‘ๆฅ็š„้—ฎ้ข˜๏ผŒๅœจChromeไธญๆœ็ดข้—ฎ้ข˜็š„็ญ”ๆกˆ๏ผŒๅฐ†็ญ”ๆกˆๆทปๅŠ ๅˆฐไธ€ไธชๆ–ฐๅปบwordๆ–‡ๆกฃไธญ๏ผŒไฟๅญ˜ไธบโ€˜ไฝœไธš.docxโ€™๏ผŒ็„ถๅŽๅ‘้€็ป™โ€˜Johnโ€™ใ€‚โ€ๅฏไปฅ่ขซๅˆ†่งฃไธบ๏ผš
+ {
+ "subtask 1": "ๅœจ็ณป็ปŸ่ฎพ็ฝฎไธญ๏ผŒๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผ",
+ "subtask 2": "ๅœจๅพฎไฟกไธญ๏ผŒๆŸฅ็œ‹โ€˜Johnโ€™ๅ‘ๆฅ็š„้—ฎ้ข˜๏ผŒๅฐ†้—ฎ้ข˜ไปฅdictๅฝขๅผ่พ“ๅ‡บ{'John_question': 'content of the question'}",
+ "subtask 3": "ๅœจChromeไธญ๏ผŒๆœ็ดข{John_question}๏ผŒๅฐ†ๆœ็ดขๅˆฐ็š„็ญ”ๆกˆไปฅdictๅฝขๅผ่พ“ๅ‡บ{'John_question_answer': 'answer to the question'}",
+ "subtask 4": "ๅœจWordไธญ๏ผŒๆ–ฐๅปบไธ€ไธชๆ–‡ๆกฃ๏ผŒๅ†™ๅ…ฅ{John_question_answer}๏ผŒๅนถไฟๅญ˜ไธบโ€˜ไฝœไธš.docxโ€™",
+ "subtask 5": "ๅœจๅพฎไฟกไธญ๏ผŒๅ‘้€โ€˜ไฝœไธš.docxโ€™็ป™โ€˜Johnโ€™"
+ }
+
+ ้œ€่ฆๆณจๆ„๏ผš
+ 1. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๅญ—็ฌฆไธฒๆˆ–ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผŒ้œ€่ฆๅฐฝๅฏ่ƒฝ่ฏฆ็ป†ๅœฐ่ฏดๆ˜Ždictไธญๅ„ไธชkey็š„ๅซไน‰๏ผŒๅณๅฐ†ๅ“ชไบ›ๅ†…ๅฎนไปฅdict็š„ๅฝขๅผ่พ“ๅ‡บ๏ผ›
+ 2. ๆฏไธชๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒๅฝขๅผ็š„ๅญไปปๅŠกไธญๅŒ…ๅซ็š„key๏ผŒๅœจๅ‰ๅบๅญไปปๅŠกไธญ่ฆๆœ‰ๅฏนๅบ”็š„dictๅฝขๅผ่พ“ๅ‡บ๏ผŒไนŸๅฐฑๆ˜ฏ่ฏด๏ผŒๅ‰ๅบๅญไปปๅŠกๆ‰ง่กŒๅฎŒๆˆๅŽ๏ผŒไฟ่ฏๅฝ“ๅ‰ๅญไปปๅŠก่ƒฝๅคŸ้€š่ฟ‡ๅ‚ๆ•ฐไผ ้€’ๅพ—ๅˆฐ่กฅๅ…จ๏ผŒไปŽ่€Œๅฏไปฅ็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚
+ 3. ๅฟ…้กปไฟ่ฏ๏ผŒๆฏไธชๅญไปปๅŠก๏ผŒๆ— ่ฎบๆ˜ฏๅธธ่ง„ๅญ—็ฌฆไธฒ๏ผŒ่ฟ˜ๆ˜ฏ่กฅๅ…จไน‹ๅŽ็š„ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผŒ่ƒฝๅคŸๅฎŒๅ…จ่„ฑ็ฆปๅ…ถไป–ๅญไปปๅŠก็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚ไพ‹ๅฆ‚โ€œๅœจWordไธญๆ–ฐๅปบไธ€ไธชๆ–‡ๆกฃ๏ผŒๅ†™ๅ…ฅ{John_question_answer}โ€ๅฏไปฅ็‹ฌ็ซ‹ๆ‰ง่กŒ๏ผŒไฝ†โ€œๅฐ†ไฟฎๆ”นๅŽ็š„Wordๆ–‡ๆกฃ้€š่ฟ‡้‚ฎไปถๅ‘้€็ป™{name}โ€ๅˆ™ๅ› ไธบโ€˜Wordๆ–‡ๆกฃโ€™ๆŒ‡ไปฃไธๆ˜Ž็กฎๆ— ๆณ•็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚
+ 4. ๆ‹†่งฃๅŽ็š„ๆฏไธชๅญไปปๅŠก่ฆๆœ‰ๆ˜Ž็กฎ็š„ๅบ”็”จ็จ‹ๅบ๏ผŒไพ‹ๅฆ‚โ€˜ๅœจChromeไธญโ€™ใ€โ€˜ๅœจWordไธญโ€™็ญ‰ใ€‚ไธ€่ˆฌ่€Œ่จ€๏ผŒdocxๆ ผๅผๆ–‡ๆกฃ็”จWord็จ‹ๅบๆ‰“ๅผ€๏ผŒxlsxๆ ผๅผ่กจๆ ผ็”จExcel็จ‹ๅบๆ‰“ๅผ€ใ€‚ๆญคๅค–๏ผŒ้œ€่ฆๆ‰“ๅผ€ๆ–‡ไปถๆ—ถ๏ผŒ่ฆๆ˜Ž็กฎๆ–‡ไปถ็š„ๅๅญ—ใ€‚
+ '''
+
+     inst_prompt = '''
+ User Instruction:
+ {}
+ '''
+
+     format_prompt = '''
+ ่ฏทไฝ ๆŒ‰็…งๅฆ‚ไธ‹ๆ ผๅผ่พ“ๅ‡บๆ‹†ๅˆ†ๅŽ็š„ๅญไปปๅŠก๏ผš
+ {
+ "subtask 1": ,
+ "subtask 2": ,
+ ...
+ }
+ '''
+     prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
+     return prompt
+
+
+ def get_subtask_prompt(instruction):
+     func_prompt = '''A multi-modal agent completes a user's instruction by performing a series of actions such as clicking and typing. A user's instruction may consist of multiple subtasks across different applications. I want you to break down this complex instruction into several subtasks, which are of four types:
+
+ 1. Regular string: For example, "Open dark mode in system settings";
+ 2. String containing dictionary content: The result of the current subtask needs to be passed to other subtasks in a dictionary format, for example, "Check the emails from 'Paul' in Outlook and output the email details in a dict format like {'contact': 'Paul', 'mail_content': 'content of the email'}";
+ 3. Formatted string containing the keys from previous subtasks: Use the information from previous subtasks to complete and independently execute the current subtask, for example, "Send {mail_content} via SMS to 'Joey'". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''";
+ 4. Formatted string containing the keys from previous subtasks and the dictionary content: This requires both information from previous subtasks to complete the current subtask and the result also needs to be passed to other subtasks in a dictionary format, for example, "Search for {question} on Google and output the relevant information in a dict format like {'info': 'related information'}". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''".
+
+
+ For example, the compound instruction "Open dark mode in system settings, check the two questions sent by 'John' in WeChat, search for answers to these two questions in Chrome, add the answers to a new Word document, save it as 'assignment.docx', and then send it to 'John'." can be broken down into:
+ {
+ "subtask 1": "Open dark mode in system settings",
+ "subtask 2": "Check the questions sent by 'John' in WeChat and output the questions in a dict format {'John_question_1': 'content of John\'s question_1', 'John_question_2': 'content of John\'s question_2'}",
+ "subtask 3": "Search for {John_question_1} in Chrome and output the found answer in a dict format {'John_question_1_answer': 'answer to the question_1'}",
+ "subtask 4": "Search for {John_question_2} in Chrome and output the found answer in a dict format {'John_question_2_answer': 'answer to the question_2'}",
+ "subtask 5": "Create a new document in Word, write {John_question_1_answer} and {John_question_2_answer} sequentially, then save it as 'assignment.docx'",
+ "subtask 6": "Send 'assignment.docx' to 'John' via WeChat"
+ }
+
+ Notes:
+ 1. Strings or formatted strings containing dictionary content should explain as detailed as possible the meaning of each key in the dict, i.e., what content should be output in dict form;
+ 2. Each key in a formatted string subtask must have a corresponding dict output in preceding subtasks, ensuring that after a preceding subtask is completed, the current subtask can be fully completed through parameter passing and thus executed independently.
+ 3. Ensure each subtask, whether as a regular string or a completed formatted string, can be executed independently of other subtasks. For example, "Create a new document in Word and write {John_question_answer}" can be executed independently, but "Send the modified Word document via email to {name}" cannot because "Word document" is ambiguous and cannot be executed independently.
+ 4. Each subtask must specify a clear application, such as 'in Chrome' or 'in Word'. Generally, docx formatted documents are opened with Word, and xlsx spreadsheets are opened with Excel. Additionally, when opening a file, clearly state the file name.
+ 5. Note that if a subtask contains a dict, ensure that the values in the dictionary do not contain single quote characters to avoid format errors.
+ '''
+
+     inst_prompt = '''
+ User Instruction:
+ {}
+ '''
+
+     format_prompt = '''
+ Please output the split subtasks in the following format:
+ {
+ "subtask 1": ,
+ "subtask 2": ,
+ ...
+ }
+ '''
+     prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
+     return prompt
+
+
+ def get_select_prompt(content):
+     prompt_template = '''
+ Analyze the specified text range {} and output the first line and last line of the specified range separately.
+ How to identify paragraphs: There are 2 spaces at the beginning of each paragraph. Define the title as the single line at the top.
+ If the content has only one line (such as the title), it is both the first and last line.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_select_prompt_simple(content):
+     prompt_template = '''
+ Analyze the text range of this part of the current Word document: {}, and output the content of the first and last lines separately.
+ If the content has only one line in total, this line is the first line and also the last line.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_select_prompt_backup(content):
+     prompt_template = '''
+ Directly output the first line and the last line of the content: {} in the currently shown Microsoft Word document. If the content has only one line, output this line twice.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_action_prompt(instruction, clickable_infos, width, height, thought_history, summary_history, action_history, reflection_history, last_summary, last_action, reflection_thought, add_info, error_flag, completed_content, memory):
+     prompt = "### Background ###\n"
+     prompt += f"This image is a computer screenshot where icons are marked with numbers. Its width is {width} pixels and its height is {height} pixels. The user's instruction is: {instruction}.\n\n"
+
+     prompt += "### Tips ###\n"
+     prompt += add_info
+     prompt += "\n\n"
+
+     prompt += "### Screenshot information ###\n"
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
+     prompt += "This information consists of two parts: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
+     prompt += "the content is a text or 'icon' respectively. "
+     prompt += "The information is as follows:\n"
+
+     for clickable_info in clickable_infos:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+
+     if len(action_history) > 0:
+         prompt += "### History operations ###\n"
+         prompt += "Before arriving at the current screenshot, you have completed the following operations:\n"
+         for i in range(len(action_history)):
+             if len(reflection_history) > 0:
+                 prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "; Reflection: " + reflection_history[i] + "]\n"
+             else:
+                 prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+     if completed_content != "":
+         prompt += "### Progress ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+     if memory != "":
+         prompt += "### Memory ###\n"
+         prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
+         prompt += "Memory:\n" + memory + "\n"
+
+     # disabled
+     if error_flag:
+         prompt += "### Last operation ###\n"
+         prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
+         prompt += "\n\n"
+         print(f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time.")
+
+     prompt += "### Task requirements ###\n"
+     prompt += "In order to meet the user's requirements, you need to select one of the following operations to operate on the current screen:\n"
+     prompt += "Note that to open an app, use the Open App action, rather than tapping the app's icon. "
+     prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices."
+     prompt += "You must choose one of the actions below:\n"
+     prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'."
+     prompt += "Right Tap (x, y): Right tap the position (x, y) in the current page. This can be used to create a new file.\n"
+     prompt += "Tap (x, y): Tap the position (x, y) in the current page. This can be used to select an item.\n"
+     prompt += "Double Tap (x, y): Double tap the position (x, y) in the current page. This can be used to open a file. If Tap (x, y) in the last step doesn't work, you can try double tapping the position (x, y) in the current page.\n"
+
+     prompt += '''
+ Shortcut (key1, key2): There are several shortcuts (key1+key2) you may use.
+ For example, if you can't find the download button, use command+s to save the page or download the file.
+ To select all, you can use command+a.
+ To create a new file in Word/Excel, you can use command+n.
+ To create a new tab for starting a new search in Chrome, you can use command+t.
+ To copy an item, you can first select it and then use command+c.
+ To paste the copied item, you can first select the location you want to paste it to, and then use command+v.
+ '''
+     prompt += '''
+ Press (key name): There are several keys that may help.
+ For example, if you want to delete the selected content, press 'backspace'.
+ You can press 'enter' to confirm, submit the input command, or insert a line break.
+ Also, you can press 'up', 'down', 'left', or 'right' to scroll the page or adjust the position of the selected object.
+ '''
+
+     prompt += "Type (x, y), (text): Tap the position (x, y) and type the \"text\" in the input box and press the enter key. You should replace the \"text\" with the actual input.\n"
+
+     prompt += "Select (content): Select the referred 'content' in the current document, such as 'title', 'the second paragraph' and 'the last two paragraphs'. This action is useful when you want to edit a certain part of the document, such as bolding, adding underlines, changing line spacing, centering text, etc.\n"
+     prompt += "Replace (x, y), (text): Replace the editable content in (x, y) with the \"text\". You should replace the \"text\" with the actual input. This action is very useful when you want to start a new search in Chrome or rename a file.\n"
+     prompt += "Append (x, y), (text): Append the \"text\" content after the content at the (x, y) location. This action is useful when you want to append new content into a Word document.\n"
+
+     prompt += "Tell (answer): Tell me the answer of the input query.\n"
+     prompt += "Stop: If all the operations to meet the user's requirements have been completed in ### History operations ###, use this operation to stop the whole process."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     # modified 2.10
+     prompt += "You should output in the following json format:"
+     prompt += '''
+ {"Thought": "This is your thinking about how to proceed with the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
+ '''
+     prompt += "\n\n"
+
+     return prompt
+
+
+ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info, no_image=0):
+     if no_image == 1:
+         prompt = f"The computer screen's width is {width} pixels and the height is {height} pixels.\n\n"
+     else:
+         prompt = f"These images are two computer screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"
+
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot. "
+     prompt += "The information consists of two parts, in the format: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
+     prompt += "\n\n"
+
+     prompt += "### Before the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos1:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n\n"
+
+     prompt += "### After the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos2:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n\n"
+
+     prompt += "### Current operation ###\n"
+     prompt += f"The user's instruction is: {instruction}."
+     if add_info != "":
+         prompt += f"You also need to note the following requirements: {add_info}."
+     prompt += "In the process of completing the requirements of the instruction, an operation is performed on the computer. Below are the details of this operation:\n"
+     prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
+     prompt += "Operation action: " + action
+     prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     if no_image == 1:
+         prompt += "Now you need to output the following content based on the screenshot information before and after the current operation:\n"
+     else:
+         prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
+     prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
+     prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
+     prompt += "B: The \"Operation action\" results in a wrong page and I need to do something to correct this.\n"
+     prompt += "C: The \"Operation action\" produces no changes."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Thought ###\nYour thought about the question\n"
+     prompt += "### Answer ###\nA or B or C"
+
+     return prompt
+
+
+ def get_memory_prompt(insight):
+     if insight != "":
+         prompt = "### Important content ###\n"
+         prompt += insight
+         prompt += "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     else:
+         prompt = "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to the user's instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."
+
+     return prompt
+
+
+ def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=[]):
+     prompt = "### Background ###\n"
+     prompt += f"There is a user's instruction which is: {instruction}. You are a computer operating assistant and are operating the user's computer.\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user's instructions. The hints are as follows:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(thought_history) > 1:
+         prompt += "### History operations ###\n"
+         prompt += "To complete the requirements of the user's instruction, you have performed a series of operations. These operations are as follows:\n"
+         for i in range(len(summary_history)):
+             operation = summary_history[i].split(" to ")[0].strip()
+             if len(reflection_history) > 0:
+                 prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "; Operation reflection: " + reflection_history[i] + "]\n"
+             else:
+                 prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+         prompt += "### Progress thinking ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nUpdated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."
+
+     else:
+         prompt += "### Current operation ###\n"
+         prompt += "To complete the requirements of the user's instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
+         prompt += f"Operation thought: {thought_history[-1]}\n"
+         operation = summary_history[-1].split(" to ")[0].strip()
+         if len(reflection_history) > 0:
+             prompt += f"Operation action: {operation}\n" + "Operation reflection: " + reflection_history[-1] + "\n\n"
+         else:
+             prompt += f"Operation action: {operation}\n\n"
+
+         # if reflection_thought is not None:
+         #     prompt += "A reflection model was adopted to analyze whether the last step's operation meets the expectation, you should combine its reflection thought to produce the \"Completed contents\"."
+         #     prompt += "Below is its reflection thought:\n"
+         #     prompt += reflection_thought + "\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
+         prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of the user's instruction, and then summarize the contents that have been completed.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nGenerated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
+         prompt += "(Please use English to output)"
+
+     return prompt
PCAgent/text_localization.py ADDED
@@ -0,0 +1,70 @@
+ import re
+ import os
+ import logging
+ import os
+ from alibabacloud_tea_util import models as util_models
+ from alibabacloud_tea_openapi import models as open_api_models
+ from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models
+ from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client
+
+ class Sample:
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def create_client() -> ocr_api20210707Client:
+         config = open_api_models.Config(
+             access_key_id=os.environ.get('OCR_ACCESS_KEY_ID'),
+             access_key_secret=os.environ.get('OCR_ACCESS_KEY_SECRET'),
+         )
+         config.endpoint = f'ocr-api.cn-hangzhou.aliyuncs.com'
+         return ocr_api20210707Client(config)
+
+     @staticmethod
+     def main(image) -> None:
+         client = Sample.create_client()
+         recognize_all_text_request = ocr_api_20210707_models.RecognizeAllTextRequest(
+             body=image,
+             type='Advanced',
+             output_coordinate='points',
+             output_oricoord=True,
+         )
+         runtime = util_models.RuntimeOptions()
+         output = client.recognize_all_text_with_options(recognize_all_text_request, runtime)
+         # logger.info(f'ocr response: {output}', extra={'request_id': ""})
+         output = output.body.data.sub_images[0].block_info.block_details
+         return output
+
+ def image_to_binary(image_path):
+     with open(image_path, 'rb') as file:
+         binary_data = file.read()
+     return binary_data
+
+ def remove_punctuation(text):
+     # Use regular expressions to strip punctuation, underscores, and whitespace
+     cleaned_text = re.sub(r'[^\w\s]', '', text)      # remove punctuation
+     cleaned_text = re.sub(r'_', '', cleaned_text)    # remove underscores
+     cleaned_text = re.sub(r'\s', '', cleaned_text)   # remove whitespace
+     return cleaned_text.replace("v", "").replace("o", "").replace("O", "").replace("T", "").replace("Q", "").replace("ไธถ", "")
+
+
+ class OCRError(Exception):
+     def __init__(self, message):
+         super().__init__(message)
+         self.message = message
+
+ def ocr(image_path):
+     text = []
+     coordinate = []
+     image = image_to_binary(image_path)
+     print(image_path)
+     try:
+         outputs = Sample.main(image)
+     except Exception as e:
+         raise OCRError(str(e))
+     for output in outputs:
+         text.append(output.block_content)
+         bbox = [int(output.block_points[0].x), int(output.block_points[0].y), int(output.block_points[2].x), int(output.block_points[2].y)]
+         coordinate.append(bbox)
+
+     return text, coordinate
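A hedged usage sketch for the OCR wrapper above; it requires valid Alibaba Cloud OCR credentials in the environment, and the path and key values below are placeholders.

```python
# Hypothetical usage of PCAgent.text_localization.ocr; credentials and path are placeholders.
import os
from PCAgent.text_localization import ocr

os.environ["OCR_ACCESS_KEY_ID"] = "..."       # placeholder
os.environ["OCR_ACCESS_KEY_SECRET"] = "..."   # placeholder

texts, boxes = ocr("screenshot.png")
for t, (x1, y1, x2, y2) in zip(texts, boxes):
    print(t, (x1, y1, x2, y2))                # recognized text with its [x1, y1, x2, y2] box
```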
PCAgent/text_localization_old.py ADDED
@@ -0,0 +1,61 @@
+ import cv2
+ import numpy as np
+ from PCAgent.crop import crop_image, calculate_size
+ from PIL import Image
+
+
+ def order_point(coor):
+     arr = np.array(coor).reshape([4, 2])
+     sum_ = np.sum(arr, 0)
+     centroid = sum_ / arr.shape[0]
+     theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
+     sort_points = arr[np.argsort(theta)]
+     sort_points = sort_points.reshape([4, -1])
+     if sort_points[0][0] > centroid[0]:
+         sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
+     sort_points = sort_points.reshape([4, 2]).astype('float32')
+     return sort_points
+
+
+ def longest_common_substring_length(str1, str2):
+     m = len(str1)
+     n = len(str2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if str1[i - 1] == str2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+             else:
+                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+     return dp[m][n]
+
+
+ def ocr(image_path, ocr_detection, ocr_recognition):
+     text_data = []
+     coordinate = []
+
+     image_full = cv2.imread(image_path)
+     try:
+         det_result = ocr_detection(image_full)
+     except:
+         print('no text detected')
+         return ['no text'], [[0, 0, 0, 0]]
+     det_result = det_result['polygons']
+     for i in range(det_result.shape[0]):
+         pts = order_point(det_result[i])
+         image_crop = crop_image(image_full, pts)
+
+         try:
+             result = ocr_recognition(image_crop)['text'][0]
+         except:
+             continue
+
+         box = [int(e) for e in list(pts.reshape(-1))]
+         box = [box[0], box[1], box[4], box[5]]
+
+         text_data.append(result)
+         coordinate.append(box)
+
+     return text_data, coordinate
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: PC Agent
+ emoji: ๐Ÿ’ฌ
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.0.1
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
+ ---
+
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -7,6 +7,7 @@ import copy
import shutil
import base64
import random
+ import requests
import gradio as gr
from datetime import datetime
from modelscope.pipelines import pipeline
@@ -27,8 +28,18 @@ API_url = os.environ.get('API_url')
token = os.environ.get('token')
os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
+ tff_file = os.environ.get('tff_file')
radius = 100

+ def download_file(url, save_path):
+     response = requests.get(url, stream=True)  # download as a stream
+     response.raise_for_status()  # make sure the request succeeded
+     with open(save_path, 'wb') as file:
+         for chunk in response.iter_content(chunk_size=8192):  # write in chunks to avoid holding the whole file in memory
+             file.write(chunk)
+
+ download_file(tff_file, "font/arial.ttf")
+
chatbot_css = """
<style>
.chat-container {
@@ -287,7 +298,7 @@ def chatbot(image, instruction, add_info, history, chat_log):
    screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
    image.save(screenshot_file, format="PNG")
    screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
-     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="C:/Windows/Fonts/arial.ttf")
+     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="font/arial.ttf")
    shutil.rmtree(temp_file)
    os.mkdir(temp_file)

example/1-1.jpg ADDED
example/1-2.jpg ADDED
example/1-3.jpg ADDED