阳渠 committed on
Commit
3afb4b6
·
1 Parent(s): e78b889
PCAgent/__pycache__/api.cpython-310.pyc ADDED
Binary file (1.47 kB). View file
 
PCAgent/__pycache__/chat.cpython-310.pyc ADDED
Binary file (2.45 kB). View file
 
PCAgent/__pycache__/crop.cpython-310.pyc ADDED
Binary file (3.18 kB). View file
 
PCAgent/__pycache__/icon_localization.cpython-310.pyc ADDED
Binary file (1.75 kB). View file
 
PCAgent/__pycache__/merge_strategy.cpython-310.pyc ADDED
Binary file (6.25 kB). View file
 
PCAgent/__pycache__/prompt_qwen.cpython-310.pyc ADDED
Binary file (20.5 kB). View file
 
PCAgent/__pycache__/text_localization.cpython-310.pyc ADDED
Binary file (2.87 kB). View file
 
PCAgent/api.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import requests
3
+ import time
4
+
5
+ import pdb
6
+ import dashscope
7
+ from dashscope import MultiModalConversation
8
+
9
+ from PIL import Image
10
+ import io
11
+ from openai import OpenAI
12
+ import json
13
+
14
def resize_encode_image(image_path, screen_scale_ratio=1):
    """Scale an image file and return it as a base64-encoded PNG string.

    Args:
        image_path: Path (or file object) accepted by ``Image.open``.
        screen_scale_ratio: Factor applied to both width and height; the
            scaled dimensions are truncated to integers.

    Returns:
        The PNG bytes of the resized image, base64-encoded and decoded as
        UTF-8 text.
    """
    with Image.open(image_path) as source:
        target_size = (
            int(source.width * screen_scale_ratio),
            int(source.height * screen_scale_ratio),
        )
        scaled = source.resize(target_size, Image.LANCZOS)

        # Always serialised as PNG, whatever the input format was.
        png_buffer = io.BytesIO()
        scaled.save(png_buffer, format="PNG")

    return base64.b64encode(png_buffer.getvalue()).decode('utf-8')
25
+ # with open(image_path, "rb") as image_file:
26
+ # return base64.b64encode(image_file.read()).decode('utf-8')
27
+
28
+
29
+
30
+
31
+
32
def inference_chat(chat, model, api_url, token):
    """Send a chat history to an OpenAI-compatible endpoint and return the reply.

    Args:
        chat: Iterable of ``(role, content)`` pairs; each pair becomes one
            message of the request.
        model: Model name passed to the chat-completions API.
        api_url: Base URL of the OpenAI-compatible service.
        token: API key used to authenticate.

    Returns:
        The ``content`` field of the first choice's message.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    messages = [{"role": role, "content": content} for role, content in chat]

    client = OpenAI(
        # If no environment variable is configured, replace the next line
        # with your Bailian API key: api_key="sk-xxx",
        api_key=token,
        base_url=api_url,
    )

    num_try = 5
    last_error = None
    for attempt in range(num_try):
        try:
            completion = client.chat.completions.create(
                # Any model name may be used here. Model list:
                # https://help.aliyun.com/zh/model-studio/getting-started/models
                model=model,
                messages=messages,
            )
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still stops the
            # retry loop.
            last_error = exc
            print("Network Error:")
            print(f"Request Failed: {exc!r}")
            if attempt < num_try - 1:
                time.sleep(2)
        else:
            return json.loads(completion.model_dump_json())['choices'][0]['message']['content']

    # Previously this point was reached with `completion` unbound, which
    # surfaced as an UnboundLocalError unrelated to the real failure.
    raise RuntimeError(
        f"Chat completion request failed after {num_try} attempts"
    ) from last_error
64
+
65
+ # headers = {
66
+ # "Content-Type": "application/json",
67
+ # "Authorization": f"Bearer {token}"
68
+ # }
69
+
70
+ # data = {
71
+ # "model": model,
72
+ # "messages": [],
73
+ # "max_tokens": 2048,
74
+ # 'temperature': 0.0,
75
+ # "seed": 1234
76
+ # }
77
+
PCAgent/chat.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from PCAgent.api import resize_encode_image
3
+
4
+
5
def init_subtask_chat():
    """Return a new chat history holding only the subtask system message."""
    system_prompt = "You are a helpful AI assistant."
    return [["system", [{"type": "text", "text": system_prompt}]]]
10
+
11
+
12
def init_action_chat():
    """Return a new chat history holding only the action system message."""
    system_prompt = "You are a helpful AI PC operating assistant. You need to help me operate the PC to complete the user\'s instruction."
    return [["system", [{"type": "text", "text": system_prompt}]]]
17
+
18
+
19
def init_reflect_chat():
    """Return a new chat history holding only the reflection system message."""
    system_prompt = "You are a helpful AI PC operating assistant."
    return [["system", [{"type": "text", "text": system_prompt}]]]
24
+
25
+
26
def init_memory_chat():
    """Return a new chat history holding only the memory system message."""
    system_prompt = "You are a helpful AI PC operating assistant."
    return [["system", [{"type": "text", "text": system_prompt}]]]
31
+
32
+
33
def add_response_old(role, prompt, chat_history, image=None):
    """Return a copy of ``chat_history`` with one more message appended.

    Args:
        role: Role label stored with the message.
        prompt: Text part of the message.
        chat_history: Existing history; it is deep-copied, never mutated.
        image: Optional image path. When truthy, the image is encoded with
            ``resize_encode_image`` and attached as a data URL.

    Returns:
        The new history list ending with ``[role, content]``.
    """
    new_chat_history = copy.deepcopy(chat_history)
    content = [{"type": "text", "text": prompt}]
    if image:
        base64_image = resize_encode_image(image)
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    # resize_encode_image always emits PNG bytes, so the
                    # data URL must say image/png (it used to say jpeg).
                    "url": f"data:image/png;base64,{base64_image}"
                },
            }
        )
    new_chat_history.append([role, content])
    return new_chat_history
58
+
59
+
60
def add_response(role, prompt, chat_history, image=None, use_qwen=False):
    """Return a copy of ``chat_history`` with a text(+images) message appended.

    Args:
        role: Role label stored with the message.
        prompt: Text part of the message.
        chat_history: Existing history; it is deep-copied, never mutated.
        image: Optional sequence of images. ``None`` (the default) means no
            images. The default used to be a shared mutable ``[]``.
        use_qwen: When false, each image is encoded with
            ``resize_encode_image`` and attached as a PNG data URL. When
            true, each image value is attached unchanged under an
            ``"image"`` key.

    Returns:
        The new history list ending with ``[role, content]``.
    """
    if image is None:
        image = ()

    new_chat_history = copy.deepcopy(chat_history)
    content = [{"type": "text", "text": prompt}]
    for item in image:
        if use_qwen:
            content.append({"type": "image", "image": item})
        else:
            base64_image = resize_encode_image(item)
            content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    },
                }
            )
    new_chat_history.append([role, content])
    return new_chat_history
88
+
89
+
90
def add_response_two_image(role, prompt, chat_history, image):
    """Return a copy of ``chat_history`` with a text + two-image message added.

    Args:
        role: Role label stored with the message.
        prompt: Text part of the message.
        chat_history: Existing history; it is deep-copied, never mutated.
        image: Indexable holding at least two images; only ``image[0]`` and
            ``image[1]`` are used.

    Returns:
        The new history list ending with ``[role, content]``.
    """
    new_chat_history = copy.deepcopy(chat_history)

    base64_image1 = resize_encode_image(image[0])
    base64_image2 = resize_encode_image(image[1])
    content = [{"type": "text", "text": prompt}]
    for encoded in (base64_image1, base64_image2):
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    # resize_encode_image always emits PNG bytes, so the
                    # data URL must say image/png (it used to say jpeg).
                    "url": f"data:image/png;base64,{encoded}"
                },
            }
        )

    new_chat_history.append([role, content])
    return new_chat_history
116
+
117
+
118
def print_status(chat_history):
    """Print every message's role and text between two rows of asterisks.

    Each part of a message after the first is printed as ``<image>``.
    """
    divider = "*"*100
    print(divider)
    for entry in chat_history:
        role, parts = entry[0], entry[1]
        placeholders = "<image>"*(len(parts)-1)
        print("role:", role)
        print(parts[0]["text"] + placeholders + "\n")
    print(divider)
PCAgent/crop.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image, ImageDraw, ImageFont
5
+
6
+
7
+
8
def crop_image(img, position):
    """Warp the quadrilateral ``position`` out of ``img`` into a rectangle.

    Args:
        img: Image passed straight to ``cv2.warpPerspective``.
        position: Object with ``tolist()`` yielding four ``[x, y]`` points.

    Returns:
        The warped image. Its size is the distance between opposite edge
        midpoints of the quadrilateral, truncated to ints.
    """
    def distance(ax, ay, bx, by):
        return math.sqrt(pow(ax - bx, 2) + pow(ay - by, 2))

    pts = position.tolist()

    # Order the four points by x. This stays an explicit exchange sort so
    # that points with equal x end up in the same order as before.
    for a in range(4):
        for b in range(a + 1, 4):
            if pts[a][0] > pts[b][0]:
                pts[a], pts[b] = pts[b], pts[a]

    # Within the two smallest-x points and the two largest-x points, put the
    # smaller y first.
    if pts[0][1] > pts[1][1]:
        pts[0], pts[1] = pts[1], pts[0]
    if pts[2][1] > pts[3][1]:
        pts[2], pts[3] = pts[3], pts[2]

    x1, y1 = pts[0][0], pts[0][1]
    x4, y4 = pts[1][0], pts[1][1]
    x2, y2 = pts[2][0], pts[2][1]
    x3, y3 = pts[3][0], pts[3][1]

    corners = np.array([[x1, y1], [x2, y2], [x4, y4], [x3, y3]], np.float32)

    # Output size: distance between the midpoints of opposite edges.
    img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
    img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)

    corners_trans = np.array(
        [
            [0, 0],
            [img_width - 1, 0],
            [0, img_height - 1],
            [img_width - 1, img_height - 1],
        ],
        np.float32,
    )

    transform = cv2.getPerspectiveTransform(corners, corners_trans)
    return cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
51
+
52
+
53
def calculate_size(box):
    """Return the area of ``box`` given as ``[x1, y1, x2, y2]``."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
55
+
56
+
57
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns ``0.0`` when the union area is not positive (for example two
    zero-area boxes). This case used to raise ``ZeroDivisionError``.
    """
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    unionArea = box1Area + box2Area - interArea

    if unionArea <= 0:
        # Degenerate boxes: there is no area to overlap.
        return 0.0
    return interArea / unionArea
70
+
71
+
72
def crop(image, box, i, text_data=None):
    """Crop ``box`` out of an image file and save it as ``./temp/{i}.jpg``.

    Args:
        image: Path (or file object) accepted by ``Image.open``.
        box: Crop rectangle passed to ``Image.crop``.
        i: Value used to build the output file name.
        text_data: Optional ``[x1, y1, x2, y2]``. When truthy, a red
            rectangle is drawn at those coordinates before cropping.
    """
    # The context manager closes the source image; it used to stay open.
    with Image.open(image) as source:
        if text_data:
            draw = ImageDraw.Draw(source)
            draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
            # Drawing the index `i` as a text label is disabled:
            # font_size = int((text_data[3] - text_data[1])*0.75)
            # font = ImageFont.truetype("arial.ttf", font_size)
            # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")

        cropped_image = source.crop(box)
        cropped_image.save(f"./temp/{i}.jpg")
84
+
85
+
86
def in_box(box, target):
    """Return True when ``box`` lies strictly inside ``target``.

    Both are ``[x1, y1, x2, y2]``. A box that touches an edge of ``target``
    does not count as inside.
    """
    left_ok = box[0] > target[0]
    top_ok = box[1] > target[1]
    right_ok = box[2] < target[2]
    bottom_ok = box[3] < target[3]
    return bool(left_ok and top_ok and right_ok and bottom_ok)
91
+
92
+
93
def crop_for_clip(image, box, i, position):
    """Save the crop of ``box`` if it lies inside the named region of the image.

    Args:
        image: Path (or file object) accepted by ``Image.open``.
        box: ``[x1, y1, x2, y2]`` rectangle to test and crop.
        i: Value used to build the output name ``./temp/{i}.jpg``.
        position: One of "left", "right", "top", "bottom", "top left",
            "top right", "bottom left", "bottom right". Any other value
            selects the whole image.

    Returns:
        True if ``box`` was strictly inside the region and the crop was
        saved, otherwise False.
    """
    # The context manager closes the source image; it used to stay open.
    with Image.open(image) as source:
        w, h = source.size
        half_w, half_h = w/2, h/2
        regions = {
            "left": [0, 0, half_w, h],
            "right": [half_w, 0, w, h],
            "top": [0, 0, w, half_h],
            "bottom": [0, half_h, w, h],
            "top left": [0, 0, half_w, half_h],
            "top right": [half_w, 0, w, half_h],
            "bottom left": [0, half_h, half_w, h],
            "bottom right": [half_w, half_h, w, h],
        }
        bound = regions.get(position, [0, 0, w, h])

        if not in_box(box, bound):
            return False

        cropped_image = source.crop(box)
        cropped_image.save(f"./temp/{i}.jpg")
        return True
PCAgent/icon_localization.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PCAgent.crop import calculate_size, calculate_iou
2
+ from modelscope.pipelines import pipeline
3
+ from PIL import Image
4
+ import torch
5
+
6
def remove_boxes(boxes_filt, size, iou_threshold=0.5):
    """Drop oversized boxes and boxes that heavily overlap an earlier-kept one.

    Args:
        boxes_filt: List of ``[x1, y1, x2, y2]`` boxes.
        size: ``(width, height)``. A box whose area exceeds 5% of
            ``width * height`` is removed.
        iou_threshold: For a pair of surviving boxes with IoU at or above
            this value, the second box of the pair is removed.

    Returns:
        A new list with the surviving boxes in their original order.
    """
    def too_large(box):
        return calculate_size(box) > 0.05*size[0]*size[1]

    dropped = set()
    total = len(boxes_filt)

    for i in range(total):
        if too_large(boxes_filt[i]):
            dropped.add(i)
        for j in range(total):
            if too_large(boxes_filt[j]):
                dropped.add(j)
            if i == j or i in dropped or j in dropped:
                continue
            if calculate_iou(boxes_filt[i], boxes_filt[j]) >= iou_threshold:
                dropped.add(j)

    return [box for idx, box in enumerate(boxes_filt) if idx not in dropped]
26
+
27
+
28
def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
    """Run the grounding model on an image and return filtered pixel boxes.

    Args:
        input_image_path: Path of the image; also forwarded to the model.
        caption: Text prompt. It is lower-cased, stripped, and given a
            trailing "." if it lacks one.
        groundingdino_model: Callable taking the inputs dict and returning
            a mapping with a ``'boxes'`` tensor.
        box_threshold: Forwarded to the model as ``BOX_TRESHOLD``.
        text_threshold: Forwarded to the model as ``TEXT_TRESHOLD``.

    Returns:
        List of ``[x1, y1, x2, y2]`` integer boxes that survive
        ``remove_boxes``.
    """
    # Only the size is needed; close the file right away (it used to leak).
    with Image.open(input_image_path) as image:
        size = image.size

    caption = caption.lower().strip()
    if not caption.endswith('.'):
        caption = caption + '.'

    inputs = {
        'IMAGE_PATH': input_image_path,
        'TEXT_PROMPT': caption,
        'BOX_TRESHOLD': box_threshold,
        'TEXT_TRESHOLD': text_threshold
    }

    result = groundingdino_model(inputs)
    boxes_filt = result['boxes']

    H, W = size[1], size[0]
    # Built once, not on every iteration.
    scale = torch.Tensor([W, H, W, H])
    for i in range(boxes_filt.size(0)):
        # Scale to pixels, then turn (a, b, c, d) into
        # (a - c/2, b - d/2, a + c/2, b + d/2).
        # NOTE(review): presumably normalized (cx, cy, w, h) -> pixel
        # (x1, y1, x2, y2); confirm against the model's output format.
        boxes_filt[i] = boxes_filt[i] * scale
        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
        boxes_filt[i][2:] += boxes_filt[i][:2]

    boxes_filt = boxes_filt.cpu().int().tolist()
    filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
    return [[box[0], box[1], box[2], box[3]] for box in filtered_boxes]
PCAgent/merge_strategy.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns ``0.0`` when the union area is not positive (for example two
    zero-area boxes). This case used to raise ``ZeroDivisionError``.
    """
    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    inter_w = max(0, min(x1_max, x2_max) - max(x1_min, x2_min))
    inter_h = max(0, min(y1_max, y2_max) - max(y1_min, y2_min))
    inter_area = inter_w * inter_h

    box1_area = (x1_max - x1_min) * (y1_max - y1_min)
    box2_area = (x2_max - x2_min) * (y2_max - y2_min)
    union_area = box1_area + box2_area - inter_area

    if union_area <= 0:
        # Degenerate boxes: there is no area to overlap.
        return 0.0
    return inter_area / union_area
20
+
21
+
22
def compute_iou(box1, box2):
    """Return the IoU of two boxes, counting edges as inclusive (+1).

    Args:
        box1: ``[x1, y1, x2, y2]``.
        box2: ``[x1, y1, x2, y2]``.

    Returns:
        Intersection area divided by union area, as a float. Every extent
        has 1 added, so boxes that only touch still have a positive
        intersection.
    """
    overlap_w = min(box1[2], box2[2]) - max(box1[0], box2[0]) + 1
    overlap_h = min(box1[3], box2[3]) - max(box1[1], box2[1]) + 1
    inter_area = max(0, overlap_w) * max(0, overlap_h)

    area_first = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
    area_second = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)

    return inter_area / float(area_first + area_second - inter_area)
48
+
49
+
50
def merge_boxes(box1, box2):
    """Return the smallest ``[x1, y1, x2, y2]`` box covering both inputs."""
    a_left, a_top, a_right, a_bottom = box1
    b_left, b_top, b_right, b_bottom = box2
    return [
        min(a_left, b_left),
        min(a_top, b_top),
        max(a_right, b_right),
        max(a_bottom, b_bottom),
    ]
56
+
57
+
58
def merge_boxes_and_texts(texts, boxes, iou_threshold=0):
    """Group boxes that overlap a seed box and merge each group.

    The first remaining box is taken as the seed. Every other remaining box
    whose ``compute_iou`` with the seed exceeds ``iou_threshold`` joins its
    group. The group becomes one enclosing box and its texts are joined
    with single spaces. Boxes are only compared against the seed, not
    against other members of the group.

    Parameters:
    - texts: List of texts, aligned with ``boxes``.
    - boxes: List of boxes, each ``[x1, y1, x2, y2]``.
    - iou_threshold: IoU above which a box joins the seed's group.

    Returns:
    - merged_texts: List of merged texts.
    - merged_boxes: List of merged boxes, aligned with ``merged_texts``.
    """
    if len(boxes) == 0:
        return [], []

    merged_boxes, merged_texts = [], []

    while len(boxes) > 0:
        seed_box, seed_text = boxes[0], texts[0]
        boxes, texts = boxes[1:], texts[1:]

        group_boxes, group_texts = [seed_box], [seed_text]
        rest_boxes, rest_texts = [], []

        for pos, candidate in enumerate(boxes):
            joins = compute_iou(seed_box, candidate) > iou_threshold
            dest_boxes, dest_texts = (group_boxes, group_texts) if joins else (rest_boxes, rest_texts)
            dest_boxes.append(candidate)
            dest_texts.append(texts[pos])

        if len(group_boxes) > 1:
            # Enclosing box of the whole group; texts joined in encounter order.
            merged_boxes.append([
                min(b[0] for b in group_boxes),
                min(b[1] for b in group_boxes),
                max(b[2] for b in group_boxes),
                max(b[3] for b in group_boxes),
            ])
            merged_texts.append(" ".join(group_texts))
        else:
            merged_boxes.extend(group_boxes)
            merged_texts.extend(group_texts)

        boxes, texts = rest_boxes, rest_texts

    return merged_texts, merged_boxes
115
+
116
+
117
def is_contained(bbox1, bbox2):
    """Return True when either box lies inside the other (edges may touch)."""
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    first_in_second = a_x0 >= b_x0 and a_y0 >= b_y0 and a_x1 <= b_x1 and a_y1 <= b_y1
    if first_in_second:
        return True
    second_in_first = b_x0 >= a_x0 and b_y0 >= a_y0 and b_x1 <= a_x1 and b_y1 <= a_y1
    return True if second_in_first else False
126
+
127
+
128
def is_overlapping(bbox1, bbox2):
    """Return True when the boxes share a region of positive area.

    Boxes that only touch along an edge or at a corner do not overlap.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    overlaps_in_x = max(a_x0, b_x0) < min(a_x1, b_x1)
    overlaps_in_y = max(a_y0, b_y0) < min(a_y1, b_y1)
    return True if (overlaps_in_x and overlaps_in_y) else False
140
+
141
+
142
def get_area(bbox):
    """Return the area of ``bbox`` given as ``(x_min, y_min, x_max, y_max)``."""
    left, top, right, bottom = bbox
    width = right - left
    height = bottom - top
    return width * height
145
+
146
+
147
def merge_all_icon_boxes(bboxes):
    """Reduce a list of boxes by resolving containment and overlap conflicts.

    Boxes are taken from the front of ``bboxes`` one at a time, so the
    input list is emptied. Each is compared with the boxes kept so far, in
    order, and the first conflict decides the outcome:

    - one box contains the other: the already-kept box stays and the new
      box is discarded;
    - the boxes overlap: the smaller-area box takes the kept slot.

    A box with no conflict is appended.

    NOTE(review): the nesting of the original ``to_add = False`` / ``break``
    statements was not recoverable from the flattened source. This assumes
    they run for every containment or overlap hit; confirm against the
    repository.

    Returns:
        The list of kept boxes.
    """
    kept = []
    while bboxes:
        candidate = bboxes.pop(0)

        for slot, current in enumerate(kept):
            if is_contained(candidate, current):
                # The original re-assigned `current` to its own slot when
                # the candidate was larger, which changes nothing.
                break
            if is_overlapping(candidate, current):
                if get_area(candidate) < get_area(current):
                    kept[slot] = candidate
                break
        else:
            kept.append(candidate)

    return kept
169
+
170
+
171
def merge_all_icon_boxes_new(elements):
    """Reduce a list of elements by resolving containment and overlap conflicts.

    Each element is a mapping with ``'position'`` (x, y) and ``'size'``
    (width, height). Its box is ``[x, y, x + width, y + height]``.
    Elements are taken from the front of ``elements`` one at a time, so the
    input list is emptied. Each is compared with the elements kept so far,
    in order, and the first conflict decides the outcome:

    - one box contains the other: the already-kept element stays and the
      new element is discarded;
    - the boxes overlap: the element with the smaller box takes the slot.

    An element with no conflict is appended.

    NOTE(review): the nesting of the original ``to_add = False`` / ``break``
    statements was not recoverable from the flattened source. This assumes
    they run for every containment or overlap hit; confirm against the
    repository.

    Returns:
        The list of kept elements.
    """
    def as_bbox(element):
        return [element['position'][0], element['position'][1], element['position'][0]+element['size'][0], element['position'][1]+element['size'][1]]

    kept = []
    while elements:
        candidate = elements.pop(0)
        candidate_box = as_bbox(candidate)

        for slot, current in enumerate(kept):
            current_box = as_bbox(current)
            if is_contained(candidate_box, current_box):
                # The original re-assigned `current` to its own slot when
                # the candidate was larger, which changes nothing.
                break
            if is_overlapping(candidate_box, current_box):
                if get_area(candidate_box) < get_area(current_box):
                    kept[slot] = candidate
                break
        else:
            kept.append(candidate)

    return kept
196
+
197
+
198
+
199
+
200
def merge_bbox_groups(A, B, iou_threshold=0.8):
    """Absorb boxes from ``B`` into matching boxes of ``A``, in place.

    For each box in ``A``, the first box in ``B`` whose ``calculate_iou``
    exceeds ``iou_threshold`` is merged into it with ``merge_boxes`` and
    removed from ``B``. The same slot of ``A`` is then re-checked against
    the remaining ``B``, using the enlarged box, until nothing matches.

    Returns:
        The mutated ``(A, B)`` pair.
    """
    idx = 0
    while idx < len(A):
        current = A[idx]
        match = next(
            (j for j, other in enumerate(B) if calculate_iou(current, other) > iou_threshold),
            None,
        )
        if match is None:
            idx += 1
            continue
        # Stay on the same slot so the enlarged box is compared again.
        A[idx] = merge_boxes(current, B[match])
        B.pop(match)

    return A, B
220
+
221
+
222
def bbox_iou(boxA, boxB):
    """Return the IoU of two ``[x1, y1, x2, y2]`` boxes with inclusive (+1) extents."""
    inter_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]) + 1)
    inter_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]) + 1)
    interArea = inter_w * inter_h

    def inclusive_area(box):
        return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)

    union = float(inclusive_area(boxA) + inclusive_area(boxB) - interArea)
    return interArea / union
233
+
234
+
235
def merge_boxes_and_texts_new(texts, bounding_boxes, iou_threshold=0):
    """Merge each unused box with the unused boxes overlapping it.

    For every box not yet consumed, all other unconsumed boxes whose
    ``bbox_iou`` with it exceeds ``iou_threshold`` are collected. The group
    is ordered by vertical centre, merged into one enclosing box, and its
    texts are concatenated without a separator. Overlap is tested against
    the seed box only.

    Args:
        texts: Sequence of strings aligned with ``bounding_boxes``.
        bounding_boxes: List of ``[x1, y1, x2, y2]`` boxes.
        iou_threshold: IoU above which a box joins the seed's group.

    Returns:
        ``(merged_texts, merged_boxes)``. Box coordinates come from a NumPy
        array, so they are NumPy scalars.
    """
    if not bounding_boxes:
        return [], []

    bounding_boxes = np.array(bounding_boxes)
    used = np.zeros(len(bounding_boxes), dtype=bool)
    merged_boxes, merged_texts = [], []

    def center_y(idx):
        return (bounding_boxes[idx][1] + bounding_boxes[idx][3])/2

    for i, boxA in enumerate(bounding_boxes):
        if used[i]:
            continue

        # The seed itself is always part of its group.
        group = [i] + [
            j
            for j, boxB in enumerate(bounding_boxes)
            if i != j and not used[j] and bbox_iou(boxA, boxB) > iou_threshold
        ]
        # Top-to-bottom by vertical centre; the sort is stable.
        group.sort(key=center_y)  # TODO

        x_min, y_min, x_max, y_max = boxA
        pieces = []
        for idx in group:
            member = bounding_boxes[idx]
            x_min, y_min = min(x_min, member[0]), min(y_min, member[1])
            x_max, y_max = max(x_max, member[2]), max(y_max, member[3])
            pieces.append(texts[idx])
            used[idx] = True

        merged_boxes.append([x_min, y_min, x_max, y_max])
        merged_texts.append(''.join(pieces))
        used[i] = True

    return merged_texts, merged_boxes
PCAgent/prompt_qwen.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PC
2
def get_subtask_prompt_cn(instruction):
    """Build the Chinese-language prompt for splitting an instruction into subtasks.

    The prompt is three pieces concatenated: a fixed explanation of the four
    subtask types with a worked example and notes, the user instruction
    substituted into a "User Instruction:" section, and a fixed description
    of the expected output format.

    Args:
        instruction: The user's instruction; inserted via ``str.format``.

    Returns:
        The complete prompt string.
    """
    # Fixed task description. Its braces are literal: this string is never
    # passed through str.format.
    func_prompt = '''多模态agent通过执行点击、输入等一系列操作来完成用户的指令。
用户指令可能由跨越多个应用程序的数个子任务组成,我希望你能将这个复杂的指令,分解为一些子任务,子任务有4种类型:
1. 常规字符串形式:例如“在系统设置中,打开深色模式”;
2. 包含字典内容的字符串:当前子任务的执行结果需要以字典方式传递给其他子任务,例如“在Outlook中,查看‘Paul’发来的邮件信息,以dict形式输出{'contact': 'Paul', 'mail_content': 'content of the email'}”;
3. 格式化字符串:利用前序子任务传递的信息,补全当前子任务后,能够完全独立执行,例如“将{mail_content}通过短信发送给‘Joey’”;
4. 包含字典内容的格式化字符串:既需要前序子任务传递的信息,以补全当前子任务,同时执行的结果也需要以字典方式传递给其他子任务,例如“在谷歌中搜索{question},并将相关信息以dict形式输出{'info': 'related information'}”。

举例来说,复合指令“系统设置中打开深色模式,在微信中查看‘John’发来的问题,在Chrome中搜索问题的答案,将答案添加到一个新建word文档中,保存为‘作业.docx’,然后发送给‘John’。”可以被分解为:
{
"subtask 1": "在系统设置中,打开深色模式",
"subtask 2": "在微信中,查看‘John’发来的问题,将问题以dict形式输出{'John_question': 'content of the question'}",
"subtask 3": "在Chrome中,搜索{John_question},将搜索到的答案以dict形式输出{'John_question_answer': 'answer to the question'}",
"subtask 4": "在Word中,新建一个文档,写入{John_question_answer},并保存为‘作业.docx’",
"subtask 5": "在微信中,发送‘作业.docx’给‘John’"
}

需要注意:
1. 包含字典内容的字符串或格式化字符串,需要尽可能详细地说明dict中各个key的含义,即将哪些内容以dict的形式输出;
2. 每个格式化字符串形式的子任务中包含的key,在前序子任务中要有对应的dict形式输出,也就是说,前序子任务执行完成后,保证当前子任务能够通过参数传递得到补全,从而可以独立执行。
3. 必须保证,每个子任务,无论是常规字符串,还是补全之后的格式化字符串,能够完全脱离其他子任务独立执行。例如“在Word中新建一个文档,写入{John_question_answer}”可以独立执行,但“将修改后的Word文档通过邮件发送给{name}”则因为‘Word文档’指代不明确无法独立执行。
4. 拆解后的每个子任务要有明确的应用程序,例如‘在Chrome中’、‘在Word中’等。一般而言,docx格式文档用Word程序打开,xlsx格式表格用Excel程序打开。此外,需要打开文件时,要明确文件的名字。
'''

    # The only piece that is formatted; "{}" receives the instruction.
    inst_prompt = '''
User Instruction:
{}
'''

    # Output-format description; its braces are literal as well.
    format_prompt = '''
请你按照如下格式输出拆分后的子任务:
{
"subtask 1": ,
"subtask 2": ,
...
}
'''
    prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
    return prompt
41
+
42
+
43
+
44
def get_subtask_prompt(instruction):
    """Build the English-language prompt for splitting an instruction into subtasks.

    The prompt is three pieces concatenated: a fixed explanation of the four
    subtask types with a worked example and notes, the user instruction
    substituted into a "User Instruction:" section, and a fixed description
    of the expected output format.

    Args:
        instruction: The user's instruction; inserted via ``str.format``.

    Returns:
        The complete prompt string.
    """
    # Fixed task description. Its braces are literal: this string is never
    # passed through str.format.
    # NOTE(review): item 3 contains a duplicated "Note: Note:"; it is left
    # untouched here because it is runtime prompt text.
    func_prompt = '''A multi-modal agent completes a user's instruction by performing a series of actions such as clicking and typing. A user's instruction may consist of multiple subtasks across different applications. I want you to break down this complex instruction into several subtasks, which are of four types:

1. Regular string: For example, "Open dark mode in system settings";
2. String containing dictionary content: The result of the current subtask needs to be passed to other subtasks in a dictionary format, for example, "Check the emails from 'Paul' in Outlook and output the email details in a dict format like {'contact': 'Paul', 'mail_content': 'content of the email'}";
3. Formatted string containing the keys from previous subtasks: Use the information from previous subtasks to complete and independently execute the current subtask, for example, "Send {mail_content} via SMS to 'Joey'". Note: Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''";
4. Formatted string containing the keys from previous subtasks and the dictionary content: This requires both information from previous subtasks to complete the current subtask and the result also needs to be passed to other subtasks in a dictionary format, for example, "Search for {question} on Google and output the relevant information in a dict format like {'info': 'related information'}". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''".


For example, the compound instruction "Open dark mode in system settings, check the two questions sent by 'John' in WeChat, search for answers to these two questions in Chrome, add the answers to a new Word document, save it as 'assignment.docx', and then send it to 'John'." can be broken down into:
{
"subtask 1": "Open dark mode in system settings",
"subtask 2": "Check the questions sent by 'John' in WeChat and output the questions in a dict format {'John_question_1': 'content of John\'s question_1', 'John_question_2': 'content of John\'s question_2'}",
"subtask 3": "Search for {John_question_1} in Chrome and output the found answer in a dict format {'John_question_1_answer': 'answer to the question_1'}",
"subtask 4": "Search for {John_question_2} in Chrome and output the found answer in a dict format {'John_question_2_answer': 'answer to the question_2'}",
"subtask 5": "Create a new document in Word, write {John_question_1_answer} and {John_question_2_answer} sequentially, then save it as 'assignment.docx'",
"subtask 6": "Send 'assignment.docx' to 'John' via WeChat"
}

Notes:
1. Strings or formatted strings containing dictionary content should explain as detailed as possible the meaning of each key in the dict, i.e., what content should be output in dict form;
2. Each key in a formatted string subtask must have a corresponding dict output in preceding subtasks, ensuring that after a preceding subtask is completed, the current subtask can be fully completed through parameter passing and thus executed independently.
3. Ensure each subtask, whether as a regular string or a completed formatted string, can be executed independently of other subtasks. For example, "Create a new document in Word and write {John_question_answer}" can be executed independently, but "Send the modified Word document via email to {name}" cannot because "Word document" is ambiguous and cannot be executed independently.
4. Each subtask must specify a clear application, such as 'in Chrome' or 'in Word'. Generally, docx formatted documents are opened with Word, and xlsx spreadsheets are opened with Excel. Additionally, when opening a file, clearly state the file name.
5. Note that if a subtask contains a dict, ensure that the values in the dictionary do not contain single quote characters to avoid format errors.
'''

    # The only piece that is formatted; "{}" receives the instruction.
    inst_prompt = '''
User Instruction:
{}
'''

    # Output-format description; its braces are literal as well.
    format_prompt = '''
Please output the split subtasks in the following format:
{
"subtask 1": ,
"subtask 2": ,
...
}
'''
    prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
    return prompt
86
+
87
+
88
+
89
+
90
def get_select_prompt(content):
    """Build the prompt asking for the first and last line of a text range.

    Args:
        content: Description of the text range; inserted via ``str.format``.

    Returns:
        The task description followed by the required answer format.
    """
    task_section = '''
Analyze the specified text range {} and output the first line and last line of the specified range separately.
How to identify paragraphs: There are 2 spaces at the beginning of each paragraph. Define the title as the single line at the top.
If the content has only one line (such as title), it is both the first and last line.'''

    answer_section = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    return "".join((task_section.format(content), answer_section))
103
+
104
+
105
+
106
+
107
def get_select_prompt_simple(content):
    """Build the shorter prompt asking for the first and last line of a range.

    Args:
        content: Description of the text range; inserted via ``str.format``.

    Returns:
        The task description followed by the required answer format.
    """
    task_section = '''
Analyze the text range of this part of the current Word document: {}, and output the content of the first and last lines separately.
If the content has only one line in total, this line is the first line and also the last line.'''

    answer_section = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    return "".join((task_section.format(content), answer_section))
119
+
120
+
121
+
122
def get_select_prompt_backup(content):
    """Build the fallback prompt asking for the first and last line directly.

    Args:
        content: Description of the content; inserted via ``str.format``.

    Returns:
        The task description followed by the required answer format.
    """
    task_section = '''
Directly output the first line and the last line of the content: {} in the current shown Microsoft Word document. If the content has only one line, output this line twice.'''

    answer_section = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    return "".join((task_section.format(content), answer_section))
133
+
134
+
135
def get_action_prompt(instruction, clickable_infos, width, height, thought_history, summary_history, action_history, reflection_history, last_summary, last_action, reflection_thought, add_info, error_flag, completed_content, memory):
    """Build the prompt that asks the model to choose the next screen action.

    Args:
        instruction: The user's instruction, embedded in the background section.
        clickable_infos: Iterable of dicts with 'coordinates' and 'text' keys.
            Entries with empty text, the text "icon: None" or coordinates
            equal to (0, 0) are left out of the listing.
        width: Screenshot width in pixels.
        height: Screenshot height in pixels.
        thought_history: Not used by this function; kept for call compatibility.
        summary_history: Per-step summaries; only the part before the first
            " to " is shown as the step's operation.
        action_history: Per-step action strings; one history line per entry.
        reflection_history: Per-step reflections. A step without a matching
            entry is listed without a reflection.
        last_summary: Summary of the previous operation (used when error_flag).
        last_action: Action of the previous operation (used when error_flag).
        reflection_thought: Not used by this function; kept for call compatibility.
        add_info: Text placed verbatim under the "Tips" heading.
        error_flag: When truthy, a "Last operation" section is added and the
            same note is printed to stdout.
        completed_content: Progress text; its section is skipped when "".
        memory: Memory text; its section is skipped when "".

    Returns:
        The assembled prompt string.
    """
    prompt = "### Background ###\n"
    prompt += f"This image is a computer screenshot where icons are marked with numbers. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n"

    prompt += "### Tips ###\n"
    prompt += add_info
    prompt += "\n\n"

    prompt += "### Screenshot information ###\n"
    prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
    prompt += "This information consists of two parts: coordinates; content. "
    prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
    prompt += "the content is a text or 'icon' respectively. "
    prompt += "The information is as follow:\n"

    for clickable_info in clickable_infos:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"

    if len(action_history) > 0:
        prompt += "### History operations ###\n"
        prompt += "Before arriving at the current screenshot, you have completed the following operations:\n"
        for i, past_action in enumerate(action_history):
            operation = summary_history[i].split(' to ')[0].strip()
            step = f"Step-{i+1}: [Operation: " + operation + "; Action: " + past_action
            # Only attach a reflection when one exists for this step, so a
            # shorter reflection list no longer raises IndexError.
            if i < len(reflection_history):
                step += "; Reflection: " + reflection_history[i]
            prompt += step + "]\n"
        prompt += "\n"

    if completed_content != "":
        prompt += "### Progress ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

    if memory != "":
        prompt += "### Memory ###\n"
        prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
        prompt += "Memory:\n" + memory + "\n"

    # Original note on this section: "disabled".
    if error_flag:
        # Build the note once; it goes both into the prompt and to stdout.
        last_operation_note = f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
        prompt += "### Last operation ###\n"
        prompt += last_operation_note
        prompt += "\n\n"
        print(last_operation_note)

    prompt += "### Task requirements ###\n"
    prompt += "In order to meet the user\'s requirements, you need to select one of the following operations to operate on the current screen:\n"
    prompt += "Note that to open an app, use the Open App action, rather than tapping the app's icon. "
    prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices."
    prompt += "You must choose one of the actions below:\n"
    # Trailing newline added so this entry no longer fuses with "Right Tap".
    prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'.\n"
    prompt += "Right Tap (x, y): Right tap the position (x, y) in current page. This can be used to create a new file.\n"
    prompt += "Tap (x, y): Tap the position (x, y) in current page. This can be used to select an item.\n"
    prompt += "Double Tap (x, y): Double tap the position (x, y) in the current page. This can be used to open a file. If Tap (x, y) in the last step doesn't work, you can try double tap the position (x, y) in the current page.\n"

    prompt += (
        "\n"
        "Shortcut (key1, key2): There are several shortcuts (key1+key2) you may use.\n"
        "For example, if you can't find the download button, use command+s to save the page or download the file.\n"
        "To select all, you can use command+a.\n"
        "To create a new file in Word/Excel, you can use command+n.\n"
        "To create a new tab for starting a new search in Chrome, you can use command+t.\n"
        "To copy an item, you can first select it and then use command+c.\n"
        "To paste the copied item, you can first select the location you want to paste it to, and then use command+v.\n"
    )
    prompt += (
        "\n"
        "Press (key name): There are several keys that may help.\n"
        "For example, if you want to delete the selected content, press 'backspace'.\n"
        "You can press 'enter' to confirm, submit the input command, or insert a line break.\n"
        "Also, you can press 'up', 'down', 'left', or 'right' to scroll the page or adjust the position of the selected object.\n"
    )

    prompt += "Type (x, y), (text): Tap the position (x, y) and type the \"text\" in the input box and press the enter key. You should replace the \"text\" with the actual input.\n"

    prompt += "Select (content): Select the referred 'content' in the current document, such as 'title', 'the second paragraph' and 'the last two paragraphs'. This action is useful when you want to edit a certain part of the document, such as bolding, adding underlines, changing line spacing, centering text, etc.\n"
    prompt += "Replace (x, y), (text): Replace the editable content in (x, y) with the \"text\". You should replace the \"text\" with the actual input. This action is very useful when you want to start a new search in Chrome or rename a file.\n"
    prompt += "Append (x, y), (text): Append the \"text\" content after the content at (x, y) location. This action is useful when you want to append new content into a word document.\n"

    prompt += "Tell (answer): Tell me the answer of the input query.\n"
    prompt += "Stop: If all the operations to meet the user\'s requirements have been completed in ### History operation ###, use this operation to stop the whole process."
    prompt += "\n\n"

    prompt += "### Output format ###\n"
    # modified 2.10
    prompt += "You should output in the following json format:"
    prompt += (
        '\n'
        '{"Thought": "This is your thinking about how to proceed the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}\n'
    )
    prompt += "\n\n"

    return prompt
230
+
231
+
232
def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info, no_image=0):
    """Build the prompt that asks the model to judge the last operation.

    The prompt lists the perceived screen elements before
    (``clickable_infos1``) and after (``clickable_infos2``) the operation,
    describes the operation, and asks for an A/B/C verdict.

    Args:
        instruction: The user's instruction.
        clickable_infos1: Dicts with 'coordinates' and 'text' for the screen
            before the operation.
        clickable_infos2: Same structure for the screen after the operation.
        width: Screen width in pixels.
        height: Screen height in pixels.
        summary: Operation summary; only the part before the first " to "
            is reported as the operation thought.
        action: The executed action string.
        add_info: Extra requirements; omitted from the prompt when "".
        no_image: When equal to 1, the wording refers to screenshot
            information instead of screenshots.

    Returns:
        The assembled prompt string.
    """
    text_only = no_image == 1

    def _listing(infos):
        # One "coordinates; text" line per entry that carries real content.
        return "".join(
            f"{info['coordinates']}; {info['text']}\n"
            for info in infos
            if info['text'] != "" and info['text'] != "icon: None" and info['coordinates'] != (0, 0)
        )

    if text_only:
        opening = f"The computer screen's width is {width} pixels and the height is {height} pixels.\n\n"
        basis = "Now you need to output the following content based on the screenshots information before and after the current operation:\n"
    else:
        opening = f"These images are two computer screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"
        basis = "Now you need to output the following content based on the screenshots before and after the current operation:\n"

    pieces = [
        opening,
        "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot. ",
        "The information consists of two parts, consisting of format: coordinates; content. ",
        "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively ",
        "\n\n",
        "### Before the current operation ###\n",
        "Screenshot information:\n",
        _listing(clickable_infos1),
        "\n\n",
        "### After the current operation ###\n",
        "Screenshot information:\n",
        _listing(clickable_infos2),
        "\n\n",
        "### Current operation ###\n",
        f"The user\'s instruction is: {instruction}.",
    ]

    if add_info != "":
        pieces.append(f"You also need to note the following requirements: {add_info}.")

    pieces.extend([
        "In the process of completing the requirements of instruction, an operation is performed on the computer. Below are the details of this operation:\n",
        "Operation thought: " + summary.split(" to ")[0].strip() + "\n",
        "Operation action: " + action,
        "\n\n",
        "### Response requirements ###\n",
        basis,
        "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n",
        "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n",
        "B: The \"Operation action\" results in a wrong page and I need to do something to correct this.\n",
        "C: The \"Operation action\" produces no changes.",
        "\n\n",
        "### Output format ###\n",
        "Your output format is:\n",
        "### Thought ###\nYour thought about the question\n",
        "### Answer ###\nA or B or C",
    ])

    return "".join(pieces)
283
+
284
+
285
def get_memory_prompt(insight):
    """Build the prompt that asks the model what to remember from the page.

    Args:
        insight: Description of the content worth recording. When it is "",
            the model is asked for content related to the user's instruction
            instead.

    Returns:
        The assembled prompt string.
    """
    if insight != "":
        prompt = "### Important content ###\n"
        prompt += insight
        prompt += "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    else:
        prompt = "### Response requirements ###\n"
        # Spelling of "instruction" corrected in the text sent to the model.
        prompt += "Please think about whether there is any content closely related to user\'s instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output format is:\n"
    prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."

    return prompt
303
+
304
def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=None):
    """Build the prompt that asks the model to update "Completed contents".

    With more than one recorded thought the prompt lists every past step and
    asks for an updated summary; otherwise it describes only the latest
    operation and asks for an initial summary.

    Args:
        instruction: The user's instruction.
        thought_history: Per-step thoughts; its length selects the branch and
            its last entry is quoted in the single-operation branch.
        summary_history: Per-step summaries; only the part before the first
            " to " is used.
        action_history: Per-step action strings.
        completed_content: The current "Completed contents" text.
        add_info: Hint text; its section is skipped when "".
        reflection_history: Optional per-step reflections. ``None`` (the
            default) is treated as no reflections.

    Returns:
        The assembled prompt string.
    """
    # A None default replaces the shared mutable list default.
    reflections = [] if reflection_history is None else reflection_history

    prompt = "### Background ###\n"
    prompt += f"There is an user\'s instruction which is: {instruction}. You are a computer operating assistant and are operating the user\'s computer.\n\n"

    if add_info != "":
        prompt += "### Hint ###\n"
        prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
        prompt += add_info
        prompt += "\n\n"

    if len(thought_history) > 1:
        prompt += "### History operations ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n"
        for i, step_summary in enumerate(summary_history):
            operation = step_summary.split(" to ")[0].strip()
            step = f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i]
            # Only attach a reflection when one exists for this step, so a
            # shorter reflection list no longer raises IndexError.
            if i < len(reflections):
                step += "; Operation reflection: " + reflections[i]
            prompt += step + "]\n"
        prompt += "\n"

        prompt += "### Progress thinking ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."

    else:
        prompt += "### Current operation ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
        prompt += f"Operation thought: {thought_history[-1]}\n"
        operation = summary_history[-1].split(" to ")[0].strip()
        if len(reflections) > 0:
            prompt += f"Operation action: {operation}\n" + "Operation reflection: " + reflections[-1] + "\n\n"
        else:
            prompt += f"Operation action: {operation}\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
        prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
        # NOTE(review): assumed to belong to this branch only - confirm against the original indentation.
        prompt += "(Please use English to output)"

    return prompt
PCAgent/text_localization.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ import os
5
+ from alibabacloud_tea_util import models as util_models
6
+ from alibabacloud_tea_openapi import models as open_api_models
7
+ from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models
8
+ from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client
9
+
10
class Sample:
    """Helpers for calling the Alibaba Cloud OCR ``RecognizeAllText`` API."""

    def __init__(self):
        pass

    @staticmethod
    def create_client() -> ocr_api20210707Client:
        """Create an OCR client configured from environment variables.

        Credentials are read from ``OCR_ACCESS_KEY_ID`` and
        ``OCR_ACCESS_KEY_SECRET``; a missing variable is passed on as None.
        """
        config = open_api_models.Config(
            access_key_id=os.environ.get('OCR_ACCESS_KEY_ID'),
            access_key_secret=os.environ.get('OCR_ACCESS_KEY_SECRET'),
        )
        config.endpoint = 'ocr-api.cn-hangzhou.aliyuncs.com'
        return ocr_api20210707Client(config)

    @staticmethod
    def main(image):
        """Send ``image`` to the OCR service and return its block details.

        Args:
            image: Request body passed to ``RecognizeAllTextRequest``
                (the caller in this file passes the raw bytes of an image file).

        Returns:
            ``block_info.block_details`` of the first sub-image in the
            response.
        """
        client = Sample.create_client()
        recognize_all_text_request = ocr_api_20210707_models.RecognizeAllTextRequest(
            body=image,
            type='Advanced',
            output_coordinate='points',
            output_oricoord=True,
        )
        runtime = util_models.RuntimeOptions()
        response = client.recognize_all_text_with_options(recognize_all_text_request, runtime)
        return response.body.data.sub_images[0].block_info.block_details
37
+
38
def image_to_binary(image_path):
    """Return the raw bytes of the file at ``image_path``."""
    with open(image_path, 'rb') as handle:
        return handle.read()
42
+
43
def remove_punctuation(text):
    """Strip punctuation, underscores, whitespace and a few noise characters.

    Everything that is not a word character is removed, along with "_" and
    the characters "v", "o", "O", "T", "Q" and "丶".
    """
    # \W covers punctuation and whitespace; the rest are listed explicitly.
    return re.sub(r'[\W_voOTQ丶]', '', text)
49
+
50
+
51
class OCRError(Exception):
    """Exception carrying an OCR failure message in ``message``."""

    def __init__(self, message):
        self.message = message
        super().__init__(message)
55
+
56
def ocr(image_path):
    """Run OCR on an image file and return recognized texts with boxes.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        A ``(text, coordinate)`` tuple: ``text`` holds each block's content
        and ``coordinate`` holds, per block, ``[x0, y0, x2, y2]`` built from
        the block's first and third points.

    Raises:
        OCRError: If the OCR request fails; the original exception is
            chained as the cause.
    """
    image = image_to_binary(image_path)
    print(image_path)
    try:
        outputs = Sample.main(image)
    except Exception as e:
        # Not every exception has a ``message`` attribute; fall back to
        # str(e) so the real failure is reported instead of AttributeError.
        raise OCRError(getattr(e, "message", None) or str(e)) from e

    text = []
    coordinate = []
    for output in outputs:
        points = output.block_points
        text.append(output.block_content)
        coordinate.append([int(points[0].x), int(points[0].y), int(points[2].x), int(points[2].y)])

    return text, coordinate
PCAgent/text_localization_old.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ from PCAgent.crop import crop_image, calculate_size
4
+ from PIL import Image
5
+
6
+
7
def order_point(coor):
    """Order four 2-D points by their angle around the centroid.

    ``coor`` is reshaped to four (x, y) rows, the rows are sorted by
    ``arctan2(y - cy, x - cx)`` in ascending order, and the order is rotated
    by one position when the first sorted point lies to the right of the
    centroid. The result is a (4, 2) float32 array.
    """
    corners = np.array(coor).reshape([4, 2])
    center = np.sum(corners, 0) / corners.shape[0]
    angles = np.arctan2(corners[:, 1] - center[1], corners[:, 0] - center[0])
    ordered = corners[np.argsort(angles)].reshape([4, -1])
    if ordered[0][0] > center[0]:
        # Move the last point to the front, shifting the others back by one.
        ordered = np.roll(ordered, 1, axis=0)
    return ordered.reshape([4, 2]).astype('float32')
18
+
19
+
20
def longest_common_substring_length(str1, str2):
    """Return the dynamic-programming match length of ``str1`` and ``str2``.

    Despite the name, the recurrence carries the best value forward on a
    mismatch, so the result is the length of the longest common
    *subsequence* (matching characters need not be adjacent).
    """
    # Only the previous row of the table is needed to compute the next one.
    previous = [0] * (len(str2) + 1)
    for left_char in str1:
        current = [0]
        for col, right_char in enumerate(str2, start=1):
            if left_char == right_char:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[-1]
33
+
34
+
35
def ocr(image_path, ocr_detection, ocr_recognition):
    """Detect and recognize text in an image with the given pipelines.

    Args:
        image_path: Path of the image, loaded with ``cv2.imread``.
        ocr_detection: Callable returning a mapping whose 'polygons' entry
            holds one polygon per detected text region.
        ocr_recognition: Callable returning a mapping whose 'text' entry is
            indexable; its first element is used as the recognized string.

    Returns:
        A ``(text_data, coordinate)`` tuple. ``coordinate`` holds, per
        region, the first and third ordered corner as ``[x0, y0, x2, y2]``.
        If detection fails, ``(['no text'], [[0, 0, 0, 0]])`` is returned.
    """
    text_data = []
    coordinate = []

    image_full = cv2.imread(image_path)
    try:
        det_result = ocr_detection(image_full)
    except Exception:
        # Best-effort fallback, but KeyboardInterrupt/SystemExit still propagate.
        print('not text detected')
        return ['no text'], [[0,0,0,0]]
    det_result = det_result['polygons']
    for i in range(det_result.shape[0]):
        pts = order_point(det_result[i])
        image_crop = crop_image(image_full, pts)

        try:
            result = ocr_recognition(image_crop)['text'][0]
        except Exception:
            # Skip regions the recognizer cannot handle.
            continue

        box = [int(e) for e in list(pts.reshape(-1))]
        # Keep only the first and third corner of the ordered quadrilateral.
        box = [box[0], box[1], box[4], box[5]]

        text_data.append(result)
        coordinate.append(box)

    return text_data, coordinate
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PC Agent
3
+ emoji: 💬
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.0.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
12
+ ---
13
+
14
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -7,6 +7,7 @@ import copy
7
  import shutil
8
  import base64
9
  import random
 
10
  import gradio as gr
11
  from datetime import datetime
12
  from modelscope.pipelines import pipeline
@@ -27,8 +28,18 @@ API_url = os.environ.get('API_url')
27
  token = os.environ.get('token')
28
  os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
29
  os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
 
30
  radius = 100
31
 
 
 
 
 
 
 
 
 
 
32
  chatbot_css = """
33
  <style>
34
  .chat-container {
@@ -287,7 +298,7 @@ def chatbot(image, instruction, add_info, history, chat_log):
287
  screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
288
  image.save(screenshot_file, format="PNG")
289
  screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
290
- perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="C:/Windows/Fonts/arial.ttf")
291
  shutil.rmtree(temp_file)
292
  os.mkdir(temp_file)
293
 
 
7
  import shutil
8
  import base64
9
  import random
10
+ import requests
11
  import gradio as gr
12
  from datetime import datetime
13
  from modelscope.pipelines import pipeline
 
28
  token = os.environ.get('token')
29
  os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
30
  os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
31
+ tff_file = os.environ.get('tff_file')
32
  radius = 100
33
 
34
def download_file(url, save_path, timeout=60):
    """Download ``url`` to ``save_path`` in streamed chunks.

    Args:
        url: Address of the file to fetch.
        save_path: Destination file path; its parent directory is created
            when it does not exist.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Stream the body and release the connection when the block exits.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # make sure the request succeeded
        with open(save_path, 'wb') as file:
            # Write chunk by chunk to avoid holding the whole file in memory.
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
40
+
41
+ download_file(tff_file, "font/arial.ttf")
42
+
43
  chatbot_css = """
44
  <style>
45
  .chat-container {
 
298
  screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
299
  image.save(screenshot_file, format="PNG")
300
  screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
301
+ perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="font/arial.ttf")
302
  shutil.rmtree(temp_file)
303
  os.mkdir(temp_file)
304
 
example/1-1.jpg ADDED
example/1-2.jpg ADDED
example/1-3.jpg ADDED