้˜ณๆธ  committed on
Commit
3afb4b6
ยท
1 Parent(s): e78b889
PCAgent/__pycache__/api.cpython-310.pyc ADDED
Binary file (1.47 kB).

PCAgent/__pycache__/chat.cpython-310.pyc ADDED
Binary file (2.45 kB).

PCAgent/__pycache__/crop.cpython-310.pyc ADDED
Binary file (3.18 kB).

PCAgent/__pycache__/icon_localization.cpython-310.pyc ADDED
Binary file (1.75 kB).

PCAgent/__pycache__/merge_strategy.cpython-310.pyc ADDED
Binary file (6.25 kB).

PCAgent/__pycache__/prompt_qwen.cpython-310.pyc ADDED
Binary file (20.5 kB).

PCAgent/__pycache__/text_localization.cpython-310.pyc ADDED
Binary file (2.87 kB).
 
PCAgent/api.py ADDED
@@ -0,0 +1,77 @@
+ import base64
+ import requests
+ import time
+
+ import pdb
+ import dashscope
+ from dashscope import MultiModalConversation
+
+ from PIL import Image
+ import io
+ from openai import OpenAI
+ import json
+
+ def resize_encode_image(image_path, screen_scale_ratio=1):
+     with Image.open(image_path) as img:
+         new_width = int(img.width * screen_scale_ratio)
+         new_height = int(img.height * screen_scale_ratio)
+         resized_img = img.resize((new_width, new_height), Image.LANCZOS)
+
+         buffered = io.BytesIO()
+         resized_img.save(buffered, format="PNG")
+
+         img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+         return img_base64
+     # with open(image_path, "rb") as image_file:
+     #     return base64.b64encode(image_file.read()).decode('utf-8')
+
+
+ def inference_chat(chat, model, api_url, token):
+
+     messages = []
+     for role, content in chat:
+         messages.append({"role": role, "content": content})
+
+     client = OpenAI(
+         # If the environment variable is not configured, replace the line below with your Model Studio (Bailian) API key: api_key="sk-xxx",
+         api_key=token,
+         base_url=api_url,
+     )
+
+     num_try = 5
+     for _ in range(num_try):
+         try:
+             completion = client.chat.completions.create(
+                 model=model,  # qwen-plus is used here as an example; change the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
+                 messages=messages
+             )
+         except:
+             print("Network Error:")
+             try:
+                 print(completion.model_dump_json())
+             except:
+                 print("Request Failed")
+             time.sleep(2)
+         else:
+             break
+
+     return json.loads(completion.model_dump_json())['choices'][0]['message']['content']
+
+     # headers = {
+     #     "Content-Type": "application/json",
+     #     "Authorization": f"Bearer {token}"
+     # }
+
+     # data = {
+     #     "model": model,
+     #     "messages": [],
+     #     "max_tokens": 2048,
+     #     'temperature': 0.0,
+     #     "seed": 1234
+     # }
+
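A minimal usage sketch for the two helpers above. The endpoint URL, model name, and key below are placeholders for illustration, not part of this commit; any OpenAI-compatible multimodal endpoint is assumed.

```python
# Hypothetical usage of PCAgent.api; api_url, model, and token are placeholders.
from PCAgent.api import resize_encode_image, inference_chat

token = "sk-..."                                                    # assumed API key
api_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"       # assumed OpenAI-compatible base URL
model = "qwen-vl-max"                                               # assumed multimodal model name

b64 = resize_encode_image("screenshot.png", screen_scale_ratio=0.5)
chat = [
    ("system", [{"type": "text", "text": "You are a helpful AI PC operating assistant."}]),
    ("user", [
        {"type": "text", "text": "Describe this screenshot."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
    ]),
]
answer = inference_chat(chat, model, api_url, token)
print(answer)
```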
PCAgent/chat.py ADDED
@@ -0,0 +1,123 @@
+ import copy
+ from PCAgent.api import resize_encode_image
+
+
+ def init_subtask_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_action_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant. You need to help me operate the PC to complete the user's instruction."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_reflect_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def init_memory_chat():
+     operation_history = []
+     system_prompt = "You are a helpful AI PC operating assistant."
+     operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
+     return operation_history
+
+
+ def add_response_old(role, prompt, chat_history, image=None):
+     new_chat_history = copy.deepcopy(chat_history)
+     if image:
+         base64_image = resize_encode_image(image)
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+             {
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{base64_image}"
+                 }
+             },
+         ]
+     else:
+         content = [
+             {
+                 "type": "text",
+                 "text": prompt
+             },
+         ]
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response(role, prompt, chat_history, image=[], use_qwen=False):
+     new_chat_history = copy.deepcopy(chat_history)
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+     ]
+     for i in range(len(image)):
+         if not use_qwen:
+             base64_image = resize_encode_image(image[i])
+             content.append(
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/png;base64,{base64_image}"
+                     }
+                 }
+             )
+         else:
+             content.append(
+                 {
+                     "type": "image",
+                     "image": image[i]
+                 }
+             )
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def add_response_two_image(role, prompt, chat_history, image):
+     new_chat_history = copy.deepcopy(chat_history)
+
+     base64_image1 = resize_encode_image(image[0])
+     base64_image2 = resize_encode_image(image[1])
+     content = [
+         {
+             "type": "text",
+             "text": prompt
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image1}"
+             }
+         },
+         {
+             "type": "image_url",
+             "image_url": {
+                 "url": f"data:image/jpeg;base64,{base64_image2}"
+             }
+         },
+     ]
+
+     new_chat_history.append([role, content])
+     return new_chat_history
+
+
+ def print_status(chat_history):
+     print("*"*100)
+     for chat in chat_history:
+         print("role:", chat[0])
+         print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
+     print("*"*100)
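A short sketch of how these helpers compose a conversation; the screenshot path is a placeholder.

```python
# Hypothetical example of building a chat history with PCAgent.chat helpers.
from PCAgent.chat import init_action_chat, add_response, print_status

history = init_action_chat()                          # system message only
history = add_response("user", "What should I click next?", history,
                       image=["screenshot.png"])      # text plus one base64-encoded image
print_status(history)                                 # prints each role and an "<image>" marker per attached image
```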
PCAgent/crop.py ADDED
@@ -0,0 +1,120 @@
+ import math
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+
+
+ def crop_image(img, position):
+     def distance(x1, y1, x2, y2):
+         return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
+     position = position.tolist()
+     for i in range(4):
+         for j in range(i+1, 4):
+             if (position[i][0] > position[j][0]):
+                 tmp = position[j]
+                 position[j] = position[i]
+                 position[i] = tmp
+     if position[0][1] > position[1][1]:
+         tmp = position[0]
+         position[0] = position[1]
+         position[1] = tmp
+
+     if position[2][1] > position[3][1]:
+         tmp = position[2]
+         position[2] = position[3]
+         position[3] = tmp
+
+     x1, y1 = position[0][0], position[0][1]
+     x2, y2 = position[2][0], position[2][1]
+     x3, y3 = position[3][0], position[3][1]
+     x4, y4 = position[1][0], position[1][1]
+
+     corners = np.zeros((4, 2), np.float32)
+     corners[0] = [x1, y1]
+     corners[1] = [x2, y2]
+     corners[2] = [x4, y4]
+     corners[3] = [x3, y3]
+
+     img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
+     img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)
+
+     corners_trans = np.zeros((4, 2), np.float32)
+     corners_trans[0] = [0, 0]
+     corners_trans[1] = [img_width - 1, 0]
+     corners_trans[2] = [0, img_height - 1]
+     corners_trans[3] = [img_width - 1, img_height - 1]
+
+     transform = cv2.getPerspectiveTransform(corners, corners_trans)
+     dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
+     return dst
+
+
+ def calculate_size(box):
+     return (box[2]-box[0]) * (box[3]-box[1])
+
+
+ def calculate_iou(box1, box2):
+     xA = max(box1[0], box2[0])
+     yA = max(box1[1], box2[1])
+     xB = min(box1[2], box2[2])
+     yB = min(box1[3], box2[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+     unionArea = box1Area + box2Area - interArea
+     iou = interArea / unionArea
+
+     return iou
+
+
+ def crop(image, box, i, text_data=None):
+     image = Image.open(image)
+
+     if text_data:
+         draw = ImageDraw.Draw(image)
+         draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
+         # font_size = int((text_data[3] - text_data[1])*0.75)
+         # font = ImageFont.truetype("arial.ttf", font_size)
+         # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")
+
+     cropped_image = image.crop(box)
+     cropped_image.save(f"./temp/{i}.jpg")
+
+
+ def in_box(box, target):
+     if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
+         return True
+     else:
+         return False
+
+
+ def crop_for_clip(image, box, i, position):
+     image = Image.open(image)
+     w, h = image.size
+     if position == "left":
+         bound = [0, 0, w/2, h]
+     elif position == "right":
+         bound = [w/2, 0, w, h]
+     elif position == "top":
+         bound = [0, 0, w, h/2]
+     elif position == "bottom":
+         bound = [0, h/2, w, h]
+     elif position == "top left":
+         bound = [0, 0, w/2, h/2]
+     elif position == "top right":
+         bound = [w/2, 0, w, h/2]
+     elif position == "bottom left":
+         bound = [0, h/2, w/2, h]
+     elif position == "bottom right":
+         bound = [w/2, h/2, w, h]
+     else:
+         bound = [0, 0, w, h]
+
+     if in_box(box, bound):
+         cropped_image = image.crop(box)
+         cropped_image.save(f"./temp/{i}.jpg")
+         return True
+     else:
+         return False
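For reference, a tiny hand-checked sanity test of the geometry helpers above (values computed by hand):

```python
# Self-contained check of calculate_size / calculate_iou from PCAgent.crop.
from PCAgent.crop import calculate_size, calculate_iou

box_a = [0, 0, 100, 100]
box_b = [50, 50, 150, 150]
print(calculate_size(box_a))        # 10000
print(calculate_iou(box_a, box_b))  # 2500 / 17500 ≈ 0.1429
```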
PCAgent/icon_localization.py ADDED
@@ -0,0 +1,59 @@
+ from PCAgent.crop import calculate_size, calculate_iou
+ from modelscope.pipelines import pipeline
+ from PIL import Image
+ import torch
+
+ def remove_boxes(boxes_filt, size, iou_threshold=0.5):
+     boxes_to_remove = set()
+
+     for i in range(len(boxes_filt)):
+         if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
+             boxes_to_remove.add(i)
+         for j in range(len(boxes_filt)):
+             if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
+                 boxes_to_remove.add(j)
+             if i == j:
+                 continue
+             if i in boxes_to_remove or j in boxes_to_remove:
+                 continue
+             iou = calculate_iou(boxes_filt[i], boxes_filt[j])
+             if iou >= iou_threshold:
+                 boxes_to_remove.add(j)
+
+     boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]
+
+     return boxes_filt
+
+
+ def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
+     image = Image.open(input_image_path)
+     size = image.size
+
+     caption = caption.lower()
+     caption = caption.strip()
+     if not caption.endswith('.'):
+         caption = caption + '.'
+
+     inputs = {
+         'IMAGE_PATH': input_image_path,
+         'TEXT_PROMPT': caption,
+         'BOX_TRESHOLD': box_threshold,
+         'TEXT_TRESHOLD': text_threshold
+     }
+
+     result = groundingdino_model(inputs)
+     boxes_filt = result['boxes']
+
+     H, W = size[1], size[0]
+     for i in range(boxes_filt.size(0)):
+         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+         boxes_filt[i][2:] += boxes_filt[i][:2]
+
+     boxes_filt = boxes_filt.cpu().int().tolist()
+     filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
+     coordinates = []
+     for box in filtered_boxes:
+         coordinates.append([box[0], box[1], box[2], box[3]])
+
+     return coordinates
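A sketch of how det() might be invoked. It assumes a GroundingDINO detection pipeline constructed elsewhere via ModelScope's pipeline(); the task name and model id below are placeholders, and the pipeline is assumed to accept the IMAGE_PATH/TEXT_PROMPT/BOX_TRESHOLD/TEXT_TRESHOLD dict that det() builds.

```python
# Hypothetical call; the pipeline task/model id are placeholders.
from modelscope.pipelines import pipeline
from PCAgent.icon_localization import det

groundingdino_model = pipeline('grounding-dino-task', model='some/groundingdino-model')  # placeholder ids
boxes = det("screenshot.png", "icon", groundingdino_model, box_threshold=0.05)
for x1, y1, x2, y2 in boxes:
    print(x1, y1, x2, y2)   # pixel-space [x1, y1, x2, y2] boxes after size/IoU filtering
```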
PCAgent/merge_strategy.py ADDED
@@ -0,0 +1,275 @@
+ import numpy as np
+
+
+ def calculate_iou(box1, box2):
+     x1_min, y1_min, x1_max, y1_max = box1
+     x2_min, y2_min, x2_max, y2_max = box2
+
+     inter_x_min = max(x1_min, x2_min)
+     inter_y_min = max(y1_min, y2_min)
+     inter_x_max = min(x1_max, x2_max)
+     inter_y_max = min(y1_max, y2_max)
+
+     inter_area = max(0, inter_x_max - inter_x_min) * max(0, inter_y_max - inter_y_min)
+     box1_area = (x1_max - x1_min) * (y1_max - y1_min)
+     box2_area = (x2_max - x2_min) * (y2_max - y2_min)
+
+     union_area = box1_area + box2_area - inter_area
+     iou = inter_area / union_area
+     return iou
+
+
+ def compute_iou(box1, box2):
+     """
+     Compute the Intersection over Union (IoU) of two bounding boxes.
+
+     Parameters:
+     - box1: list or array [x1, y1, x2, y2]
+     - box2: list or array [x1, y1, x2, y2]
+
+     Returns:
+     - iou: float, IoU value
+     """
+     x1_inter = max(box1[0], box2[0])
+     y1_inter = max(box1[1], box2[1])
+     x2_inter = min(box1[2], box2[2])
+     y2_inter = min(box1[3], box2[3])
+
+     # print(x2_inter, x1_inter, y2_inter, y1_inter)
+
+     inter_area = max(0, x2_inter - x1_inter + 1) * max(0, y2_inter - y1_inter + 1)
+
+     box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
+     box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
+
+     iou = inter_area / float(box1_area + box2_area - inter_area)
+
+     return iou
+
+
+ def merge_boxes(box1, box2):
+     x1_min, y1_min, x1_max, y1_max = box1
+     x2_min, y2_min, x2_max, y2_max = box2
+
+     merged_box = [min(x1_min, x2_min), min(y1_min, y2_min), max(x1_max, x2_max), max(y1_max, y2_max)]
+     return merged_box
+
+
+ def merge_boxes_and_texts(texts, boxes, iou_threshold=0):
+     """
+     Merge bounding boxes and their corresponding texts based on IoU threshold.
+
+     Parameters:
+     - texts: List of texts corresponding to each bounding box.
+     - boxes: List of bounding boxes, with each box represented as [x1, y1, x2, y2].
+     - iou_threshold: Intersection-over-Union threshold for merging boxes.
+
+     Returns:
+     - merged_texts: List of merged texts corresponding to the bounding boxes.
+     - merged_boxes: List of merged bounding boxes.
+     """
+     if len(boxes) == 0:
+         return [], []
+
+     # boxes = np.array(boxes)
+     merged_boxes = []
+     merged_texts = []
+
+     while len(boxes) > 0:
+         box = boxes[0]
+         text = texts[0]
+         boxes = boxes[1:]
+         texts = texts[1:]
+         to_merge_boxes = [box]
+         to_merge_texts = [text]
+         keep_boxes = []
+         keep_texts = []
+
+         for i, other_box in enumerate(boxes):
+             if compute_iou(box, other_box) > iou_threshold:
+                 to_merge_boxes.append(other_box)
+                 to_merge_texts.append(texts[i])
+             else:
+                 keep_boxes.append(other_box)
+                 keep_texts.append(texts[i])
+
+         # Merge the to_merge boxes into a single box
+         if len(to_merge_boxes) > 1:
+             x1 = min(b[0] for b in to_merge_boxes)
+             y1 = min(b[1] for b in to_merge_boxes)
+             x2 = max(b[2] for b in to_merge_boxes)
+             y2 = max(b[3] for b in to_merge_boxes)
+             merged_box = [x1, y1, x2, y2]
+             merged_text = " ".join(to_merge_texts)  # You can change the merging strategy here
+             merged_boxes.append(merged_box)
+             merged_texts.append(merged_text)
+         else:
+             merged_boxes.extend(to_merge_boxes)
+             merged_texts.extend(to_merge_texts)
+
+         # boxes = np.array(keep_boxes)
+         boxes = keep_boxes
+         texts = keep_texts
+
+     return merged_texts, merged_boxes
+
+
+ def is_contained(bbox1, bbox2):
+     x1_min, y1_min, x1_max, y1_max = bbox1
+     x2_min, y2_min, x2_max, y2_max = bbox2
+
+     if (x1_min >= x2_min and y1_min >= y2_min and x1_max <= x2_max and y1_max <= y2_max):
+         return True
+     elif (x2_min >= x1_min and y2_min >= y1_min and x2_max <= x1_max and y2_max <= y1_max):
+         return True
+     return False
+
+
+ def is_overlapping(bbox1, bbox2):
+     x1_min, y1_min, x1_max, y1_max = bbox1
+     x2_min, y2_min, x2_max, y2_max = bbox2
+
+     inter_xmin = max(x1_min, x2_min)
+     inter_ymin = max(y1_min, y2_min)
+     inter_xmax = min(x1_max, x2_max)
+     inter_ymax = min(y1_max, y2_max)
+
+     if inter_xmin < inter_xmax and inter_ymin < inter_ymax:
+         return True
+     return False
+
+
+ def get_area(bbox):
+     x_min, y_min, x_max, y_max = bbox
+     return (x_max - x_min) * (y_max - y_min)
+
+
+ def merge_all_icon_boxes(bboxes):
+     result_bboxes = []
+     while bboxes:
+         bbox = bboxes.pop(0)
+         to_add = True
+
+         for idx, existing_bbox in enumerate(result_bboxes):
+             if is_contained(bbox, existing_bbox):
+                 if get_area(bbox) > get_area(existing_bbox):
+                     result_bboxes[idx] = existing_bbox
+                 to_add = False
+                 break
+             elif is_overlapping(bbox, existing_bbox):
+                 if get_area(bbox) < get_area(existing_bbox):
+                     result_bboxes[idx] = bbox
+                 to_add = False
+                 break
+
+         if to_add:
+             result_bboxes.append(bbox)
+
+     return result_bboxes
+
+
+ def merge_all_icon_boxes_new(elements):
+     result_elements = []
+     while elements:
+         ele = elements.pop(0)
+         bbox = [ele['position'][0], ele['position'][1], ele['position'][0]+ele['size'][0], ele['position'][1]+ele['size'][1]]
+         # bbox = bboxes.pop(0)
+         to_add = True
+
+         for idx, existing_ele in enumerate(result_elements):
+             existing_bbox = [existing_ele['position'][0], existing_ele['position'][1], existing_ele['position'][0]+existing_ele['size'][0], existing_ele['position'][1]+existing_ele['size'][1]]
+             if is_contained(bbox, existing_bbox):
+                 if get_area(bbox) > get_area(existing_bbox):
+                     result_elements[idx] = existing_ele
+                 to_add = False
+                 break
+             elif is_overlapping(bbox, existing_bbox):
+                 if get_area(bbox) < get_area(existing_bbox):
+                     result_elements[idx] = ele
+                 to_add = False
+                 break
+
+         if to_add:
+             result_elements.append(ele)
+
+     return result_elements
+
+
+ def merge_bbox_groups(A, B, iou_threshold=0.8):
+     i = 0
+     while i < len(A):
+         box_a = A[i]
+         has_merged = False
+         for j in range(len(B)):
+             box_b = B[j]
+             iou = calculate_iou(box_a, box_b)
+             if iou > iou_threshold:
+                 merged_box = merge_boxes(box_a, box_b)
+                 A[i] = merged_box
+                 B.pop(j)
+                 has_merged = True
+                 break
+
+         if has_merged:
+             i -= 1
+         i += 1
+
+     return A, B
+
+
+ def bbox_iou(boxA, boxB):
+     # Calculate Intersection over Union (IoU) between two bounding boxes
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+     interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
+     boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
+     boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
+     iou = interArea / float(boxAArea + boxBArea - interArea)
+     return iou
+
+
+ def merge_boxes_and_texts_new(texts, bounding_boxes, iou_threshold=0):
+     if not bounding_boxes:
+         return [], []
+
+     bounding_boxes = np.array(bounding_boxes)
+     merged_boxes = []
+     merged_texts = []
+
+     used = np.zeros(len(bounding_boxes), dtype=bool)
+
+     for i, boxA in enumerate(bounding_boxes):
+         if used[i]:
+             continue
+         x_min, y_min, x_max, y_max = boxA
+         # text = texts[i]
+         text = ''
+
+         overlapping_indices = [i]  # []
+         for j, boxB in enumerate(bounding_boxes):
+             # print(i, j, bbox_iou(boxA, boxB))
+             if i != j and not used[j] and bbox_iou(boxA, boxB) > iou_threshold:
+                 overlapping_indices.append(j)
+
+         # Sort overlapping boxes by vertical position (top to bottom)
+         overlapping_indices.sort(key=lambda idx: (bounding_boxes[idx][1] + bounding_boxes[idx][3])/2)  # TODO
+
+         for idx in overlapping_indices:
+             boxB = bounding_boxes[idx]
+             x_min = min(x_min, boxB[0])
+             y_min = min(y_min, boxB[1])
+             x_max = max(x_max, boxB[2])
+             y_max = max(y_max, boxB[3])
+             # text += " " + texts[idx]
+             text += texts[idx]
+             used[idx] = True
+
+         merged_boxes.append([x_min, y_min, x_max, y_max])
+         merged_texts.append(text)
+         used[i] = True
+
+     return merged_texts, merged_boxes
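A small worked example for merge_boxes_and_texts_new with hand-picked coordinates: the first two boxes overlap and are merged top-to-bottom, the third stays separate.

```python
# Worked example; coordinates are illustrative only.
from PCAgent.merge_strategy import merge_boxes_and_texts_new

texts = ["File", " Edit", "Help"]
boxes = [[10, 10, 60, 30], [55, 12, 110, 32], [300, 10, 350, 30]]
merged_texts, merged_boxes = merge_boxes_and_texts_new(texts, boxes)
print(merged_texts)   # ['File Edit', 'Help']
print(merged_boxes)   # boxes equivalent to [[10, 10, 110, 32], [300, 10, 350, 30]] (entries may be numpy ints)
```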
PCAgent/prompt_qwen.py ADDED
@@ -0,0 +1,360 @@
+ # PC
+ def get_subtask_prompt_cn(instruction):
+     func_prompt = '''ๅคšๆจกๆ€agent้€š่ฟ‡ๆ‰ง่กŒ็‚นๅ‡ปใ€่พ“ๅ…ฅ็ญ‰ไธ€็ณปๅˆ—ๆ“ไฝœๆฅๅฎŒๆˆ็”จๆˆท็š„ๆŒ‡ไปคใ€‚
+ ็”จๆˆทๆŒ‡ไปคๅฏ่ƒฝ็”ฑ่ทจ่ถŠๅคšไธชๅบ”็”จ็จ‹ๅบ็š„ๆ•ฐไธชๅญไปปๅŠก็ป„ๆˆ๏ผŒๆˆ‘ๅธŒๆœ›ไฝ ่ƒฝๅฐ†่ฟ™ไธชๅคๆ‚็š„ๆŒ‡ไปค๏ผŒๅˆ†่งฃไธบไธ€ไบ›ๅญไปปๅŠก๏ผŒๅญไปปๅŠกๆœ‰4็ง็ฑปๅž‹๏ผš
+ 1. ๅธธ่ง„ๅญ—็ฌฆไธฒๅฝขๅผ๏ผšไพ‹ๅฆ‚โ€œๅœจ็ณป็ปŸ่ฎพ็ฝฎไธญ๏ผŒๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผโ€๏ผ›
+ 2. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๅญ—็ฌฆไธฒ๏ผšๅฝ“ๅ‰ๅญไปปๅŠก็š„ๆ‰ง่กŒ็ป“ๆžœ้œ€่ฆไปฅๅญ—ๅ…ธๆ–นๅผไผ ้€’็ป™ๅ…ถไป–ๅญไปปๅŠก๏ผŒไพ‹ๅฆ‚โ€œๅœจOutlookไธญ๏ผŒๆŸฅ็œ‹โ€˜Paulโ€™ๅ‘ๆฅ็š„้‚ฎไปถไฟกๆฏ๏ผŒไปฅdictๅฝขๅผ่พ“ๅ‡บ{'contact': 'Paul', 'mail_content': 'content of the email'}โ€๏ผ›
+ 3. ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผšๅˆฉ็”จๅ‰ๅบๅญไปปๅŠกไผ ้€’็š„ไฟกๆฏ๏ผŒ่กฅๅ…จๅฝ“ๅ‰ๅญไปปๅŠกๅŽ๏ผŒ่ƒฝๅคŸๅฎŒๅ…จ็‹ฌ็ซ‹ๆ‰ง่กŒ๏ผŒไพ‹ๅฆ‚โ€œๅฐ†{mail_content}้€š่ฟ‡็Ÿญไฟกๅ‘้€็ป™โ€˜Joeyโ€™โ€๏ผ›
+ 4. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผšๆ—ข้œ€่ฆๅ‰ๅบๅญไปปๅŠกไผ ้€’็š„ไฟกๆฏ๏ผŒไปฅ่กฅๅ…จๅฝ“ๅ‰ๅญไปปๅŠก๏ผŒๅŒๆ—ถๆ‰ง่กŒ็š„็ป“ๆžœไนŸ้œ€่ฆไปฅๅญ—ๅ…ธๆ–นๅผไผ ้€’็ป™ๅ…ถไป–ๅญไปปๅŠก๏ผŒไพ‹ๅฆ‚โ€œๅœจ่ฐทๆญŒไธญๆœ็ดข{question}๏ผŒๅนถๅฐ†็›ธๅ…ณไฟกๆฏไปฅdictๅฝขๅผ่พ“ๅ‡บ{'info': 'related information'}โ€ใ€‚
+
+ ไธพไพ‹ๆฅ่ฏด๏ผŒๅคๅˆๆŒ‡ไปคโ€œ็ณป็ปŸ่ฎพ็ฝฎไธญๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผ๏ผŒๅœจๅพฎไฟกไธญๆŸฅ็œ‹โ€˜Johnโ€™ๅ‘ๆฅ็š„้—ฎ้ข˜๏ผŒๅœจChromeไธญๆœ็ดข้—ฎ้ข˜็š„็ญ”ๆกˆ๏ผŒๅฐ†็ญ”ๆกˆๆทปๅŠ ๅˆฐไธ€ไธชๆ–ฐๅปบwordๆ–‡ๆกฃไธญ๏ผŒไฟๅญ˜ไธบโ€˜ไฝœไธš.docxโ€™๏ผŒ็„ถๅŽๅ‘้€็ป™โ€˜Johnโ€™ใ€‚โ€ๅฏไปฅ่ขซๅˆ†่งฃไธบ๏ผš
+ {
+ "subtask 1": "ๅœจ็ณป็ปŸ่ฎพ็ฝฎไธญ๏ผŒๆ‰“ๅผ€ๆทฑ่‰ฒๆจกๅผ",
+ "subtask 2": "ๅœจๅพฎไฟกไธญ๏ผŒๆŸฅ็œ‹โ€˜Johnโ€™ๅ‘ๆฅ็š„้—ฎ้ข˜๏ผŒๅฐ†้—ฎ้ข˜ไปฅdictๅฝขๅผ่พ“ๅ‡บ{'John_question': 'content of the question'}",
+ "subtask 3": "ๅœจChromeไธญ๏ผŒๆœ็ดข{John_question}๏ผŒๅฐ†ๆœ็ดขๅˆฐ็š„็ญ”ๆกˆไปฅdictๅฝขๅผ่พ“ๅ‡บ{'John_question_answer': 'answer to the question'}",
+ "subtask 4": "ๅœจWordไธญ๏ผŒๆ–ฐๅปบไธ€ไธชๆ–‡ๆกฃ๏ผŒๅ†™ๅ…ฅ{John_question_answer}๏ผŒๅนถไฟๅญ˜ไธบโ€˜ไฝœไธš.docxโ€™",
+ "subtask 5": "ๅœจๅพฎไฟกไธญ๏ผŒๅ‘้€โ€˜ไฝœไธš.docxโ€™็ป™โ€˜Johnโ€™"
+ }
+
+ ้œ€่ฆๆณจๆ„๏ผš
+ 1. ๅŒ…ๅซๅญ—ๅ…ธๅ†…ๅฎน็š„ๅญ—็ฌฆไธฒๆˆ–ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผŒ้œ€่ฆๅฐฝๅฏ่ƒฝ่ฏฆ็ป†ๅœฐ่ฏดๆ˜Ždictไธญๅ„ไธชkey็š„ๅซไน‰๏ผŒๅณๅฐ†ๅ“ชไบ›ๅ†…ๅฎนไปฅdict็š„ๅฝขๅผ่พ“ๅ‡บ๏ผ›
+ 2. ๆฏไธชๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒๅฝขๅผ็š„ๅญไปปๅŠกไธญๅŒ…ๅซ็š„key๏ผŒๅœจๅ‰ๅบๅญไปปๅŠกไธญ่ฆๆœ‰ๅฏนๅบ”็š„dictๅฝขๅผ่พ“ๅ‡บ๏ผŒไนŸๅฐฑๆ˜ฏ่ฏด๏ผŒๅ‰ๅบๅญไปปๅŠกๆ‰ง่กŒๅฎŒๆˆๅŽ๏ผŒไฟ่ฏๅฝ“ๅ‰ๅญไปปๅŠก่ƒฝๅคŸ้€š่ฟ‡ๅ‚ๆ•ฐไผ ้€’ๅพ—ๅˆฐ่กฅๅ…จ๏ผŒไปŽ่€Œๅฏไปฅ็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚
+ 3. ๅฟ…้กปไฟ่ฏ๏ผŒๆฏไธชๅญไปปๅŠก๏ผŒๆ— ่ฎบๆ˜ฏๅธธ่ง„ๅญ—็ฌฆไธฒ๏ผŒ่ฟ˜ๆ˜ฏ่กฅๅ…จไน‹ๅŽ็š„ๆ ผๅผๅŒ–ๅญ—็ฌฆไธฒ๏ผŒ่ƒฝๅคŸๅฎŒๅ…จ่„ฑ็ฆปๅ…ถไป–ๅญไปปๅŠก็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚ไพ‹ๅฆ‚โ€œๅœจWordไธญๆ–ฐๅปบไธ€ไธชๆ–‡ๆกฃ๏ผŒๅ†™ๅ…ฅ{John_question_answer}โ€ๅฏไปฅ็‹ฌ็ซ‹ๆ‰ง่กŒ๏ผŒไฝ†โ€œๅฐ†ไฟฎๆ”นๅŽ็š„Wordๆ–‡ๆกฃ้€š่ฟ‡้‚ฎไปถๅ‘้€็ป™{name}โ€ๅˆ™ๅ› ไธบโ€˜Wordๆ–‡ๆกฃโ€™ๆŒ‡ไปฃไธๆ˜Ž็กฎๆ— ๆณ•็‹ฌ็ซ‹ๆ‰ง่กŒใ€‚
+ 4. ๆ‹†่งฃๅŽ็š„ๆฏไธชๅญไปปๅŠก่ฆๆœ‰ๆ˜Ž็กฎ็š„ๅบ”็”จ็จ‹ๅบ๏ผŒไพ‹ๅฆ‚โ€˜ๅœจChromeไธญโ€™ใ€โ€˜ๅœจWordไธญโ€™็ญ‰ใ€‚ไธ€่ˆฌ่€Œ่จ€๏ผŒdocxๆ ผๅผๆ–‡ๆกฃ็”จWord็จ‹ๅบๆ‰“ๅผ€๏ผŒxlsxๆ ผๅผ่กจๆ ผ็”จExcel็จ‹ๅบๆ‰“ๅผ€ใ€‚ๆญคๅค–๏ผŒ้œ€่ฆๆ‰“ๅผ€ๆ–‡ไปถๆ—ถ๏ผŒ่ฆๆ˜Ž็กฎๆ–‡ไปถ็š„ๅๅญ—ใ€‚
+ '''
+
+     inst_prompt = '''
+ User Instruction:
+ {}
+ '''
+
+     format_prompt = '''
+ ่ฏทไฝ ๆŒ‰็…งๅฆ‚ไธ‹ๆ ผๅผ่พ“ๅ‡บๆ‹†ๅˆ†ๅŽ็š„ๅญไปปๅŠก๏ผš
+ {
+ "subtask 1": ,
+ "subtask 2": ,
+ ...
+ }
+ '''
+     prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
+     return prompt
+
+
+ def get_subtask_prompt(instruction):
+     func_prompt = '''A multi-modal agent completes a user's instruction by performing a series of actions such as clicking and typing. A user's instruction may consist of multiple subtasks across different applications. I want you to break down this complex instruction into several subtasks, which are of four types:
+
+ 1. Regular string: For example, "Open dark mode in system settings";
+ 2. String containing dictionary content: The result of the current subtask needs to be passed to other subtasks in a dictionary format, for example, "Check the emails from 'Paul' in Outlook and output the email details in a dict format like {'contact': 'Paul', 'mail_content': 'content of the email'}";
+ 3. Formatted string containing the keys from previous subtasks: Use the information from previous subtasks to complete and independently execute the current subtask, for example, "Send {mail_content} via SMS to 'Joey'". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''";
+ 4. Formatted string containing the keys from previous subtasks and the dictionary content: This requires both information from previous subtasks to complete the current subtask and the result also needs to be passed to other subtasks in a dictionary format, for example, "Search for {question} on Google and output the relevant information in a dict format like {'info': 'related information'}". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''".
+
+
+ For example, the compound instruction "Open dark mode in system settings, check the two questions sent by 'John' in WeChat, search for answers to these two questions in Chrome, add the answers to a new Word document, save it as 'assignment.docx', and then send it to 'John'." can be broken down into:
+ {
+ "subtask 1": "Open dark mode in system settings",
+ "subtask 2": "Check the questions sent by 'John' in WeChat and output the questions in a dict format {'John_question_1': 'content of John\'s question_1', 'John_question_2': 'content of John\'s question_2'}",
+ "subtask 3": "Search for {John_question_1} in Chrome and output the found answer in a dict format {'John_question_1_answer': 'answer to the question_1'}",
+ "subtask 4": "Search for {John_question_2} in Chrome and output the found answer in a dict format {'John_question_2_answer': 'answer to the question_2'}",
+ "subtask 5": "Create a new document in Word, write {John_question_1_answer} and {John_question_2_answer} sequentially, then save it as 'assignment.docx'",
+ "subtask 6": "Send 'assignment.docx' to 'John' via WeChat"
+ }
+
+ Notes:
+ 1. Strings or formatted strings containing dictionary content should explain as detailed as possible the meaning of each key in the dict, i.e., what content should be output in dict form;
+ 2. Each key in a formatted string subtask must have a corresponding dict output in preceding subtasks, ensuring that after a preceding subtask is completed, the current subtask can be fully completed through parameter passing and thus executed independently.
+ 3. Ensure each subtask, whether as a regular string or a completed formatted string, can be executed independently of other subtasks. For example, "Create a new document in Word and write {John_question_answer}" can be executed independently, but "Send the modified Word document via email to {name}" cannot because "Word document" is ambiguous and cannot be executed independently.
+ 4. Each subtask must specify a clear application, such as 'in Chrome' or 'in Word'. Generally, docx formatted documents are opened with Word, and xlsx spreadsheets are opened with Excel. Additionally, when opening a file, clearly state the file name.
+ 5. Note that if a subtask contains a dict, ensure that the values in the dictionary do not contain single quote characters to avoid format errors.
+ '''
+
+     inst_prompt = '''
+ User Instruction:
+ {}
+ '''
+
+     format_prompt = '''
+ Please output the split subtasks in the following format:
+ {
+ "subtask 1": ,
+ "subtask 2": ,
+ ...
+ }
+ '''
+     prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
+     return prompt
+
+
+ def get_select_prompt(content):
+     prompt_template = '''
+ Analyze the specified text range {} and output the first line and last line of the specified range separately.
+ How to identify paragraphs: There are 2 spaces at the beginning of each paragraph. Define the title as the single line at the top.
+ If the content has only one line (such as the title), it is both the first and last line.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_select_prompt_simple(content):
+     prompt_template = '''
+ Analyze the text range of this part of the current Word document: {}, and output the content of the first and last lines separately.
+ If the content has only one line in total, this line is the first line and also the last line.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_select_prompt_backup(content):
+     prompt_template = '''
+ Directly output the first line and the last line of the content: {} in the currently shown Microsoft Word document. If the content has only one line, output this line twice.'''
+
+     prompt_format = '''
+ You should respond in the following format:
+ <first>The content of the first line</first>
+ <last>The content of the last line</last>
+ '''
+     prompt = prompt_template.format(content) + prompt_format
+     return prompt
+
+
+ def get_action_prompt(instruction, clickable_infos, width, height, thought_history, summary_history, action_history, reflection_history, last_summary, last_action, reflection_thought, add_info, error_flag, completed_content, memory):
+     prompt = "### Background ###\n"
+     prompt += f"This image is a computer screenshot where icons are marked with numbers. Its width is {width} pixels and its height is {height} pixels. The user's instruction is: {instruction}.\n\n"
+
+     prompt += "### Tips ###\n"
+     prompt += add_info
+     prompt += "\n\n"
+
+     prompt += "### Screenshot information ###\n"
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
+     prompt += "This information consists of two parts: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
+     prompt += "the content is a text or 'icon' respectively. "
+     prompt += "The information is as follows:\n"
+
+     for clickable_info in clickable_infos:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+
+     if len(action_history) > 0:
+         prompt += "### History operations ###\n"
+         prompt += "Before arriving at the current screenshot, you have completed the following operations:\n"
+         for i in range(len(action_history)):
+             if len(reflection_history) > 0:
+                 prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "; Reflection: " + reflection_history[i] + "]\n"
+             else:
+                 prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+     if completed_content != "":
+         prompt += "### Progress ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+     if memory != "":
+         prompt += "### Memory ###\n"
+         prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
+         prompt += "Memory:\n" + memory + "\n"
+
+     # disabled
+     if error_flag:
+         prompt += "### Last operation ###\n"
+         prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
+         prompt += "\n\n"
+         print(f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time.")
+
+     prompt += "### Task requirements ###\n"
+     prompt += "In order to meet the user's requirements, you need to select one of the following operations to operate on the current screen:\n"
+     prompt += "Note that to open an app, use the Open App action, rather than tapping the app's icon. "
+     prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices."
+     prompt += "You must choose one of the actions below:\n"
+     prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'."
+     prompt += "Right Tap (x, y): Right tap the position (x, y) in the current page. This can be used to create a new file.\n"
+     prompt += "Tap (x, y): Tap the position (x, y) in the current page. This can be used to select an item.\n"
+     prompt += "Double Tap (x, y): Double tap the position (x, y) in the current page. This can be used to open a file. If Tap (x, y) in the last step doesn't work, you can try double tapping the position (x, y) in the current page.\n"
+
+     prompt += '''
+ Shortcut (key1, key2): There are several shortcuts (key1+key2) you may use.
+ For example, if you can't find the download button, use command+s to save the page or download the file.
+ To select all, you can use command+a.
+ To create a new file in Word/Excel, you can use command+n.
+ To create a new tab for starting a new search in Chrome, you can use command+t.
+ To copy an item, you can first select it and then use command+c.
+ To paste the copied item, you can first select the location you want to paste it to, and then use command+v.
+ '''
+     prompt += '''
+ Press (key name): There are several keys that may help.
+ For example, if you want to delete the selected content, press 'backspace'.
+ You can press 'enter' to confirm, submit the input command, or insert a line break.
+ Also, you can press 'up', 'down', 'left', or 'right' to scroll the page or adjust the position of the selected object.
+ '''
+
+     prompt += "Type (x, y), (text): Tap the position (x, y) and type the \"text\" in the input box and press the enter key. You should replace the \"text\" with the actual input.\n"
+
+     prompt += "Select (content): Select the referred 'content' in the current document, such as 'title', 'the second paragraph' and 'the last two paragraphs'. This action is useful when you want to edit a certain part of the document, such as bolding, adding underlines, changing line spacing, centering text, etc.\n"
+     prompt += "Replace (x, y), (text): Replace the editable content in (x, y) with the \"text\". You should replace the \"text\" with the actual input. This action is very useful when you want to start a new search in Chrome or rename a file.\n"
+     prompt += "Append (x, y), (text): Append the \"text\" content after the content at the (x, y) location. This action is useful when you want to append new content into a Word document.\n"
+
+     prompt += "Tell (answer): Tell me the answer of the input query.\n"
+     prompt += "Stop: If all the operations to meet the user's requirements have been completed in ### History operations ###, use this operation to stop the whole process."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     # modified 2.10
+     prompt += "You should output in the following json format:"
+     prompt += '''
+ {"Thought": "This is your thinking about how to proceed with the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
+ '''
+     prompt += "\n\n"
+
+     return prompt
+
+
+ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info, no_image=0):
+     if no_image == 1:
+         prompt = f"The computer screen's width is {width} pixels and the height is {height} pixels.\n\n"
+     else:
+         prompt = f"These images are two computer screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"
+
+     prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot. "
+     prompt += "The information consists of two parts, in the format: coordinates; content. "
+     prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. "
+     prompt += "\n\n"
+
+     prompt += "### Before the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos1:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n\n"
+
+     prompt += "### After the current operation ###\n"
+     prompt += "Screenshot information:\n"
+     for clickable_info in clickable_infos2:
+         if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
+             prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
+     prompt += "\n\n"
+
+     prompt += "### Current operation ###\n"
+     prompt += f"The user's instruction is: {instruction}."
+     if add_info != "":
+         prompt += f"You also need to note the following requirements: {add_info}."
+     prompt += "In the process of completing the requirements of the instruction, an operation is performed on the computer. Below are the details of this operation:\n"
+     prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
+     prompt += "Operation action: " + action
+     prompt += "\n\n"
+
+     prompt += "### Response requirements ###\n"
+     if no_image == 1:
+         prompt += "Now you need to output the following content based on the screenshot information before and after the current operation:\n"
+     else:
+         prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
+     prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
+     prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
+     prompt += "B: The \"Operation action\" results in a wrong page and I need to do something to correct this.\n"
+     prompt += "C: The \"Operation action\" produces no changes."
+     prompt += "\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Thought ###\nYour thought about the question\n"
+     prompt += "### Answer ###\nA or B or C"
+
+     return prompt
+
+
+ def get_memory_prompt(insight):
+     if insight != "":
+         prompt = "### Important content ###\n"
+         prompt += insight
+         prompt += "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     else:
+         prompt = "### Response requirements ###\n"
+         prompt += "Please think about whether there is any content closely related to the user's instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"
+
+     prompt += "### Output format ###\n"
+     prompt += "Your output format is:\n"
+     prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."
+
+     return prompt
+
+
+ def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=[]):
+     prompt = "### Background ###\n"
+     prompt += f"There is a user's instruction which is: {instruction}. You are a computer operating assistant and are operating the user's computer.\n\n"
+
+     if add_info != "":
+         prompt += "### Hint ###\n"
+         prompt += "There are hints to help you complete the user's instructions. The hints are as follows:\n"
+         prompt += add_info
+         prompt += "\n\n"
+
+     if len(thought_history) > 1:
+         prompt += "### History operations ###\n"
+         prompt += "To complete the requirements of the user's instruction, you have performed a series of operations. These operations are as follows:\n"
+         for i in range(len(summary_history)):
+             operation = summary_history[i].split(" to ")[0].strip()
+             if len(reflection_history) > 0:
+                 prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "; Operation reflection: " + reflection_history[i] + "]\n"
+             else:
+                 prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
+         prompt += "\n"
+
+         prompt += "### Progress thinking ###\n"
+         prompt += "After completing the history operations, you have the following thoughts about the progress of user's instruction completion:\n"
+         prompt += "Completed contents:\n" + completed_content + "\n\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nUpdated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."
+
+     else:
+         prompt += "### Current operation ###\n"
+         prompt += "To complete the requirements of the user's instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
+         prompt += f"Operation thought: {thought_history[-1]}\n"
+         operation = summary_history[-1].split(" to ")[0].strip()
+         if len(reflection_history) > 0:
+             prompt += f"Operation action: {operation}\n" + "Operation reflection: " + reflection_history[-1] + "\n\n"
+         else:
+             prompt += f"Operation action: {operation}\n\n"
+
+         # if reflection_thought is not None:
+         #     prompt += "A reflection model was adopted to analyze whether the last step's operation meets the expectation, you should combine its reflection thought to produce the \"Completed contents\"."
+         #     prompt += "Below is its reflection thought:\n"
+         #     prompt += reflection_thought + "\n"
+
+         prompt += "### Response requirements ###\n"
+         prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
+         prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of the user's instruction, and then summarize the contents that have been completed.\n\n"
+
+         prompt += "### Output format ###\n"
+         prompt += "Your output format is:\n"
+         prompt += "### Completed contents ###\nGenerated Completed contents. Don't output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
+         prompt += "(Please use English to output)"
+
+     return prompt
PCAgent/text_localization.py ADDED
@@ -0,0 +1,70 @@
+ import re
+ import os
+ import logging
+ import os
+ from alibabacloud_tea_util import models as util_models
+ from alibabacloud_tea_openapi import models as open_api_models
+ from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models
+ from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client
+
+ class Sample:
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def create_client() -> ocr_api20210707Client:
+         config = open_api_models.Config(
+             access_key_id=os.environ.get('OCR_ACCESS_KEY_ID'),
+             access_key_secret=os.environ.get('OCR_ACCESS_KEY_SECRET'),
+         )
+         config.endpoint = f'ocr-api.cn-hangzhou.aliyuncs.com'
+         return ocr_api20210707Client(config)
+
+     @staticmethod
+     def main(image) -> None:
+         client = Sample.create_client()
+         recognize_all_text_request = ocr_api_20210707_models.RecognizeAllTextRequest(
+             body=image,
+             type='Advanced',
+             output_coordinate='points',
+             output_oricoord=True,
+         )
+         runtime = util_models.RuntimeOptions()
+         output = client.recognize_all_text_with_options(recognize_all_text_request, runtime)
+         # logger.info(f'ocr response: {output}', extra={'request_id': ""})
+         output = output.body.data.sub_images[0].block_info.block_details
+         return output
+
+ def image_to_binary(image_path):
+     with open(image_path, 'rb') as file:
+         binary_data = file.read()
+     return binary_data
+
+ def remove_punctuation(text):
+     # Use regular expressions to strip punctuation, underscores, and whitespace
+     cleaned_text = re.sub(r'[^\w\s]', '', text)      # remove punctuation
+     cleaned_text = re.sub(r'_', '', cleaned_text)    # remove underscores
+     cleaned_text = re.sub(r'\s', '', cleaned_text)   # remove whitespace
+     return cleaned_text.replace("v", "").replace("o", "").replace("O", "").replace("T", "").replace("Q", "").replace("ไธถ", "")
+
+
+ class OCRError(Exception):
+     def __init__(self, message):
+         super().__init__(message)
+         self.message = message
+
+ def ocr(image_path):
+     text = []
+     coordinate = []
+     image = image_to_binary(image_path)
+     print(image_path)
+     try:
+         outputs = Sample.main(image)
+     except Exception as e:
+         raise OCRError(str(e))
+     for output in outputs:
+         text.append(output.block_content)
+         bbox = [int(output.block_points[0].x), int(output.block_points[0].y), int(output.block_points[2].x), int(output.block_points[2].y)]
+         coordinate.append(bbox)
+
+     return text, coordinate
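A hedged usage sketch for the OCR wrapper above; it requires valid Alibaba Cloud OCR credentials in the environment, and the path and key values below are placeholders.

```python
# Hypothetical usage of PCAgent.text_localization.ocr; credentials and path are placeholders.
import os
from PCAgent.text_localization import ocr

os.environ["OCR_ACCESS_KEY_ID"] = "..."       # placeholder
os.environ["OCR_ACCESS_KEY_SECRET"] = "..."   # placeholder

texts, boxes = ocr("screenshot.png")
for t, (x1, y1, x2, y2) in zip(texts, boxes):
    print(t, (x1, y1, x2, y2))                # recognized text with its [x1, y1, x2, y2] box
```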
PCAgent/text_localization_old.py ADDED
@@ -0,0 +1,61 @@
+ import cv2
+ import numpy as np
+ from PCAgent.crop import crop_image, calculate_size
+ from PIL import Image
+
+
+ def order_point(coor):
+     arr = np.array(coor).reshape([4, 2])
+     sum_ = np.sum(arr, 0)
+     centroid = sum_ / arr.shape[0]
+     theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
+     sort_points = arr[np.argsort(theta)]
+     sort_points = sort_points.reshape([4, -1])
+     if sort_points[0][0] > centroid[0]:
+         sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
+     sort_points = sort_points.reshape([4, 2]).astype('float32')
+     return sort_points
+
+
+ def longest_common_substring_length(str1, str2):
+     m = len(str1)
+     n = len(str2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if str1[i - 1] == str2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+             else:
+                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+     return dp[m][n]
+
+
+ def ocr(image_path, ocr_detection, ocr_recognition):
+     text_data = []
+     coordinate = []
+
+     image_full = cv2.imread(image_path)
+     try:
+         det_result = ocr_detection(image_full)
+     except:
+         print('no text detected')
+         return ['no text'], [[0, 0, 0, 0]]
+     det_result = det_result['polygons']
+     for i in range(det_result.shape[0]):
+         pts = order_point(det_result[i])
+         image_crop = crop_image(image_full, pts)
+
+         try:
+             result = ocr_recognition(image_crop)['text'][0]
+         except:
+             continue
+
+         box = [int(e) for e in list(pts.reshape(-1))]
+         box = [box[0], box[1], box[4], box[5]]
+
+         text_data.append(result)
+         coordinate.append(box)
+
+     return text_data, coordinate
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: PC Agent
+ emoji: ๐Ÿ’ฌ
+ colorFrom: yellow
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.0.1
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
+ ---
+
+ An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -7,6 +7,7 @@ import copy
import shutil
import base64
import random
+ import requests
import gradio as gr
from datetime import datetime
from modelscope.pipelines import pipeline
@@ -27,8 +28,18 @@ API_url = os.environ.get('API_url')
token = os.environ.get('token')
os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
+ tff_file = os.environ.get('tff_file')
radius = 100

+ def download_file(url, save_path):
+     response = requests.get(url, stream=True)  # download as a stream
+     response.raise_for_status()  # make sure the request succeeded
+     with open(save_path, 'wb') as file:
+         for chunk in response.iter_content(chunk_size=8192):  # write in chunks to avoid holding the whole file in memory
+             file.write(chunk)
+
+ download_file(tff_file, "font/arial.ttf")
+
chatbot_css = """
<style>
.chat-container {
@@ -287,7 +298,7 @@ def chatbot(image, instruction, add_info, history, chat_log):
    screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
    image.save(screenshot_file, format="PNG")
    screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
-     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="C:/Windows/Fonts/arial.ttf")
+     perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="font/arial.ttf")
    shutil.rmtree(temp_file)
    os.mkdir(temp_file)

example/1-1.jpg ADDED
example/1-2.jpg ADDED
example/1-3.jpg ADDED