้ณๆธ committed
Commit · 3afb4b6
1 Parent(s): e78b889
Update
Browse files
- PCAgent/__pycache__/api.cpython-310.pyc +0 -0
- PCAgent/__pycache__/chat.cpython-310.pyc +0 -0
- PCAgent/__pycache__/crop.cpython-310.pyc +0 -0
- PCAgent/__pycache__/icon_localization.cpython-310.pyc +0 -0
- PCAgent/__pycache__/merge_strategy.cpython-310.pyc +0 -0
- PCAgent/__pycache__/prompt_qwen.cpython-310.pyc +0 -0
- PCAgent/__pycache__/text_localization.cpython-310.pyc +0 -0
- PCAgent/api.py +77 -0
- PCAgent/chat.py +123 -0
- PCAgent/crop.py +120 -0
- PCAgent/icon_localization.py +59 -0
- PCAgent/merge_strategy.py +275 -0
- PCAgent/prompt_qwen.py +360 -0
- PCAgent/text_localization.py +70 -0
- PCAgent/text_localization_old.py +61 -0
- README.md +14 -0
- app.py +12 -1
- example/1-1.jpg +0 -0
- example/1-2.jpg +0 -0
- example/1-3.jpg +0 -0
PCAgent/__pycache__/api.cpython-310.pyc
ADDED
Binary file (1.47 kB)
PCAgent/__pycache__/chat.cpython-310.pyc
ADDED
Binary file (2.45 kB)
PCAgent/__pycache__/crop.cpython-310.pyc
ADDED
Binary file (3.18 kB)
PCAgent/__pycache__/icon_localization.cpython-310.pyc
ADDED
Binary file (1.75 kB)
PCAgent/__pycache__/merge_strategy.cpython-310.pyc
ADDED
Binary file (6.25 kB)
PCAgent/__pycache__/prompt_qwen.cpython-310.pyc
ADDED
Binary file (20.5 kB)
PCAgent/__pycache__/text_localization.cpython-310.pyc
ADDED
Binary file (2.87 kB)
PCAgent/api.py
ADDED
@@ -0,0 +1,77 @@
import base64
import requests
import time

import pdb
import dashscope
from dashscope import MultiModalConversation

from PIL import Image
import io
from openai import OpenAI
import json

def resize_encode_image(image_path, screen_scale_ratio=1):
    with Image.open(image_path) as img:
        new_width = int(img.width * screen_scale_ratio)
        new_height = int(img.height * screen_scale_ratio)
        resized_img = img.resize((new_width, new_height), Image.LANCZOS)

        buffered = io.BytesIO()
        resized_img.save(buffered, format="PNG")

        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return img_base64
    # with open(image_path, "rb") as image_file:
    #     return base64.b64encode(image_file.read()).decode('utf-8')


def inference_chat(chat, model, api_url, token):
    messages = []
    for role, content in chat:
        messages.append({"role": role, "content": content})

    client = OpenAI(
        # If no environment variable is configured, replace the next line with your Bailian (Model Studio) API key: api_key="sk-xxx",
        api_key=token,
        base_url=api_url,
    )

    num_try = 5
    for _ in range(num_try):
        try:
            completion = client.chat.completions.create(
                model=model,  # qwen-plus is used here as an example; replace the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
                messages=messages
            )
        except:
            print("Network Error:")
            try:
                print(completion.model_dump_json())
            except:
                print("Request Failed")
            time.sleep(2)
        else:
            break

    return json.loads(completion.model_dump_json())['choices'][0]['message']['content']

    # headers = {
    #     "Content-Type": "application/json",
    #     "Authorization": f"Bearer {token}"
    # }

    # data = {
    #     "model": model,
    #     "messages": [],
    #     "max_tokens": 2048,
    #     'temperature': 0.0,
    #     "seed": 1234
    # }
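A minimal usage sketch (not part of the commit) showing how resize_encode_image and inference_chat fit together; the endpoint URL, model name, and API key below are placeholder assumptions, and the message layout mirrors the OpenAI-compatible multimodal format used elsewhere in this repo.

# Hypothetical example; base_url, model, and API key are placeholders.
from PCAgent.api import resize_encode_image, inference_chat

img_b64 = resize_encode_image("screenshot.png", screen_scale_ratio=0.5)  # downscale before encoding
chat = [
    ("system", [{"type": "text", "text": "You are a helpful AI PC operating assistant."}]),
    ("user", [
        {"type": "text", "text": "Describe this screenshot."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
    ]),
]
answer = inference_chat(
    chat,
    model="qwen-vl-plus",                                         # placeholder model name
    api_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # placeholder endpoint
    token="sk-xxx",                                               # placeholder API key
)
print(answer)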
PCAgent/chat.py
ADDED
@@ -0,0 +1,123 @@
import copy
from PCAgent.api import resize_encode_image


def init_subtask_chat():
    operation_history = []
    system_prompt = "You are a helpful AI assistant."
    operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
    return operation_history


def init_action_chat():
    operation_history = []
    system_prompt = "You are a helpful AI PC operating assistant. You need to help me operate the PC to complete the user\'s instruction."
    operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
    return operation_history


def init_reflect_chat():
    operation_history = []
    system_prompt = "You are a helpful AI PC operating assistant."
    operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
    return operation_history


def init_memory_chat():
    operation_history = []
    system_prompt = "You are a helpful AI PC operating assistant."
    operation_history.append(["system", [{"type": "text", "text": system_prompt}]])
    return operation_history


def add_response_old(role, prompt, chat_history, image=None):
    new_chat_history = copy.deepcopy(chat_history)
    if image:
        base64_image = resize_encode_image(image)
        content = [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            },
        ]
    else:
        content = [
            {
                "type": "text",
                "text": prompt
            },
        ]
    new_chat_history.append([role, content])
    return new_chat_history


def add_response(role, prompt, chat_history, image=[], use_qwen=False):
    new_chat_history = copy.deepcopy(chat_history)
    content = [
        {
            "type": "text",
            "text": prompt
        },
    ]
    for i in range(len(image)):
        if not use_qwen:
            base64_image = resize_encode_image(image[i])
            content.append(
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            )
        else:
            content.append(
                {
                    "type": "image",
                    "image": image[i]
                }
            )
    new_chat_history.append([role, content])
    return new_chat_history


def add_response_two_image(role, prompt, chat_history, image):
    new_chat_history = copy.deepcopy(chat_history)

    base64_image1 = resize_encode_image(image[0])
    base64_image2 = resize_encode_image(image[1])
    content = [
        {
            "type": "text",
            "text": prompt
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image1}"
            }
        },
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image2}"
            }
        },
    ]

    new_chat_history.append([role, content])
    return new_chat_history


def print_status(chat_history):
    print("*"*100)
    for chat in chat_history:
        print("role:", chat[0])
        print(chat[1][0]["text"] + "<image>"*(len(chat[1])-1) + "\n")
    print("*"*100)
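A short sketch (not part of the commit) of how the chat helpers above are typically composed: initialize a history, append a text-plus-screenshot turn with add_response, and hand the result to inference_chat from PCAgent/api.py. The model, endpoint, and key are placeholders.

# Hypothetical composition of the chat.py helpers; model/api_url/token are placeholders.
from PCAgent.chat import init_action_chat, add_response, print_status
from PCAgent.api import inference_chat

history = init_action_chat()                               # system turn only
history = add_response("user", "What should I click next?", history,
                       image=["screenshot.png"])           # attaches the image as a data URL
print_status(history)                                      # inspect the accumulated turns
reply = inference_chat(history, model="qwen-vl-plus", api_url="<endpoint>", token="<key>")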
PCAgent/crop.py
ADDED
@@ -0,0 +1,120 @@
import math
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont


def crop_image(img, position):
    def distance(x1, y1, x2, y2):
        return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2))
    position = position.tolist()
    for i in range(4):
        for j in range(i+1, 4):
            if(position[i][0] > position[j][0]):
                tmp = position[j]
                position[j] = position[i]
                position[i] = tmp
    if position[0][1] > position[1][1]:
        tmp = position[0]
        position[0] = position[1]
        position[1] = tmp

    if position[2][1] > position[3][1]:
        tmp = position[2]
        position[2] = position[3]
        position[3] = tmp

    x1, y1 = position[0][0], position[0][1]
    x2, y2 = position[2][0], position[2][1]
    x3, y3 = position[3][0], position[3][1]
    x4, y4 = position[1][0], position[1][1]

    corners = np.zeros((4, 2), np.float32)
    corners[0] = [x1, y1]
    corners[1] = [x2, y2]
    corners[2] = [x4, y4]
    corners[3] = [x3, y3]

    img_width = distance((x1+x4)/2, (y1+y4)/2, (x2+x3)/2, (y2+y3)/2)
    img_height = distance((x1+x2)/2, (y1+y2)/2, (x4+x3)/2, (y4+y3)/2)

    corners_trans = np.zeros((4, 2), np.float32)
    corners_trans[0] = [0, 0]
    corners_trans[1] = [img_width - 1, 0]
    corners_trans[2] = [0, img_height - 1]
    corners_trans[3] = [img_width - 1, img_height - 1]

    transform = cv2.getPerspectiveTransform(corners, corners_trans)
    dst = cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
    return dst


def calculate_size(box):
    return (box[2]-box[0]) * (box[3]-box[1])


def calculate_iou(box1, box2):
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    unionArea = box1Area + box2Area - interArea
    iou = interArea / unionArea

    return iou


def crop(image, box, i, text_data=None):
    image = Image.open(image)

    if text_data:
        draw = ImageDraw.Draw(image)
        draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
        # font_size = int((text_data[3] - text_data[1])*0.75)
        # font = ImageFont.truetype("arial.ttf", font_size)
        # draw.text((text_data[0]+5, text_data[1]+5), str(i), font=font, fill="red")

    cropped_image = image.crop(box)
    cropped_image.save(f"./temp/{i}.jpg")


def in_box(box, target):
    if (box[0] > target[0]) and (box[1] > target[1]) and (box[2] < target[2]) and (box[3] < target[3]):
        return True
    else:
        return False


def crop_for_clip(image, box, i, position):
    image = Image.open(image)
    w, h = image.size
    if position == "left":
        bound = [0, 0, w/2, h]
    elif position == "right":
        bound = [w/2, 0, w, h]
    elif position == "top":
        bound = [0, 0, w, h/2]
    elif position == "bottom":
        bound = [0, h/2, w, h]
    elif position == "top left":
        bound = [0, 0, w/2, h/2]
    elif position == "top right":
        bound = [w/2, 0, w, h/2]
    elif position == "bottom left":
        bound = [0, h/2, w/2, h]
    elif position == "bottom right":
        bound = [w/2, h/2, w, h]
    else:
        bound = [0, 0, w, h]

    if in_box(box, bound):
        cropped_image = image.crop(box)
        cropped_image.save(f"./temp/{i}.jpg")
        return True
    else:
        return False
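A small worked example (not part of the commit) of the box utilities above: calculate_size and calculate_iou on two partially overlapping boxes, and in_box checking strict containment. The coordinates are illustrative only.

# Illustrative values; boxes are [x1, y1, x2, y2] in pixels.
from PCAgent.crop import calculate_size, calculate_iou, in_box

a = [0, 0, 100, 100]
b = [50, 50, 150, 150]
print(calculate_size(a))             # 10000
print(calculate_iou(a, b))           # 2500 / 17500 ≈ 0.143
print(in_box([10, 10, 40, 40], a))   # True: strictly inside a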
PCAgent/icon_localization.py
ADDED
@@ -0,0 +1,59 @@
from PCAgent.crop import calculate_size, calculate_iou
from modelscope.pipelines import pipeline
from PIL import Image
import torch

def remove_boxes(boxes_filt, size, iou_threshold=0.5):
    boxes_to_remove = set()

    for i in range(len(boxes_filt)):
        if calculate_size(boxes_filt[i]) > 0.05*size[0]*size[1]:
            boxes_to_remove.add(i)
        for j in range(len(boxes_filt)):
            if calculate_size(boxes_filt[j]) > 0.05*size[0]*size[1]:
                boxes_to_remove.add(j)
            if i == j:
                continue
            if i in boxes_to_remove or j in boxes_to_remove:
                continue
            iou = calculate_iou(boxes_filt[i], boxes_filt[j])
            if iou >= iou_threshold:
                boxes_to_remove.add(j)

    boxes_filt = [box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove]

    return boxes_filt


def det(input_image_path, caption, groundingdino_model, box_threshold=0.05, text_threshold=0.5):
    image = Image.open(input_image_path)
    size = image.size

    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith('.'):
        caption = caption + '.'

    inputs = {
        'IMAGE_PATH': input_image_path,
        'TEXT_PROMPT': caption,
        'BOX_TRESHOLD': box_threshold,
        'TEXT_TRESHOLD': text_threshold
    }

    result = groundingdino_model(inputs)
    boxes_filt = result['boxes']

    H, W = size[1], size[0]
    for i in range(boxes_filt.size(0)):
        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
        boxes_filt[i][2:] += boxes_filt[i][:2]

    boxes_filt = boxes_filt.cpu().int().tolist()
    filtered_boxes = remove_boxes(boxes_filt, size)  # [:9]
    coordinates = []
    for box in filtered_boxes:
        coordinates.append([box[0], box[1], box[2], box[3]])

    return coordinates
PCAgent/merge_strategy.py
ADDED
@@ -0,0 +1,275 @@
import numpy as np


def calculate_iou(box1, box2):
    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    inter_x_min = max(x1_min, x2_min)
    inter_y_min = max(y1_min, y2_min)
    inter_x_max = min(x1_max, x2_max)
    inter_y_max = min(y1_max, y2_max)

    inter_area = max(0, inter_x_max - inter_x_min) * max(0, inter_y_max - inter_y_min)
    box1_area = (x1_max - x1_min) * (y1_max - y1_min)
    box2_area = (x2_max - x2_min) * (y2_max - y2_min)

    union_area = box1_area + box2_area - inter_area
    iou = inter_area / union_area
    return iou


def compute_iou(box1, box2):
    """
    Compute the Intersection over Union (IoU) of two bounding boxes.

    Parameters:
    - box1: list or array [x1, y1, x2, y2]
    - box2: list or array [x1, y1, x2, y2]

    Returns:
    - iou: float, IoU value
    """
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])

    # print(x2_inter, x1_inter, y2_inter, y1_inter)

    inter_area = max(0, x2_inter - x1_inter + 1) * max(0, y2_inter - y1_inter + 1)

    box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
    box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)

    iou = inter_area / float(box1_area + box2_area - inter_area)

    return iou


def merge_boxes(box1, box2):
    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    merged_box = [min(x1_min, x2_min), min(y1_min, y2_min), max(x1_max, x2_max), max(y1_max, y2_max)]
    return merged_box


def merge_boxes_and_texts(texts, boxes, iou_threshold=0):
    """
    Merge bounding boxes and their corresponding texts based on IoU threshold.

    Parameters:
    - boxes: List of bounding boxes, with each box represented as [x1, y1, x2, y2].
    - texts: List of texts corresponding to each bounding box.
    - iou_threshold: Intersection-over-Union threshold for merging boxes.

    Returns:
    - merged_boxes: List of merged bounding boxes.
    - merged_texts: List of merged texts corresponding to the bounding boxes.
    """
    if len(boxes) == 0:
        return [], []

    # boxes = np.array(boxes)
    merged_boxes = []
    merged_texts = []

    while len(boxes) > 0:
        box = boxes[0]
        text = texts[0]
        boxes = boxes[1:]
        texts = texts[1:]
        to_merge_boxes = [box]
        to_merge_texts = [text]
        keep_boxes = []
        keep_texts = []

        for i, other_box in enumerate(boxes):
            if compute_iou(box, other_box) > iou_threshold:
                to_merge_boxes.append(other_box)
                to_merge_texts.append(texts[i])
            else:
                keep_boxes.append(other_box)
                keep_texts.append(texts[i])

        # Merge the to_merge boxes into a single box
        if len(to_merge_boxes) > 1:
            x1 = min(b[0] for b in to_merge_boxes)
            y1 = min(b[1] for b in to_merge_boxes)
            x2 = max(b[2] for b in to_merge_boxes)
            y2 = max(b[3] for b in to_merge_boxes)
            merged_box = [x1, y1, x2, y2]
            merged_text = " ".join(to_merge_texts)  # You can change the merging strategy here
            merged_boxes.append(merged_box)
            merged_texts.append(merged_text)
        else:
            merged_boxes.extend(to_merge_boxes)
            merged_texts.extend(to_merge_texts)

        # boxes = np.array(keep_boxes)
        boxes = keep_boxes
        texts = keep_texts

    return merged_texts, merged_boxes


def is_contained(bbox1, bbox2):
    x1_min, y1_min, x1_max, y1_max = bbox1
    x2_min, y2_min, x2_max, y2_max = bbox2

    if (x1_min >= x2_min and y1_min >= y2_min and x1_max <= x2_max and y1_max <= y2_max):
        return True
    elif (x2_min >= x1_min and y2_min >= y1_min and x2_max <= x1_max and y2_max <= y1_max):
        return True
    return False


def is_overlapping(bbox1, bbox2):
    x1_min, y1_min, x1_max, y1_max = bbox1
    x2_min, y2_min, x2_max, y2_max = bbox2

    inter_xmin = max(x1_min, x2_min)
    inter_ymin = max(y1_min, y2_min)
    inter_xmax = min(x1_max, x2_max)
    inter_ymax = min(y1_max, y2_max)

    if inter_xmin < inter_xmax and inter_ymin < inter_ymax:
        return True
    return False


def get_area(bbox):
    x_min, y_min, x_max, y_max = bbox
    return (x_max - x_min) * (y_max - y_min)


def merge_all_icon_boxes(bboxes):
    result_bboxes = []
    while bboxes:
        bbox = bboxes.pop(0)
        to_add = True

        for idx, existing_bbox in enumerate(result_bboxes):
            if is_contained(bbox, existing_bbox):
                if get_area(bbox) > get_area(existing_bbox):
                    result_bboxes[idx] = existing_bbox
                to_add = False
                break
            elif is_overlapping(bbox, existing_bbox):
                if get_area(bbox) < get_area(existing_bbox):
                    result_bboxes[idx] = bbox
                to_add = False
                break

        if to_add:
            result_bboxes.append(bbox)

    return result_bboxes


def merge_all_icon_boxes_new(elements):
    result_elements = []
    while elements:
        ele = elements.pop(0)
        bbox = [ele['position'][0], ele['position'][1], ele['position'][0]+ele['size'][0], ele['position'][1]+ele['size'][1]]
        # bbox = bboxes.pop(0)
        to_add = True

        for idx, existing_ele in enumerate(result_elements):
            existing_bbox = [existing_ele['position'][0], existing_ele['position'][1], existing_ele['position'][0]+existing_ele['size'][0], existing_ele['position'][1]+existing_ele['size'][1]]
            if is_contained(bbox, existing_bbox):
                if get_area(bbox) > get_area(existing_bbox):
                    result_elements[idx] = existing_ele
                to_add = False
                break
            elif is_overlapping(bbox, existing_bbox):
                if get_area(bbox) < get_area(existing_bbox):
                    result_elements[idx] = ele
                to_add = False
                break

        if to_add:
            result_elements.append(ele)

    return result_elements


def merge_bbox_groups(A, B, iou_threshold=0.8):
    i = 0
    while i < len(A):
        box_a = A[i]
        has_merged = False
        for j in range(len(B)):
            box_b = B[j]
            iou = calculate_iou(box_a, box_b)
            if iou > iou_threshold:
                merged_box = merge_boxes(box_a, box_b)
                A[i] = merged_box
                B.pop(j)
                has_merged = True
                break

        if has_merged:
            i -= 1
        i += 1

    return A, B


def bbox_iou(boxA, boxB):
    # Calculate Intersection over Union (IoU) between two bounding boxes
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou


def merge_boxes_and_texts_new(texts, bounding_boxes, iou_threshold=0):
    if not bounding_boxes:
        return [], []

    bounding_boxes = np.array(bounding_boxes)
    merged_boxes = []
    merged_texts = []

    used = np.zeros(len(bounding_boxes), dtype=bool)

    for i, boxA in enumerate(bounding_boxes):
        if used[i]:
            continue
        x_min, y_min, x_max, y_max = boxA
        # text = texts[i]
        text = ''

        overlapping_indices = [i]  # []
        for j, boxB in enumerate(bounding_boxes):
            # print(i, j, bbox_iou(boxA, boxB))
            if i != j and not used[j] and bbox_iou(boxA, boxB) > iou_threshold:
                overlapping_indices.append(j)

        # Sort overlapping boxes by vertical position (top to bottom)
        overlapping_indices.sort(key=lambda idx: (bounding_boxes[idx][1] + bounding_boxes[idx][3])/2)  # TODO

        for idx in overlapping_indices:
            boxB = bounding_boxes[idx]
            x_min = min(x_min, boxB[0])
            y_min = min(y_min, boxB[1])
            x_max = max(x_max, boxB[2])
            y_max = max(y_max, boxB[3])
            # text += " " + texts[idx]
            text += texts[idx]
            used[idx] = True

        merged_boxes.append([x_min, y_min, x_max, y_max])
        merged_texts.append(text)
        used[i] = True

    return merged_texts, merged_boxes
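A tiny worked example (not part of the commit) of merge_boxes_and_texts: two OCR boxes that overlap are fused into one box whose text fragments are joined with a space, while the disjoint box is kept as-is. The input values are illustrative.

# Illustrative input; any overlapping pair with IoU above the threshold is merged.
from PCAgent.merge_strategy import merge_boxes_and_texts

texts = ["Open", "File", "Settings"]
boxes = [[10, 10, 60, 30], [55, 12, 110, 32], [200, 10, 260, 30]]
merged_texts, merged_boxes = merge_boxes_and_texts(texts, boxes, iou_threshold=0)
# merged_texts -> ['Open File', 'Settings']
# merged_boxes -> [[10, 10, 110, 32], [200, 10, 260, 30]]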
PCAgent/prompt_qwen.py
ADDED
@@ -0,0 +1,360 @@
# PC
def get_subtask_prompt_cn(instruction):
    func_prompt = '''ๅคๆจกๆagent้่ฟๆง่ก็นๅปใ่พๅ
ฅ็ญไธ็ณปๅๆไฝๆฅๅฎๆ็จๆท็ๆไปคใ
็จๆทๆไปคๅฏ่ฝ็ฑ่ทจ่ถๅคไธชๅบ็จ็จๅบ็ๆฐไธชๅญไปปๅก็ปๆ๏ผๆๅธๆไฝ ่ฝๅฐ่ฟไธชๅคๆ็ๆไปค๏ผๅ่งฃไธบไธไบๅญไปปๅก๏ผๅญไปปๅกๆ4็ง็ฑปๅ๏ผ
1. ๅธธ่งๅญ็ฌฆไธฒๅฝขๅผ๏ผไพๅฆโๅจ็ณป็ป่ฎพ็ฝฎไธญ๏ผๆๅผๆทฑ่ฒๆจกๅผโ๏ผ
2. ๅ
ๅซๅญๅ
ธๅ
ๅฎน็ๅญ็ฌฆไธฒ๏ผๅฝๅๅญไปปๅก็ๆง่ก็ปๆ้่ฆไปฅๅญๅ
ธๆนๅผไผ ้็ปๅ
ถไปๅญไปปๅก๏ผไพๅฆโๅจOutlookไธญ๏ผๆฅ็โPaulโๅๆฅ็้ฎไปถไฟกๆฏ๏ผไปฅdictๅฝขๅผ่พๅบ{'contact': 'Paul', 'mail_content': 'content of the email'}โ๏ผ
3. ๆ ผๅผๅๅญ็ฌฆไธฒ๏ผๅฉ็จๅๅบๅญไปปๅกไผ ้็ไฟกๆฏ๏ผ่กฅๅ
จๅฝๅๅญไปปๅกๅ๏ผ่ฝๅคๅฎๅ
จ็ฌ็ซๆง่ก๏ผไพๅฆโๅฐ{mail_content}้่ฟ็ญไฟกๅ้็ปโJoeyโโ๏ผ
4. ๅ
ๅซๅญๅ
ธๅ
ๅฎน็ๆ ผๅผๅๅญ็ฌฆไธฒ๏ผๆข้่ฆๅๅบๅญไปปๅกไผ ้็ไฟกๆฏ๏ผไปฅ่กฅๅ
จๅฝๅๅญไปปๅก๏ผๅๆถๆง่ก็็ปๆไน้่ฆไปฅๅญๅ
ธๆนๅผไผ ้็ปๅ
ถไปๅญไปปๅก๏ผไพๅฆโๅจ่ฐทๆญไธญๆ็ดข{question}๏ผๅนถๅฐ็ธๅ
ณไฟกๆฏไปฅdictๅฝขๅผ่พๅบ{'info': 'related information'}โใ

ไธพไพๆฅ่ฏด๏ผๅคๅๆไปคโ็ณป็ป่ฎพ็ฝฎไธญๆๅผๆทฑ่ฒๆจกๅผ๏ผๅจๅพฎไฟกไธญๆฅ็โJohnโๅๆฅ็้ฎ้ข๏ผๅจChromeไธญๆ็ดข้ฎ้ข็็ญๆก๏ผๅฐ็ญๆกๆทปๅ ๅฐไธไธชๆฐๅปบwordๆๆกฃไธญ๏ผไฟๅญไธบโไฝไธ.docxโ๏ผ็ถๅๅ้็ปโJohnโใโๅฏไปฅ่ขซๅ่งฃไธบ๏ผ
{
"subtask 1": "ๅจ็ณป็ป่ฎพ็ฝฎไธญ๏ผๆๅผๆทฑ่ฒๆจกๅผ",
"subtask 2": "ๅจๅพฎไฟกไธญ๏ผๆฅ็โJohnโๅๆฅ็้ฎ้ข๏ผๅฐ้ฎ้ขไปฅdictๅฝขๅผ่พๅบ{'John_question': 'content of the question'}",
"subtask 3": "ๅจChromeไธญ๏ผๆ็ดข{John_question}๏ผๅฐๆ็ดขๅฐ็็ญๆกไปฅdictๅฝขๅผ่พๅบ{'John_question_answer': 'answer to the question'}",
"subtask 4": "ๅจWordไธญ๏ผๆฐๅปบไธไธชๆๆกฃ๏ผๅๅ
ฅ{John_question_answer}๏ผๅนถไฟๅญไธบโไฝไธ.docxโ",
"subtask 5": "ๅจๅพฎไฟกไธญ๏ผๅ้โไฝไธ.docxโ็ปโJohnโ"
}

้่ฆๆณจๆ๏ผ
1. ๅ
ๅซๅญๅ
ธๅ
ๅฎน็ๅญ็ฌฆไธฒๆๆ ผๅผๅๅญ็ฌฆไธฒ๏ผ้่ฆๅฐฝๅฏ่ฝ่ฏฆ็ปๅฐ่ฏดๆdictไธญๅไธชkey็ๅซไน๏ผๅณๅฐๅชไบๅ
ๅฎนไปฅdict็ๅฝขๅผ่พๅบ๏ผ
2. ๆฏไธชๆ ผๅผๅๅญ็ฌฆไธฒๅฝขๅผ็ๅญไปปๅกไธญๅ
ๅซ็key๏ผๅจๅๅบๅญไปปๅกไธญ่ฆๆๅฏนๅบ็dictๅฝขๅผ่พๅบ๏ผไนๅฐฑๆฏ่ฏด๏ผๅๅบๅญไปปๅกๆง่กๅฎๆๅ๏ผไฟ่ฏๅฝๅๅญไปปๅก่ฝๅค้่ฟๅๆฐไผ ้ๅพๅฐ่กฅๅ
จ๏ผไป่ๅฏไปฅ็ฌ็ซๆง่กใ
3. ๅฟ
้กปไฟ่ฏ๏ผๆฏไธชๅญไปปๅก๏ผๆ ่ฎบๆฏๅธธ่งๅญ็ฌฆไธฒ๏ผ่ฟๆฏ่กฅๅ
จไนๅ็ๆ ผๅผๅๅญ็ฌฆไธฒ๏ผ่ฝๅคๅฎๅ
จ่ฑ็ฆปๅ
ถไปๅญไปปๅก็ฌ็ซๆง่กใไพๅฆโๅจWordไธญๆฐๅปบไธไธชๆๆกฃ๏ผๅๅ
ฅ{John_question_answer}โๅฏไปฅ็ฌ็ซๆง่ก๏ผไฝโๅฐไฟฎๆนๅ็Wordๆๆกฃ้่ฟ้ฎไปถๅ้็ป{name}โๅๅ ไธบโWordๆๆกฃโๆไปฃไธๆ็กฎๆ ๆณ็ฌ็ซๆง่กใ
4. ๆ่งฃๅ็ๆฏไธชๅญไปปๅก่ฆๆๆ็กฎ็ๅบ็จ็จๅบ๏ผไพๅฆโๅจChromeไธญโใโๅจWordไธญโ็ญใไธ่ฌ่่จ๏ผdocxๆ ผๅผๆๆกฃ็จWord็จๅบๆๅผ๏ผxlsxๆ ผๅผ่กจๆ ผ็จExcel็จๅบๆๅผใๆญคๅค๏ผ้่ฆๆๅผๆไปถๆถ๏ผ่ฆๆ็กฎๆไปถ็ๅๅญใ
'''

    inst_prompt = '''
User Instruction:
{}
'''

    format_prompt = '''
่ฏทไฝ ๆ็
งๅฆไธๆ ผๅผ่พๅบๆๅๅ็ๅญไปปๅก๏ผ
{
"subtask 1": ,
"subtask 2": ,
...
}
'''
    prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
    return prompt


def get_subtask_prompt(instruction):
    func_prompt = '''A multi-modal agent completes a user's instruction by performing a series of actions such as clicking and typing. A user's instruction may consist of multiple subtasks across different applications. I want you to break down this complex instruction into several subtasks, which are of four types:

1. Regular string: For example, "Open dark mode in system settings";
2. String containing dictionary content: The result of the current subtask needs to be passed to other subtasks in a dictionary format, for example, "Check the emails from 'Paul' in Outlook and output the email details in a dict format like {'contact': 'Paul', 'mail_content': 'content of the email'}";
3. Formatted string containing the keys from previous subtasks: Use the information from previous subtasks to complete and independently execute the current subtask, for example, "Send {mail_content} via SMS to 'Joey'". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''";
4. Formatted string containing the keys from previous subtasks and the dictionary content: This requires both information from previous subtasks to complete the current subtask and the result also needs to be passed to other subtasks in a dictionary format, for example, "Search for {question} on Google and output the relevant information in a dict format like {'info': 'related information'}". Note: The text in the first "{""}" must be a key from the output of a previous subtask, and there should be no "''".


For example, the compound instruction "Open dark mode in system settings, check the two questions sent by 'John' in WeChat, search for answers to these two questions in Chrome, add the answers to a new Word document, save it as 'assignment.docx', and then send it to 'John'." can be broken down into:
{
"subtask 1": "Open dark mode in system settings",
"subtask 2": "Check the questions sent by 'John' in WeChat and output the questions in a dict format {'John_question_1': 'content of John\'s question_1', 'John_question_2': 'content of John\'s question_2'}",
"subtask 3": "Search for {John_question_1} in Chrome and output the found answer in a dict format {'John_question_1_answer': 'answer to the question_1'}",
"subtask 4": "Search for {John_question_2} in Chrome and output the found answer in a dict format {'John_question_2_answer': 'answer to the question_2'}",
"subtask 5": "Create a new document in Word, write {John_question_1_answer} and {John_question_2_answer} sequentially, then save it as 'assignment.docx'",
"subtask 6": "Send 'assignment.docx' to 'John' via WeChat"
}

Notes:
1. Strings or formatted strings containing dictionary content should explain as detailed as possible the meaning of each key in the dict, i.e., what content should be output in dict form;
2. Each key in a formatted string subtask must have a corresponding dict output in preceding subtasks, ensuring that after a preceding subtask is completed, the current subtask can be fully completed through parameter passing and thus executed independently.
3. Ensure each subtask, whether as a regular string or a completed formatted string, can be executed independently of other subtasks. For example, "Create a new document in Word and write {John_question_answer}" can be executed independently, but "Send the modified Word document via email to {name}" cannot because "Word document" is ambiguous and cannot be executed independently.
4. Each subtask must specify a clear application, such as 'in Chrome' or 'in Word'. Generally, docx formatted documents are opened with Word, and xlsx spreadsheets are opened with Excel. Additionally, when opening a file, clearly state the file name.
5. Note that if a subtask contains a dict, ensure that the values in the dictionary do not contain single quote characters to avoid format errors.
'''

    inst_prompt = '''
User Instruction:
{}
'''

    format_prompt = '''
Please output the split subtasks in the following format:
{
"subtask 1": ,
"subtask 2": ,
...
}
'''
    prompt = func_prompt + inst_prompt.format(instruction) + format_prompt
    return prompt


def get_select_prompt(content):
    prompt_template = '''
Analyze the specified text range {} and output the first line and last line of the specified range separately.
How to identify paragraphs: There are 2 spaces at the beginning of each paragraph. Define the title as the single line at the top.
If the content has only one line (such as title), it is both the first and last line.'''

    prompt_format = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    prompt = prompt_template.format(content)+prompt_format
    return prompt


def get_select_prompt_simple(content):
    prompt_template = '''
Analyze the text range of this part of the current Word document: {}, and output the content of the first and last lines separately.
If the content has only one line in total, this line is the first line and also the last line.'''

    prompt_format = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    prompt = prompt_template.format(content)+prompt_format
    return prompt


def get_select_prompt_backup(content):
    prompt_template = '''
Directly output the first line and the last line of the content: {} in the current shown Microsoft Word document. If the content has only one line, output this line twice.'''

    prompt_format = '''
You should respond in the following format:
<first>The content of the first line</first>
<last>The content of the last line</last>
'''
    prompt = prompt_template.format(content)+prompt_format
    return prompt


def get_action_prompt(instruction, clickable_infos, width, height, thought_history, summary_history, action_history, reflection_history, last_summary, last_action, reflection_thought, add_info, error_flag, completed_content, memory):
    prompt = "### Background ###\n"
    prompt += f"This image is a computer screenshot where icons are marked with numbers. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n"

    prompt += "### Tips ###\n"
    prompt += add_info
    prompt += "\n\n"

    prompt += "### Screenshot information ###\n"
    prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
    prompt += "This information consists of two parts: coordinates; content. "
    prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
    prompt += "the content is a text or 'icon' respectively. "
    prompt += "The information is as follow:\n"

    for clickable_info in clickable_infos:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"

    if len(action_history) > 0:
        prompt += "### History operations ###\n"
        prompt += "Before arriving at the current screenshot, you have completed the following operations:\n"
        for i in range(len(action_history)):
            if len(reflection_history) > 0:
                prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "; Reflection: " + reflection_history[i] + "]\n"
            else:
                prompt += f"Step-{i+1}: [Operation: " + summary_history[i].split(' to ')[0].strip() + "; Action: " + action_history[i] + "]\n"
        prompt += "\n"

    if completed_content != "":
        prompt += "### Progress ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

    if memory != "":
        prompt += "### Memory ###\n"
        prompt += "During the operations, you record the following contents on the screenshot for use in subsequent operations:\n"
        prompt += "Memory:\n" + memory + "\n"

    # disabled
    if error_flag:
        prompt += "### Last operation ###\n"
        prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time."
        prompt += "\n\n"
        print(f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time.")

    prompt += "### Task requirements ###\n"
    prompt += "In order to meet the user\'s requirements, you need to select one of the following operations to operate on the current screen:\n"
    prompt += "Note that to open an app, use the Open App action, rather than tapping the app's icon. "
    prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices."
    prompt += "You must choose one of the actions below:\n"
    prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'."
    prompt += "Right Tap (x, y): Right tap the position (x, y) in current page. This can be used to create a new file.\n"
    prompt += "Tap (x, y): Tap the position (x, y) in current page. This can be used to select an item.\n"
    prompt += "Double Tap (x, y): Double tap the position (x, y) in the current page. This can be used to open a file. If Tap (x, y) in the last step doesn't work, you can try double tap the position (x, y) in the current page.\n"

    prompt += '''
Shortcut (key1, key2): There are several shortcuts (key1+key2) you may use.
For example, if you can't find the download button, use command+s to save the page or download the file.
To select all, you can use command+a.
To create a new file in Word/Excel, you can use command+n.
To create a new tab for starting a new search in Chrome, you can use command+t.
To copy an item, you can first select it and then use command+c.
To paste the copied item, you can first select the location you want to paste it to, and then use command+v.
'''
    prompt += '''
Press (key name): There are several keys that may help.
For example, if you want to delete the selected content, press 'backspace'.
You can press 'enter' to confirm, submit the input command, or insert a line break.
Also, you can press 'up', 'down', 'left', or 'right' to scroll the page or adjust the position of the selected object.
'''

    prompt += "Type (x, y), (text): Tap the position (x, y) and type the \"text\" in the input box and press the enter key. You should replace the \"text\" with the actual input.\n"

    prompt += "Select (content): Select the referred 'content' in the current document, such as 'title', 'the second paragraph' and 'the last two paragraphs'. This action is useful when you want to edit a certain part of the document, such as bolding, adding underlines, changing line spacing, centering text, etc.\n"
    prompt += "Replace (x, y), (text): Replace the editable content in (x, y) with the \"text\". You should replace the \"text\" with the actual input. This action is very useful when you want to start a new search in Chrome or rename a file.\n"
    prompt += "Append (x, y), (text): Append the \"text\" content after the content at (x, y) location. This action is useful when you want to append new content into a word document.\n"

    prompt += "Tell (answer): Tell me the answer of the input query.\n"
    prompt += "Stop: If all the operations to meet the user\'s requirements have been completed in ### History operation ###, use this operation to stop the whole process."
    prompt += "\n\n"

    prompt += "### Output format ###\n"
    # modified 2.10
    prompt += "You should output in the following json format:"
    prompt += '''
{"Thought": "This is your thinking about how to proceed the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
'''
    prompt += "\n\n"

    return prompt


def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, height, summary, action, add_info, no_image=0):
    if no_image == 1:
        prompt = f"The computer screen's width is {width} pixels and the height is {height} pixels.\n\n"
    else:
        prompt = f"These images are two computer screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n"

    prompt += "In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot. "
    prompt += "The information consists of two parts, consisting of format: coordinates; content. "
    prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively "
    prompt += "\n\n"

    prompt += "### Before the current operation ###\n"
    prompt += "Screenshot information:\n"
    for clickable_info in clickable_infos1:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
    prompt += "\n\n"

    prompt += "### After the current operation ###\n"
    prompt += "Screenshot information:\n"
    for clickable_info in clickable_infos2:
        if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
            prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
    prompt += "\n\n"

    prompt += "### Current operation ###\n"
    prompt += f"The user\'s instruction is: {instruction}."
    if add_info != "":
        prompt += f"You also need to note the following requirements: {add_info}."
    prompt += "In the process of completing the requirements of instruction, an operation is performed on the computer. Below are the details of this operation:\n"
    prompt += "Operation thought: " + summary.split(" to ")[0].strip() + "\n"
    prompt += "Operation action: " + action
    prompt += "\n\n"

    prompt += "### Response requirements ###\n"
    if no_image == 1:
        prompt += "Now you need to output the following content based on the screenshots information before and after the current operation:\n"
    else:
        prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
    prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
    prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
    prompt += "B: The \"Operation action\" results in a wrong page and I need to do something to correct this.\n"
    prompt += "C: The \"Operation action\" produces no changes."
    prompt += "\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output format is:\n"
    prompt += "### Thought ###\nYour thought about the question\n"
    prompt += "### Answer ###\nA or B or C"

    return prompt


def get_memory_prompt(insight):
    if insight != "":
        prompt = "### Important content ###\n"
        prompt += insight
        prompt += "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    else:
        prompt = "### Response requirements ###\n"
        prompt += "Please think about whether there is any content closely related to user\'s instruction on the current page? If there is, please output the content. If not, please output \"None\".\n\n"

    prompt += "### Output format ###\n"
    prompt += "Your output format is:\n"
    prompt += "### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###."

    return prompt

def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=[]):
    prompt = "### Background ###\n"
    prompt += f"There is an user\'s instruction which is: {instruction}. You are a computer operating assistant and are operating the user\'s computer.\n\n"

    if add_info != "":
        prompt += "### Hint ###\n"
        prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
        prompt += add_info
        prompt += "\n\n"

    if len(thought_history) > 1:
        prompt += "### History operations ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n"
        for i in range(len(summary_history)):
            operation = summary_history[i].split(" to ")[0].strip()
            if len(reflection_history) > 0:
                prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "; Operation reflection: " + reflection_history[i] + "]\n"
            else:
                prompt += f"Step-{i+1}: [Operation thought: " + operation + "; Operation action: " + action_history[i] + "]\n"
        prompt += "\n"

        prompt += "### Progress thinking ###\n"
        prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n"
        prompt += "Completed contents:\n" + completed_content + "\n\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."

    else:
        prompt += "### Current operation ###\n"
        prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n"
        prompt += f"Operation thought: {thought_history[-1]}\n"
        operation = summary_history[-1].split(" to ")[0].strip()
        if len(reflection_history) > 0:
            prompt += f"Operation action: {operation}\n" + "Operation reflection: " + reflection_history[-1] + "\n\n"
        else:
            prompt += f"Operation action: {operation}\n\n"

        # if reflection_thought is not None:
        #     prompt += "A reflection model was adopted to analyze whether the last step's operation meets the expectation, you should combine its reflection thought to produce the \"Completed contents\"."
        #     prompt += "Below is its reflection thought:\n"
        #     prompt += reflection_thought + "\n"

        prompt += "### Response requirements ###\n"
        prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
        prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"

        prompt += "### Output format ###\n"
        prompt += "Your output format is:\n"
        prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
        prompt += "(Please use English to output)"

    return prompt
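A brief sketch (not part of the commit) of how these prompt builders are consumed: the returned string becomes the text turn of a chat that is then sent through inference_chat. The model name, endpoint, and key are placeholders.

# Hypothetical wiring of the prompt builders; model/api_url/token are placeholders.
from PCAgent.prompt_qwen import get_subtask_prompt
from PCAgent.chat import init_subtask_chat, add_response
from PCAgent.api import inference_chat

prompt = get_subtask_prompt("Open dark mode in system settings and tell me the current time")
chat = add_response("user", prompt, init_subtask_chat())
subtasks_json = inference_chat(chat, model="qwen-plus", api_url="<endpoint>", token="<key>")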
PCAgent/text_localization.py
ADDED
@@ -0,0 +1,70 @@
import re
import os
import logging
import os
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models
from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client

class Sample:
    def __init__(self):
        pass

    @staticmethod
    def create_client() -> ocr_api20210707Client:
        config = open_api_models.Config(
            access_key_id=os.environ.get('OCR_ACCESS_KEY_ID'),
            access_key_secret=os.environ.get('OCR_ACCESS_KEY_SECRET'),
        )
        config.endpoint = f'ocr-api.cn-hangzhou.aliyuncs.com'
        return ocr_api20210707Client(config)

    @staticmethod
    def main(image) -> None:
        client = Sample.create_client()
        recognize_all_text_request = ocr_api_20210707_models.RecognizeAllTextRequest(
            body=image,
            type='Advanced',
            output_coordinate='points',
            output_oricoord=True,
        )
        runtime = util_models.RuntimeOptions()
        output = client.recognize_all_text_with_options(recognize_all_text_request, runtime)
        # logger.info(f'ocr response: {output}', extra={'request_id': ""})
        output = output.body.data.sub_images[0].block_info.block_details
        return output

def image_to_binary(image_path):
    with open(image_path, 'rb') as file:
        binary_data = file.read()
    return binary_data

def remove_punctuation(text):
    # Use regular expressions to remove punctuation, underscores and whitespace
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    cleaned_text = re.sub(r'_', '', cleaned_text)  # remove underscores
    cleaned_text = re.sub(r'\s', '', cleaned_text)  # remove whitespace
    return cleaned_text.replace("v", "").replace("o", "").replace("O", "").replace("T", "").replace("Q", "").replace("ไธถ", "")


class OCRError(Exception):
    def __init__(self, message):
        super().__init__(message)
        self.message = message

def ocr(image_path):
    text = []
    coordinate = []
    image = image_to_binary(image_path)
    print(image_path)
    try:
        outputs = Sample.main(image)
    except Exception as e:
        raise OCRError(e.message)
    for output in outputs:
        text.append(output.block_content)
        bbox = [int(output.block_points[0].x), int(output.block_points[0].y), int(output.block_points[2].x), int(output.block_points[2].y)]
        coordinate.append(bbox)

    return text, coordinate
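A minimal call sketch (not part of the commit), assuming valid Alibaba Cloud OCR credentials are already exported as the OCR_ACCESS_KEY_ID / OCR_ACCESS_KEY_SECRET environment variables that create_client reads.

# Hypothetical usage; requires valid Alibaba Cloud OCR credentials in the environment.
from PCAgent.text_localization import ocr, OCRError

try:
    texts, boxes = ocr("screenshot.png")      # boxes are [x1, y1, x2, y2] pixel rectangles
    for t, b in zip(texts, boxes):
        print(b, t)
except OCRError as e:
    print("OCR request failed:", e)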
PCAgent/text_localization_old.py
ADDED
@@ -0,0 +1,61 @@
import cv2
import numpy as np
from PCAgent.crop import crop_image, calculate_size
from PIL import Image


def order_point(coor):
    arr = np.array(coor).reshape([4, 2])
    sum_ = np.sum(arr, 0)
    centroid = sum_ / arr.shape[0]
    theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0])
    sort_points = arr[np.argsort(theta)]
    sort_points = sort_points.reshape([4, -1])
    if sort_points[0][0] > centroid[0]:
        sort_points = np.concatenate([sort_points[3:], sort_points[:3]])
    sort_points = sort_points.reshape([4, 2]).astype('float32')
    return sort_points


def longest_common_substring_length(str1, str2):
    m = len(str1)
    n = len(str2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]


def ocr(image_path, ocr_detection, ocr_recognition):
    text_data = []
    coordinate = []

    image_full = cv2.imread(image_path)
    try:
        det_result = ocr_detection(image_full)
    except:
        print('no text detected')
        return ['no text'], [[0,0,0,0]]
    det_result = det_result['polygons']
    for i in range(det_result.shape[0]):
        pts = order_point(det_result[i])
        image_crop = crop_image(image_full, pts)

        try:
            result = ocr_recognition(image_crop)['text'][0]
        except:
            continue

        box = [int(e) for e in list(pts.reshape(-1))]
        box = [box[0], box[1], box[4], box[5]]

        text_data.append(result)
        coordinate.append(box)

    return text_data, coordinate
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: PC Agent
emoji: ๐ฌ
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 5.0.1
app_file: app.py
pinned: false
license: apache-2.0
short_description: A Hierarchical Multi-Agent Collaboration Framework for Compl
---

An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py
CHANGED
@@ -7,6 +7,7 @@ import copy
 import shutil
 import base64
 import random
+import requests
 import gradio as gr
 from datetime import datetime
 from modelscope.pipelines import pipeline
@@ -27,8 +28,18 @@ API_url = os.environ.get('API_url')
 token = os.environ.get('token')
 os.environ["OCR_ACCESS_KEY_ID"] = os.environ.get('OCR_ACCESS_KEY_ID')
 os.environ["OCR_ACCESS_KEY_SECRET"] = os.environ.get('OCR_ACCESS_KEY_SECRET')
+tff_file = os.environ.get('tff_file')
 radius = 100
 
+def download_file(url, save_path):
+    response = requests.get(url, stream=True)  # download as a stream
+    response.raise_for_status()  # make sure the request succeeded
+    with open(save_path, 'wb') as file:
+        for chunk in response.iter_content(chunk_size=8192):  # write in chunks to avoid holding too much in memory
+            file.write(chunk)
+
+download_file(tff_file, "font/arial.ttf")
+
 chatbot_css = """
 <style>
 .chat-container {
@@ -287,7 +298,7 @@ def chatbot(image, instruction, add_info, history, chat_log):
     screenshot_file = os.path.join(screenshot_root, f"screenshot_{current_time}.png")
     image.save(screenshot_file, format="PNG")
     screenshot_som_file = screenshot_file.split(".")[0] + "_som." + screenshot_file.split(".")[1]
-    perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="
+    perception_infos, width, height = get_perception_infos(screenshot_file, screenshot_som_file, font_path="font/arial.ttf")
     shutil.rmtree(temp_file)
     os.mkdir(temp_file)
example/1-1.jpg
ADDED
example/1-2.jpg
ADDED
example/1-3.jpg
ADDED